comparison MaxQuantProcessingScript.R @ 5:d4d531006735 draft

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 92e8ab6fc27a1f02583742715d644bc96418fbdf"
author eschen42
date Thu, 10 Mar 2022 23:42:48 +0000
parents c1403d18c189
children 922d309640db
comparison
equal deleted inserted replaced
4:cfc65ae227f8 5:d4d531006735
1 #!/usr/bin/env Rscript 1 #!/usr/bin/env Rscript
2 2
3 # This is the implementation for the 3 # This is the implementation for the
4 # "MaxQuant Phosphopeptide Localization Probability Cutoff" 4 # "MaxQuant Phosphopeptide Localization Probability Cutoff"
5 # Galaxy tool (mqppep_lclztn_filter) 5 # Galaxy tool (mqppep_lclztn_filter)
6 # It is adapted from the MaxQuant Processing Script written by Larry Cheng. 6 # It is adapted from the MaxQuant Processing Script written by Larry Cheng.
7 7
8 # libraries 8 # libraries
9 library(optparse) 9 library(optparse)
10 library(data.table) 10 library(data.table)
11 library(stringr) 11 library(stringr)
12 library(ggplot2) 12 library(ggplot2)
13 #library(PTXQC)
14 #require(PTXQC)
15 #require(methods)
16 13
17 # title: "MaxQuant Processing Script" 14 # title: "MaxQuant Processing Script"
18 # author: "Larry Cheng" 15 # author: "Larry Cheng"
19 # date: "February 19, 2018" 16 # date: "February 19, 2018"
20 # 17 #
21 # # MaxQuant Processing Script 18 # # MaxQuant Processing Script
22 # Takes MaxQuant Phospho (STY)sites.txt file as input and performs the following (in order): 19 # Takes MaxQuant Phospho (STY)sites.txt file as input
20 # and performs the following (in order):
23 # 1) Runs the Proteomics Quality Control software 21 # 1) Runs the Proteomics Quality Control software
24 # 2) Remove contaminant and reverse sequence rows 22 # 2) Remove contaminant and reverse sequence rows
25 # 3) Filters rows based on localization probability 23 # 3) Filters rows based on localization probability
26 # 4) Extract the quantitative data 24 # 4) Extract the quantitative data
27 # 5) Sequences phosphopeptides 25 # 5) Sequences phosphopeptides
28 # 6) Merges multiply phosphorylated peptides 26 # 6) Merges multiply phosphorylated peptides
29 # 7) Filters out phosphopeptides based on enrichment 27 # 7) Filters out phosphopeptides based on enrichment
30 # The output file contains the phosphopeptide (first column) and the quantitative values for each sample 28 # The output file contains the phosphopeptide (first column)
29 # and the quantitative values for each sample.
31 # 30 #
32 # ## Revision History 31 # ## Revision History
33 # Rev. 2022-02-10 :wrap for inclusion in Galaxy 32 # Rev. 2022-02-10 :wrap for inclusion in Galaxy
34 # Rev. 2018-02-19 :break up analysis script into "MaxQuant Processing Script" and "Phosphopeptide Processing Script" 33 # Rev. 2018-02-19 :break up analysis script into "MaxQuant Processing Script"
34 # and "Phosphopeptide Processing Script"
35 # Rev. 2017-12-12 :added PTXQC 35 # Rev. 2017-12-12 :added PTXQC
36 # added additional plots and table outputs for quality control 36 # added additional plots and table outputs for quality control
37 # allowed for more than 2 samples to be grouped together (up to 26 (eg, 1A, 1B, 1C, etc))regexSampleNames <- 37 # allowed for more than 2 samples to be grouped together
38 # "\\.(\\d+)[A-Z]$" 38 # (up to 26 (eg, 1A, 1B, 1C, etc))
39 # converted from .r to .rmd file to knit report for quality control 39 # converted from .r to .rmd file to knit report
40 # Rev. 2016-09-11 :automated the FDR cutoffs; removed the option to data impute multiple times 40 # for quality control
41 # Rev. 2016-09-09 :added filter to eliminate contaminant and reverse sequence rows 41 # Rev. 2016-09-11 :automated the FDR cutoffs; removed the option to data
42 # Rev. 2016-09-01 :moved the collapse step from after ANOVA filter to prior to preANOVA file output 42 # impute multiple times
43 # Rev. 2016-08-22 :changed regexpression to regexSampleNames <- "\\.(\\d+)[AB]$" so that it looks at the end of string 43 # Rev. 2016-09-09 :added filter to eliminate contaminant & reverse sequence rows
44 # Rev. 2016-09-01 :moved the collapse step from after ANOVA filter to prior to
45 # preANOVA file output
46 # Rev. 2016-08-22 :use regexSampleNames <- "\\.(\\d + )[AB]$"
47 # so that it looks at the end of string
44 # Rev. 2016-08-05 :Removed vestigial line (ppeptides <- ....) 48 # Rev. 2016-08-05 :Removed vestigial line (ppeptides <- ....)
45 # Rev. 2016-07-03 :Removed row names from the write.table() output for ANOVA and PreANOVA 49 # Rev. 2016-07-03 :Removed row names from the write.table() output for
50 # ANOVA and PreANOVA
46 # Rev. 2016-06-25 :Set default Localization Probability cutoff to 0.75 51 # Rev. 2016-06-25 :Set default Localization Probability cutoff to 0.75
47 # Rev. 2016-06-23 :fixed a bug in filtering for pY enrichment by resetting the row numbers afterwards 52 # Rev. 2016-06-23 :fixed a bug in filtering for pY enrichment by resetting
53 # the row numbers afterwards
48 # Rev. 2016-06-21 :test18 + standardized the regexpression in protocol 54 # Rev. 2016-06-21 :test18 + standardized the regexpression in protocol
49 55
50 56
51 ### FUNCTION DECLARATIONS begin ---------------------------------------------- 57 ### FUNCTION DECLARATIONS begin ----------------------------------------------
52 58
53 # Read first line of file at filePath 59 # Read first line of file at filePath
54 # adapted from: https://stackoverflow.com/a/35761217/15509512 60 # adapted from: https://stackoverflow.com/a/35761217/15509512
55 readFirstLine <- function(filepath) { 61 read_first_line <- function(filepath) {
56 con = file(filepath, "r") 62 con <- file(filepath, "r")
57 line = readLines(con, n = 1) 63 line <- readLines(con, n = 1)
58 close(con) 64 close(con)
59 return(line) 65 return(line)
60 } 66 }
61 67
62 # Move columns to the end of dataframe 68 # Move columns to the end of dataframe
66 data[c(setdiff(names(data), move), move)] 72 data[c(setdiff(names(data), move), move)]
67 } 73 }
68 74
69 # Generate phosphopeptide and build list when applied 75 # Generate phosphopeptide and build list when applied
70 phosphopeptide_func <- function(df) { 76 phosphopeptide_func <- function(df) {
71 77 # generate peptide sequence and list of phosphopositions
72 #generate peptide sequence and list of phosphopositions 78 phosphoprobsequence <-
73 phosphoprobsequence <- strsplit(as.character(df["Phospho (STY) Score diffs"]), "")[[1]] 79 strsplit(as.character(df["Phospho (STY) Score diffs"]), "")[[1]]
74 output <- vector() 80 output <- vector()
75 phosphopeptide <- "" 81 phosphopeptide <- ""
76 counter <- 0 #keep track of position in peptide 82 counter <- 0 # keep track of position in peptide
77 phosphopositions <- vector() #keep track of phosphorylation positions in peptide 83 phosphopositions <-
84 vector() # keep track of phosphorylation positions in peptide
78 score_diff <- "" 85 score_diff <- ""
79 for (chara in phosphoprobsequence){ 86 for (chara in phosphoprobsequence) {
80 #build peptide sequence 87 # build peptide sequence
81 if (!(chara == " " | chara == "(" | chara == ")" | chara =="." | chara =="-" | chara == "0" | chara == "1" | chara == "2" | chara == "3" | chara =="4" | chara == "5" | chara == "6" | chara == "7" | chara =="8" | chara =="9")) { 88 if (!(
82 phosphopeptide <- paste(phosphopeptide,chara,sep="") 89 chara == " " |
90 chara == "(" |
91 chara == ")" |
92 chara == "." |
93 chara == "-" |
94 chara == "0" |
95 chara == "1" |
96 chara == "2" |
97 chara == "3" |
98 chara == "4" |
99 chara == "5" |
100 chara == "6" |
101 chara == "7" |
102 chara == "8" |
103 chara == "9")
104 ) {
105 phosphopeptide <- paste(phosphopeptide, chara, sep = "")
83 counter <- counter + 1 106 counter <- counter + 1
84 } 107 }
85 #generate score_diff 108 # generate score_diff
86 if (chara == "-" | chara =="." | chara == "0" | chara == "1" | chara == "2" | chara == "3" | chara =="4" | chara == "5" | chara == "6" | chara == "7" | chara =="8" | chara =="9"){ 109 if (chara == "-" |
87 score_diff <- paste(score_diff,chara,sep="") 110 chara == "." |
111 chara == "0" |
112 chara == "1" |
113 chara == "2" |
114 chara == "3" |
115 chara == "4" |
116 chara == "5" |
117 chara == "6" |
118 chara == "7" |
119 chara == "8" |
120 chara == "9"
121 ) {
122 score_diff <- paste(score_diff, chara, sep = "")
88 } 123 }
89 #evaluate score_diff 124 # evaluate score_diff
90 if (chara == ")" ){ 125 if (chara == ")") {
91 score_diff <- as.numeric(score_diff) 126 score_diff <- as.numeric(score_diff)
92 #only consider a phosphoresidue if score_diff > 0 127 # only consider a phosphoresidue if score_diff > 0
93 if (score_diff > 0) { 128 if (score_diff > 0) {
94 phosphopositions <- append(phosphopositions, counter) 129 phosphopositions <- append(phosphopositions, counter)
95 } 130 }
96 score_diff <- "" 131 score_diff <- ""
97 } 132 }
98 } 133 }
99 134
100 #generate phosphopeptide sequence (ie, peptide sequence with "p"'s) 135 # generate phosphopeptide sequence (ie, peptide sequence with "p"'s)
101 counter <- 1 136 counter <- 1
102 phosphoposition_correction1 <- -1 #used to correct phosphosposition as "p"'s are inserted into the phosphopeptide string 137 phosphoposition_correction1 <-
103 phosphoposition_correction2 <- 0 #used to correct phosphosposition as "p"'s are inserted into the phosphopeptide string 138 -1 # used to correct phosphosposition as "p"'s
104 while (counter <= length(phosphopositions) ) { 139 # are inserted into the phosphopeptide string
105 phosphopeptide <- paste(substr(phosphopeptide,0,phosphopositions[counter]+phosphoposition_correction1),"p",substr(phosphopeptide,phosphopositions[counter]+phosphoposition_correction2,nchar(phosphopeptide)),sep="") 140 phosphoposition_correction2 <-
141 0 # used to correct phosphosposition as "p"'s
142 # are inserted into the phosphopeptide string
143 while (counter <= length(phosphopositions)) {
144 phosphopeptide <-
145 paste(
146 substr(
147 phosphopeptide,
148 0,
149 phosphopositions[counter] + phosphoposition_correction1
150 ),
151 "p",
152 substr(
153 phosphopeptide,
154 phosphopositions[counter] + phosphoposition_correction2,
155 nchar(phosphopeptide)
156 ),
157 sep = ""
158 )
106 counter <- counter + 1 159 counter <- counter + 1
107 phosphoposition_correction1 <- phosphoposition_correction1 + 1 160 phosphoposition_correction1 <- phosphoposition_correction1 + 1
108 phosphoposition_correction2 <- phosphoposition_correction2 + 1 161 phosphoposition_correction2 <- phosphoposition_correction2 + 1
109 } 162 }
110 163 # building phosphopeptide list
111 #building phosphopeptide list 164 output <- append(output, phosphopeptide)
112 output <- append(output,phosphopeptide)
113 return(output) 165 return(output)
114 } 166 }
115 167
116 ### FUNCTION DECLARATIONS end ------------------------------------------------ 168 ### FUNCTION DECLARATIONS end ------------------------------------------------
117 169
124 c("-i", "--input"), 176 c("-i", "--input"),
125 action = "store", 177 action = "store",
126 type = "character", 178 type = "character",
127 help = "A MaxQuant Phospho (STY)Sites.txt" 179 help = "A MaxQuant Phospho (STY)Sites.txt"
128 ) 180 )
129 , make_option( 181 ,
182 make_option(
130 c("-o", "--output"), 183 c("-o", "--output"),
131 action = "store", 184 action = "store",
132 type = "character", 185 type = "character",
133 help = "path to output file" 186 help = "path to output file"
134 ) 187 )
135 , make_option( 188 ,
189 make_option(
136 c("-E", "--enrichGraph"), 190 c("-E", "--enrichGraph"),
137 action = "store", 191 action = "store",
138 type = "character", 192 type = "character",
139 help = "path to enrichment graph PDF" 193 help = "path to enrichment graph PDF"
140 ) 194 )
141 , make_option( 195 ,
196 make_option(
142 c("-F", "--enrichGraph_svg"), 197 c("-F", "--enrichGraph_svg"),
143 action = "store", 198 action = "store",
144 type = "character", 199 type = "character",
145 help = "path to enrichment graph SVG" 200 help = "path to enrichment graph SVG"
146 ) 201 )
147 , make_option( 202 ,
203 make_option(
148 c("-L", "--locProbCutoffGraph"), 204 c("-L", "--locProbCutoffGraph"),
149 action = "store", 205 action = "store",
150 type = "character", 206 type = "character",
151 help = "path to location-proability cutoff graph PDF" 207 help = "path to location-proability cutoff graph PDF"
152 ) 208 )
153 , make_option( 209 ,
210 make_option(
154 c("-M", "--locProbCutoffGraph_svg"), 211 c("-M", "--locProbCutoffGraph_svg"),
155 action = "store", 212 action = "store",
156 type = "character", 213 type = "character",
157 help = "path to location-proability cutoff graph SVG" 214 help = "path to location-proability cutoff graph SVG"
158 ) 215 )
159 , make_option( 216 ,
217 make_option(
160 c("-e", "--enriched"), 218 c("-e", "--enriched"),
161 action = "store", 219 action = "store",
162 type = "character", 220 type = "character",
163 help = "pY or pST enriched samples (ie, 'Y' or 'ST')" 221 help = "pY or pST enriched samples (ie, 'Y' or 'ST')"
164 ) 222 )
165 # default = "^Number of Phospho [(]STY[)]$", 223 # default = "^Number of Phospho [(]STY[)]$",
166 , make_option( 224 ,
225 make_option(
167 c("-p", "--phosphoCol"), 226 c("-p", "--phosphoCol"),
168 action = "store", 227 action = "store",
169 type = "character", 228 type = "character",
170 help = "PERL-compatible regular expression matching header of column having number of 'Phospho (STY)'" 229 help = paste0("PERL-compatible regular expression matching",
230 " header of column having number of 'Phospho (STY)'")
171 ) 231 )
172 # default = "^Intensity[^_]", 232 # default = "^Intensity[^_]",
173 , make_option( 233 ,
234 make_option(
174 c("-s", "--startCol"), 235 c("-s", "--startCol"),
175 action = "store", 236 action = "store",
176 type = "character", 237 type = "character",
177 help = "PERL-compatible regular expression matching column header having first sample intensity" 238 help = paste0("PERL-compatible regular expression matching",
239 " header of column having first sample intensity")
178 ) 240 )
179 # default = 1, 241 # default = 1,
180 , make_option( 242 ,
243 make_option(
181 c("-I", "--intervalCol"), 244 c("-I", "--intervalCol"),
182 action = "store", 245 action = "store",
183 type = "integer", 246 type = "integer",
184 help = "Column interval between the Intensities of samples (eg, 1 if subsequent column; 2 if every other column" 247 help = paste0("Column interval between the Intensities of samples",
248 " (eg, 1 if subsequent column; 2 if every other column")
185 ) 249 )
186 # default = 0.75, 250 # default = 0.75,
187 , make_option( 251 ,
252 make_option(
188 c("-l", "--localProbCutoff"), 253 c("-l", "--localProbCutoff"),
189 action = "store", 254 action = "store",
190 type = "double", 255 type = "double",
191 help = "Localization Probability Cutoff" 256 help = "Localization Probability Cutoff"
192 ) 257 )
193 # default = "sum", 258 # default = "sum",
194 , make_option( 259 ,
260 make_option(
195 c("-f", "--collapse_func"), 261 c("-f", "--collapse_func"),
196 action = "store", 262 action = "store",
197 type = "character", 263 type = "character",
198 help = "merge identical phosphopeptides by ('sum' or 'average') the intensities" 264 help = paste0("merge identical phosphopeptides",
199 ) 265 " by ('sum' or 'average') the intensities")
200 # default = "filteredData.txt", 266 )
201 , make_option( 267 # default = "filtered_data.txt",
268 ,
269 make_option(
202 c("-r", "--filtered_data"), 270 c("-r", "--filtered_data"),
203 action = "store", 271 action = "store",
204 type = "character", 272 type = "character",
205 help = "filteredData.txt" 273 help = "filtered_data.txt"
206 ) 274 )
207 # default = "quantData.txt", 275 # default = "quantData.txt",
208 , make_option( 276 ,
277 make_option(
209 c("-q", "--quant_data"), 278 c("-q", "--quant_data"),
210 action = "store", 279 action = "store",
211 type = "character", 280 type = "character",
212 help = "quantData.txt" 281 help = "quantData.txt"
213 ) 282 )
214 ) 283 )
215 args <- parse_args(OptionParser(option_list=option_list)) 284 args <- parse_args(OptionParser(option_list = option_list))
216 # Check parameter values 285 # Check parameter values
217 286
218 ### EXTRACT ARGUMENTS end ---------------------------------------------------- 287 ### EXTRACT ARGUMENTS end ----------------------------------------------------
219 288
220 289
221 ### EXTRACT PARAMETERS from arguments begin ---------------------------------- 290 ### EXTRACT PARAMETERS from arguments begin ----------------------------------
222 291
223 if (! file.exists(args$input)) { 292 if (!file.exists(args$input)) {
224 stop((paste("File", args$input, "does not exist"))) 293 stop((paste("File", args$input, "does not exist")))
225 } 294 }
226 295
227 phosphoColPattern <- "^Number of Phospho [(][STY][STY]*[)]$" 296 phospho_col_pattern <- "^Number of Phospho [(][STY][STY]*[)]$"
228 startColPattern <- "^Intensity[^_]" 297 start_col_pattern <- "^Intensity[^_]"
229 phosphoColPattern <- readFirstLine(args$phosphoCol) 298 phospho_col_pattern <- read_first_line(args$phosphoCol)
230 startColPattern <- readFirstLine(args$startCol) 299 start_col_pattern <- read_first_line(args$startCol)
231 300
232 sink(getConnection(2)) 301 sink(getConnection(2))
233 #ACE print(paste("phosphoColPattern", phosphoColPattern)) 302
234 #ACE print(paste("startColPattern", startColPattern)) 303 input_file_name <- args$input
235 304 filtered_filename <- args$filtered_data
236 inputFilename <- args$input 305 quant_file_name <- args$quant_data
237 filteredFilename <- args$filtered_data 306 interval_col <- as.integer(args$intervalCol)
238 quantFilename <- args$quant_data 307
239 intervalCol <- as.integer(args$intervalCol) 308 first_line <- read_first_line(input_file_name)
240 309 col_headers <-
241 firstLine <- readFirstLine(inputFilename) 310 unlist(strsplit(
242 columnHeaders <- unlist(strsplit(x=firstLine, split=c('\t'), fixed=TRUE)) 311 x = first_line,
312 split = c("\t"),
313 fixed = TRUE
314 ))
243 sink(getConnection(2)) 315 sink(getConnection(2))
244 #ACE print("columnHeaders")
245 #ACE print(columnHeaders)
246 sink() 316 sink()
247 317
248 318
249 intensityHeaderCols <- grep(pattern=startColPattern, x=columnHeaders, perl=TRUE) 319 intensity_header_cols <-
250 if ( length(intensityHeaderCols) == 0) { 320 grep(pattern = start_col_pattern, x = col_headers, perl = TRUE)
251 err_msg <- paste("Found no intensity columns matching pattern:", startColPattern) 321 if (length(intensity_header_cols) == 0) {
252 # Divert output to stderr 322 err_msg <-
253 sink(getConnection(2)) 323 paste("Found no intensity columns matching pattern:",
254 print(err_msg) 324 start_col_pattern)
255 sink() 325 # Divert output to stderr
256 stop(err_msg) 326 sink(getConnection(2))
257 } 327 print(err_msg)
258 328 sink()
259 329 stop(err_msg)
260 phosphoCol <- grep(pattern=phosphoColPattern, x=columnHeaders, perl=TRUE)[1] 330 }
261 if (is.na(phosphoCol)) { 331
262 err_msg <- paste("Found no 'number of phospho sites' columns matching pattern:", phosphoColPattern) 332
263 # Divert output to stderr 333 phospho_col <-
264 sink(getConnection(2)) 334 grep(pattern = phospho_col_pattern, x = col_headers, perl = TRUE)[1]
265 print(err_msg) 335 if (is.na(phospho_col)) {
266 sink() 336 err_msg <-
267 stop(err_msg) 337 paste("Found no 'number of phospho sites' columns matching pattern:",
268 } 338 phospho_col_pattern)
339 # Divert output to stderr
340 sink(getConnection(2))
341 print(err_msg)
342 sink()
343 stop(err_msg)
344 }
269 345
270 346
271 i_count <- 0 347 i_count <- 0
272 this_column <- 1 348 this_column <- 1
273 last_value <- intensityHeaderCols[1] 349 last_value <- intensity_header_cols[1]
274 intensityCols <- c(last_value) 350 intensity_cols <- c(last_value)
275 351
276 while ( length(intensityHeaderCols) >= intervalCol * i_count ) { 352 while (length(intensity_header_cols) >= interval_col * i_count) {
277 i_count <- 1 + i_count 353 i_count <- 1 + i_count
278 this_column <- intervalCol + this_column 354 this_column <- interval_col + this_column
279 if ( last_value + intervalCol != intensityHeaderCols[this_column] ) break 355 if (last_value + interval_col != intensity_header_cols[this_column])
280 last_value <- intensityHeaderCols[this_column] 356 break
281 if (length(intensityHeaderCols) < intervalCol * i_count) break 357 last_value <- intensity_header_cols[this_column]
282 intensityCols <- c(intensityCols, intensityHeaderCols[this_column]) 358 if (length(intensity_header_cols) < interval_col * i_count)
283 } 359 break
284 360 intensity_cols <-
285 startCol <- intensityCols[1] 361 c(intensity_cols, intensity_header_cols[this_column])
286 numSamples <- i_count 362 }
287 363
288 outputfilename <- args$output 364 start_col <- intensity_cols[1]
289 enrichGraphFilename <- args$enrichGraph 365 num_samples <- i_count
290 locProbCutoffGraphFilename <- args$locProbCutoffGraph 366
291 enrichGraphFilename_svg <- args$enrichGraph_svg 367 output_filename <- args$output
292 locProbCutoffGraphFilename_svg <- args$locProbCutoffGraph_svg 368 enrich_graph_filename <- args$enrichGraph
293 369 loc_prob_cutoff_graph_filename <- args$locProbCutoffGraph
294 localProbCutoff <- args$localProbCutoff 370 enrich_graph_filename_svg <- args$enrichGraph_svg
371 loc_prob_cutoff_graph_fn_svg <- args$locProbCutoffGraph_svg
372
373 local_prob_cutoff <- args$localProbCutoff
295 enriched <- args$enriched 374 enriched <- args$enriched
296 collapse_FUN <- args$collapse_func 375 collapse_fn <- args$collapse_func
297 376
298 ### EXTRACT PARAMETERS from arguments end ------------------------------------ 377 ### EXTRACT PARAMETERS from arguments end ------------------------------------
299 378
300 379
301 # Proteomics Quality Control for MaxQuant Results 380 # Proteomics Quality Control for MaxQuant Results
302 # (Bielow C et al. J Proteome Res. 2016 PMID: 26653327) 381 # (Bielow C et al. J Proteome Res. 2016 PMID: 26653327)
303 # is run by the Galaxy MaxQuant wrapper and need not be invoked here. 382 # is run by the Galaxy MaxQuant wrapper and need not be invoked here.
304 383
305 384
306 # Read data, filtering out contaminants, reverse sequences, and localization probability 385 # Read & filter out contaminants, reverse sequences, & localization probability
307 # --- 386 # ---
308 fullData <- read.table(file = inputFilename, sep ="\t", header=T, quote="") 387 full_data <-
309 388 read.table(
310 #Filter out contaminant rows and reverse rows 389 file = input_file_name,
311 filteredData <- subset(fullData,!grepl("CON__", Proteins)) 390 sep = "\t",
312 filteredData <- subset(filteredData,!grepl("_MYCOPLASMA", Proteins)) 391 header = T,
313 filteredData <- subset(filteredData,!grepl("CONTAMINANT_", Proteins)) 392 quote = ""
314 filteredData <- subset(filteredData,!grepl("REV__", Protein)) #since REV__ rows are blank in the first column (Proteins) 393 )
315 write.table(filteredData, file = filteredFilename, sep = "\t", quote=FALSE, col.names=TRUE, row.names=FALSE) 394
395 # Filter out contaminant rows and reverse rows
396 filtered_data <- subset(full_data, !grepl("CON__", Proteins))
397 filtered_data <-
398 subset(filtered_data, !grepl("_MYCOPLASMA", Proteins))
399 filtered_data <-
400 subset(filtered_data, !grepl("CONTAMINANT_", Proteins))
401 filtered_data <-
402 subset(filtered_data, !grepl("REV__", Protein)
403 ) # since REV__ rows are blank in the first column (Proteins)
404 write.table(
405 filtered_data,
406 file = filtered_filename,
407 sep = "\t",
408 quote = FALSE,
409 col.names = TRUE,
410 row.names = FALSE
411 )
316 # ... 412 # ...
317 413
318 414
319 # Filter out data with localization probability below localProbCutoff 415 # Filter out data with localization probability below localProbCutoff
320 # --- 416 # ---
321 #Data filtered by localization probability 417 # Data filtered by localization probability
322 locProbFilteredData <- filteredData[filteredData$Localization.prob>=localProbCutoff,] 418 loc_prob_filtered_data <-
419 filtered_data[
420 filtered_data$Localization.prob >= local_prob_cutoff,
421 ]
323 # ... 422 # ...
324 423
325 424
326 # Localization probability -- visualize locprob cutoff 425 # Localization probability -- visualize locprob cutoff
327 # --- 426 # ---
328 locProbGraphData <- data.frame( 427 loc_prob_graph_data <-
329 group = c(paste(">",toString(localProbCutoff),sep=""), paste("<",toString(localProbCutoff),sep="")), 428 data.frame(
330 value = c(nrow(locProbFilteredData)/nrow(filteredData)*100, (nrow(filteredData)-nrow(locProbFilteredData))/nrow(filteredData)*100) 429 group = c(paste(">", toString(local_prob_cutoff), sep = ""),
331 ) 430 paste("<", toString(local_prob_cutoff), sep = "")),
431 value = c(
432 nrow(loc_prob_filtered_data) / nrow(filtered_data) * 100,
433 (nrow(filtered_data) - nrow(loc_prob_filtered_data))
434 / nrow(filtered_data) * 100
435 )
436 )
332 gigi <- 437 gigi <-
333 ggplot(locProbGraphData, aes(x = "", y = value, fill = group)) + 438 ggplot(loc_prob_graph_data, aes(x = "", y = value, fill = group)) +
334 geom_bar(width = 0.5, stat = "identity", color = "black") + 439 geom_bar(width = 0.5,
335 labs( 440 stat = "identity",
336 x = NULL 441 color = "black") +
337 , y = "percent" 442 labs(x = NULL,
338 , title = "Phosphopeptides partitioned by localization-probability cutoff" 443 y = "percent",
444 title = "Phosphopeptides partitioned by localization-probability cutoff"
339 ) + 445 ) +
340 scale_fill_discrete(name = "phosphopeptide\nlocalization-\nprobability") + 446 scale_fill_discrete(name = "phosphopeptide\nlocalization-\nprobability") +
341 theme_minimal() + 447 theme_minimal() +
342 theme( 448 theme(
343 legend.position = "right" 449 legend.position = "right",
344 , legend.title=element_text() 450 legend.title = element_text(),
345 , plot.title = element_text(hjust = 0.5) 451 plot.title = element_text(hjust = 0.5),
346 , plot.subtitle = element_text(hjust = 0.5) 452 plot.subtitle = element_text(hjust = 0.5),
347 , plot.title.position = "plot" 453 plot.title.position = "plot"
348 ) 454 )
349 pdf(locProbCutoffGraphFilename) 455 pdf(loc_prob_cutoff_graph_filename)
350 print(gigi) 456 print(gigi)
351 dev.off() 457 dev.off()
352 svg(locProbCutoffGraphFilename_svg) 458 svg(loc_prob_cutoff_graph_fn_svg)
353 print(gigi) 459 print(gigi)
354 dev.off() 460 dev.off()
355 # ... 461 # ...
356 462
357 463
358 # Extract quantitative values from filtered data 464 # Extract quantitative values from filtered data
359 # --- 465 # ---
360 quantData <- locProbFilteredData[,seq(from=startCol, by=intervalCol, length.out=numSamples)] 466 quant_data <-
467 loc_prob_filtered_data[, seq(from = start_col,
468 by = interval_col,
469 length.out = num_samples)]
361 # ... 470 # ...
362 471
363 472
364 # Generate Phosphopeptide Sequence 473 # Generate Phosphopeptide Sequence
365 # for latest version of MaxQuant (Version 1.5.3.30) 474 # for latest version of MaxQuant (Version 1.5.3.30)
366 # --- 475 # ---
367 dataTable <- data.frame(locProbFilteredData[,1:8],locProbFilteredData[,phosphoCol],locProbFilteredData[,phosphoCol+1],locProbFilteredData[,phosphoCol+2],locProbFilteredData[,phosphoCol+3],locProbFilteredData[,phosphoCol+4],locProbFilteredData[,phosphoCol+5],locProbFilteredData[,phosphoCol+6],locProbFilteredData[,phosphoCol+7],quantData) 476 metadata_df <-
368 colnames(dataTable) <- c("Proteins","Positions within proteins", "Leading proteins", "Protein", "Protein names", "Gene names", "Fasta headers", "Localization prob", "Number of Phospho (STY)", "Amino Acid", "Sequence window","Modification window", "Peptide window coverage", "Phospho (STY) Probabilities", "Phospho (STY) Score diffs", "Position in peptide", colnames(quantData)) 477 data.frame(
369 # 'phosphopeptide_func' generates a phosphopeptide sequence for each row of data. 478 loc_prob_filtered_data[, 1:8],
370 # for the 'apply' function: MARGIN 1 == rows, 2 == columns, c(1,2) = both 479 loc_prob_filtered_data[, phospho_col],
371 dataTable$Phosphopeptide <- apply(X=dataTable, MARGIN=1, FUN=phosphopeptide_func) 480 loc_prob_filtered_data[, phospho_col + 1],
481 loc_prob_filtered_data[, phospho_col + 2],
482 loc_prob_filtered_data[, phospho_col + 3],
483 loc_prob_filtered_data[, phospho_col + 4],
484 loc_prob_filtered_data[, phospho_col + 5],
485 loc_prob_filtered_data[, phospho_col + 6],
486 loc_prob_filtered_data[, phospho_col + 7],
487 quant_data
488 )
489 colnames(metadata_df) <-
490 c(
491 "Proteins",
492 "Positions within proteins",
493 "Leading proteins",
494 "Protein",
495 "Protein names",
496 "Gene names",
497 "Fasta headers",
498 "Localization prob",
499 "Number of Phospho (STY)",
500 "Amino Acid",
501 "Sequence window",
502 "Modification window",
503 "Peptide window coverage",
504 "Phospho (STY) Probabilities",
505 "Phospho (STY) Score diffs",
506 "Position in peptide",
507 colnames(quant_data)
508 )
509 # 'phosphopeptide_func' generates a phosphopeptide sequence
510 # for each row of data.
511 # for the 'apply' function: MARGIN 1 == rows, 2 == columns, c(1, 2) = both
512 metadata_df$phosphopeptide <-
513 apply(X = metadata_df, MARGIN = 1, FUN = phosphopeptide_func)
514 colnames(metadata_df)[1] <- "Phosphopeptide"
372 # Move the quant data columns to the right end of the data.frame 515 # Move the quant data columns to the right end of the data.frame
373 dataTable <- movetolast(dataTable,c(colnames(quantData))) 516 metadata_df <- movetolast(metadata_df, c(colnames(quant_data)))
374 # ... 517 # ...
375 518
376 519
377 # Write quantitative values for debugging purposes 520 # Write quantitative values for debugging purposes
378 # --- 521 # ---
379 quantWrite <- cbind( dataTable[,"Sequence window"], quantData ) 522 quant_write <- cbind(metadata_df[, "Sequence window"], quant_data)
380 colnames(quantWrite)[1] <- "Sequence.Window" 523 colnames(quant_write)[1] <- "Sequence.Window"
381 write.table(quantWrite, file = quantFilename, sep = "\t", quote=FALSE, col.names=TRUE, row.names=FALSE) 524 # ...
382 # ... 525
383 526
384 527 # Make new data frame containing only Phosphopeptides
385 # Make new data frame containing only Phosphopeptides to be mapped to quant data (merge_df) 528 # that are to be mapped to quant data (merge_df)
386 # --- 529 # ---
387 dataTable <- setDT(dataTable, keep.rownames=TRUE) #row name will be used to map 530 metadata_df <-
388 merge_df <- data.frame(as.integer(dataTable$rn), dataTable$Phosphopeptide) #row index to merge data frames 531 setDT(metadata_df, keep.rownames = TRUE) # row name will be used to map
532 merge_df <-
533 data.frame(
534 as.integer(metadata_df$rn),
535 metadata_df$phosphopeptide # row index to merge data frames
536 )
389 colnames(merge_df) <- c("rn", "Phosphopeptide") 537 colnames(merge_df) <- c("rn", "Phosphopeptide")
390 # ... 538 # ...
391 539
392 540
393 # Add Phosphopeptide column to quant columns for quality control checking 541 # Add Phosphopeptide column to quant columns for quality control checking
394 # --- 542 # ---
395 quantData_qc <- as.data.frame(quantData) 543 quant_data_qc <- as.data.frame(quant_data)
396 setDT(quantData_qc, keep.rownames=TRUE) #will use to match rowname to data 544 setDT(quant_data_qc, keep.rownames = TRUE) # will use to match rowname to data
397 quantData_qc$rn <- as.integer(quantData_qc$rn) 545 quant_data_qc$rn <- as.integer(quant_data_qc$rn)
398 quantData_qc <- merge(merge_df,quantData_qc, by="rn") 546 quant_data_qc <- merge(merge_df, quant_data_qc, by = "rn")
399 quantData_qc$rn <- NULL #remove rn column 547 quant_data_qc$rn <- NULL # remove rn column
400 # ... 548 # ...
401 549
402 550
403 # Collapse multiphosphorylated peptides 551 # Collapse multiphosphorylated peptides
404 # --- 552 # ---
405 quantData_qc_collapsed <- data.table(quantData_qc, key = "Phosphopeptide") 553 quant_data_qc_collapsed <-
406 quantData_qc_collapsed <- aggregate(. ~ Phosphopeptide,quantData_qc, FUN= collapse_FUN) 554 data.table(quant_data_qc, key = "Phosphopeptide")
407 # ... 555 quant_data_qc_collapsed <-
408 556 aggregate(. ~ Phosphopeptide, quant_data_qc, FUN = collapse_fn)
409 557 # ...
410 # Compute (as string) % of phosphopeptides that are multiphosphorylated (for use in next step) 558 print("quant_data_qc_collapsed")
411 # --- 559 head(quant_data_qc_collapsed)
412 pct_multiphos <- (nrow(quantData_qc) - nrow(quantData_qc_collapsed)) / (2 * nrow(quantData_qc)) 560
561 # Compute (as string) % of phosphopeptides that are multiphosphorylated
562 # (for use in next step)
563 # ---
564 pct_multiphos <-
565 (
566 nrow(quant_data_qc) - nrow(quant_data_qc_collapsed)
567 ) / (2 * nrow(quant_data_qc))
413 pct_multiphos <- sprintf("%0.1f%s", 100 * pct_multiphos, "%") 568 pct_multiphos <- sprintf("%0.1f%s", 100 * pct_multiphos, "%")
414 # ... 569 # ...
415 570
571 write.table(
572 quant_data_qc_collapsed,
573 file = quant_file_name,
574 sep = "\t",
575 quote = FALSE,
576 col.names = TRUE,
577 row.names = FALSE
578 )
416 579
417 # Compute and visualize breakdown of pY, pS, and pT before enrichment filter 580 # Compute and visualize breakdown of pY, pS, and pT before enrichment filter
418 # --- 581 # ---
419 pY_data <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pY"),] 582 py_data <-
420 pS_data <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pS"),] 583 quant_data_qc_collapsed[
421 pT_data <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pT"),] 584 str_detect(quant_data_qc_collapsed$Phosphopeptide, "pY"),
422 585 ]
423 pY_num <- nrow(pY_data) 586 ps_data <-
424 pS_num <- nrow(pS_data) 587 quant_data_qc_collapsed[
425 pT_num <- nrow(pT_data) 588 str_detect(quant_data_qc_collapsed$Phosphopeptide, "pS"),
589 ]
590 pt_data <-
591 quant_data_qc_collapsed[
592 str_detect(quant_data_qc_collapsed$Phosphopeptide, "pT"),
593 ]
594
595 py_num <- nrow(py_data)
596 ps_num <- nrow(ps_data)
597 pt_num <- nrow(pt_data)
426 598
427 # Visualize enrichment 599 # Visualize enrichment
428 enrichGraphData <- data.frame( 600 enrich_graph_data <- data.frame(group = c("pY", "pS", "pT"),
429 group = c("pY", "pS", "pT"), 601 value = c(py_num, ps_num, pt_num))
430 value = c(pY_num, pS_num, pT_num) 602
431 ) 603 enrich_graph_data <-
432 604 enrich_graph_data[
433 enrichGraphData <- enrichGraphData[enrichGraphData$value > 0,] 605 enrich_graph_data$value > 0,
606 ]
434 607
435 # Plot pie chart with legend 608 # Plot pie chart with legend
436 # start: https://stackoverflow.com/a/62522478/15509512 609 # start: https://stackoverflow.com/a/62522478/15509512
437 # refine: https://www.statology.org/ggplot-pie-chart/ 610 # refine: https://www.statology.org/ggplot-pie-chart/
438 # colors: https://colorbrewer2.org/#type=diverging&scheme=BrBG&n=8 611 # colors: https://colorbrewer2.org/#type=diverging&scheme=BrBG&n=8
439 slices <- enrichGraphData$value 612 slices <- enrich_graph_data$value
440 phosphoresidue <- enrichGraphData$group 613 phosphoresidue <- enrich_graph_data$group
441 pct <- round(100 * slices / sum(slices)) 614 pct <- round(100 * slices / sum(slices))
442 lbls <- paste(enrichGraphData$group,"\n",pct, "%\n(", slices, ")", sep="") 615 lbls <-
616 paste(enrich_graph_data$group, "\n", pct, "%\n(", slices, ")", sep = "")
443 slc_ctr <- c() 617 slc_ctr <- c()
444 run_tot <- 0 618 run_tot <- 0
445 for (p in pct) { 619 for (p in pct) {
446 slc_ctr <- c(slc_ctr, run_tot + p/2.0) 620 slc_ctr <- c(slc_ctr, run_tot + p / 2.0)
447 run_tot <- run_tot + p 621 run_tot <- run_tot + p
448 } 622 }
449 lbl_y <- 100 - slc_ctr 623 lbl_y <- 100 - slc_ctr
450 df <- data.frame(slices, pct, lbls, phosphoresidue = factor(phosphoresidue, levels = phosphoresidue)) 624 df <-
451 gigi <- ggplot( 625 data.frame(slices,
452 df 626 pct,
453 , aes(x = 1, y = pct, fill = phosphoresidue)) + 627 lbls,
628 phosphoresidue = factor(phosphoresidue, levels = phosphoresidue))
629 gigi <- ggplot(df
630 , aes(x = 1, y = pct, fill = phosphoresidue)) +
454 geom_col(position = "stack", orientation = "x") + 631 geom_col(position = "stack", orientation = "x") +
455 geom_text(aes(x = 1, y = lbl_y, label = lbls), col = "black") + 632 geom_text(aes(x = 1, y = lbl_y, label = lbls), col = "black") +
456 coord_polar(theta = "y", direction = -1) + 633 coord_polar(theta = "y", direction = -1) +
457 labs( 634 labs(
458 x = NULL 635 x = NULL
459 , y = NULL 636 ,
460 , title = "Percentages (and counts) of phosphosites, by type of residue" 637 y = NULL
461 , caption = sprintf("Roughly %s of peptides have multiple phosphosites.", pct_multiphos) 638 ,
639 title = "Percentages (and counts) of phosphosites, by type of residue"
640 ,
641 caption = sprintf(
642 "Roughly %s of peptides have multiple phosphosites.",
643 pct_multiphos
644 )
462 ) + 645 ) +
463 labs(x = NULL, y = NULL, fill = NULL) + 646 labs(x = NULL, y = NULL, fill = NULL) +
464 theme_classic() + 647 theme_classic() +
465 theme( legend.position="right" 648 theme(
466 , axis.line = element_blank() 649 legend.position = "right"
467 , axis.text = element_blank() 650 ,
468 , axis.ticks = element_blank() 651 axis.line = element_blank()
469 , plot.title = element_text(hjust = 0.5) 652 ,
470 , plot.subtitle = element_text(hjust = 0.5) 653 axis.text = element_blank()
471 , plot.caption = element_text(hjust = 0.5) 654 ,
472 , plot.title.position = "plot" 655 axis.ticks = element_blank()
473 ) + 656 ,
474 scale_fill_manual(breaks = phosphoresidue, values=c("#c7eae5", "#f6e8c3", "#dfc27d")) 657 plot.title = element_text(hjust = 0.5)
475 658 ,
476 pdf(enrichGraphFilename) 659 plot.subtitle = element_text(hjust = 0.5)
660 ,
661 plot.caption = element_text(hjust = 0.5)
662 ,
663 plot.title.position = "plot"
664 ) +
665 scale_fill_manual(breaks = phosphoresidue,
666 values = c("#c7eae5", "#f6e8c3", "#dfc27d"))
667
668 pdf(enrich_graph_filename)
477 print(gigi) 669 print(gigi)
478 dev.off() 670 dev.off()
479 svg(enrichGraphFilename_svg) 671 svg(enrich_graph_filename_svg)
480 print(gigi) 672 print(gigi)
481 dev.off() 673 dev.off()
482 # ... 674 # ...
483 675
484 676
485 # Filter phosphopeptides by enrichment 677 # Filter phosphopeptides by enrichment
486 # -- 678 # --
487 if (enriched == "Y"){ 679 if (enriched == "Y") {
488 quantData_qc_enrichment <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pY"),] 680 quant_data_qc_enrichment <- quant_data_qc_collapsed[
489 } else if ( enriched == "ST" ) { 681 str_detect(quant_data_qc_collapsed$Phosphopeptide, "pY"),
490 quantData_qc_enrichment <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pS") | str_detect(quantData_qc_collapsed$Phosphopeptide, "pT"),] 682 ]
683 } else if (enriched == "ST") {
684 quant_data_qc_enrichment <- quant_data_qc_collapsed[
685 str_detect(quant_data_qc_collapsed$Phosphopeptide, "pS") |
686 str_detect(quant_data_qc_collapsed$Phosphopeptide, "pT"),
687 ]
491 } else { 688 } else {
492 print("Error in enriched variable. Set to either 'Y' or 'ST'") 689 print("Error in enriched variable. Set to either 'Y' or 'ST'")
493 } 690 }
494 # ... 691 # ...
495 692
693 print("quant_data_qc_enrichment")
694 head(quant_data_qc_enrichment)
496 695
497 # Write phosphopeptides filtered by enrichment 696 # Write phosphopeptides filtered by enrichment
498 # -- 697 # --
499 write.table(quantData_qc_enrichment, file=outputfilename, sep="\t", quote = FALSE, row.names = FALSE) 698 #ACE colnames(quant_data_qc_enrichment)[1] <- "Phosphopeptide"
500 # ... 699 write.table(
700 quant_data_qc_enrichment,
701 file = output_filename,
702 sep = "\t",
703 quote = FALSE,
704 row.names = FALSE
705 )
706 # ...