Mercurial > repos > eschen42 > mqppep_anova
comparison MaxQuantProcessingScript.R @ 5:d4d531006735 draft
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 92e8ab6fc27a1f02583742715d644bc96418fbdf"
author | eschen42 |
---|---|
date | Thu, 10 Mar 2022 23:42:48 +0000 |
parents | c1403d18c189 |
children | 922d309640db |
comparison
equal
deleted
inserted
replaced
4:cfc65ae227f8 | 5:d4d531006735 |
---|---|
1 #!/usr/bin/env Rscript | 1 #!/usr/bin/env Rscript |
2 | 2 |
3 # This is the implementation for the | 3 # This is the implementation for the |
4 # "MaxQuant Phosphopeptide Localization Probability Cutoff" | 4 # "MaxQuant Phosphopeptide Localization Probability Cutoff" |
5 # Galaxy tool (mqppep_lclztn_filter) | 5 # Galaxy tool (mqppep_lclztn_filter) |
6 # It is adapted from the MaxQuant Processing Script written by Larry Cheng. | 6 # It is adapted from the MaxQuant Processing Script written by Larry Cheng. |
7 | 7 |
8 # libraries | 8 # libraries |
9 library(optparse) | 9 library(optparse) |
10 library(data.table) | 10 library(data.table) |
11 library(stringr) | 11 library(stringr) |
12 library(ggplot2) | 12 library(ggplot2) |
13 #library(PTXQC) | |
14 #require(PTXQC) | |
15 #require(methods) | |
16 | 13 |
17 # title: "MaxQuant Processing Script" | 14 # title: "MaxQuant Processing Script" |
18 # author: "Larry Cheng" | 15 # author: "Larry Cheng" |
19 # date: "February 19, 2018" | 16 # date: "February 19, 2018" |
20 # | 17 # |
21 # # MaxQuant Processing Script | 18 # # MaxQuant Processing Script |
22 # Takes MaxQuant Phospho (STY)sites.txt file as input and performs the following (in order): | 19 # Takes MaxQuant Phospho (STY)sites.txt file as input |
20 # and performs the following (in order): | |
23 # 1) Runs the Proteomics Quality Control software | 21 # 1) Runs the Proteomics Quality Control software |
24 # 2) Remove contaminant and reverse sequence rows | 22 # 2) Remove contaminant and reverse sequence rows |
25 # 3) Filters rows based on localization probability | 23 # 3) Filters rows based on localization probability |
26 # 4) Extract the quantitative data | 24 # 4) Extract the quantitative data |
27 # 5) Sequences phosphopeptides | 25 # 5) Sequences phosphopeptides |
28 # 6) Merges multiply phosphorylated peptides | 26 # 6) Merges multiply phosphorylated peptides |
29 # 7) Filters out phosphopeptides based on enrichment | 27 # 7) Filters out phosphopeptides based on enrichment |
30 # The output file contains the phosphopeptide (first column) and the quantitative values for each sample | 28 # The output file contains the phosphopeptide (first column) |
29 # and the quantitative values for each sample. | |
31 # | 30 # |
32 # ## Revision History | 31 # ## Revision History |
33 # Rev. 2022-02-10 :wrap for inclusion in Galaxy | 32 # Rev. 2022-02-10 :wrap for inclusion in Galaxy |
34 # Rev. 2018-02-19 :break up analysis script into "MaxQuant Processing Script" and "Phosphopeptide Processing Script" | 33 # Rev. 2018-02-19 :break up analysis script into "MaxQuant Processing Script" |
34 # and "Phosphopeptide Processing Script" | |
35 # Rev. 2017-12-12 :added PTXQC | 35 # Rev. 2017-12-12 :added PTXQC |
36 # added additional plots and table outputs for quality control | 36 # added additional plots and table outputs for quality control |
37 # allowed for more than 2 samples to be grouped together (up to 26 (eg, 1A, 1B, 1C, etc))regexSampleNames <- | 37 # allowed for more than 2 samples to be grouped together |
38 # "\\.(\\d+)[A-Z]$" | 38 # (up to 26 (eg, 1A, 1B, 1C, etc)) |
39 # converted from .r to .rmd file to knit report for quality control | 39 # converted from .r to .rmd file to knit report |
40 # Rev. 2016-09-11 :automated the FDR cutoffs; removed the option to data impute multiple times | 40 # for quality control |
41 # Rev. 2016-09-09 :added filter to eliminate contaminant and reverse sequence rows | 41 # Rev. 2016-09-11 :automated the FDR cutoffs; removed the option to data |
42 # Rev. 2016-09-01 :moved the collapse step from after ANOVA filter to prior to preANOVA file output | 42 # impute multiple times |
43 # Rev. 2016-08-22 :changed regexpression to regexSampleNames <- "\\.(\\d+)[AB]$" so that it looks at the end of string | 43 # Rev. 2016-09-09 :added filter to eliminate contaminant & reverse sequence rows |
44 # Rev. 2016-09-01 :moved the collapse step from after ANOVA filter to prior to | |
45 # preANOVA file output | |
46 # Rev. 2016-08-22 :use regexSampleNames <- "\\.(\\d + )[AB]$" | |
47 # so that it looks at the end of string | |
44 # Rev. 2016-08-05 :Removed vestigial line (ppeptides <- ....) | 48 # Rev. 2016-08-05 :Removed vestigial line (ppeptides <- ....) |
45 # Rev. 2016-07-03 :Removed row names from the write.table() output for ANOVA and PreANOVA | 49 # Rev. 2016-07-03 :Removed row names from the write.table() output for |
50 # ANOVA and PreANOVA | |
46 # Rev. 2016-06-25 :Set default Localization Probability cutoff to 0.75 | 51 # Rev. 2016-06-25 :Set default Localization Probability cutoff to 0.75 |
47 # Rev. 2016-06-23 :fixed a bug in filtering for pY enrichment by resetting the row numbers afterwards | 52 # Rev. 2016-06-23 :fixed a bug in filtering for pY enrichment by resetting |
53 # the row numbers afterwards | |
48 # Rev. 2016-06-21 :test18 + standardized the regexpression in protocol | 54 # Rev. 2016-06-21 :test18 + standardized the regexpression in protocol |
49 | 55 |
50 | 56 |
51 ### FUNCTION DECLARATIONS begin ---------------------------------------------- | 57 ### FUNCTION DECLARATIONS begin ---------------------------------------------- |
52 | 58 |
53 # Read first line of file at filePath | 59 # Read first line of file at filePath |
54 # adapted from: https://stackoverflow.com/a/35761217/15509512 | 60 # adapted from: https://stackoverflow.com/a/35761217/15509512 |
55 readFirstLine <- function(filepath) { | 61 read_first_line <- function(filepath) { |
56 con = file(filepath, "r") | 62 con <- file(filepath, "r") |
57 line = readLines(con, n = 1) | 63 line <- readLines(con, n = 1) |
58 close(con) | 64 close(con) |
59 return(line) | 65 return(line) |
60 } | 66 } |
61 | 67 |
62 # Move columns to the end of dataframe | 68 # Move columns to the end of dataframe |
66 data[c(setdiff(names(data), move), move)] | 72 data[c(setdiff(names(data), move), move)] |
67 } | 73 } |
68 | 74 |
69 # Generate phosphopeptide and build list when applied | 75 # Generate phosphopeptide and build list when applied |
70 phosphopeptide_func <- function(df) { | 76 phosphopeptide_func <- function(df) { |
71 | 77 # generate peptide sequence and list of phosphopositions |
72 #generate peptide sequence and list of phosphopositions | 78 phosphoprobsequence <- |
73 phosphoprobsequence <- strsplit(as.character(df["Phospho (STY) Score diffs"]), "")[[1]] | 79 strsplit(as.character(df["Phospho (STY) Score diffs"]), "")[[1]] |
74 output <- vector() | 80 output <- vector() |
75 phosphopeptide <- "" | 81 phosphopeptide <- "" |
76 counter <- 0 #keep track of position in peptide | 82 counter <- 0 # keep track of position in peptide |
77 phosphopositions <- vector() #keep track of phosphorylation positions in peptide | 83 phosphopositions <- |
84 vector() # keep track of phosphorylation positions in peptide | |
78 score_diff <- "" | 85 score_diff <- "" |
79 for (chara in phosphoprobsequence){ | 86 for (chara in phosphoprobsequence) { |
80 #build peptide sequence | 87 # build peptide sequence |
81 if (!(chara == " " | chara == "(" | chara == ")" | chara =="." | chara =="-" | chara == "0" | chara == "1" | chara == "2" | chara == "3" | chara =="4" | chara == "5" | chara == "6" | chara == "7" | chara =="8" | chara =="9")) { | 88 if (!( |
82 phosphopeptide <- paste(phosphopeptide,chara,sep="") | 89 chara == " " | |
90 chara == "(" | | |
91 chara == ")" | | |
92 chara == "." | | |
93 chara == "-" | | |
94 chara == "0" | | |
95 chara == "1" | | |
96 chara == "2" | | |
97 chara == "3" | | |
98 chara == "4" | | |
99 chara == "5" | | |
100 chara == "6" | | |
101 chara == "7" | | |
102 chara == "8" | | |
103 chara == "9") | |
104 ) { | |
105 phosphopeptide <- paste(phosphopeptide, chara, sep = "") | |
83 counter <- counter + 1 | 106 counter <- counter + 1 |
84 } | 107 } |
85 #generate score_diff | 108 # generate score_diff |
86 if (chara == "-" | chara =="." | chara == "0" | chara == "1" | chara == "2" | chara == "3" | chara =="4" | chara == "5" | chara == "6" | chara == "7" | chara =="8" | chara =="9"){ | 109 if (chara == "-" | |
87 score_diff <- paste(score_diff,chara,sep="") | 110 chara == "." | |
111 chara == "0" | | |
112 chara == "1" | | |
113 chara == "2" | | |
114 chara == "3" | | |
115 chara == "4" | | |
116 chara == "5" | | |
117 chara == "6" | | |
118 chara == "7" | | |
119 chara == "8" | | |
120 chara == "9" | |
121 ) { | |
122 score_diff <- paste(score_diff, chara, sep = "") | |
88 } | 123 } |
89 #evaluate score_diff | 124 # evaluate score_diff |
90 if (chara == ")" ){ | 125 if (chara == ")") { |
91 score_diff <- as.numeric(score_diff) | 126 score_diff <- as.numeric(score_diff) |
92 #only consider a phosphoresidue if score_diff > 0 | 127 # only consider a phosphoresidue if score_diff > 0 |
93 if (score_diff > 0) { | 128 if (score_diff > 0) { |
94 phosphopositions <- append(phosphopositions, counter) | 129 phosphopositions <- append(phosphopositions, counter) |
95 } | 130 } |
96 score_diff <- "" | 131 score_diff <- "" |
97 } | 132 } |
98 } | 133 } |
99 | 134 |
100 #generate phosphopeptide sequence (ie, peptide sequence with "p"'s) | 135 # generate phosphopeptide sequence (ie, peptide sequence with "p"'s) |
101 counter <- 1 | 136 counter <- 1 |
102 phosphoposition_correction1 <- -1 #used to correct phosphosposition as "p"'s are inserted into the phosphopeptide string | 137 phosphoposition_correction1 <- |
103 phosphoposition_correction2 <- 0 #used to correct phosphosposition as "p"'s are inserted into the phosphopeptide string | 138 -1 # used to correct phosphosposition as "p"'s |
104 while (counter <= length(phosphopositions) ) { | 139 # are inserted into the phosphopeptide string |
105 phosphopeptide <- paste(substr(phosphopeptide,0,phosphopositions[counter]+phosphoposition_correction1),"p",substr(phosphopeptide,phosphopositions[counter]+phosphoposition_correction2,nchar(phosphopeptide)),sep="") | 140 phosphoposition_correction2 <- |
141 0 # used to correct phosphosposition as "p"'s | |
142 # are inserted into the phosphopeptide string | |
143 while (counter <= length(phosphopositions)) { | |
144 phosphopeptide <- | |
145 paste( | |
146 substr( | |
147 phosphopeptide, | |
148 0, | |
149 phosphopositions[counter] + phosphoposition_correction1 | |
150 ), | |
151 "p", | |
152 substr( | |
153 phosphopeptide, | |
154 phosphopositions[counter] + phosphoposition_correction2, | |
155 nchar(phosphopeptide) | |
156 ), | |
157 sep = "" | |
158 ) | |
106 counter <- counter + 1 | 159 counter <- counter + 1 |
107 phosphoposition_correction1 <- phosphoposition_correction1 + 1 | 160 phosphoposition_correction1 <- phosphoposition_correction1 + 1 |
108 phosphoposition_correction2 <- phosphoposition_correction2 + 1 | 161 phosphoposition_correction2 <- phosphoposition_correction2 + 1 |
109 } | 162 } |
110 | 163 # building phosphopeptide list |
111 #building phosphopeptide list | 164 output <- append(output, phosphopeptide) |
112 output <- append(output,phosphopeptide) | |
113 return(output) | 165 return(output) |
114 } | 166 } |
115 | 167 |
116 ### FUNCTION DECLARATIONS end ------------------------------------------------ | 168 ### FUNCTION DECLARATIONS end ------------------------------------------------ |
117 | 169 |
124 c("-i", "--input"), | 176 c("-i", "--input"), |
125 action = "store", | 177 action = "store", |
126 type = "character", | 178 type = "character", |
127 help = "A MaxQuant Phospho (STY)Sites.txt" | 179 help = "A MaxQuant Phospho (STY)Sites.txt" |
128 ) | 180 ) |
129 , make_option( | 181 , |
182 make_option( | |
130 c("-o", "--output"), | 183 c("-o", "--output"), |
131 action = "store", | 184 action = "store", |
132 type = "character", | 185 type = "character", |
133 help = "path to output file" | 186 help = "path to output file" |
134 ) | 187 ) |
135 , make_option( | 188 , |
189 make_option( | |
136 c("-E", "--enrichGraph"), | 190 c("-E", "--enrichGraph"), |
137 action = "store", | 191 action = "store", |
138 type = "character", | 192 type = "character", |
139 help = "path to enrichment graph PDF" | 193 help = "path to enrichment graph PDF" |
140 ) | 194 ) |
141 , make_option( | 195 , |
196 make_option( | |
142 c("-F", "--enrichGraph_svg"), | 197 c("-F", "--enrichGraph_svg"), |
143 action = "store", | 198 action = "store", |
144 type = "character", | 199 type = "character", |
145 help = "path to enrichment graph SVG" | 200 help = "path to enrichment graph SVG" |
146 ) | 201 ) |
147 , make_option( | 202 , |
203 make_option( | |
148 c("-L", "--locProbCutoffGraph"), | 204 c("-L", "--locProbCutoffGraph"), |
149 action = "store", | 205 action = "store", |
150 type = "character", | 206 type = "character", |
151 help = "path to location-proability cutoff graph PDF" | 207 help = "path to location-proability cutoff graph PDF" |
152 ) | 208 ) |
153 , make_option( | 209 , |
210 make_option( | |
154 c("-M", "--locProbCutoffGraph_svg"), | 211 c("-M", "--locProbCutoffGraph_svg"), |
155 action = "store", | 212 action = "store", |
156 type = "character", | 213 type = "character", |
157 help = "path to location-proability cutoff graph SVG" | 214 help = "path to location-proability cutoff graph SVG" |
158 ) | 215 ) |
159 , make_option( | 216 , |
217 make_option( | |
160 c("-e", "--enriched"), | 218 c("-e", "--enriched"), |
161 action = "store", | 219 action = "store", |
162 type = "character", | 220 type = "character", |
163 help = "pY or pST enriched samples (ie, 'Y' or 'ST')" | 221 help = "pY or pST enriched samples (ie, 'Y' or 'ST')" |
164 ) | 222 ) |
165 # default = "^Number of Phospho [(]STY[)]$", | 223 # default = "^Number of Phospho [(]STY[)]$", |
166 , make_option( | 224 , |
225 make_option( | |
167 c("-p", "--phosphoCol"), | 226 c("-p", "--phosphoCol"), |
168 action = "store", | 227 action = "store", |
169 type = "character", | 228 type = "character", |
170 help = "PERL-compatible regular expression matching header of column having number of 'Phospho (STY)'" | 229 help = paste0("PERL-compatible regular expression matching", |
230 " header of column having number of 'Phospho (STY)'") | |
171 ) | 231 ) |
172 # default = "^Intensity[^_]", | 232 # default = "^Intensity[^_]", |
173 , make_option( | 233 , |
234 make_option( | |
174 c("-s", "--startCol"), | 235 c("-s", "--startCol"), |
175 action = "store", | 236 action = "store", |
176 type = "character", | 237 type = "character", |
177 help = "PERL-compatible regular expression matching column header having first sample intensity" | 238 help = paste0("PERL-compatible regular expression matching", |
239 " header of column having first sample intensity") | |
178 ) | 240 ) |
179 # default = 1, | 241 # default = 1, |
180 , make_option( | 242 , |
243 make_option( | |
181 c("-I", "--intervalCol"), | 244 c("-I", "--intervalCol"), |
182 action = "store", | 245 action = "store", |
183 type = "integer", | 246 type = "integer", |
184 help = "Column interval between the Intensities of samples (eg, 1 if subsequent column; 2 if every other column" | 247 help = paste0("Column interval between the Intensities of samples", |
248 " (eg, 1 if subsequent column; 2 if every other column") | |
185 ) | 249 ) |
186 # default = 0.75, | 250 # default = 0.75, |
187 , make_option( | 251 , |
252 make_option( | |
188 c("-l", "--localProbCutoff"), | 253 c("-l", "--localProbCutoff"), |
189 action = "store", | 254 action = "store", |
190 type = "double", | 255 type = "double", |
191 help = "Localization Probability Cutoff" | 256 help = "Localization Probability Cutoff" |
192 ) | 257 ) |
193 # default = "sum", | 258 # default = "sum", |
194 , make_option( | 259 , |
260 make_option( | |
195 c("-f", "--collapse_func"), | 261 c("-f", "--collapse_func"), |
196 action = "store", | 262 action = "store", |
197 type = "character", | 263 type = "character", |
198 help = "merge identical phosphopeptides by ('sum' or 'average') the intensities" | 264 help = paste0("merge identical phosphopeptides", |
199 ) | 265 " by ('sum' or 'average') the intensities") |
200 # default = "filteredData.txt", | 266 ) |
201 , make_option( | 267 # default = "filtered_data.txt", |
268 , | |
269 make_option( | |
202 c("-r", "--filtered_data"), | 270 c("-r", "--filtered_data"), |
203 action = "store", | 271 action = "store", |
204 type = "character", | 272 type = "character", |
205 help = "filteredData.txt" | 273 help = "filtered_data.txt" |
206 ) | 274 ) |
207 # default = "quantData.txt", | 275 # default = "quantData.txt", |
208 , make_option( | 276 , |
277 make_option( | |
209 c("-q", "--quant_data"), | 278 c("-q", "--quant_data"), |
210 action = "store", | 279 action = "store", |
211 type = "character", | 280 type = "character", |
212 help = "quantData.txt" | 281 help = "quantData.txt" |
213 ) | 282 ) |
214 ) | 283 ) |
215 args <- parse_args(OptionParser(option_list=option_list)) | 284 args <- parse_args(OptionParser(option_list = option_list)) |
216 # Check parameter values | 285 # Check parameter values |
217 | 286 |
218 ### EXTRACT ARGUMENTS end ---------------------------------------------------- | 287 ### EXTRACT ARGUMENTS end ---------------------------------------------------- |
219 | 288 |
220 | 289 |
221 ### EXTRACT PARAMETERS from arguments begin ---------------------------------- | 290 ### EXTRACT PARAMETERS from arguments begin ---------------------------------- |
222 | 291 |
223 if (! file.exists(args$input)) { | 292 if (!file.exists(args$input)) { |
224 stop((paste("File", args$input, "does not exist"))) | 293 stop((paste("File", args$input, "does not exist"))) |
225 } | 294 } |
226 | 295 |
227 phosphoColPattern <- "^Number of Phospho [(][STY][STY]*[)]$" | 296 phospho_col_pattern <- "^Number of Phospho [(][STY][STY]*[)]$" |
228 startColPattern <- "^Intensity[^_]" | 297 start_col_pattern <- "^Intensity[^_]" |
229 phosphoColPattern <- readFirstLine(args$phosphoCol) | 298 phospho_col_pattern <- read_first_line(args$phosphoCol) |
230 startColPattern <- readFirstLine(args$startCol) | 299 start_col_pattern <- read_first_line(args$startCol) |
231 | 300 |
232 sink(getConnection(2)) | 301 sink(getConnection(2)) |
233 #ACE print(paste("phosphoColPattern", phosphoColPattern)) | 302 |
234 #ACE print(paste("startColPattern", startColPattern)) | 303 input_file_name <- args$input |
235 | 304 filtered_filename <- args$filtered_data |
236 inputFilename <- args$input | 305 quant_file_name <- args$quant_data |
237 filteredFilename <- args$filtered_data | 306 interval_col <- as.integer(args$intervalCol) |
238 quantFilename <- args$quant_data | 307 |
239 intervalCol <- as.integer(args$intervalCol) | 308 first_line <- read_first_line(input_file_name) |
240 | 309 col_headers <- |
241 firstLine <- readFirstLine(inputFilename) | 310 unlist(strsplit( |
242 columnHeaders <- unlist(strsplit(x=firstLine, split=c('\t'), fixed=TRUE)) | 311 x = first_line, |
312 split = c("\t"), | |
313 fixed = TRUE | |
314 )) | |
243 sink(getConnection(2)) | 315 sink(getConnection(2)) |
244 #ACE print("columnHeaders") | |
245 #ACE print(columnHeaders) | |
246 sink() | 316 sink() |
247 | 317 |
248 | 318 |
249 intensityHeaderCols <- grep(pattern=startColPattern, x=columnHeaders, perl=TRUE) | 319 intensity_header_cols <- |
250 if ( length(intensityHeaderCols) == 0) { | 320 grep(pattern = start_col_pattern, x = col_headers, perl = TRUE) |
251 err_msg <- paste("Found no intensity columns matching pattern:", startColPattern) | 321 if (length(intensity_header_cols) == 0) { |
252 # Divert output to stderr | 322 err_msg <- |
253 sink(getConnection(2)) | 323 paste("Found no intensity columns matching pattern:", |
254 print(err_msg) | 324 start_col_pattern) |
255 sink() | 325 # Divert output to stderr |
256 stop(err_msg) | 326 sink(getConnection(2)) |
257 } | 327 print(err_msg) |
258 | 328 sink() |
259 | 329 stop(err_msg) |
260 phosphoCol <- grep(pattern=phosphoColPattern, x=columnHeaders, perl=TRUE)[1] | 330 } |
261 if (is.na(phosphoCol)) { | 331 |
262 err_msg <- paste("Found no 'number of phospho sites' columns matching pattern:", phosphoColPattern) | 332 |
263 # Divert output to stderr | 333 phospho_col <- |
264 sink(getConnection(2)) | 334 grep(pattern = phospho_col_pattern, x = col_headers, perl = TRUE)[1] |
265 print(err_msg) | 335 if (is.na(phospho_col)) { |
266 sink() | 336 err_msg <- |
267 stop(err_msg) | 337 paste("Found no 'number of phospho sites' columns matching pattern:", |
268 } | 338 phospho_col_pattern) |
339 # Divert output to stderr | |
340 sink(getConnection(2)) | |
341 print(err_msg) | |
342 sink() | |
343 stop(err_msg) | |
344 } | |
269 | 345 |
270 | 346 |
271 i_count <- 0 | 347 i_count <- 0 |
272 this_column <- 1 | 348 this_column <- 1 |
273 last_value <- intensityHeaderCols[1] | 349 last_value <- intensity_header_cols[1] |
274 intensityCols <- c(last_value) | 350 intensity_cols <- c(last_value) |
275 | 351 |
276 while ( length(intensityHeaderCols) >= intervalCol * i_count ) { | 352 while (length(intensity_header_cols) >= interval_col * i_count) { |
277 i_count <- 1 + i_count | 353 i_count <- 1 + i_count |
278 this_column <- intervalCol + this_column | 354 this_column <- interval_col + this_column |
279 if ( last_value + intervalCol != intensityHeaderCols[this_column] ) break | 355 if (last_value + interval_col != intensity_header_cols[this_column]) |
280 last_value <- intensityHeaderCols[this_column] | 356 break |
281 if (length(intensityHeaderCols) < intervalCol * i_count) break | 357 last_value <- intensity_header_cols[this_column] |
282 intensityCols <- c(intensityCols, intensityHeaderCols[this_column]) | 358 if (length(intensity_header_cols) < interval_col * i_count) |
283 } | 359 break |
284 | 360 intensity_cols <- |
285 startCol <- intensityCols[1] | 361 c(intensity_cols, intensity_header_cols[this_column]) |
286 numSamples <- i_count | 362 } |
287 | 363 |
288 outputfilename <- args$output | 364 start_col <- intensity_cols[1] |
289 enrichGraphFilename <- args$enrichGraph | 365 num_samples <- i_count |
290 locProbCutoffGraphFilename <- args$locProbCutoffGraph | 366 |
291 enrichGraphFilename_svg <- args$enrichGraph_svg | 367 output_filename <- args$output |
292 locProbCutoffGraphFilename_svg <- args$locProbCutoffGraph_svg | 368 enrich_graph_filename <- args$enrichGraph |
293 | 369 loc_prob_cutoff_graph_filename <- args$locProbCutoffGraph |
294 localProbCutoff <- args$localProbCutoff | 370 enrich_graph_filename_svg <- args$enrichGraph_svg |
371 loc_prob_cutoff_graph_fn_svg <- args$locProbCutoffGraph_svg | |
372 | |
373 local_prob_cutoff <- args$localProbCutoff | |
295 enriched <- args$enriched | 374 enriched <- args$enriched |
296 collapse_FUN <- args$collapse_func | 375 collapse_fn <- args$collapse_func |
297 | 376 |
298 ### EXTRACT PARAMETERS from arguments end ------------------------------------ | 377 ### EXTRACT PARAMETERS from arguments end ------------------------------------ |
299 | 378 |
300 | 379 |
301 # Proteomics Quality Control for MaxQuant Results | 380 # Proteomics Quality Control for MaxQuant Results |
302 # (Bielow C et al. J Proteome Res. 2016 PMID: 26653327) | 381 # (Bielow C et al. J Proteome Res. 2016 PMID: 26653327) |
303 # is run by the Galaxy MaxQuant wrapper and need not be invoked here. | 382 # is run by the Galaxy MaxQuant wrapper and need not be invoked here. |
304 | 383 |
305 | 384 |
306 # Read data, filtering out contaminants, reverse sequences, and localization probability | 385 # Read & filter out contaminants, reverse sequences, & localization probability |
307 # --- | 386 # --- |
308 fullData <- read.table(file = inputFilename, sep ="\t", header=T, quote="") | 387 full_data <- |
309 | 388 read.table( |
310 #Filter out contaminant rows and reverse rows | 389 file = input_file_name, |
311 filteredData <- subset(fullData,!grepl("CON__", Proteins)) | 390 sep = "\t", |
312 filteredData <- subset(filteredData,!grepl("_MYCOPLASMA", Proteins)) | 391 header = T, |
313 filteredData <- subset(filteredData,!grepl("CONTAMINANT_", Proteins)) | 392 quote = "" |
314 filteredData <- subset(filteredData,!grepl("REV__", Protein)) #since REV__ rows are blank in the first column (Proteins) | 393 ) |
315 write.table(filteredData, file = filteredFilename, sep = "\t", quote=FALSE, col.names=TRUE, row.names=FALSE) | 394 |
395 # Filter out contaminant rows and reverse rows | |
396 filtered_data <- subset(full_data, !grepl("CON__", Proteins)) | |
397 filtered_data <- | |
398 subset(filtered_data, !grepl("_MYCOPLASMA", Proteins)) | |
399 filtered_data <- | |
400 subset(filtered_data, !grepl("CONTAMINANT_", Proteins)) | |
401 filtered_data <- | |
402 subset(filtered_data, !grepl("REV__", Protein) | |
403 ) # since REV__ rows are blank in the first column (Proteins) | |
404 write.table( | |
405 filtered_data, | |
406 file = filtered_filename, | |
407 sep = "\t", | |
408 quote = FALSE, | |
409 col.names = TRUE, | |
410 row.names = FALSE | |
411 ) | |
316 # ... | 412 # ... |
317 | 413 |
318 | 414 |
319 # Filter out data with localization probability below localProbCutoff | 415 # Filter out data with localization probability below localProbCutoff |
320 # --- | 416 # --- |
321 #Data filtered by localization probability | 417 # Data filtered by localization probability |
322 locProbFilteredData <- filteredData[filteredData$Localization.prob>=localProbCutoff,] | 418 loc_prob_filtered_data <- |
419 filtered_data[ | |
420 filtered_data$Localization.prob >= local_prob_cutoff, | |
421 ] | |
323 # ... | 422 # ... |
324 | 423 |
325 | 424 |
326 # Localization probability -- visualize locprob cutoff | 425 # Localization probability -- visualize locprob cutoff |
327 # --- | 426 # --- |
328 locProbGraphData <- data.frame( | 427 loc_prob_graph_data <- |
329 group = c(paste(">",toString(localProbCutoff),sep=""), paste("<",toString(localProbCutoff),sep="")), | 428 data.frame( |
330 value = c(nrow(locProbFilteredData)/nrow(filteredData)*100, (nrow(filteredData)-nrow(locProbFilteredData))/nrow(filteredData)*100) | 429 group = c(paste(">", toString(local_prob_cutoff), sep = ""), |
331 ) | 430 paste("<", toString(local_prob_cutoff), sep = "")), |
431 value = c( | |
432 nrow(loc_prob_filtered_data) / nrow(filtered_data) * 100, | |
433 (nrow(filtered_data) - nrow(loc_prob_filtered_data)) | |
434 / nrow(filtered_data) * 100 | |
435 ) | |
436 ) | |
332 gigi <- | 437 gigi <- |
333 ggplot(locProbGraphData, aes(x = "", y = value, fill = group)) + | 438 ggplot(loc_prob_graph_data, aes(x = "", y = value, fill = group)) + |
334 geom_bar(width = 0.5, stat = "identity", color = "black") + | 439 geom_bar(width = 0.5, |
335 labs( | 440 stat = "identity", |
336 x = NULL | 441 color = "black") + |
337 , y = "percent" | 442 labs(x = NULL, |
338 , title = "Phosphopeptides partitioned by localization-probability cutoff" | 443 y = "percent", |
444 title = "Phosphopeptides partitioned by localization-probability cutoff" | |
339 ) + | 445 ) + |
340 scale_fill_discrete(name = "phosphopeptide\nlocalization-\nprobability") + | 446 scale_fill_discrete(name = "phosphopeptide\nlocalization-\nprobability") + |
341 theme_minimal() + | 447 theme_minimal() + |
342 theme( | 448 theme( |
343 legend.position = "right" | 449 legend.position = "right", |
344 , legend.title=element_text() | 450 legend.title = element_text(), |
345 , plot.title = element_text(hjust = 0.5) | 451 plot.title = element_text(hjust = 0.5), |
346 , plot.subtitle = element_text(hjust = 0.5) | 452 plot.subtitle = element_text(hjust = 0.5), |
347 , plot.title.position = "plot" | 453 plot.title.position = "plot" |
348 ) | 454 ) |
349 pdf(locProbCutoffGraphFilename) | 455 pdf(loc_prob_cutoff_graph_filename) |
350 print(gigi) | 456 print(gigi) |
351 dev.off() | 457 dev.off() |
352 svg(locProbCutoffGraphFilename_svg) | 458 svg(loc_prob_cutoff_graph_fn_svg) |
353 print(gigi) | 459 print(gigi) |
354 dev.off() | 460 dev.off() |
355 # ... | 461 # ... |
356 | 462 |
357 | 463 |
358 # Extract quantitative values from filtered data | 464 # Extract quantitative values from filtered data |
359 # --- | 465 # --- |
360 quantData <- locProbFilteredData[,seq(from=startCol, by=intervalCol, length.out=numSamples)] | 466 quant_data <- |
467 loc_prob_filtered_data[, seq(from = start_col, | |
468 by = interval_col, | |
469 length.out = num_samples)] | |
361 # ... | 470 # ... |
362 | 471 |
363 | 472 |
364 # Generate Phosphopeptide Sequence | 473 # Generate Phosphopeptide Sequence |
365 # for latest version of MaxQuant (Version 1.5.3.30) | 474 # for latest version of MaxQuant (Version 1.5.3.30) |
366 # --- | 475 # --- |
367 dataTable <- data.frame(locProbFilteredData[,1:8],locProbFilteredData[,phosphoCol],locProbFilteredData[,phosphoCol+1],locProbFilteredData[,phosphoCol+2],locProbFilteredData[,phosphoCol+3],locProbFilteredData[,phosphoCol+4],locProbFilteredData[,phosphoCol+5],locProbFilteredData[,phosphoCol+6],locProbFilteredData[,phosphoCol+7],quantData) | 476 metadata_df <- |
368 colnames(dataTable) <- c("Proteins","Positions within proteins", "Leading proteins", "Protein", "Protein names", "Gene names", "Fasta headers", "Localization prob", "Number of Phospho (STY)", "Amino Acid", "Sequence window","Modification window", "Peptide window coverage", "Phospho (STY) Probabilities", "Phospho (STY) Score diffs", "Position in peptide", colnames(quantData)) | 477 data.frame( |
369 # 'phosphopeptide_func' generates a phosphopeptide sequence for each row of data. | 478 loc_prob_filtered_data[, 1:8], |
370 # for the 'apply' function: MARGIN 1 == rows, 2 == columns, c(1,2) = both | 479 loc_prob_filtered_data[, phospho_col], |
371 dataTable$Phosphopeptide <- apply(X=dataTable, MARGIN=1, FUN=phosphopeptide_func) | 480 loc_prob_filtered_data[, phospho_col + 1], |
481 loc_prob_filtered_data[, phospho_col + 2], | |
482 loc_prob_filtered_data[, phospho_col + 3], | |
483 loc_prob_filtered_data[, phospho_col + 4], | |
484 loc_prob_filtered_data[, phospho_col + 5], | |
485 loc_prob_filtered_data[, phospho_col + 6], | |
486 loc_prob_filtered_data[, phospho_col + 7], | |
487 quant_data | |
488 ) | |
489 colnames(metadata_df) <- | |
490 c( | |
491 "Proteins", | |
492 "Positions within proteins", | |
493 "Leading proteins", | |
494 "Protein", | |
495 "Protein names", | |
496 "Gene names", | |
497 "Fasta headers", | |
498 "Localization prob", | |
499 "Number of Phospho (STY)", | |
500 "Amino Acid", | |
501 "Sequence window", | |
502 "Modification window", | |
503 "Peptide window coverage", | |
504 "Phospho (STY) Probabilities", | |
505 "Phospho (STY) Score diffs", | |
506 "Position in peptide", | |
507 colnames(quant_data) | |
508 ) | |
509 # 'phosphopeptide_func' generates a phosphopeptide sequence | |
510 # for each row of data. | |
511 # for the 'apply' function: MARGIN 1 == rows, 2 == columns, c(1, 2) = both | |
512 metadata_df$phosphopeptide <- | |
513 apply(X = metadata_df, MARGIN = 1, FUN = phosphopeptide_func) | |
514 colnames(metadata_df)[1] <- "Phosphopeptide" | |
372 # Move the quant data columns to the right end of the data.frame | 515 # Move the quant data columns to the right end of the data.frame |
373 dataTable <- movetolast(dataTable,c(colnames(quantData))) | 516 metadata_df <- movetolast(metadata_df, c(colnames(quant_data))) |
374 # ... | 517 # ... |
375 | 518 |
376 | 519 |
377 # Write quantitative values for debugging purposes | 520 # Write quantitative values for debugging purposes |
378 # --- | 521 # --- |
379 quantWrite <- cbind( dataTable[,"Sequence window"], quantData ) | 522 quant_write <- cbind(metadata_df[, "Sequence window"], quant_data) |
380 colnames(quantWrite)[1] <- "Sequence.Window" | 523 colnames(quant_write)[1] <- "Sequence.Window" |
381 write.table(quantWrite, file = quantFilename, sep = "\t", quote=FALSE, col.names=TRUE, row.names=FALSE) | 524 # ... |
382 # ... | 525 |
383 | 526 |
384 | 527 # Make new data frame containing only Phosphopeptides |
385 # Make new data frame containing only Phosphopeptides to be mapped to quant data (merge_df) | 528 # that are to be mapped to quant data (merge_df) |
386 # --- | 529 # --- |
387 dataTable <- setDT(dataTable, keep.rownames=TRUE) #row name will be used to map | 530 metadata_df <- |
388 merge_df <- data.frame(as.integer(dataTable$rn), dataTable$Phosphopeptide) #row index to merge data frames | 531 setDT(metadata_df, keep.rownames = TRUE) # row name will be used to map |
532 merge_df <- | |
533 data.frame( | |
534 as.integer(metadata_df$rn), | |
535 metadata_df$phosphopeptide # row index to merge data frames | |
536 ) | |
389 colnames(merge_df) <- c("rn", "Phosphopeptide") | 537 colnames(merge_df) <- c("rn", "Phosphopeptide") |
390 # ... | 538 # ... |
391 | 539 |
392 | 540 |
393 # Add Phosphopeptide column to quant columns for quality control checking | 541 # Add Phosphopeptide column to quant columns for quality control checking |
394 # --- | 542 # --- |
395 quantData_qc <- as.data.frame(quantData) | 543 quant_data_qc <- as.data.frame(quant_data) |
396 setDT(quantData_qc, keep.rownames=TRUE) #will use to match rowname to data | 544 setDT(quant_data_qc, keep.rownames = TRUE) # will use to match rowname to data |
397 quantData_qc$rn <- as.integer(quantData_qc$rn) | 545 quant_data_qc$rn <- as.integer(quant_data_qc$rn) |
398 quantData_qc <- merge(merge_df,quantData_qc, by="rn") | 546 quant_data_qc <- merge(merge_df, quant_data_qc, by = "rn") |
399 quantData_qc$rn <- NULL #remove rn column | 547 quant_data_qc$rn <- NULL # remove rn column |
400 # ... | 548 # ... |
401 | 549 |
402 | 550 |
403 # Collapse multiphosphorylated peptides | 551 # Collapse multiphosphorylated peptides |
404 # --- | 552 # --- |
405 quantData_qc_collapsed <- data.table(quantData_qc, key = "Phosphopeptide") | 553 quant_data_qc_collapsed <- |
406 quantData_qc_collapsed <- aggregate(. ~ Phosphopeptide,quantData_qc, FUN= collapse_FUN) | 554 data.table(quant_data_qc, key = "Phosphopeptide") |
407 # ... | 555 quant_data_qc_collapsed <- |
408 | 556 aggregate(. ~ Phosphopeptide, quant_data_qc, FUN = collapse_fn) |
409 | 557 # ... |
410 # Compute (as string) % of phosphopeptides that are multiphosphorylated (for use in next step) | 558 print("quant_data_qc_collapsed") |
411 # --- | 559 head(quant_data_qc_collapsed) |
412 pct_multiphos <- (nrow(quantData_qc) - nrow(quantData_qc_collapsed)) / (2 * nrow(quantData_qc)) | 560 |
561 # Compute (as string) % of phosphopeptides that are multiphosphorylated | |
562 # (for use in next step) | |
563 # --- | |
564 pct_multiphos <- | |
565 ( | |
566 nrow(quant_data_qc) - nrow(quant_data_qc_collapsed) | |
567 ) / (2 * nrow(quant_data_qc)) | |
413 pct_multiphos <- sprintf("%0.1f%s", 100 * pct_multiphos, "%") | 568 pct_multiphos <- sprintf("%0.1f%s", 100 * pct_multiphos, "%") |
414 # ... | 569 # ... |
415 | 570 |
571 write.table( | |
572 quant_data_qc_collapsed, | |
573 file = quant_file_name, | |
574 sep = "\t", | |
575 quote = FALSE, | |
576 col.names = TRUE, | |
577 row.names = FALSE | |
578 ) | |
416 | 579 |
417 # Compute and visualize breakdown of pY, pS, and pT before enrichment filter | 580 # Compute and visualize breakdown of pY, pS, and pT before enrichment filter |
418 # --- | 581 # --- |
419 pY_data <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pY"),] | 582 py_data <- |
420 pS_data <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pS"),] | 583 quant_data_qc_collapsed[ |
421 pT_data <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pT"),] | 584 str_detect(quant_data_qc_collapsed$Phosphopeptide, "pY"), |
422 | 585 ] |
423 pY_num <- nrow(pY_data) | 586 ps_data <- |
424 pS_num <- nrow(pS_data) | 587 quant_data_qc_collapsed[ |
425 pT_num <- nrow(pT_data) | 588 str_detect(quant_data_qc_collapsed$Phosphopeptide, "pS"), |
589 ] | |
590 pt_data <- | |
591 quant_data_qc_collapsed[ | |
592 str_detect(quant_data_qc_collapsed$Phosphopeptide, "pT"), | |
593 ] | |
594 | |
595 py_num <- nrow(py_data) | |
596 ps_num <- nrow(ps_data) | |
597 pt_num <- nrow(pt_data) | |
426 | 598 |
427 # Visualize enrichment | 599 # Visualize enrichment |
428 enrichGraphData <- data.frame( | 600 enrich_graph_data <- data.frame(group = c("pY", "pS", "pT"), |
429 group = c("pY", "pS", "pT"), | 601 value = c(py_num, ps_num, pt_num)) |
430 value = c(pY_num, pS_num, pT_num) | 602 |
431 ) | 603 enrich_graph_data <- |
432 | 604 enrich_graph_data[ |
433 enrichGraphData <- enrichGraphData[enrichGraphData$value > 0,] | 605 enrich_graph_data$value > 0, |
606 ] | |
434 | 607 |
435 # Plot pie chart with legend | 608 # Plot pie chart with legend |
436 # start: https://stackoverflow.com/a/62522478/15509512 | 609 # start: https://stackoverflow.com/a/62522478/15509512 |
437 # refine: https://www.statology.org/ggplot-pie-chart/ | 610 # refine: https://www.statology.org/ggplot-pie-chart/ |
438 # colors: https://colorbrewer2.org/#type=diverging&scheme=BrBG&n=8 | 611 # colors: https://colorbrewer2.org/#type=diverging&scheme=BrBG&n=8 |
439 slices <- enrichGraphData$value | 612 slices <- enrich_graph_data$value |
440 phosphoresidue <- enrichGraphData$group | 613 phosphoresidue <- enrich_graph_data$group |
441 pct <- round(100 * slices / sum(slices)) | 614 pct <- round(100 * slices / sum(slices)) |
442 lbls <- paste(enrichGraphData$group,"\n",pct, "%\n(", slices, ")", sep="") | 615 lbls <- |
616 paste(enrich_graph_data$group, "\n", pct, "%\n(", slices, ")", sep = "") | |
443 slc_ctr <- c() | 617 slc_ctr <- c() |
444 run_tot <- 0 | 618 run_tot <- 0 |
445 for (p in pct) { | 619 for (p in pct) { |
446 slc_ctr <- c(slc_ctr, run_tot + p/2.0) | 620 slc_ctr <- c(slc_ctr, run_tot + p / 2.0) |
447 run_tot <- run_tot + p | 621 run_tot <- run_tot + p |
448 } | 622 } |
449 lbl_y <- 100 - slc_ctr | 623 lbl_y <- 100 - slc_ctr |
450 df <- data.frame(slices, pct, lbls, phosphoresidue = factor(phosphoresidue, levels = phosphoresidue)) | 624 df <- |
451 gigi <- ggplot( | 625 data.frame(slices, |
452 df | 626 pct, |
453 , aes(x = 1, y = pct, fill = phosphoresidue)) + | 627 lbls, |
628 phosphoresidue = factor(phosphoresidue, levels = phosphoresidue)) | |
629 gigi <- ggplot(df | |
630 , aes(x = 1, y = pct, fill = phosphoresidue)) + | |
454 geom_col(position = "stack", orientation = "x") + | 631 geom_col(position = "stack", orientation = "x") + |
455 geom_text(aes(x = 1, y = lbl_y, label = lbls), col = "black") + | 632 geom_text(aes(x = 1, y = lbl_y, label = lbls), col = "black") + |
456 coord_polar(theta = "y", direction = -1) + | 633 coord_polar(theta = "y", direction = -1) + |
457 labs( | 634 labs( |
458 x = NULL | 635 x = NULL |
459 , y = NULL | 636 , |
460 , title = "Percentages (and counts) of phosphosites, by type of residue" | 637 y = NULL |
461 , caption = sprintf("Roughly %s of peptides have multiple phosphosites.", pct_multiphos) | 638 , |
639 title = "Percentages (and counts) of phosphosites, by type of residue" | |
640 , | |
641 caption = sprintf( | |
642 "Roughly %s of peptides have multiple phosphosites.", | |
643 pct_multiphos | |
644 ) | |
462 ) + | 645 ) + |
463 labs(x = NULL, y = NULL, fill = NULL) + | 646 labs(x = NULL, y = NULL, fill = NULL) + |
464 theme_classic() + | 647 theme_classic() + |
465 theme( legend.position="right" | 648 theme( |
466 , axis.line = element_blank() | 649 legend.position = "right" |
467 , axis.text = element_blank() | 650 , |
468 , axis.ticks = element_blank() | 651 axis.line = element_blank() |
469 , plot.title = element_text(hjust = 0.5) | 652 , |
470 , plot.subtitle = element_text(hjust = 0.5) | 653 axis.text = element_blank() |
471 , plot.caption = element_text(hjust = 0.5) | 654 , |
472 , plot.title.position = "plot" | 655 axis.ticks = element_blank() |
473 ) + | 656 , |
474 scale_fill_manual(breaks = phosphoresidue, values=c("#c7eae5", "#f6e8c3", "#dfc27d")) | 657 plot.title = element_text(hjust = 0.5) |
475 | 658 , |
476 pdf(enrichGraphFilename) | 659 plot.subtitle = element_text(hjust = 0.5) |
660 , | |
661 plot.caption = element_text(hjust = 0.5) | |
662 , | |
663 plot.title.position = "plot" | |
664 ) + | |
665 scale_fill_manual(breaks = phosphoresidue, | |
666 values = c("#c7eae5", "#f6e8c3", "#dfc27d")) | |
667 | |
668 pdf(enrich_graph_filename) | |
477 print(gigi) | 669 print(gigi) |
478 dev.off() | 670 dev.off() |
479 svg(enrichGraphFilename_svg) | 671 svg(enrich_graph_filename_svg) |
480 print(gigi) | 672 print(gigi) |
481 dev.off() | 673 dev.off() |
482 # ... | 674 # ... |
483 | 675 |
484 | 676 |
485 # Filter phosphopeptides by enrichment | 677 # Filter phosphopeptides by enrichment |
486 # -- | 678 # -- |
487 if (enriched == "Y"){ | 679 if (enriched == "Y") { |
488 quantData_qc_enrichment <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pY"),] | 680 quant_data_qc_enrichment <- quant_data_qc_collapsed[ |
489 } else if ( enriched == "ST" ) { | 681 str_detect(quant_data_qc_collapsed$Phosphopeptide, "pY"), |
490 quantData_qc_enrichment <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pS") | str_detect(quantData_qc_collapsed$Phosphopeptide, "pT"),] | 682 ] |
683 } else if (enriched == "ST") { | |
684 quant_data_qc_enrichment <- quant_data_qc_collapsed[ | |
685 str_detect(quant_data_qc_collapsed$Phosphopeptide, "pS") | | |
686 str_detect(quant_data_qc_collapsed$Phosphopeptide, "pT"), | |
687 ] | |
491 } else { | 688 } else { |
492 print("Error in enriched variable. Set to either 'Y' or 'ST'") | 689 print("Error in enriched variable. Set to either 'Y' or 'ST'") |
493 } | 690 } |
494 # ... | 691 # ... |
495 | 692 |
693 print("quant_data_qc_enrichment") | |
694 head(quant_data_qc_enrichment) | |
496 | 695 |
497 # Write phosphopeptides filtered by enrichment | 696 # Write phosphopeptides filtered by enrichment |
498 # -- | 697 # -- |
499 write.table(quantData_qc_enrichment, file=outputfilename, sep="\t", quote = FALSE, row.names = FALSE) | 698 #ACE colnames(quant_data_qc_enrichment)[1] <- "Phosphopeptide" |
500 # ... | 699 write.table( |
700 quant_data_qc_enrichment, | |
701 file = output_filename, | |
702 sep = "\t", | |
703 quote = FALSE, | |
704 row.names = FALSE | |
705 ) | |
706 # ... |