Mercurial > repos > eschen42 > mqppep_anova

diff MaxQuantProcessingScript.R @ 5:d4d531006735 draft
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 92e8ab6fc27a1f02583742715d644bc96418fbdf"
author: eschen42
date: Thu, 10 Mar 2022 23:42:48 +0000
parents: c1403d18c189
children: 922d309640db
--- a/MaxQuantProcessingScript.R	Mon Mar 07 20:43:57 2022 +0000
+++ b/MaxQuantProcessingScript.R	Thu Mar 10 23:42:48 2022 +0000
@@ -1,6 +1,6 @@
 #!/usr/bin/env Rscript
 
-# This is the implementation for the 
+# This is the implementation for the
 #   "MaxQuant Phosphopeptide Localization Probability Cutoff"
 #   Galaxy tool (mqppep_lclztn_filter)
 # It is adapted from the MaxQuant Processing Script written by Larry Cheng.
@@ -10,16 +10,14 @@
 library(data.table)
 library(stringr)
 library(ggplot2)
-#library(PTXQC)
-#require(PTXQC)
-#require(methods)
 
 # title: "MaxQuant Processing Script"
 # author: "Larry Cheng"
 # date: "February 19, 2018"
 #
 # # MaxQuant Processing Script
-# Takes MaxQuant Phospho (STY)sites.txt file as input and performs the following (in order):
+# Takes MaxQuant Phospho (STY)sites.txt file as input
+# and performs the following (in order):
 # 1) Runs the Proteomics Quality Control software
 # 2) Remove contaminant and reverse sequence rows
 # 3) Filters rows based on localization probability
@@ -27,24 +25,32 @@
 # 5) Sequences phosphopeptides
 # 6) Merges multiply phosphorylated peptides
 # 7) Filters out phosphopeptides based on enrichment
-# The output file contains the phosphopeptide (first column) and the quantitative values for each sample
+# The output file contains the phosphopeptide (first column)
+# and the quantitative values for each sample.
 #
 # ## Revision History
 # Rev. 2022-02-10 :wrap for inclusion in Galaxy
-# Rev. 2018-02-19 :break up analysis script into "MaxQuant Processing Script" and "Phosphopeptide Processing Script"
+# Rev. 2018-02-19 :break up analysis script into "MaxQuant Processing Script"
+#                  and "Phosphopeptide Processing Script"
 # Rev. 2017-12-12 :added PTXQC
 #                  added additional plots and table outputs for quality control
-#                  allowed for more than 2 samples to be grouped together (up to 26 (eg, 1A, 1B, 1C, etc))regexSampleNames <-
-#                  "\\.(\\d+)[A-Z]$"
-#                  converted from .r to .rmd file to knit report for quality control
-# Rev. 2016-09-11 :automated the FDR cutoffs; removed the option to data impute multiple times
-# Rev. 2016-09-09 :added filter to eliminate contaminant and reverse sequence rows
-# Rev. 2016-09-01 :moved the collapse step from after ANOVA filter to prior to preANOVA file output
-# Rev. 2016-08-22 :changed regexpression to regexSampleNames <- "\\.(\\d+)[AB]$" so that it looks at the end of string
+#                  allowed for more than 2 samples to be grouped together
+#                  (up to 26 (eg, 1A, 1B, 1C, etc))
+#                  converted from .r to .rmd file to knit report
+#                  for quality control
+# Rev. 2016-09-11 :automated the FDR cutoffs; removed the option to data
+#                  impute multiple times
+# Rev. 2016-09-09 :added filter to eliminate contaminant & reverse sequence rows
+# Rev. 2016-09-01 :moved the collapse step from after ANOVA filter to prior to
+#                  preANOVA file output
+# Rev. 2016-08-22 :use regexSampleNames <- "\\.(\\d + )[AB]$"
+#                  so that it looks at the end of string
 # Rev. 2016-08-05 :Removed vestigial line (ppeptides <- ....)
-# Rev. 2016-07-03 :Removed row names from the write.table() output for ANOVA and PreANOVA
+# Rev. 2016-07-03 :Removed row names from the write.table() output for
+#                  ANOVA and PreANOVA
 # Rev. 2016-06-25 :Set default Localization Probability cutoff to 0.75
-# Rev. 2016-06-23 :fixed a bug in filtering for pY enrichment by resetting the row numbers afterwards
+# Rev. 2016-06-23 :fixed a bug in filtering for pY enrichment by resetting
+#                  the row numbers afterwards
 # Rev. 2016-06-21 :test18 + standardized the regexpression in protocol
 
 
@@ -52,9 +58,9 @@
 
 # Read first line of file at filePath
 # adapted from: https://stackoverflow.com/a/35761217/15509512
-readFirstLine <- function(filepath) {
-  con = file(filepath, "r")
-  line = readLines(con, n = 1)
+read_first_line <- function(filepath) {
+  con <- file(filepath, "r")
+  line <- readLines(con, n = 1)
   close(con)
   return(line)
 }
@@ -68,28 +74,57 @@
 
 # Generate phosphopeptide and build list when applied
 phosphopeptide_func <- function(df) {
-
-  #generate peptide sequence and list of phosphopositions
-  phosphoprobsequence <- strsplit(as.character(df["Phospho (STY) Score diffs"]), "")[[1]]
+  # generate peptide sequence and list of phosphopositions
+  phosphoprobsequence <-
+    strsplit(as.character(df["Phospho (STY) Score diffs"]), "")[[1]]
   output <- vector()
   phosphopeptide <- ""
-  counter <- 0 #keep track of position in peptide
-  phosphopositions <- vector() #keep track of phosphorylation positions in peptide
+  counter <- 0 # keep track of position in peptide
+  phosphopositions <-
+    vector() # keep track of phosphorylation positions in peptide
   score_diff <- ""
-  for (chara in phosphoprobsequence){
-    #build peptide sequence
-    if (!(chara == " " | chara == "(" | chara == ")" | chara =="." | chara =="-" | chara == "0" | chara == "1" | chara == "2" | chara == "3" | chara =="4" | chara == "5" | chara == "6" | chara == "7" | chara =="8" | chara =="9")) {
-      phosphopeptide <- paste(phosphopeptide,chara,sep="")
+  for (chara in phosphoprobsequence) {
+    # build peptide sequence
+    if (!(
+      chara == " " |
+      chara == "(" |
+      chara == ")" |
+      chara == "." |
+      chara == "-" |
+      chara == "0" |
+      chara == "1" |
+      chara == "2" |
+      chara == "3" |
+      chara == "4" |
+      chara == "5" |
+      chara == "6" |
+      chara == "7" |
+      chara == "8" |
+      chara == "9")
+    ) {
+      phosphopeptide <- paste(phosphopeptide, chara, sep = "")
       counter <- counter + 1
     }
-    #generate score_diff
-    if (chara == "-" | chara =="." | chara == "0" | chara == "1" | chara == "2" | chara == "3" | chara =="4" | chara == "5" | chara == "6" | chara == "7" | chara =="8" | chara =="9"){
-      score_diff <- paste(score_diff,chara,sep="")
+    # generate score_diff
+    if (chara == "-" |
+        chara == "." |
+        chara == "0" |
+        chara == "1" |
+        chara == "2" |
+        chara == "3" |
+        chara == "4" |
+        chara == "5" |
+        chara == "6" |
+        chara == "7" |
+        chara == "8" |
+        chara == "9"
+    ) {
+      score_diff <- paste(score_diff, chara, sep = "")
     }
-    #evaluate score_diff
-    if (chara == ")" ){
+    # evaluate score_diff
+    if (chara == ")") {
       score_diff <- as.numeric(score_diff)
-      #only consider a phosphoresidue if score_diff > 0
+      # only consider a phosphoresidue if score_diff > 0
       if (score_diff > 0) {
         phosphopositions <- append(phosphopositions, counter)
       }
@@ -97,19 +132,36 @@
     }
   }
 
-  #generate phosphopeptide sequence (ie, peptide sequence with "p"'s)
+  # generate phosphopeptide sequence (ie, peptide sequence with "p"'s)
   counter <- 1
-  phosphoposition_correction1 <- -1 #used to correct phosphosposition as "p"'s are inserted into the phosphopeptide string
-  phosphoposition_correction2 <- 0 #used to correct phosphosposition as "p"'s are inserted into the phosphopeptide string
-  while (counter <= length(phosphopositions) ) {
-    phosphopeptide <- paste(substr(phosphopeptide,0,phosphopositions[counter]+phosphoposition_correction1),"p",substr(phosphopeptide,phosphopositions[counter]+phosphoposition_correction2,nchar(phosphopeptide)),sep="")
+  phosphoposition_correction1 <-
+    -1 # used to correct phosphosposition as "p"'s
+       #  are inserted into the phosphopeptide string
+  phosphoposition_correction2 <-
+    0  # used to correct phosphosposition as "p"'s
+       #   are inserted into the phosphopeptide string
+  while (counter <= length(phosphopositions)) {
+    phosphopeptide <-
+      paste(
+        substr(
+          phosphopeptide,
+          0,
+          phosphopositions[counter] + phosphoposition_correction1
+        ),
+        "p",
+        substr(
+          phosphopeptide,
+          phosphopositions[counter] + phosphoposition_correction2,
+          nchar(phosphopeptide)
+        ),
+        sep = ""
+      )
     counter <- counter + 1
     phosphoposition_correction1 <- phosphoposition_correction1 + 1
     phosphoposition_correction2 <- phosphoposition_correction2 + 1
   }
-
-  #building phosphopeptide list
-  output <- append(output,phosphopeptide)
+  # building phosphopeptide list
+  output <- append(output, phosphopeptide)
   return(output)
 }
 
@@ -126,93 +178,110 @@
     type = "character",
     help = "A MaxQuant Phospho (STY)Sites.txt"
   )
-,  make_option(
+  ,
+  make_option(
     c("-o", "--output"),
     action = "store",
     type = "character",
     help = "path to output file"
   )
-, make_option(
+  ,
+  make_option(
     c("-E", "--enrichGraph"),
     action = "store",
     type = "character",
     help = "path to enrichment graph PDF"
   )
-, make_option(
+  ,
+  make_option(
     c("-F", "--enrichGraph_svg"),
     action = "store",
     type = "character",
     help = "path to enrichment graph SVG"
   )
-, make_option(
+  ,
+  make_option(
     c("-L", "--locProbCutoffGraph"),
     action = "store",
     type = "character",
     help = "path to location-proability cutoff graph PDF"
   )
-, make_option(
+  ,
+  make_option(
     c("-M", "--locProbCutoffGraph_svg"),
     action = "store",
     type = "character",
     help = "path to location-proability cutoff graph SVG"
   )
-, make_option(
+  ,
+  make_option(
     c("-e", "--enriched"),
     action = "store",
     type = "character",
     help = "pY or pST enriched samples (ie, 'Y' or 'ST')"
   )
   # default = "^Number of Phospho [(]STY[)]$",
-, make_option(
+  ,
+  make_option(
     c("-p", "--phosphoCol"),
     action = "store",
     type = "character",
-    help = "PERL-compatible regular expression matching header of column having number of 'Phospho (STY)'"
+    help = paste0("PERL-compatible regular expression matching",
+             " header of column having number of 'Phospho (STY)'")
   )
   # default = "^Intensity[^_]",
-, make_option(
+  ,
+  make_option(
     c("-s", "--startCol"),
     action = "store",
     type = "character",
-    help = "PERL-compatible regular expression matching column header having first sample intensity"
+    help = paste0("PERL-compatible regular expression matching",
+             " header of column having first sample intensity")
   )
   # default = 1,
-, make_option(
+  ,
+  make_option(
     c("-I", "--intervalCol"),
     action = "store",
     type = "integer",
-    help = "Column interval between the Intensities of samples (eg, 1 if subsequent column; 2 if every other column"
+    help = paste0("Column interval between the Intensities of samples",
+             " (eg, 1 if subsequent column; 2 if every other column")
   )
   # default = 0.75,
-, make_option(
+  ,
+  make_option(
     c("-l", "--localProbCutoff"),
     action = "store",
     type = "double",
     help = "Localization Probability Cutoff"
   )
   # default = "sum",
-, make_option(
+  ,
+  make_option(
     c("-f", "--collapse_func"),
     action = "store",
     type = "character",
-    help = "merge identical phosphopeptides by ('sum' or 'average') the intensities"
+    help = paste0("merge identical phosphopeptides",
+             " by ('sum' or 'average') the intensities")
   )
-  # default = "filteredData.txt",
-, make_option(
+  # default = "filtered_data.txt",
+  ,
+  make_option(
     c("-r", "--filtered_data"),
     action = "store",
     type = "character",
-    help = "filteredData.txt"
+    help = "filtered_data.txt"
   )
   # default = "quantData.txt",
-, make_option(
+  ,
+  make_option(
     c("-q", "--quant_data"),
     action = "store",
     type = "character",
     help = "quantData.txt"
   )
 )
-args <- parse_args(OptionParser(option_list=option_list))
+args <- parse_args(OptionParser(option_list = option_list))
 # Check parameter values
 
 ### EXTRACT ARGUMENTS end ----------------------------------------------------
@@ -220,80 +289,90 @@
 
 ### EXTRACT PARAMETERS from arguments begin ----------------------------------
 
-if (! file.exists(args$input)) {
+if (!file.exists(args$input)) {
   stop((paste("File", args$input, "does not exist")))
 }
 
-phosphoColPattern <- "^Number of Phospho [(][STY][STY]*[)]$"
-startColPattern <- "^Intensity[^_]"
-phosphoColPattern <- readFirstLine(args$phosphoCol)
-startColPattern <- readFirstLine(args$startCol)
+phospho_col_pattern <- "^Number of Phospho [(][STY][STY]*[)]$"
+start_col_pattern <- "^Intensity[^_]"
+phospho_col_pattern <- read_first_line(args$phosphoCol)
+start_col_pattern <- read_first_line(args$startCol)
 
 sink(getConnection(2))
-#ACE print(paste("phosphoColPattern", phosphoColPattern))
-#ACE print(paste("startColPattern", startColPattern))
+
+input_file_name <- args$input
+filtered_filename <- args$filtered_data
+quant_file_name <- args$quant_data
+interval_col <- as.integer(args$intervalCol)
 
-inputFilename <- args$input
-filteredFilename <- args$filtered_data
-quantFilename <- args$quant_data
-intervalCol <- as.integer(args$intervalCol)
-
-firstLine <- readFirstLine(inputFilename)
-columnHeaders <- unlist(strsplit(x=firstLine, split=c('\t'), fixed=TRUE))
+first_line <- read_first_line(input_file_name)
+col_headers <-
+  unlist(strsplit(
+    x = first_line,
+    split = c("\t"),
+    fixed = TRUE
+  ))
 sink(getConnection(2))
-#ACE print("columnHeaders")
-#ACE print(columnHeaders)
 sink()
 
 
-intensityHeaderCols <- grep(pattern=startColPattern, x=columnHeaders, perl=TRUE)
-if ( length(intensityHeaderCols) == 0) {
-    err_msg <- paste("Found no intensity columns matching pattern:", startColPattern)
-    # Divert output to stderr
-    sink(getConnection(2))
-    print(err_msg)
-    sink()
-    stop(err_msg)
-    }
+intensity_header_cols <-
+  grep(pattern = start_col_pattern, x = col_headers, perl = TRUE)
+if (length(intensity_header_cols) == 0) {
+  err_msg <-
+    paste("Found no intensity columns matching pattern:",
+          start_col_pattern)
+  # Divert output to stderr
+  sink(getConnection(2))
+  print(err_msg)
+  sink()
+  stop(err_msg)
+}
 
 
-phosphoCol <- grep(pattern=phosphoColPattern, x=columnHeaders, perl=TRUE)[1]
-if (is.na(phosphoCol)) {
-    err_msg <- paste("Found no 'number of phospho sites' columns matching pattern:", phosphoColPattern)
-    # Divert output to stderr
-    sink(getConnection(2))
-    print(err_msg)
-    sink()
-    stop(err_msg)
-    }
+phospho_col <-
+  grep(pattern = phospho_col_pattern, x = col_headers, perl = TRUE)[1]
+if (is.na(phospho_col)) {
+  err_msg <-
+    paste("Found no 'number of phospho sites' columns matching pattern:",
+          phospho_col_pattern)
+  # Divert output to stderr
+  sink(getConnection(2))
+  print(err_msg)
+  sink()
+  stop(err_msg)
+}
 
 
 i_count <- 0
 this_column <- 1
-last_value <- intensityHeaderCols[1]
-intensityCols <- c(last_value)
-
-while ( length(intensityHeaderCols) >= intervalCol * i_count ) {
-  i_count <- 1 + i_count
-  this_column <- intervalCol + this_column
-  if ( last_value + intervalCol != intensityHeaderCols[this_column] ) break
-  last_value <- intensityHeaderCols[this_column]
-  if (length(intensityHeaderCols) < intervalCol * i_count) break
-  intensityCols <- c(intensityCols, intensityHeaderCols[this_column])
-  }
+last_value <- intensity_header_cols[1]
+intensity_cols <- c(last_value)
 
-startCol <- intensityCols[1]
-numSamples <- i_count
+while (length(intensity_header_cols) >= interval_col * i_count) {
+  i_count <- 1 + i_count
+  this_column <- interval_col + this_column
+  if (last_value + interval_col != intensity_header_cols[this_column])
+    break
+  last_value <- intensity_header_cols[this_column]
+  if (length(intensity_header_cols) < interval_col * i_count)
+    break
+  intensity_cols <-
+    c(intensity_cols, intensity_header_cols[this_column])
+}
 
-outputfilename <- args$output
-enrichGraphFilename <- args$enrichGraph
-locProbCutoffGraphFilename <- args$locProbCutoffGraph
-enrichGraphFilename_svg <- args$enrichGraph_svg
-locProbCutoffGraphFilename_svg <- args$locProbCutoffGraph_svg
+start_col <- intensity_cols[1]
+num_samples <- i_count
 
-localProbCutoff <- args$localProbCutoff
+output_filename <- args$output
+enrich_graph_filename <- args$enrichGraph
+loc_prob_cutoff_graph_filename <- args$locProbCutoffGraph
+enrich_graph_filename_svg <- args$enrichGraph_svg
+loc_prob_cutoff_graph_fn_svg <- args$locProbCutoffGraph_svg
+
+local_prob_cutoff <- args$localProbCutoff
 enriched <- args$enriched
-collapse_FUN <- args$collapse_func
+collapse_fn <- args$collapse_func
 
 ### EXTRACT PARAMETERS from arguments end ------------------------------------
 
@@ -303,53 +382,80 @@
 # is run by the Galaxy MaxQuant wrapper and need not be invoked here.
 
 
-# Read data, filtering out contaminants, reverse sequences, and localization probability
+# Read & filter out contaminants, reverse sequences, & localization probability
 # ---
-fullData <- read.table(file = inputFilename, sep ="\t", header=T, quote="")
+full_data <-
+  read.table(
+    file = input_file_name,
+    sep = "\t",
+    header = T,
+    quote = ""
+  )
 
-#Filter out contaminant rows and reverse rows
-filteredData <- subset(fullData,!grepl("CON__", Proteins))
-filteredData <- subset(filteredData,!grepl("_MYCOPLASMA", Proteins))
-filteredData <- subset(filteredData,!grepl("CONTAMINANT_", Proteins))
-filteredData <- subset(filteredData,!grepl("REV__", Protein)) #since REV__ rows are blank in the first column (Proteins)
-write.table(filteredData, file = filteredFilename, sep = "\t", quote=FALSE, col.names=TRUE, row.names=FALSE)
+# Filter out contaminant rows and reverse rows
+filtered_data <- subset(full_data, !grepl("CON__", Proteins))
+filtered_data <-
+  subset(filtered_data, !grepl("_MYCOPLASMA", Proteins))
+filtered_data <-
+  subset(filtered_data, !grepl("CONTAMINANT_", Proteins))
+filtered_data <-
+  subset(filtered_data, !grepl("REV__", Protein)
+         ) # since REV__ rows are blank in the first column (Proteins)
+write.table(
+  filtered_data,
+  file = filtered_filename,
+  sep = "\t",
+  quote = FALSE,
+  col.names = TRUE,
+  row.names = FALSE
+)
 # ...
 
 
 # Filter out data with localization probability below localProbCutoff
 # ---
-#Data filtered by localization probability
-locProbFilteredData <- filteredData[filteredData$Localization.prob>=localProbCutoff,]
+# Data filtered by localization probability
+loc_prob_filtered_data <-
+  filtered_data[
+    filtered_data$Localization.prob >= local_prob_cutoff,
+    ]
 # ...
 
 
 # Localization probability -- visualize locprob cutoff
 # ---
-locProbGraphData <- data.frame(
-  group = c(paste(">",toString(localProbCutoff),sep=""), paste("<",toString(localProbCutoff),sep="")),
-  value = c(nrow(locProbFilteredData)/nrow(filteredData)*100, (nrow(filteredData)-nrow(locProbFilteredData))/nrow(filteredData)*100)
-)
+loc_prob_graph_data <-
+  data.frame(
+    group = c(paste(">", toString(local_prob_cutoff), sep = ""),
+              paste("<", toString(local_prob_cutoff), sep = "")),
+    value = c(
+      nrow(loc_prob_filtered_data) / nrow(filtered_data) * 100,
+      (nrow(filtered_data) - nrow(loc_prob_filtered_data))
+        / nrow(filtered_data) * 100
+    )
+  )
 gigi <-
-  ggplot(locProbGraphData, aes(x = "", y = value, fill = group)) +
-  geom_bar(width = 0.5, stat = "identity", color = "black") +
-  labs(
-    x = NULL
-  , y = "percent"
-  , title = "Phosphopeptides partitioned by localization-probability cutoff"
+  ggplot(loc_prob_graph_data, aes(x = "", y = value, fill = group)) +
+  geom_bar(width = 0.5,
+           stat = "identity",
+           color = "black") +
+  labs(x = NULL,
+    y = "percent",
+    title = "Phosphopeptides partitioned by localization-probability cutoff"
   ) +
   scale_fill_discrete(name = "phosphopeptide\nlocalization-\nprobability") +
   theme_minimal() +
   theme(
-         legend.position = "right"
-       , legend.title=element_text()
-       , plot.title = element_text(hjust = 0.5)
-       , plot.subtitle = element_text(hjust = 0.5)
-       , plot.title.position = "plot"
-       )
-pdf(locProbCutoffGraphFilename)
+    legend.position = "right",
+    legend.title = element_text(),
+    plot.title = element_text(hjust = 0.5),
+    plot.subtitle = element_text(hjust = 0.5),
+    plot.title.position = "plot"
+  )
+pdf(loc_prob_cutoff_graph_filename)
 print(gigi)
 dev.off()
-svg(locProbCutoffGraphFilename_svg)
+svg(loc_prob_cutoff_graph_fn_svg)
 print(gigi)
 dev.off()
 # ...
@@ -357,126 +463,212 @@
 
 # Extract quantitative values from filtered data
 # ---
-quantData <- locProbFilteredData[,seq(from=startCol, by=intervalCol, length.out=numSamples)]
+quant_data <-
+  loc_prob_filtered_data[, seq(from = start_col,
+                               by = interval_col,
+                               length.out = num_samples)]
 # ...
 
 
 # Generate Phosphopeptide Sequence
 #   for latest version of MaxQuant (Version 1.5.3.30)
 # ---
-dataTable <- data.frame(locProbFilteredData[,1:8],locProbFilteredData[,phosphoCol],locProbFilteredData[,phosphoCol+1],locProbFilteredData[,phosphoCol+2],locProbFilteredData[,phosphoCol+3],locProbFilteredData[,phosphoCol+4],locProbFilteredData[,phosphoCol+5],locProbFilteredData[,phosphoCol+6],locProbFilteredData[,phosphoCol+7],quantData)
-colnames(dataTable) <- c("Proteins","Positions within proteins", "Leading proteins", "Protein", "Protein names", "Gene names", "Fasta headers", "Localization prob", "Number of Phospho (STY)", "Amino Acid", "Sequence window","Modification window", "Peptide window coverage", "Phospho (STY) Probabilities", "Phospho (STY) Score diffs", "Position in peptide", colnames(quantData))
-# 'phosphopeptide_func' generates a phosphopeptide sequence for each row of data.
-#   for the 'apply' function: MARGIN 1 == rows, 2 == columns, c(1,2) = both
-dataTable$Phosphopeptide <- apply(X=dataTable, MARGIN=1, FUN=phosphopeptide_func)
+metadata_df <-
+  data.frame(
+    loc_prob_filtered_data[, 1:8],
+    loc_prob_filtered_data[, phospho_col],
+    loc_prob_filtered_data[, phospho_col + 1],
+    loc_prob_filtered_data[, phospho_col + 2],
+    loc_prob_filtered_data[, phospho_col + 3],
+    loc_prob_filtered_data[, phospho_col + 4],
+    loc_prob_filtered_data[, phospho_col + 5],
+    loc_prob_filtered_data[, phospho_col + 6],
+    loc_prob_filtered_data[, phospho_col + 7],
+    quant_data
+  )
+colnames(metadata_df) <-
+  c(
+    "Proteins",
+    "Positions within proteins",
+    "Leading proteins",
+    "Protein",
+    "Protein names",
+    "Gene names",
+    "Fasta headers",
+    "Localization prob",
+    "Number of Phospho (STY)",
+    "Amino Acid",
+    "Sequence window",
+    "Modification window",
+    "Peptide window coverage",
+    "Phospho (STY) Probabilities",
+    "Phospho (STY) Score diffs",
+    "Position in peptide",
+    colnames(quant_data)
+  )
+# 'phosphopeptide_func' generates a phosphopeptide sequence
+#   for each row of data.
+# for the 'apply' function: MARGIN 1 == rows, 2 == columns, c(1, 2) = both
+metadata_df$phosphopeptide <-
+  apply(X = metadata_df, MARGIN = 1, FUN = phosphopeptide_func)
+colnames(metadata_df)[1] <- "Phosphopeptide"
 # Move the quant data columns to the right end of the data.frame
-dataTable <- movetolast(dataTable,c(colnames(quantData)))
+metadata_df <- movetolast(metadata_df, c(colnames(quant_data)))
 # ...
 
 
 # Write quantitative values for debugging purposes
 # ---
-quantWrite <- cbind( dataTable[,"Sequence window"], quantData ) 
-colnames(quantWrite)[1] <- "Sequence.Window"
-write.table(quantWrite, file = quantFilename, sep = "\t", quote=FALSE, col.names=TRUE, row.names=FALSE)
+quant_write <- cbind(metadata_df[, "Sequence window"], quant_data)
+colnames(quant_write)[1] <- "Sequence.Window"
 # ...
 
 
-# Make new data frame containing only Phosphopeptides to be mapped to quant data (merge_df)
+# Make new data frame containing only Phosphopeptides
+#   that are to be mapped to quant data (merge_df)
 # ---
-dataTable <- setDT(dataTable, keep.rownames=TRUE) #row name will be used to map
-merge_df <- data.frame(as.integer(dataTable$rn), dataTable$Phosphopeptide) #row index to merge data frames
+metadata_df <-
+  setDT(metadata_df, keep.rownames = TRUE) # row name will be used to map
+merge_df <-
+  data.frame(
+    as.integer(metadata_df$rn),
+    metadata_df$phosphopeptide # row index to merge data frames
+    )
 colnames(merge_df) <- c("rn", "Phosphopeptide")
 # ...
 
 
 # Add Phosphopeptide column to quant columns for quality control checking
 # ---
-quantData_qc <- as.data.frame(quantData)
-setDT(quantData_qc, keep.rownames=TRUE) #will use to match rowname to data
-quantData_qc$rn <- as.integer(quantData_qc$rn)
-quantData_qc <- merge(merge_df,quantData_qc, by="rn")
-quantData_qc$rn <- NULL #remove rn column
+quant_data_qc <- as.data.frame(quant_data)
+setDT(quant_data_qc, keep.rownames = TRUE) # will use to match rowname to data
+quant_data_qc$rn <- as.integer(quant_data_qc$rn)
+quant_data_qc <- merge(merge_df, quant_data_qc, by = "rn")
+quant_data_qc$rn <- NULL # remove rn column
 # ...
 
 
 # Collapse multiphosphorylated peptides
 # ---
-quantData_qc_collapsed <- data.table(quantData_qc, key = "Phosphopeptide")
-quantData_qc_collapsed <- aggregate(. ~ Phosphopeptide,quantData_qc, FUN= collapse_FUN)
+quant_data_qc_collapsed <-
+  data.table(quant_data_qc, key = "Phosphopeptide")
+quant_data_qc_collapsed <-
+  aggregate(. ~ Phosphopeptide, quant_data_qc, FUN = collapse_fn)
 # ...
+print("quant_data_qc_collapsed")
+head(quant_data_qc_collapsed)
 
-
-# Compute (as string) % of phosphopeptides that are multiphosphorylated (for use in next step)
+# Compute (as string) % of phosphopeptides that are multiphosphorylated
+#   (for use in next step)
 # ---
-pct_multiphos <- (nrow(quantData_qc) - nrow(quantData_qc_collapsed)) / (2 * nrow(quantData_qc))
+pct_multiphos <-
+  (
+    nrow(quant_data_qc) - nrow(quant_data_qc_collapsed)
+  ) / (2 * nrow(quant_data_qc))
 pct_multiphos <- sprintf("%0.1f%s", 100 * pct_multiphos, "%")
 # ...
 
+write.table(
+  quant_data_qc_collapsed,
+  file = quant_file_name,
+  sep = "\t",
+  quote = FALSE,
+  col.names = TRUE,
+  row.names = FALSE
+)
 
 # Compute and visualize breakdown of pY, pS, and pT before enrichment filter
 # ---
-pY_data <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pY"),]
-pS_data <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pS"),]
-pT_data <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pT"),]
+py_data <-
+  quant_data_qc_collapsed[
+    str_detect(quant_data_qc_collapsed$Phosphopeptide, "pY"),
+    ]
+ps_data <-
+  quant_data_qc_collapsed[
+    str_detect(quant_data_qc_collapsed$Phosphopeptide, "pS"),
+    ]
+pt_data <-
+  quant_data_qc_collapsed[
+     str_detect(quant_data_qc_collapsed$Phosphopeptide, "pT"),
+     ]
 
-pY_num <- nrow(pY_data)
-pS_num <- nrow(pS_data)
-pT_num <- nrow(pT_data)
+py_num <- nrow(py_data)
+ps_num <- nrow(ps_data)
+pt_num <- nrow(pt_data)
 
 # Visualize enrichment
-enrichGraphData <- data.frame(
-  group = c("pY", "pS", "pT"),
-  value = c(pY_num, pS_num, pT_num)
-)
+enrich_graph_data <- data.frame(group = c("pY", "pS", "pT"),
+                                value = c(py_num, ps_num, pt_num))
 
-enrichGraphData <- enrichGraphData[enrichGraphData$value > 0,]
+enrich_graph_data <-
+  enrich_graph_data[
+    enrich_graph_data$value > 0,
+    ]
 
 # Plot pie chart with legend
 # start: https://stackoverflow.com/a/62522478/15509512
 # refine: https://www.statology.org/ggplot-pie-chart/
 # colors: https://colorbrewer2.org/#type=diverging&scheme=BrBG&n=8
-slices <- enrichGraphData$value
-phosphoresidue <- enrichGraphData$group
+slices <- enrich_graph_data$value
+phosphoresidue <- enrich_graph_data$group
 pct    <- round(100 * slices / sum(slices))
-lbls   <- paste(enrichGraphData$group,"\n",pct, "%\n(", slices, ")", sep="")
+lbls   <-
+  paste(enrich_graph_data$group, "\n", pct, "%\n(", slices, ")", sep = "")
 slc_ctr <- c()
 run_tot <- 0
 for (p in pct) {
-  slc_ctr <- c(slc_ctr, run_tot + p/2.0)
+  slc_ctr <- c(slc_ctr, run_tot + p / 2.0)
   run_tot <- run_tot + p
 }
 lbl_y  <- 100 - slc_ctr
-df     <- data.frame(slices, pct, lbls, phosphoresidue = factor(phosphoresidue, levels = phosphoresidue))
-gigi <- ggplot(
-  df
-, aes(x = 1, y = pct, fill = phosphoresidue)) +
+df     <-
+  data.frame(slices,
+             pct,
+             lbls,
+             phosphoresidue = factor(phosphoresidue, levels = phosphoresidue))
+gigi <- ggplot(df
+               , aes(x = 1, y = pct, fill = phosphoresidue)) +
   geom_col(position = "stack", orientation = "x") +
   geom_text(aes(x = 1, y = lbl_y, label = lbls), col = "black") +
   coord_polar(theta = "y", direction = -1) +
   labs(
     x = NULL
-  , y = NULL
-  , title = "Percentages (and counts) of phosphosites, by type of residue"
-  , caption = sprintf("Roughly %s of peptides have multiple phosphosites.", pct_multiphos)
+    ,
+    y = NULL
+    ,
+    title = "Percentages (and counts) of phosphosites, by type of residue"
+    ,
+    caption = sprintf(
+      "Roughly %s of peptides have multiple phosphosites.",
+      pct_multiphos
+    )
   ) +
   labs(x = NULL, y = NULL, fill = NULL) +
   theme_classic() +
-  theme( legend.position="right"
-       , axis.line = element_blank()
-       , axis.text = element_blank()
-       , axis.ticks = element_blank()
-       , plot.title = element_text(hjust = 0.5)
-       , plot.subtitle = element_text(hjust = 0.5)
-       , plot.caption = element_text(hjust = 0.5)
-       , plot.title.position = "plot"
-       ) +
-  scale_fill_manual(breaks = phosphoresidue, values=c("#c7eae5", "#f6e8c3", "#dfc27d"))
+  theme(
+    legend.position = "right"
+    ,
+    axis.line = element_blank()
+    ,
+    axis.text = element_blank()
+    ,
+    axis.ticks = element_blank()
+    ,
+    plot.title = element_text(hjust = 0.5)
+    ,
+    plot.subtitle = element_text(hjust = 0.5)
+    ,
+    plot.caption = element_text(hjust = 0.5)
+    ,
+    plot.title.position = "plot"
+  ) +
+  scale_fill_manual(breaks = phosphoresidue,
+                    values = c("#c7eae5", "#f6e8c3", "#dfc27d"))
 
-pdf(enrichGraphFilename)
+pdf(enrich_graph_filename)
 print(gigi)
 dev.off()
-svg(enrichGraphFilename_svg)
+svg(enrich_graph_filename_svg)
 print(gigi)
 dev.off()
 # ...
@@ -484,17 +676,31 @@
 
 # Filter phosphopeptides by enrichment
 # --
-if (enriched == "Y"){
-  quantData_qc_enrichment <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pY"),]
-} else if ( enriched == "ST" ) {
-  quantData_qc_enrichment <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pS") | str_detect(quantData_qc_collapsed$Phosphopeptide, "pT"),]
+if (enriched == "Y") {
+  quant_data_qc_enrichment <- quant_data_qc_collapsed[
+    str_detect(quant_data_qc_collapsed$Phosphopeptide, "pY"),
+    ]
+} else if (enriched == "ST") {
+  quant_data_qc_enrichment <- quant_data_qc_collapsed[
+    str_detect(quant_data_qc_collapsed$Phosphopeptide, "pS") |
+    str_detect(quant_data_qc_collapsed$Phosphopeptide, "pT"),
+    ]
 } else {
   print("Error in enriched variable. Set to either 'Y' or 'ST'")
 }
 # ...
 
+print("quant_data_qc_enrichment")
+head(quant_data_qc_enrichment)
 
 # Write phosphopeptides filtered by enrichment
 # --
-write.table(quantData_qc_enrichment, file=outputfilename, sep="\t", quote = FALSE, row.names = FALSE)
+#ACE colnames(quant_data_qc_enrichment)[1] <- "Phosphopeptide"
+write.table(
+  quant_data_qc_enrichment,
+  file = output_filename,
+  sep = "\t",
+  quote = FALSE,
+  row.names = FALSE
+)
 # ...
author	eschen42
date	Thu, 10 Mar 2022 23:42:48 +0000
parents	c1403d18c189
children	922d309640db