Mercurial > repos > eschen42 > mqppep_anova

--- a/MaxQuantProcessingScript.R	Mon Mar 07 20:43:57 2022 +0000
+++ b/MaxQuantProcessingScript.R	Thu Mar 10 23:42:48 2022 +0000
@@ -1,6 +1,6 @@
 #!/usr/bin/env Rscript

-# This is the implementation for the
+# This is the implementation for the
 #   "MaxQuant Phosphopeptide Localization Probability Cutoff"
 #   Galaxy tool (mqppep_lclztn_filter)
 # It is adapted from the MaxQuant Processing Script written by Larry Cheng.
@@ -10,16 +10,14 @@
 library(data.table)
 library(stringr)
 library(ggplot2)
-#library(PTXQC)
-#require(PTXQC)
-#require(methods)

 # title: "MaxQuant Processing Script"
 # author: "Larry Cheng"
 # date: "February 19, 2018"
 #
 # # MaxQuant Processing Script
-# Takes MaxQuant Phospho (STY)sites.txt file as input and performs the following (in order):
+# Takes MaxQuant Phospho (STY)sites.txt file as input
+# and performs the following (in order):
 # 1) Runs the Proteomics Quality Control software
 # 2) Remove contaminant and reverse sequence rows
 # 3) Filters rows based on localization probability
@@ -27,24 +25,32 @@
 # 5) Sequences phosphopeptides
 # 6) Merges multiply phosphorylated peptides
 # 7) Filters out phosphopeptides based on enrichment
-# The output file contains the phosphopeptide (first column) and the quantitative values for each sample
+# The output file contains the phosphopeptide (first column)
+# and the quantitative values for each sample.
 #
 # ## Revision History
 # Rev. 2022-02-10 :wrap for inclusion in Galaxy
-# Rev. 2018-02-19 :break up analysis script into "MaxQuant Processing Script" and "Phosphopeptide Processing Script"
+# Rev. 2018-02-19 :break up analysis script into "MaxQuant Processing Script"
+#                  and "Phosphopeptide Processing Script"
 # Rev. 2017-12-12 :added PTXQC
 #                  added additional plots and table outputs for quality control
-#                  allowed for more than 2 samples to be grouped together (up to 26 (eg, 1A, 1B, 1C, etc))regexSampleNames <-
-#                  "\\.(\\d+)[A-Z]$"
-#                  converted from .r to .rmd file to knit report for quality control
-# Rev. 2016-09-11 :automated the FDR cutoffs; removed the option to data impute multiple times
-# Rev. 2016-09-09 :added filter to eliminate contaminant and reverse sequence rows
-# Rev. 2016-09-01 :moved the collapse step from after ANOVA filter to prior to preANOVA file output
-# Rev. 2016-08-22 :changed regexpression to regexSampleNames <- "\\.(\\d+)[AB]$" so that it looks at the end of string
+#                  allowed for more than 2 samples to be grouped together
+#                  (up to 26 (eg, 1A, 1B, 1C, etc))
+#                  converted from .r to .rmd file to knit report
+#                  for quality control
+# Rev. 2016-09-11 :automated the FDR cutoffs; removed the option to data
+#                  impute multiple times
+# Rev. 2016-09-09 :added filter to eliminate contaminant & reverse sequence rows
+# Rev. 2016-09-01 :moved the collapse step from after ANOVA filter to prior to
+#                  preANOVA file output
+# Rev. 2016-08-22 :use regexSampleNames <- "\\.(\\d + )[AB]$"
+#                  so that it looks at the end of string
 # Rev. 2016-08-05 :Removed vestigial line (ppeptides <- ....)
-# Rev. 2016-07-03 :Removed row names from the write.table() output for ANOVA and PreANOVA
+# Rev. 2016-07-03 :Removed row names from the write.table() output for
+#                  ANOVA and PreANOVA
 # Rev. 2016-06-25 :Set default Localization Probability cutoff to 0.75
-# Rev. 2016-06-23 :fixed a bug in filtering for pY enrichment by resetting the row numbers afterwards
+# Rev. 2016-06-23 :fixed a bug in filtering for pY enrichment by resetting
+#                  the row numbers afterwards
 # Rev. 2016-06-21 :test18 + standardized the regexpression in protocol


@@ -52,9 +58,9 @@

 # Read first line of file at filePath
 # adapted from: https://stackoverflow.com/a/35761217/15509512
-readFirstLine <- function(filepath) {
-  con = file(filepath, "r")
-  line = readLines(con, n = 1)
+read_first_line <- function(filepath) {
+  con <- file(filepath, "r")
+  line <- readLines(con, n = 1)
   close(con)
   return(line)
 }
@@ -68,28 +74,57 @@

 # Generate phosphopeptide and build list when applied
 phosphopeptide_func <- function(df) {
-
-  #generate peptide sequence and list of phosphopositions
-  phosphoprobsequence <- strsplit(as.character(df["Phospho (STY) Score diffs"]), "")[[1]]
+  # generate peptide sequence and list of phosphopositions
+  phosphoprobsequence <-
+    strsplit(as.character(df["Phospho (STY) Score diffs"]), "")[[1]]
   output <- vector()
   phosphopeptide <- ""
-  counter <- 0 #keep track of position in peptide
-  phosphopositions <- vector() #keep track of phosphorylation positions in peptide
+  counter <- 0 # keep track of position in peptide
+  phosphopositions <-
+    vector() # keep track of phosphorylation positions in peptide
   score_diff <- ""
-  for (chara in phosphoprobsequence){
-    #build peptide sequence
-    if (!(chara == " " | chara == "(" | chara == ")" | chara =="." | chara =="-" | chara == "0" | chara == "1" | chara == "2" | chara == "3" | chara =="4" | chara == "5" | chara == "6" | chara == "7" | chara =="8" | chara =="9")) {
-      phosphopeptide <- paste(phosphopeptide,chara,sep="")
+  for (chara in phosphoprobsequence) {
+    # build peptide sequence
+    if (!(
+      chara == " " |
+      chara == "(" |
+      chara == ")" |
+      chara == "." |
+      chara == "-" |
+      chara == "0" |
+      chara == "1" |
+      chara == "2" |
+      chara == "3" |
+      chara == "4" |
+      chara == "5" |
+      chara == "6" |
+      chara == "7" |
+      chara == "8" |
+      chara == "9")
+    ) {
+      phosphopeptide <- paste(phosphopeptide, chara, sep = "")
       counter <- counter + 1
     }
-    #generate score_diff
-    if (chara == "-" | chara =="." | chara == "0" | chara == "1" | chara == "2" | chara == "3" | chara =="4" | chara == "5" | chara == "6" | chara == "7" | chara =="8" | chara =="9"){
-      score_diff <- paste(score_diff,chara,sep="")
+    # generate score_diff
+    if (chara == "-" |
+        chara == "." |
+        chara == "0" |
+        chara == "1" |
+        chara == "2" |
+        chara == "3" |
+        chara == "4" |
+        chara == "5" |
+        chara == "6" |
+        chara == "7" |
+        chara == "8" |
+        chara == "9"
+    ) {
+      score_diff <- paste(score_diff, chara, sep = "")
     }
-    #evaluate score_diff
-    if (chara == ")" ){
+    # evaluate score_diff
+    if (chara == ")") {
       score_diff <- as.numeric(score_diff)
-      #only consider a phosphoresidue if score_diff > 0
+      # only consider a phosphoresidue if score_diff > 0
       if (score_diff > 0) {
         phosphopositions <- append(phosphopositions, counter)
       }
@@ -97,19 +132,36 @@
     }
   }

-  #generate phosphopeptide sequence (ie, peptide sequence with "p"'s)
+  # generate phosphopeptide sequence (ie, peptide sequence with "p"'s)
   counter <- 1
-  phosphoposition_correction1 <- -1 #used to correct phosphosposition as "p"'s are inserted into the phosphopeptide string
-  phosphoposition_correction2 <- 0 #used to correct phosphosposition as "p"'s are inserted into the phosphopeptide string
-  while (counter <= length(phosphopositions) ) {
-    phosphopeptide <- paste(substr(phosphopeptide,0,phosphopositions[counter]+phosphoposition_correction1),"p",substr(phosphopeptide,phosphopositions[counter]+phosphoposition_correction2,nchar(phosphopeptide)),sep="")
+  phosphoposition_correction1 <-
+    -1 # used to correct phosphosposition as "p"'s
+       #  are inserted into the phosphopeptide string
+  phosphoposition_correction2 <-
+    0  # used to correct phosphosposition as "p"'s
+       #   are inserted into the phosphopeptide string
+  while (counter <= length(phosphopositions)) {
+    phosphopeptide <-
+      paste(
+        substr(
+          phosphopeptide,
+          0,
+          phosphopositions[counter] + phosphoposition_correction1
+        ),
+        "p",
+        substr(
+          phosphopeptide,
+          phosphopositions[counter] + phosphoposition_correction2,
+          nchar(phosphopeptide)
+        ),
+        sep = ""
+      )
     counter <- counter + 1
     phosphoposition_correction1 <- phosphoposition_correction1 + 1
     phosphoposition_correction2 <- phosphoposition_correction2 + 1
   }
-
-  #building phosphopeptide list
-  output <- append(output,phosphopeptide)
+  # building phosphopeptide list
+  output <- append(output, phosphopeptide)
   return(output)
 }

@@ -126,93 +178,110 @@
     type = "character",
     help = "A MaxQuant Phospho (STY)Sites.txt"
   )
-,  make_option(
+  ,
+  make_option(
     c("-o", "--output"),
     action = "store",
     type = "character",
     help = "path to output file"
   )
-, make_option(
+  ,
+  make_option(
     c("-E", "--enrichGraph"),
     action = "store",
     type = "character",
     help = "path to enrichment graph PDF"
   )
-, make_option(
+  ,
+  make_option(
     c("-F", "--enrichGraph_svg"),
     action = "store",
     type = "character",
     help = "path to enrichment graph SVG"
   )
-, make_option(
+  ,
+  make_option(
     c("-L", "--locProbCutoffGraph"),
     action = "store",
     type = "character",
     help = "path to location-proability cutoff graph PDF"
   )
-, make_option(
+  ,
+  make_option(
     c("-M", "--locProbCutoffGraph_svg"),
     action = "store",
     type = "character",
     help = "path to location-proability cutoff graph SVG"
   )
-, make_option(
+  ,
+  make_option(
     c("-e", "--enriched"),
     action = "store",
     type = "character",
     help = "pY or pST enriched samples (ie, 'Y' or 'ST')"
   )
   # default = "^Number of Phospho [(]STY[)]$",
-, make_option(
+  ,
+  make_option(
     c("-p", "--phosphoCol"),
     action = "store",
     type = "character",
-    help = "PERL-compatible regular expression matching header of column having number of 'Phospho (STY)'"
+    help = paste0("PERL-compatible regular expression matching",
+             " header of column having number of 'Phospho (STY)'")
   )
   # default = "^Intensity[^_]",
-, make_option(
+  ,
+  make_option(
     c("-s", "--startCol"),
     action = "store",
     type = "character",
-    help = "PERL-compatible regular expression matching column header having first sample intensity"
+    help = paste0("PERL-compatible regular expression matching",
+             " header of column having first sample intensity")
   )
   # default = 1,
-, make_option(
+  ,
+  make_option(
     c("-I", "--intervalCol"),
     action = "store",
     type = "integer",
-    help = "Column interval between the Intensities of samples (eg, 1 if subsequent column; 2 if every other column"
+    help = paste0("Column interval between the Intensities of samples",
+             " (eg, 1 if subsequent column; 2 if every other column")
   )
   # default = 0.75,
-, make_option(
+  ,
+  make_option(
     c("-l", "--localProbCutoff"),
     action = "store",
     type = "double",
     help = "Localization Probability Cutoff"
   )
   # default = "sum",
-, make_option(
+  ,
+  make_option(
     c("-f", "--collapse_func"),
     action = "store",
     type = "character",
-    help = "merge identical phosphopeptides by ('sum' or 'average') the intensities"
+    help = paste0("merge identical phosphopeptides",
+             " by ('sum' or 'average') the intensities")
   )
-  # default = "filteredData.txt",
-, make_option(
+  # default = "filtered_data.txt",
+  ,
+  make_option(
     c("-r", "--filtered_data"),
     action = "store",
     type = "character",
-    help = "filteredData.txt"
+    help = "filtered_data.txt"
   )
   # default = "quantData.txt",
-, make_option(
+  ,
+  make_option(
     c("-q", "--quant_data"),
     action = "store",
     type = "character",
     help = "quantData.txt"
   )
 )
-args <- parse_args(OptionParser(option_list=option_list))
+args <- parse_args(OptionParser(option_list = option_list))
 # Check parameter values

 ### EXTRACT ARGUMENTS end ----------------------------------------------------
@@ -220,80 +289,90 @@

 ### EXTRACT PARAMETERS from arguments begin ----------------------------------

-if (! file.exists(args$input)) {
+if (!file.exists(args$input)) {
   stop((paste("File", args$input, "does not exist")))
 }

-phosphoColPattern <- "^Number of Phospho [(][STY][STY]*[)]$"
-startColPattern <- "^Intensity[^_]"
-phosphoColPattern <- readFirstLine(args$phosphoCol)
-startColPattern <- readFirstLine(args$startCol)
+phospho_col_pattern <- "^Number of Phospho [(][STY][STY]*[)]$"
+start_col_pattern <- "^Intensity[^_]"
+phospho_col_pattern <- read_first_line(args$phosphoCol)
+start_col_pattern <- read_first_line(args$startCol)

 sink(getConnection(2))
-#ACE print(paste("phosphoColPattern", phosphoColPattern))
-#ACE print(paste("startColPattern", startColPattern))
+
+input_file_name <- args$input
+filtered_filename <- args$filtered_data
+quant_file_name <- args$quant_data
+interval_col <- as.integer(args$intervalCol)

-inputFilename <- args$input
-filteredFilename <- args$filtered_data
-quantFilename <- args$quant_data
-intervalCol <- as.integer(args$intervalCol)
-
-firstLine <- readFirstLine(inputFilename)
-columnHeaders <- unlist(strsplit(x=firstLine, split=c('\t'), fixed=TRUE))
+first_line <- read_first_line(input_file_name)
+col_headers <-
+  unlist(strsplit(
+    x = first_line,
+    split = c("\t"),
+    fixed = TRUE
+  ))
 sink(getConnection(2))
-#ACE print("columnHeaders")
-#ACE print(columnHeaders)
 sink()


-intensityHeaderCols <- grep(pattern=startColPattern, x=columnHeaders, perl=TRUE)
-if ( length(intensityHeaderCols) == 0) {
-    err_msg <- paste("Found no intensity columns matching pattern:", startColPattern)
-    # Divert output to stderr
-    sink(getConnection(2))
-    print(err_msg)
-    sink()
-    stop(err_msg)
-    }
+intensity_header_cols <-
+  grep(pattern = start_col_pattern, x = col_headers, perl = TRUE)
+if (length(intensity_header_cols) == 0) {
+  err_msg <-
+    paste("Found no intensity columns matching pattern:",
+          start_col_pattern)
+  # Divert output to stderr
+  sink(getConnection(2))
+  print(err_msg)
+  sink()
+  stop(err_msg)
+}


-phosphoCol <- grep(pattern=phosphoColPattern, x=columnHeaders, perl=TRUE)[1]
-if (is.na(phosphoCol)) {
-    err_msg <- paste("Found no 'number of phospho sites' columns matching pattern:", phosphoColPattern)
-    # Divert output to stderr
-    sink(getConnection(2))
-    print(err_msg)
-    sink()
-    stop(err_msg)
-    }
+phospho_col <-
+  grep(pattern = phospho_col_pattern, x = col_headers, perl = TRUE)[1]
+if (is.na(phospho_col)) {
+  err_msg <-
+    paste("Found no 'number of phospho sites' columns matching pattern:",
+          phospho_col_pattern)
+  # Divert output to stderr
+  sink(getConnection(2))
+  print(err_msg)
+  sink()
+  stop(err_msg)
+}


 i_count <- 0
 this_column <- 1
-last_value <- intensityHeaderCols[1]
-intensityCols <- c(last_value)
-
-while ( length(intensityHeaderCols) >= intervalCol * i_count ) {
-  i_count <- 1 + i_count
-  this_column <- intervalCol + this_column
-  if ( last_value + intervalCol != intensityHeaderCols[this_column] ) break
-  last_value <- intensityHeaderCols[this_column]
-  if (length(intensityHeaderCols) < intervalCol * i_count) break
-  intensityCols <- c(intensityCols, intensityHeaderCols[this_column])
-  }
+last_value <- intensity_header_cols[1]
+intensity_cols <- c(last_value)

-startCol <- intensityCols[1]
-numSamples <- i_count
+while (length(intensity_header_cols) >= interval_col * i_count) {
+  i_count <- 1 + i_count
+  this_column <- interval_col + this_column
+  if (last_value + interval_col != intensity_header_cols[this_column])
+    break
+  last_value <- intensity_header_cols[this_column]
+  if (length(intensity_header_cols) < interval_col * i_count)
+    break
+  intensity_cols <-
+    c(intensity_cols, intensity_header_cols[this_column])
+}

-outputfilename <- args$output
-enrichGraphFilename <- args$enrichGraph
-locProbCutoffGraphFilename <- args$locProbCutoffGraph
-enrichGraphFilename_svg <- args$enrichGraph_svg
-locProbCutoffGraphFilename_svg <- args$locProbCutoffGraph_svg
+start_col <- intensity_cols[1]
+num_samples <- i_count

-localProbCutoff <- args$localProbCutoff
+output_filename <- args$output
+enrich_graph_filename <- args$enrichGraph
+loc_prob_cutoff_graph_filename <- args$locProbCutoffGraph
+enrich_graph_filename_svg <- args$enrichGraph_svg
+loc_prob_cutoff_graph_fn_svg <- args$locProbCutoffGraph_svg
+
+local_prob_cutoff <- args$localProbCutoff
 enriched <- args$enriched
-collapse_FUN <- args$collapse_func
+collapse_fn <- args$collapse_func

 ### EXTRACT PARAMETERS from arguments end ------------------------------------

@@ -303,53 +382,80 @@
 # is run by the Galaxy MaxQuant wrapper and need not be invoked here.


-# Read data, filtering out contaminants, reverse sequences, and localization probability
+# Read & filter out contaminants, reverse sequences, & localization probability
 # ---
-fullData <- read.table(file = inputFilename, sep ="\t", header=T, quote="")
+full_data <-
+  read.table(
+    file = input_file_name,
+    sep = "\t",
+    header = T,
+    quote = ""
+  )

-#Filter out contaminant rows and reverse rows
-filteredData <- subset(fullData,!grepl("CON__", Proteins))
-filteredData <- subset(filteredData,!grepl("_MYCOPLASMA", Proteins))
-filteredData <- subset(filteredData,!grepl("CONTAMINANT_", Proteins))
-filteredData <- subset(filteredData,!grepl("REV__", Protein)) #since REV__ rows are blank in the first column (Proteins)
-write.table(filteredData, file = filteredFilename, sep = "\t", quote=FALSE, col.names=TRUE, row.names=FALSE)
+# Filter out contaminant rows and reverse rows
+filtered_data <- subset(full_data, !grepl("CON__", Proteins))
+filtered_data <-
+  subset(filtered_data, !grepl("_MYCOPLASMA", Proteins))
+filtered_data <-
+  subset(filtered_data, !grepl("CONTAMINANT_", Proteins))
+filtered_data <-
+  subset(filtered_data, !grepl("REV__", Protein)
+         ) # since REV__ rows are blank in the first column (Proteins)
+write.table(
+  filtered_data,
+  file = filtered_filename,
+  sep = "\t",
+  quote = FALSE,
+  col.names = TRUE,
+  row.names = FALSE
+)
 # ...


 # Filter out data with localization probability below localProbCutoff
 # ---
-#Data filtered by localization probability
-locProbFilteredData <- filteredData[filteredData$Localization.prob>=localProbCutoff,]
+# Data filtered by localization probability
+loc_prob_filtered_data <-
+  filtered_data[
+    filtered_data$Localization.prob >= local_prob_cutoff,
+    ]
 # ...


 # Localization probability -- visualize locprob cutoff
 # ---
-locProbGraphData <- data.frame(
-  group = c(paste(">",toString(localProbCutoff),sep=""), paste("<",toString(localProbCutoff),sep="")),
-  value = c(nrow(locProbFilteredData)/nrow(filteredData)*100, (nrow(filteredData)-nrow(locProbFilteredData))/nrow(filteredData)*100)
-)
+loc_prob_graph_data <-
+  data.frame(
+    group = c(paste(">", toString(local_prob_cutoff), sep = ""),
+              paste("<", toString(local_prob_cutoff), sep = "")),
+    value = c(
+      nrow(loc_prob_filtered_data) / nrow(filtered_data) * 100,
+      (nrow(filtered_data) - nrow(loc_prob_filtered_data))
+        / nrow(filtered_data) * 100
+    )
+  )
 gigi <-
-  ggplot(locProbGraphData, aes(x = "", y = value, fill = group)) +
-  geom_bar(width = 0.5, stat = "identity", color = "black") +
-  labs(
-    x = NULL
-  , y = "percent"
-  , title = "Phosphopeptides partitioned by localization-probability cutoff"
+  ggplot(loc_prob_graph_data, aes(x = "", y = value, fill = group)) +
+  geom_bar(width = 0.5,
+           stat = "identity",
+           color = "black") +
+  labs(x = NULL,
+    y = "percent",
+    title = "Phosphopeptides partitioned by localization-probability cutoff"
   ) +
   scale_fill_discrete(name = "phosphopeptide\nlocalization-\nprobability") +
   theme_minimal() +
   theme(
-         legend.position = "right"
-       , legend.title=element_text()
-       , plot.title = element_text(hjust = 0.5)
-       , plot.subtitle = element_text(hjust = 0.5)
-       , plot.title.position = "plot"
-       )
-pdf(locProbCutoffGraphFilename)
+    legend.position = "right",
+    legend.title = element_text(),
+    plot.title = element_text(hjust = 0.5),
+    plot.subtitle = element_text(hjust = 0.5),
+    plot.title.position = "plot"
+  )
+pdf(loc_prob_cutoff_graph_filename)
 print(gigi)
 dev.off()
-svg(locProbCutoffGraphFilename_svg)
+svg(loc_prob_cutoff_graph_fn_svg)
 print(gigi)
 dev.off()
 # ...
@@ -357,126 +463,212 @@

 # Extract quantitative values from filtered data
 # ---
-quantData <- locProbFilteredData[,seq(from=startCol, by=intervalCol, length.out=numSamples)]
+quant_data <-
+  loc_prob_filtered_data[, seq(from = start_col,
+                               by = interval_col,
+                               length.out = num_samples)]
 # ...


 # Generate Phosphopeptide Sequence
 #   for latest version of MaxQuant (Version 1.5.3.30)
 # ---
-dataTable <- data.frame(locProbFilteredData[,1:8],locProbFilteredData[,phosphoCol],locProbFilteredData[,phosphoCol+1],locProbFilteredData[,phosphoCol+2],locProbFilteredData[,phosphoCol+3],locProbFilteredData[,phosphoCol+4],locProbFilteredData[,phosphoCol+5],locProbFilteredData[,phosphoCol+6],locProbFilteredData[,phosphoCol+7],quantData)
-colnames(dataTable) <- c("Proteins","Positions within proteins", "Leading proteins", "Protein", "Protein names", "Gene names", "Fasta headers", "Localization prob", "Number of Phospho (STY)", "Amino Acid", "Sequence window","Modification window", "Peptide window coverage", "Phospho (STY) Probabilities", "Phospho (STY) Score diffs", "Position in peptide", colnames(quantData))
-# 'phosphopeptide_func' generates a phosphopeptide sequence for each row of data.
-#   for the 'apply' function: MARGIN 1 == rows, 2 == columns, c(1,2) = both
-dataTable$Phosphopeptide <- apply(X=dataTable, MARGIN=1, FUN=phosphopeptide_func)
+metadata_df <-
+  data.frame(
+    loc_prob_filtered_data[, 1:8],
+    loc_prob_filtered_data[, phospho_col],
+    loc_prob_filtered_data[, phospho_col + 1],
+    loc_prob_filtered_data[, phospho_col + 2],
+    loc_prob_filtered_data[, phospho_col + 3],
+    loc_prob_filtered_data[, phospho_col + 4],
+    loc_prob_filtered_data[, phospho_col + 5],
+    loc_prob_filtered_data[, phospho_col + 6],
+    loc_prob_filtered_data[, phospho_col + 7],
+    quant_data
+  )
+colnames(metadata_df) <-
+  c(
+    "Proteins",
+    "Positions within proteins",
+    "Leading proteins",
+    "Protein",
+    "Protein names",
+    "Gene names",
+    "Fasta headers",
+    "Localization prob",
+    "Number of Phospho (STY)",
+    "Amino Acid",
+    "Sequence window",
+    "Modification window",
+    "Peptide window coverage",
+    "Phospho (STY) Probabilities",
+    "Phospho (STY) Score diffs",
+    "Position in peptide",
+    colnames(quant_data)
+  )
+# 'phosphopeptide_func' generates a phosphopeptide sequence
+#   for each row of data.
+# for the 'apply' function: MARGIN 1 == rows, 2 == columns, c(1, 2) = both
+metadata_df$phosphopeptide <-
+  apply(X = metadata_df, MARGIN = 1, FUN = phosphopeptide_func)
+colnames(metadata_df)[1] <- "Phosphopeptide"
 # Move the quant data columns to the right end of the data.frame
-dataTable <- movetolast(dataTable,c(colnames(quantData)))
+metadata_df <- movetolast(metadata_df, c(colnames(quant_data)))
 # ...


 # Write quantitative values for debugging purposes
 # ---
-quantWrite <- cbind( dataTable[,"Sequence window"], quantData )
-colnames(quantWrite)[1] <- "Sequence.Window"
-write.table(quantWrite, file = quantFilename, sep = "\t", quote=FALSE, col.names=TRUE, row.names=FALSE)
+quant_write <- cbind(metadata_df[, "Sequence window"], quant_data)
+colnames(quant_write)[1] <- "Sequence.Window"
 # ...


-# Make new data frame containing only Phosphopeptides to be mapped to quant data (merge_df)
+# Make new data frame containing only Phosphopeptides
+#   that are to be mapped to quant data (merge_df)
 # ---
-dataTable <- setDT(dataTable, keep.rownames=TRUE) #row name will be used to map
-merge_df <- data.frame(as.integer(dataTable$rn), dataTable$Phosphopeptide) #row index to merge data frames
+metadata_df <-
+  setDT(metadata_df, keep.rownames = TRUE) # row name will be used to map
+merge_df <-
+  data.frame(
+    as.integer(metadata_df$rn),
+    metadata_df$phosphopeptide # row index to merge data frames
+    )
 colnames(merge_df) <- c("rn", "Phosphopeptide")
 # ...


 # Add Phosphopeptide column to quant columns for quality control checking
 # ---
-quantData_qc <- as.data.frame(quantData)
-setDT(quantData_qc, keep.rownames=TRUE) #will use to match rowname to data
-quantData_qc$rn <- as.integer(quantData_qc$rn)
-quantData_qc <- merge(merge_df,quantData_qc, by="rn")
-quantData_qc$rn <- NULL #remove rn column
+quant_data_qc <- as.data.frame(quant_data)
+setDT(quant_data_qc, keep.rownames = TRUE) # will use to match rowname to data
+quant_data_qc$rn <- as.integer(quant_data_qc$rn)
+quant_data_qc <- merge(merge_df, quant_data_qc, by = "rn")
+quant_data_qc$rn <- NULL # remove rn column
 # ...


 # Collapse multiphosphorylated peptides
 # ---
-quantData_qc_collapsed <- data.table(quantData_qc, key = "Phosphopeptide")
-quantData_qc_collapsed <- aggregate(. ~ Phosphopeptide,quantData_qc, FUN= collapse_FUN)
+quant_data_qc_collapsed <-
+  data.table(quant_data_qc, key = "Phosphopeptide")
+quant_data_qc_collapsed <-
+  aggregate(. ~ Phosphopeptide, quant_data_qc, FUN = collapse_fn)
 # ...
+print("quant_data_qc_collapsed")
+head(quant_data_qc_collapsed)

-
-# Compute (as string) % of phosphopeptides that are multiphosphorylated (for use in next step)
+# Compute (as string) % of phosphopeptides that are multiphosphorylated
+#   (for use in next step)
 # ---
-pct_multiphos <- (nrow(quantData_qc) - nrow(quantData_qc_collapsed)) / (2 * nrow(quantData_qc))
+pct_multiphos <-
+  (
+    nrow(quant_data_qc) - nrow(quant_data_qc_collapsed)
+  ) / (2 * nrow(quant_data_qc))
 pct_multiphos <- sprintf("%0.1f%s", 100 * pct_multiphos, "%")
 # ...

+write.table(
+  quant_data_qc_collapsed,
+  file = quant_file_name,
+  sep = "\t",
+  quote = FALSE,
+  col.names = TRUE,
+  row.names = FALSE
+)

 # Compute and visualize breakdown of pY, pS, and pT before enrichment filter
 # ---
-pY_data <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pY"),]
-pS_data <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pS"),]
-pT_data <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pT"),]
+py_data <-
+  quant_data_qc_collapsed[
+    str_detect(quant_data_qc_collapsed$Phosphopeptide, "pY"),
+    ]
+ps_data <-
+  quant_data_qc_collapsed[
+    str_detect(quant_data_qc_collapsed$Phosphopeptide, "pS"),
+    ]
+pt_data <-
+  quant_data_qc_collapsed[
+     str_detect(quant_data_qc_collapsed$Phosphopeptide, "pT"),
+     ]

-pY_num <- nrow(pY_data)
-pS_num <- nrow(pS_data)
-pT_num <- nrow(pT_data)
+py_num <- nrow(py_data)
+ps_num <- nrow(ps_data)
+pt_num <- nrow(pt_data)

 # Visualize enrichment
-enrichGraphData <- data.frame(
-  group = c("pY", "pS", "pT"),
-  value = c(pY_num, pS_num, pT_num)
-)
+enrich_graph_data <- data.frame(group = c("pY", "pS", "pT"),
+                                value = c(py_num, ps_num, pt_num))

-enrichGraphData <- enrichGraphData[enrichGraphData$value > 0,]
+enrich_graph_data <-
+  enrich_graph_data[
+    enrich_graph_data$value > 0,
+    ]

 # Plot pie chart with legend
 # start: https://stackoverflow.com/a/62522478/15509512
 # refine: https://www.statology.org/ggplot-pie-chart/
 # colors: https://colorbrewer2.org/#type=diverging&scheme=BrBG&n=8
-slices <- enrichGraphData$value
-phosphoresidue <- enrichGraphData$group
+slices <- enrich_graph_data$value
+phosphoresidue <- enrich_graph_data$group
 pct    <- round(100 * slices / sum(slices))
-lbls   <- paste(enrichGraphData$group,"\n",pct, "%\n(", slices, ")", sep="")
+lbls   <-
+  paste(enrich_graph_data$group, "\n", pct, "%\n(", slices, ")", sep = "")
 slc_ctr <- c()
 run_tot <- 0
 for (p in pct) {
-  slc_ctr <- c(slc_ctr, run_tot + p/2.0)
+  slc_ctr <- c(slc_ctr, run_tot + p / 2.0)
   run_tot <- run_tot + p
 }
 lbl_y  <- 100 - slc_ctr
-df     <- data.frame(slices, pct, lbls, phosphoresidue = factor(phosphoresidue, levels = phosphoresidue))
-gigi <- ggplot(
-  df
-, aes(x = 1, y = pct, fill = phosphoresidue)) +
+df     <-
+  data.frame(slices,
+             pct,
+             lbls,
+             phosphoresidue = factor(phosphoresidue, levels = phosphoresidue))
+gigi <- ggplot(df
+               , aes(x = 1, y = pct, fill = phosphoresidue)) +
   geom_col(position = "stack", orientation = "x") +
   geom_text(aes(x = 1, y = lbl_y, label = lbls), col = "black") +
   coord_polar(theta = "y", direction = -1) +
   labs(
     x = NULL
-  , y = NULL
-  , title = "Percentages (and counts) of phosphosites, by type of residue"
-  , caption = sprintf("Roughly %s of peptides have multiple phosphosites.", pct_multiphos)
+    ,
+    y = NULL
+    ,
+    title = "Percentages (and counts) of phosphosites, by type of residue"
+    ,
+    caption = sprintf(
+      "Roughly %s of peptides have multiple phosphosites.",
+      pct_multiphos
+    )
   ) +
   labs(x = NULL, y = NULL, fill = NULL) +
   theme_classic() +
-  theme( legend.position="right"
-       , axis.line = element_blank()
-       , axis.text = element_blank()
-       , axis.ticks = element_blank()
-       , plot.title = element_text(hjust = 0.5)
-       , plot.subtitle = element_text(hjust = 0.5)
-       , plot.caption = element_text(hjust = 0.5)
-       , plot.title.position = "plot"
-       ) +
-  scale_fill_manual(breaks = phosphoresidue, values=c("#c7eae5", "#f6e8c3", "#dfc27d"))
+  theme(
+    legend.position = "right"
+    ,
+    axis.line = element_blank()
+    ,
+    axis.text = element_blank()
+    ,
+    axis.ticks = element_blank()
+    ,
+    plot.title = element_text(hjust = 0.5)
+    ,
+    plot.subtitle = element_text(hjust = 0.5)
+    ,
+    plot.caption = element_text(hjust = 0.5)
+    ,
+    plot.title.position = "plot"
+  ) +
+  scale_fill_manual(breaks = phosphoresidue,
+                    values = c("#c7eae5", "#f6e8c3", "#dfc27d"))

-pdf(enrichGraphFilename)
+pdf(enrich_graph_filename)
 print(gigi)
 dev.off()
-svg(enrichGraphFilename_svg)
+svg(enrich_graph_filename_svg)
 print(gigi)
 dev.off()
 # ...
@@ -484,17 +676,31 @@

 # Filter phosphopeptides by enrichment
 # --
-if (enriched == "Y"){
-  quantData_qc_enrichment <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pY"),]
-} else if ( enriched == "ST" ) {
-  quantData_qc_enrichment <- quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, "pS") | str_detect(quantData_qc_collapsed$Phosphopeptide, "pT"),]
+if (enriched == "Y") {
+  quant_data_qc_enrichment <- quant_data_qc_collapsed[
+    str_detect(quant_data_qc_collapsed$Phosphopeptide, "pY"),
+    ]
+} else if (enriched == "ST") {
+  quant_data_qc_enrichment <- quant_data_qc_collapsed[
+    str_detect(quant_data_qc_collapsed$Phosphopeptide, "pS") |
+    str_detect(quant_data_qc_collapsed$Phosphopeptide, "pT"),
+    ]
 } else {
   print("Error in enriched variable. Set to either 'Y' or 'ST'")
 }
 # ...

+print("quant_data_qc_enrichment")
+head(quant_data_qc_enrichment)

 # Write phosphopeptides filtered by enrichment
 # --
-write.table(quantData_qc_enrichment, file=outputfilename, sep="\t", quote = FALSE, row.names = FALSE)
+#ACE colnames(quant_data_qc_enrichment)[1] <- "Phosphopeptide"
+write.table(
+  quant_data_qc_enrichment,
+  file = output_filename,
+  sep = "\t",
+  quote = FALSE,
+  row.names = FALSE
+)
 # ...
--- a/mqppep_anova.R	Mon Mar 07 20:43:57 2022 +0000
+++ b/mqppep_anova.R	Thu Mar 10 23:42:48 2022 +0000
@@ -3,10 +3,6 @@
 library(optparse)
 library(data.table)
 library(stringr)
-#library(ggplot2)
-#library(PTXQC)
-#require(PTXQC)
-#require(methods)
 # bioconductor-preprocesscore
 #  - libopenblas
 #  - r-data.table
@@ -18,7 +14,6 @@

 # parse options
 option_list <- list(
-  # <param name="inputFilename" type="data" format="tabular" label="Phosphopeptide Intensities" help="First column label 'Phosphopeptide'; sample-intensities must begin in column 10 and must have column labels to match argument regexSampleNames"/>
   make_option(
     c("-i", "--inputFile"),
     action = "store",
@@ -31,7 +26,8 @@
     action = "store",
     default = NA,
     type = "character",
-    help = "List of alpha cutoff values for significance testing; path to text file having one column and no header"
+    help = paste0("List of alpha cutoff values for significance testing;",
+             " path to text file having one column and no header")
   ),
   make_option(
     c("-f", "--firstDataColumn"),
@@ -40,26 +36,29 @@
     type = "character",
     help = "First column of intensity values"
   ),
-  make_option( # imputationMethod <- c("group-median","median","mean","random")[1]
+  make_option(
     c("-m", "--imputationMethod"),
     action = "store",
     default = "group-median",
     type = "character",
-    help = "Method for missing-value imputation, one of c('group-median','median','mean','random')"
+    help = paste0("Method for missing-value imputation,",
+             " one of c('group-median','median','mean','random')")
   ),
   make_option(
     c("-p", "--meanPercentile"),
     action = "store",
     default = 3,
     type = "integer",
-    help = "Mean percentile for randomly generated imputed values; range [1,99]"
+    help = paste0("Mean percentile for randomly generated imputed values;",
+              ", range [1,99]")
   ),
   make_option(
     c("-d", "--sdPercentile"),
     action = "store",
     default = 3,
     type = "double",
-    help = "Adjustment value for standard deviation of randomly generated imputed values; real"
+    help = paste0("Adjustment value for standard deviation of",
+              " randomly generated imputed values; real")
   ),
   make_option(
     c("-s", "--regexSampleNames"),
@@ -73,9 +72,9 @@
     action = "store",
     default = "(\\d+)",
     type = "character",
-    help = "Regular expression extracting sample-group from an extracted sample-name"
+    help = paste0("Regular expression extracting sample-group",
+             " from an extracted sample-name")
   ),
-  # <data name="imputed_data_file" format="tabular" label="${input_file.name}.intensities_${imputation.imputation_method}-imputed_QN_LT" ></data>
   make_option(
     c("-o", "--imputedDataFile"),
     action = "store",
@@ -83,7 +82,6 @@
     type = "character",
     help = "Imputed Phosphopeptide Intensities output file path"
   ),
-  # <data name="report_file" format="html" label="report (download/unzip to view)" ></data>
   make_option(
     c("-r", "--reportFile"),
     action = "store",
@@ -92,82 +90,96 @@
     help = "HTML report file path"
   )
 )
-args <- parse_args(OptionParser(option_list=option_list))
+args <- parse_args(OptionParser(option_list = option_list))
+
 # Check parameter values

 if (! file.exists(args$inputFile)) {
   stop((paste("Input file", args$inputFile, "does not exist")))
 }
-inputFile <- args$inputFile
-alphaFile <- args$alphaFile
-firstDataColumn <- args$firstDataColumn
-imputationMethod <- args$imputationMethod
-meanPercentile <- args$meanPercentile
-sdPercentile <- args$sdPercentile
+input_file <- args$inputFile
+alpha_file <- args$alphaFile
+first_data_column <- args$firstDataColumn
+imputation_method <- args$imputationMethod
+mean_percentile <- args$meanPercentile
+sd_percentile <- args$sdPercentile

-regexSampleNames    <- gsub('^[ \t\n]*', ''  , readChar(args$regexSampleNames,  1000))
-regexSampleNames    <- gsub('[ \t\n]*$', ''  ,               regexSampleNames        )
-# regexSampleNames    <- gsub('\\\\'     , '@@',               regexSampleNames        )
-# regexSampleNames    <- gsub('@@'       , '\\',               regexSampleNames        )
-cat(regexSampleNames)
-cat('\n')
+regex_sample_names    <- gsub("^[ \t\n]*", "",
+                         readChar(args$regexSampleNames,  1000)
+                       )
+regex_sample_names    <- gsub("[ \t\n]*$", "",
+                         regex_sample_names
+                       )
+cat(regex_sample_names)
+cat("\n")

-regexSampleGrouping <- gsub('^[ \t\n]*', '', readChar(args$regexSampleGrouping, 1000))
-regexSampleGrouping <- gsub('[ \t\n]*$', '',               regexSampleGrouping       )
-# regexSampleGrouping <- gsub('\\\\'     , '@@',             regexSampleGrouping       )
-cat(regexSampleGrouping)
-cat('\n')
+regex_sample_grouping <- gsub("^[ \t\n]*", "",
+                           readChar(args$regexSampleGrouping, 1000)
+                         )
+regex_sample_grouping <- gsub("[ \t\n]*$", "",
+                           regex_sample_grouping
+                         )
+cat(regex_sample_grouping)
+cat("\n")

-# regexSampleGrouping <- gsub('@@'       , '\\',             regexSampleGrouping       )
-imputedDataFilename <- args$imputedDataFile
-reportFileName <- args$reportFile
+imputed_data_file_name <- args$imputedDataFile
+report_file_name <- args$reportFile

 print("args is:")
 cat(str(args))

-print("regexSampleNames is:")
-cat(str(regexSampleNames))
+print("regex_sample_names is:")
+cat(str(regex_sample_names))

-print("regexSampleGrouping is:")
-cat(str(regexSampleGrouping))
+print("regex_sample_grouping is:")
+cat(str(regex_sample_grouping))

-# from: https://github.com/molgenis/molgenis-pipelines/wiki/How-to-source-another_file.R-from-within-your-R-script
-LocationOfThisScript = function() # Function LocationOfThisScript returns the location of this .R script (may be needed to source other files in same dir)
-{
-    this.file = NULL
+# from: https://github.com/molgenis/molgenis-pipelines/wiki/
+#   How-to-source-another_file.R-from-within-your-R-script
+# Function location_of_this_script returns the location of this .R script
+#   (may be needed to source other files in same dir)
+location_of_this_script <- function() {
+    this_file <- NULL
     # This file may be 'sourced'
-    for (i in -(1:sys.nframe())) {
-        if (identical(sys.function(i), base::source)) this.file = (normalizePath(sys.frame(i)$ofile))
+    for (i in - (1:sys.nframe())) {
+        if (identical(sys.function(i), base::source)) {
+            this_file <- (normalizePath(sys.frame(i)$ofile))
+        }
     }

-    if (!is.null(this.file)) return(dirname(this.file))
+    if (!is.null(this_file)) return(dirname(this_file))

     # But it may also be called from the command line
-    cmd.args = commandArgs(trailingOnly = FALSE)
-    cmd.args.trailing = commandArgs(trailingOnly = TRUE)
-    cmd.args = cmd.args[seq.int(from=1, length.out=length(cmd.args) - length(cmd.args.trailing))]
-    res = gsub("^(?:--file=(.*)|.*)$", "\\1", cmd.args)
+    cmd_args <- commandArgs(trailingOnly = FALSE)
+    cmd_args_trailing <- commandArgs(trailingOnly = TRUE)
+    cmd_args <- cmd_args[
+      seq.int(
+        from = 1,
+        length.out = length(cmd_args) - length(cmd_args_trailing)
+        )
+      ]
+    res <- gsub("^(?:--file=(.*)|.*)$", "\\1", cmd_args)

     # If multiple --file arguments are given, R uses the last one
-    res = tail(res[res != ""], 1)
+    res <- tail(res[res != ""], 1)
     if (0 < length(res)) return(dirname(res))

     # Both are not the case. Maybe we are in an R GUI?
     return(NULL)
 }

-script.dir <-  LocationOfThisScript()
+script_dir <-  location_of_this_script()

 rmarkdown_params <- list(
-    inputFile = inputFile
-  , alphaFile = alphaFile
-  , firstDataColumn = firstDataColumn
-  , imputationMethod = imputationMethod
-  , meanPercentile = meanPercentile
-  , sdPercentile = sdPercentile
-  , regexSampleNames = regexSampleNames
-  , regexSampleGrouping = regexSampleGrouping
-  , imputedDataFilename = imputedDataFilename
+    inputFile = input_file
+  , alphaFile = alpha_file
+  , firstDataColumn = first_data_column
+  , imputationMethod = imputation_method
+  , meanPercentile = mean_percentile
+  , sdPercentile = sd_percentile
+  , regexSampleNames = regex_sample_names
+  , regexSampleGrouping = regex_sample_grouping
+  , imputedDataFilename = imputed_data_file_name
   )

 str(rmarkdown_params)
@@ -178,14 +190,15 @@
 # for reason:
 #   "The following dependencies are not available in conda"
 # reported here:
-#   https://github.com/ami-iit/bipedal-locomotion-framework/pull/457/commits/e98ccef8c8cb63e207df36628192af6ce22feb13
+#   https://github.com/ami-iit/bipedal-locomotion-framework/pull/457

-# freeze the random number generator so the same results will be produced from run to run
+# freeze the random number generator so the same results will be produced
+#  from run to run
 set.seed(28571)

 rmarkdown::render(
-  input = paste(script.dir, "mqppep_anova_script.Rmd", sep="/")
+  input = paste(script_dir, "mqppep_anova_script.Rmd", sep = "/")
 , output_format = rmarkdown::html_document(pandoc_args = "--self-contained")
-, output_file = reportFileName
+, output_file = report_file_name
 , params = rmarkdown_params
 )
--- a/mqppep_anova.xml	Mon Mar 07 20:43:57 2022 +0000
+++ b/mqppep_anova.xml	Thu Mar 10 23:42:48 2022 +0000
@@ -4,33 +4,37 @@
         <import>macros.xml</import>
     </macros>
     <requirements>
-        <requirement type="package" version="1.7.1">r-optparse</requirement>
-        <requirement type="package" version="1.4.0">r-stringr</requirement>
-        <requirement type="package" version="1.14.2">r-data.table</requirement>
-        <requirement type="package" version="3.3.5">r-ggplot2</requirement>
-        <requirement type="package" version="1.56.0">bioconductor-preprocesscore</requirement>
-        <requirement type="package" version="0.3.3" >openblas</requirement>
-        <requirement type="package" version="2.11"  >r-rmarkdown</requirement>
-        <requirement type="package" version="0.4.0" >r-sass</requirement>
-        <requirement type="package"                 >texlive-core</requirement>
+        <requirement type="package" version="1.7.1"   >r-optparse</requirement>
+        <requirement type="package" version="1.4.0"   >r-stringr</requirement>
+        <requirement type="package" version="1.14.2"  >r-data.table</requirement>
+        <requirement type="package" version="3.3.5"   >r-ggplot2</requirement>
+        <requirement type="package" version="1.56.0"  >bioconductor-preprocesscore</requirement>
+        <requirement type="package" version="0.3.3"   >openblas</requirement>
+        <requirement type="package" version="2.11"    >r-rmarkdown</requirement>
+        <requirement type="package" version="0.4.0"   >r-sass</requirement>
+        <requirement type="package" version="20210325">texlive-core</requirement>

     </requirements>
     <!-- Rscript -e 'rmarkdown::render("QuantDataProcessingScript.Rmd")' -->
     <command detect_errors="exit_code"><![CDATA[
-cat $sample_names_regex_f; cat $sample_grouping_regex_f;
-Rscript '$__tool_directory__/mqppep_anova.R'
---inputFile '$input_file'
---alphaFile $alpha_file
---firstDataColumn $first_data_column
---imputationMethod $imputation.imputation_method
-#if '$imputation_method' == 'random':
-  --meanPercentile '$meanPercentile'
-  --sdPercentile   '$sdPercentile'
-#end if
---regexSampleNames $sample_names_regex_f
---regexSampleGrouping $sample_grouping_regex_f
---imputedDataFile $imputed_data_file
---reportFile $report_file
+      cd /tmp;
+      cp '$__tool_directory__/mqppep_anova_script.Rmd' . || exit 0;
+      cp '$__tool_directory__/mqppep_anova.R' . || exit 0;
+      \${CONDA_PREFIX}/bin/Rscript /tmp/mqppep_anova.R
+      --inputFile '$input_file'
+      --alphaFile $alpha_file
+      --firstDataColumn $first_data_column
+      --imputationMethod $imputation.imputation_method
+      #if '$imputation_method' == 'random':
+        --meanPercentile '$meanPercentile'
+        --sdPercentile   '$sdPercentile'
+      #end if
+      --regexSampleNames $sample_names_regex_f
+      --regexSampleGrouping $sample_grouping_regex_f
+      --imputedDataFile $imputed_data_file
+      --reportFile $report_file;
+      rm mqppep_anova_script.Rmd;
+      rm mqppep_anova.R
     ]]></command>
     <configfiles>
       <configfile name="sample_names_regex_f">
@@ -109,9 +113,9 @@
             <output name="imputed_data_file">
                 <assert_contents>
                     <has_text text="Phosphopeptide" />
-                    <has_text text="AAAAAAAGDpSDpSWDADAFSVEDPVRK" />
-                    <has_text text="23574000" />
-                    <has_text text="pSESELIDELSEDFDR" />
+                    <has_text text="AAAITDMADLEELSRLpSPLPPGpSPGSAAR" />
+                    <has_text text="7.935878" />
+                    <has_text text="pSQKQEEENPAEETGEEK" />
                 </assert_contents>
             </output>
         </test>
@@ -125,9 +129,9 @@
             <output name="imputed_data_file">
                 <assert_contents>
                     <has_text text="Phosphopeptide" />
-                    <has_text text="AAAAAAAGDpSDpSWDADAFSVEDPVRK" />
-                    <has_text text="997800000" />
-                    <has_text text="pSESELIDELSEDFDR" />
+                    <has_text text="AAAITDMADLEELSRLpSPLPPGpSPGSAAR" />
+                    <has_text text="0.82258" />
+                    <has_text text="pSQKQEEENPAEETGEEK" />
                 </assert_contents>
             </output>
         </test>
@@ -209,7 +213,7 @@
 PERL-compatible regular expressions
 ===================================

-Note that the PERL-compatible regular expressions accepted by this tool are documented at https://rdrr.io/r/base/regex.html
+Note that the PERL-compatible regular expressions accepted by this tool are documented at http://rdrr.io/r/base/regex.html

     ]]></help>
     <citations>
--- a/mqppep_mrgfltr.py	Mon Mar 07 20:43:57 2022 +0000
+++ b/mqppep_mrgfltr.py	Thu Mar 10 23:42:48 2022 +0000
@@ -1,1337 +1,1500 @@
-#!/usr/bin/env python
-
-# Import the packages needed
-import argparse
-import os.path
-import sys
-
-import pandas
-import re
-import time
-import sqlite3 as sql
-from codecs import getreader as cx_getreader
-import sys
-import numpy as np
-
-#   for sorting list of lists using operator.itemgetter
-import operator
-
-#   for formatting stack-trace
-import traceback
-
-#   for Aho-Corasick search for fixed set of substrings
-import ahocorasick
-import operator
-import hashlib
-
-# for shutil.copyfile(src, dest)
-import shutil
-
-# global constants
-N_A = 'N/A'
-
-# ref: https://stackoverflow.com/a/8915613/15509512
-#   answers: "How to handle exceptions in a list comprehensions"
-#   usage:
-#       from math import log
-#       eggs = [1,3,0,3,2]
-#       print([x for x in [catch(log, egg) for egg in eggs] if x is not None])
-#   producing:
-#       for <built-in function log>
-#         with args (0,)
-#         exception: math domain error
-#       [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453]
-def catch(func, *args, handle=lambda e : e, **kwargs):
-    try:
-        return func(*args, **kwargs)
-    except Exception as e:
-        print("For %s" % str(func))
-        print("  with args %s" % str(args))
-        print("  caught exception: %s" % str(e))
-        (ty, va, tb) = sys.exc_info()
-        print("  stack trace: " + str(traceback.format_exception(ty, va, tb)))
-        exit(-1)
-        return None # was handle(e)
-
-def ppep_join(x):
-    x = [i for i in x if N_A != i]
-    result = "%s" % ' | '.join(x)
-    if result != "":
-        return result
-    else:
-        return N_A
-
-def melt_join(x):
-    tmp = {key.lower(): key for key in x}
-    result = "%s" % ' | '.join([tmp[key] for key in tmp])
-    return result
-
-def __main__():
-    # Parse Command Line
-    parser = argparse.ArgumentParser(
-        description='Phopsphoproteomic Enrichment Pipeline Merge and Filter.'
-        )
-
-    # inputs:
-    #   Phosphopeptide data for experimental results, including the intensities
-    #   and the mapping to kinase domains, in tabular format.
-    parser.add_argument(
-        '--phosphopeptides', '-p',
-        nargs=1,
-        required=True,
-        dest='phosphopeptides',
-        help='Phosphopeptide data for experimental results, including the intensities and the mapping to kinase domains, in tabular format'
-        )
-    #   UniProtKB/SwissProt DB input, SQLite
-    parser.add_argument(
-        '--ppep_mapping_db', '-d',
-        nargs=1,
-        required=True,
-        dest='ppep_mapping_db',
-        help='UniProtKB/SwissProt SQLite Database'
-        )
-    #ACE #   PhosPhositesPlus DB input, csv
-    #ACE parser.add_argument(
-    #ACE     '--psp_regulatory_sites', '-s',
-    #ACE     nargs=1,
-    #ACE     required=True,
-    #ACE     dest='psp_regulatory_sites_csv',
-    #ACE     help='PhosphoSitesPlus Regulatory Sites, in CSV format including three-line header'
-    #ACE     )
-    #   species to limit records chosed from PhosPhositesPlus
-    parser.add_argument(
-        '--species', '-x',
-        nargs=1,
-        required=False,
-        default=[],
-        dest='species',
-        help='limit PhosphoSitePlus records to indicated species (field may be empty)'
-        )
-
-    # outputs:
-    #   tabular output
-    parser.add_argument(
-        '--mrgfltr_tab', '-o',
-        nargs=1,
-        required=True,
-        dest='mrgfltr_tab',
-        help='Tabular output file for results'
-        )
-    #   CSV output
-    parser.add_argument(
-        '--mrgfltr_csv', '-c',
-        nargs=1,
-        required=True,
-        dest='mrgfltr_csv',
-        help='CSV output file for results'
-        )
-    #   SQLite output
-    parser.add_argument(
-        '--mrgfltr_sqlite', '-S',
-        nargs=1,
-        required=True,
-        dest='mrgfltr_sqlite',
-        help='SQLite output file for results'
-        )
-
-    # "Make it so!" (parse the arguments)
-    options = parser.parse_args()
-    print("options: " + str(options))
-
-    # determine phosphopeptide ("upstream map") input tabular file access
-    if options.phosphopeptides is None:
-        exit('Argument "phosphopeptides" is required but not supplied')
-    try:
-        upstream_map_filename_tab = os.path.abspath(options.phosphopeptides[0])
-        input_file = open(upstream_map_filename_tab, 'r')
-        input_file.close()
-    except Exception as e:
-        exit('Error parsing phosphopeptides argument: %s' % str(e))
-
-    # determine input SQLite access
-    if options.ppep_mapping_db is None:
-        exit('Argument "ppep_mapping_db" is required but not supplied')
-    try:
-        uniprot_sqlite = os.path.abspath(options.ppep_mapping_db[0])
-        input_file = open(uniprot_sqlite, 'rb')
-        input_file.close()
-    except Exception as e:
-        exit('Error parsing ppep_mapping_db argument: %s' % str(e))
-
-    # copy input SQLite dataset to output SQLite dataset
-    if options.mrgfltr_sqlite is None:
-        exit('Argument "mrgfltr_sqlite" is required but not supplied')
-    try:
-        output_sqlite = os.path.abspath(options.mrgfltr_sqlite[0])
-        shutil.copyfile(uniprot_sqlite, output_sqlite)
-    except Exception as e:
-        exit('Error copying ppep_mapping_db to mrgfltr_sqlite: %s' % str(e))
-
-    #ACE # determine psp_regulatory_sites CSV access
-    #ACE if options.psp_regulatory_sites_csv is None:
-    #ACE     exit('Argument "psp_regulatory_sites_csv" is required but not supplied')
-    #ACE #ACE print('options.psp_regulatory_sites_csv: ' + options.psp_regulatory_sites_csv)
-    #ACE try:
-    #ACE     phosphosite_filename_csv = os.path.abspath(options.psp_regulatory_sites_csv[0])
-    #ACE     input_file = open(phosphosite_filename_csv, 'r')
-    #ACE     input_file.close()
-    #ACE except Exception as e:
-    #ACE     exit('Error parsing psp_regulatory_sites_csv argument: %s' % str(e))
-    #ACE print('phosphosite_filename_csv: ' + phosphosite_filename_csv)
-
-    # determine species to limit records from PSP_Regulatory_Sites
-    if options.species is None:
-        exit('Argument "species" is required (and may be empty) but not supplied')
-    try:
-        if len(options.species) > 0:
-            species = options.species[0]
-        else:
-            species = ''
-    except Exception as e:
-        exit('Error parsing species argument: %s' % str(e))
-
-    # determine tabular output destination
-    if options.mrgfltr_tab is None:
-        exit('Argument "mrgfltr_tab" is required but not supplied')
-    try:
-        output_filename_tab = os.path.abspath(options.mrgfltr_tab[0])
-        output_file = open(output_filename_tab, 'w')
-        output_file.close()
-    except Exception as e:
-        exit('Error parsing mrgfltr_tab argument: %s' % str(e))
-
-    # determine CSV output destination
-    if options.mrgfltr_csv is None:
-        exit('Argument "mrgfltr_csv" is required but not supplied')
-    try:
-        output_filename_csv = os.path.abspath(options.mrgfltr_csv[0])
-        output_file = open(output_filename_csv, 'w')
-        output_file.close()
-    except Exception as e:
-        exit('Error parsing mrgfltr_csv argument: %s' % str(e))
-
-
-    def mqpep_getswissprot():
-
-        ###############################################
-        # copied from Excel Output Script.ipynb BEGIN #
-        ###############################################
-
-        ###########  String Constants  #################
-        DEPHOSPHOPEP = 'DephosphoPep'
-        DESCRIPTION = 'Description'
-        FUNCTION_PHOSPHORESIDUE = 'Function Phosphoresidue(PSP=PhosphoSitePlus.org)'
-        GENE_NAME = 'Gene_Name'                     # Gene Name from UniProtKB
-        ON_FUNCTION = 'ON_FUNCTION'                 # ON_FUNCTION column from PSP_Regulatory_Sites
-        ON_NOTES = 'NOTES'                          # NOTES column from PSP_Regulatory_Sites
-        ON_OTHER_INTERACT = 'ON_OTHER_INTERACT'     # ON_OTHER_INTERACT column from PSP_Regulatory_Sites
-        ON_PROCESS = 'ON_PROCESS'                   # ON_PROCESS column from PSP_Regulatory_Sites
-        ON_PROT_INTERACT = 'ON_PROT_INTERACT'       # ON_PROT_INTERACT column from PSP_Regulatory_Sites
-        PHOSPHOPEPTIDE = 'Phosphopeptide'
-        PHOSPHOPEPTIDE_MATCH = 'Phosphopeptide_match'
-        PHOSPHORESIDUE = 'Phosphoresidue'
-        PUTATIVE_UPSTREAM_DOMAINS = 'Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/Phosphatases/Binding Domains'
-        SEQUENCE = 'Sequence'
-        SEQUENCE10 = 'Sequence10'
-        SEQUENCE7 = 'Sequence7'
-        SITE_PLUSMINUS_7AA = 'SITE_+/-7_AA'
-        SITE_PLUSMINUS_7AA_SQL = 'SITE_PLUSMINUS_7AA'
-        UNIPROT_ID = 'UniProt_ID'
-        UNIPROT_SEQ_AND_META_SQL = '''
-            select    Uniprot_ID, Description, Gene_Name, Sequence,
-                      Organism_Name, Organism_ID, PE, SV
-                 from UniProtKB
-             order by Sequence, UniProt_ID
-        '''
-        UNIPROT_UNIQUE_SEQ_SQL = '''
-            select distinct Sequence
-                       from UniProtKB
-                   group by Sequence
-        '''
-        PPEP_PEP_UNIPROTSEQ_SQL = '''
-            select distinct phosphopeptide, peptide, sequence
-                       from uniprotkb_pep_ppep_view
-                   order by sequence
-        '''
-        PPEP_MELT_SQL = '''
-            SELECT DISTINCT
-                phospho_peptide AS 'p_peptide',
-                kinase_map AS 'characterization',
-                'X' AS 'X'
-            FROM ppep_gene_site_view
-        '''
-        # CREATE TABLE PSP_Regulatory_site (
-        #   site_plusminus_7AA TEXT PRIMARY KEY ON CONFLICT IGNORE,
-        #   domain             TEXT,
-        #   ON_FUNCTION        TEXT,
-        #   ON_PROCESS         TEXT,
-        #   ON_PROT_INTERACT   TEXT,
-        #   ON_OTHER_INTERACT  TEXT,
-        #   notes              TEXT,
-        #   organism           TEXT
-        # );
-        PSP_REGSITE_SQL = '''
-            SELECT DISTINCT
-              SITE_PLUSMINUS_7AA ,
-              DOMAIN             ,
-              ON_FUNCTION        ,
-              ON_PROCESS         ,
-              ON_PROT_INTERACT   ,
-              ON_OTHER_INTERACT  ,
-              NOTES              ,
-              ORGANISM
-            FROM PSP_Regulatory_site
-        '''
-        PPEP_ID_SQL ='''
-            SELECT
-                id AS 'ppep_id',
-                seq AS 'ppep_seq'
-            FROM ppep
-        '''
-        MRGFLTR_DDL ='''
-        DROP VIEW  IF EXISTS mrgfltr_metadata_view;
-        DROP TABLE IF EXISTS mrgfltr_metadata;
-        CREATE TABLE mrgfltr_metadata
-          ( ppep_id                 INTEGER REFERENCES ppep(id)
-          , Sequence10              TEXT
-          , Sequence7               TEXT
-          , GeneName                TEXT
-          , Phosphoresidue          TEXT
-          , UniProtID               TEXT
-          , Description             TEXT
-          , FunctionPhosphoresidue  TEXT
-          , PutativeUpstreamDomains TEXT
-          , PRIMARY KEY (ppep_id)            ON CONFLICT IGNORE
-          )
-          ;
-        CREATE VIEW mrgfltr_metadata_view AS
-          SELECT DISTINCT
-              ppep.seq             AS phospho_peptide
-            , Sequence10
-            , Sequence7
-            , GeneName
-            , Phosphoresidue
-            , UniProtID
-            , Description
-            , FunctionPhosphoresidue
-            , PutativeUpstreamDomains
-          FROM
-            ppep, mrgfltr_metadata
-          WHERE
-              mrgfltr_metadata.ppep_id = ppep.id
-          ORDER BY
-            ppep.seq
-            ;
-        '''
-
-        CITATION_INSERT_STMT = '''
-          INSERT INTO Citation (
-            ObjectName,
-            CitationData
-          ) VALUES (?,?)
-          '''
-        CITATION_INSERT_PSP = 'PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words "PhosphoSitePlus(R), www.phosphosite.org" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926."'
-        CITATION_INSERT_PSP_REF = 'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298, https://doi.org/10.1093/nar/gkr1122'
-
-        MRGFLTR_METADATA_COLUMNS = [
-            'ppep_id',
-            'Sequence10',
-            'Sequence7',
-            'GeneName',
-            'Phosphoresidue',
-            'UniProtID',
-            'Description',
-            'FunctionPhosphoresidue',
-            'PutativeUpstreamDomains'
-            ]
-
-        ###########  String Constants (end) ############
-
-        class Error(Exception):
-            """Base class for exceptions in this module."""
-            pass
-
-        class PreconditionError(Error):
-            """Exception raised for errors in the input.
-
-            Attributes:
-                expression -- input expression in which the error occurred
-                message -- explanation of the error
-            """
-
-            def __init__(self, expression, message):
-                self.expression = expression
-                self.message = message
-
-        #start_time = time.clock() #timer
-        start_time = time.process_time() #timer
-
-        #get keys from upstream tabular file using readline()
-        # ref: https://stackoverflow.com/a/16713581/15509512
-        #      answer to "Use codecs to read file with correct encoding"
-        file1_encoded = open(upstream_map_filename_tab, 'rb')
-        file1 = cx_getreader("latin-1")(file1_encoded)
-
-        count = 0
-        upstream_map_p_peptide_list = []
-        re_tab = re.compile('^[^\t]*')
-        while True:
-            count += 1
-            # Get next line from file
-            line = file1.readline()
-            # if line is empty
-            # end of file is reached
-            if not line:
-                break
-            if count > 1:
-                m = re_tab.match(line)
-                upstream_map_p_peptide_list.append(m[0])
-        file1.close()
-        file1_encoded.close()
-
-        # Get the list of phosphopeptides with the p's that represent the phosphorylation sites removed
-        re_phos = re.compile('p')
-        dephospho_peptide_list = [ re_phos.sub('',foo) for foo in upstream_map_p_peptide_list ]
-
-        end_time = time.process_time() #timer
-        print("%0.6f pre-read-SwissProt [0.1]" % (end_time - start_time,), file=sys.stderr)
-
-        ## ----------- Get SwissProt data from SQLite database (start) -----------
-        # build UniProt sequence LUT and list of unique SwissProt sequences
-
-        # Open SwissProt SQLite database
-        conn = sql.connect(uniprot_sqlite)
-        cur  = conn.cursor()
-
-        # Set up structures to hold SwissProt data
-
-        uniprot_Sequence_List = []
-        UniProtSeqLUT = {}
-
-        # Execute query for unique seqs without fetching the results yet
-        uniprot_unique_seq_cur = cur.execute(UNIPROT_UNIQUE_SEQ_SQL)
-
-        while batch := uniprot_unique_seq_cur.fetchmany(size=50):
-            if None == batch:
-                # handle case where no records are returned
-                break
-            for row in batch:
-                Sequence = row[0]
-                UniProtSeqLUT[(Sequence,DESCRIPTION)] = []
-                UniProtSeqLUT[(Sequence,GENE_NAME)  ] = []
-                UniProtSeqLUT[(Sequence,UNIPROT_ID) ] = []
-                UniProtSeqLUT[ Sequence               ] = []
-
-        # Execute query for seqs and metadata without fetching the results yet
-        uniprot_seq_and_meta = cur.execute(UNIPROT_SEQ_AND_META_SQL)
-
-        while batch := uniprot_seq_and_meta.fetchmany(size=50):
-            if None == batch:
-                  # handle case where no records are returned
-                break
-            for UniProt_ID, Description, Gene_Name, Sequence, OS, OX, PE, SV in batch:
-                uniprot_Sequence_List.append(Sequence)
-                UniProtSeqLUT[Sequence] = Sequence
-                UniProtSeqLUT[(Sequence,UNIPROT_ID) ].append(UniProt_ID)
-                UniProtSeqLUT[(Sequence,GENE_NAME)  ].append(Gene_Name)
-                if OS != N_A:
-                    Description += ' OS=' + OS
-                if OX != N_A:
-                    Description += ' OX=' + str(int(OX))
-                if Gene_Name != N_A:
-                    Description += ' GN=' + Gene_Name
-                if PE != N_A:
-                    Description += ' PE=' + PE
-                if SV != N_A:
-                    Description += ' SV=' + SV
-                UniProtSeqLUT[(Sequence,DESCRIPTION)].append(Description)
-
-        # Close SwissProt SQLite database; clean up local variables
-        conn.close()
-        Sequence = ''
-        UniProt_ID = ''
-        Description = ''
-        Gene_Name = ''
-
-        ## ----------- Get SwissProt data from SQLite database (finish) -----------
-
-        end_time = time.process_time() #timer
-        print("%0.6f post-read-SwissProt [0.2]" % (end_time - start_time,), file=sys.stderr)
-
-        ## ----------- Get SwissProt data from SQLite database (start) -----------
-        # build PhosphoPep_UniProtSeq_LUT and PhosphoPep_UniProtSeq_LUT
-        #ACE_temp pepSeqList = list( zip(pepList, dephosphPepList, [seq]*len(pepList)) )
-
-        # Open SwissProt SQLite database
-        conn = sql.connect(uniprot_sqlite)
-        cur  = conn.cursor()
-
-        # Set up dictionary to aggregate results for phosphopeptides correspounding to dephosphoeptide
-        DephosphoPep_UniProtSeq_LUT = {}
-
-        # Set up dictionary to accumulate results
-        PhosphoPep_UniProtSeq_LUT = {}
-
-        # Execute query for tuples without fetching the results yet
-        ppep_pep_uniprotseq_cur = cur.execute(PPEP_PEP_UNIPROTSEQ_SQL)
-
-        while batch := ppep_pep_uniprotseq_cur.fetchmany(size=50):
-            if None == batch:
-                # handle case where no records are returned
-                break
-            for (phospho_pep, dephospho_pep, sequence) in batch:
-                #do interesting stuff here...
-                PhosphoPep_UniProtSeq_LUT[phospho_pep]                  = phospho_pep
-                PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = dephospho_pep
-                if dephospho_pep not in DephosphoPep_UniProtSeq_LUT:
-                    DephosphoPep_UniProtSeq_LUT[dephospho_pep] = set()
-                    DephosphoPep_UniProtSeq_LUT[(dephospho_pep,DESCRIPTION)]  = []
-                    DephosphoPep_UniProtSeq_LUT[(dephospho_pep,GENE_NAME)]    = []
-                    DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)]   = []
-                    DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)]     = []
-                DephosphoPep_UniProtSeq_LUT[dephospho_pep].add(phospho_pep)
-
-                #ACE print("ppep:'%s' dephospho_pep:'%s' sequence:'%s'" % (phospho_pep, dephospho_pep, sequence))
-                if sequence not in DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)]:
-                    DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)].append(sequence)
-                for phospho_pep in DephosphoPep_UniProtSeq_LUT[dephospho_pep]:
-                    if phospho_pep != phospho_pep:
-                        print("phospho_pep:'%s' phospho_pep:'%s'" % (phospho_pep, phospho_pep))
-                    if phospho_pep not in PhosphoPep_UniProtSeq_LUT:
-                        PhosphoPep_UniProtSeq_LUT[phospho_pep]                  = phospho_pep
-                        PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = dephospho_pep
-                    r = list(zip(
-                       [s for s in UniProtSeqLUT[(sequence,UNIPROT_ID)]],
-                       [s for s in UniProtSeqLUT[(sequence,GENE_NAME)]],
-                       [s for s in UniProtSeqLUT[(sequence,DESCRIPTION)]]
-                       ))
-                    # Sort by `UniProt_ID`
-                    #   ref: https://stackoverflow.com/a/4174955/15509512
-                    r = sorted(r, key=operator.itemgetter(0))
-                    # Get one tuple for each `phospho_pep`
-                    #   in DephosphoPep_UniProtSeq_LUT[dephospho_pep]
-                    for (upid, gn, desc) in r:
-                        # Append pseudo-tuple per UniProt_ID but only when it is not present
-                        if upid not in DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)]:
-                            DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)].append(upid)
-                            DephosphoPep_UniProtSeq_LUT[(dephospho_pep,DESCRIPTION)].append(desc)
-                            DephosphoPep_UniProtSeq_LUT[(dephospho_pep,GENE_NAME)].append(gn)
-
-        # Close SwissProt SQLite database; clean up local variables
-        conn.close()
-        # wipe local variables
-        phospho_pep = dephospho_pep = sequence = 0
-        upid = gn = desc = r = ''
-
-        ## ----------- Get SwissProt data from SQLite database (finish) -----------
-
-        end_time = time.process_time() #timer
-        print("%0.6f finished reading and decoding '%s' [0.4]" % (end_time - start_time,upstream_map_filename_tab), file=sys.stderr)
-
-        print('{:>10} unique upstream phosphopeptides tested'.format(str(len(upstream_map_p_peptide_list))))
-
-        #Read in Upstream tabular file
-        # We are discarding the intensity data; so read it as text
-        upstream_data = pandas.read_table(
-            upstream_map_filename_tab,
-            dtype='str',
-            index_col = 0
-            )
-
-        end_time = time.process_time() #timer
-        print("%0.6f read Upstream Map from file [1g_1]" % (end_time - start_time,), file=sys.stderr) #timer
-
-        upstream_data.index = upstream_map_p_peptide_list
-
-
-        end_time = time.process_time() #timer
-        print("%0.6f added index to Upstream Map [1g_2]" % (end_time - start_time,), file=sys.stderr) #timer
-
-
-        #trim upstream_data to include only the upstream map columns
-        old_cols = upstream_data.columns.tolist()
-        i = 0
-        first_intensity = -1
-        last_intensity  = -1
-        intensity_re = re.compile('Intensity.*')
-        for col_name in old_cols:
-            m = intensity_re.match(col_name)
-            if m:
-                last_intensity = i
-                if first_intensity == -1:
-                    first_intensity = i
-            i += 1
-        #print('last intensity = %d' % last_intensity)
-        col_PKCalpha = last_intensity + 2
-        col_firstIntensity = first_intensity
-
-        data_in_cols = [old_cols[0]] + old_cols[first_intensity:last_intensity+1]
-
-        if upstream_data.empty:
-            print("upstream_data is empty")
-            exit(0)
-
-        data_in = upstream_data.copy(deep=True)[data_in_cols]
-
-        # Convert floating-point integers to int64 integers
-        #   ref: https://stackoverflow.com/a/68497603/15509512
-        data_in[list(data_in.columns[1:])] = data_in[
-            list(data_in.columns[1:])].astype('float64').apply(np.int64)
-
-        #create another phosphopeptide column that will be used to join later;
-        #  MAY need to change depending on Phosphopeptide column position
-        #data_in[PHOSPHOPEPTIDE_MATCH] = data_in[data_in.columns.tolist()[0]]
-        data_in[PHOSPHOPEPTIDE_MATCH] = data_in.index
-
-
-
-
-        end_time = time.process_time() #timer
-        print("%0.6f set data_in[PHOSPHOPEPTIDE_MATCH] [A]" % (end_time - start_time,), file=sys.stderr) #timer
-
-        # Produce a dictionary of metadata for a single phosphopeptide.
-        #   This is a replacement of `UniProtInfo_subdict` in the original code.
-        def pseq_to_subdict(phospho_pep):
-            #ACE print("calling pseq_to_subdict, %s" % phospho_pep);
-            # Strip "p" from phosphopeptide sequence
-            dephospho_pep = re_phos.sub('',phospho_pep)
-
-            # Determine number of phosphoresidues in phosphopeptide
-            numps = len(phospho_pep) - len(dephospho_pep)
-
-            # Determine location(s) of phosphoresidue(s) in phosphopeptide
-            #   (used later for Phosphoresidue, Sequence7, and Sequence10)
-            ploc = [] #list of p locations
-            i = 0
-            p = phospho_pep
-            while i < numps:
-                ploc.append(p.find("p"))
-                p = p[:p.find("p")] + p[p.find("p")+1:]
-                i +=1
-
-
-            # Establish nested dictionary
-            result = {}
-            result[SEQUENCE] = []
-            result[UNIPROT_ID] = []
-            result[DESCRIPTION] = []
-            result[GENE_NAME] = []
-            result[PHOSPHORESIDUE] = []
-            result[SEQUENCE7] = []
-            result[SEQUENCE10] = []
-
-            # Add stripped sequence to dictionary
-            result[SEQUENCE].append(dephospho_pep)
-
-            # Locate dephospho_pep in DephosphoPep_UniProtSeq_LUT
-            dephos = DephosphoPep_UniProtSeq_LUT[dephospho_pep]
-
-            # Locate phospho_pep in PhosphoPep_UniProtSeq_LUT
-            ### Caller may elect to:
-            ## try:
-            ##     ...
-            ## except PreconditionError as pe:
-            ##     print("'{expression}': {message}".format(
-            ##             expression = pe.expression,
-            ##             message = pe.message))
-            ##             )
-            ##         )
-            if dephospho_pep not in DephosphoPep_UniProtSeq_LUT:
-                raise PreconditionError( dephospho_pep,
-                    'dephosphorylated phosphopeptide not found in DephosphoPep_UniProtSeq_LUT'
-                    )
-            if phospho_pep not in PhosphoPep_UniProtSeq_LUT:
-                raise PreconditionError( dephospho_pep,
-                    'no matching phosphopeptide found in PhosphoPep_UniProtSeq_LUT'
-                    )
-            if dephospho_pep != PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)]:
-                raise PreconditionError( dephospho_pep,
-                    "dephosphorylated phosphopeptide does not match " +
-                    "PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = " +
-                    PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)]
-                    )
-            result[SEQUENCE] = [dephospho_pep]
-            result[UNIPROT_ID] = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)]
-            result[DESCRIPTION] = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,DESCRIPTION)]
-            result[GENE_NAME] = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,GENE_NAME)]
-            if (dephospho_pep,SEQUENCE) not in DephosphoPep_UniProtSeq_LUT:
-               raise PreconditionError( dephospho_pep,
-                    'no matching phosphopeptide found in DephosphoPep_UniProtSeq_LUT'
-                    )
-            UniProtSeqList = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)]
-            if len (UniProtSeqList) < 1:
-               print("Skipping DephosphoPep_UniProtSeq_LUT[('%s',SEQUENCE)] because value has zero length" % dephospho_pep)
-               # raise PreconditionError(
-               #     "DephosphoPep_UniProtSeq_LUT[('" + dephospho_pep + ",SEQUENCE)",
-               #      'value has zero length'
-               #      )
-            for UniProtSeq in UniProtSeqList:
-                i = 0
-                phosphoresidues = []
-                seq7s_set = set()
-                seq7s = []
-                seq10s_set = set()
-                seq10s = []
-                while i < len(ploc):
-                    start = UniProtSeq.find(dephospho_pep)
-                    psite = start+ploc[i] #location of phosphoresidue on protein sequence
-
-                    #add Phosphoresidue
-                    phosphosite = "p"+str(UniProtSeq)[psite]+str(psite+1)
-                    phosphoresidues.append(phosphosite)
-
-                    #Add Sequence7
-                    if psite < 7: #phospho_pep at N terminus
-                        seq7 = str(UniProtSeq)[:psite+8]
-                        if seq7[psite] == "S": #if phosphosresidue is serine
-                            pres = "s"
-                        elif seq7[psite] == "T": #if phosphosresidue is threonine
-                            pres = "t"
-                        elif seq7[psite] == "Y": #if phosphoresidue is tyrosine
-                            pres = "y"
-                        else: # if not pSTY
-                            pres = "?"
-                        seq7 = seq7[:psite] + pres + seq7[psite+1:psite+8]
-                        while len(seq7) < 15: #add appropriate number of "_" to the front
-                            seq7 = "_" + seq7
-                    elif len(UniProtSeq) - psite < 8: #phospho_pep at C terminus
-                        seq7 = str(UniProtSeq)[psite-7:]
-                        if seq7[7] == "S":
-                            pres = "s"
-                        elif seq7[7] == "T":
-                            pres = "t"
-                        elif seq7[7] == "Y":
-                            pres = "y"
-                        else:
-                            pres = "?"
-                        seq7 = seq7[:7] + pres + seq7[8:]
-                        while len(seq7) < 15: #add appropriate number of "_" to the back
-                            seq7 = seq7 + "_"
-                    else:
-                        seq7 = str(UniProtSeq)[psite-7:psite+8]
-                        pres = "" #phosphoresidue
-                        if seq7[7] == "S": #if phosphosresidue is serine
-                            pres = "s"
-                        elif seq7[7] == "T": #if phosphosresidue is threonine
-                            pres = "t"
-                        elif seq7[7] == "Y": #if phosphoresidue is tyrosine
-                            pres = "y"
-                        else: # if not pSTY
-                            pres = "?"
-                        seq7 = seq7[:7] + pres + seq7[8:]
-                    if seq7 not in seq7s_set:
-                        seq7s.append(seq7)
-                        seq7s_set.add(seq7)
-
-                    #add Sequence10
-                    if psite < 10: #phospho_pep at N terminus
-                        seq10 = str(UniProtSeq)[:psite] + "p" + str(UniProtSeq)[psite:psite+11]
-                    elif len(UniProtSeq) - psite < 11: #phospho_pep at C terminus
-                        seq10 = str(UniProtSeq)[psite-10:psite] + "p" + str(UniProtSeq)[psite:]
-                    else:
-                        seq10 = str(UniProtSeq)[psite-10:psite+11]
-                        seq10 = seq10[:10] + "p" + seq10[10:]
-                    if seq10 not in seq10s_set:
-                        seq10s.append(seq10)
-                        seq10s_set.add(seq10)
-
-                    i+=1
-
-                result[PHOSPHORESIDUE].append(phosphoresidues)
-                result[SEQUENCE7].append(seq7s)
-                # result[SEQUENCE10] is a list of lists of strings
-                result[SEQUENCE10].append(seq10s)
-
-
-
-
-            r = list(zip(
-               result[UNIPROT_ID],
-               result[GENE_NAME],
-               result[DESCRIPTION],
-               result[PHOSPHORESIDUE]
-               ))
-            # Sort by `UniProt_ID`
-            #   ref: https://stackoverflow.com//4174955/15509512
-            s = sorted(r, key=operator.itemgetter(0))
-
-            result[UNIPROT_ID] = []
-            result[GENE_NAME] = []
-            result[DESCRIPTION] = []
-            result[PHOSPHORESIDUE] = []
-
-            for r in s:
-                result[UNIPROT_ID].append(r[0])
-                result[GENE_NAME].append(r[1])
-                result[DESCRIPTION].append(r[2])
-                result[PHOSPHORESIDUE].append(r[3])
-
-
-            #convert lists to strings in the dictionary
-            for key,value in result.items():
-                if key not in [PHOSPHORESIDUE, SEQUENCE7, SEQUENCE10]:
-                    result[key] = '; '.join(map(str, value))
-                elif key in [SEQUENCE10]:
-                    # result[SEQUENCE10] is a list of lists of strings
-                    joined_value = ''
-                    joined_set = set()
-                    sep = ''
-                    for valL in value:
-                        # valL is a list of strings
-                        for val in valL:
-                            # val is a string
-                            if val not in joined_set:
-                                joined_set.add(val)
-                                #joined_value += sep + '; '.join(map(str, val))
-                                joined_value += sep + val
-                                sep = '; '
-                    # joined_value is a string
-                    result[key] = joined_value
-
-
-            newstring = '; '.join(
-                [', '.join(l) for l in result[PHOSPHORESIDUE]]
-                )
-            ### #separate the isoforms in PHOSPHORESIDUE column with ";"
-            ### oldstring = result[PHOSPHORESIDUE]
-            ### oldlist = list(oldstring)
-            ### newstring = ""
-            ### i = 0
-            ### for e in oldlist:
-            ###     if e == ";":
-            ###         if numps > 1:
-            ###             if i%numps:
-            ###                 newstring = newstring + ";"
-            ###             else:
-            ###                 newstring = newstring + ","
-            ###         else:
-            ###             newstring = newstring + ";"
-            ###         i +=1
-            ###     else:
-            ###         newstring = newstring + e
-            result[PHOSPHORESIDUE] = newstring
-
-
-            #separate sequence7's by |
-            oldstring = result[SEQUENCE7]
-            oldlist = oldstring
-            newstring = ""
-            for l in oldlist:
-              for e in l:
-                if e == ";":
-                    newstring = newstring + " |"
-                elif len(newstring) > 0 and 1 > newstring.count(e):
-                    newstring = newstring + " | " + e
-                elif 1 > newstring.count(e):
-                    newstring = newstring + e
-            result[SEQUENCE7] = newstring
-
-
-            return [phospho_pep, result]
-
-        # Construct list of [string, dictionary] lists
-        #   where the dictionary provides the SwissProt metadata for a phosphopeptide
-        result_list = [
-            catch(pseq_to_subdict,psequence)
-            for psequence
-            in data_in[PHOSPHOPEPTIDE_MATCH]
-            ]
-
-
-        end_time = time.process_time() #timer
-        print("%0.6f added SwissProt annotations to phosphopeptides [B]" % (end_time - start_time,), file=sys.stderr) #timer
-
-        # Construct dictionary from list of lists
-        #   ref: https://www.8bitavenue.com/how-to-convert-list-of-lists-to-dictionary-in-python/
-        UniProt_Info = {
-            result[0]:result[1]
-            for result
-            in result_list
-            if result is not None
-            }
-
-
-        end_time = time.process_time() #timer
-        print("%0.6f create dictionary mapping phosphopeptide to metadata dictionary [C]" % (end_time - start_time,), file=sys.stderr) #timer
-
-        #cosmetic: add N_A to phosphopeptide rows with no hits
-        p_peptide_list = []
-        for key in UniProt_Info:
-            p_peptide_list.append(key)
-            for nestedKey in UniProt_Info[key]:
-                if UniProt_Info[key][nestedKey] == "":
-                    UniProt_Info[key][nestedKey] = N_A
-
-        end_time = time.process_time() #timer
-        print("%0.6f performed cosmetic clean-up [D]" % (end_time - start_time,), file=sys.stderr) #timer
-
-        #convert UniProt_Info dictionary to dataframe
-        uniprot_df = pandas.DataFrame.transpose(pandas.DataFrame.from_dict(UniProt_Info))
-
-        #reorder columns to match expected output file
-        uniprot_df[PHOSPHOPEPTIDE] = uniprot_df.index #make index a column too
-
-
-        cols = uniprot_df.columns.tolist()
-        #cols = [cols[-1]]+cols[4:6]+[cols[1]]+[cols[2]]+[cols[6]]+[cols[0]]
-        #uniprot_df = uniprot_df[cols]
-        uniprot_df = uniprot_df[[
-            PHOSPHOPEPTIDE,
-            SEQUENCE10,
-            SEQUENCE7,
-            GENE_NAME,
-            PHOSPHORESIDUE,
-            UNIPROT_ID,
-            DESCRIPTION
-            ]]
-
-
-        end_time = time.process_time() #timer
-        print("%0.6f reordered columns to match expected output file [1]" % (end_time - start_time,), file=sys.stderr) #timer
-
-        #concat to split then groupby to collapse
-        seq7_df = pandas.concat([pandas.Series(row[PHOSPHOPEPTIDE], row[SEQUENCE7].split(' | '))
-                            for _, row in uniprot_df.iterrows()]).reset_index()
-        seq7_df.columns = [SEQUENCE7,PHOSPHOPEPTIDE]
-
-        # --- -------------- begin read PSP_Regulatory_sites ---------------------------------
-        #read in PhosphoSitePlus Regulatory Sites dataset
-        #ACE if (True):
-        ## ----------- Get PhosphoSitePlus Regulatory Sites data from SQLite database (start) -----------
-        conn = sql.connect(uniprot_sqlite)
-        regsites_df = pandas.read_sql_query(PSP_REGSITE_SQL, conn)
-        # Close SwissProt SQLite database
-        conn.close()
-        #ACE # Array indexes are zero-based
-        #ACE #   ref: https://en.wikipedia.org/wiki/Python_(programming_language)
-        #ACE RENAME_COLS = [ 'SITE_PLUSMINUS_7AA', 'DOMAIN', 'ON_FUNCTION', 'ON_PROCESS', 'ON_PROT_INTERACT'
-        #ACE               , 'ON_OTHER_INTERACT' , 'NOTES' , 'ORGANISM']
-        #ACE with pandas.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
-        #ACE     print(regsites_df)
-        ## ----------- Get PhosphoSitePlus Regulatory Sites data from SQLite database (finish) -----------
-        #ACE else:
-        #ACE     regsites_df = pandas.read_csv(phosphosite_filename_csv, header=3,skiprows=1-3)
-        #ACE     SITE_PLUSMINUS_7AA_SQL = SITE_PLUSMINUS_7AA
-        #ACE     #ACE # Array indexes are zero-based
-        #ACE     #ACE #   ref: https://en.wikipedia.org/wiki/Python_(programming_language)
-        #ACE     #ACE RENAME_COLS = [ 'GENE'          , 'PROTEIN'    , 'PROT_TYPE' , 'ACC_ID'          , 'GENE_ID'
-        #ACE     #ACE               , 'HU_CHR_LOC'    , 'ORGANISM'   , 'MOD_RSD'   , 'SITE_GRP_ID'     , 'SITE_+/-7_AA'
-        #ACE     #ACE               , 'DOMAIN'        , 'ON_FUNCTION', 'ON_PROCESS', 'ON_PROT_INTERACT', 'ON_OTHER_INTERACT'
-        #ACE     #ACE               , 'PMIDs'         , 'LT_LIT'     , 'MS_LIT'    , 'MS_CST'          , 'NOTES'
-        #ACE     #ACE               ]
-        #ACE     #ACE REGSITE_COL_SITE7AA = 9
-        #ACE     #ACE REGSITE_COL_PROTEIN = 1
-        #ACE     #ACE REGSITE_COL_DOMAIN = 10
-        #ACE     #ACE REGSITE_COL_PMIDs = 15
-
-        # ... -------------- end read PSP_Regulatory_sites ------------------------------------
-
-
-        #keep only the human entries in dataframe
-        if len(species) > 0:
-            print('Limit PhosphoSitesPlus records to species "' + species + '"')
-            regsites_df = regsites_df[regsites_df.ORGANISM == species]
-
-        #merge the seq7 df with the regsites df based off of the sequence7
-        merge_df = seq7_df.merge(regsites_df, left_on=SEQUENCE7, right_on=SITE_PLUSMINUS_7AA_SQL, how='left')
-        #ACE print(merge_df.columns.tolist()) #ACE
-
-        #after merging df, select only the columns of interest - note that PROTEIN is absent here
-        merge_df = merge_df[[PHOSPHOPEPTIDE,SEQUENCE7,ON_FUNCTION,ON_PROCESS, ON_PROT_INTERACT,ON_OTHER_INTERACT,ON_NOTES]]
-        #ACE print(merge_df.columns.tolist()) #ACE
-        #combine column values of interest into one FUNCTION_PHOSPHORESIDUE column"
-        merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[ON_FUNCTION].str.cat(merge_df[ON_PROCESS], sep="; ", na_rep="")
-        merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[FUNCTION_PHOSPHORESIDUE].str.cat(merge_df[ON_PROT_INTERACT], sep="; ", na_rep="")
-        merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[FUNCTION_PHOSPHORESIDUE].str.cat(merge_df[ON_OTHER_INTERACT], sep="; ", na_rep="")
-        merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[FUNCTION_PHOSPHORESIDUE].str.cat(merge_df[ON_NOTES], sep="; ", na_rep="")
-
-        #remove the columns that were combined
-        merge_df = merge_df[[PHOSPHOPEPTIDE,SEQUENCE7,FUNCTION_PHOSPHORESIDUE]]
-
-        #ACE print(merge_df) #ACE
-        #ACE print(merge_df.columns.tolist()) #ACE
-
-        end_time = time.process_time() #timer
-        print("%0.6f merge regsite metadata [1a]" % (end_time - start_time,), file=sys.stderr) #timer
-
-        #cosmetic changes to Function Phosphoresidue column
-        fp_series = pandas.Series(merge_df[FUNCTION_PHOSPHORESIDUE])
-
-        end_time = time.process_time() #timer
-        print("%0.6f more cosmetic changes [1b]" % (end_time - start_time,), file=sys.stderr) #timer
-
-        i = 0
-        while i < len(fp_series):
-            #remove the extra ";" so that it looks more professional
-            if fp_series[i] == "; ; ; ; ": #remove ; from empty hits
-                fp_series[i] = ""
-            while fp_series[i].endswith("; "): #remove ; from the ends
-                fp_series[i] = fp_series[i][:-2]
-            while fp_series[i].startswith("; "): #remove ; from the beginning
-                fp_series[i] = fp_series[i][2:]
-            fp_series[i] = fp_series[i].replace("; ; ; ; ", "; ")
-            fp_series[i] = fp_series[i].replace("; ; ; ", "; ")
-            fp_series[i] = fp_series[i].replace("; ; ", "; ")
-
-            #turn blanks into N_A to signify the info was searched for but cannot be found
-            if fp_series[i] == "":
-                fp_series[i] = N_A
-
-            i += 1
-        merge_df[FUNCTION_PHOSPHORESIDUE] = fp_series
-
-        end_time = time.process_time() #timer
-        print("%0.6f cleaned up semicolons [1c]" % (end_time - start_time,), file=sys.stderr) #timer
-
-        #merge uniprot df with merge df
-        uniprot_regsites_merged_df = uniprot_df.merge(merge_df, left_on=PHOSPHOPEPTIDE, right_on=PHOSPHOPEPTIDE,how="left")
-
-        #collapse the merged df
-        uniprot_regsites_collapsed_df = pandas.DataFrame(
-            uniprot_regsites_merged_df
-            .groupby(PHOSPHOPEPTIDE)[FUNCTION_PHOSPHORESIDUE]
-            .apply(lambda x: ppep_join(x)))
-            #.apply(lambda x: "%s" % ' | '.join(x)))
-
-
-        end_time = time.process_time() #timer
-        print("%0.6f collapsed pandas dataframe [1d]" % (end_time - start_time,), file=sys.stderr) #timer
-
-        uniprot_regsites_collapsed_df[PHOSPHOPEPTIDE] = uniprot_regsites_collapsed_df.index #add df index as its own column
-
-
-        #rename columns
-        uniprot_regsites_collapsed_df.columns = [FUNCTION_PHOSPHORESIDUE, 'ppp']
-
-
-
-        #select columns to be merged to uniprot_df
-        #ACE cols = regsites_df.columns.tolist()
-        #ACE print(cols) #ACE
-        #ACE if len(cols) > 8:
-        #ACE     cols = [cols[9]]+[cols[1]]+cols[10:15]
-        #ACE     #ACE cols = [cols[9]]+[cols[1]]+cols[10:15]
-        #ACE     print(cols) #ACE
-        #ACE regsite_merge_df = regsites_df[cols]
-
-        end_time = time.process_time() #timer
-        print("%0.6f selected columns to be merged to uniprot_df [1e]" % (end_time - start_time,), file=sys.stderr) #timer
-
-        #add columns based on Sequence7 matching site_+/-7_AA
-        uniprot_regsite_df = pandas.merge(
-            left=uniprot_df,
-            right=uniprot_regsites_collapsed_df,
-            how='left',
-            left_on=PHOSPHOPEPTIDE,
-            right_on='ppp')
-
-        end_time = time.process_time() #timer
-        print("%0.6f added columns based on Sequence7 matching site_+/-7_AA [1f]" % (end_time - start_time,), file=sys.stderr) #timer
-
-        data_in.rename(
-            {'Protein description': PHOSPHOPEPTIDE},
-            axis='columns',
-            inplace=True
-            )
-
-
-
-        sort_start_time = time.process_time() #timer
-
-        #data_in.sort_values(PHOSPHOPEPTIDE_MATCH, inplace=True, kind='mergesort')
-        res2 = sorted(data_in[PHOSPHOPEPTIDE_MATCH].tolist(), key = lambda s: s.casefold())
-        data_in = data_in.loc[res2]
-
-        end_time = time.process_time() #timer
-        print("%0.6f sorting time [1f]" % (end_time - start_time,), file=sys.stderr) #timer
-
-
-
-        cols = [old_cols[0]] + old_cols[col_PKCalpha-1:]
-        upstream_data = upstream_data[cols]
-
-        end_time = time.process_time() #timer
-        print("%0.6f refactored columns for Upstream Map [1g]" % (end_time - start_time,), file=sys.stderr) #timer
-
-
-        #### #rename upstream columns in new list
-        #### new_cols = []
-        #### for name in cols:
-        ####     if "_NetworKIN" in name:
-        ####         name = name.split("_")[0]
-        ####     if " motif" in name:
-        ####         name = name.split(" motif")[0]
-        ####     if " sequence " in name:
-        ####         name = name.split(" sequence")[0]
-        ####     if "_Phosida" in name:
-        ####         name = name.split("_")[0]
-        ####     if "_PhosphoSite" in name:
-        ####         name = name.split("_")[0]
-        ####     new_cols.append(name)
-
-        #rename upstream columns in new list
-        def col_rename(name):
-            if "_NetworKIN" in name:
-                name = name.split("_")[0]
-            if " motif" in name:
-                name = name.split(" motif")[0]
-            if " sequence " in name:
-                name = name.split(" sequence")[0]
-            if "_Phosida" in name:
-                name = name.split("_")[0]
-            if "_PhosphoSite" in name:
-                name = name.split("_")[0]
-            return name
-
-        new_cols = [col_rename(col) for col in cols]
-        upstream_data.columns = new_cols
-
-
-
-        end_time = time.process_time() #timer
-        print("%0.6f renamed columns for Upstream Map [1h_1]" % (end_time - start_time,), file=sys.stderr) #timer
-
-
-        # Create upstream_data_cast as a copy of upstream_data
-        #   but with first column substituted by the phosphopeptide sequence
-        upstream_data_cast = upstream_data.copy()
-        new_cols_cast = new_cols
-        new_cols_cast[0] = 'p_peptide'
-        upstream_data_cast.columns = new_cols_cast
-        upstream_data_cast['p_peptide'] = upstream_data.index
-        new_cols_cast0 = new_cols_cast[0]
-
-        # --- -------------- begin read upstream_data_melt ------------------------------------
-        ## ----------- Get melted kinase mapping data from SQLite database (start) -----------
-        conn = sql.connect(uniprot_sqlite)
-        upstream_data_melt_df = pandas.read_sql_query(PPEP_MELT_SQL, conn)
-        # Close SwissProt SQLite database
-        conn.close()
-        upstream_data_melt = upstream_data_melt_df.copy()
-        upstream_data_melt.columns = ['p_peptide', 'characterization', 'X']
-        upstream_data_melt['characterization'] = [
-                col_rename(s)
-                for s in upstream_data_melt['characterization']
-                ]
-
-        print('%0.6f upstream_data_melt_df initially has %d rows' %
-              (end_time - start_time, len(upstream_data_melt.axes[0]))
-              , file=sys.stderr)
-        # ref: https://stackoverflow.com/a/27360130/15509512
-        #      e.g. df.drop(df[df.score < 50].index, inplace=True)
-        upstream_data_melt.drop(
-            upstream_data_melt[upstream_data_melt.X != 'X'].index,
-            inplace = True
-            )
-        print('%0.6f upstream_data_melt_df pre-dedup has %d rows' %
-              (end_time - start_time, len(upstream_data_melt.axes[0]))
-              , file=sys.stderr)
-        #ACE with pandas.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
-        #ACE     print(upstream_data_melt)
-        ## ----------- Get melted kinase mapping data from SQLite database (finish) -----------
-        # ... -------------- end read upstream_data_melt --------------------------------------
-
-        end_time = time.process_time() #timer
-        print("%0.6f melted and minimized Upstream Map dataframe [1h_2]" % (end_time - start_time,), file=sys.stderr) #timer
-        # ... end read upstream_data_melt
-
-        upstream_data_melt_index = upstream_data_melt.index
-        upstream_data_melt_p_peptide = upstream_data_melt['p_peptide']
-
-        end_time = time.process_time() #timer
-        print("%0.6f indexed melted Upstream Map [1h_2a]" % (end_time - start_time,), file=sys.stderr) #timer
-
-        upstream_delta_melt_LoL = upstream_data_melt.values.tolist()
-
-        melt_dict = {}
-        for key in upstream_map_p_peptide_list:
-            melt_dict[key] = []
-
-        for el in upstream_delta_melt_LoL:
-            (p_peptide, characterization, X) = tuple(el)
-            if p_peptide in melt_dict:
-                melt_dict[p_peptide].append(characterization)
-            else:
-                exit('Phosphopeptide %s not found in ppep_mapping_db: "phopsphopeptides" and "ppep_mapping_db" must both originate from the same run of mqppep_kinase_mapping' % (p_peptide))
-
-
-        end_time = time.process_time() #timer
-        print("%0.6f appended peptide characterizations [1h_2b]" % (end_time - start_time,), file=sys.stderr) #timer
-
-
-        # for key in upstream_map_p_peptide_list:
-        #     melt_dict[key] = ' | '.join(melt_dict[key])
-
-        for key in upstream_map_p_peptide_list:
-            melt_dict[key] = melt_join(melt_dict[key])
-
-        end_time = time.process_time() #timer
-        print("%0.6f concatenated multiple characterizations [1h_2c]" % (end_time - start_time,), file=sys.stderr) #timer
-
-        # map_dict is a dictionary of dictionaries
-        map_dict = {}
-        for key in upstream_map_p_peptide_list:
-            map_dict[key] = {}
-            map_dict[key][PUTATIVE_UPSTREAM_DOMAINS] = melt_dict[key]
-
-
-        end_time = time.process_time() #timer
-        print("%0.6f instantiated map dictionary [2]" % (end_time - start_time,), file=sys.stderr) #timer
-
-        #convert map_dict to dataframe
-        map_df = pandas.DataFrame.transpose(pandas.DataFrame.from_dict(map_dict))
-        map_df["p-peptide"] = map_df.index #make index a column too
-        cols_map_df = map_df.columns.tolist()
-        cols_map_df = [cols_map_df[1]] + [cols_map_df[0]]
-        map_df = map_df[cols_map_df]
-
-        #join map_df to uniprot_regsite_df
-        output_df = uniprot_regsite_df.merge(
-            map_df,
-            how="left",
-            left_on=PHOSPHOPEPTIDE,
-            right_on="p-peptide")
-
-        output_df = output_df[
-            [  PHOSPHOPEPTIDE, SEQUENCE10, SEQUENCE7, GENE_NAME, PHOSPHORESIDUE,
-                UNIPROT_ID, DESCRIPTION, FUNCTION_PHOSPHORESIDUE,
-                PUTATIVE_UPSTREAM_DOMAINS
-                ]
-            ]
-
-
-        # cols_output_prelim = output_df.columns.tolist()
-        #
-        # print("cols_output_prelim")
-        # print(cols_output_prelim)
-        #
-        # cols_output = cols_output_prelim[:8]+[cols_output_prelim[9]]+[cols_output_prelim[10]]
-        #
-        # print("cols_output with p-peptide")
-        # print(cols_output)
-        #
-        # cols_output = [col for col in cols_output if not col == "p-peptide"]
-        #
-        # print("cols_output")
-        # print(cols_output)
-        #
-        # output_df = output_df[cols_output]
-
-        #join output_df back to quantitative columns in data_in df
-        quant_cols = data_in.columns.tolist()
-        quant_cols = quant_cols[1:]
-        quant_data = data_in[quant_cols]
-
-        ## ----------- Write merge/filter metadata to SQLite database (start) -----------
-        # Open SwissProt SQLite database
-        conn = sql.connect(output_sqlite)
-        cur  = conn.cursor()
-
-        cur.executescript(MRGFLTR_DDL)
-
-        cur.execute(
-            CITATION_INSERT_STMT,
-            ('mrgfltr_metadata_view', CITATION_INSERT_PSP)
-            )
-        cur.execute(
-            CITATION_INSERT_STMT,
-            ('mrgfltr_metadata', CITATION_INSERT_PSP)
-            )
-        cur.execute(
-            CITATION_INSERT_STMT,
-            ('mrgfltr_metadata_view', CITATION_INSERT_PSP_REF)
-            )
-        cur.execute(
-            CITATION_INSERT_STMT,
-            ('mrgfltr_metadata', CITATION_INSERT_PSP_REF)
-            )
-
-        # Read ppep-to-sequence LUT
-        ppep_lut_df = pandas.read_sql_query(PPEP_ID_SQL, conn)
-        #ACE ppep_lut_df.info(verbose=True)
-        # write only metadata for merged/filtered records to SQLite
-        mrgfltr_metadata_df = output_df.copy()
-        # replace phosphopeptide seq with ppep.id
-        mrgfltr_metadata_df = ppep_lut_df.merge(
-            mrgfltr_metadata_df,
-            left_on='ppep_seq',
-            right_on=PHOSPHOPEPTIDE,
-            how='inner'
-            )
-        mrgfltr_metadata_df.drop(
-            columns=[PHOSPHOPEPTIDE, 'ppep_seq'],
-            inplace=True
-            )
-        #rename columns
-        mrgfltr_metadata_df.columns = MRGFLTR_METADATA_COLUMNS
-        #ACE mrgfltr_metadata_df.info(verbose=True)
-        mrgfltr_metadata_df.to_sql(
-            'mrgfltr_metadata',
-            con=conn,
-            if_exists='append',
-            index=False,
-            method='multi'
-            )
-
-        # Close SwissProt SQLite database
-        conn.close()
-        ## ----------- Write merge/filter metadata to SQLite database (finish) -----------
-
-        output_df = output_df.merge(quant_data, how="right", left_on=PHOSPHOPEPTIDE, right_on=PHOSPHOPEPTIDE_MATCH)
-        output_cols = output_df.columns.tolist()
-        output_cols = output_cols[:-1]
-        output_df = output_df[output_cols]
-
-        #cosmetic changes to Upstream column
-        output_df[PUTATIVE_UPSTREAM_DOMAINS] = output_df[PUTATIVE_UPSTREAM_DOMAINS].fillna("") #fill the NaN with "" for those Phosphopeptides that got a "WARNING: Failed match for " in the upstream mapping
-        us_series = pandas.Series(output_df[PUTATIVE_UPSTREAM_DOMAINS])
-        i = 0
-        while i < len(us_series):
-            #turn blanks into N_A to signify the info was searched for but cannot be found
-            if us_series[i] == "":
-                us_series[i] = N_A
-            i += 1
-        output_df[PUTATIVE_UPSTREAM_DOMAINS] = us_series
-
-        end_time = time.process_time() #timer
-        print("%0.6f establisheed output [3]" % (end_time - start_time,), file=sys.stderr) #timer
-
-        (output_rows, output_cols) = output_df.shape
-
-        #output_df = output_df[cols].convert_dtypes(infer_objects=True, convert_string=True, convert_integer=True, convert_boolean=True, convert_floating=True)
-        output_df = output_df.convert_dtypes(convert_integer=True)
-
-
-        #Output onto Final CSV file
-        output_df.to_csv(output_filename_csv, index=False)
-        output_df.to_csv(output_filename_tab, quoting=None, sep='\t', index=False)
-
-        end_time = time.process_time() #timer
-        print("%0.6f wrote output [4]" % (end_time - start_time,), file=sys.stderr) #timer
-
-        print('{:>10} phosphopeptides written to output'.format(str(output_rows)))
-
-        end_time = time.process_time() #timer
-        print("%0.6f seconds of non-system CPU time were consumed" % (end_time - start_time,) , file=sys.stderr) #timer
-
-
-        #Rev. 7/1/2016
-        #Rev. 7/3/2016 : fill NaN in Upstream column to replace to N/A's
-        #Rev. 7/3/2016:  renamed Upstream column to PUTATIVE_UPSTREAM_DOMAINS
-        #Rev. 12/2/2021: Converted to Python from ipynb; use fast Aho-Corasick searching; \
-        #                read from SwissProt SQLite database
-        #Rev. 12/9/2021: Transfer code to Galaxy tool wrapper
-
-        #############################################
-        # copied from Excel Output Script.ipynb END #
-        #############################################
-
-    try:
-        catch(mqpep_getswissprot,)
-        exit(0)
-    except Exception as e:
-        exit('Internal error running mqpep_getswissprot(): %s' % (e))
-
-if __name__ == "__main__":
-    __main__()
-
+#!/usr/bin/env python
+
+# Import the packages needed
+import argparse
+import operator  # for operator.itemgetter
+import os.path
+import re
+import shutil  # for shutil.copyfile(src, dest)
+import sqlite3 as sql
+import sys  # import the sys module for exc_info
+import time
+import traceback  # for formatting stack-trace
+from codecs import getreader as cx_getreader
+
+import numpy as np
+import pandas
+
+# global constants
+N_A = "N/A"
+
+
+# ref: https://stackoverflow.com/a/8915613/15509512
+#   answers: "How to handle exceptions in a list comprehensions"
+#   usage:
+#       from math import log
+#       eggs = [1,3,0,3,2]
+#       print([x for x in [catch(log, egg) for egg in eggs] if x is not None])
+#   producing:
+#       for <built-in function log>
+#         with args (0,)
+#         exception: math domain error
+#       [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453]
+def catch(func, *args, handle=lambda e: e, **kwargs):
+
+    try:
+        return func(*args, **kwargs)
+    except Exception as e:
+        print("For %s" % str(func))
+        print("  with args %s" % str(args))
+        print("  caught exception: %s" % str(e))
+        (ty, va, tb) = sys.exc_info()
+        print("  stack trace: " + str(traceback.format_exception(ty, va, tb)))
+        exit(-1)
+        return None
+
+
+def ppep_join(x):
+    x = [i for i in x if N_A != i]
+    result = "%s" % " | ".join(x)
+    if result != "":
+        return result
+    else:
+        return N_A
+
+
+def melt_join(x):
+    tmp = {key.lower(): key for key in x}
+    result = "%s" % " | ".join([tmp[key] for key in tmp])
+    return result
+
+
+def __main__():
+    # Parse Command Line
+    parser = argparse.ArgumentParser(
+        description="Phopsphoproteomic Enrichment Pipeline Merge and Filter."
+    )
+
+    # inputs:
+    #   Phosphopeptide data for experimental results, including the intensities
+    #   and the mapping to kinase domains, in tabular format.
+    parser.add_argument(
+        "--phosphopeptides",
+        "-p",
+        nargs=1,
+        required=True,
+        dest="phosphopeptides",
+        help="Phosphopeptide data for experimental results, including the intensities and the mapping to kinase domains, in tabular format",
+    )
+    #   UniProtKB/SwissProt DB input, SQLite
+    parser.add_argument(
+        "--ppep_mapping_db",
+        "-d",
+        nargs=1,
+        required=True,
+        dest="ppep_mapping_db",
+        help="UniProtKB/SwissProt SQLite Database",
+    )
+    #   species to limit records chosed from PhosPhositesPlus
+    parser.add_argument(
+        "--species",
+        "-x",
+        nargs=1,
+        required=False,
+        default=[],
+        dest="species",
+        help="limit PhosphoSitePlus records to indicated species (field may be empty)",
+    )
+
+    # outputs:
+    #   tabular output
+    parser.add_argument(
+        "--mrgfltr_tab",
+        "-o",
+        nargs=1,
+        required=True,
+        dest="mrgfltr_tab",
+        help="Tabular output file for results",
+    )
+    #   CSV output
+    parser.add_argument(
+        "--mrgfltr_csv",
+        "-c",
+        nargs=1,
+        required=True,
+        dest="mrgfltr_csv",
+        help="CSV output file for results",
+    )
+    #   SQLite output
+    parser.add_argument(
+        "--mrgfltr_sqlite",
+        "-S",
+        nargs=1,
+        required=True,
+        dest="mrgfltr_sqlite",
+        help="SQLite output file for results",
+    )
+
+    # "Make it so!" (parse the arguments)
+    options = parser.parse_args()
+    print("options: " + str(options))
+
+    # determine phosphopeptide ("upstream map") input tabular file access
+    if options.phosphopeptides is None:
+        exit('Argument "phosphopeptides" is required but not supplied')
+    try:
+        upstream_map_filename_tab = os.path.abspath(options.phosphopeptides[0])
+        input_file = open(upstream_map_filename_tab, "r")
+        input_file.close()
+    except Exception as e:
+        exit("Error parsing phosphopeptides argument: %s" % str(e))
+
+    # determine input SQLite access
+    if options.ppep_mapping_db is None:
+        exit('Argument "ppep_mapping_db" is required but not supplied')
+    try:
+        uniprot_sqlite = os.path.abspath(options.ppep_mapping_db[0])
+        input_file = open(uniprot_sqlite, "rb")
+        input_file.close()
+    except Exception as e:
+        exit("Error parsing ppep_mapping_db argument: %s" % str(e))
+
+    # copy input SQLite dataset to output SQLite dataset
+    if options.mrgfltr_sqlite is None:
+        exit('Argument "mrgfltr_sqlite" is required but not supplied')
+    try:
+        output_sqlite = os.path.abspath(options.mrgfltr_sqlite[0])
+        shutil.copyfile(uniprot_sqlite, output_sqlite)
+    except Exception as e:
+        exit("Error copying ppep_mapping_db to mrgfltr_sqlite: %s" % str(e))
+
+    # determine species to limit records from PSP_Regulatory_Sites
+    if options.species is None:
+        exit(
+            'Argument "species" is required (and may be empty) but not supplied'
+        )
+    try:
+        if len(options.species) > 0:
+            species = options.species[0]
+        else:
+            species = ""
+    except Exception as e:
+        exit("Error parsing species argument: %s" % str(e))
+
+    # determine tabular output destination
+    if options.mrgfltr_tab is None:
+        exit('Argument "mrgfltr_tab" is required but not supplied')
+    try:
+        output_filename_tab = os.path.abspath(options.mrgfltr_tab[0])
+        output_file = open(output_filename_tab, "w")
+        output_file.close()
+    except Exception as e:
+        exit("Error parsing mrgfltr_tab argument: %s" % str(e))
+
+    # determine CSV output destination
+    if options.mrgfltr_csv is None:
+        exit('Argument "mrgfltr_csv" is required but not supplied')
+    try:
+        output_filename_csv = os.path.abspath(options.mrgfltr_csv[0])
+        output_file = open(output_filename_csv, "w")
+        output_file.close()
+    except Exception as e:
+        exit("Error parsing mrgfltr_csv argument: %s" % str(e))
+
+    def mqpep_getswissprot():
+
+        #
+        # copied from Excel Output Script.ipynb BEGIN #
+        #
+
+        #  String Constants  #################
+        DEPHOSPHOPEP = "DephosphoPep"
+        DESCRIPTION = "Description"
+        FUNCTION_PHOSPHORESIDUE = (
+            "Function Phosphoresidue(PSP=PhosphoSitePlus.org)"
+        )
+        GENE_NAME = "Gene_Name"  # Gene Name from UniProtKB
+        ON_FUNCTION = (
+            "ON_FUNCTION"  # ON_FUNCTION column from PSP_Regulatory_Sites
+        )
+        ON_NOTES = "NOTES"  # NOTES column from PSP_Regulatory_Sites
+        ON_OTHER_INTERACT = "ON_OTHER_INTERACT"  # ON_OTHER_INTERACT column from PSP_Regulatory_Sites
+        ON_PROCESS = (
+            "ON_PROCESS"  # ON_PROCESS column from PSP_Regulatory_Sites
+        )
+        ON_PROT_INTERACT = "ON_PROT_INTERACT"  # ON_PROT_INTERACT column from PSP_Regulatory_Sites
+        PHOSPHOPEPTIDE = "Phosphopeptide"
+        PHOSPHOPEPTIDE_MATCH = "Phosphopeptide_match"
+        PHOSPHORESIDUE = "Phosphoresidue"
+        PUTATIVE_UPSTREAM_DOMAINS = "Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/Phosphatases/Binding Domains"
+        SEQUENCE = "Sequence"
+        SEQUENCE10 = "Sequence10"
+        SEQUENCE7 = "Sequence7"
+        SITE_PLUSMINUS_7AA_SQL = "SITE_PLUSMINUS_7AA"
+        UNIPROT_ID = "UniProt_ID"
+        UNIPROT_SEQ_AND_META_SQL = """
+            select    Uniprot_ID, Description, Gene_Name, Sequence,
+                      Organism_Name, Organism_ID, PE, SV
+                 from UniProtKB
+             order by Sequence, UniProt_ID
+        """
+        UNIPROT_UNIQUE_SEQ_SQL = """
+            select distinct Sequence
+                       from UniProtKB
+                   group by Sequence
+        """
+        PPEP_PEP_UNIPROTSEQ_SQL = """
+            select distinct phosphopeptide, peptide, sequence
+                       from uniprotkb_pep_ppep_view
+                   order by sequence
+        """
+        PPEP_MELT_SQL = """
+            SELECT DISTINCT
+                phospho_peptide AS 'p_peptide',
+                kinase_map AS 'characterization',
+                'X' AS 'X'
+            FROM ppep_gene_site_view
+        """
+        # CREATE TABLE PSP_Regulatory_site (
+        #   site_plusminus_7AA TEXT PRIMARY KEY ON CONFLICT IGNORE,
+        #   domain             TEXT,
+        #   ON_FUNCTION        TEXT,
+        #   ON_PROCESS         TEXT,
+        #   ON_PROT_INTERACT   TEXT,
+        #   ON_OTHER_INTERACT  TEXT,
+        #   notes              TEXT,
+        #   organism           TEXT
+        # );
+        PSP_REGSITE_SQL = """
+            SELECT DISTINCT
+              SITE_PLUSMINUS_7AA ,
+              DOMAIN             ,
+              ON_FUNCTION        ,
+              ON_PROCESS         ,
+              ON_PROT_INTERACT   ,
+              ON_OTHER_INTERACT  ,
+              NOTES              ,
+              ORGANISM
+            FROM PSP_Regulatory_site
+        """
+        PPEP_ID_SQL = """
+            SELECT
+                id AS 'ppep_id',
+                seq AS 'ppep_seq'
+            FROM ppep
+        """
+        MRGFLTR_DDL = """
+        DROP VIEW  IF EXISTS mrgfltr_metadata_view;
+        DROP TABLE IF EXISTS mrgfltr_metadata;
+        CREATE TABLE mrgfltr_metadata
+          ( ppep_id                 INTEGER REFERENCES ppep(id)
+          , Sequence10              TEXT
+          , Sequence7               TEXT
+          , GeneName                TEXT
+          , Phosphoresidue          TEXT
+          , UniProtID               TEXT
+          , Description             TEXT
+          , FunctionPhosphoresidue  TEXT
+          , PutativeUpstreamDomains TEXT
+          , PRIMARY KEY (ppep_id)            ON CONFLICT IGNORE
+          )
+          ;
+        CREATE VIEW mrgfltr_metadata_view AS
+          SELECT DISTINCT
+              ppep.seq             AS phospho_peptide
+            , Sequence10
+            , Sequence7
+            , GeneName
+            , Phosphoresidue
+            , UniProtID
+            , Description
+            , FunctionPhosphoresidue
+            , PutativeUpstreamDomains
+          FROM
+            ppep, mrgfltr_metadata
+          WHERE
+              mrgfltr_metadata.ppep_id = ppep.id
+          ORDER BY
+            ppep.seq
+            ;
+        """
+
+        CITATION_INSERT_STMT = """
+          INSERT INTO Citation (
+            ObjectName,
+            CitationData
+          ) VALUES (?,?)
+          """
+        CITATION_INSERT_PSP = 'PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words "PhosphoSitePlus(R), www.phosphosite.org" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926."'
+        CITATION_INSERT_PSP_REF = 'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298, https://doi.org/10.1093/nar/gkr1122'
+
+        MRGFLTR_METADATA_COLUMNS = [
+            "ppep_id",
+            "Sequence10",
+            "Sequence7",
+            "GeneName",
+            "Phosphoresidue",
+            "UniProtID",
+            "Description",
+            "FunctionPhosphoresidue",
+            "PutativeUpstreamDomains",
+        ]
+
+        #  String Constants (end) ############
+
+        class Error(Exception):
+            """Base class for exceptions in this module."""
+
+            pass
+
+        class PreconditionError(Error):
+            """Exception raised for errors in the input.
+
+            Attributes:
+                expression -- input expression in which the error occurred
+                message -- explanation of the error
+            """
+
+            def __init__(self, expression, message):
+                self.expression = expression
+                self.message = message
+
+        # start_time = time.clock() #timer
+        start_time = time.process_time()  # timer
+
+        # get keys from upstream tabular file using readline()
+        # ref: https://stackoverflow.com/a/16713581/15509512
+        #      answer to "Use codecs to read file with correct encoding"
+        file1_encoded = open(upstream_map_filename_tab, "rb")
+        file1 = cx_getreader("latin-1")(file1_encoded)
+
+        count = 0
+        upstream_map_p_peptide_list = []
+        re_tab = re.compile("^[^\t]*")
+        while True:
+            count += 1
+            # Get next line from file
+            line = file1.readline()
+            # if line is empty
+            # end of file is reached
+            if not line:
+                break
+            if count > 1:
+                m = re_tab.match(line)
+                upstream_map_p_peptide_list.append(m[0])
+        file1.close()
+        file1_encoded.close()
+
+        # Get the list of phosphopeptides with the p's that represent the phosphorylation sites removed
+        re_phos = re.compile("p")
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f pre-read-SwissProt [0.1]" % (end_time - start_time,),
+            file=sys.stderr,
+        )
+
+        # ----------- Get SwissProt data from SQLite database (start) -----------
+        # build UniProt sequence LUT and list of unique SwissProt sequences
+
+        # Open SwissProt SQLite database
+        conn = sql.connect(uniprot_sqlite)
+        cur = conn.cursor()
+
+        # Set up structures to hold SwissProt data
+
+        uniprot_Sequence_List = []
+        UniProtSeqLUT = {}
+
+        # Execute query for unique seqs without fetching the results yet
+        uniprot_unique_seq_cur = cur.execute(UNIPROT_UNIQUE_SEQ_SQL)
+
+        while 1:
+            batch = uniprot_unique_seq_cur.fetchmany(size=50)
+            if not batch:
+                # handle case where no records are returned
+                break
+            for row in batch:
+                Sequence = row[0]
+                UniProtSeqLUT[(Sequence, DESCRIPTION)] = []
+                UniProtSeqLUT[(Sequence, GENE_NAME)] = []
+                UniProtSeqLUT[(Sequence, UNIPROT_ID)] = []
+                UniProtSeqLUT[Sequence] = []
+
+        # Execute query for seqs and metadata without fetching the results yet
+        uniprot_seq_and_meta = cur.execute(UNIPROT_SEQ_AND_META_SQL)
+
+        while 1:
+            batch = uniprot_seq_and_meta.fetchmany(size=50)
+            if not batch:
+                # handle case where no records are returned
+                break
+            for (
+                UniProt_ID,
+                Description,
+                Gene_Name,
+                Sequence,
+                OS,
+                OX,
+                PE,
+                SV,
+            ) in batch:
+                uniprot_Sequence_List.append(Sequence)
+                UniProtSeqLUT[Sequence] = Sequence
+                UniProtSeqLUT[(Sequence, UNIPROT_ID)].append(UniProt_ID)
+                UniProtSeqLUT[(Sequence, GENE_NAME)].append(Gene_Name)
+                if OS != N_A:
+                    Description += " OS=" + OS
+                if OX != N_A:
+                    Description += " OX=" + str(int(OX))
+                if Gene_Name != N_A:
+                    Description += " GN=" + Gene_Name
+                if PE != N_A:
+                    Description += " PE=" + PE
+                if SV != N_A:
+                    Description += " SV=" + SV
+                UniProtSeqLUT[(Sequence, DESCRIPTION)].append(Description)
+
+        # Close SwissProt SQLite database; clean up local variables
+        conn.close()
+        Sequence = ""
+        UniProt_ID = ""
+        Description = ""
+        Gene_Name = ""
+
+        # ----------- Get SwissProt data from SQLite database (finish) -----------
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f post-read-SwissProt [0.2]" % (end_time - start_time,),
+            file=sys.stderr,
+        )
+
+        # ----------- Get SwissProt data from SQLite database (start) -----------
+        # Open SwissProt SQLite database
+        conn = sql.connect(uniprot_sqlite)
+        cur = conn.cursor()
+
+        # Set up dictionary to aggregate results for phosphopeptides correspounding to dephosphoeptide
+        DephosphoPep_UniProtSeq_LUT = {}
+
+        # Set up dictionary to accumulate results
+        PhosphoPep_UniProtSeq_LUT = {}
+
+        # Execute query for tuples without fetching the results yet
+        ppep_pep_uniprotseq_cur = cur.execute(PPEP_PEP_UNIPROTSEQ_SQL)
+
+        while 1:
+            batch = ppep_pep_uniprotseq_cur.fetchmany(size=50)
+            if not batch:
+                # handle case where no records are returned
+                break
+            for (phospho_pep, dephospho_pep, sequence) in batch:
+                # do interesting stuff here...
+                PhosphoPep_UniProtSeq_LUT[phospho_pep] = phospho_pep
+                PhosphoPep_UniProtSeq_LUT[
+                    (phospho_pep, DEPHOSPHOPEP)
+                ] = dephospho_pep
+                if dephospho_pep not in DephosphoPep_UniProtSeq_LUT:
+                    DephosphoPep_UniProtSeq_LUT[dephospho_pep] = set()
+                    DephosphoPep_UniProtSeq_LUT[
+                        (dephospho_pep, DESCRIPTION)
+                    ] = []
+                    DephosphoPep_UniProtSeq_LUT[
+                        (dephospho_pep, GENE_NAME)
+                    ] = []
+                    DephosphoPep_UniProtSeq_LUT[
+                        (dephospho_pep, UNIPROT_ID)
+                    ] = []
+                    DephosphoPep_UniProtSeq_LUT[(dephospho_pep, SEQUENCE)] = []
+                DephosphoPep_UniProtSeq_LUT[dephospho_pep].add(phospho_pep)
+
+                if (
+                    sequence
+                    not in DephosphoPep_UniProtSeq_LUT[
+                        (dephospho_pep, SEQUENCE)
+                    ]
+                ):
+                    DephosphoPep_UniProtSeq_LUT[
+                        (dephospho_pep, SEQUENCE)
+                    ].append(sequence)
+                for phospho_pep in DephosphoPep_UniProtSeq_LUT[dephospho_pep]:
+                    if phospho_pep != phospho_pep:
+                        print(
+                            "phospho_pep:'%s' phospho_pep:'%s'"
+                            % (phospho_pep, phospho_pep)
+                        )
+                    if phospho_pep not in PhosphoPep_UniProtSeq_LUT:
+                        PhosphoPep_UniProtSeq_LUT[phospho_pep] = phospho_pep
+                        PhosphoPep_UniProtSeq_LUT[
+                            (phospho_pep, DEPHOSPHOPEP)
+                        ] = dephospho_pep
+                    r = list(
+                        zip(
+                            [s for s in UniProtSeqLUT[(sequence, UNIPROT_ID)]],
+                            [s for s in UniProtSeqLUT[(sequence, GENE_NAME)]],
+                            [
+                                s
+                                for s in UniProtSeqLUT[(sequence, DESCRIPTION)]
+                            ],
+                        )
+                    )
+                    # Sort by `UniProt_ID`
+                    #   ref: https://stackoverflow.com/a/4174955/15509512
+                    r = sorted(r, key=operator.itemgetter(0))
+                    # Get one tuple for each `phospho_pep`
+                    #   in DephosphoPep_UniProtSeq_LUT[dephospho_pep]
+                    for (upid, gn, desc) in r:
+                        # Append pseudo-tuple per UniProt_ID but only when it is not present
+                        if (
+                            upid
+                            not in DephosphoPep_UniProtSeq_LUT[
+                                (dephospho_pep, UNIPROT_ID)
+                            ]
+                        ):
+                            DephosphoPep_UniProtSeq_LUT[
+                                (dephospho_pep, UNIPROT_ID)
+                            ].append(upid)
+                            DephosphoPep_UniProtSeq_LUT[
+                                (dephospho_pep, DESCRIPTION)
+                            ].append(desc)
+                            DephosphoPep_UniProtSeq_LUT[
+                                (dephospho_pep, GENE_NAME)
+                            ].append(gn)
+
+        # Close SwissProt SQLite database; clean up local variables
+        conn.close()
+        # wipe local variables
+        phospho_pep = dephospho_pep = sequence = 0
+        upid = gn = desc = r = ""
+
+        # ----------- Get SwissProt data from SQLite database (finish) -----------
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f finished reading and decoding '%s' [0.4]"
+            % (end_time - start_time, upstream_map_filename_tab),
+            file=sys.stderr,
+        )
+
+        print(
+            "{:>10} unique upstream phosphopeptides tested".format(
+                str(len(upstream_map_p_peptide_list))
+            )
+        )
+
+        # Read in Upstream tabular file
+        # We are discarding the intensity data; so read it as text
+        upstream_data = pandas.read_table(
+            upstream_map_filename_tab, dtype="str", index_col=0
+        )
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f read Upstream Map from file [1g_1]"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        upstream_data.index = upstream_map_p_peptide_list
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f added index to Upstream Map [1g_2]"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # trim upstream_data to include only the upstream map columns
+        old_cols = upstream_data.columns.tolist()
+        i = 0
+        first_intensity = -1
+        last_intensity = -1
+        intensity_re = re.compile("Intensity.*")
+        for col_name in old_cols:
+            m = intensity_re.match(col_name)
+            if m:
+                last_intensity = i
+                if first_intensity == -1:
+                    first_intensity = i
+            i += 1
+        # print('last intensity = %d' % last_intensity)
+        col_PKCalpha = last_intensity + 2
+
+        data_in_cols = [old_cols[0]] + old_cols[
+            first_intensity: last_intensity + 1
+        ]
+
+        if upstream_data.empty:
+            print("upstream_data is empty")
+            exit(0)
+
+        data_in = upstream_data.copy(deep=True)[data_in_cols]
+
+        # Convert floating-point integers to int64 integers
+        #   ref: https://stackoverflow.com/a/68497603/15509512
+        data_in[list(data_in.columns[1:])] = (
+            data_in[list(data_in.columns[1:])]
+            .astype("float64")
+            .apply(np.int64)
+        )
+
+        # create another phosphopeptide column that will be used to join later;
+        #  MAY need to change depending on Phosphopeptide column position
+        # data_in[PHOSPHOPEPTIDE_MATCH] = data_in[data_in.columns.tolist()[0]]
+        data_in[PHOSPHOPEPTIDE_MATCH] = data_in.index
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f set data_in[PHOSPHOPEPTIDE_MATCH] [A]"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # Produce a dictionary of metadata for a single phosphopeptide.
+        #   This is a replacement of `UniProtInfo_subdict` in the original code.
+        def pseq_to_subdict(phospho_pep):
+            # Strip "p" from phosphopeptide sequence
+            dephospho_pep = re_phos.sub("", phospho_pep)
+
+            # Determine number of phosphoresidues in phosphopeptide
+            numps = len(phospho_pep) - len(dephospho_pep)
+
+            # Determine location(s) of phosphoresidue(s) in phosphopeptide
+            #   (used later for Phosphoresidue, Sequence7, and Sequence10)
+            ploc = []  # list of p locations
+            i = 0
+            p = phospho_pep
+            while i < numps:
+                ploc.append(p.find("p"))
+                p = p[: p.find("p")] + p[p.find("p") + 1:]
+                i += 1
+
+            # Establish nested dictionary
+            result = {}
+            result[SEQUENCE] = []
+            result[UNIPROT_ID] = []
+            result[DESCRIPTION] = []
+            result[GENE_NAME] = []
+            result[PHOSPHORESIDUE] = []
+            result[SEQUENCE7] = []
+            result[SEQUENCE10] = []
+
+            # Add stripped sequence to dictionary
+            result[SEQUENCE].append(dephospho_pep)
+
+            # Locate phospho_pep in PhosphoPep_UniProtSeq_LUT
+            # Caller may elect to:
+            # try:
+            #     ...
+            # except PreconditionError as pe:
+            #     print("'{expression}': {message}".format(
+            #             expression = pe.expression,
+            #             message = pe.message))
+            #             )
+            #         )
+            if dephospho_pep not in DephosphoPep_UniProtSeq_LUT:
+                raise PreconditionError(
+                    dephospho_pep,
+                    "dephosphorylated phosphopeptide not found in DephosphoPep_UniProtSeq_LUT",
+                )
+            if phospho_pep not in PhosphoPep_UniProtSeq_LUT:
+                raise PreconditionError(
+                    dephospho_pep,
+                    "no matching phosphopeptide found in PhosphoPep_UniProtSeq_LUT",
+                )
+            if (
+                dephospho_pep
+                != PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)]
+            ):
+                raise PreconditionError(
+                    dephospho_pep,
+                    "dephosphorylated phosphopeptide does not match "
+                    + "PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = "
+                    + PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)],
+                )
+            result[SEQUENCE] = [dephospho_pep]
+            result[UNIPROT_ID] = DephosphoPep_UniProtSeq_LUT[
+                (dephospho_pep, UNIPROT_ID)
+            ]
+            result[DESCRIPTION] = DephosphoPep_UniProtSeq_LUT[
+                (dephospho_pep, DESCRIPTION)
+            ]
+            result[GENE_NAME] = DephosphoPep_UniProtSeq_LUT[
+                (dephospho_pep, GENE_NAME)
+            ]
+            if (dephospho_pep, SEQUENCE) not in DephosphoPep_UniProtSeq_LUT:
+                raise PreconditionError(
+                    dephospho_pep,
+                    "no matching phosphopeptide found in DephosphoPep_UniProtSeq_LUT",
+                )
+            UniProtSeqList = DephosphoPep_UniProtSeq_LUT[
+                (dephospho_pep, SEQUENCE)
+            ]
+            if len(UniProtSeqList) < 1:
+                print(
+                    "Skipping DephosphoPep_UniProtSeq_LUT[('%s',SEQUENCE)] because value has zero length"
+                    % dephospho_pep
+                )
+                # raise PreconditionError(
+                #     "DephosphoPep_UniProtSeq_LUT[('" + dephospho_pep + ",SEQUENCE)",
+                #      'value has zero length'
+                #      )
+            for UniProtSeq in UniProtSeqList:
+                i = 0
+                phosphoresidues = []
+                seq7s_set = set()
+                seq7s = []
+                seq10s_set = set()
+                seq10s = []
+                while i < len(ploc):
+                    start = UniProtSeq.find(dephospho_pep)
+                    psite = (
+                        start + ploc[i]
+                    )  # location of phosphoresidue on protein sequence
+
+                    # add Phosphoresidue
+                    phosphosite = "p" + str(UniProtSeq)[psite] + str(psite + 1)
+                    phosphoresidues.append(phosphosite)
+
+                    # Add Sequence7
+                    if psite < 7:  # phospho_pep at N terminus
+                        seq7 = str(UniProtSeq)[: psite + 8]
+                        if seq7[psite] == "S":  # if phosphosresidue is serine
+                            pres = "s"
+                        elif (
+                            seq7[psite] == "T"
+                        ):  # if phosphosresidue is threonine
+                            pres = "t"
+                        elif (
+                            seq7[psite] == "Y"
+                        ):  # if phosphoresidue is tyrosine
+                            pres = "y"
+                        else:  # if not pSTY
+                            pres = "?"
+                        seq7 = (
+                            seq7[:psite] + pres + seq7[psite + 1: psite + 8]
+                        )
+                        while (
+                            len(seq7) < 15
+                        ):  # add appropriate number of "_" to the front
+                            seq7 = "_" + seq7
+                    elif (
+                        len(UniProtSeq) - psite < 8
+                    ):  # phospho_pep at C terminus
+                        seq7 = str(UniProtSeq)[psite - 7:]
+                        if seq7[7] == "S":
+                            pres = "s"
+                        elif seq7[7] == "T":
+                            pres = "t"
+                        elif seq7[7] == "Y":
+                            pres = "y"
+                        else:
+                            pres = "?"
+                        seq7 = seq7[:7] + pres + seq7[8:]
+                        while (
+                            len(seq7) < 15
+                        ):  # add appropriate number of "_" to the back
+                            seq7 = seq7 + "_"
+                    else:
+                        seq7 = str(UniProtSeq)[psite - 7: psite + 8]
+                        pres = ""  # phosphoresidue
+                        if seq7[7] == "S":  # if phosphosresidue is serine
+                            pres = "s"
+                        elif seq7[7] == "T":  # if phosphosresidue is threonine
+                            pres = "t"
+                        elif seq7[7] == "Y":  # if phosphoresidue is tyrosine
+                            pres = "y"
+                        else:  # if not pSTY
+                            pres = "?"
+                        seq7 = seq7[:7] + pres + seq7[8:]
+                    if seq7 not in seq7s_set:
+                        seq7s.append(seq7)
+                        seq7s_set.add(seq7)
+
+                    # add Sequence10
+                    if psite < 10:  # phospho_pep at N terminus
+                        seq10 = (
+                            str(UniProtSeq)[:psite]
+                            + "p"
+                            + str(UniProtSeq)[psite: psite + 11]
+                        )
+                    elif (
+                        len(UniProtSeq) - psite < 11
+                    ):  # phospho_pep at C terminus
+                        seq10 = (
+                            str(UniProtSeq)[psite - 10: psite]
+                            + "p"
+                            + str(UniProtSeq)[psite:]
+                        )
+                    else:
+                        seq10 = str(UniProtSeq)[psite - 10: psite + 11]
+                        seq10 = seq10[:10] + "p" + seq10[10:]
+                    if seq10 not in seq10s_set:
+                        seq10s.append(seq10)
+                        seq10s_set.add(seq10)
+
+                    i += 1
+
+                result[PHOSPHORESIDUE].append(phosphoresidues)
+                result[SEQUENCE7].append(seq7s)
+                # result[SEQUENCE10] is a list of lists of strings
+                result[SEQUENCE10].append(seq10s)
+
+            r = list(
+                zip(
+                    result[UNIPROT_ID],
+                    result[GENE_NAME],
+                    result[DESCRIPTION],
+                    result[PHOSPHORESIDUE],
+                )
+            )
+            # Sort by `UniProt_ID`
+            #   ref: https://stackoverflow.com//4174955/15509512
+            s = sorted(r, key=operator.itemgetter(0))
+
+            result[UNIPROT_ID] = []
+            result[GENE_NAME] = []
+            result[DESCRIPTION] = []
+            result[PHOSPHORESIDUE] = []
+
+            for r in s:
+                result[UNIPROT_ID].append(r[0])
+                result[GENE_NAME].append(r[1])
+                result[DESCRIPTION].append(r[2])
+                result[PHOSPHORESIDUE].append(r[3])
+
+            # convert lists to strings in the dictionary
+            for key, value in result.items():
+                if key not in [PHOSPHORESIDUE, SEQUENCE7, SEQUENCE10]:
+                    result[key] = "; ".join(map(str, value))
+                elif key in [SEQUENCE10]:
+                    # result[SEQUENCE10] is a list of lists of strings
+                    joined_value = ""
+                    joined_set = set()
+                    sep = ""
+                    for valL in value:
+                        # valL is a list of strings
+                        for val in valL:
+                            # val is a string
+                            if val not in joined_set:
+                                joined_set.add(val)
+                                # joined_value += sep + '; '.join(map(str, val))
+                                joined_value += sep + val
+                                sep = "; "
+                    # joined_value is a string
+                    result[key] = joined_value
+
+            newstring = "; ".join(
+                [", ".join(prez) for prez in result[PHOSPHORESIDUE]]
+            )
+            # #separate the isoforms in PHOSPHORESIDUE column with ";"
+            # oldstring = result[PHOSPHORESIDUE]
+            # oldlist = list(oldstring)
+            # newstring = ""
+            # i = 0
+            # for e in oldlist:
+            #     if e == ";":
+            #         if numps > 1:
+            #             if i%numps:
+            #                 newstring = newstring + ";"
+            #             else:
+            #                 newstring = newstring + ","
+            #         else:
+            #             newstring = newstring + ";"
+            #         i +=1
+            #     else:
+            #         newstring = newstring + e
+            result[PHOSPHORESIDUE] = newstring
+
+            # separate sequence7's by |
+            oldstring = result[SEQUENCE7]
+            oldlist = oldstring
+            newstring = ""
+            for ol in oldlist:
+                for e in ol:
+                    if e == ";":
+                        newstring = newstring + " |"
+                    elif len(newstring) > 0 and 1 > newstring.count(e):
+                        newstring = newstring + " | " + e
+                    elif 1 > newstring.count(e):
+                        newstring = newstring + e
+            result[SEQUENCE7] = newstring
+
+            return [phospho_pep, result]
+
+        # Construct list of [string, dictionary] lists
+        #   where the dictionary provides the SwissProt metadata for a phosphopeptide
+        result_list = [
+            catch(pseq_to_subdict, psequence)
+            for psequence in data_in[PHOSPHOPEPTIDE_MATCH]
+        ]
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f added SwissProt annotations to phosphopeptides [B]"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # Construct dictionary from list of lists
+        #   ref: https://www.8bitavenue.com/how-to-convert-list-of-lists-to-dictionary-in-python/
+        UniProt_Info = {
+            result[0]: result[1]
+            for result in result_list
+            if result is not None
+        }
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f create dictionary mapping phosphopeptide to metadata dictionary [C]"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # cosmetic: add N_A to phosphopeptide rows with no hits
+        p_peptide_list = []
+        for key in UniProt_Info:
+            p_peptide_list.append(key)
+            for nestedKey in UniProt_Info[key]:
+                if UniProt_Info[key][nestedKey] == "":
+                    UniProt_Info[key][nestedKey] = N_A
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f performed cosmetic clean-up [D]" % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # convert UniProt_Info dictionary to dataframe
+        uniprot_df = pandas.DataFrame.transpose(
+            pandas.DataFrame.from_dict(UniProt_Info)
+        )
+
+        # reorder columns to match expected output file
+        uniprot_df[
+            PHOSPHOPEPTIDE
+        ] = uniprot_df.index  # make index a column too
+
+        cols = uniprot_df.columns.tolist()
+        # cols = [cols[-1]]+cols[4:6]+[cols[1]]+[cols[2]]+[cols[6]]+[cols[0]]
+        # uniprot_df = uniprot_df[cols]
+        uniprot_df = uniprot_df[
+            [
+                PHOSPHOPEPTIDE,
+                SEQUENCE10,
+                SEQUENCE7,
+                GENE_NAME,
+                PHOSPHORESIDUE,
+                UNIPROT_ID,
+                DESCRIPTION,
+            ]
+        ]
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f reordered columns to match expected output file [1]"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # concat to split then groupby to collapse
+        seq7_df = pandas.concat(
+            [
+                pandas.Series(row[PHOSPHOPEPTIDE], row[SEQUENCE7].split(" | "))
+                for _, row in uniprot_df.iterrows()
+            ]
+        ).reset_index()
+        seq7_df.columns = [SEQUENCE7, PHOSPHOPEPTIDE]
+
+        # --- -------------- begin read PSP_Regulatory_sites ---------------------------------
+        # read in PhosphoSitePlus Regulatory Sites dataset
+        # ----------- Get PhosphoSitePlus Regulatory Sites data from SQLite database (start) -----------
+        conn = sql.connect(uniprot_sqlite)
+        regsites_df = pandas.read_sql_query(PSP_REGSITE_SQL, conn)
+        # Close SwissProt SQLite database
+        conn.close()
+        # ... -------------- end read PSP_Regulatory_sites ------------------------------------
+
+        # keep only the human entries in dataframe
+        if len(species) > 0:
+            print(
+                'Limit PhosphoSitesPlus records to species "' + species + '"'
+            )
+            regsites_df = regsites_df[regsites_df.ORGANISM == species]
+
+        # merge the seq7 df with the regsites df based off of the sequence7
+        merge_df = seq7_df.merge(
+            regsites_df,
+            left_on=SEQUENCE7,
+            right_on=SITE_PLUSMINUS_7AA_SQL,
+            how="left",
+        )
+
+        # after merging df, select only the columns of interest - note that PROTEIN is absent here
+        merge_df = merge_df[
+            [
+                PHOSPHOPEPTIDE,
+                SEQUENCE7,
+                ON_FUNCTION,
+                ON_PROCESS,
+                ON_PROT_INTERACT,
+                ON_OTHER_INTERACT,
+                ON_NOTES,
+            ]
+        ]
+        # combine column values of interest into one FUNCTION_PHOSPHORESIDUE column"
+        merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[ON_FUNCTION].str.cat(
+            merge_df[ON_PROCESS], sep="; ", na_rep=""
+        )
+        merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[
+            FUNCTION_PHOSPHORESIDUE
+        ].str.cat(merge_df[ON_PROT_INTERACT], sep="; ", na_rep="")
+        merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[
+            FUNCTION_PHOSPHORESIDUE
+        ].str.cat(merge_df[ON_OTHER_INTERACT], sep="; ", na_rep="")
+        merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[
+            FUNCTION_PHOSPHORESIDUE
+        ].str.cat(merge_df[ON_NOTES], sep="; ", na_rep="")
+
+        # remove the columns that were combined
+        merge_df = merge_df[
+            [PHOSPHOPEPTIDE, SEQUENCE7, FUNCTION_PHOSPHORESIDUE]
+        ]
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f merge regsite metadata [1a]" % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # cosmetic changes to Function Phosphoresidue column
+        fp_series = pandas.Series(merge_df[FUNCTION_PHOSPHORESIDUE])
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f more cosmetic changes [1b]" % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        i = 0
+        while i < len(fp_series):
+            # remove the extra ";" so that it looks more professional
+            if fp_series[i] == "; ; ; ; ":  # remove ; from empty hits
+                fp_series[i] = ""
+            while fp_series[i].endswith("; "):  # remove ; from the ends
+                fp_series[i] = fp_series[i][:-2]
+            while fp_series[i].startswith("; "):  # remove ; from the beginning
+                fp_series[i] = fp_series[i][2:]
+            fp_series[i] = fp_series[i].replace("; ; ; ; ", "; ")
+            fp_series[i] = fp_series[i].replace("; ; ; ", "; ")
+            fp_series[i] = fp_series[i].replace("; ; ", "; ")
+
+            # turn blanks into N_A to signify the info was searched for but cannot be found
+            if fp_series[i] == "":
+                fp_series[i] = N_A
+
+            i += 1
+        merge_df[FUNCTION_PHOSPHORESIDUE] = fp_series
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f cleaned up semicolons [1c]" % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # merge uniprot df with merge df
+        uniprot_regsites_merged_df = uniprot_df.merge(
+            merge_df,
+            left_on=PHOSPHOPEPTIDE,
+            right_on=PHOSPHOPEPTIDE,
+            how="left",
+        )
+
+        # collapse the merged df
+        uniprot_regsites_collapsed_df = pandas.DataFrame(
+            uniprot_regsites_merged_df.groupby(PHOSPHOPEPTIDE)[
+                FUNCTION_PHOSPHORESIDUE
+            ].apply(lambda x: ppep_join(x))
+        )
+        # .apply(lambda x: "%s" % ' | '.join(x)))
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f collapsed pandas dataframe [1d]" % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        uniprot_regsites_collapsed_df[
+            PHOSPHOPEPTIDE
+        ] = (
+            uniprot_regsites_collapsed_df.index
+        )  # add df index as its own column
+
+        # rename columns
+        uniprot_regsites_collapsed_df.columns = [
+            FUNCTION_PHOSPHORESIDUE,
+            "ppp",
+        ]
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f selected columns to be merged to uniprot_df [1e]"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # add columns based on Sequence7 matching site_+/-7_AA
+        uniprot_regsite_df = pandas.merge(
+            left=uniprot_df,
+            right=uniprot_regsites_collapsed_df,
+            how="left",
+            left_on=PHOSPHOPEPTIDE,
+            right_on="ppp",
+        )
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f added columns based on Sequence7 matching site_+/-7_AA [1f]"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        data_in.rename(
+            {"Protein description": PHOSPHOPEPTIDE},
+            axis="columns",
+            inplace=True,
+        )
+
+        # data_in.sort_values(PHOSPHOPEPTIDE_MATCH, inplace=True, kind='mergesort')
+        res2 = sorted(
+            data_in[PHOSPHOPEPTIDE_MATCH].tolist(), key=lambda s: s.casefold()
+        )
+        data_in = data_in.loc[res2]
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f sorting time [1f]" % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        cols = [old_cols[0]] + old_cols[col_PKCalpha - 1:]
+        upstream_data = upstream_data[cols]
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f refactored columns for Upstream Map [1g]"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # #rename upstream columns in new list
+        # new_cols = []
+        # for name in cols:
+        #     if "_NetworKIN" in name:
+        #         name = name.split("_")[0]
+        #     if " motif" in name:
+        #         name = name.split(" motif")[0]
+        #     if " sequence " in name:
+        #         name = name.split(" sequence")[0]
+        #     if "_Phosida" in name:
+        #         name = name.split("_")[0]
+        #     if "_PhosphoSite" in name:
+        #         name = name.split("_")[0]
+        #     new_cols.append(name)
+
+        # rename upstream columns in new list
+        def col_rename(name):
+            if "_NetworKIN" in name:
+                name = name.split("_")[0]
+            if " motif" in name:
+                name = name.split(" motif")[0]
+            if " sequence " in name:
+                name = name.split(" sequence")[0]
+            if "_Phosida" in name:
+                name = name.split("_")[0]
+            if "_PhosphoSite" in name:
+                name = name.split("_")[0]
+            return name
+
+        new_cols = [col_rename(col) for col in cols]
+        upstream_data.columns = new_cols
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f renamed columns for Upstream Map [1h_1]"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # Create upstream_data_cast as a copy of upstream_data
+        #   but with first column substituted by the phosphopeptide sequence
+        upstream_data_cast = upstream_data.copy()
+        new_cols_cast = new_cols
+        new_cols_cast[0] = "p_peptide"
+        upstream_data_cast.columns = new_cols_cast
+        upstream_data_cast["p_peptide"] = upstream_data.index
+
+        # --- -------------- begin read upstream_data_melt ------------------------------------
+        # ----------- Get melted kinase mapping data from SQLite database (start) -----------
+        conn = sql.connect(uniprot_sqlite)
+        upstream_data_melt_df = pandas.read_sql_query(PPEP_MELT_SQL, conn)
+        # Close SwissProt SQLite database
+        conn.close()
+        upstream_data_melt = upstream_data_melt_df.copy()
+        upstream_data_melt.columns = ["p_peptide", "characterization", "X"]
+        upstream_data_melt["characterization"] = [
+            col_rename(s) for s in upstream_data_melt["characterization"]
+        ]
+
+        print(
+            "%0.6f upstream_data_melt_df initially has %d rows"
+            % (end_time - start_time, len(upstream_data_melt.axes[0])),
+            file=sys.stderr,
+        )
+        # ref: https://stackoverflow.com/a/27360130/15509512
+        #      e.g. df.drop(df[df.score < 50].index, inplace=True)
+        upstream_data_melt.drop(
+            upstream_data_melt[upstream_data_melt.X != "X"].index, inplace=True
+        )
+        print(
+            "%0.6f upstream_data_melt_df pre-dedup has %d rows"
+            % (end_time - start_time, len(upstream_data_melt.axes[0])),
+            file=sys.stderr,
+        )
+        # ----------- Get melted kinase mapping data from SQLite database (finish) -----------
+        # ... -------------- end read upstream_data_melt --------------------------------------
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f melted and minimized Upstream Map dataframe [1h_2]"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+        # ... end read upstream_data_melt
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f indexed melted Upstream Map [1h_2a]"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        upstream_delta_melt_LoL = upstream_data_melt.values.tolist()
+
+        melt_dict = {}
+        for key in upstream_map_p_peptide_list:
+            melt_dict[key] = []
+
+        for el in upstream_delta_melt_LoL:
+            (p_peptide, characterization, X) = tuple(el)
+            if p_peptide in melt_dict:
+                melt_dict[p_peptide].append(characterization)
+            else:
+                exit(
+                    'Phosphopeptide %s not found in ppep_mapping_db: "phopsphopeptides" and "ppep_mapping_db" must both originate from the same run of mqppep_kinase_mapping'
+                    % (p_peptide)
+                )
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f appended peptide characterizations [1h_2b]"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # for key in upstream_map_p_peptide_list:
+        #     melt_dict[key] = ' | '.join(melt_dict[key])
+
+        for key in upstream_map_p_peptide_list:
+            melt_dict[key] = melt_join(melt_dict[key])
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f concatenated multiple characterizations [1h_2c]"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # map_dict is a dictionary of dictionaries
+        map_dict = {}
+        for key in upstream_map_p_peptide_list:
+            map_dict[key] = {}
+            map_dict[key][PUTATIVE_UPSTREAM_DOMAINS] = melt_dict[key]
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f instantiated map dictionary [2]" % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # convert map_dict to dataframe
+        map_df = pandas.DataFrame.transpose(
+            pandas.DataFrame.from_dict(map_dict)
+        )
+        map_df["p-peptide"] = map_df.index  # make index a column too
+        cols_map_df = map_df.columns.tolist()
+        cols_map_df = [cols_map_df[1]] + [cols_map_df[0]]
+        map_df = map_df[cols_map_df]
+
+        # join map_df to uniprot_regsite_df
+        output_df = uniprot_regsite_df.merge(
+            map_df, how="left", left_on=PHOSPHOPEPTIDE, right_on="p-peptide"
+        )
+
+        output_df = output_df[
+            [
+                PHOSPHOPEPTIDE,
+                SEQUENCE10,
+                SEQUENCE7,
+                GENE_NAME,
+                PHOSPHORESIDUE,
+                UNIPROT_ID,
+                DESCRIPTION,
+                FUNCTION_PHOSPHORESIDUE,
+                PUTATIVE_UPSTREAM_DOMAINS,
+            ]
+        ]
+
+        # cols_output_prelim = output_df.columns.tolist()
+        #
+        # print("cols_output_prelim")
+        # print(cols_output_prelim)
+        #
+        # cols_output = cols_output_prelim[:8]+[cols_output_prelim[9]]+[cols_output_prelim[10]]
+        #
+        # print("cols_output with p-peptide")
+        # print(cols_output)
+        #
+        # cols_output = [col for col in cols_output if not col == "p-peptide"]
+        #
+        # print("cols_output")
+        # print(cols_output)
+        #
+        # output_df = output_df[cols_output]
+
+        # join output_df back to quantitative columns in data_in df
+        quant_cols = data_in.columns.tolist()
+        quant_cols = quant_cols[1:]
+        quant_data = data_in[quant_cols]
+
+        # ----------- Write merge/filter metadata to SQLite database (start) -----------
+        # Open SwissProt SQLite database
+        conn = sql.connect(output_sqlite)
+        cur = conn.cursor()
+
+        cur.executescript(MRGFLTR_DDL)
+
+        cur.execute(
+            CITATION_INSERT_STMT,
+            ("mrgfltr_metadata_view", CITATION_INSERT_PSP),
+        )
+        cur.execute(
+            CITATION_INSERT_STMT, ("mrgfltr_metadata", CITATION_INSERT_PSP)
+        )
+        cur.execute(
+            CITATION_INSERT_STMT,
+            ("mrgfltr_metadata_view", CITATION_INSERT_PSP_REF),
+        )
+        cur.execute(
+            CITATION_INSERT_STMT, ("mrgfltr_metadata", CITATION_INSERT_PSP_REF)
+        )
+
+        # Read ppep-to-sequence LUT
+        ppep_lut_df = pandas.read_sql_query(PPEP_ID_SQL, conn)
+        # write only metadata for merged/filtered records to SQLite
+        mrgfltr_metadata_df = output_df.copy()
+        # replace phosphopeptide seq with ppep.id
+        mrgfltr_metadata_df = ppep_lut_df.merge(
+            mrgfltr_metadata_df,
+            left_on="ppep_seq",
+            right_on=PHOSPHOPEPTIDE,
+            how="inner",
+        )
+        mrgfltr_metadata_df.drop(
+            columns=[PHOSPHOPEPTIDE, "ppep_seq"], inplace=True
+        )
+        # rename columns
+        mrgfltr_metadata_df.columns = MRGFLTR_METADATA_COLUMNS
+        mrgfltr_metadata_df.to_sql(
+            "mrgfltr_metadata",
+            con=conn,
+            if_exists="append",
+            index=False,
+            method="multi",
+        )
+
+        # Close SwissProt SQLite database
+        conn.close()
+        # ----------- Write merge/filter metadata to SQLite database (finish) -----------
+
+        output_df = output_df.merge(
+            quant_data,
+            how="right",
+            left_on=PHOSPHOPEPTIDE,
+            right_on=PHOSPHOPEPTIDE_MATCH,
+        )
+        output_cols = output_df.columns.tolist()
+        output_cols = output_cols[:-1]
+        output_df = output_df[output_cols]
+
+        # cosmetic changes to Upstream column
+        output_df[PUTATIVE_UPSTREAM_DOMAINS] = output_df[
+            PUTATIVE_UPSTREAM_DOMAINS
+        ].fillna(
+            ""
+        )  # fill the NaN with "" for those Phosphopeptides that got a "WARNING: Failed match for " in the upstream mapping
+        us_series = pandas.Series(output_df[PUTATIVE_UPSTREAM_DOMAINS])
+        i = 0
+        while i < len(us_series):
+            # turn blanks into N_A to signify the info was searched for but cannot be found
+            if us_series[i] == "":
+                us_series[i] = N_A
+            i += 1
+        output_df[PUTATIVE_UPSTREAM_DOMAINS] = us_series
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f establisheed output [3]" % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        (output_rows, output_cols) = output_df.shape
+
+        output_df = output_df.convert_dtypes(convert_integer=True)
+
+        # Output onto Final CSV file
+        output_df.to_csv(output_filename_csv, index=False)
+        output_df.to_csv(
+            output_filename_tab, quoting=None, sep="\t", index=False
+        )
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f wrote output [4]" % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        print(
+            "{:>10} phosphopeptides written to output".format(str(output_rows))
+        )
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f seconds of non-system CPU time were consumed"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # Rev. 7/1/2016
+        # Rev. 7/3/2016 : fill NaN in Upstream column to replace to N/A's
+        # Rev. 7/3/2016:  renamed Upstream column to PUTATIVE_UPSTREAM_DOMAINS
+        # Rev. 12/2/2021: Converted to Python from ipynb; use fast Aho-Corasick searching; \
+        #                read from SwissProt SQLite database
+        # Rev. 12/9/2021: Transfer code to Galaxy tool wrapper
+
+        #
+        # copied from Excel Output Script.ipynb END #
+        #
+
+    try:
+        catch(
+            mqpep_getswissprot,
+        )
+        exit(0)
+    except Exception as e:
+        exit("Internal error running mqpep_getswissprot(): %s" % (e))
+
+
+if __name__ == "__main__":
+    __main__()
--- a/repository_dependencies.xml	Mon Mar 07 20:43:57 2022 +0000
+++ b/repository_dependencies.xml	Thu Mar 10 23:42:48 2022 +0000
@@ -1,5 +1,5 @@
 <?xml version="1.0" ?>
 <repositories description="Suite for preprocessing and ANOVA of MaxQuant results using LC-MS proteomics data from phosphoproteomic enrichment.">
     <repository name="mqppep_preproc" owner="eschen42" toolshed="https://testtoolshed.g2.bx.psu.edu" changeset_revision="8be24b489992"/>
-    <repository name="mqppep_anova" owner="eschen42" toolshed="https://testtoolshed.g2.bx.psu.edu" changeset_revision="f9c13bc8e7ad"/>
+    <repository name="mqppep_anova" owner="eschen42" toolshed="https://testtoolshed.g2.bx.psu.edu" changeset_revision="cfc65ae227f8"/>
 </repositories>
\ No newline at end of file
--- a/search_ppep.py	Mon Mar 07 20:43:57 2022 +0000
+++ b/search_ppep.py	Thu Mar 10 23:42:48 2022 +0000
@@ -3,20 +3,19 @@

 import argparse
 import os.path
+import re
 import sqlite3
-import re
+import sys  # import the sys module for exc_info
+import time
+import traceback  # import the traceback module for format_exception
 from codecs import getreader as cx_getreader
-import time

 # For Aho-Corasick search for fixed set of substrings
 # - add_word
 # - make_automaton
 # - iter
 import ahocorasick
-# Support map over auto.iter(...)
-# - itemgetter
-import operator
-#import hashlib
+

 # ref: https://stackoverflow.com/a/8915613/15509512
 #   answers: "How to handle exceptions in a list comprehensions"
@@ -29,7 +28,8 @@
 #         with args (0,)
 #         exception: math domain error
 #       [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453]
-def catch(func, *args, handle=lambda e : e, **kwargs):
+def catch(func, *args, handle=lambda e: e, **kwargs):
+
     try:
         return func(*args, **kwargs)
     except Exception as e:
@@ -38,13 +38,13 @@
         print("  caught exception: %s" % str(e))
         (ty, va, tb) = sys.exc_info()
         print("  stack trace: " + str(traceback.format_exception(ty, va, tb)))
-        #exit(-1)
-        return None # was handle(e)
+        # exit(-1)
+        return None  # was handle(e)
+

 def __main__():
-    ITEM_GETTER = operator.itemgetter(1)

-    DROP_TABLES_SQL = '''
+    DROP_TABLES_SQL = """
         DROP VIEW  IF EXISTS ppep_gene_site_view;
         DROP VIEW  IF EXISTS uniprot_view;
         DROP VIEW  IF EXISTS uniprotkb_pep_ppep_view;
@@ -59,9 +59,9 @@
         DROP TABLE IF EXISTS ppep_gene_site;
         DROP TABLE IF EXISTS ppep_metadata;
         DROP TABLE IF EXISTS ppep_intensity;
-    '''
+    """

-    CREATE_TABLES_SQL = '''
+    CREATE_TABLES_SQL = """
         CREATE TABLE deppep
           ( id INTEGER PRIMARY KEY
           , seq TEXT UNIQUE                            ON CONFLICT IGNORE
@@ -213,53 +213,55 @@
             AND
               ppep_intensity.ppep_id = ppep.id
           ;
-    '''
+    """

-    UNIPROT_SEQ_AND_ID_SQL = '''
+    UNIPROT_SEQ_AND_ID_SQL = """
         select    Sequence, Uniprot_ID
              from UniProtKB
-    '''
+    """

     # Parse Command Line
     parser = argparse.ArgumentParser(
-        description='Phopsphoproteomic Enrichment phosphopeptide SwissProt search (in place in SQLite DB).'
-        )
+        description="Phopsphoproteomic Enrichment phosphopeptide SwissProt search (in place in SQLite DB)."
+    )

     # inputs:
     #   Phosphopeptide data for experimental results, including the intensities
     #   and the mapping to kinase domains, in tabular format.
     parser.add_argument(
-        '--phosphopeptides', '-p',
+        "--phosphopeptides",
+        "-p",
         nargs=1,
         required=True,
-        dest='phosphopeptides',
-        help='Phosphopeptide data for experimental results, generated by the Phopsphoproteomic Enrichment Localization Filter tool'
-        )
+        dest="phosphopeptides",
+        help="Phosphopeptide data for experimental results, generated by the Phopsphoproteomic Enrichment Localization Filter tool",
+    )
     parser.add_argument(
-        '--uniprotkb', '-u',
+        "--uniprotkb",
+        "-u",
         nargs=1,
         required=True,
-        dest='uniprotkb',
-        help='UniProtKB/Swiss-Prot data, converted from FASTA format by the Phopsphoproteomic Enrichment Kinase Mapping tool'
-        )
+        dest="uniprotkb",
+        help="UniProtKB/Swiss-Prot data, converted from FASTA format by the Phopsphoproteomic Enrichment Kinase Mapping tool",
+    )
     parser.add_argument(
-        '--schema',
-        action='store_true',
-        dest='db_schema',
-        help='show updated database schema'
-        )
+        "--schema",
+        action="store_true",
+        dest="db_schema",
+        help="show updated database schema",
+    )
     parser.add_argument(
-        '--warn-duplicates',
-        action='store_true',
-        dest='warn_duplicates',
-        help='show warnings for duplicated sequences'
-        )
+        "--warn-duplicates",
+        action="store_true",
+        dest="warn_duplicates",
+        help="show warnings for duplicated sequences",
+    )
     parser.add_argument(
-        '--verbose',
-        action='store_true',
-        dest='verbose',
-        help='show somewhat verbose program tracing'
-        )
+        "--verbose",
+        action="store_true",
+        dest="verbose",
+        help="show somewhat verbose program tracing",
+    )
     # "Make it so!" (parse the arguments)
     options = parser.parse_args()
     if options.verbose:
@@ -269,9 +271,9 @@
     if options.phosphopeptides is None:
         exit('Argument "phosphopeptides" is required but not supplied')
     try:
-        f_name  = os.path.abspath(options.phosphopeptides[0])
+        f_name = os.path.abspath(options.phosphopeptides[0])
     except Exception as e:
-        exit('Error parsing phosphopeptides argument: %s' % (e))
+        exit("Error parsing phosphopeptides argument: %s" % (e))

     # path to SQLite input/output tabular file
     if options.uniprotkb is None:
@@ -279,7 +281,7 @@
     try:
         db_name = os.path.abspath(options.uniprotkb[0])
     except Exception as e:
-        exit('Error parsing uniprotkb argument: %s' % (e))
+        exit("Error parsing uniprotkb argument: %s" % (e))

     # print("options.schema is %d" % options.db_schema)

@@ -302,21 +304,23 @@
     cur.executescript(CREATE_TABLES_SQL)

     if options.db_schema:
-        print("\nAfter creating tables/views that are to be created, schema is:")
+        print(
+            "\nAfter creating tables/views that are to be created, schema is:"
+        )
         cur.execute("SELECT * FROM sqlite_schema")
         for row in cur.fetchall():
             if row[4] is not None:
                 print("%s;" % row[4])

     def generate_ppep(f):
-        #get keys from upstream tabular file using readline()
+        # get keys from upstream tabular file using readline()
         # ref: https://stackoverflow.com/a/16713581/15509512
         #      answer to "Use codecs to read file with correct encoding"
-        file1_encoded = open(f, 'rb')
+        file1_encoded = open(f, "rb")
         file1 = cx_getreader("latin-1")(file1_encoded)

         count = 0
-        re_tab = re.compile('^[^\t]*')
+        re_tab = re.compile("^[^\t]*")
         re_quote = re.compile('"')
         while True:
             count += 1
@@ -328,7 +332,7 @@
                 break
             if count > 1:
                 m = re_tab.match(line)
-                m = re_quote.sub('',m[0])
+                m = re_quote.sub("", m[0])
                 yield m
         file1.close()
         file1_encoded.close()
@@ -339,7 +343,7 @@
     #   - https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm
     #   - https://en.wikipedia.org/wiki/Trie
     auto = ahocorasick.Automaton()
-    re_phos = re.compile('p')
+    re_phos = re.compile("p")
     # scrub out unsearchable characters per section
     #   "Match the p_peptides to the @sequences array:"
     # of the original
@@ -350,17 +354,17 @@
     #   $tmp_p_peptide =~ s/\_//g;
     #   $tmp_p_peptide =~ s/\.//g;
     #
-    re_scrub = re.compile('0-9_.#')
+    re_scrub = re.compile("0-9_.#")
     ppep_count = 0
     for ppep in generate_ppep(f_name):
         ppep_count += 1
         add_to_trie = False
-        #print(ppep)
-        scrubbed = re_scrub.sub('',ppep)
-        deppep = re_phos.sub('',scrubbed)
+        # print(ppep)
+        scrubbed = re_scrub.sub("", ppep)
+        deppep = re_phos.sub("", scrubbed)
         if options.verbose:
-            print("deppep: %s; scrubbed: %s" % (deppep,scrubbed))
-        #print(deppep)
+            print("deppep: %s; scrubbed: %s" % (deppep, scrubbed))
+        # print(deppep)
         cur.execute("SELECT id FROM deppep WHERE seq = (?)", (deppep,))
         if cur.fetchone() is None:
             add_to_trie = True
@@ -368,13 +372,13 @@
         cur.execute("SELECT id FROM deppep WHERE seq = (?)", (deppep,))
         deppep_id = cur.fetchone()[0]
         if add_to_trie:
-            #print((deppep_id, deppep))
+            # print((deppep_id, deppep))
             # Build the trie
             auto.add_word(deppep, (deppep_id, deppep))
         cur.execute(
             "INSERT INTO ppep(seq, scrubbed, deppep_id) VALUES (?,?,?)",
-            (ppep, scrubbed, deppep_id)
-            )
+            (ppep, scrubbed, deppep_id),
+        )
     # def generate_deppep():
     #     cur.execute("SELECT seq FROM deppep")
     #     for row in cur.fetchall():
@@ -383,18 +387,20 @@
     for row in cur.fetchall():
         deppep_count = row[0]

-    cur.execute("SELECT count(*) FROM (SELECT Sequence FROM UniProtKB GROUP BY Sequence)")
+    cur.execute(
+        "SELECT count(*) FROM (SELECT Sequence FROM UniProtKB GROUP BY Sequence)"
+    )
     for row in cur.fetchall():
         sequence_count = row[0]

-    print(
-      "%d phosphopeptides were read from input" % ppep_count
-      )
+    print("%d phosphopeptides were read from input" % ppep_count)
     print(
-      "%d corresponding dephosphopeptides are represented in input" % deppep_count
-      )
+        "%d corresponding dephosphopeptides are represented in input"
+        % deppep_count
+    )
     # Look for cases where both Gene_Name and Sequence are identical
-    cur.execute('''
+    cur.execute(
+        """
       SELECT Uniprot_ID, Gene_Name, Sequence
       FROM   UniProtKB
       WHERE  Sequence IN (
@@ -404,12 +410,15 @@
         HAVING   count(*) > 1
         )
       ORDER BY Sequence
-      ''')
+      """
+    )
     duplicate_count = 0
-    old_seq = ''
+    old_seq = ""
     for row in cur.fetchall():
         if duplicate_count == 0:
-            print("\nEach of the following sequences is associated with several accession IDs (which are listed in the first column) but the same gene ID (which is listed in the second column).")
+            print(
+                "\nEach of the following sequences is associated with several accession IDs (which are listed in the first column) but the same gene ID (which is listed in the second column)."
+            )
         if row[2] != old_seq:
             old_seq = row[2]
             duplicate_count += 1
@@ -419,47 +428,57 @@
             if options.warn_duplicates:
                 print("%s\t%s" % (row[0], row[1]))
     if duplicate_count > 0:
-        print("\n%d sequences have duplicated accession IDs\n" % duplicate_count)
+        print(
+            "\n%d sequences have duplicated accession IDs\n" % duplicate_count
+        )

-    print(
-      "%s accession sequences will be searched\n" % sequence_count
-      )
+    print("%s accession sequences will be searched\n" % sequence_count)

-    #print(auto.dump())
+    # print(auto.dump())

     # Convert the trie to an automaton (a finite-state machine)
     auto.make_automaton()

     # Execute query for seqs and metadata without fetching the results yet
     uniprot_seq_and_id = cur.execute(UNIPROT_SEQ_AND_ID_SQL)
-    while batch := uniprot_seq_and_id.fetchmany(size=50):
-      if None == batch:
-          break
-      for Sequence, UniProtKB_id in batch:
-          if Sequence is not None:
-              for end_index, (insert_order, original_value) in auto.iter(Sequence):
-                  ker.execute('''
+    while 1:
+        batch = uniprot_seq_and_id.fetchmany(size=50)
+        if not batch:
+            break
+        for Sequence, UniProtKB_id in batch:
+            if Sequence is not None:
+                for end_index, (insert_order, original_value) in auto.iter(
+                    Sequence
+                ):
+                    ker.execute(
+                        """
                       INSERT INTO deppep_UniProtKB
                         (deppep_id,UniProtKB_id,pos_start,pos_end)
                       VALUES (?,?,?,?)
-                      ''', (
-                          insert_order,
-                          UniProtKB_id,
-                          1 + end_index - len(original_value),
-                          end_index
-                          )
-                      )
-          else:
-              raise ValueError("UniProtKB_id %s, but Sequence is None: Check whether SwissProt file is missing sequence for this ID" % (UniProtKB_id,))
-    ker.execute("""
+                      """,
+                        (
+                            insert_order,
+                            UniProtKB_id,
+                            1 + end_index - len(original_value),
+                            end_index,
+                        ),
+                    )
+            else:
+                raise ValueError(
+                    "UniProtKB_id %s, but Sequence is None: Check whether SwissProt file is missing sequence for this ID"
+                    % (UniProtKB_id,)
+                )
+    ker.execute(
+        """
         SELECT   count(*) || ' accession-peptide-phosphopeptide combinations were found'
         FROM     uniprotkb_pep_ppep_view
         """
-        )
+    )
     for row in ker.fetchall():
         print(row[0])

-    ker.execute("""
+    ker.execute(
+        """
       SELECT   count(*) || ' accession matches were found', count(*) AS accession_count
       FROM     (
         SELECT   accession
@@ -467,12 +486,12 @@
         GROUP BY accession
         )
       """
-      )
+    )
     for row in ker.fetchall():
-      print(row[0])
-      accession_count = row[1]
+        print(row[0])

-    ker.execute("""
+    ker.execute(
+        """
       SELECT   count(*) || ' peptide matches were found'
       FROM     (
         SELECT   peptide
@@ -480,11 +499,12 @@
         GROUP BY peptide
         )
       """
-      )
+    )
     for row in ker.fetchall():
-      print(row[0])
+        print(row[0])

-    ker.execute("""
+    ker.execute(
+        """
       SELECT   count(*) || ' phosphopeptide matches were found', count(*) AS phosphopeptide_count
       FROM     (
         SELECT   phosphopeptide
@@ -492,21 +512,24 @@
         GROUP BY phosphopeptide
         )
       """
-      )
+    )
     for row in ker.fetchall():
-      print(row[0])
-      phosphopeptide_count = row[1]
+        print(row[0])

     con.commit()
-    ker.execute('vacuum')
+    ker.execute("vacuum")
     con.close()

+
 if __name__ == "__main__":
     wrap_start_time = time.perf_counter()
     __main__()
     wrap_stop_time = time.perf_counter()
     # print(wrap_start_time)
     # print(wrap_stop_time)
-    print("\nThe matching process took %d milliseconds to run.\n" % ((wrap_stop_time - wrap_start_time)*1000),)
+    print(
+        "\nThe matching process took %d milliseconds to run.\n"
+        % ((wrap_stop_time - wrap_start_time) * 1000),
+    )

- # vim: sw=4 ts=4 et ai :
+# vim: sw=4 ts=4 et ai :