# HG changeset patch
# User galaxyp
# Date 1657567279 0
# Node ID ba62d93a9ef55b9c7bdd642a6a725901799525a4

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 3a7b3609d6e514c9e8f980ecb684960c6b2252fe

diff -r 000000000000 -r ba62d93a9ef5 MaxQuantProcessingScript.R
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/MaxQuantProcessingScript.R	Mon Jul 11 19:21:19 2022 +0000
@@ -0,0 +1,705 @@
+#!/usr/bin/env Rscript
+
+# This is the implementation for the
+#   "MaxQuant Phosphopeptide Localization Probability Cutoff"
+#   Galaxy tool (mqppep_lclztn_filter)
+# It is adapted from the MaxQuant Processing Script written by Larry Cheng.
+
+# libraries
+library(optparse)
+library(data.table)
+library(stringr)
+library(ggplot2)
+
+# title: "MaxQuant Processing Script"
+# author: "Larry Cheng"
+# date: "February 19, 2018"
+#
+# # MaxQuant Processing Script
+# Takes MaxQuant Phospho (STY)sites.txt file as input
+# and performs the following (in order):
+# 1) Runs the Proteomics Quality Control software
+# 2) Remove contaminant and reverse sequence rows
+# 3) Filters rows based on localization probability
+# 4) Extract the quantitative data
+# 5) Sequences phosphopeptides
+# 6) Merges multiply phosphorylated peptides
+# 7) Filters out phosphopeptides based on enrichment
+# The output file contains the phosphopeptide (first column)
+# and the quantitative values for each sample.
+#
+# ## Revision History
+# Rev. 2022-02-10 :wrap for inclusion in Galaxy
+# Rev. 2018-02-19 :break up analysis script into "MaxQuant Processing Script"
+#                  and "Phosphopeptide Processing Script"
+# Rev. 2017-12-12 :added PTXQC
+#                  added additional plots and table outputs for quality control
+#                  allowed for more than 2 samples to be grouped together
+#                  (up to 26 (eg, 1A, 1B, 1C, etc))
+#                  converted from .r to .rmd file to knit report
+#                  for quality control
+# Rev. 2016-09-11 :automated the FDR cutoffs; removed the option to data
+#                  impute multiple times
+# Rev. 2016-09-09 :added filter to eliminate contaminant & reverse sequence rows
+# Rev. 2016-09-01 :moved the collapse step from after ANOVA filter to prior to
+#                  preANOVA file output
+# Rev. 2016-08-22 :use regexSampleNames <- "\\.(\\d + )[AB]$"
+#                  so that it looks at the end of string
+# Rev. 2016-08-05 :Removed vestigial line (ppeptides <- ....)
+# Rev. 2016-07-03 :Removed row names from the write.table() output for
+#                  ANOVA and PreANOVA
+# Rev. 2016-06-25 :Set default Localization Probability cutoff to 0.75
+# Rev. 2016-06-23 :fixed a bug in filtering for pY enrichment by resetting
+#                  the row numbers afterwards
+# Rev. 2016-06-21 :test18 + standardized the regexpression in protocol
+
+
+### FUNCTION DECLARATIONS begin ----------------------------------------------
+
+# Read the first line of the file at filepath (character(0) if file is empty)
+# adapted from: https://stackoverflow.com/a/35761217/15509512
+read_first_line <- function(filepath) {
+  con <- file(filepath, "r")
+  # close via on.exit so the connection is released even if readLines() fails
+  on.exit(close(con), add = TRUE)
+  readLines(con, n = 1)
+}
+
+# Reorder the columns of a data.frame so that the named ones come last
+# - data: the data.frame
+# - move: character vector of column names, all of them found in names(data)
+movetolast <- function(data, move) {
+  data[append(setdiff(names(data), move), move)]
+}
+
+# Generate one row's phosphopeptide string; apply() over rows builds the list
+phosphopeptide_func <- function(df) {
+  # split the row's "Phospho (STY) Score diffs" value into single characters
+  phosphoprobsequence <-
+    strsplit(as.character(df["Phospho (STY) Score diffs"]), "")[[1]]
+  output <- vector() # returned at the end, holding the finished string
+  phosphopeptide <- ""
+  counter <- 0 # keep track of position in peptide
+  phosphopositions <-
+    vector() # keep track of phosphorylation positions in peptide
+  score_diff <- "" # characters of the score collected since the last ")"
+  for (chara in phosphoprobsequence) {
+    # build peptide sequence from characters that are not score markup
+    if (!(
+      chara == " " ||
+      chara == "(" ||
+      chara == ")" ||
+      chara == "." ||
+      chara == "-" ||
+      chara == "0" ||
+      chara == "1" ||
+      chara == "2" ||
+      chara == "3" ||
+      chara == "4" ||
+      chara == "5" ||
+      chara == "6" ||
+      chara == "7" ||
+      chara == "8" ||
+      chara == "9")
+    ) {
+      phosphopeptide <- paste(phosphopeptide, chara, sep = "")
+      counter <- counter + 1
+    }
+    # generate score_diff: accumulate sign, decimal point, and digit characters
+    if (chara == "-" ||
+        chara == "." ||
+        chara == "0" ||
+        chara == "1" ||
+        chara == "2" ||
+        chara == "3" ||
+        chara == "4" ||
+        chara == "5" ||
+        chara == "6" ||
+        chara == "7" ||
+        chara == "8" ||
+        chara == "9"
+    ) {
+      score_diff <- paste(score_diff, chara, sep = "")
+    }
+    # evaluate score_diff when its closing parenthesis is reached
+    if (chara == ")") {
+      score_diff <- as.numeric(score_diff)
+      # record the current residue index only if score_diff > 0
+      if (score_diff > 0) {
+        phosphopositions <- append(phosphopositions, counter)
+      }
+      score_diff <- ""
+    }
+  }
+
+  # generate phosphopeptide sequence (ie, peptide sequence with "p"'s)
+  counter <- 1 # from here on, counter indexes phosphopositions
+  phosphoposition_correction1 <-
+    -1 # offset for the end of the left piece; grows by 1 per inserted "p"
+       #  so that positions stay aligned as the string gets longer
+  phosphoposition_correction2 <-
+    0  # offset for the start of the right piece; grows by 1 per inserted "p"
+       #   and always stays one greater than phosphoposition_correction1
+  while (counter <= length(phosphopositions)) {
+    phosphopeptide <-
+      paste(
+        substr(
+          phosphopeptide,
+          0,
+          phosphopositions[counter] + phosphoposition_correction1
+        ),
+        "p",
+        substr(
+          phosphopeptide,
+          phosphopositions[counter] + phosphoposition_correction2,
+          nchar(phosphopeptide)
+        ),
+        sep = ""
+      )
+    counter <- counter + 1
+    phosphoposition_correction1 <- phosphoposition_correction1 + 1
+    phosphoposition_correction2 <- phosphoposition_correction2 + 1
+  }
+  # append the finished string to the (initially empty) output vector
+  output <- append(output, phosphopeptide)
+  return(output)
+}
+
+### FUNCTION DECLARATIONS end ------------------------------------------------
+
+
+### EXTRACT ARGUMENTS begin --------------------------------------------------
+
+# parse options
+option_list <- list(
+  make_option(
+    c("-i", "--input"),
+    action = "store",
+    type = "character",
+    help = "A MaxQuant Phospho (STY)Sites.txt"
+  )
+  ,
+  make_option(
+    c("-o", "--output"),
+    action = "store",
+    type = "character",
+    help = "path to output file"
+  )
+  ,
+  make_option(
+    c("-E", "--enrichGraph"),
+    action = "store",
+    type = "character",
+    help = "path to enrichment graph PDF"
+  )
+  ,
+  make_option(
+    c("-F", "--enrichGraph_svg"),
+    action = "store",
+    type = "character",
+    help = "path to enrichment graph SVG"
+  )
+  ,
+  make_option(
+    c("-L", "--locProbCutoffGraph"),
+    action = "store",
+    type = "character",
+    help = "path to localization-probability cutoff graph PDF"
+  )
+  ,
+  make_option(
+    c("-M", "--locProbCutoffGraph_svg"),
+    action = "store",
+    type = "character",
+    help = "path to localization-probability cutoff graph SVG"
+  )
+  ,
+  make_option(
+    c("-e", "--enriched"),
+    action = "store",
+    type = "character",
+    help = "pY or pST enriched samples (ie, 'Y' or 'ST')"
+  )
+  # default = "^Number of Phospho [(]STY[)]$",
+  ,
+  make_option(
+    c("-p", "--phosphoCol"),
+    action = "store",
+    type = "character",
+    help = paste0("path to file whose first line is a PERL-compatible regex",
+             " matching header of column having number of 'Phospho (STY)'")
+  )
+  # default = "^Intensity[^_]",
+  ,
+  make_option(
+    c("-s", "--startCol"),
+    action = "store",
+    type = "character",
+    help = paste0("path to file whose first line is a PERL-compatible regex",
+             " matching header of column having first sample intensity")
+  )
+  # default = 1,
+  ,
+  make_option(
+    c("-I", "--intervalCol"),
+    action = "store",
+    type = "integer",
+    help = paste0("Column interval between the Intensities of samples",
+             " (eg, 1 if subsequent column; 2 if every other column)")
+  )
+  # default = 0.75,
+  ,
+  make_option(
+    c("-l", "--localProbCutoff"),
+    action = "store",
+    type = "double",
+    help = "Localization Probability Cutoff"
+  )
+  # default = "sum",
+  ,
+  make_option(
+    c("-f", "--collapse_func"),
+    action = "store",
+    type = "character",
+    help = paste0("merge identical phosphopeptides",
+             " by ('sum' or 'average') the intensities")
+  )
+  # default = "filtered_data.txt",
+  ,
+  make_option(
+    c("-r", "--filtered_data"),
+    action = "store",
+    type = "character",
+    help = "filtered_data.txt"
+  )
+  # default = "quantData.txt",
+  ,
+  make_option(
+    c("-q", "--quant_data"),
+    action = "store",
+    type = "character",
+    help = "quantData.txt"
+  )
+)
+args <- parse_args(OptionParser(option_list = option_list))
+# Check parameter values
+
+### EXTRACT ARGUMENTS end ----------------------------------------------------
+
+
+### EXTRACT PARAMETERS from arguments begin ----------------------------------
+
+if (!file.exists(args$input)) {
+  stop((paste("File", args$input, "does not exist")))
+}
+
+# --phosphoCol and --startCol each name a file whose first line is a pattern;
+# the built-in patterns below are used only when the option was not supplied
+phospho_col_pattern <- "^Number of Phospho [(][STY][STY]*[)]$"
+start_col_pattern <- "^Intensity[^_]"
+if (!is.null(args$phosphoCol)) {
+  phospho_col_pattern <- read_first_line(args$phosphoCol)
+}
+if (!is.null(args$startCol)) {
+  start_col_pattern <- read_first_line(args$startCol)
+}
+
+sink(getConnection(2)) # divert standard output to stderr
+input_file_name <- args$input
+filtered_filename <- args$filtered_data
+quant_file_name <- args$quant_data
+interval_col <- as.integer(args$intervalCol)
+first_line <- read_first_line(input_file_name)
+col_headers <- unlist(strsplit(x = first_line, split = "\t", fixed = TRUE))
+# push and immediately pop a second diversion; the first one stays in effect
+sink(getConnection(2))
+sink()
+
+
+intensity_header_cols <-
+  grep(pattern = start_col_pattern, x = col_headers, perl = TRUE)
+if (length(intensity_header_cols) == 0) {
+  err_msg <-
+    paste("Found no intensity columns matching pattern:",
+          start_col_pattern)
+  # Divert output to stderr
+  sink(getConnection(2))
+  print(err_msg)
+  sink()
+  stop(err_msg)
+}
+
+
+phospho_col <-
+  grep(pattern = phospho_col_pattern, x = col_headers, perl = TRUE)[1]
+if (is.na(phospho_col)) {
+  err_msg <-
+    paste("Found no 'number of phospho sites' columns matching pattern:",
+          phospho_col_pattern)
+  # Divert output to stderr
+  sink(getConnection(2))
+  print(err_msg)
+  sink()
+  stop(err_msg)
+}
+
+
+# Walk the matched headers in steps of interval_col, collecting the run of
+# sample columns that are spaced exactly interval_col apart
+i_count <- 0
+this_column <- 1
+last_value <- intensity_header_cols[1]
+intensity_cols <- c(last_value)
+while (length(intensity_header_cols) >= interval_col * i_count) {
+  i_count <- 1 + i_count
+  this_column <- interval_col + this_column
+  # indexing past the last match yields NA; stop there instead of failing
+  next_value <- intensity_header_cols[this_column]
+  if (is.na(next_value) || last_value + interval_col != next_value) {
+    break
+  }
+  last_value <- next_value
+  intensity_cols <- c(intensity_cols, next_value)
+}
+start_col <- intensity_cols[1]
+num_samples <- i_count
+
+output_filename <- args$output
+enrich_graph_filename <- args$enrichGraph
+loc_prob_cutoff_graph_filename <- args$locProbCutoffGraph
+enrich_graph_filename_svg <- args$enrichGraph_svg
+loc_prob_cutoff_graph_fn_svg <- args$locProbCutoffGraph_svg
+
+local_prob_cutoff <- args$localProbCutoff
+enriched <- args$enriched
+collapse_fn <- args$collapse_func
+
+### EXTRACT PARAMETERS from arguments end ------------------------------------
+
+
+# Proteomics Quality Control for MaxQuant Results
+#  (Bielow C et al. J Proteome Res. 2016 PMID: 26653327)
+# is run by the Galaxy MaxQuant wrapper and need not be invoked here.
+
+
+# Read & filter out contaminants, reverse sequences, & localization probability
+# ---
+full_data <-
+  read.table(
+    file = input_file_name,
+    sep = "\t",
+    header = TRUE,
+    quote = ""
+  )
+
+# Filter out contaminant rows and reverse rows
+filtered_data <- subset(full_data, !grepl("CON__", Proteins))
+filtered_data <-
+  subset(filtered_data, !grepl("_MYCOPLASMA", Proteins))
+filtered_data <-
+  subset(filtered_data, !grepl("CONTAMINANT_", Proteins))
+filtered_data <-
+  subset(filtered_data, !grepl("REV__", Protein)
+         ) # since REV__ rows are blank in the first column (Proteins)
+write.table(
+  filtered_data,
+  file = filtered_filename,
+  sep = "\t",
+  quote = FALSE,
+  col.names = TRUE,
+  row.names = FALSE
+)
+# ...
+
+
+# Filter out data with localization probability below localProbCutoff
+# ---
+# Data filtered by localization probability
+loc_prob_filtered_data <-
+  filtered_data[
+    filtered_data$Localization.prob >= local_prob_cutoff,
+    ]
+# ...
+
+
+# Localization probability -- visualize locprob cutoff
+# ---
+loc_prob_graph_data <-
+  data.frame(
+    group = c(paste(">", toString(local_prob_cutoff), sep = ""),
+              paste("<", toString(local_prob_cutoff), sep = "")),
+    value = c(
+      nrow(loc_prob_filtered_data) / nrow(filtered_data) * 100,
+      (nrow(filtered_data) - nrow(loc_prob_filtered_data))
+        / nrow(filtered_data) * 100
+    )
+  )
+gigi <-
+  ggplot(loc_prob_graph_data, aes(x = "", y = value, fill = group)) +
+  geom_bar(width = 0.5,
+           stat = "identity",
+           color = "black") +
+  labs(x = NULL,
+    y = "percent",
+    title = "Phosphopeptides partitioned by localization-probability cutoff"
+  ) +
+  scale_fill_discrete(name = "phosphopeptide\nlocalization-\nprobability") +
+  theme_minimal() +
+  theme(
+    legend.position = "right",
+    legend.title = element_text(),
+    plot.title = element_text(hjust = 0.5),
+    plot.subtitle = element_text(hjust = 0.5),
+    plot.title.position = "plot"
+  )
+pdf(loc_prob_cutoff_graph_filename)
+print(gigi)
+dev.off()
+svg(loc_prob_cutoff_graph_fn_svg)
+print(gigi)
+dev.off()
+# ...
+
+
+# Extract quantitative values from filtered data
+# ---
+quant_data <-
+  loc_prob_filtered_data[, seq(from = start_col,
+                               by = interval_col,
+                               length.out = num_samples)]
+# ...
+
+
+# Generate Phosphopeptide Sequence
+#   for latest version of MaxQuant (Version 1.5.3.30)
+# ---
+metadata_df <-
+  data.frame(
+    loc_prob_filtered_data[, 1:8],
+    loc_prob_filtered_data[, phospho_col],
+    loc_prob_filtered_data[, phospho_col + 1],
+    loc_prob_filtered_data[, phospho_col + 2],
+    loc_prob_filtered_data[, phospho_col + 3],
+    loc_prob_filtered_data[, phospho_col + 4],
+    loc_prob_filtered_data[, phospho_col + 5],
+    loc_prob_filtered_data[, phospho_col + 6],
+    loc_prob_filtered_data[, phospho_col + 7],
+    quant_data
+  )
+colnames(metadata_df) <-
+  c(
+    "Proteins",
+    "Positions within proteins",
+    "Leading proteins",
+    "Protein",
+    "Protein names",
+    "Gene names",
+    "Fasta headers",
+    "Localization prob",
+    "Number of Phospho (STY)",
+    "Amino Acid",
+    "Sequence window",
+    "Modification window",
+    "Peptide window coverage",
+    "Phospho (STY) Probabilities",
+    "Phospho (STY) Score diffs",
+    "Position in peptide",
+    colnames(quant_data)
+  )
+# 'phosphopeptide_func' generates a phosphopeptide sequence
+#   for each row of data.
+# for the 'apply' function: MARGIN 1 == rows, 2 == columns, c(1, 2) = both
+metadata_df$phosphopeptide <-
+  apply(X = metadata_df, MARGIN = 1, FUN = phosphopeptide_func)
+colnames(metadata_df)[1] <- "Phosphopeptide"
+# Move the quant data columns to the right end of the data.frame
+metadata_df <- movetolast(metadata_df, c(colnames(quant_data)))
+# ...
+
+
+# Write quantitative values for debugging purposes
+# ---
+quant_write <- cbind(metadata_df[, "Sequence window"], quant_data)
+colnames(quant_write)[1] <- "Sequence.Window"
+write.table(
+  quant_write,
+  file = quant_file_name,
+  sep = "\t",
+  quote = FALSE,
+  col.names = TRUE,
+  row.names = FALSE
+)
+# ...
+
+
+# Make new data frame containing only Phosphopeptides
+#   that are to be mapped to quant data (merge_df)
+# ---
+metadata_df <-
+  setDT(metadata_df, keep.rownames = TRUE) # row name will be used to map
+merge_df <-
+  data.frame(
+    as.integer(metadata_df$rn),
+    metadata_df$phosphopeptide # row index to merge data frames
+    )
+colnames(merge_df) <- c("rn", "Phosphopeptide")
+# ...
+
+
+# Add Phosphopeptide column to quant columns for quality control checking
+# ---
+quant_data_qc <- as.data.frame(quant_data)
+setDT(quant_data_qc, keep.rownames = TRUE) # will use to match rowname to data
+quant_data_qc$rn <- as.integer(quant_data_qc$rn)
+quant_data_qc <- merge(merge_df, quant_data_qc, by = "rn")
+quant_data_qc$rn <- NULL # remove rn column
+# ...
+
+
+# Collapse multiphosphorylated peptides
+# ---
+# aggregate() needs a function: "average" names none, so map it to mean
+aggregate_fn <- if (identical(collapse_fn, "average")) mean else collapse_fn
+quant_data_qc_collapsed <-
+  aggregate(. ~ Phosphopeptide, quant_data_qc, FUN = aggregate_fn)
+# ...
+print("quant_data_qc_collapsed")
+head(quant_data_qc_collapsed)
+
+# Compute (as string) % of phosphopeptides that are multiphosphorylated
+#   (for use in next step)
+# ---
+pct_multiphos <-
+  (
+    nrow(quant_data_qc) - nrow(quant_data_qc_collapsed)
+  ) / (2 * nrow(quant_data_qc))
+pct_multiphos <- sprintf("%0.1f%s", 100 * pct_multiphos, "%")
+# ...
+
+
+# Compute and visualize breakdown of pY, pS, and pT before enrichment filter
+# ---
+py_data <-
+  quant_data_qc_collapsed[
+    str_detect(quant_data_qc_collapsed$Phosphopeptide, "pY"),
+    ]
+ps_data <-
+  quant_data_qc_collapsed[
+    str_detect(quant_data_qc_collapsed$Phosphopeptide, "pS"),
+    ]
+pt_data <-
+  quant_data_qc_collapsed[
+     str_detect(quant_data_qc_collapsed$Phosphopeptide, "pT"),
+     ]
+
+py_num <- nrow(py_data)
+ps_num <- nrow(ps_data)
+pt_num <- nrow(pt_data)
+
+# Visualize enrichment
+enrich_graph_data <- data.frame(group = c("pY", "pS", "pT"),
+                                value = c(py_num, ps_num, pt_num))
+
+enrich_graph_data <-
+  enrich_graph_data[
+    enrich_graph_data$value > 0,
+    ]
+
+# Plot pie chart with legend
+# start: https://stackoverflow.com/a/62522478/15509512
+# refine: https://www.statology.org/ggplot-pie-chart/
+# colors: https://colorbrewer2.org/#type=diverging&scheme=BrBG&n=8
+slices <- enrich_graph_data$value
+phosphoresidue <- enrich_graph_data$group
+pct    <- round(100 * slices / sum(slices))
+lbls   <-
+  paste(enrich_graph_data$group, "\n", pct, "%\n(", slices, ")", sep = "")
+slc_ctr <- c()
+run_tot <- 0
+for (p in pct) {
+  slc_ctr <- c(slc_ctr, run_tot + p / 2.0)
+  run_tot <- run_tot + p
+}
+lbl_y  <- 100 - slc_ctr
+df     <-
+  data.frame(slices,
+             pct,
+             lbls,
+             phosphoresidue = factor(phosphoresidue, levels = phosphoresidue))
+gigi <- ggplot(df
+               , aes(x = 1, y = pct, fill = phosphoresidue)) +
+  geom_col(position = "stack", orientation = "x") +
+  geom_text(aes(x = 1, y = lbl_y, label = lbls), col = "black") +
+  coord_polar(theta = "y", direction = -1) +
+  labs(
+    x = NULL
+    ,
+    y = NULL
+    ,
+    title = "Percentages (and counts) of phosphosites, by type of residue"
+    ,
+    caption = sprintf(
+      "Roughly %s of peptides have multiple phosphosites.",
+      pct_multiphos
+    )
+  ) +
+  labs(x = NULL, y = NULL, fill = NULL) +
+  theme_classic() +
+  theme(
+    legend.position = "right"
+    ,
+    axis.line = element_blank()
+    ,
+    axis.text = element_blank()
+    ,
+    axis.ticks = element_blank()
+    ,
+    plot.title = element_text(hjust = 0.5)
+    ,
+    plot.subtitle = element_text(hjust = 0.5)
+    ,
+    plot.caption = element_text(hjust = 0.5)
+    ,
+    plot.title.position = "plot"
+  ) +
+  scale_fill_manual(breaks = phosphoresidue,
+                    values = c("#c7eae5", "#f6e8c3", "#dfc27d"))
+
+pdf(enrich_graph_filename)
+print(gigi)
+dev.off()
+svg(enrich_graph_filename_svg)
+print(gigi)
+dev.off()
+# ...
+
+
+# Filter phosphopeptides by enrichment ('Y' keeps pY; 'ST' keeps pS or pT)
+# --
+if (enriched == "Y") {
+  quant_data_qc_enrichment <- quant_data_qc_collapsed[
+    str_detect(quant_data_qc_collapsed$Phosphopeptide, "pY"),
+    ]
+} else if (enriched == "ST") {
+  quant_data_qc_enrichment <- quant_data_qc_collapsed[
+    str_detect(quant_data_qc_collapsed$Phosphopeptide, "pS") |
+    str_detect(quant_data_qc_collapsed$Phosphopeptide, "pT"),
+    ]
+} else {
+  stop("Error in enriched variable. Set to either 'Y' or 'ST'", call. = FALSE)
+}
+# ...
+
+print("quant_data_qc_enrichment")
+head(quant_data_qc_enrichment)
+
+# Write phosphopeptides filtered by enrichment
+# --
+write.table(
+  quant_data_qc_enrichment,
+  file = output_filename,
+  sep = "\t",
+  quote = FALSE,
+  row.names = FALSE
+)
+# ...
diff -r 000000000000 -r ba62d93a9ef5 PhosphoPeptide_Upstream_Kinase_Mapping.pl
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/PhosphoPeptide_Upstream_Kinase_Mapping.pl	Mon Jul 11 19:21:19 2022 +0000
@@ -0,0 +1,2192 @@
+#!/usr/local/bin/perl
+###############################################################################################################################
+#    perl Kinase_enrichment_analysis_complete_v0.pl
+#
+#    Nick Graham, USC
+#    2016-02-27
+#
+#    Built from scripts written by NG at UCLA in Tom Graeber's lab:
+#        CombinePhosphoSites.pl
+#        Retrieve_p_motifs.pl
+#        NetworKIN_Motif_Finder_v7.pl
+#
+#    Given a list of phospho-peptides, find protein information and upstream kinases.
+#    Output file can be used for KS enrichment score calculations using Enrichment_Score4Directory.pl
+#
+#    Updated 2022-01-13, Art Eschenlauer, UMN on behalf of Justin Drake's lab:
+#        Added warnings and used strict;
+#        fixed some code paths resulting in more NetworKIN matches;
+#        applied Aho-Corasick algorithm (via external Python script because Perl implementation was still too slow)
+#        to speed up "Match the non_p_peptides to the @sequences array";
+#        added support for SQLite-formatted UniProtKB/Swiss-Prot data as an alternative to FASTA-formatted data;
+#        added support for SQLite output in addition to tabular files.
+#
+#
+###############################################################################################################################
+
+use strict;
+use warnings 'FATAL' => 'all';
+
+use Getopt::Std;
+use DBD::SQLite::Constants qw/:file_open/;
+use DBI qw(:sql_types);
+use File::Copy;
+use File::Basename;
+use POSIX qw(strftime);
+use Time::HiRes qw(gettimeofday);
+#use Data::Dump qw(dump);
+
+my $USE_SEARCH_PPEP_PY = 1;
+#my $FAILED_MATCH_SEQ = "Failed match";
+my $FAILED_MATCH_SEQ = 'No Sequence';
+my $FAILED_MATCH_GENE_NAME = 'No_Gene_Name';
+
+my $dirname = dirname(__FILE__);
+my %opts;
+my ($file_in, $average_or_sum, $db_out, $file_out, $file_melt, $phospho_type);
+my $dbtype;
+my ($fasta_in, $networkin_in, $motifs_in, $PSP_Kinase_Substrate_in, $PSP_Regulatory_Sites_in);
+my (@samples, %sample_id_lut, %ppep_id_lut, %data, @tmp_data, %n);
+my $line = 0;
+my @failed_match = ($FAILED_MATCH_SEQ);
+my @failed_matches;
+my (%all_data);
+my (@p_peptides, @non_p_peptides);
+my @parsed_fasta;
+my (@accessions, @names, @sequences, @databases, $database);
+my ($dbfile, $dbh, $stmth);
+my @col_names;
+my (%matched_sequences, %accessions,     %names,     %sites,   );
+my (@tmp_matches,       @tmp_accessions, @tmp_names, @tmp_sites);
+my (%p_residues, @tmp_p_residues, @p_sites, $left, $right, %p_motifs, @tmp_motifs_array, $tmp_motif, $tmp_site, %residues);
+my (@kinases_observed, $kinases);
+my (@kinases_observed_lbl, @phosphosites_observed_lbl);
+my ($p_sequence_kinase, $p_sequence, $kinase);
+my (@motif_sequence, @motif_description, @motif_type_key_ary, %motif_type, %motif_count);
+my (@kinases_PhosphoSite, $kinases_PhosphoSite);
+my ($p_sequence_kinase_PhosphoSite, $p_sequence_PhosphoSite, $kinase_PhosphoSite);
+my (%regulatory_sites_PhosphoSite_hash);
+my (%domain, %ON_FUNCTION, %ON_PROCESS, %ON_PROT_INTERACT, %ON_OTHER_INTERACT, %notes, %organism);
+my (%unique_motifs);
+my ($kinase_substrate_NetworKIN_matches, $kinase_substrate_PhosphoSite_matches);
+my %psp_regsite_protein_2;
+my (%domain_2, %ON_FUNCTION_2, %ON_PROCESS_2, %ON_PROT_INTERACT_2, %N_PROT_INTERACT, %ON_OTHER_INTERACT_2, %notes_2, %organism_2);
+my @timeData;
+my $PhosphoSitePlusCitation;
+my (%site_description, %site_id);
+
+my %kinase_substrate_NetworKIN_matches;
+my %kinase_motif_matches;
+my $regulatory_sites_PhosphoSite;
+my ($seq_plus5aa, $seq_plus7aa, %seq_plus7aa_2);
+my %kinase_substrate_PhosphoSite_matches;
+my @formatted_sequence;
+my $pSTY_sequence;
+my $i;
+my @a;
+my $use_sqlite;
+my $verbose;
+
+##########
+## opts ##
+##########
+  ## input files
+    # i : path to input file, e.g., 'outputfile_STEP2.txt'
+    # f : path to UniProtKB/SwissProt FASTA
+    # s : optional species argument
+    # n : path to NetworKIN_201612_cutoffscore2.0.txt
+    # m : path to pSTY_Motifs.txt
+    # p : path to 2017-03_PSP_Kinase_Substrate_Dataset.txt
+    # r : path to 2017-03_PSP_Regulatory_sites.txt
+  ## options
+    # P : phospho_type
+    # F : function
+    # v : verbose output
+  ## output files
+    # o : path to output file
+    # O : path to "melted" output file
+    # D : path to output SQLite file
+
+sub usage()  # print the help text to STDERR, then exit
+    {
+        print STDERR <<"EOH";
+    This program given a list of phospho-peptides, finds protein information and upstream kinases.
+    usage: $0 [-hva] -f FASTA_file
+     -h : this (help) message
+     -v : slightly verbose
+     -a : use SQLite less
+     ## input files
+     -i : path to input file, e.g., 'outputfile_STEP2.txt'
+     -f : path to UniProtKB/SwissProt FASTA
+     -s : optional species filter argument for PSP records; defaults to 'human'
+     -n : path to NetworKIN_201612_cutoffscore2.0.txt
+     -m : path to pSTY_Motifs.txt
+     -p : path to 2017-03_PSP_Kinase_Substrate_Dataset.txt
+     -r : path to 2017-03_PSP_Regulatory_sites.txt
+     ## options
+     -P : phospho_type
+     -F : function
+     ## output files
+     -o : path to output file
+     -O : path to "melted" output file
+     -D : path to output SQLite file
+    example: $0
+EOH
+        exit;
+    }
+
+sub format_localtime_iso8601 {
+    # take seconds and microseconds from one clock sample so they agree; ref: https://perldoc.perl.org/Time::HiRes
+    my ($seconds, $microseconds) = gettimeofday;
+    # ref: https://pubs.opengroup.org/onlinepubs/9699919799/functions/strftime.html
+    return strftime("%Y-%m-%dT%H:%M:%S",localtime($seconds)) . sprintf(".%03d", $microseconds/1000);
+}
+
+sub replace_pSpTpY {
+    # Rewrite pS/pT/pY markers as one-letter codes according to the phospho type
+    my ($seq, $type) = @_;
+    if ($type eq 'y') {
+        # pS and pT lose their marker; pY becomes lower-case y
+        $seq =~ s/p([ST])/$1/g;
+        $seq =~ s/pY/y/g;
+    }
+    elsif ($type eq "sty") {
+        # each marked residue becomes its lower-case letter
+        $seq =~ s/p([STY])/\l$1/g;
+    }
+    return $seq;
+}
+
+sub pseudo_sed
+{
+    # pseudo_sed produces "UniProt_ID\tDescription\tOS\tOX\tGN\tPE\tSV"
+    # from the string passed in; the "# s/.../" comments give the sed equivalent
+    my ($t) = @_;
+    my $s = $t;  # edit a copy of the argument
+    # / GN=/!{ s:\(OX=[^ \t]*\):\1 GN=N/A:; };
+    unless ($s =~ m / GN=/s)  # no GN= field: add one after the OX= token
+    {
+        $s =~ s :(OX=[^ \t]*):${1} GN=N/A:s;
+    }
+    # / PE=/!{ s:\(GN=[^ \t]*\):\1 PE=N/A:; };
+    unless ($s =~ m / PE=/s)  # no PE= field: add one after the GN= token
+    {
+        $s =~ s :(GN=[^ \t]*):${1} PE=N/A:s;
+    }
+    # / SV=/!{ s:\(PE=[^ \t]*\):\1 SV=N/A:; };
+    unless ($s =~ m / SV=/s)  # no SV= field: add one after the PE= token
+    {
+        $s =~ s :(PE=[^ \t]*):${1} SV=N/A:s;
+    }
+    # s/^sp.//;
+    $s =~ s :^...::s;  # drops the first three characters, whatever they are
+    # s/[|]/\t/g;
+    $s =~ s :[|]:\t:sg;
+    if ( !($s =~ m/ OX=/s)
+      && !($s =~ m/ GN=/s)
+      && !($s =~ m/ PE=/s)
+      && !($s =~ m/ SV=/s)
+    ) {
+      # OS= is used elsewhere, but it's not helpful without OX and GN
+      $s =~ s/OS=/Species /g;
+      # supply sensible default values
+      $s .= "\tN/A\t-1\tN/A\tN/A\tN/A";
+    } else {  # each " XX=" marker becomes a tab; if absent, a default goes after the last tab
+      # s/ OS=/\t/;
+      if ($s =~ m/ OS=/s) { $s =~ s: OS=:\t:s; } else { $s =~ s:(.*)\t:$1\tN/A\t:x; };
+      # s/ OX=/\t/;
+      if ($s =~ m/ OX=/s) { $s =~ s: OX=:\t:s; } else { $s =~ s:(.*)\t:$1\t-1\t:x; };
+      # s/ GN=/\t/;
+      if ($s =~ m/ GN=/s) { $s =~ s: GN=:\t:s; } else { $s =~ s:(.*)\t:$1\tN/A\t:x; };
+      # s/ PE=/\t/;
+      if ($s =~ m/ PE=/s) { $s =~ s: PE=:\t:s; } else { $s =~ s:(.*)\t:$1\tN/A\t:x; };
+      # s/ SV=/\t/;
+      if ($s =~ m/ SV=/s) { $s =~ s: SV=:\t:s; } else { $s =~ s:(.*)\t:$1\tN/A\t:x; };
+    }
+    return $s;
+} # sub pseudo_sed
+
+getopts('i:f:s:n:m:p:r:P:F:o:O:D:hva', \%opts) ;
+
+
+if (exists($opts{'h'})) {
+    usage();
+}
+if (exists($opts{'a'})) {
+    $USE_SEARCH_PPEP_PY = 0;
+}
+if (exists($opts{'v'})) {
+    $verbose = 1;
+} else {
+    $verbose = 0;
+}
+if (!exists($opts{'i'}) || !-e $opts{'i'}) {
+    die('Input File not found');
+} else {
+    $file_in = $opts{'i'};
+}
+if (!exists($opts{'f'}) || !-e $opts{'f'}) {
+    die('FASTA not found');
+} else {
+    $fasta_in = $opts{'f'};
+    $use_sqlite = 0;
+}
+my $species;
+if ((!exists($opts{'s'})) || ($opts{'s'} eq '')) {
+    $species = 'human';
+} else {
+    $species = $opts{'s'};
+    print "'-s' option is '$species'\n";
+}
+print "species filter is '$species'\n";
+
+if (!exists($opts{'n'}) || !-e $opts{'n'}) {
+    die('Input NetworKIN File not found');
+} else {
+    $networkin_in = $opts{'n'};
+}
+if (!exists($opts{'m'}) || !-e $opts{'m'}) {
+    die('Input pSTY_Motifs File not found');
+} else {
+    $motifs_in = $opts{'m'};
+}
+if (!exists($opts{'p'}) || !-e $opts{'p'}) {
+    die('Input PSP_Kinase_Substrate_Dataset File not found');
+} else {
+    $PSP_Kinase_Substrate_in = $opts{'p'};
+}
+if (!exists($opts{'r'}) || !-e $opts{'r'}) {
+    die('Input PSP_Regulatory_sites File not found');
+} else {
+    $PSP_Regulatory_Sites_in = $opts{'r'};
+}
+if (exists($opts{'P'})) {
+    $phospho_type = $opts{'P'};
+}
+else {
+    $phospho_type = "sty";
+}
+if (exists($opts{'F'})) {
+    $average_or_sum = $opts{'F'};
+}
+else {
+    $average_or_sum = "sum";
+}
+if (exists($opts{'D'})) {
+    $db_out = $opts{'D'};
+}
+else {
+    $db_out = "db_out.sqlite";
+}
+if (exists($opts{'O'})) {
+    $file_melt = $opts{'O'};
+}
+else {
+    $file_melt = "output_melt.tsv";
+}
+if (exists($opts{'o'})) {
+    $file_out = $opts{'o'};
+}
+else {
+    $file_out = "output.tsv";
+}
+
+
+###############################################################################################################################
+# Print the relevant file names to the screen
+###############################################################################################################################
+# print "\nData file:  $data_in\nFASTA file:  $fasta_in\nSpecies:  $species\nOutput file:  $motifs_out\n\n";
+print "\n--- parameters:\n";
+print "Data file:  $file_in\nAverage or sum identical p-sites?  $average_or_sum\nOutput file:  $file_out\nMelted map:  $file_melt\n";
+if ($use_sqlite == 0) {  # report the FASTA path; otherwise the SQLite path
+  print "Motifs file:  $motifs_in\nNetworKIN file:  $networkin_in\nPhosphosite kinase substrate data:  $PSP_Kinase_Substrate_in\nPhosphosite regulatory site data:  $PSP_Regulatory_Sites_in\nUniProtKB/SwissProt FASTA file:  $fasta_in\nOutput SQLite file: $db_out\n";
+} else {
+  print "Motifs file:  $motifs_in\nNetworKIN file:  $networkin_in\nPhosphosite kinase substrate data:  $PSP_Kinase_Substrate_in\nPhosphosite regulatory site data:  $PSP_Regulatory_Sites_in\nUniProtKB/SwissProt SQLIte file:  $dbfile\nOutput SQLite file: $db_out\n";
+}
+print "...\n\n";
+
+print "Phospho-residues(s) = $phospho_type\n\n";
+if ($phospho_type ne 'y') {
+    if ($phospho_type ne 'sty') {
+        die "\nUsage error:\nYou must choose a phospho-type, either y or sty\n\n";
+    }
+}
+
+###############################################################################################################################
+# read the input data file
+# average or sum identical phospho-sites, depending on the value of $average_or_sum
+###############################################################################################################################
+
+# Read the tab-separated input file.
+#   - The first line names the samples (columns 1 .. N); they are appended to
+#     @samples and their column index is recorded in %sample_id_lut.
+#   - Every later line holds a phosphopeptide (column 0) followed by one value
+#     per sample; the values are stored in %data keyed by the phosphopeptide.
+#   - When a phosphopeptide occurs more than once, its rows are combined
+#     according to $average_or_sum ("sum" or "average"); %n counts how many
+#     rows have contributed to each phosphopeptide.
+open (IN, "$file_in") or die "I couldn't find the input file:  $file_in\n";
+
+die "\n\nScript died: You must choose either average or sum for \$average_or_sum\n\n" if (($average_or_sum ne "sum") && ($average_or_sum ne "average")) ;
+
+
+$line = 0;
+
+while (<IN>) {
+    chomp;
+    my @x = split(/\t/);
+    # strip stray CR/LF and the double quotes that spreadsheet exports add
+    for my $n (0 .. $#x) {$x[$n] =~ s/\r//g; $x[$n]  =~ s/\n//g; $x[$n]  =~ s/\"//g;}
+
+    # Read in the samples
+    if ($line == 0) {
+        for my $n (1 .. $#x) {
+            push (@samples, $x[$n]);
+            $sample_id_lut{$x[$n]} = $n;
+        }
+        $line++;
+    } else {
+        # check whether we have already seen a phospho-peptide
+        if (exists($data{$x[0]})) {
+            if ($average_or_sum eq "sum") {        # add the data
+                # unload the running totals
+                @tmp_data = (); foreach (@{$data{$x[0]}}) { push(@tmp_data, $_); }
+                # add the new data and repack
+                for my $k (0 .. $#tmp_data) { $tmp_data[$k] = $tmp_data[$k] + $x[$k+1]; }
+                # Store the totals back into %data; previously they were written
+                # only to %all_data, so %data kept the first row's values.
+                $data{$x[0]} = (); for my $k (0 .. $#tmp_data) { push(@{$data{$x[0]}}, $tmp_data[$k]); }
+                # %all_data is still populated in case code outside this section reads it.
+                $all_data{$x[0]} = (); for my $k (0 .. $#tmp_data) { push(@{$all_data{$x[0]}}, $tmp_data[$k]); }
+                $n{$x[0]}++;
+
+            } elsif ($average_or_sum eq "average") {        # average the data
+                # Unload the running averages from %data (previously read from
+                # %all_data, which is never filled in "average" mode, so the
+                # stored values were wiped out).
+                @tmp_data = (); foreach (@{$data{$x[0]}}) { push(@tmp_data, $_); }
+                # Fold the new row into the running mean; the new value for
+                # sample $k is $x[$k+1] (previously $x[0], the peptide string,
+                # was used).
+                for my $k (0 .. $#tmp_data) { $tmp_data[$k] = ( $tmp_data[$k]*$n{$x[0]} + $x[$k+1] ) / ($n{$x[0]} + 1); }
+                $n{$x[0]}++;
+                $data{$x[0]} = (); for my $k (0 .. $#tmp_data) { push(@{$data{$x[0]}}, $tmp_data[$k]); }
+            }
+        }
+        # if the phospho-sequence has not been seen, save the data
+        else {
+            for my $k (1 .. $#x) { push(@{$data{$x[0]}}, $x[$k]); }
+            $n{$x[0]} = 1;
+        }
+    }
+}
+close(IN);
+
+
+###############################################################################################################################
+# Search the FASTA database for phospho-sites and motifs
+#
+# based on Retrieve_p_peptide_motifs_v2.pl
+###############################################################################################################################
+
+
+###############################################################################################################################
+#
+#    Read in the Data file:
+#        1) make @p_peptides array as in the original file
+#        2) make @non_p_peptides array w/o residue modifications (p, #, other)
+#
+###############################################################################################################################
+
+# Build two parallel lists from the phosphopeptide keys of %data:
+#   @p_peptides     - lower-case s/t/y rewritten as pS/pT/pY
+#   @non_p_peptides - the same string with every "p" removed again
+foreach my $ppep_key (keys %data) {
+    my $marked = $ppep_key;
+    for ($marked) { s/s/pS/g; s/t/pT/g; s/y/pY/g; }
+    push(@p_peptides, $marked);
+    (my $unmarked = $marked) =~ s/p//g;
+    push(@non_p_peptides, $unmarked);
+}
+
+if ($use_sqlite == 0) {
+  ###############################################################################################################################
+  #
+  #    Read in the UniProtKB/Swiss-Prot data from FASTA; save to @sequences array and SQLite output database
+  #
+  ###############################################################################################################################
+
+  # e.g.
+  #   >sp|Q9Y3B9|RRP15_HUMAN RRP15-like protein OS=Homo sapiens OX=9606 GN=RRP15 PE=1 SV=2
+  #   MAAAAPDSRVSEEENLKKTPKKKMKMVTGAVASVLEDEATDTSDSEGSCGSEKDHFYSDD
+  #   DAIEADSEGDAEPCDKENENDGESSVGTNMGWADAMAKVLNKKTPESKPTILVKNKKLEK
+  #   EKEKLKQERLEKIKQRDKRLEWEMMCRVKPDVVQDKETERNLQRIATRGVVQLFNAVQKH
+  #   QKNVDEKVKEAGSSMRKRAKLISTVSKKDFISVLRGMDGSTNETASSRKKPKAKQTEVKS
+  #   EEGPGWTILRDDFMMGASMKDWDKESDGPDDSRPESASDSDT
+  # accession: Q9Y3B9
+  # name: RRP15_HUMAN RRP15-like protein OS=Homo sapiens OX=9606 GN=RRP15 PE=1 SV=2
+  # sequence: MAAAAPDSRVSEEENLKKTPKKKMKMVTGAVASVLEDEATDTSDSEGSCGSEKDHFYSDD DAIEADSEGDAEPCDKENENDGESSVGTNMGWADAMAKVLNKKTPESKPTILVKNKKLEK EKEKLKQERLEKIKQRDKRLEWEMMCRVKPDVVQDKETERNLQRIATRGVVQLFNAVQKH QKNVDEKVKEAGSSMRKRAKLISTVSKKDFISVLRGMDGSTNETASSRKKPKAKQTEVKS EEGPGWTILRDDFMMGASMKDWDKESDGPDDSRPESASDSDT
+  #
+  # e.g.
+  #   >gi|114939|sp|P00722.2|BGAL_ECOLI Beta-galactosidase (Lactase) cRAP
+  #   >gi|52001466|sp|P00366.2|DHE3_BOVIN Glutamate dehydrogenase 1, mitochondrial precursor (GDH) cRAP
+  #
+  # e.g.
+  #   >zs|P00009.24.AR-V2_1.zs|zs_peptide_0024_AR-V2_1
+
+
+  # Parse the FASTA file.  A header line appends to @databases, @accessions,
+  # @names and @parsed_fasta; a sequence line is appended to the @sequences and
+  # @parsed_fasta entries of the most recently read header.
+  # The three-argument form of open is used so that the file name can never be
+  # interpreted as an open mode or a pipe.
+  open (IN1, "<", $fasta_in) or die "I couldn't find $fasta_in\n";
+  print "Reading FASTA file $fasta_in\n";
+  # ref: https://perldoc.perl.org/perlsyn#Compound-Statements
+  #      "If the condition expression of a while statement is based on any of
+  #      a group of iterative expression types then it gets some magic treatment.
+  #      The affected iterative expression types are readline, the <FILEHANDLE>
+  #      input operator, readdir, glob, the <PATTERN> globbing operator, and
+  #      `each`. If the condition expression is one of these expression types,
+  #      then the value yielded by the iterative operator will be implicitly
+  #      assigned to `$_`."
+  while (<IN1>) {
+    chomp;
+    # ref: https://perldoc.perl.org/functions/split#split-/PATTERN/,EXPR
+    #      "If only PATTERN is given, EXPR defaults to $_."
+    my (@x) = split(/\|/);
+    # begin FIX >gi|114939|sp|P00722.2|BGAL_ECOLI Beta-galactosidase (Lactase) cRAP
+    # When a header has more than three "|"-separated fields, keep only the
+    # last three and restore the leading ">" on the first of them.
+    if (@x > 3) {
+      @x = (">".$x[$#x - 2], $x[$#x - 1], $x[$#x]);
+    }
+    # end FIX >gi|114939|sp|P00722.2|BGAL_ECOLI Beta-galactosidase (Lactase) cRAP
+    for my $i (0 .. $#x) {
+      $x[$i] =~ s/\r//g; $x[$i]  =~ s/\n//g; $x[$i]  =~ s/\"//g; }
+    # The exists() guard below avoids this warning on empty lines:
+    # Use of uninitialized value $x[0] in pattern match (m//) at /home/rstudio/src/mqppep/tools/mqppep/PhosphoPeptide_Upstream_Kinase_Mapping.pl line 411, <IN1> line 3.
+    if (exists($x[0])) {
+      if ($x[0] =~ /^>/) {
+        # parsing header line
+        $x[0] =~ s/\>//g;
+        push (@databases, $x[0]);
+        push (@accessions, $x[1]);
+        push (@names, $x[2]);
+        # format tags of standard UniProtKB headers as tab-separated values
+        # pseudo_sed produces "UniProt_ID\tDescription\tOS\tOX\tGN\tPE\tSV"
+        $_ = pseudo_sed(join "\t", (">".$x[0], $x[1], $x[2]));
+        # append tab as separator between header and sequence
+        s/$/\t/;
+        # parsed_fasta gets "UniProt_ID\tDescription\tOS\tOX\tGN\tPE\tSV\t"
+        print "push (\@parsed_fasta, $_)\n" if (0 && $x[0] ne "zs");
+        push (@parsed_fasta, $_);
+      } elsif ($x[0] =~ /^\w/) {
+        # line is a portion of the sequence
+        if (defined $sequences[$#accessions]) {
+          $sequences[$#accessions] = $sequences[$#accessions].$x[0];
+        } else {
+          $sequences[$#accessions] = $x[0];
+        }
+        $parsed_fasta[$#accessions] = $parsed_fasta[$#accessions].$x[0];
+      }
+    }
+  }
+  close IN1;
+  print "Done Reading FASTA file $fasta_in\n";
+  # In FASTA mode the database being written is the output database itself.
+  $dbfile = $db_out;
+  print "Begin writing $dbfile at " . format_localtime_iso8601() . "\n";
+  # Fail with a clear message if the database cannot be opened, rather than
+  # with a later "can't call method on undefined value" error.
+  $dbh = DBI->connect("dbi:SQLite:$dbfile", undef, undef)
+    or die "Could not open SQLite database $dbfile: $DBI::errstr\n";
+  # Remember the AutoCommit setting and switch it off for the bulk load below.
+  my $auto_commit = $dbh->{AutoCommit};
+  print "auto_commit was $auto_commit and is now 0\n" if ($verbose);
+  $dbh->{AutoCommit} = 0;
+
+  # begin DDL-to-SQLite
+  # ---
+  # (Re)create the UniProtKB table and its two indexes, one statement at a
+  # time; $stmth is left holding the last statement prepared.
+  for my $ddl_sql (
+    "
+    DROP TABLE IF EXISTS UniProtKB;
+    ",
+    "
+  CREATE TABLE UniProtKB (
+    Uniprot_ID TEXT PRIMARY KEY ON CONFLICT IGNORE,
+    Description TEXT,
+    Organism_Name TEXT,
+    Organism_ID INTEGER,
+    Gene_Name TEXT,
+    PE TEXT,
+    SV TEXT,
+    Sequence TEXT,
+    Database TEXT
+  )
+  ",
+    "
+  CREATE UNIQUE INDEX idx_uniq_UniProtKB_0 on UniProtKB(Uniprot_ID);
+  ",
+    "
+  CREATE INDEX idx_UniProtKB_0 on UniProtKB(Gene_Name);
+  ",
+  ) {
+    $stmth = $dbh->prepare($ddl_sql);
+    $stmth->execute();
+  }
+  # ...
+  # end DDL-to-SQLite
+
+  # insert all rows
+  # begin store-to-SQLite "UniProtKB" table
+  # ---
+  # Insert one UniProtKB row per parsed FASTA entry.  Entries are popped from
+  # the end of @parsed_fasta, so the matching @databases element is the one at
+  # the index of the last remaining entry.
+  $stmth = $dbh->prepare("
+  INSERT INTO UniProtKB (
+    Uniprot_ID,
+    Description,
+    Organism_Name,
+    Organism_ID,
+    Gene_Name,
+    PE,
+    SV,
+    Sequence,
+    Database
+  ) VALUES (?,?,?,?,?,?,?,?,?)
+  ");
+  my $row_count = 1;
+  my $row_string;
+  my (@row, @rows);
+  my $wrd;
+  while ( scalar @parsed_fasta > 0 ) {
+      $database = $databases[$#parsed_fasta];
+      # row_string gets "UniProt_ID\tDescription\tOS\tOX\tGN\tPE\tSV\t"
+      #                  1           2            3   4   5   6   7   sequence database
+      $row_string = pop(@parsed_fasta);
+      @row = (split /\t/, $row_string);
+      if ((not exists($row[4])) || ($row[4] eq "")) {
+        die("invalid fasta line\n$row_string\n");
+      };
+      # Organism_ID is an INTEGER column; "N/A" is stored as -1.
+      if ($row[4] eq "N/A") {
+        print "Organism_ID is 'N/A' for row $row_count:\n'$row_string'\n";
+        $row[4] = -1;
+      };
+      for $i (1..3,5..8) {
+          #BIND print "bind_param $i, $row[$i]\n";
+          $stmth->bind_param($i, $row[$i]);
+      }
+      #BIND print "bind_param 9, $database\n";
+      $stmth->bind_param(9, $database);
+      #BIND print "bind_param 4, $row[4]\n";
+      $stmth->bind_param(4, $row[4], { TYPE => SQL_INTEGER });
+      if (not $stmth->execute()) {
+          print "Error in row $row_count: " . $dbh->errstr . "\n";
+          print "Row $row_count: $row_string\n";
+          # Show the row again with tabs made visible.  s/// returns the number
+          # of substitutions, not the string, so substitute on a copy and print
+          # the copy.
+          (my $row_visible_tabs = $row_string) =~ s/\t/@/g;
+          print "Row $row_count: $row_visible_tabs\n";
+      }
+      # debugging output, disabled by the constant 0
+      if (0 && $database ne "zs") {
+          print "row_count: $row_count\n";
+          #### print "row_string: $row_string\n";
+          print "Row $row_count: $row_string\n";
+          for $i (1..3,5..8) {
+              print "bind_param $i, $row[$i]\n" if (exists($row[$i]));
+          }
+          print "bind_param 4, $row[4]\n" if (exists($row[4]));
+          print "bind_param 9, $database\n";
+      };
+      $row_count += 1;
+  }
+  # ...
+  # end store-to-SQLite "UniProtKB" table
+
+  # Put AutoCommit back to the value saved before the bulk insert, close the
+  # connection, and record that the protein data came from FASTA.
+  print "begin commit at " . format_localtime_iso8601() . "\n";
+  $dbh->{AutoCommit} = $auto_commit;
+  if ($verbose) {
+    print "auto_commit is now $auto_commit\n";
+  }
+  if (defined $dbh) {
+    $dbh->disconnect;
+  }
+  print "Finished writing $dbfile at " . format_localtime_iso8601() . "\n\n";
+  $dbtype = "FASTA";
+}
+
+if ($use_sqlite == 1) {
+  ###############################################################################################################################
+  #
+  #    Read in the UniProtKB/Swiss-Prot data from SQLite; save to @sequences array
+  #
+  #    Fills @names, @accessions, @sequences and @databases with one element
+  #    per row of the UniProtKB table.
+  #
+  ###############################################################################################################################
+
+  # The input database is first copied to the output path; the rows are then
+  # read from the input file, which is opened read-only.
+  copy($dbfile, $db_out) or die "Copy $dbfile to $db_out failed: $!";
+
+  # https://metacpan.org/pod/DBD::SQLite#Read-Only-Database
+  $dbh = DBI->connect("dbi:SQLite:$dbfile", undef, undef, {
+    sqlite_open_flags => SQLITE_OPEN_READONLY,
+  });
+  print "DB connection $dbh is to $dbfile\n";
+
+  # Uniprot_ID, Description, Organism_Name, Organism_ID, Gene_Name, PE, SV, Sequence
+  # The query rebuilds a single description string by appending the
+  # " OS=", " OX=", " GN=", " PE=" and " SV=" tags, leaving out any tag whose
+  # stored value is the placeholder ('N/A', or -1 for Organism_ID).
+  $stmth = $dbh->prepare("
+  SELECT Uniprot_ID
+  , Description
+    || CASE WHEN Organism_Name = 'N/A' THEN '' ELSE ' OS=' || Organism_Name END
+    || CASE WHEN Organism_ID = -1      THEN '' ELSE ' OX=' || Organism_ID   END
+    || CASE WHEN Gene_Name = 'N/A'     THEN '' ELSE ' GN=' || Gene_Name     END
+    || CASE WHEN PE = 'N/A'            THEN '' ELSE ' PE=' || PE            END
+    || CASE WHEN SV = 'N/A'            THEN '' ELSE ' SV=' || SV            END
+    AS Description
+  , Sequence
+  , Database
+  FROM
+    UniProtKB
+  ");
+  $stmth->execute();
+  @col_names = @{$stmth->{NAME}};
+  print "\nColumn names selected from UniProtKB SQLite table: " . join(", ", @col_names) . "\n\n" if ($verbose);
+  while (my @row = $stmth->fetchrow_array) {
+    push (@names,              $row[1]); # redacted Description
+    push (@accessions,         $row[0]); # Uniprot_ID
+    # @accessions must be pushed first: its new last index is where the
+    # sequence is stored
+    $sequences[$#accessions] = $row[2];  # Sequence
+    push (@databases,          $row[3]); # Database (should be 'sp')
+  }
+
+  $dbh->disconnect if ( defined $dbh );
+
+  print "Done Reading UniProtKB/Swiss-Prot file $dbfile\n\n";
+  $dbtype = "SQLite";
+}
+
+# Report the number of accessions; $#accessions is the last index (count - 1),
+# so the element count is used instead.
+print scalar(@accessions) . " accessions were read from the UniProtKB/Swiss-Prot $dbtype file\n";
+
+######################
+  # Insert a placeholder UniProtKB row whose gene name and sequence are the
+  # $FAILED_MATCH_GENE_NAME / $FAILED_MATCH_SEQ sentinels.
+  # NOTE(review): the row is written to $dbfile.  In SQLite-input mode that is
+  # the input database, which was copied to $db_out before this point, so
+  # $db_out only has the row if the input already contained it - confirm
+  # whether $db_out is the intended target.
+  $dbh = DBI->connect("dbi:SQLite:$dbfile", undef, undef);
+  $stmth = $dbh->prepare("
+  INSERT INTO UniProtKB (
+    Uniprot_ID,
+    Description,
+    Organism_Name,
+    Organism_ID,
+    Gene_Name,
+    PE,
+    SV,
+    Sequence,
+    Database
+  ) VALUES (
+    'No Uniprot_ID',
+    'NO_GENE_SYMBOL No Description',
+    'No Organism_Name',
+    0,
+    '$FAILED_MATCH_GENE_NAME',
+    '0',
+    '0',
+    '$FAILED_MATCH_SEQ',
+    'No Database'
+  )
+  ");
+  if (not $stmth->execute()) {
+      # Method calls are not interpolated inside double quotes, so the error
+      # text has to be concatenated.
+      print "Error inserting dummy row into UniProtKB: " . $stmth->errstr . "\n";
+  }
+  $dbh->disconnect if ( defined $dbh );
+######################
+
+# Run search_ppep.py against the output database and the input data file;
+# the script dies if the search does not succeed.
+@timeData = localtime(time);
+print "\n--- Start search at " . format_localtime_iso8601() ."\n";
+
+print "    --> Calling 'search_ppep' script\n\n";
+# The arguments are passed to system() as a list so that no shell is involved:
+# paths containing spaces or shell metacharacters are handed over unchanged.
+my @search_ppep_cmd = ("python", "$dirname/search_ppep.py", "-u", $db_out, "-p", $file_in);
+push(@search_ppep_cmd, "--verbose") if ($verbose);
+$i = system(@search_ppep_cmd);
+if ($i) {
+  # system() returns the raw wait status (or -1 when the program could not be
+  # started); the exit code is in the high byte.
+  my $search_ppep_outcome = ($i == -1)
+    ? "could not be started: $!"
+    : "exited with exit code " . ($i >> 8) . " (wait status $i)";
+  print join(" ", @search_ppep_cmd) . "\n  $search_ppep_outcome\n";
+  die "Search failed for phosphopeptides in SwissProt/SQLite file.";
+}
+print "    <-- Returned from 'search_ppep' script\n";
+
+@timeData = localtime(time);
+print "... Finished search at " . format_localtime_iso8601() ."\n\n";
+
+
+###############################################################################################################################
+#
+#    Match the non_p_peptides to the @sequences array:
+#        1) Format the motifs +/- 10 residues around the phospho-site
+#        2) Print the original data plus the phospho-motif to the output file
+#
+###############################################################################################################################
+
+
+# Announce the matching step and which search method is in effect, then open
+# the output database read-only.
+print "--- Match the non_p_peptides to the \@sequences array:\n";
+
+my $search_method_message = $USE_SEARCH_PPEP_PY
+  ? "Find the matching protein sequence(s) for the peptide using SQLite\n"
+  : "Find the matching protein sequence(s) for the peptide using slow search\n";
+print $search_method_message;
+
+# https://metacpan.org/pod/DBD::SQLite#Read-Only-Database
+$dbh = DBI->connect(
+  "dbi:SQLite:$db_out",
+  undef,
+  undef,
+  { sqlite_open_flags => SQLITE_OPEN_READONLY, }
+);
+print "DB connection $dbh is to $db_out\n";
+
+# CREATE VIEW uniprotid_pep_ppep AS
+#   SELECT   deppep_UniProtKB.UniprotKB_ID       AS accession
+#          , deppep.seq                          AS peptide
+#          , ppep.seq                            AS phosphopeptide
+#          , UniProtKB.Sequence                  AS sequence
+#          , UniProtKB.Description               AS description
+#   FROM     ppep, deppep, deppep_UniProtKB, UniProtKB
+#   WHERE    deppep.id = ppep.deppep_id
+#   AND      deppep.id = deppep_UniProtKB.deppep_id
+#   AND      deppep_UniProtKB.UniprotKB_ID = UniProtKB.Uniprot_ID
+#   ORDER BY UniprotKB_ID, deppep.seq, ppep.seq;
+
+# Count the rows of uniprotkb_pep_ppep_view for each phosphopeptide;
+# %ppep_to_count_lut maps phosphopeptide => row count.
+my %ppep_to_count_lut;
+print "start select peptide counts " . format_localtime_iso8601() . "\n";
+my $uniprotkb_pep_ppep_view_stmth = $dbh->prepare("
+    SELECT DISTINCT
+      phosphopeptide
+    , count(*) as i
+    FROM
+      uniprotkb_pep_ppep_view
+    GROUP BY
+      phosphopeptide
+    ORDER BY
+      phosphopeptide
+");
+if (not $uniprotkb_pep_ppep_view_stmth->execute()) {
+    # Method calls are not interpolated inside double quotes, so the error
+    # text has to be concatenated.
+    die "Error fetching peptide counts: " . $uniprotkb_pep_ppep_view_stmth->errstr . "\n";
+}
+while (my @row = $uniprotkb_pep_ppep_view_stmth->fetchrow_array) {
+  $ppep_to_count_lut{$row[0]} = $row[1];
+  #print "\$ppep_to_count_lut{$row[0]} = $ppep_to_count_lut{$row[0]}\n";
+}
+
+# accession, peptide, sequence, description, phosphopeptide, long_description, pos_start, pos_end, scrubbed, ppep_id
+# 0          1        2         3            4               5                 6          7        8         9
+# Zero-based positions of the columns selected from uniprotkb_pep_ppep_view.
+my (
+    $COL_ACCESSION,
+    $COL_PEPTIDE,
+    $COL_SEQUENCE,
+    $COL_DESCRIPTION,
+    $COL_PHOSPHOPEPTIDE,
+    $COL_LONG_DESCRIPTION,
+    $COL_POS_START,
+    $COL_POS_END,
+    $COL_SCRUBBED,
+    $COL_PPEP_ID,
+) = (0 .. 9);
+
+# Read every row of uniprotkb_pep_ppep_view, ordered by phosphopeptide, and
+# group the rows per phosphopeptide.  $counter starts at the row count
+# recorded in %ppep_to_count_lut and is decremented per row; when it reaches
+# zero the collected sequences, accessions, names and start positions are
+# stored in %matched_sequences, %accessions, %names and %sites.
+my %ppep_to_row_lut;
+print "start select all records without qualification " . format_localtime_iso8601() . "\n";
+$uniprotkb_pep_ppep_view_stmth = $dbh->prepare("
+    SELECT DISTINCT
+      accession
+    , peptide
+    , sequence
+    , description
+    , phosphopeptide
+    , long_description
+    , pos_start
+    , pos_end
+    , scrubbed
+    , ppep_id
+    FROM
+      uniprotkb_pep_ppep_view
+    ORDER BY
+      phosphopeptide
+");
+if (not $uniprotkb_pep_ppep_view_stmth->execute()) {
+    # Method calls are not interpolated inside double quotes, so the error
+    # text has to be concatenated.
+    die "Error fetching all records without qualification: " . $uniprotkb_pep_ppep_view_stmth->errstr . "\n";
+}
+my $current_ppep;
+my $counter = 0;
+my $former_ppep = "";
+@tmp_matches = ();
+@tmp_accessions = ();
+@tmp_names = ();
+@tmp_sites = ();
+while (my @row = $uniprotkb_pep_ppep_view_stmth->fetchrow_array) {
+    # Identify phosphopeptide for current row;
+    #   it is an error for it to change when the counter is not zero.
+    $current_ppep = $row[$COL_PHOSPHOPEPTIDE];
+
+    # when counter is zero, prepare for a new phosphopeptide
+    if (not $current_ppep eq $former_ppep) {
+        die "counter is $counter instead of zero" if ($counter != 0);
+        $ppep_id_lut{$current_ppep} = $row[$COL_PPEP_ID];
+        print "next phosphpepetide: $current_ppep; id: $ppep_id_lut{$current_ppep}\n" if ($verbose);
+        $counter = $ppep_to_count_lut{$current_ppep};
+        @tmp_matches = ();
+        @tmp_accessions = ();
+        @tmp_names = ();
+        @tmp_sites = ();
+    }
+
+    if ($USE_SEARCH_PPEP_PY) {
+        push(@tmp_matches,    $row[ $COL_SEQUENCE         ]);
+        push(@tmp_accessions, $row[ $COL_ACCESSION        ]);
+        push(@tmp_names,      $row[ $COL_LONG_DESCRIPTION ]);
+        push(@tmp_sites,      $row[ $COL_POS_START        ]);
+    }
+
+    # Prepare counter and phosphopeptide tracker for next row
+    $former_ppep = $current_ppep;
+    $counter -= 1;
+
+    # Set trackers for later use after last instance of current phosphopeptide
+    if ($counter == 0) {
+        if ($USE_SEARCH_PPEP_PY) {
+            # store copies, because the @tmp_* arrays are reused
+            $matched_sequences{$current_ppep} = [ @tmp_matches ];
+            $accessions{       $current_ppep} = [ @tmp_accessions ];
+            $names{            $current_ppep} = [ @tmp_names ];
+            $sites{            $current_ppep} = [ @tmp_sites ];
+        }
+    }
+}
+
+
+print "end select all records without qualification " . format_localtime_iso8601() . "\n";
+
+# Slow-search path (used only when $USE_SEARCH_PPEP_PY is false): look for
+# each de-phosphorylated peptide in every protein sequence by substring search
+# and record the matching sequences, accessions, names and start positions.
+# Peptides without any hit are appended to @failed_matches.
+for my $j (0 .. $#p_peptides) {
+
+    my $site;
+    my $match = 0;
+    @tmp_matches = ();
+    @tmp_accessions = ();
+    @tmp_names = ();
+    @tmp_sites = ();
+
+    #Find the matching protein sequence(s) for the peptide using slow search
+    $site = -1;
+    unless ($USE_SEARCH_PPEP_PY) {
+        for my $k (0 .. $#sequences) {
+            $site = index($sequences[$k], $non_p_peptides[$j]);
+            if ($site != -1) {
+                push(@tmp_matches, $sequences[$k]);
+                push(@tmp_accessions, $accessions[$k]);
+                push(@tmp_names, $names[$k]);
+                push(@tmp_sites, $site);
+                # Count genuine hits only.  The counter used to be incremented
+                # for every sequence examined, so a failed match was never
+                # detected unless @sequences was empty.
+                $match++;
+                # print "Non-phosphpeptide $non_p_peptides[$j] matched accession $accessions[$k] ($names[$k]) at site $site\n";
+            }
+            $site = -1;
+            # print "tmp_accessions @tmp_accessions \n";
+        }
+        if ($match == 0) {    # Check to see if no match was found.  Skip to next if no match found.
+            print "Warning:  Failed match for $p_peptides[$j]\n";
+            $matched_sequences{$p_peptides[$j]} = \@failed_match;
+            push(@failed_matches,$p_peptides[$j]);
+            next;
+        } else {
+            $matched_sequences{$p_peptides[$j]} = [ @tmp_matches ];
+            $accessions{$p_peptides[$j]} = [ @tmp_accessions ];
+            $names{$p_peptides[$j]} = [ @tmp_names ];
+            $sites{$p_peptides[$j]} = [ @tmp_sites ];
+        }
+    }
+
+} # end for my $j (0 .. $#p_peptides)
+
+print "... Finished match the non_p_peptides at " . format_localtime_iso8601() ."\n\n";
+
+print "--- Match the p_peptides to the \@sequences array:\n";
+
+# Look-up of the peptides recorded as failed matches.  The former test,
+# grep($peptide_to_match, @failed_matches), evaluated the peptide string for
+# truth instead of comparing it with the list elements, so a single failed
+# match caused every peptide to be skipped.
+my %is_failed_match = map { $_ => 1 } @failed_matches;
+
+# For every matched phosphopeptide and every protein sequence it matched,
+# locate the phospho-sites in the protein and cut out the surrounding motif.
+for my $peptide_to_match ( keys %matched_sequences ) {
+    if ($is_failed_match{$peptide_to_match}) {
+        print "Failed to match peptide $peptide_to_match\n";
+        next;
+    }
+    my @matches = @{$matched_sequences{$peptide_to_match}};
+    @tmp_motifs_array = ();
+    for my $i (0 .. $#matches) {
+
+        # Find the location of the phospo-site in the sequence(s)
+        $tmp_site = 0; my $offset = 0;
+        my $tmp_p_peptide = $peptide_to_match;
+        $tmp_p_peptide =~ s/#//g; $tmp_p_peptide =~ s/\d//g; $tmp_p_peptide =~ s/\_//g; $tmp_p_peptide =~ s/\.//g;
+
+        # Find all phosphorylated residues in the p_peptide
+        # Each "p" is removed once its position has been recorded, so the
+        # recorded positions index the peptide without its "p" markers.
+        @p_sites = ();
+        while ($tmp_site != -1) {
+            $tmp_site = index($tmp_p_peptide, 'p', $offset);
+            if ($tmp_site != -1) {push (@p_sites, $tmp_site);}
+            $offset = $tmp_site + 1;
+            $tmp_p_peptide =~ s/p//;
+        }
+        @tmp_p_residues = ();
+        for my $l (0 .. $#p_sites) {
+            next if not defined $sites{$peptide_to_match}[$i];
+
+            # position of the phospho-residue within the protein sequence
+            push (@tmp_p_residues, $p_sites[$l] + $sites{$peptide_to_match}[$i]);
+
+            # Match the sequences around the phospho residues to find the motifs
+            my ($desired_residues_L, $desired_residues_R);
+            if ($tmp_p_residues[0] - 10 < 0) {    #check to see if there are fewer than 10 residues left of the first p-site
+                # eg, XXXpYXX want $desired_residues_L = 3, $p_residues[0] = 3
+                $desired_residues_L = $tmp_p_residues[0];
+            }
+            else {
+                $desired_residues_L = 10;
+            }
+            my $seq_length = length($matched_sequences{$peptide_to_match}[$i]);
+            if ($tmp_p_residues[$#tmp_p_residues] + 10 > $seq_length) {    #check to see if there are fewer than 10 residues right of the last p-site
+                $desired_residues_R = $seq_length - ($tmp_p_residues[$#tmp_p_residues] + 1);
+                # eg, XXXpYXX want $desired_residues_R = 2, $seq_length = 6, $p_residues[$#p_residues] = 3
+                # print "Line 170:  seq_length = $seq_length\tp_residue = $p_residues[$#p_residues]\n";
+            }
+            else {
+                $desired_residues_R = 10;
+            }
+
+            my $total_length = $desired_residues_L + $tmp_p_residues[$#tmp_p_residues] - $tmp_p_residues[0] + $desired_residues_R + 1;
+            my $arg2 = $tmp_p_residues[0] - $desired_residues_L;
+            my $arg1 = $matched_sequences{$peptide_to_match}[$i];
+
+            if (($total_length > 0) && (length($arg1) > $arg2 + $total_length - 1)) {
+                $tmp_motif = substr($arg1, $arg2, $total_length);
+
+                # Put the "p" back in front of the appropriate phospho-residue(s).
+                my (@tmp_residues, $tmp_position);
+                for my $m (0 .. $#p_sites) {
+                    # print "Line 183: $p_sites[$m]\n";
+                    if ($m == 0) {
+                        $tmp_position = $desired_residues_L;
+                    } else {
+                        $tmp_position = $desired_residues_L + $p_sites[$m] - $p_sites[0];
+                    }
+                    if ($tmp_position < length($tmp_motif) + 1) {
+                        push (@tmp_residues, substr($tmp_motif, $tmp_position, 1));
+                        if ($tmp_residues[$m] eq "S") {substr($tmp_motif, $tmp_position, 1, "s");}
+                        if ($tmp_residues[$m] eq "T") {substr($tmp_motif, $tmp_position, 1, "t");}
+                        if ($tmp_residues[$m] eq "Y") {substr($tmp_motif, $tmp_position, 1, "y");}
+                    }
+                }
+
+                $tmp_motif =~ s/s/pS/g; $tmp_motif =~ s/t/pT/g; $tmp_motif =~ s/y/pY/g;
+
+                # Comment out on 8.10.13 to remove the numbers from motifs
+                my $left_residue = $tmp_p_residues[0] - $desired_residues_L+1;
+                my $right_residue = $tmp_p_residues[$#tmp_p_residues] + $desired_residues_R+1;
+                $tmp_motif = $left_residue."-[ ".$tmp_motif." ]-".$right_residue;
+                push(@tmp_motifs_array, $tmp_motif);
+                $residues{$peptide_to_match}{$i} = [ @tmp_residues ];
+                $p_residues{$peptide_to_match}{$i} = [ @tmp_p_residues ];
+            }
+        }
+        $p_motifs{$peptide_to_match} = [ @tmp_motifs_array ];
+    }  # end for my $i (0 .. $#matches)       ### this bracket could be in the wrong place
+}
+
+print "... Finished match the p_peptides to the \@sequences array at " . format_localtime_iso8601() ."\n\n";
+
+###############################################################################################################################
+#
+#  Annotate the peptides with the NetworKIN predictions and HPRD / Phosida kinase motifs
+#
+###############################################################################################################################
+
+
+print "--- Reading various site data:\n";
+
+###############################################################################################################################
+#
+#    Read the NetworKIN_predictions file:
+#        1) make a "kinases_observed" array
+#        2) annotate the phospho-substrates with the appropriate kinase
+#
+###############################################################################################################################
+my $SITE_KINASE_SUBSTRATE = 1;
+$site_description{$SITE_KINASE_SUBSTRATE} = "NetworKIN";
+
+open (IN1, "$networkin_in") or die "I couldn't find $networkin_in\n";
+print "Reading the NetworKIN data:  $networkin_in\n";
+while (<IN1>) {
+    chomp;
+    my @fields = split(/\t/);
+    # strip CR, LF and double quotes from every field
+    foreach my $field (@fields) {
+        $field =~ s/\r//g;
+        $field =~ s/\n//g;
+        $field =~ s/\"//g;
+    }
+    # skip the header line
+    next if ($fields[0] eq "#substrate");
+    # record each kinase (column 2) the first time it is seen
+    unless (exists($kinases->{$fields[2]})) {
+        $kinases->{$fields[2]} = $fields[2];
+        push(@kinases_observed, $fields[2]);
+    }
+    # record each distinct column-10/column-2 pair, eg, REEILsEMKKV_PKCalpha
+    my $sequence_kinase_key = $fields[10]."_".$fields[2];
+    unless (exists($p_sequence_kinase->{$sequence_kinase_key})) {
+        $p_sequence_kinase->{$sequence_kinase_key} = $sequence_kinase_key;
+    }
+}
+close IN1;
+
+###############################################################################################################################
+#
+#    Read the Kinase motifs file:
+#        1) make a "motif_sequence" array
+#
+###############################################################################################################################
+
+# file format (tab separated):
+#   x[0] = quasi-primary key (character), e.g., '17' or '23a'
+#   x[1] = pattern (egrep pattern), e.g., '(M|I|L|V|F|Y).R..(pS|pT)'
+#   x[2] = description, e.g., 'PKA_Phosida' or '14-3-3 domain binding motif (HPRD)' or 'Akt kinase substrate motif (HPRD & Phosida)'
+# "counter"	"pcre"	"symbol"	"description"	"pubmed_id"	"classification"	"source"
+# "1"	"R.R..(pS|pT)(F|L)"	"PKB_group"	"Akt kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8985174"	"kinase substrate"	"HPRD"
+#   x[3] = old description, i.e., description in Amanchy (HPRD) and Phosida tables
+#   x[4] = pubmed id
+#   x[5] = classification
+#   x[6] = source (Phosida or HPRD)
+# Site identifiers and their descriptions for the two motif sources.
+my $SITE_HPRD = 2;
+my $SITE_PHOSIDA = 4;
+$site_description{$SITE_HPRD} = "HPRD";
+$site_id{$site_description{$SITE_HPRD}} = $SITE_HPRD;
+$site_description{$SITE_PHOSIDA} = "Phosida";
+$site_id{$site_description{$SITE_PHOSIDA}} = $SITE_PHOSIDA;
+
+open (IN2, "$motifs_in") or die "I couldn't find $motifs_in\n";
+print "Reading the Motifs file:  $motifs_in\n";
+
+# Each line contributes a pattern (column 1) keyed by column 2; the first
+# occurrence of a key registers the pattern, later occurrences only extend
+# the %motif_type entry.
+while (<IN2>) {
+    chomp;
+    my @fields = split(/\t/);
+    # With exactly seven columns the last one (double quotes removed) is the
+    # motif description; otherwise the generic "motif" is used.
+    my $motif_source = "motif";
+    if (scalar(@fields) == 7) {
+        $fields[6] =~ s/\"//g;
+        $motif_source = $fields[6];
+    }
+    # clean the first three columns: drop CR, LF and double quotes
+    foreach my $col (0 .. 2) {
+        $fields[$col] =~ s/\r//g;
+        $fields[$col] =~ s/\n//g;
+        $fields[$col] =~ s/\"//g;
+    }
+    if (exists($motif_type{$fields[2]})) {
+        #ACE-2022.06.20 $motif_type{$x[1]} = $motif_type{$x[1]}." & ".$x[2];
+        $motif_type{$fields[2]} = $motif_type{$fields[2]}."|".$fields[2];
+    } else {
+        $motif_type{$fields[2]} = $fields[2];
+        $motif_count{$fields[1]} = 0;
+        push(@motif_sequence, $fields[1]);
+        push(@motif_description, $motif_source);
+        push(@motif_type_key_ary, $fields[2]);
+    }
+}
+close (IN2);
+
+
+###############################################################################################################################
+#  6.28.2011
+#    Read PSP_Kinase_Substrate data:
+#        1) make a "kinases_PhosphoSite" array
+#        2) annotate the phospho-substrates with the appropriate kinase
+#
+#  Columns:
+#     (0) GENE
+#     (1) KINASE
+#     (2) KIN_ACC_ID
+#     (3) KIN_ORGANISM
+#     (4) SUBSTRATE
+#     (5) SUB_GENE_ID
+#     (6) SUB_ACC_ID
+#     (7) SUB_GENE
+#     (8) SUB_ORGANISM
+#     (9) SUB_MOD_RSD
+#     (10) SITE_GRP_ID
+#     (11) SITE_+/-7_AA
+#     (12) DOMAIN
+#     (13) IN_VIVO_RXN
+#     (14) IN_VITRO_RXN
+#     (15) CST_CAT#
+###############################################################################################################################
+
+my $SITE_PHOSPHOSITE = 3;
+$site_description{$SITE_PHOSPHOSITE} = "PhosphoSite";
+
+
+$line = 0;
+
+open (IN3, "$PSP_Kinase_Substrate_in") or die "I couldn't find $PSP_Kinase_Substrate_in\n";
+print "Reading the PhosphoSite Kinase-Substrate data:  $PSP_Kinase_Substrate_in\n";
+
+# The first line is a header.  Later lines are used only when both organism
+# columns (3 and 8) equal $species.
+while (<IN3>) {
+    chomp;
+    my @fields = split(/\t/);
+    # strip CR, LF and double quotes from every field
+    foreach my $field (@fields) {
+        $field =~ s/\r//g;
+        $field =~ s/\n//g;
+        $field =~ s/\"//g;
+    }
+    if (($line != 0) && ($species eq $fields[3]) && ($species eq $fields[8])) {
+        # record each column-0 value the first time it is seen
+        unless (exists($kinases_PhosphoSite->{$fields[0]})) {
+            $kinases_PhosphoSite->{$fields[0]} = $fields[0];
+            push(@kinases_PhosphoSite, $fields[0]);
+        }
+        # Replace the superfluous lower case s, t and y:
+        # every one of them is upper-cased except the one at index 7
+        my %upper_case_of = ('s' => 'S', 't' => 'T', 'y' => 'Y');
+        foreach my $lower ('s', 't', 'y') {
+            my $search_from = 0;
+            while ((my $found_at = index($fields[11], $lower, $search_from)) != -1) {
+                if ($found_at != 7) {
+                    substr($fields[11], $found_at, 1, $upper_case_of{$lower});
+                }
+                $search_from = $found_at + 1;
+            }
+        }
+        # record each distinct column-11/column-0 pair, eg, RTPGRPLsSYGMDSR_PAK2
+        my $site_kinase_key = $fields[11]."_".$fields[0];
+        unless (exists($p_sequence_kinase_PhosphoSite->{$site_kinase_key})) {
+            $p_sequence_kinase_PhosphoSite->{$site_kinase_key} = $site_kinase_key;
+        }
+    }
+    # lines for other organisms are ignored
+    #print "PSP_kinase_substrate line rejected because KIN_ORGANISM is '$x[3]' and SUB_ORGANISM is '$x[8]': $line\n";
+    $line++;
+}
+close IN3;
+
+
+###############################################################################################################################
+#  Read PhosphoSite regulatory site data:
+#        1) make a "regulatory_sites_PhosphoSite" hash
+#
+#  Columns:
+#    (0)  GENE
+#    (1)  PROTEIN
+#    (2)  PROT_TYPE
+#    (3)  ACC_ID
+#    (4)  GENE_ID
+#    (5)  HU_CHR_LOC
+#    (6)  ORGANISM          --> %organism
+#    (7)  MOD_RSD
+#    (8)  SITE_GRP_ID
+#    (9)  SITE_+/-7_AA      --> %regulatory_sites_PhosphoSite_hash
+#    (10) DOMAIN            --> %domain
+#    (11) ON_FUNCTION       --> %ON_FUNCTION
+#    (12) ON_PROCESS        --> %ON_PROCESS
+#    (13) ON_PROT_INTERACT  --> %ON_PROT_INTERACT
+#    (14) ON_OTHER_INTERACT --> %ON_OTHER_INTERACT
+#    (15) PMIDs
+#    (16) LT_LIT
+#    (17) MS_LIT
+#    (18) MS_CST
+#    (19) NOTES             --> %notes
+###############################################################################################################################
+
+
+# Open the output SQLite database.  DBI->connect returns undef on failure, so
+# stop here with the driver's message rather than failing on the next line.
+$dbh = DBI->connect("dbi:SQLite:$db_out", undef, undef)
+    or die "Could not connect to SQLite database $db_out: $DBI::errstr\n";
+# Remember the caller's AutoCommit setting and switch it off so that the
+# statements below are grouped into one transaction.
+my $auto_commit = $dbh->{AutoCommit};
+$dbh->{AutoCommit} = 0;
+print "DB connection $dbh is to $db_out, opened for modification\n";
+
+# add partial PSP_Regulatory_site table (if not exists) regardless of whether SwissProt input was FASTA or SQLite
+$stmth = $dbh->prepare("
+CREATE TABLE IF NOT EXISTS PSP_Regulatory_site (
+  SITE_PLUSMINUS_7AA TEXT PRIMARY KEY ON CONFLICT IGNORE,
+  DOMAIN             TEXT,
+  ON_FUNCTION        TEXT,
+  ON_PROCESS         TEXT,
+  ON_PROT_INTERACT   TEXT,
+  ON_OTHER_INTERACT  TEXT,
+  NOTES              TEXT,
+  ORGANISM           TEXT,
+  PROTEIN            TEXT
+)
+");
+$stmth->execute();
+
+# add partial PSP_Regulatory_site LUT (if not exists) regardless of whether SwissProt input was FASTA or SQLite
+$stmth = $dbh->prepare("
+CREATE TABLE IF NOT EXISTS ppep_regsite_LUT
+( ppep_id            INTEGER REFERENCES ppep(id)
+, site_plusminus_7AA TEXT    REFERENCES PSP_Regulatory_site(site_plusminus_7AA)
+, PRIMARY KEY (ppep_id, site_plusminus_7AA) ON CONFLICT IGNORE
+);
+");
+$stmth->execute();
+
+# $stmth = $dbh->prepare("
+# CREATE UNIQUE INDEX idx_PSP_Regulatory_site_0
+#   ON PSP_Regulatory_site(site_plusminus_7AA);
+# ");
+# $stmth->execute();
+
+
+# add Citation table (if not exists) regardless of whether SwissProt input was FASTA or SQLite
+my $citation_sql;
+$citation_sql = "
+CREATE TABLE IF NOT EXISTS Citation (
+  ObjectName TEXT REFERENCES sqlite_schema(name) ON DELETE CASCADE,
+  CitationData TEXT,
+  PRIMARY KEY (ObjectName, CitationData) ON CONFLICT IGNORE
+)
+";
+$stmth = $dbh->prepare($citation_sql);
+$stmth->execute();
+
+
+# Three-argument open: the path is never parsed for a mode or pipe character.
+open (IN4, "<", $PSP_Regulatory_Sites_in) or die "I couldn't find $PSP_Regulatory_Sites_in\n";
+print "Reading the PhosphoSite regulatory site data:  $PSP_Regulatory_Sites_in\n";
+
+# Fold one column value of a repeated SITE_+/-7_AA record into the value
+# already stored for that site.
+#   $existing - value stored so far (may be undef or "")
+#   $incoming - value from the current record (may be undef or "")
+# Returns the value to store:  the non-empty one when only one of them is
+# non-empty, $existing unchanged when it already contains $incoming, and
+# otherwise "$existing / $incoming".
+sub merge_regsite_annotation {
+    my ($existing, $incoming) = @_;
+    return $existing if (!defined $incoming || $incoming eq "");
+    return $incoming if (!defined $existing || $existing eq "");
+    # literal substring test:  the values are data, not regular expressions
+    return $existing if (index($existing, $incoming) != -1);
+    return $existing." / ".$incoming;
+}
+
+# Set once the "GENE" header row has been read.  It has to be declared outside
+# the loop:  declared inside, it is reset on every line and the "has no
+# SITE_+/-7_AA" notice below can never be printed.
+my $found_GENE = 0;
+
+$line = -1;
+while (<IN4>) {
+    $line++;
+    chomp;
+    # A line mentioning PhosphoSitePlus is kept as the citation text and is not parsed as data
+    if ($_ =~ m/PhosphoSitePlus/) {
+        #$PhosphoSitePlusCitation = ($_ =~ s/PhosphoSitePlus/FooBar/g);
+        $PhosphoSitePlusCitation = $_;
+        $PhosphoSitePlusCitation =~ s/\t//g;
+        $PhosphoSitePlusCitation =~ s/\r//g;
+        $PhosphoSitePlusCitation =~ s/\n//g;
+        $PhosphoSitePlusCitation =~ s/""/"/g;
+        $PhosphoSitePlusCitation =~ s/^"//g;
+        $PhosphoSitePlusCitation =~ s/"$//g;
+        print "$PhosphoSitePlusCitation\n";
+        next;
+    }
+    my (@x) = split(/\t/);
+    for my $i (0 .. $#x) {
+        $x[$i] =~ s/\r//g; $x[$i]  =~ s/\n//g; $x[$i]  =~ s/\"//g;
+    }
+    if ( (not exists($x[0])) ) {
+        next;
+    }
+    elsif ( ($x[0] eq "GENE") ) {
+        # column-header row
+        $found_GENE=1;
+        next;
+    }
+    if ( (not exists($x[9])) || ($x[9] eq "") ) {
+        # No SITE_+/-7_AA:  fatal when the row has a SITE_GRP_ID, otherwise the row is skipped
+        if (exists($x[8]) && (not $x[8] eq "")) {
+            die "$PSP_Regulatory_Sites_in line $line has no SITE_+/-7_AA: $_\n";
+        } else {
+            if ( (not exists($x[1])) || (not $x[1] eq "") ) {
+                print "$PSP_Regulatory_Sites_in line $line (".length($_)." characters) has no SITE_+/-7_AA: $_\n"
+                  if $found_GENE==1;
+            }
+            next;
+        }
+    }
+    elsif ($line != 0) {
+        if ($species ne $x[6]) {
+            # Do nothing - this record was filtered out by the species filter
+        }
+        elsif (!exists($regulatory_sites_PhosphoSite_hash{$x[9]})) {
+            # First record for this site:  store every annotation column as-is
+            if (!defined $domain{$x[9]} || $domain{$x[9]} eq "") {
+                $regulatory_sites_PhosphoSite_hash{$x[9]} = $x[9];
+                $domain{$x[9]} = $x[10];
+                $ON_FUNCTION{$x[9]} = $x[11];
+                $ON_PROCESS{$x[9]} = $x[12];
+                $ON_PROT_INTERACT{$x[9]} = $x[13];
+                $ON_OTHER_INTERACT{$x[9]} = $x[14];
+                $notes{$x[9]} = $x[19];
+                $organism{$x[9]} = $x[6];
+            }
+        }
+        else {
+            # Repeated site:  fold each column of this record into the value
+            # stored for the SAME column (10 DOMAIN, 11 ON_FUNCTION,
+            # 12 ON_PROCESS, 13 ON_PROT_INTERACT, 14 ON_OTHER_INTERACT,
+            # 19 NOTES, 6 ORGANISM).
+            $domain{$x[9]}            = merge_regsite_annotation($domain{$x[9]},            $x[10]);
+            $ON_FUNCTION{$x[9]}       = merge_regsite_annotation($ON_FUNCTION{$x[9]},       $x[11]);
+            $ON_PROCESS{$x[9]}        = merge_regsite_annotation($ON_PROCESS{$x[9]},        $x[12]);
+            $ON_PROT_INTERACT{$x[9]}  = merge_regsite_annotation($ON_PROT_INTERACT{$x[9]},  $x[13]);
+            $ON_OTHER_INTERACT{$x[9]} = merge_regsite_annotation($ON_OTHER_INTERACT{$x[9]}, $x[14]);
+            $notes{$x[9]}             = merge_regsite_annotation($notes{$x[9]},             $x[19]);
+            $organism{$x[9]}          = merge_regsite_annotation($organism{$x[9]},          $x[6]);
+        }
+    }
+}
+close IN4;
+
+print "... Finished reading various site data at " . format_localtime_iso8601() ."\n\n";
+
+# Statement used by add_citation() to insert one (ObjectName, CitationData) row
+$stmth = $dbh->prepare("
+INSERT INTO Citation (
+  ObjectName,
+  CitationData
+) VALUES (?,?)
+");
+
+# Insert one row into the Citation table through the statement held in $stmth.
+#   $cit_table - bound to ObjectName
+#   $cit_text  - bound to CitationData
+#   $cit_label - used only in the error message
+# A failed insert is reported on STDOUT and is not fatal.
+sub add_citation {
+    my ($cit_table, $cit_text, $cit_label) = @_;
+    $stmth->bind_param(1, $cit_table);
+    $stmth->bind_param(2, $cit_text);
+    if (not $stmth->execute()) {
+        # method calls are not interpolated inside a quoted string, so concatenate errstr
+        print "Error writing $cit_label cit for table $cit_table: " . $stmth->errstr . "\n";
+    }
+}
+my ($citation_text, $citation_table);
+
+# Citations to record, in insertion order.  Each entry is
+#   [ citation text, [ [table name, label used in error messages], ... ] ]
+my @citation_specs = (
+    # PSP regulatory or kinase/substrate site
+    [ 'PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words "PhosphoSitePlus(R), www.phosphosite.org" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926."',
+      [ ["PSP_Regulatory_site", "PSP_Regulatory_site"],
+        ["psp_gene_site",       "PSP_Kinase_Substrate"],
+        ["psp_gene_site_view",  "PSP_Kinase_Substrate"] ] ],
+    # NOTE(review): the license text above gives PMID 25514926 for this title,
+    # but this entry links PMID 22135298 - confirm which paper is intended.
+    [ 'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298, https://doi.org/10.1093/nar/gkr1122',
+      [ ["PSP_Regulatory_site", "PSP_Regulatory_site"],
+        ["psp_gene_site",       "PSP_Kinase_Substrate"],
+        ["psp_gene_site_view",  "PSP_Kinase_Substrate"] ] ],
+
+    # NetworKIN site
+    [ 'Linding, 2007, "Systematic discovery of in vivo phosphorylation networks.", https://pubmed.ncbi.nlm.nih.gov/17570479, https://doi.org/10.1016/j.cell.2007.05.052',
+      [ ["psp_gene_site",       "NetworkKIN"],
+        ["psp_gene_site_view",  "NetworkKIN"] ] ],
+    # DOI completed:  the last digit was missing ("nmeth.296")
+    [ 'Horn, 2014, "KinomeXplorer: an integrated platform for kinome biology studies.", https://pubmed.ncbi.nlm.nih.gov/24874572, https://doi.org/10.1038/nmeth.2968',
+      [ ["psp_gene_site",       "NetworkKIN"],
+        ["psp_gene_site_view",  "NetworkKIN"] ] ],
+    [ 'Aken, 2016, "The Ensembl gene annotation system.", https://pubmed.ncbi.nlm.nih.gov/33137190, https://doi.org/10.1093/database/baw093',
+      [ ["psp_gene_site",       "NetworkKIN"],
+        ["psp_gene_site_view",  "NetworkKIN"] ] ],
+
+    # pSTY motifs
+    [ 'Amanchy, 2007, "A curated compendium of phosphorylation motifs.", https://pubmed.ncbi.nlm.nih.gov/17344875, https://doi.org/10.1038/nbt0307-285',
+      [ ["psp_gene_site",       "Amanchy_pSTY_motifs"],
+        ["psp_gene_site_view",  "Amanchy_pSTY_motifs"] ] ],
+    [ 'Gnad, 2011, "PHOSIDA 2011: the posttranslational modification database.", https://pubmed.ncbi.nlm.nih.gov/21081558, https://doi.org/10.1093/nar/gkq1159',
+      [ ["psp_gene_site",       "Phosida_pSTY_motifs"],
+        ["psp_gene_site_view",  "Phosida_pSTY_motifs"] ] ],
+);
+
+# Write one Citation row per (table, citation text) pair
+for my $citation_spec (@citation_specs) {
+    my $citation_targets;
+    ($citation_text, $citation_targets) = @$citation_spec;
+    for my $citation_target (@$citation_targets) {
+        my $citation_label;
+        ($citation_table, $citation_label) = @$citation_target;
+        add_citation($citation_table, $citation_text, $citation_label);
+    }
+}
+
+
+###############################################################################################################################
+#
+#    Read the data file:
+#        1) find sequences that match the NetworKIN predictions
+#        2) find motifs that match the observed sequences
+#
+###############################################################################################################################
+
+print "--- Find sequences that match the NetworKIN predictions and find motifs that match observed sequences\n";
+
+# Statement that links a phosphopeptide id to the +/-7 residue window of its site
+my $ppep_regsite_LUT_stmth;
+$ppep_regsite_LUT_stmth = $dbh->prepare("
+  INSERT INTO ppep_regsite_LUT (
+    ppep_id,
+    site_plusminus_7AA
+  ) VALUES (?,?)
+");
+
+my ($start_seconds, $start_microseconds) = gettimeofday;
+
+# For every phosphopeptide:  collect its unique motifs, then flag NetworKIN
+# kinases, sequence motifs and PhosphoSite kinases that match, and copy the
+# PhosphoSite regulatory-site annotation of its +/-7 window(s).
+foreach my $peptide (keys %data) {
+    # find the unique phospho-motifs for this $peptide
+    my @all_motifs = ();
+    for my $i (0 .. $#{ $matched_sequences{$peptide} } ) {
+        push(@all_motifs, $p_motifs{$peptide}[$i]);
+    }
+    # strip the residue numbers and brackets, eg, "1308-[ SEQ ]-1329" becomes "SEQ"
+    for my $motif (@all_motifs) {
+        next unless (defined $motif);
+        $motif =~ s/\d+-\[\s//;
+        $motif =~ s/\s\]\-\d+//;
+    }
+    my %seen = ();
+    foreach my $motif (@all_motifs) {
+        if (defined $motif) {
+            next if (exists($seen{$motif}));
+            push(@{$unique_motifs{$peptide}}, $motif);
+            $seen{$motif} = 1;
+        }
+        print "push(\@{\$unique_motifs{$peptide}}, $motif);\n" if ($verbose);
+    }
+
+    # count the number of phospho-sites in the first unique motif
+    my $number_pY = 0;
+    my $number_pSTY = 0;
+    if ($phospho_type eq 'y') {
+        if (defined(${$unique_motifs{$peptide}}[0])) {
+            while (${$unique_motifs{$peptide}}[0] =~ /pY/g) {
+                $number_pY++;
+            }
+        }
+    }
+    if ($phospho_type eq 'sty') {
+        print "looking for unique_motifs for $peptide\n" if ($verbose);
+        if (defined(${$unique_motifs{$peptide}}[0])) {
+            while (${$unique_motifs{$peptide}}[0] =~ /(pS|pT|pY)/g) {
+                $number_pSTY++;
+               print "We have found $number_pSTY unique_motifs for $peptide\n" if ($verbose);
+            }
+        }
+    }
+
+
+    # search each of the unique motifs for matches
+    print "searching $#{$unique_motifs{$peptide}} motifs for peptide $peptide\n" if ($verbose);
+    for my $i (0 .. $#{$unique_motifs{$peptide}}) {
+        print "\$i = $i; peptide = $peptide; unique_motif = ${$unique_motifs{$peptide}}[$i]\n" if ($verbose);
+        my $tmp_motif = ${$unique_motifs{$peptide}}[$i];
+        print "   --- matching unique motif $tmp_motif for peptide  $peptide at " . format_localtime_iso8601() ."\n" if ($verbose);
+        my $formatted_sequence;
+        if (($number_pY == 1) || ($number_pSTY == 1)) {
+            my $seq_plus5aa = "";
+            my $seq_plus7aa = "";
+            $formatted_sequence = &replace_pSpTpY($tmp_motif, $phospho_type);
+            print "       a #pY $number_pY; #pSTY $number_pSTY; matching formatted motif $formatted_sequence for peptide  $peptide at " . format_localtime_iso8601() ."\n" if ($verbose);
+            # the +/-5 and +/-7 residue windows around the phospho-site;
+            # both are undef when the pattern does not match
+            if ($phospho_type eq 'y') {
+                $seq_plus5aa = (split(/(\w{0,5}y\w{0,5})/, $formatted_sequence))[1];
+                $seq_plus7aa = (split(/(\w{0,7}y\w{0,7})/, $formatted_sequence))[1];
+            }
+            elsif ($phospho_type eq "sty") {
+                $seq_plus5aa = (split(/(\w{0,5}(s|t|y)\w{0,5})/, $formatted_sequence))[1];
+                $seq_plus7aa = (split(/(\w{0,7}(s|t|y)\w{0,7})/, $formatted_sequence))[1];
+            }
+
+            if (defined $seq_plus7aa) {
+                # commit the 7aa LUT records
+                $ppep_regsite_LUT_stmth->bind_param( 1, $ppep_id_lut{$peptide} );
+                $ppep_regsite_LUT_stmth->bind_param( 2, $seq_plus7aa             );
+                if (not $ppep_regsite_LUT_stmth->execute()) {
+                    # method calls are not interpolated inside a quoted string, so concatenate errstr
+                    print "Error writing tuple ($ppep_id_lut{$peptide},$seq_plus7aa) for peptide $peptide to ppep_regsite_LUT: " . $ppep_regsite_LUT_stmth->errstr . "\n";
+                }
+            }
+            if (defined $seq_plus5aa) {
+                for my $k (0 .. $#kinases_observed) {
+                    my $tmp = $seq_plus5aa."_".$kinases_observed[$k];    #eg, should be PGRPLsSYGMD_PKCalpha
+                    if (exists($p_sequence_kinase -> {$tmp})) {
+                        $kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$k]} = "X"; #ACE
+                    }
+                }
+            }
+            for my $m (0 .. $#motif_sequence) {
+                print "matching $motif_sequence[$m]" if ($verbose);
+                if ($peptide =~ /$motif_sequence[$m]/) {
+                    $kinase_motif_matches{$peptide}{$motif_type{$motif_type_key_ary[$m]}} = "X";
+                }
+            }
+            if (defined $seq_plus7aa) {
+                for my $k (0 .. $#kinases_PhosphoSite) {
+                    my $tmp = $seq_plus7aa."_".$kinases_PhosphoSite[$k];    #eg, should be RTPGRPLsSYGMDSR_PAK2
+                    if (exists($p_sequence_kinase_PhosphoSite -> {$tmp})) {
+                        $kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$k]} = "X";
+                    }
+                }
+            }
+            # copy the regulatory-site annotation of this window, when there is one
+            if (defined $seq_plus7aa && exists($regulatory_sites_PhosphoSite_hash{$seq_plus7aa})) {
+                $seq_plus7aa_2{$peptide} = $seq_plus7aa;
+                $domain_2{$peptide} = $domain{$seq_plus7aa};
+                $ON_FUNCTION_2{$peptide} = $ON_FUNCTION{$seq_plus7aa};
+                $ON_PROCESS_2{$peptide} = $ON_PROCESS{$seq_plus7aa};
+                $ON_PROT_INTERACT_2{$peptide} = $ON_PROT_INTERACT{$seq_plus7aa};
+                $ON_OTHER_INTERACT_2{$peptide} = $ON_OTHER_INTERACT{$seq_plus7aa};
+                $notes_2{$peptide} = $notes{$seq_plus7aa};
+                $organism_2{$peptide} = $organism{$seq_plus7aa};
+            }
+        }
+        elsif (($number_pY > 1) || ($number_pSTY > 1)) {  #eg, if $x[4] is 1308-[ VIYFQAIEEVpYpYDHLRSAAKKR ]-1329 and $number_pY == 2
+            $formatted_sequence = $tmp_motif;
+            $seq_plus5aa = "";
+            $seq_plus7aa = "";
+            #Create the sequences with only one phosphorylation site
+            #eg, 1308-[ VIYFQAIEEVpYpYDHLRSAAKKR ]-1329, which becomes  1308-[ VIYFQAIEEVpYYDHLRSAAKKR ]-1329  and  1308-[ VIYFQAIEEVYpYDHLRSAAKKR ]-1329
+
+            # positions of every "p" marker in the motif
+            my (@sites, $offset, $next_p_site);
+            $sites[0] = index($tmp_motif, "p");
+            $offset = $sites[0] + 1;
+            $next_p_site = 0;
+            while ($next_p_site != -1) {
+                $next_p_site = index($tmp_motif, "p", $offset);
+                if ($next_p_site != -1) {
+                    push (@sites, $next_p_site);
+                }
+                $offset = $next_p_site+1;
+            }
+
+            # one copy of the motif per site, keeping only that site's "p";
+            # the others are deleted right-to-left so earlier offsets stay valid
+            my @pSTY_sequences;
+            for my $n (0 .. $#sites) {
+                $pSTY_sequences[$n] = $tmp_motif;
+                for (my $m = $#sites; $m >= 0; $m--) {
+                    if ($m != $n) {substr($pSTY_sequences[$n], $sites[$m], 1) = "";}
+                }
+            }
+
+            my @formatted_sequences;
+            for my $k (0 .. $#sites) {
+                $formatted_sequences[$k] = &replace_pSpTpY($pSTY_sequences[$k], $phospho_type);
+            }
+
+            for my $k (0 .. $#formatted_sequences) {
+                print "       b #pY $number_pY; #pSTY $number_pSTY; matching formatted motif $formatted_sequences[$k] for peptide  $peptide at " . format_localtime_iso8601() ."\n" if ($verbose);
+                if ($phospho_type eq 'y') {
+                    $seq_plus5aa = (split(/(\w{0,5}y\w{0,5})/, $formatted_sequences[$k]))[1];
+                    $seq_plus7aa = (split(/(\w{0,7}y\w{0,7})/, $formatted_sequences[$k]))[1];
+                }
+                elsif ($phospho_type eq "sty") {
+                    $seq_plus5aa = (split(/(\w{0,5}(s|t|y)\w{0,5})/, $formatted_sequences[$k]))[1];
+                    $seq_plus7aa = (split(/(\w{0,7}(s|t|y)\w{0,7})/, $formatted_sequences[$k]))[1];
+                }
+                # the windows are undef when the pattern did not match
+                if (defined $seq_plus5aa) {
+                    for my $j (0 .. $#kinases_observed) {
+                        my $tmp = $seq_plus5aa."_".$kinases_observed[$j];    #eg, should look like REEILsEMKKV_PKCalpha
+                        if (exists($p_sequence_kinase -> {$tmp})) {
+                            $kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$j]} = "X";
+                        }
+                    }
+                }
+                $pSTY_sequence = $formatted_sequences[$k];
+                for my $j (0 .. $#motif_sequence) {
+                    if ($pSTY_sequence =~ /$motif_sequence[$j]/) {
+                        $kinase_motif_matches{$peptide}{$motif_type{$motif_type_key_ary[$j]}} = "X";
+                    }
+                }
+                if (defined $seq_plus7aa) {
+                    for my $j (0 .. $#kinases_PhosphoSite) {
+                        my $tmp = $seq_plus7aa."_".$kinases_PhosphoSite[$j];    #eg, should be RTPGRPLsSYGMDSR_PAK2
+                        #print "seq_plus7aa._.kinases_PhosphoSite[i] is $tmp";
+                        if (exists($p_sequence_kinase_PhosphoSite -> {$tmp})) {
+                            $kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$j]} = "X";
+                        }
+                    }
+                }
+                # Look the window up in %regulatory_sites_PhosphoSite_hash, the
+                # hash filled while reading the regulatory-site file and the one
+                # the single-site branch above consults.
+                if (defined $seq_plus7aa && exists($regulatory_sites_PhosphoSite_hash{$seq_plus7aa})) {
+                    $seq_plus7aa_2{$peptide} = $seq_plus7aa;
+
+                    # Fold this site's annotation into what the peptide has so
+                    # far:  take it when nothing is stored yet, skip it when it
+                    # is empty, otherwise append it after " / ".
+                    for my $pair (
+                        [\%domain_2,            \%domain],
+                        [\%ON_FUNCTION_2,       \%ON_FUNCTION],
+                        [\%ON_PROCESS_2,        \%ON_PROCESS],
+                        [\%ON_PROT_INTERACT_2,  \%ON_PROT_INTERACT],
+                        [\%ON_OTHER_INTERACT_2, \%ON_OTHER_INTERACT],
+                        [\%notes_2,             \%notes],
+                    ) {
+                        my ($by_peptide, $by_site) = @$pair;
+                        my $incoming = $by_site->{$seq_plus7aa};
+                        if (!defined $by_peptide->{$peptide} || $by_peptide->{$peptide} eq "") {
+                            $by_peptide->{$peptide} = $incoming;
+                        }
+                        elsif (defined $incoming && $incoming ne "") {
+                            $by_peptide->{$peptide} = $by_peptide->{$peptide}." / ".$incoming;
+                        }
+                    }
+
+                    # the organism is taken from the latest matching site, not merged
+                    $organism_2{$peptide} = $organism{$seq_plus7aa};
+                } # if (exists($regulatory_sites_PhosphoSite_hash{$seq_plus7aa}))
+            } # for my $k (0 .. $#formatted_sequences)
+        } # if/else number of phosphosites
+    } # for each motif i # for my $i (0 .. $#{$unique_motifs{$peptide}})
+} # for each $peptide
+
+my ($end_seconds, $end_microseconds) = gettimeofday;
+
+# elapsed search time in microseconds
+my $delta_seconds = $end_seconds - $start_seconds;
+my $delta_microseconds = $end_microseconds - $start_microseconds;
+$delta_microseconds += 1000000 * $delta_seconds;
+my $key_count = keys(%data);
+# an empty %data would otherwise abort here with "Illegal division by zero"
+print sprintf("Average search time is %d microseconds per phopshopeptide\n", ($key_count > 0 ? $delta_microseconds / $key_count : 0));
+
+($start_seconds, $start_microseconds) = gettimeofday;
+
+print "Writing PSP_Regulatory_site records\n";
+
+my $psp_regulatory_site_stmth = $dbh->prepare("
+    INSERT INTO PSP_Regulatory_site (
+      DOMAIN,
+      ON_FUNCTION,
+      ON_PROCESS,
+      ON_PROT_INTERACT,
+      ON_OTHER_INTERACT,
+      NOTES,
+      SITE_PLUSMINUS_7AA,
+      ORGANISM
+    ) VALUES (?,?,?,?,?,?,?,?)
+    ");
+
+# Write one PSP_Regulatory_site row for each peptide that has a non-empty domain
+foreach my $peptide (keys %data) {
+    if (exists($domain_2{$peptide}) and (defined $domain_2{$peptide}) and (not $domain_2{$peptide} eq "") ) {
+        $psp_regulatory_site_stmth->bind_param(1, $domain_2{$peptide});
+        $psp_regulatory_site_stmth->bind_param(2, $ON_FUNCTION_2{$peptide});
+        $psp_regulatory_site_stmth->bind_param(3, $ON_PROCESS_2{$peptide});
+        $psp_regulatory_site_stmth->bind_param(4, $ON_PROT_INTERACT_2{$peptide});
+        $psp_regulatory_site_stmth->bind_param(5, $ON_OTHER_INTERACT_2{$peptide});
+        $psp_regulatory_site_stmth->bind_param(6, $notes_2{$peptide});
+        $psp_regulatory_site_stmth->bind_param(7, $seq_plus7aa_2{$peptide});
+        $psp_regulatory_site_stmth->bind_param(8, $organism_2{$peptide});
+        if (not $psp_regulatory_site_stmth->execute()) {
+            # report the peptide itself, and concatenate errstr because method
+            # calls are not interpolated inside a quoted string
+            print "Error writing PSP_Regulatory_site for one regulatory site with peptide '$peptide': " . $psp_regulatory_site_stmth->errstr . "\n";
+        }
+    } elsif (exists($domain_2{$peptide}) and (not defined $domain_2{$peptide})) {
+        print "\$domain_2{$peptide} is undefined\n";  #ACE
+    }
+}
+
+# restore the AutoCommit setting that was saved when the connection was opened
+$dbh->{AutoCommit} = $auto_commit;
+# auto_commit implicitly finishes psp_regulatory_site_stmth, apparently # $psp_regulatory_site_stmth->finish;
+$dbh->disconnect if ( defined $dbh );
+
+
+($end_seconds, $end_microseconds) = gettimeofday;
+
+$delta_seconds = $end_seconds - $start_seconds;
+$delta_microseconds = $end_microseconds - $start_microseconds;
+$delta_microseconds += 1000000 * $delta_seconds;
+$key_count = keys(%data);
+print sprintf("Write time is %d microseconds\n", ($delta_microseconds));
+
+print "... Finished find sequences that match the NetworKIN predictions and find motifs that match observed sequences at " . format_localtime_iso8601() ."\n\n";
+
+###############################################################################################################################
+#
+# Print to the output file
+#
+###############################################################################################################################
+
+
+# Three-argument open keeps the mode separate from the path; $! reports why an open failed.
+open (OUT, ">", $file_out) || die "could not open the fileout: $file_out: $!";
+open (MELT, ">", $file_melt) || die "could not open the fileout: $file_melt: $!";
+
+# print the header info
+print MELT "phospho_peptide\tgene_names\tsite_type\tkinase_map\n";
+print OUT "p-peptide\tProtein description\tGene name(s)\tFASTA name\tPhospho-sites\tUnique phospho-motifs, no residue numbers\tAccessions\tPhospho-motifs for all members of protein group with residue numbers\t";
+
+# print the PhosphoSite regulatory data
+print OUT "Domain\tON_FUNCTION\tON_PROCESS\tON_PROT_INTERACT\tON_OTHER_INTERACT\tPhosphoSite notes\t";
+
+# print the sample names
+for my $i (0 .. $#samples) { print OUT "$samples[$i]\t"; }
+
+# print the kinases and groups
+for my $i (0 .. $#kinases_observed) {
+    my $temp = $kinases_observed[$i]."_NetworKIN";
+    print OUT "$temp\t";
+    push(@kinases_observed_lbl, $temp);
+}
+# NOTE(review): this loop starts at index 1 of an unordered key list, so one
+# motif type gets no header column - confirm that the data rows do the same.
+my @motif_type_keys = keys %motif_type;
+for my $i (1 .. $#motif_type_keys) {
+    print OUT "$motif_type{$motif_type_keys[$i]}\t";
+}
+# the last PhosphoSite kinase column ends the header line
+for my $i (0 .. $#kinases_PhosphoSite) {
+    my $temp = $kinases_PhosphoSite[$i]; # ."_PhosphoSite";
+    if ($i < $#kinases_PhosphoSite) { print OUT "$temp\t"; }
+    if ($i == $#kinases_PhosphoSite) { print OUT "$temp\n"; }
+    push(@phosphosites_observed_lbl, $temp);
+}
+
+# begin DDL-to-SQLite
+# ---
+# Re-open the output database.  DBI->connect returns undef on failure, so stop
+# here with the driver's message rather than failing on the next line.
+$dbh = DBI->connect("dbi:SQLite:$db_out", undef, undef)
+    or die "Could not connect to SQLite database $db_out: $DBI::errstr\n";
+$auto_commit = $dbh->{AutoCommit};
+$dbh->{AutoCommit} = 0;
+print "DB connection $dbh is to $db_out, opened for modification\n";
+
+# one row per sample:  (id, name)
+my $sample_stmth;
+$sample_stmth = $dbh->prepare("
+  INSERT INTO sample (
+    id,
+    name
+  ) VALUES (?,?)
+");
+
+# one row per (phosphopeptide, sample) intensity
+my $ppep_intensity_stmth;
+$ppep_intensity_stmth = $dbh->prepare("
+  INSERT INTO ppep_intensity (
+    ppep_id,
+    sample_id,
+    intensity
+  ) VALUES (?,?,?)
+");
+
+# one row per site type:  (id, type_name)
+my $site_type_stmth;
+$site_type_stmth = $dbh->prepare("
+  insert into site_type (
+    id,
+    type_name
+  ) values (?,?)
+");
+
+# one row per (phosphopeptide, gene names, kinase map, site type)
+my $ppep_gene_site_stmth;
+$ppep_gene_site_stmth = $dbh->prepare("
+  insert into ppep_gene_site (
+    ppep_id,
+    gene_names,
+    kinase_map,
+    site_type_id
+  ) values (?,?,?,?)
+");
+
+# one row of descriptive metadata per phosphopeptide (14 columns, 14 placeholders)
+my $ppep_metadata_stmth;
+$ppep_metadata_stmth = $dbh->prepare("
+  INSERT INTO ppep_metadata
+    ( ppep_id
+    , protein_description
+    , gene_name
+    , FASTA_name
+    , phospho_sites
+    , motifs_unique
+    , accessions
+    , motifs_all_members
+    , domain
+    , ON_FUNCTION
+    , ON_PROCESS
+    , ON_PROT_INTERACT
+    , ON_OTHER_INTERACT
+    , notes
+  ) VALUES (
+    ?,?,?,?,?,?,?
+  , ?,?,?,?,?,?,?
+  )
+");
+# end DDL-to-SQLite
+# ...
+
+# begin store-to-SQLite "sample" table
+# ---
+# %sample_id_lut maps name -> ID
+for my $sample_name (keys %sample_id_lut) {
+    $sample_stmth->bind_param( 1, $sample_id_lut{$sample_name} );
+    $sample_stmth->bind_param( 2, $sample_name                 );
+    if (not $sample_stmth->execute()) {
+        # method calls are not interpolated inside a quoted string, so concatenate errstr
+        print "Error writing tuple ($sample_name,$sample_id_lut{$sample_name}): " . $sample_stmth->errstr . "\n";
+    }
+}
+# end store-to-SQLite "sample" table
+# ...
+
+# begin store-to-SQLite "site_type" table
+# ---
+# Insert one row into the site_type table through $site_type_stmth.
+#   $site_type_id        - bound to id
+#   $site_type_type_name - bound to type_name
+# Dies when the insert fails.
+sub add_site_type {
+    my ($site_type_id, $site_type_type_name) = @_;
+    $site_type_stmth->bind_param( 1, $site_type_id        );
+    $site_type_stmth->bind_param( 2, $site_type_type_name );
+    if (not $site_type_stmth->execute()) {
+        # method calls are not interpolated inside a quoted string, so concatenate errstr
+        die "Error writing tuple ($site_type_id,$site_type_type_name): " . $site_type_stmth->errstr . "\n";
+    }
+}
+add_site_type($SITE_KINASE_SUBSTRATE, $site_description{$SITE_KINASE_SUBSTRATE});
+add_site_type($SITE_HPRD            , $site_description{$SITE_HPRD            });
+add_site_type($SITE_PHOSIDA         , $site_description{$SITE_PHOSIDA         });
+add_site_type($SITE_PHOSPHOSITE     , $site_description{$SITE_PHOSPHOSITE     });
+# end store-to-SQLite "site_type" table
+# ...
+# For each phosphopeptide: write one row to OUT, "melted" rows to MELT, and matching rows to the SQLite tables.
+foreach my $peptide (sort(keys %data)) {
+    next if (grep { $_ eq $peptide } @failed_matches);  # skip only peptides that are themselves listed; grep(EXPR, LIST) with a constant EXPR matches every element
+    my $ppep_id = $ppep_id_lut{$peptide};
+    my @ppep_metadata = ();
+    my @ppep_intensity = ();
+    my @gene = ();
+    my $gene_names;
+    my $j;
+    # Print the peptide itself
+    #   column 1: p-peptide
+    print OUT "$peptide\t";
+    push (@ppep_metadata, $ppep_id);
+    push (@ppep_intensity, $peptide);
+
+    my $verbose_cond = 0; # $peptide eq 'AAAAAAAGDpSDpSWDADAFSVEDPVR' || $peptide eq 'KKGGpSpSDEGPEPEAEEpSDLDSGSVHSASGRPDGPVR';
+    # failed matches get placeholder columns 2-8 in OUT (NOTE(review): nothing is pushed onto @ppep_metadata for them here - confirm intended)
+    print "\nfirst match for '$peptide' is '$matched_sequences{$peptide}[0]' and FAILED_MATCH_SEQ is '$FAILED_MATCH_SEQ'\n" if $verbose_cond;
+    if ($matched_sequences{$peptide}[0] eq $FAILED_MATCH_SEQ) {
+        # column 2: Protein description
+        # column 3: Gene name(s)
+        # column 4: FASTA name
+        # column 5: phospho-residues
+        # Column 6: UNIQUE phospho-motifs
+        # Column 7: accessions
+        # Column 8: ALL motifs with residue numbers
+        #          2                                     3   4   5   6   7   8
+        print OUT "Sequence not found in FASTA database\tNA\tNA\tNA\tNA\tNA\tNA\t";
+        print "No match found for '$peptide' in sequence database\n";
+        $gene_names = '$FAILED_MATCH_GENE_NAME'; # NOTE(review): single-quoted, so this is the literal text, not a variable's value - confirm
+    } else {
+        my @description = ();
+        my %seen = ();
+        # Print just the protein description
+        for $i (0 .. $#{$names{$peptide}}) {
+            my $long_name = $names{$peptide}[$i];
+            my @naming_parts = split(/\sOS/, $long_name);
+            my @front_half = split(/\s/, $naming_parts[0]);
+            push(@description, join(" ", @front_half[1..($#front_half)]));
+        }
+        # column 2: Protein description
+        print OUT join(" /// ", @description), "\t";
+        push (@ppep_metadata, join(" /// ", @description));
+
+        # Print just the gene name
+        for $i (0 .. $#{$names{$peptide}}) {
+            my $tmp_gene = $names{$peptide}[$i];
+            $tmp_gene =~ s/^.*GN=//;
+            $tmp_gene =~ s/\s.*//;
+            if (!exists($seen{$tmp_gene})) {
+                push(@gene, $tmp_gene);
+                $seen{$tmp_gene} = $tmp_gene;
+            }
+        }
+        # column 3: Gene name(s)
+        $gene_names = join(" /// ", @gene);
+        print OUT $gene_names, "\t";
+        push (@ppep_metadata, join(" /// ", @gene));
+
+        # column 4: FASTA name
+        print OUT join(" /// ", @{$names{$peptide}}), "\t";
+        push (@ppep_metadata, join(" /// ", @{$names{$peptide}}));
+
+        # column 5: phospho-residues
+        my $tmp_for_insert = "";
+        my $foobar;
+        for my $i (0 .. $#{ $matched_sequences{$peptide} } ) {
+            print "match $i for '$peptide' is '$matched_sequences{$peptide}[$i]'\n" if $verbose_cond;
+            if ($i < $#{ $matched_sequences{$peptide} }) {
+                if (defined $p_residues{$peptide}{$i}) {
+                    @tmp_p_residues = @{$p_residues{$peptide}{$i}};
+                    for $j (0 .. $#tmp_p_residues) {
+                        if ($j < $#tmp_p_residues) {
+                            my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1;        # added 12.05.2012 for Justin's data
+                            print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing, ";
+                            $tmp_for_insert .= "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing, ";
+                        }
+                        elsif ($j == $#tmp_p_residues) {
+                            my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1;        # added 12.05.2012 for Justin's data
+                            print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing /// ";
+                            $tmp_for_insert .= "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing /// ";
+                        }
+                    }
+                }
+            }
+            elsif ($i == $#{ $matched_sequences{$peptide} }) {
+                my $there_were_sites = 0;
+                if (defined $p_residues{$peptide}{$i}) {
+                    @tmp_p_residues = @{$p_residues{$peptide}{$i}};
+                    if ($#tmp_p_residues > 0) {
+                        for my $j (0 .. $#tmp_p_residues) {
+                            if ($j < $#tmp_p_residues) {
+                                if (defined $p_residues{$peptide}{$i}[$j]) {
+                                    my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1;        # added 12.05.2012 for Justin's data
+                                    $foobar = $residues{$peptide}{$i}[$j];
+                                    if (defined $foobar) {
+                                        print OUT "$foobar";
+                                        print OUT "$tmp_site_for_printing, ";
+                                        $tmp_for_insert .= "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing, ";
+                                        $there_were_sites = 1;
+                                    }
+                                }
+                            }
+                            elsif ($j == $#tmp_p_residues) {
+                                if (defined $p_residues{$peptide}{$i}[$j]) {
+                                    $foobar = $residues{$peptide}{$i}[$j];
+                                    if (defined $foobar) {
+                                        my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1;        # added 12.05.2012 for Justin's data
+                                        print OUT "$foobar";
+                                        print OUT "$tmp_site_for_printing\t";
+                                        $tmp_for_insert .= "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing";
+                                        $there_were_sites = 1;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+                if (0 == $there_were_sites) {
+                  print OUT "\t";
+                }
+            }
+        }
+        print "tmp_for_insert '$tmp_for_insert' for '$peptide'\n" if $verbose_cond;
+        push (@ppep_metadata, $tmp_for_insert);
+
+        # Column 6: UNIQUE phospho-motifs
+        print OUT join(" /// ", @{$unique_motifs{$peptide}}), "\t";
+        push (@ppep_metadata, join(" /// ", @{$unique_motifs{$peptide}}));
+
+        # Column 7: accessions
+        if (defined $accessions{$peptide}) {
+            print OUT join(" /// ", @{$accessions{$peptide}}), "\t";
+            push (@ppep_metadata, join(" /// ", @{$accessions{$peptide}}));
+        } else {
+            print OUT "\t";
+            push (@ppep_metadata, "");
+        }
+
+        # Column 8: ALL motifs with residue numbers
+        if (defined $p_motifs{$peptide}) {
+            print OUT join(" /// ", @{$p_motifs{$peptide}}), "\t";
+            push (@ppep_metadata, join(" /// ", @{$p_motifs{$peptide}}));
+        } else {
+            print OUT "\t";
+            push (@ppep_metadata, "");
+        }
+
+    }
+
+    # Print the PhosphoSite regulatory data
+
+    if (defined $domain_2{$peptide})            { print OUT "$domain_2{$peptide}\t";            } else { print OUT "\t"; }
+    if (defined $ON_FUNCTION_2{$peptide})       { print OUT "$ON_FUNCTION_2{$peptide}\t";       } else { print OUT "\t"; }
+    if (defined $ON_PROCESS_2{$peptide})        { print OUT "$ON_PROCESS_2{$peptide}\t";        } else { print OUT "\t"; }
+    if (defined $ON_PROT_INTERACT_2{$peptide})  { print OUT "$ON_PROT_INTERACT_2{$peptide}\t";  } else { print OUT "\t"; }
+    if (defined $ON_OTHER_INTERACT_2{$peptide}) { print OUT "$ON_OTHER_INTERACT_2{$peptide}\t"; } else { print OUT "\t"; }
+    if (defined $notes_2{$peptide})             { print OUT "$notes_2{$peptide}\t";             } else { print OUT "\t"; }
+
+    if (defined $domain_2{$peptide})            { push (@ppep_metadata, $domain_2{$peptide});            } else { push(@ppep_metadata, ""); }
+    if (defined $ON_FUNCTION_2{$peptide})       { push (@ppep_metadata, $ON_FUNCTION_2{$peptide});       } else { push(@ppep_metadata, ""); }
+    if (defined $ON_PROCESS_2{$peptide})        { push (@ppep_metadata, $ON_PROCESS_2{$peptide});        } else { push(@ppep_metadata, ""); }
+    if (defined $ON_PROT_INTERACT_2{$peptide})  { push (@ppep_metadata, $ON_PROT_INTERACT_2{$peptide});  } else { push(@ppep_metadata, ""); }
+    if (defined $ON_OTHER_INTERACT_2{$peptide}) { push (@ppep_metadata, $ON_OTHER_INTERACT_2{$peptide}); } else { push(@ppep_metadata, ""); }
+    if (defined $notes_2{$peptide})             { push (@ppep_metadata, $notes_2{$peptide});             } else { push(@ppep_metadata, ""); }
+
+    # begin store-to-SQLite "ppep_metadata" table
+    # --- bind the 14 collected values positionally; missing trailing values bind as undef
+    for $i (1..14) {
+        $ppep_metadata_stmth->bind_param($i, $ppep_metadata[$i-1]);
+    }
+    if (not $ppep_metadata_stmth->execute()) {
+        print "Error writing ppep_metadata row for phosphopeptide $peptide: " . $ppep_metadata_stmth->errstr . "\n";  # $i is no longer the loop index here, so name the peptide directly
+    }
+    # ...
+    # end store-to-SQLite "ppep_metadata" table
+
+    # Print the data
+    @tmp_data = ();
+    foreach (@{$data{$peptide}}) {
+        push(@tmp_data, $_);
+    }
+    print OUT join("\t", @tmp_data), "\t";
+
+    # begin store-to-SQLite "ppep_intensity" table
+    # ---
+    # commit the sample intensities
+    $i = 0;
+    foreach (@{$data{$peptide}}) {
+        my $intense = $_;
+        $ppep_intensity_stmth->bind_param( 1, $ppep_id                     );
+        $ppep_intensity_stmth->bind_param( 2, $sample_id_lut{$samples[$i]} );
+        $ppep_intensity_stmth->bind_param( 3, $intense                     );
+        if (not $ppep_intensity_stmth->execute()) {
+            print "Error writing tuple ($peptide,$samples[$i],$intense): " . $ppep_intensity_stmth->errstr . "\n";
+        }
+        $i += 1;
+    }
+    # ...
+    # end store-to-SQLite "ppep_intensity" table
+
+    # print the kinase-substrate data
+    for my $i (0 .. $#kinases_observed) {
+        if (exists($kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$i]})) {
+            print OUT "X\t";
+            my $NetworKIN_label = $kinases_observed[$i]; #."_NetworKIN";
+            print MELT "$peptide\t$gene_names\t$site_description{$SITE_KINASE_SUBSTRATE}\t$NetworKIN_label\n";
+            # begin store-to-SQLite "ppep_gene_site" table
+            # ---
+            $ppep_gene_site_stmth->bind_param(1, $ppep_id);               # ppep_gene_site.ppep_id
+            $ppep_gene_site_stmth->bind_param(2, $gene_names);            # ppep_gene_site.gene_names
+            $ppep_gene_site_stmth->bind_param(3, $NetworKIN_label);       # ppep_gene_site.kinase_map
+            $ppep_gene_site_stmth->bind_param(4, $SITE_KINASE_SUBSTRATE); # ppep_gene_site.site_type_id
+            if (not $ppep_gene_site_stmth->execute()) {
+                print "Error writing tuple ($peptide,$gene_names,$kinases_observed[$i]): " . $ppep_gene_site_stmth->errstr . "\n";
+            }
+            # ...
+            # end store-to-SQLite "ppep_gene_site" table
+        }
+        else { print OUT "\t";}
+    }
+    my %wrote_motif;
+    my $motif_parts_0;
+    my @motif_split;
+    my $one_motif;
+    
+    for my $i (0 .. $#motif_type_keys) {
+        if (exists($kinase_motif_matches{$peptide}{$motif_type_keys[$i]})) {
+            print OUT "X\t";
+            #ACE-2022.06.20 $motif_parts_0 = $motif_type{$motif_sequence[$i]}." ".$motif_sequence[$i];
+            $motif_parts_0 = $motif_type{$motif_type_keys[$i]};
+            @motif_split = split("[|]", $motif_parts_0);
+            #ACE-2022.06.20 my $key = "$peptide\t$gene_names\t$motif_parts_0";
+            for my $j (0 .. $#motif_split) {
+                $one_motif = $motif_split[$j];
+                #ACE-2022.06.20 my $key = "$peptide\t$gene_names\t$motif_parts_0";
+                my $key = "$peptide\t$gene_names\t$one_motif";
+                if (!exists($wrote_motif{$key})) {
+                    $wrote_motif{$key} = $key;
+                    print MELT "$peptide\t$gene_names\t$motif_description[$i]\t$one_motif\n";
+                    # print "Line 657: i is $i\t$kinase_motif_matches{$peptide}{$motif_sequence[$i]}\n";            #debug
+                    # begin store-to-SQLite "ppep_gene_site" table
+                    # ---
+                    $ppep_gene_site_stmth->bind_param(1, $ppep_id);        # ppep_gene_site.ppep_id
+                    $ppep_gene_site_stmth->bind_param(2, $gene_names);     # ppep_gene_site.gene_names
+                    $ppep_gene_site_stmth->bind_param(3, $one_motif);  # ppep_gene_site.kinase_map
+                    $ppep_gene_site_stmth->bind_param(4, $site_id{$motif_description[$i]});     # ppep_gene_site.site_type_id
+                    if (not $ppep_gene_site_stmth->execute()) {
+                        print "Error writing tuple ($peptide,$gene_names,$one_motif): " . $ppep_gene_site_stmth->errstr . "\n";
+                    }
+                    # ...
+                    # end store-to-SQLite "ppep_gene_site" table
+                }
+            }
+        }
+        else { print OUT "\t";}
+    }
+    for my $i (0 .. $#kinases_PhosphoSite) {
+        if (exists($kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]})) {
+            print MELT "$peptide\t$gene_names\t$site_description{$SITE_PHOSPHOSITE}\t$phosphosites_observed_lbl[$i]\n";
+            if ($i < $#kinases_PhosphoSite) {
+                print OUT "X\t";
+            }
+            else {
+                print OUT "X\n";
+            }
+            # begin store-to-SQLite "ppep_gene_site" table
+            # ---
+            $ppep_gene_site_stmth->bind_param(1, $ppep_id);                       # ppep_gene_site.ppep_id
+            $ppep_gene_site_stmth->bind_param(2, $gene_names);                    # ppep_gene_site.gene_names
+            $ppep_gene_site_stmth->bind_param(3, $phosphosites_observed_lbl[$i]); # ppep_gene_site.kinase_map
+            $ppep_gene_site_stmth->bind_param(4, $SITE_PHOSPHOSITE);              # ppep_gene_site.site_type_id
+            if (not $ppep_gene_site_stmth->execute()) {
+                print "Error writing tuple ($peptide,$gene_names,$phosphosites_observed_lbl[$i]): " . $ppep_gene_site_stmth->errstr . "\n";
+            }
+            # ...
+            # end store-to-SQLite "ppep_gene_site" table
+        }
+        else {
+            if ($i < $#kinases_PhosphoSite) {
+                print OUT "\t";
+            }
+            elsif ($i == $#kinases_PhosphoSite) {
+                print OUT "\n";
+            }
+        }
+    }
+}
+
+close(OUT);   # done with the OUT handle
+close(MELT);  # done with the MELT handle
+$ppep_gene_site_stmth->finish();
+print("begin DB commit at " . format_localtime_iso8601() . "\n");
+$dbh->{AutoCommit} = $auto_commit;  # put back the AutoCommit value saved when the connection was opened
+if (defined $dbh) { $dbh->disconnect(); }
+
+print("\nFinished writing output at " . format_localtime_iso8601() . "\n\n");
+
+###############################################################################################################################
diff -r 000000000000 -r ba62d93a9ef5 macros.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Mon Jul 11 19:21:19 2022 +0000
@@ -0,0 +1,89 @@
+<macros>
+    <token name="@TOOL_VERSION@">0.1.13</token>
+    <token name="@VERSION_SUFFIX@">0</token>
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="1.56.0"  >bioconductor-preprocesscore</requirement>
+            <requirement type="package" version="1.22.2"  >numpy</requirement>
+            <requirement type="package" version="0.3.3"   >openblas</requirement>
+            <requirement type="package" version="1.4.1"   >pandas</requirement>
+            <requirement type="package" version="1.64"    >perl-dbd-sqlite</requirement>
+            <requirement type="package" version="5.26.2"  >perl</requirement>
+            <requirement type="package" version="1.4.0"   >pyahocorasick</requirement>
+            <requirement type="package" version="3.9.10"  >python</requirement>
+            <requirement type="package" version="1.14.2"  >r-data.table</requirement>
+            <requirement type="package" version="1.1.2"   >r-dbi</requirement>
+            <requirement type="package" version="3.3.5"   >r-ggplot2</requirement>
+            <requirement type="package" version="3.1.3"   >r-gplots</requirement>
+            <requirement type="package" version="0.9.4"   >r-latex2exp</requirement>
+            <requirement type="package" version="1.7.1"   >r-optparse</requirement>
+            <requirement type="package" version="1.4.4"   >r-reshape2</requirement>
+            <requirement type="package" version="2.11"    >r-rmarkdown</requirement>
+            <requirement type="package" version="2.2.8"   >r-rsqlite</requirement>
+            <requirement type="package" version="0.4.0"   >r-sass</requirement>
+            <requirement type="package" version="0.4_11"  >r-sqldf</requirement>
+            <requirement type="package" version="1.4.0"   >r-stringr</requirement>
+            <requirement type="package" version="0.37"    >r-tinytex</requirement>
+            <requirement type="package" version="0.3.7"   >r-vioplot</requirement>
+            <!--
+            It would be nice to use conda-forge/texlive-core rather than r-tinytex because the
+            former installs texlive when the package is built, but issue 23 blocked PDF-creation.
+            Also, texlive-core also gave pango font errors (output had missing symbols replaced
+            with boxes) unless I specified the build as well as the version when building a
+            conda environment, e.g.:  texlive-core=20210325=h97429d4_0
+            -->
+        </requirements>
+        <!-- I specified the versions above because it takes a VERY long time to search for package versions when they are omitted; also, version numbers should lead to reproducible behavior.  Contrast execution times of this (about 18 seconds):
+            echo n | time conda create -n mqppep_ver -c conda-forge -c bioconda \
+              bioconductor-preprocesscore=1.56.0 \
+              numpy=1.22.2 \
+              openblas=0.3.3 \
+              pandas=1.4.1 \
+              perl-dbd-sqlite=1.64 \
+              perl-dbd-sqlite=1.64 \
+              perl=5.26.2 \
+              pyahocorasick=1.4.0 \
+              python=3.9.10 \
+              r-data.table=1.14.2 \
+              r-dbi=1.1.2 \
+              r-ggplot2=3.3.5 \
+              r-gplots=3.1.3 \
+              r-latex2exp=0.9.4 \
+              r-optparse=1.7.1 \
+              r-reshape2=1.4.4 \
+              r-rmarkdown=2.11 \
+              r-rsqlite=2.2.8 \
+              r-sass=0.4.0 \
+              r-sqldf=0.4_11 \
+              r-stringr=1.4.0 \
+              r-tinytex=0.37 \
+              r-vioplot=0.3.7
+          with this (42 or more seconds):
+            echo n | time conda create -n mqppep_nover -c conda-forge -c bioconda \
+              bioconductor-preprocesscore \
+              numpy \
+              openblas=0.3.3 \
+              pandas \
+              perl \
+              perl-dbd-sqlite \
+              perl-dbd-sqlite \
+              pyahocorasick \
+              python \
+              r-data.table \
+              r-dbi \
+              r-ggplot2 \
+              r-gplots \
+              r-latex2exp \
+              r-optparse \
+              r-reshape2 \
+              r-rmarkdown \
+              r-rsqlite \
+              r-sass \
+              r-sqldf \
+              r-stringr \
+              r-tinytex \
+              r-vioplot
+
+        -->
+    </xml>
+</macros>
diff -r 000000000000 -r ba62d93a9ef5 mqppep_anova.R
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mqppep_anova.R	Mon Jul 11 19:21:19 2022 +0000
@@ -0,0 +1,297 @@
+#!/usr/bin/env Rscript
+# libraries
+library(optparse)
+library(data.table)
+library(stringr)
+
+# ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285
+
+# parse options: declare the command-line interface, then parse it into `args`
+option_list <- list(
+  make_option(
+    c("-i", "--inputFile"),
+    action = "store",
+    default = NA,
+    type = "character",
+    help = "Phosphopeptide Intensities sparse input file path"
+  ),
+  make_option(
+    c("-a", "--alphaFile"),
+    action = "store",
+    default = NA,
+    type = "character",
+    help = paste0("List of alpha cutoff values for significance testing;",
+             " path to text file having one column and no header")
+  ),
+  make_option(
+    c("-S", "--preproc_sqlite"),
+    action = "store",
+    default = NA,
+    type = "character",
+    help = "Path to 'preproc_sqlite' produced by `mqppep_mrgfltr.py`"
+  ),
+  make_option(
+    c("-K", "--ksea_sqlite"),
+    action = "store",
+    default = NA,
+    type = "character",
+    help = "Path to 'ksea_sqlite' output produced by this tool"
+  ),
+  make_option(
+    c("-f", "--firstDataColumn"),
+    action = "store",
+    default = "^Intensity[^_]",
+    type = "character",
+    help = "First column of intensity values"  # NOTE(review): later read as a file path
+  ),
+  make_option(
+    c("-m", "--imputationMethod"),
+    action = "store",
+    default = "random",
+    type = "character",
+    help = paste0("Method for missing-value imputation,",
+             " one of c('group-median','median','mean','random')")
+  ),
+  make_option(
+    c("-p", "--meanPercentile"),
+    action = "store",
+    default = 3,
+    type = "integer",
+    help = paste0("Mean percentile for randomly generated imputed values;",
+              " range [1,99]")
+  ),
+  make_option(
+    c("-d", "--sdPercentile"),
+    action = "store",
+    default = 3,
+    type = "double",
+    help = paste0("Adjustment value for standard deviation of",
+              " randomly generated imputed values; real")
+  ),
+  make_option(
+    c("-s", "--regexSampleNames"),
+    action = "store",
+    default = "\\.(\\d+)[A-Z]$",
+    type = "character",
+    help = "Regular expression extracting sample-names"  # NOTE(review): later read as a file path
+  ),
+  make_option(
+    c("-g", "--regexSampleGrouping"),
+    action = "store",
+    default = "(\\d+)",
+    type = "character",
+    help = paste0("Regular expression extracting sample-group",
+             " from an extracted sample-name")  # NOTE(review): later read as a file path
+  ),
+  make_option(
+    c("-o", "--imputedDataFile"),
+    action = "store",
+    default = "output_imputed.tsv",
+    type = "character",
+    help = "Imputed Phosphopeptide Intensities output file path"
+  ),
+  make_option(
+    c("-n", "--imputedQNLTDataFile"),
+    action = "store",
+    default = "output_imp_qn_lt.tsv",
+    type = "character",
+    help =
+      paste(
+        "Imputed, Quantile-Normalized Log-Transformed Phosphopeptide",
+        "Intensities output file path"
+        )
+  ),
+  make_option(
+    c("-r", "--reportFile"),
+    action = "store",
+    default = "QuantDataProcessingScript.html",
+    type = "character",
+    help = "HTML report file path"
+  ),
+  make_option(
+    c("-k", "--ksea_cutoff_statistic"),
+    action = "store",
+    default = "FDR",
+    type = "character",
+    help = paste0("Statistic used for the KSEA significance cutoff,",
+             " one of c('FDR','p.value'), but don't expect 'p.value' to work well.")
+  ),
+  make_option(
+    c("-t", "--ksea_cutoff_threshold"),
+    action = "store",
+    default = 0.05,
+    type = "double",
+    help = paste0("Maximum score to be used to score a kinase enrichment as significant")
+  ),
+  make_option(
+    c("-M", "--anova_ksea_metadata"),
+    action = "store",
+    default = "anova_ksea_metadata.tsv",
+    type = "character",
+    help = "Phosphopeptide metadata, ANOVA FDR, and KSEA enrichments"
+  )
+)
+args <- parse_args(OptionParser(option_list = option_list))
+print("args is:")
+str(args)  # str() prints by itself; wrapping it in cat() only printed its NULL result
+
+# Check parameter values
+
+if (is.na(args$inputFile) || !file.exists(args$inputFile)) {
+  stop(paste("Input file", args$inputFile, "does not exist"))
+}
+input_file             <- args$inputFile
+alpha_file             <- args$alphaFile
+preproc_sqlite         <- args$preproc_sqlite
+imputed_data_file_name <- args$imputedDataFile
+imp_qn_lt_data_filenm  <- args$imputedQNLTDataFile
+anova_ksea_metadata    <- args$anova_ksea_metadata
+report_file_name       <- args$reportFile
+ksea_sqlite            <- args$ksea_sqlite
+ksea_cutoff_statistic  <- args$ksea_cutoff_statistic
+ksea_cutoff_threshold  <- args$ksea_cutoff_threshold
+# Require an exact match; a regular-expression test would also accept
+# partial or empty strings.  Fail with stop(): return() is not valid at
+# the top level of a script.
+if (!(ksea_cutoff_statistic %in% c("FDR", "p.value"))) {
+  stop(
+    sprintf(
+      "bad ksea_cutoff_statistic argument: %s",
+      ksea_cutoff_statistic
+    )
+  )
+}
+
+imputation_method <- args$imputationMethod
+# Require an exact match; a regular-expression test would also accept
+# partial or empty strings.  Fail with stop(): return() is not valid at
+# the top level of a script.
+if (!(imputation_method %in% c("group-median", "median", "mean", "random"))) {
+  stop(
+    sprintf(
+      "bad imputationMethod argument: %s",
+      imputation_method
+    )
+  )
+}
+
+# read with default values, when applicable
+mean_percentile <- args$meanPercentile
+sd_percentile   <- args$sdPercentile
+# these two values are echoed only when the imputation method is "random"
+if (imputation_method == "random") {
+  print("mean_percentile is:")
+  str(mean_percentile)
+
+  print("sd_percentile is:")
+  str(sd_percentile)
+}
+
+# convert string parameters that are passed in via config files:
+#  - firstDataColumn
+#  - regexSampleNames
+#  - regexSampleGrouping
+read_config_file_string <- function(fname, limit) {
+  # Read at most `limit` characters from `fname`, then strip leading and
+  # trailing spaces, tabs, and newlines.
+  text <- readChar(fname, limit)
+  text <- gsub("[ \t\n]*$", "", gsub("^[ \t\n]*", "", text))
+  # Undo the escapes applied by the Galaxy sanitizer, in this fixed order.
+  unescape <- c(
+    "__lt__" = "<",  "__le__" = "<=", "__eq__" = "==", "__ne__" = "!=",
+    "__gt__" = ">",  "__ge__" = ">=", "__sq__" = "'",  "__dq__" = '"',
+    "__ob__" = "[",  "__cb__" = "]"
+  )
+  for (token in names(unescape)) {
+    text <- gsub(token, unescape[[token]], text)
+  }
+  # Hand back the cleaned string without auto-printing it.
+  invisible(text)
+}
+cat("first_data_column file: ", args$firstDataColumn, "\n", sep = "")
+cat("regex_sample_names file: ", args$regexSampleNames, "\n", sep = "")
+cat("regex_sample_grouping file: ", args$regexSampleGrouping, "\n", sep = "")
+nc <- 1000  # maximum number of characters read from each config file
+regex_sample_names    <- read_config_file_string(args$regexSampleNames,    nc)
+regex_sample_grouping <- read_config_file_string(args$regexSampleGrouping, nc)
+first_data_column     <- read_config_file_string(args$firstDataColumn,     nc)
+cat("first_data_column: ", first_data_column, "\n", sep = "")
+cat("regex_sample_names: ", regex_sample_names, "\n", sep = "")
+cat("regex_sample_grouping: ", regex_sample_grouping, "\n", sep = "")
+
+# Adapted from: https://github.com/molgenis/molgenis-pipelines/wiki/
+#   How-to-source-another_file.R-from-within-your-R-script
+# location_of_this_script() returns the directory holding this .R script,
+#   or NULL when it cannot be determined (useful for locating sibling files)
+location_of_this_script <- function() {
+  sourced_path <- NULL
+  # Case 1: the script is being source()d; inspect every calling frame.
+  frame_count <- sys.nframe()
+  for (offset in -seq_len(frame_count)) {
+    if (identical(sys.function(offset), base::source)) {
+      # No break: a match found further back replaces an earlier one.
+      sourced_path <- normalizePath(sys.frame(offset)$ofile)
+    }
+  }
+  if (!is.null(sourced_path)) {
+    return(dirname(sourced_path))
+  }
+  # Case 2: run from the command line; keep only the arguments that come
+  # before the trailing ones and extract the value of any --file=<path>.
+  all_args <- commandArgs(trailingOnly = FALSE)
+  n_leading <- length(all_args) - length(commandArgs(trailingOnly = TRUE))
+  leading_args <- all_args[seq.int(from = 1, length.out = n_leading)]
+  file_values <- gsub("^(?:--file=(.*)|.*)$", "\\1", leading_args)
+  file_values <- file_values[file_values != ""]
+  # If multiple --file arguments are given, R uses the last one
+  last_file <- tail(file_values, 1)
+  if (length(last_file) > 0) {
+    dirname(last_file)
+  } else {
+    # Neither case applies. Maybe we are in an R GUI?
+    NULL
+  }
+}
+
+script_dir <- location_of_this_script()
+
+rmarkdown_params <- list(  # passed as `params` to rmarkdown::render() below
+  inputFile = input_file,
+  alphaFile = alpha_file,
+  preprocDb = preproc_sqlite,
+  firstDataColumn = first_data_column,
+  imputationMethod = imputation_method,
+  meanPercentile = mean_percentile,
+  sdPercentile = sd_percentile,
+  regexSampleNames = regex_sample_names,
+  regexSampleGrouping = regex_sample_grouping,
+  imputedDataFilename = imputed_data_file_name,
+  imputedQNLTDataFile = imp_qn_lt_data_filenm,
+  anovaKseaMetadata = anova_ksea_metadata,
+  kseaAppPrepDb = ksea_sqlite,
+  kseaCutoffThreshold = ksea_cutoff_threshold,
+  kseaCutoffStatistic = ksea_cutoff_statistic
+)
+
+print("rmarkdown_params")
+str(rmarkdown_params)
+
+# fix the random number generator seed so the same results will be produced
+#  from run to run
+set.seed(28571)
+
+# BUG (or "opportunity")
+# To render as PDF for the time being requires installing the conda
+# package `r-texlive` until this issue in `texlive-core` is resolved:
+#   https://github.com/conda-forge/texlive-core-feedstock/issues/19
+# This workaround is detailed in the fourth comment of:
+#   https://github.com/conda-forge/texlive-core-feedstock/issues/61
+
+library(tinytex)
+tinytex::install_tinytex()
+rmarkdown::render(
+  input = paste(script_dir, "mqppep_anova_script.Rmd", sep = "/"),
+  output_format = rmarkdown::pdf_document(toc = TRUE),
+  output_file = report_file_name,
+  params = rmarkdown_params
+)
diff -r 000000000000 -r ba62d93a9ef5 mqppep_anova_script.Rmd
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mqppep_anova_script.Rmd	Mon Jul 11 19:21:19 2022 +0000
@@ -0,0 +1,3536 @@
+---
+title: "MaxQuant Phosphoproteomic Enrichment Pipeline ANOVA/KSEA"
+author:
+- "Nick Graham^[ORCiD 0000-0002-6811-1941, University of Southern California: Los Angeles, CA, US]"
+- "Larry Cheng^[ORCiD 0000-0002-6922-6433, Rutgers School of Graduate Studies: New Brunswick, NJ, US]"
+- "Art Eschenlauer^[ORCiD 0000-0002-2882-0508, University of Minnesota: Minneapolis, Minnesota, US]"
+date:
+- "May 28, 2018"
+- "; revised June 23, 2022"
+output:
+  pdf_document:
+    toc: true
+    toc_depth: 3
+    keep_tex: true
+header-includes:
+  - \usepackage{longtable}
+  - \newcommand\T{\rule{0pt}{2.6ex}}       % Top strut
+  - \newcommand\B{\rule[-1.2ex]{0pt}{0pt}} % Bottom strut
+params:
+  alphaFile:            "test-data/alpha_levels.tabular"
+  inputFile:            "test-data/test_input_for_anova.tabular"
+  preprocDb:            "test-data/test_input_for_anova.sqlite"
+  kseaAppPrepDb:        !r c(":memory:", "test-data/mqppep.sqlite")[2]
+  show_toc:             true
+  firstDataColumn:      "^Intensity[^_]"
+  imputationMethod:     !r c("group-median", "median", "mean", "random")[1]
+  meanPercentile:       1
+  sdPercentile:         1.0
+  regexSampleNames:     "\\.\\d+[A-Z]$"
+  regexSampleGrouping:  "\\d+"
+  imputedDataFilename:  "test-data/limbo/imputedDataFilename.txt"
+  imputedQNLTDataFile:  "test-data/limbo/imputedQNLTDataFile.txt"
+  anovaKseaMetadata:    "test-data/limbo/anovaKseaMetadata.txt"
+  oneWayManyCategories: !r c("aov", "kruskal.test", "oneway.test")[1]
+  oneWayTwoCategories:  !r c("aov", "kruskal.test", "oneway.test")[3]
+  kseaCutoffStatistic:  !r c("p.value", "FDR")[2]
+  kseaCutoffThreshold:  !r c( 0.1, 0.05)[2]
+  kseaMinKinaseCount:   1
+  intensityHeatmapRows: 75
+---
+<!--
+  kseaCutoffStatistic:  !r c("p.value", "FDR")[2]
+  kseaCutoffThreshold:  !r c(0.05, 0.1)[1]
+
+  alphaFile:            "test-data/alpha_levels.tabular"
+  inputFile:            "test-data/test_input_for_anova.tabular"
+  preprocDb:            "test-data/test_input_for_anova.sqlite"
+  kseaAppPrepDb:        !r c(":memory:", "test-data/mqppep.sqlite")[2]
+
+  alphaFile:            "test-data/alpha_levels.tabular"
+  inputFile:            "test-data/UT_phospho_ST_sites.preproc.tabular"
+  preprocDb:            "test-data/UT_phospho_ST_sites.preproc.sqlite"
+  kseaAppPrepDb:        !r c(":memory:", "test-data/UT_phospho_ST_sites.ksea.sqlite")[2]
+
+  alphaFile:            "test-data/alpha_levels.tabular"
+  inputFile:            "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.tabular"
+  preprocDb:            "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.sqlite"
+  kseaAppPrepDb:        !r c(":memory:", "test-data/pST_Sites_NancyDu.ksea.sqlite")[2]
+
+  alphaFile:            "test-data/alpha_levels.tabular"
+  inputFile:            "test-data/pST_Sites_NancyDu.txt.preproc.tabular"
+  preprocDb:            "test-data/pST_Sites_NancyDu.txt.preproc.sqlite"
+  kseaAppPrepDb:        !r c(":memory:", "test-data/pST_Sites_NancyDu.ksea.sqlite")[2]
+
+  inputFile:            "test-data/density_failure.preproc_tab.tabular"
+  kseaAppPrepDb:        !r c(":memory:", "mqppep.sqlite")[2]
+  latex_document: default
+-->
+```{r setup, include = FALSE}
+#ref for debugging: https://yihui.org/tinytex/r/#debugging
+options(tinytex.verbose = TRUE)
+
+# ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285
+# ref for top and bottom struts: https://tex.stackexchange.com/a/50355
+knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10))
+
+# freeze the random number generator so the same results will be produced
+#  from run to run
+set.seed(28571)
+
+### LIBRARIES
+library(gplots)
+library(DBI)
+library(RSQLite)
+# Suppress "Warning: no DISPLAY variable so Tk is not available"
+suppressWarnings(suppressMessages(library(sqldf)))
+
+# required but not added to search list
+#   (DBI and RSQLite were listed here too, but both are attached above):
+# - ggplot2
+# - knitr
+# - latex2exp
+# - preprocessCore
+# - reshape2
+# - vioplot
+#   (presumably these are called with the `pkg::` prefix -- TODO confirm)
+
+### CONSTANTS
+# figure region dimensions (width, height) in inches when this chunk runs
+const_parfin <- par("fin")
+const_boxplot_fill <- "grey94"
+const_stripchart_cex <- 0.5
+const_stripsmall_cex <-
+  sqrt(const_stripchart_cex * const_stripchart_cex / 2) # = cex / sqrt(2)
+const_stripchart_jitter <- 0.3
+const_write_debug_files <- FALSE
+const_table_anchor_bp <- "bp"
+const_table_anchor_ht <- "ht"
+const_table_anchor_p <- "p"
+const_table_anchor_tbp <- "tbp"
+# `which_kinases` selectors for ksea_heatmap: kinases marked with an
+#   asterisk (below the cutoff), kinases without one, or all kinases
+const_ksea_astrsk_kinases    <- 1
+const_ksea_nonastrsk_kinases <- 2
+const_ksea_all_kinases       <- 3
+# log10(e): multiplying a natural logarithm by this gives the base-10 log
+const_log10_e <- log10(exp(1))
+
+### FUNCTIONS
+
+# from `demo(error.catching)`
+##' Catch *and* save both errors and warnings, and in the case of
+##' a warning, also keep the computed result.
+##'
+##' @title tryCatch both warnings (with value) and errors
+##' @param expr an \R expression to evaluate
+##' @return a list with 'value' and 'warning', where
+##'   'value' may be an error caught.
+##' @author Martin Maechler;
+##' Copyright (C) 2010-2012  The R Core Team
+try_catch_w_e <- function(expr) {
+  # `expr` is a promise: it is first evaluated inside tryCatch() below, so
+  #   conditions it signals reach the handlers installed here.
+  caught_warning <- NULL
+  # Remember the warning, then resume `expr` so its value is still computed;
+  #   if several warnings are signalled, only the last one is kept.
+  remember_and_muffle <- function(w) {
+    caught_warning <<- w
+    invokeRestart("muffleWarning")
+  }
+  # An error ends evaluation; the condition object becomes the value.
+  result <- withCallingHandlers(
+    tryCatch(expr, error = function(e) e),
+    warning = remember_and_muffle
+  )
+  # Assemble the list only now, after any warning has been recorded.
+  list(value = result, warning = caught_warning)
+}
+
+
+write_debug_file <- function(s) {
+  # When const_write_debug_files is TRUE, dump `s` as a tab-separated table
+  #   to test-data/<expression passed as s>.txt; otherwise do nothing.
+  if (const_write_debug_files) {
+    s_path <- sprintf("test-data/%s.txt", deparse(substitute(s)))
+    # Fix: this message formerly referenced the undefined name `spath`.
+    print(sprintf("DEBUG writing file %s", s_path))
+    write.table(
+      s,
+      file = s_path, sep = "\t", quote = FALSE,
+      col.names = TRUE, row.names = TRUE
+    )
+  }
+}
+
+# ref: http://adv-r.had.co.nz/Environments.html
+# "When creating your own environment, note that you should set its parent
+#   environment to be the empty environment. This ensures you don't
+#   accidentally inherit objects from somewhere else."
+# Caution: this prevents `with(my_env, expr)` from working when `expr`
+#   contains anything from the global environment, even operators!
+#   Hence, `x <- 1; get("x", new_env())` fails by design.
+new_env <- function() {
+  new.env(parent = emptyenv()) # empty parent: lookups cannot fall through
+}
+
+### numerical/statistical helper functions
+
+any_nan <- function(x) {
+  !any(x == "NaN") # NOTE: despite the name, TRUE means NO element is "NaN"
+}
+
+# determine standard deviation of quantile to impute
+sd_finite <- function(x) {
+  # Standard deviation of the finite entries of `x` (NA, NaN, +/-Inf dropped).
+  sd(x[is.finite(x)])
+}
+
+anova_func <- function(x, grouping_factor, one_way_f) {
+  # Apply the one-way test `one_way_f` (e.g., aov, kruskal.test, or
+  #   oneway.test) to intensities `x` grouped by `grouping_factor`, and
+  #   return the resulting p-value.
+  # `grouping_factor` is not a column of `fit_data`; the formula finds it
+  #   in this function's frame, which is the formula's environment.
+  fit_data <- data.frame(intensity = x)
+  fit <- one_way_f(formula = intensity ~ grouping_factor, data = fit_data)
+  if (identical(one_way_f, aov)) {
+    # aov: first entry of the "Pr(>F)" column of the ANOVA summary table
+    summary(fit)[[1]][["Pr(>F)"]][1]
+  } else {
+    # otherwise the result is expected to carry a `p.value` element
+    fit$p.value
+  }
+}
+
+
+### LaTeX functions
+
+latex_collapsed_vector <- function(collapse_string, v, underscore_whack = TRUE) {
+  # Write the elements of `v` to stdout as one string, joined by
+  #   `collapse_string`; nothing is appended after the last element.
+  # When `underscore_whack` is TRUE, each "_" is first escaped as "\_" so
+  #   that LaTeX does not read it as a subscript operator.
+  escaped <- v
+  if (underscore_whack) escaped <- gsub("_", "\\\\_", v)
+  cat(paste0(escaped, collapse = collapse_string))
+}
+
+latex_itemized_collapsed <- function(collapse_string, v, underscore_whack = TRUE) {
+  cat("\\begin{itemize}\n\\item ") # open the list and its first item
+  latex_collapsed_vector(collapse_string, v, underscore_whack) # joined `v`
+  cat("\n\\end{itemize}\n") # close the list
+}
+
+latex_itemized_list <- function(v, underscore_whack = TRUE) {
+  latex_itemized_collapsed("\n\\item ", v, underscore_whack) # one \item each
+}
+
+latex_enumerated_collapsed <- function(collapse_string, v, underscore_whack = TRUE) {
+  cat("\\begin{enumerate}\n\\item ") # open the list and its first item
+  latex_collapsed_vector(collapse_string, v, underscore_whack) # joined `v`
+  cat("\n\\end{enumerate}\n") # close the list
+}
+
+latex_enumerated_list <- function(v, underscore_whack = TRUE) {
+  latex_enumerated_collapsed("\n\\item ", v, underscore_whack) # one \item each
+}
+
+latex_table_row <- function(v, extra = "", underscore_whack = TRUE) {
+  latex_collapsed_vector(" & ", v, underscore_whack) # cells joined by " & "
+  cat(extra) # text placed after the last cell, e.g., struts
+  cat(" \\\\\n") # LaTeX end-of-row marker
+}
+
+# Use this like print.data.frame, from which it is adapted:
+data_frame_latex <-
+  function(
+    x,
+    ...,
+    # passed as `digits` to format.data.frame
+    digits = NULL,
+    # TRUE -> default column code "r"; FALSE -> "l" (see justification)
+    right = TRUE,
+    # maximum number of entries to print; NULL -> getOption("max.print")
+    max = NULL,
+    # tabular column specification, e.g. "l c c"; NULL -> built from `right`
+    justification = NULL,
+    # TRUE to emit \centering inside the table float
+    centered = TRUE,
+    # optional caption text, written verbatim inside \caption{}
+    caption = NULL,
+    # float placement: h (inline), b (bottom), t (top) or p (separate page)
+    anchor = "h",
+    # TRUE to escape underscores in column names and cell text
+    underscore_whack = TRUE
+  ) {
+    # Write data frame `x` to stdout as a LaTeX table float and return `x`
+    #   invisibly. A data frame without columns or without rows yields a
+    #   short message instead of a table. `...` is accepted (as it is by
+    #   print.data.frame) but is not used.
+    # The table is written in this order: \begin{table}[anchor], optional
+    #   \caption, optional \centering, \begin{tabular}{justification}, with
+    #   a caption a strut row and double rule, the header row, \hline, one
+    #   row per data row, \end{tabular}, \end{table}.
+    if (is.null(justification)) {
+      # Default: one "r" or "l" per column, separated by single spaces
+      column_code <- if (right) "r" else "l"
+      justification <-
+        Reduce(
+          f = paste,
+          x = rep_len(column_code, length(colnames(x)))
+        )
+    }
+    row_count <- length(rownames(x))
+    if (length(x) == 0L) {
+      # No columns: report only the row count;
+      #   ngettext picks the singular message when row_count is one
+      template <- ngettext(
+        row_count,
+        "data frame with 0 columns and %d row",
+        "data frame with 0 columns and %d rows"
+      )
+      cat(sprintf(template, row_count), "\n", sep = "")
+    } else if (row_count == 0L) {
+      # Columns but no rows: list the column names rather than a table
+      cat("0 rows for:\n")
+      latex_itemized_list(
+        v = names(x),
+        underscore_whack = underscore_whack
+      )
+    } else {
+      # Resolve and validate the entry limit
+      if (is.null(max)) max <- getOption("max.print", 99999L)
+      if (!is.finite(max)) {
+        stop("invalid 'max' / getOption(\"max.print\"): ", max)
+      }
+      # Number of whole rows that fit within `max` entries
+      rows_allowed <- max %/% length(x)
+      omit <- rows_allowed < row_count
+      shown <- if (omit) x[seq_len(rows_allowed), , drop = FALSE] else x
+      # Convert to a character matrix of formatted cells
+      m <- as.matrix(
+        format.data.frame(
+          shown,
+          digits = digits,
+          na.encode = FALSE
+        )
+      )
+      # Open the float; `anchor` is its placement specifier
+      cat(paste0("\\begin{table}[", anchor, "]\n"))
+      if (!is.null(caption)) cat(paste0(" \\caption{", caption, "}"))
+      if (centered) cat("\\centering\n")
+      # Open the tabular with its column specification
+      cat(paste0(" \\begin{tabular}{", justification, "}\n"))
+      # ref: https://tex.stackexchange.com/a/50353
+      #   Describes use of \rule{0pt}{3ex}
+      if (!is.null(caption)) cat("\\B \\\\ \\hline\\hline\n")
+      # Header row, followed by the top (\T) and bottom (\B) struts
+      # ref for top and bottom struts: https://tex.stackexchange.com/a/50355
+      latex_table_row(
+        v = colnames(m),
+        extra = "\\T\\B",
+        underscore_whack = underscore_whack
+      )
+      cat("\\hline\n")
+      # Body: one tabular row per matrix row
+      for (row_index in seq_len(nrow(m))) {
+        latex_table_row(
+          v = m[row_index, ],
+          underscore_whack = underscore_whack
+        )
+      }
+      # Close the tabular and the float; no trailing newline is written
+      cat(" \\end{tabular}\n\\end{table}")
+      if (omit) {
+        # Report how many rows were left out
+        cat(" [ reached 'max' / getOption(\"max.print\") -- omitted",
+          row_count - rows_allowed, "rows ]\n")
+      }
+    }
+    invisible(x)
+  }
+
+hypersub <-
+  function(s) {
+    # Derive a hyperlink anchor from title `s`: lower-case it, turn each run
+    #   of characters outside [a-z0-9] into one "-", then trim one leading
+    #   and one trailing "-".
+    slug <- gsub("[^a-z0-9]+", "-", tolower(s))
+    slug <- gsub("[-]+", "-", slug)
+    sub("[-]$", "", sub("^[-]", "", slug))
+  }
+
+subsection_header <-
+  function(s) {
+    # Write a LaTeX subsection titled `s` to stdout, wrapped in a
+    #   \hypertarget and given a \label; both use the anchor hypersub(s).
+    anchor <- hypersub(s)
+    tex <- sprintf(
+      "\\hypertarget{%s}\n{\\subsection{%s}\\label{%s}}\n", anchor, s, anchor
+    )
+    cat(tex)
+  }
+
+subsubsection_header <-
+  function(s) {
+    # Write a LaTeX subsubsection titled `s` to stdout, wrapped in a
+    #   \hypertarget and given a \label; both use the anchor hypersub(s).
+    anchor <- hypersub(s)
+    tex <- sprintf(
+      "\\hypertarget{%s}\n{\\subsubsection{%s}\\label{%s}}\n", anchor, s, anchor
+    )
+    cat(tex)
+  }
+
+### SQLite functions
+
+ddl_exec <- function(db, sql) {
+  # Execute one DDL statement on a database connection.
+  # Arguments:
+  #   db  - connection object, passed as `conn` to DBI::dbExecute
+  #   sql - the statement text, passed as `statement`
+  # Returns NULL invisibly; the count reported by dbExecute is dropped.
+  #
+  # A LaTeX verbatim report of `sql` and the affected-row count used to
+  #   follow the call, guarded by `if (FALSE && ...)`. That branch could
+  #   never run, so it has been removed; dml_no_rows_exec carries the
+  #   same kind of report in live form should it be wanted here again.
+  DBI::dbExecute(conn = db, statement = sql)
+  # Keep the result invisible and NULL, as the disabled branch left it.
+  invisible(NULL)
+}
+
+dml_no_rows_exec <- function(db, sql) {
+  # Execute `sql`, presumably a DML statement expected to affect no rows.
+  #   If rows are affected, write a page break and a verbatim report of
+  #   the statement and the count to stdout. Side effect in that case:
+  #   `need_newpage` in the enclosing scope is set to FALSE.
+  affected <- DBI::dbExecute(conn = db, statement = sql)
+  if (affected != 0) {
+    need_newpage <<- FALSE
+    cat("\\newpage\n")
+    cat("\n\\begin{verbatim}\n")
+    cat(sql, file = stdout())
+    cat(sprintf("\n%d rows affected by DML\n", affected), file = stdout())
+    cat("\n\\end{verbatim}\n")
+  }
+}
+
+### KSEA functions and helpers
+
+# Adapted from KSEAapp::KSEA.Scores.
+# Returns a data frame having one row per kinase gene and the columns
+#   "Kinase.Gene", "mS", "Enrichment", "m", "z.score", "p.value", "FDR"
+#   (mS = mean log2 fold-change of the kinase's substrates; m = their count)
+ksea_scores <- function(
+
+  # Kinase-substrate data; for human data, typically KSEAapp::ksdata.
+  #   Columns are used both by name and by position, so the 14-column
+  #   layout listed ahead of the merge in the body is assumed.
+  ksdata,
+
+  # Input data frame having columns, in this order:
+  # - Protein      : abbreviated protein name
+  # - Gene         : HUGO gene name
+  # - Peptide      : peptide sequence without indications of phosphorylation
+  # - Residue.Both : position(s) of phosphorylation within Gene sequence
+  #                  - First letter designates AA that is modified
+  #                  - Numbers indicate position within Gene
+  #                  - Multiple values are separated by semicolons
+  # - p            : p-value
+  # - FC           : fold-change
+  px,
+
+  # TRUE or FALSE, indicating whether or not to include NetworKIN
+  #   predictions
+  networkin,
+
+  # A numeric value between 1 and infinity setting the minimum NetworKIN
+  #   score (can be left out if networkin = FALSE)
+  networkin_cutoff
+
+) {
+  has_multiple <- grepl(";", px$Residue.Both)
+  if (!any(has_multiple)) {
+    # There are no Residue.Both entries having semicolons, so new is
+    #   simply px except two columns are renamed and a column is added
+    #   for log2(abs(fold-change))
+    # NOTE(review): this branch takes abs() of the fold-change, whereas
+    #   the branch below discards non-positive fold-changes; the two
+    #   agree only for positive values. Left as found; confirm intent.
+    new <- px
+    colnames(new)[c(2, 4)] <- c("SUB_GENE", "SUB_MOD_RSD")
+    new$log2_fc <- log2(abs(as.numeric(as.character(new$FC))))
+    new <- new[complete.cases(new$log2_fc), ]
+  } else {
+    # Split each row having semicolons in Residue.Both into rows that are
+    #   duplicated in all respects except that each row has a single
+    #   member of the set "split-on-semicolon-Residue.Both"
+    px_double <- px[has_multiple, ]
+    split <- strsplit(as.character(px_double$Residue.Both), split = ";")
+    # x gets count of residues in each row,
+    #   i.e., 1 + count of semicolons
+    x <- lengths(split)
+    # Here is the set of split rows
+    px_single <- data.frame(
+      Protein      = rep(px_double$Protein, x),
+      Gene         = rep(px_double$Gene,    x),
+      Peptide      = rep(px_double$Peptide, x),
+      Residue.Both = unlist(split),
+      p            = rep(px_double$p,       x),
+      FC           = rep(px_double$FC,      x)
+      )
+    # new first gets the rows that didn't need splitting in the first place
+    new <- px[!has_multiple, ]
+    # to new, append the split rows
+    new <- rbind(new, px_single)
+    # map Gene         to SUB_GENE
+    # map Residue.Both to SUB_MOD_RSD
+    colnames(new)[c(2, 4)] <- c("SUB_GENE", "SUB_MOD_RSD")
+    # Eliminate any non-positive values to prevent introduction of
+    #   infinite or NaN values. (This step formerly indexed new$log2_fc
+    #   before that column existed, and tested `0 <=`, so it removed
+    #   nothing and a fold-change of zero went on to become -Inf.)
+    fold_change <- as.numeric(as.character(new$FC))
+    fold_change[!is.na(fold_change) & fold_change <= 0] <- NA
+    # Because of preceding step, there is no need for abs in the next line
+    new$log2_fc <- log2(fold_change)
+    # Eliminate rows having missing values (e.g., non-imputed data)
+    new <- new[complete.cases(new$log2_fc), ]
+  }
+  if (networkin == TRUE) {
+    # When NetworKIN is true, filter on NetworKIN.cutoff which includes
+    #   PhosphoSitePlus data *because its networkin_score is set to Inf*
+    ksdata_filtered <- ksdata[grep("[a-z]", ksdata$Source), ]
+    ksdata_filtered <- ksdata_filtered[
+      (ksdata_filtered$networkin_score >= networkin_cutoff), ]
+  } else {
+    # Otherwise, simply use PhosphoSitePlus rows
+    ksdata_filtered <- ksdata[
+      grep("PhosphoSitePlus", ksdata$Source), ]
+  }
+  # Join the two data.frames on common columns SUB_GENE and SUB_MOD_RSD
+  #   colnames of ksdata_filtered:
+  #     "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE" "SUB_GENE_ID"
+  #     "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD" "SITE_GRP_ID"
+  #     "SITE_...7_AA" "networkin_score" "Source"
+  #   colnames of new:
+  #     "Protein" "SUB_GENE" "Peptide" "SUB_MOD_RSD" "p" "FC" "log2_fc"
+  # Equivalent to:
+  #   SELECT a.*, b.Protein, b.Peptide, b.p, b.FC, b.log2_fc
+  #     FROM ksdata_filtered a
+  #       INNER JOIN new b
+  #         ON a.SUB_GENE = b.SUB_GENE
+  #           AND a.SUB_MOD_RSD = b.SUB_MOD_RSD
+  ksdata_dataset <- base::merge(ksdata_filtered, new)
+  #   colnames of ksdata_dataset (merge puts the join columns first):
+  #      1 "SUB_GENE"    2 "SUB_MOD_RSD"   3 "KINASE"      4 "KIN_ACC_ID"
+  #      5 "GENE"        6 "KIN_ORGANISM"  7 "SUBSTRATE"   8 "SUB_GENE_ID"
+  #      9 "SUB_ACC_ID" 10 "SUB_ORGANISM" 11 "SITE_GRP_ID" 12 "SITE_...7_AA"
+  #     13 "networkin_score" 14 "Source"  15 "Protein"    16 "Peptide"
+  #     17 "p"          18 "FC"           19 "log2_fc"
+  # Re-order dataset by kinase gene
+  ksdata_dataset <- ksdata_dataset[order(ksdata_dataset$GENE), ]
+  # (An isoform-free UniProtKB accession column was formerly derived from
+  #   KIN_ACC_ID at this point; nothing read it, so it is not computed.)
+  # Select the interesting columns by position (see the list above) ...
+  ksdata_dataset_abbrev <- ksdata_dataset[, c(5, 1, 2, 16:19, 14)]
+  # Column names are now:
+  #   "GENE"       "SUB_GENE"        "SUB_MOD_RSD"    "Peptide" "p"
+  #   "FC" "log2_fc" "Source"
+  # Make column names human-readable
+  colnames(ksdata_dataset_abbrev) <- c(
+    "Kinase.Gene", "Substrate.Gene", "Substrate.Mod", "Peptide", "p",
+    "FC", "log2FC", "Source"
+    )
+  # SELECT * FROM ksdata_dataset_abbrev
+  #   ORDER BY Kinase.Gene, Substrate.Gene, Substrate.Mod, p
+  ksdata_dataset_abbrev <-
+    ksdata_dataset_abbrev[
+      order(
+        ksdata_dataset_abbrev$Kinase.Gene,
+        ksdata_dataset_abbrev$Substrate.Gene,
+        ksdata_dataset_abbrev$Substrate.Mod,
+        ksdata_dataset_abbrev$p),
+      ]
+  # First aggregation step to account for multiply phosphorylated peptides
+  #   and differing peptide sequences; the goal here is to combine results
+  #   for all measurements of the same substrate.
+  # SELECT  `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`,
+  #         `Source`, avg(log2FC) AS log2FC
+  #   FROM  ksdata_dataset_abbrev
+  #   GROUP BY `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`,
+  #         `Source`
+  #   ORDER BY `Kinase.Gene`;
+  # in two steps:
+  # (1) compute average log_2(fold-change)
+  ksdata_dataset_abbrev <- aggregate(
+    log2FC ~ Kinase.Gene + Substrate.Gene + Substrate.Mod + Source,
+    data = ksdata_dataset_abbrev,
+    FUN = mean
+    )
+  # (2) order by Kinase.Gene
+  ksdata_dataset_abbrev <-
+    ksdata_dataset_abbrev[order(ksdata_dataset_abbrev$Kinase.Gene), ]
+  # SELECT  `Kinase.Gene`, count(*)
+  #   FROM  ksdata_dataset_abbrev
+  #  GROUP BY `Kinase.Gene`;
+  # in two steps:
+  # (1) Extract the list of Kinase.Gene names
+  kinase_list <- as.vector(ksdata_dataset_abbrev$Kinase.Gene)
+  # (2) Convert to a one-column matrix of counts of kinases in
+  #   ksdata_dataset_abbrev, with rows named by Kinase.Gene
+  kinase_list <- as.matrix(table(kinase_list))
+  # Second aggregation step to account for all substrates per kinase
+  # CREATE TABLE mean_fc
+  #   AS
+  # SELECT  `Kinase.Gene`, avg(log2FC) AS log2FC
+  #   FROM  ksdata_dataset_abbrev
+  #   GROUP BY `Kinase.Gene`
+  mean_fc <- aggregate(
+    log2FC ~ Kinase.Gene,
+    data = ksdata_dataset_abbrev,
+    FUN = mean
+    )
+  # mean_fc columns: "Kinase.Gene", "log2FC"
+  # (A per-kinase max(abs(log2FC)) was sketched here under `if (FALSE)`;
+  #   it could never run and its result was unused, so it is omitted.)
+
+  # Mean and standard deviation of log2(fold-change) over all of `new`,
+  #   i.e., over every retained input row, matched to a kinase or not
+  overall_mean <- mean(new$log2_fc, na.rm = TRUE)
+  overall_sd <- sd(new$log2_fc, na.rm = TRUE)
+  # Create column 3: mS
+  mean_fc$m_s <- mean_fc[, 2]
+  # Create column 4: Enrichment
+  mean_fc$enrichment <- mean_fc$m_s / abs(overall_mean)
+  # Create column 5: m, count of substrates
+  #   (assigned by position: assumes that table() and aggregate() both
+  #   return the kinases in the same sorted order)
+  mean_fc$m <- kinase_list
+  # Create column 6: z-score
+  mean_fc$z_score <-
+    ((mean_fc$m_s - overall_mean) * sqrt(mean_fc$m)) / overall_sd
+  # Create column 7: p-value, deduced from z-score
+  mean_fc$p_value <- pnorm(-abs(mean_fc$z_score))
+  # Create column 8: FDR, deduced by Benjamini-Hochberg adjustment from p-value
+  mean_fc$fdr <- p.adjust(mean_fc$p_value, method = "fdr")
+
+  # Remove log2FC column, which is duplicated as mS
+  mean_fc <- mean_fc[order(mean_fc$Kinase.Gene), -2]
+  # Correct the column names which we had to hack because of the linter...
+  colnames(mean_fc) <- c(
+    "Kinase.Gene", "mS", "Enrichment", "m", "z.score", "p.value", "FDR"
+    )
+  return(mean_fc)
+}
+
+low_fdr_barplot <- function(
+  rslt,
+  i_cntrst,
+  i,
+  a_level,
+  b_level,
+  fold_change,
+  caption
+) {
+  # Draw a horizontal bar plot of the z-scores of the kinases in
+  #   rslt$score_list[[i]] whose cutoff statistic is below the threshold.
+  # Reads the globals ksea_cutoff_statistic ("FDR" or "p.value") and
+  #   ksea_cutoff_threshold. Nothing is drawn when the score list entry is
+  #   NULL or when fewer than two kinases pass the cutoff.
+  # `fold_change` is accepted but not used; `caption` is appended to the
+  #   plot title; `i_cntrst`, `a_level` and `b_level` only label rows of
+  #   the working data frame.
+  scores <- rslt$score_list[[i]]
+  if (is.null(scores)) {
+    return(invisible(NULL))
+  }
+  score_rows <- nrow(scores)
+  k <- data.frame(
+    contrast = as.integer(i_cntrst),
+    a_level = rep.int(a_level, score_rows),
+    b_level = rep.int(b_level, score_rows),
+    kinase_gene = scores$Kinase.Gene,
+    mean_log2_fc = scores$mS,
+    enrichment = scores$Enrichment,
+    substrate_count = scores$m,
+    z_score = scores$z.score,
+    p_value = scores$p.value,
+    fdr = scores$FDR
+  )
+  # Pick the column holding the statistic that the cutoff applies to
+  selector <- switch(
+    ksea_cutoff_statistic,
+    "FDR" = k$fdr,
+    "p.value" = k$p_value,
+    stop(
+      sprintf(
+        "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
+        ksea_cutoff_statistic
+      )
+    )
+  )
+  k <- k[selector < ksea_cutoff_threshold, ]
+  if (nrow(k) <= 1) {
+    return(invisible(NULL))
+  }
+  # Widen the left margin (presumably for kinase names); restored below
+  op <- par(mai = c(1, 1.5, 0.4, 0.4))
+  numeric_z_score <- as.numeric(k$z_score)
+  z_score_order <- order(numeric_z_score)
+  long_caption <- sprintf(
+    "Kinase z-score, %s < %s, %s",
+    ksea_cutoff_statistic, ksea_cutoff_threshold, caption
+  )
+  # Shrink the title in proportion once it exceeds 65 characters
+  my_cex_caption <- 65.0 / max(65.0, nchar(long_caption))
+  cat("\n\\clearpage\n")
+  barplot(
+    height = numeric_z_score[z_score_order],
+    names.arg = k$kinase_gene[z_score_order],
+    main = long_caption,
+    cex.main = my_cex_caption,
+    cex.names = 1.0,
+    cex.axis = 1.0,
+    border = NA, xpd = FALSE, horiz = TRUE, srt = 45, las = 1
+  )
+  par(op)
+}
+
+# note that this adds elements to the global variable `ksea_asterisk_hash`
+
+low_fdr_print <- function(
+  rslt,
+  i_cntrst,
+  i,
+  a_level,
+  b_level,
+  fold_change,
+  caption
+) {
+  # Record and report the KSEA scores in rslt$score_list[[i]] for one
+  #   contrast (nothing is done when that entry is NULL):
+  # - every kinase passing the cutoff is flagged in ksea_asterisk_hash;
+  # - all of the scores are written to table "contrast_ksea_scores" of db;
+  # - the kinases passing the cutoff are printed as a LaTeX table, or a
+  #   sentence is printed saying that there are none.
+  # Reads the globals ksea_cutoff_statistic ("FDR" or "p.value"),
+  #   ksea_cutoff_threshold, db, ksea_asterisk_hash, const_table_anchor_p.
+  # Arguments:
+  #   rslt        - list whose `score_list` element holds, per index, a
+  #                 data frame shaped like the result of ksea_scores, or NULL
+  #   i_cntrst    - contrast number stored in column `contrast`; a value
+  #                 below 2 replaces the database table, others append
+  #   i           - index into rslt$score_list
+  #   a_level,
+  #   b_level     - labels stored in columns `a_level` and `b_level`
+  #   fold_change - accepted but not used
+  #   caption     - contrast description for the table caption or for the
+  #                 "No kinases had ..." sentence
+
+  rslt_score_list_i <- rslt$score_list[[i]]
+  if (!is.null(rslt_score_list_i)) {
+    rslt_score_list_i_nrow <- nrow(rslt_score_list_i)
+    contrast_ksea_scores <- data.frame(
+      contrast = as.integer(i_cntrst),
+      a_level = rep.int(a_level, rslt_score_list_i_nrow),
+      b_level = rep.int(b_level, rslt_score_list_i_nrow),
+      kinase_gene = rslt_score_list_i$Kinase.Gene,
+      mean_log2_fc = rslt_score_list_i$mS,
+      enrichment = rslt_score_list_i$Enrichment,
+      substrate_count = rslt_score_list_i$m,
+      z_score = rslt_score_list_i$z.score,
+      p_value = rslt_score_list_i$p.value,
+      fdr = rslt_score_list_i$FDR
+    )
+
+    # One selection serves the hash, the printed table, and the test for
+    #   whether any kinase passed, because all three use the same statistic
+    selector <- switch(
+      ksea_cutoff_statistic,
+      "FDR" = {
+        contrast_ksea_scores$fdr
+        },
+      "p.value" = {
+        contrast_ksea_scores$p_value
+        },
+      stop(
+        sprintf(
+          "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
+          ksea_cutoff_statistic
+          )
+        )
+      )
+    passes_cutoff <- selector < ksea_cutoff_threshold
+
+    # save kinase names to ksea_asterisk_hash
+    for (kinase_name in contrast_ksea_scores$kinase_gene[passes_cutoff]) {
+      ksea_asterisk_hash[[kinase_name]] <- 1
+    }
+
+    # Replace the table for the first contrast and append for later ones.
+    # `overwrite` is now passed along with `append`; with only `append`
+    #   given, writing the first contrast would fail if the table already
+    #   existed in db.
+    db_write_table_overwrite <- (i_cntrst < 2)
+    db_write_table_append    <- !db_write_table_overwrite
+    RSQLite::dbWriteTable(
+      conn = db,
+      name = "contrast_ksea_scores",
+      value = contrast_ksea_scores,
+      overwrite = db_write_table_overwrite,
+      append = db_write_table_append
+      )
+
+    # Rows passing the cutoff, ordered by mean log2 fold-change, then kinase
+    output_df <- contrast_ksea_scores[
+      passes_cutoff,
+      c("kinase_gene", "mean_log2_fc", "enrichment", "substrate_count",
+        "z_score", "p_value", "fdr")
+      ]
+    output_order <- with(output_df, order(mean_log2_fc, kinase_gene))
+    output_df <- output_df[output_order, ]
+    # Format the statistics as text for display
+    output_df$fdr <- sprintf("%0.4f", output_df$fdr)
+    output_df$p_value <- sprintf("%0.2e", output_df$p_value)
+    output_df$z_score <- sprintf("%0.2f", output_df$z_score)
+    output_df$substrate_count <- sprintf("%d", output_df$substrate_count)
+    output_df$enrichment <- sprintf("%0.2f", output_df$enrichment)
+
+    # Column headings. The subscript is written with \sb because
+    #   data_frame_latex escapes every "_" in a heading (underscore_whack
+    #   defaults to TRUE), which would print "log_2" with a literal
+    #   underscore instead of setting the 2 as a subscript.
+    colnames(output_df) <-
+      c(
+        "Kinase",
+        "\\(\\overline{\\log\\sb{2} (|\\text{fold-change}|)}\\)",
+        "Enrichment",
+        "Substrates",
+        "z-score",
+        "p-value",
+        "FDR"
+      )
+
+    # NOTE(review): an NA statistic makes this sum NA, and `if` then stops
+    #   with an error; that behavior is unchanged.
+    if (sum(passes_cutoff) > 0) {
+      # Escape braces in the caption
+      math_caption <- gsub("{", "\\{", caption,      fixed = TRUE)
+      math_caption <- gsub("}", "\\}", math_caption, fixed = TRUE)
+      data_frame_latex(
+        x = output_df,
+        justification = "l c c c c c c",
+        centered = TRUE,
+        caption = sprintf(
+          "\\text{%s}, %s < %s",
+          math_caption,
+          ksea_cutoff_statistic,
+          ksea_cutoff_threshold
+          ),
+        anchor = const_table_anchor_p
+        )
+    } else {
+      # No kinase passed the cutoff: say so in the report. The line breaks
+      #   and indentation inside the literal are part of the string.
+      cat(
+        sprintf(
+          "\\break
+          No kinases had
+          \\(\\text{%s}_\\text{enrichment} < %s\\)
+          for contrast %s\\hfill\\break\n",
+          ksea_cutoff_statistic,
+          ksea_cutoff_threshold,
+          caption
+        )
+      )
+    }
+  }
+}
+
+# create_breaks is a helper for ksea_heatmap
+create_breaks <- function(merged_scores) {
+  # Build the breaks and the color ramp for a heat map of `merged_scores`.
+  # Returns an unnamed list: [[1]] the numeric break points and [[2]] the
+  #   vector of colors.
+  # Between -1.6 and 1.6 there are 30 evenly spaced breaks on each side of
+  #   zero; scores beyond +/-1.6 add 10 evenly spaced breaks that reach out
+  #   to the observed extreme (the shared end point is de-duplicated).
+  # Negative values shade from blue to white, positive from white to red.
+
+  inner_limit <- 1.6
+  lowest <- min(merged_scores, na.rm = TRUE)
+  highest <- max(merged_scores, na.rm = TRUE)
+
+  breaks_neg <- seq(-inner_limit, 0, length.out = 30)
+  if (lowest < -inner_limit) {
+    outer_neg <- seq(lowest, -inner_limit, length.out = 10)
+    breaks_neg <- sort(unique(append(outer_neg, breaks_neg)))
+  }
+
+  breaks_pos <- seq(0, inner_limit, length.out = 30)
+  if (highest > inner_limit) {
+    outer_pos <- seq(inner_limit, highest, length.out = 10)
+    breaks_pos <- sort(unique(append(breaks_pos, outer_pos)))
+  }
+
+  # Zero ends the negative breaks and starts the positive ones; keep one
+  breaks_all <- unique(append(breaks_neg, breaks_pos))
+  mycol_neg <- gplots::colorpanel(
+    n = length(breaks_neg), low = "blue", high = "white"
+  )
+  mycol_pos <- gplots::colorpanel(
+    n = length(breaks_pos) - 1, low = "white", high = "red"
+  )
+  # Likewise white ends one ramp and starts the other
+  mycol <- unique(append(mycol_neg, mycol_pos))
+  list(breaks_all, mycol)
+}
+
+# draw_kseaapp_summary_heatmap is a helper function for ksea_heatmap
+draw_kseaapp_summary_heatmap <- function(
+    x,
+    sample_cluster,
+    merged_asterisk,
+    my_cex_row,
+    color_breaks,
+    margins,
+    ...
+) {
+  # Draw the KSEA summary heat map for matrix `x` with gplots::heatmap.2,
+  #   or write to stdout why it cannot be drawn: `x` is not a matrix, or
+  #   it has fewer than two rows or fewer than two columns.
+  # color_breaks is list(breaks, colors), the shape create_breaks returns;
+  #   merged_asterisk supplies the cell notes; sample_cluster is passed as
+  #   Colv; `...` is forwarded to heatmap.2.
+  # Helper for the two "too small" cases: message, then str(x) verbatim
+  report_too_small <- function(count, what) {
+    cat("No plot because matrix x has ", count, what)
+    cat("\\begin{verbatim}\n")
+    str(x)
+    cat("\\end{verbatim}\n")
+  }
+  if (!is.matrix(x)) {
+    cat(paste0(
+      "No plot because \\texttt{typeof(x)} is '", typeof(x),
+      "' rather than 'matrix'.\n\n"
+    ))
+  } else if (nrow(x) < 2) {
+    report_too_small(nrow(x), " rows.\n\n")
+  } else if (ncol(x) < 2) {
+    report_too_small(ncol(x), " columns.\n\n")
+  } else {
+    gplots::heatmap.2(
+      x            = x,
+      Colv         = sample_cluster,
+      scale        = "none",
+      cellnote     = merged_asterisk,
+      notecol      = "white",
+      notecex      = 3 * my_cex_row,
+      cexCol       = 0.9,
+      # Heuristically assign size of row labels
+      cexRow       = min(1.0, ((3 * my_cex_row) ^ 1.7) / 2.25),
+      srtCol = 45, srtRow = 45,
+      col          = color_breaks[[2]],
+      breaks       = color_breaks[[1]],
+      density.info = "none", trace = "none", key = FALSE,
+      lmat         = rbind(c(0, 3), c(2, 1), c(0, 4)),
+      lhei         = c(0.4, 8.0, 1.1),
+      lwid         = c(0.5, 3),
+      margins      = margins,
+      ...
+    )
+  }
+}
+
+# Adapted from KSEAapp::KSEA.Heatmap
+#
+# Draw a summary heatmap of KSEA z-scores and return the names of the
+#   kinases selected by `which_kinases`:
+# - each element of score_list is restricted to rows with m >= m_cutoff;
+# - the restricted tables are inner-joined on "Kinase.Gene";
+# - cells whose `stats` value is below p_cutoff are marked with "*";
+# - the selected rows are passed to draw_kseaapp_summary_heatmap.
+# When export is TRUE, the plot is written to "KSEA.Merged.Heatmap.png".
+# NOTE(review): `...` is accepted but is not forwarded to
+#   draw_kseaapp_summary_heatmap here - confirm whether that is intended.
+ksea_heatmap <- function(
+  # the data frame outputs from the KSEA.Scores() function, in list format
+  score_list,
+  # a character vector of all the sample names for heatmap annotation:
+  # - the names must be in the same order as the data in score_list
+  # - please avoid long names, as they may get cropped in the final image
+  sample_labels,
+  # character string of either "p.value" or "FDR" indicating the data column
+  #   to use for marking statistically significant scores
+  stats,
+  # a numeric value between 0 and infinity indicating the min. number of
+  #   substrates a kinase must have to be included in the heatmap
+  m_cutoff,
+  # a numeric value between 0 and 1 indicating the p-value/FDR cutoff
+  #   for indicating significant kinases in the heatmap
+  p_cutoff =
+    stop("argument 'p_cutoff' is required for function 'ksea_heatmap'"),
+  # a binary input of TRUE or FALSE, indicating whether or not to perform
+  #   hierarchical clustering of the sample columns
+  sample_cluster,
+  # a binary input of TRUE or FALSE, indicating whether or not to export
+  #   the heatmap as a .png image into the working directory
+  export = FALSE,
+  # bottom and right margins; adjust as needed if contrast names are too long
+  margins = c(6, 20),
+  # print which kinases?
+  # - Mandatory argument, must be one of const_ksea_.*_kinases
+  which_kinases,
+  # additional arguments to gplots::heatmap.2, such as:
+  # - main: main title of plot
+  # - xlab: x-axis label
+  # - ylab: y-axis label
+  ...
+) {
+  filter_m <- function(dataset, m_cutoff) {
+    filtered <- dataset[(dataset$m >= m_cutoff), ]
+    return(filtered)
+  }
+  score_list_m <- lapply(score_list, function(...) filter_m(..., m_cutoff))
+  # suffix columns 2 through 7 of each table with the table's position in
+  #   score_list so that they remain distinct after merging
+  for (i in seq_along(score_list_m)) {
+    names <- colnames(score_list_m[[i]])[c(2:7)]
+    colnames(score_list_m[[i]])[c(2:7)] <-
+      paste(names, i, sep = ".")
+  }
+  # inner join: only kinases present in every filtered table are retained
+  master <-
+    Reduce(
+      f = function(...) {
+        base::merge(..., by = "Kinase.Gene", all = FALSE)
+      },
+      x = score_list_m
+      )
+
+  row.names(master) <- master$Kinase.Gene
+  columns <- as.character(colnames(master))
+  # drop = FALSE keeps the row names (kinases) even for a single contrast
+  merged_scores <- as.matrix(master[, grep("z.score", columns), drop = FALSE])
+  colnames(merged_scores) <- sample_labels
+  merged_stats <- as.matrix(master[, grep(stats, columns), drop = FALSE])
+  # build a table holding "*" where the value is below p_cutoff and ""
+  #   elsewhere (including where the value is NA)
+  asterisk <- function(mtrx, p_cutoff) {
+    new <- data.frame()
+    for (i in seq_len(nrow(mtrx))) {
+      for (j in seq_len(ncol(mtrx))) {
+        my_value <- mtrx[i, j]
+        if (!is.na(my_value) && my_value < p_cutoff) {
+          new[i, j] <- "*"
+        } else {
+          new[i, j] <- ""
+        }
+      }
+    }
+    return(new)
+  }
+  merged_asterisk <- as.matrix(asterisk(merged_stats, p_cutoff))
+
+  # partition the kinases by whether any contrast is marked with "*"
+  all_rows <- rownames(merged_stats)
+  is_significant <- rowSums(merged_asterisk == "*") > 0
+  asterisk_rows <- all_rows[is_significant]
+  non_asterisk_rows <- all_rows[!is_significant]
+
+  row_list <- list()
+  row_list[[const_ksea_astrsk_kinases]] <- asterisk_rows
+  row_list[[const_ksea_all_kinases]] <- all_rows
+  row_list[[const_ksea_nonastrsk_kinases]] <- non_asterisk_rows
+
+  my_row_names <- row_list[[which_kinases]]
+  # drop = FALSE: a single selected kinase (or a single contrast) must stay
+  #   a matrix because nrow() and ncol() are applied to the result below
+  scrs <- merged_scores[my_row_names, , drop = FALSE]
+  stts <- merged_stats[my_row_names, , drop = FALSE]
+  merged_asterisk <- as.matrix(asterisk(stts, p_cutoff))
+
+  color_breaks <- create_breaks(scrs)
+  plot_height <- nrow(scrs) ^ 0.55
+  plot_width <- ncol(scrs) ^ 0.7
+  my_cex_row <- 0.25 * 16 / plot_height
+  if (export == "TRUE") {
+    png(
+      "KSEA.Merged.Heatmap.png",
+      width = plot_width * 300,
+      height = 2 * plot_height * 300,
+      res = 300,
+      pointsize = 14
+    )
+    # close the png device even when drawing the heatmap fails
+    on.exit(dev.off(), add = TRUE)
+  }
+  draw_kseaapp_summary_heatmap(
+    x               = scrs,
+    sample_cluster  = sample_cluster,
+    merged_asterisk = merged_asterisk,
+    my_cex_row      = my_cex_row,
+    color_breaks    = color_breaks,
+    margins         = margins
+  )
+  return(my_row_names)
+}
+
+# helper for heatmaps of phosphopeptide intensities
+#
+# Calls hm_heading_function(m, cutoff) to emit the heading; when it returns
+#   TRUE and m has more than one row, draws a row-scaled heatmap of the
+#   first min(max_peptide_count, nrow(m)) rows of m, in reverse order.
+# Returns that row count, or 0 when hm_heading_function returns FALSE.
+
+draw_intensity_heatmap <-
+  function(
+    m,                              # matrix with rownames already formatted
+    cutoff,                         # cutoff used by hm_heading_function
+    hm_heading_function,            # construct and cat heading from m and cutoff
+    hm_main_title,                  # main title for plot (drawn below heading)
+    suppress_row_dendrogram = TRUE, # set to false to show dendrogram
+    max_peptide_count               # experimental:
+          = intensity_hm_rows,      #   values of 50 and 75 worked well
+    ...                             # passthru parameters for heatmap
+  ) {
+    peptide_count <- 0
+    # emit the heading for the heatmap
+    if (hm_heading_function(m, cutoff)) {
+      peptide_count <- min(max_peptide_count, nrow(m))
+      if (nrow(m) > 1) {
+        m_margin <- m[peptide_count:1, ]
+        # Margin setting was heuristically derived
+        margins <-
+          c(0.5, # col
+            max(80, sqrt(nchar(rownames(m_margin)))) * 5 / 16  # row
+          )
+        # hm_main_title has no default; keep the historical title for
+        #   callers that do not supply it
+        main_title <-
+          if (missing(hm_main_title)) {
+            "Unimputed, unnormalized log(intensities)"
+          } else {
+            hm_main_title
+          }
+        old_par <- NULL
+        tryCatch(
+          {
+            # par() returns the previous value of the parameter that it
+            #   sets, in the form needed to restore it afterward
+            old_par <- par(cex.main = 0.6)
+            # Heuristically determined character size adjustment formula
+            # NOTE(review): this uses the global intensity_hm_rows rather
+            #   than max_peptide_count - confirm that this is intended
+            char_contractor <-
+              250000 / (
+                max(4500, (nchar(rownames(m_margin)))^2) * intensity_hm_rows
+                )
+            heatmap(
+              m_margin,
+              Rowv = if (suppress_row_dendrogram) NA else NULL,
+              Colv = NA,
+              cexRow = char_contractor,
+              cexCol = char_contractor * 50 / max_peptide_count,
+              scale = "row",
+              margins = margins,
+              main = main_title,
+              xlab = "",
+              las = 1,
+              ...
+              )
+          },
+          error = function(e) {
+            cat(
+              sprintf(
+                "\nCould not draw heatmap, possibly because of too many missing values.  Internal message: %s\n",
+                e$message
+                )
+              )
+            },
+          # restore cex.main whether or not the heatmap could be drawn
+          finally = if (!is.null(old_par)) par(old_par)
+        )
+      }
+    }
+    return(peptide_count)
+  }
+```
+
+```{r, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
+cat("\\listoftables\n")
+```
+# Purpose
+
+To perform for phosphopeptides:
+
+- imputation of missing values,
+- quantile normalization,
+- ANOVA (using the R stats::`r params$oneWayManyCategories` function), and
+- KSEA (Kinase-Substrate Enrichment Analysis) using code adapted from the CRAN `KSEAapp` package to search for kinase substrates from the following databases:
+  - PhosphoSitesPlus [https://www.phosphosite.org](https://www.phosphosite.org)
+  - The Human Protein Reference Database (HPRD) [http://hprd.org](http://hprd.org)
+  - NetworKIN [http://networkin.science/](http://networkin.science/)
+  - Phosida [http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx](http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx)
+
+```{r include = FALSE}
+
+### GLOBAL VARIABLES
+
+# parameters for KSEA
+
+ksea_cutoff_statistic <- params$kseaCutoffStatistic
+ksea_cutoff_threshold <- params$kseaCutoffThreshold
+ksea_min_kinase_count <- params$kseaMinKinaseCount
+
+ksea_heatmap_titles <- list()
+ksea_heatmap_titles[[const_ksea_astrsk_kinases]] <-
+  sprintf(
+    "Summary for all kinases enriched in one or more contrasts at %s < %s",
+    ksea_cutoff_statistic,
+    ksea_cutoff_threshold
+    )
+ksea_heatmap_titles[[const_ksea_all_kinases]] <-
+  "Summary figure for all contrasts and all kinases"
+ksea_heatmap_titles[[const_ksea_nonastrsk_kinases]] <-
+  sprintf(
+    "Summary for all kinases not enriched at %s < %s in any contrast",
+    ksea_cutoff_statistic,
+    ksea_cutoff_threshold
+    )
+# hash to hold names of significantly enriched kinases
+ksea_asterisk_hash <- new_env()
+
+# READ PARAMETERS (mostly)
+
+intensity_hm_rows <- params$intensityHeatmapRows
+# Input Filename
+input_file <- params$inputFile
+
+# First data column - ideally, this could be detected via regexSampleNames,
+#   but for now leave it as is.
+first_data_column <- params$firstDataColumn
+fdc_is_integer <- is.integer(first_data_column)
+if (fdc_is_integer) {
+  first_data_column <- as.integer(params$firstDataColumn)
+}
+
+# False discovery rate adjustment for ANOVA
+#  Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05
+val_fdr <-
+  read.table(file = params$alphaFile, sep = "\t", header = FALSE, quote = "")
+
+# Accept only a single, fully numeric column of values within [0, 1];
+#   missing values are tested before the range so that they produce the
+#   message below rather than an NA condition in `if`.
+if (
+  ncol(val_fdr) != 1 ||
+  !is.numeric(val_fdr[, 1]) ||
+  anyNA(val_fdr[, 1]) ||
+  any(val_fdr[, 1] < 0 | val_fdr[, 1] > 1)
+) {
+  stop("alphaFile should be one column of numbers within the range [0.0,1.0]")
+}
+val_fdr <- val_fdr[, 1]
+
+#Imputed Data filename
+imputed_data_filename <- params$imputedDataFilename
+imp_qn_lt_data_filenm <- params$imputedQNLTDataFile
+anova_ksea_mtdt_file  <- params$anovaKseaMetadata
+
+```
+
+```{r echo = FALSE}
+# Imputation method, should be one of
+#   "random", "group-median", "median", or "mean"
+imputation_method <- params$imputationMethod
+
+# Selection of percentile of logvalue data to set the mean for random number
+#   generation when using random imputation
+mean_percentile <- params$meanPercentile / 100.0
+
+# deviation adjustment-factor for random values; real number.
+sd_percentile <- params$sdPercentile
+
+# Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$"
+regex_sample_names <- params$regexSampleNames
+
+# Regular expression to extract Sample Grouping from Sample Name;
+#   if error occurs, compare sample_treatment_levels vs. sample_name_matches
+#   to see if groupings/pairs line up
+#   e.g., "(\\d+)"
+regex_sample_grouping <- params$regexSampleGrouping
+
+one_way_all_categories_fname <- params$oneWayManyCategories
+one_way_all_categories <- try_catch_w_e(
+  match.fun(one_way_all_categories_fname))
+if (!is.function(one_way_all_categories$value)) {
+  write("fatal error for parameter oneWayManyCategories:", stderr())
+  write(one_way_all_categories$value$message,             stderr())
+  if (sys.nframe() > 0) quit(save = "no", status = 1)
+  stop("Cannot continue. Goodbye.")
+}
+one_way_all_categories <- one_way_all_categories$value
+
+# Resolve the function named by parameter oneWayTwoCategories; when that
+#   parameter is not supplied, fall back to parameter oneWayManyCategories.
+one_way_two_categories_fname <- params$oneWayTwoCategories
+if (is.null(one_way_two_categories_fname)) {
+  one_way_two_categories_fname <- params$oneWayManyCategories
+}
+one_way_two_categories <- try_catch_w_e(
+  match.fun(one_way_two_categories_fname))
+if (!is.function(one_way_two_categories$value)) {
+  cat("fatal error for parameter oneWayTwoCategories: \n")
+  cat(one_way_two_categories$value$message, fill = TRUE)
+  if (sys.nframe() > 0) quit(save = "no", status = 1)
+  stop("Cannot continue. Goodbye.")
+}
+one_way_two_categories <- one_way_two_categories$value
+
+# Copy the initial database (preprocDb) to the output path (kseaAppPrepDb),
+#   replacing any existing file; stop when the copy fails.
+preproc_db       <- params$preprocDb
+ksea_app_prep_db <- params$kseaAppPrepDb
+result <- file.copy(
+  from      = preproc_db,
+  to        = ksea_app_prep_db,
+  overwrite = TRUE
+  )
+if (!result) {
+  # no trailing comma after the last argument: sprintf rejects an empty
+  #   argument, which would hide this message behind a second error
+  write(
+    sprintf(
+      "fatal error copying initial database '%s' to output '%s'",
+      preproc_db,
+      ksea_app_prep_db
+    ),
+    stderr()
+  )
+  if (sys.nframe() > 0) quit(save = "no", status = 1)
+  stop("Cannot continue. Goodbye.")
+}
+```
+
+```{r echo = FALSE}
+### READ DATA
+
+# read.table reads a file in table format and creates a data frame from it.
+#   - note that `quote = ""` means that quotation marks are treated literally.
+full_data <- read.table(
+  file = input_file,
+  sep = "\t",
+  header = TRUE,
+  quote = "",
+  check.names = FALSE
+  )
+```
+
+# Extract Sample Names and Treatment Levels
+
+Column names parsed from input file are shown in Table 1; sample names and treatment levels, in Table 2.
+
+```{r echo = FALSE, results = 'asis'}
+
+if (fdc_is_integer) {
+  # firstDataColumn is a column position: the data columns run from that
+  #   position through the last column of the input
+  data_column_indices <- seq.int(first_data_column, ncol(full_data))
+} else {
+  # firstDataColumn is a regular expression matched against column names
+  data_column_indices <-
+    grep(first_data_column, names(full_data), perl = TRUE)
+  if (length(data_column_indices) > 0) {
+    first_data_column <- data_column_indices[1]
+  } else {
+    stop(paste("failed to convert firstDataColumn:", first_data_column))
+  }
+}
+
+cat(
+  sprintf(
+    paste(
+      "\n\nThe input data file has peptide-intensity data for each sample",
+      "in one of columns %d through %d.\n\n"
+      ),
+    min(data_column_indices),
+    max(data_column_indices)
+    )
+  )
+
+# Write column names as a LaTeX enumerated list.
+column_name_df <- data.frame(
+  column = seq_len(length(colnames(full_data))),
+  name = paste0("\\verb@", colnames(full_data), "@")
+  )
+data_frame_latex(
+  x = column_name_df,
+  justification = "l l",
+  centered = TRUE,
+  caption = "Input data column names",
+  anchor = const_table_anchor_bp,
+  underscore_whack = FALSE
+  )
+
+```
+
+```{r echo = FALSE, results = 'asis'}
+quant_data <- full_data[first_data_column:length(full_data)]
+quant_data[quant_data == 0] <- NA
+rownames(quant_data) <- rownames(full_data) <- full_data$Phosphopeptide
+# Extract factors and trt-replicates using regular expressions.
+# Typically:
+#   regex_sample_names    is "\\.\\d+[A-Z]$"
+#   regex_sample_grouping is "\\d+"
+# This would distinguish trt-replicates by terminal letter [A-Z]
+#   in sample names and group them into trts by the preceding digits.
+#   e.g.:
+#     group .1A .1B .1C into group 1;
+#     group .2A .2B .2C, into group 2;
+#     etc.
+m <- regexpr(regex_sample_names, colnames(quant_data), perl = TRUE)
+# regmatches silently drops names that do not match, which would shift the
+#   remaining sample names onto the wrong columns; fail loudly instead
+if (any(m == -1)) {
+  stop(
+    sprintf(
+      "regexSampleNames '%s' does not match data column(s): %s",
+      regex_sample_names,
+      paste(colnames(quant_data)[m == -1], collapse = ", ")
+    )
+  )
+}
+sample_name_matches <- regmatches(names(quant_data), m)
+colnames(quant_data) <- sample_name_matches
+
+write_debug_file(quant_data)
+
+rx_match <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE)
+sample_treatment_levels <- as.factor(regmatches(sample_name_matches, rx_match))
+number_of_samples <- length(sample_name_matches)
+sample_treatment_df <- data.frame(
+  level = sample_treatment_levels,
+  sample = sample_name_matches
+  )
+data_frame_latex(
+  x = sample_treatment_df,
+  justification = "rp{0.2\\linewidth} lp{0.3\\linewidth}",
+  centered = TRUE,
+  caption = "Treatment levels",
+  anchor = const_table_anchor_tbp,
+  underscore_whack = FALSE
+  )
+```
+
+```{r echo = FALSE, results = 'asis'}
+cat("\\newpage\n")
+```
+
+## Are the log-transformed sample distributions similar?
+
+```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'}
+
+quant_data[quant_data == 0] <- NA  #replace 0 with NA
+quant_data_log <- log10(quant_data)
+
+rownames(quant_data_log) <- rownames(quant_data)
+colnames(quant_data_log) <- sample_name_matches
+
+write_debug_file(quant_data_log)
+
+# data visualization
+old_par <- par(
+  mai = par("mai") + c(0.5, 0, 0, 0)
+)
+# ref: https://r-charts.com/distribution/add-points-boxplot/
+# Vertical plot
+boxplot(
+  quant_data_log
+, las = 1
+, col = const_boxplot_fill
+, ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
+, xlab = "Sample"
+)
+par(old_par)
+
+
+
+cat("\n\n\n")
+cat("\n\n\n")
+
+```
+
+```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'}
+if (nrow(quant_data_log) > 1) {
+  quant_data_log_stack <- stack(quant_data_log)
+  ggplot2::ggplot(quant_data_log_stack, ggplot2::aes(x = values)) +
+    ggplot2::xlab(latex2exp::TeX("$log_{10}$(peptide intensity)")) +
+    ggplot2::ylab("Probability density") +
+    ggplot2::geom_density(ggplot2::aes(group = ind, colour = ind), na.rm = TRUE)
+} else {
+  cat("No density plot because there are too few peptides.\n\n")
+}
+```
+
+## Globally, are peptide intensities approximately unimodal?
+
+<!--
+# bquote could be used as an alternative to latex2exp::TeX below particularly
+#   and when plotting math expressions generally, at the expense of mastering
+#   another syntax, which hardly seems worthwhile when I need to use TeX
+#   elsewhere; here's an introduction to bquote:
+#   https://www.r-bloggers.com/2018/03/math-notation-for-r-plot-titles-expression-and-bquote/
+-->
+```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'}
+
+# identify the location of missing values
+fin <- is.finite(as.numeric(as.matrix(quant_data_log)))
+
+logvalues <- as.numeric(as.matrix(quant_data_log))[fin]
+logvalues_density <- density(logvalues)
+plot(
+  x = logvalues_density,
+  main = latex2exp::TeX(
+    "Smoothed estimated probability density vs. $log_{10}$(peptide intensity)"
+    ),
+  xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"),
+  ylab = "Probability density"
+  )
+hist(
+  x = as.numeric(as.matrix(quant_data_log)),
+  xlim = c(min(logvalues_density$x), max(logvalues_density$x)),
+  breaks = 100,
+  main = latex2exp::TeX("Frequency vs. $log_{10}$(peptide intensity)"),
+  xlab = latex2exp::TeX("$log_{10}$(peptide intensity)")
+)
+```
+
+## Distribution of standard deviations of $log_{10}(\text{intensity})$, ignoring missing values
+
+```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'}
+# determine quantile
+q1 <- quantile(logvalues, probs = mean_percentile)[1]
+
+# 1 = row of matrix (ie, phosphopeptide)
+sds <- apply(quant_data_log, 1, sd_finite)
+# Plot the density of the per-peptide standard deviations only when more
+#   than two of them are non-missing.
+if (sum(!is.na(sds)) > 2) {
+  plot(
+    density(sds, na.rm = TRUE)
+  , main = "Smoothed estimated probability density vs. std. deviation"
+  , sub = "(probability estimation made with Gaussian smoothing)"
+  , ylab = "Probability density"
+  )
+} else {
+  # the message states the threshold that the guard above enforces
+  cat(
+    "At least three non-missing values are required to plot",
+    "probability density.\n\n"
+    )
+}
+
+```
+
+```{r echo = FALSE}
+# Determine number of cells to impute
+temp <- quant_data[is.na(quant_data)]
+
+# Determine number of values to impute
+number_to_impute <- length(temp)
+
+# Determine percent of missing values
+pct_missing_values <-
+  round(length(temp) / (length(logvalues) + length(temp)) * 100)
+```
+
+```{r echo = FALSE}
+
+# prep for trt-median based imputation
+
+```
+# Impute Missing Values
+
+```{r echo = FALSE}
+
+imp_smry_pot_peptides_before <- nrow(quant_data_log)
+imp_smry_missing_values_before   <- number_to_impute
+imp_smry_pct_missing      <- pct_missing_values
+
+```
+
+```{r echo = FALSE}
+#Determine number of cells to impute
+
+```
+```{r echo = FALSE}
+
+# Identify which values are missing and need to be imputed
+ind <- which(is.na(quant_data), arr.ind = TRUE)
+
+```
+```{r echo = FALSE, results = 'asis'}
+
+# Apply imputation
+# Each method sets:
+# - imputation_method_description: text reported in the output document
+# - quant_data_imp: intensities after imputation
+# - quant_data_imp_ln: ln(x + 1) of the intensities after imputation
+# - good_rows: TRUE for each peptide with no value missing after imputation
+#     (is.na is used rather than is.nan because rowMeans yields NA, not NaN,
+#     for a row that still contains NA)
+switch(
+  imputation_method
+, "group-median" = {
+    quant_data_imp <- quant_data
+    imputation_method_description <-
+      paste("Substitute missing value with",
+        "median peptide-intensity for sample group.\n"
+        )
+    sample_level_integers <- as.integer(sample_treatment_levels)
+    # Take the accurate ln(x+1) because the data are log-normally distributed
+    #   and because median can involve an average of two measurements.
+    quant_data_imp <- log1p(quant_data_imp)
+    for (i in seq_len(length(levels(sample_treatment_levels)))) {
+      # Determine the columns for this factor-level
+      level_cols <- i == sample_level_integers
+      # Extract those columns
+      lvlsbst <- quant_data_imp[, level_cols, drop = FALSE]
+      # assign to ind the row-column pairs corresponding to each NA
+      ind <- which(is.na(lvlsbst), arr.ind = TRUE)
+      # No group-median can be computed for a group of only one sample;
+      #   otherwise, proceed.
+      if (ncol(lvlsbst) > 1) {
+        the_centers <-
+          apply(lvlsbst, 1, median, na.rm = TRUE)
+        for (j in seq_len(nrow(lvlsbst))) {
+          for (k in seq_len(ncol(lvlsbst))) {
+            if (is.na(lvlsbst[j, k])) {
+              lvlsbst[j, k] <- the_centers[j]
+            }
+          }
+        }
+        quant_data_imp[, level_cols] <- lvlsbst
+      }
+    }
+    # Take the accurate e^x - 1 to match scaling of original input.
+    quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp))
+    good_rows <- !is.na(rowMeans(quant_data_imp))
+  }
+, "median" = {
+    quant_data_imp <- quant_data
+    imputation_method_description <-
+      paste("Substitute missing value with",
+        "median peptide-intensity across all sample classes.\n"
+        )
+    # Take the accurate ln(x+1) because the data are log-normally distributed
+    #   and because median can involve an average of two measurements.
+    quant_data_imp <- log1p(quant_data_imp)
+    quant_data_imp[ind] <- apply(quant_data_imp, 1, median, na.rm = TRUE)[ind[, 1]]
+    # Take the accurate e^x - 1 to match scaling of original input.
+    quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp))
+    good_rows <- !is.na(rowMeans(quant_data_imp))
+  }
+, "mean" = {
+    quant_data_imp <- quant_data
+    imputation_method_description <-
+      paste("Substitute missing value with",
+        "geometric-mean peptide intensity across all sample classes.\n"
+        )
+    # Take the accurate ln(x+1) because the data are log-normally distributed,
+    #   so arguments to mean should be previously transformed.
+    quant_data_imp <- log1p(quant_data_imp)
+    # Assign to NA cells the mean for the row
+    quant_data_imp[ind] <- apply(quant_data_imp, 1, mean, na.rm = TRUE)[ind[, 1]]
+    # Take the accurate e^x - 1 to match scaling of original input.
+    quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp))
+    good_rows <- !is.na(rowMeans(quant_data_imp))
+  }
+, "random" = {
+    quant_data_imp <- quant_data
+    m1 <- median(sds, na.rm = TRUE) * sd_percentile #sd to be used is the median sd
+    # If you want results to be reproducible, you will want to call
+    #   base::set.seed before calling stats::rnorm
+    imputation_method_description <-
+      paste("Substitute each missing value with random intensity",
+        sprintf(
+          "$N \\sim (%0.2f, %0.2f)$.\n",
+          q1, m1
+          )
+        )
+    cat(sprintf("mean_percentile (from input parameter) is %2.0f\n\n",
+      100 * mean_percentile))
+    cat(sprintf("sd_percentile (from input parameter) is %0.2f\n\n",
+      sd_percentile))
+    quant_data_imp[ind] <-
+      10 ^ rnorm(number_to_impute, mean = q1, sd = m1)
+    quant_data_imp_ln <- log1p(quant_data_imp)
+    good_rows <- !is.na(rowMeans(quant_data_imp))
+  }
+, stop(
+    sprintf(
+      "imputationMethod '%s' is not one of 'group-median', 'median', 'mean', or 'random'",
+      imputation_method
+    )
+  )
+)
+quant_data_imp_log10 <- quant_data_imp_ln * const_log10_e
+
+# good_rows has one element per peptide, so count the TRUE elements rather
+#   than taking its length
+if (sum(good_rows) < 1) {
+  stop("Cannot impute data; there are no good rows!")
+}
+```
+
+```{r echo = FALSE, results = 'asis'}
+cat("\\quad\n\nImputation method:\n\n\n", imputation_method_description)
+```
+
+```{r echo = FALSE}
+
+imp_smry_pot_peptides_after <- sum(good_rows)
+imp_smry_rejected_after  <- sum(!good_rows)
+imp_smry_missing_values_after   <- sum(is.na(quant_data_imp[good_rows, ]))
+```
+```{r echo = FALSE, results = 'asis'}
+# ref: http://www1.maths.leeds.ac.uk/latex/TableHelp1.pdf
+tabular_lines_fmt <- paste(
+  "\\begin{table}[hb]", # h(inline); b(bottom); t (top) or p (separate page)
+  " \\caption{Imputation Results}",
+  " \\centering", # \centering centers the table on the page
+  " \\begin{tabular}{l c c c}",
+  "  \\hline\\hline",
+  "  \\  & potential peptides   & missing values & rejected",
+  "    peptides \\\\ [0.5ex]",
+  "  \\hline",
+  "  before imputation & %d     & %d (%d\\%s)    &    \\\\",
+  "  after imputation  & %d     & %d             & %d \\\\ [1ex]",
+  "  \\hline",
+  " \\end{tabular}",
+  #" \\label{table:nonlin}", # may be used to refer this table in the text
+  "\\end{table}",
+  sep = "\n"
+  )
+tabular_lines <-
+  sprintf(
+    tabular_lines_fmt,
+    imp_smry_pot_peptides_before,
+    imp_smry_missing_values_before,
+    imp_smry_pct_missing,
+    "%",
+    imp_smry_pot_peptides_after,
+    imp_smry_missing_values_after,
+    imp_smry_rejected_after
+    )
+cat(tabular_lines)
+```
+```{r echo = FALSE}
+
+
+# Zap rows where imputation was ineffective
+full_data         <- full_data        [good_rows, ]
+quant_data        <- quant_data       [good_rows, ]
+
+quant_data_imp <- quant_data_imp[good_rows, ]
+write_debug_file(quant_data_imp)
+quant_data_imp_good_rows <- quant_data_imp
+
+write_debug_file(quant_data_imp_good_rows)
+```
+
+```{r echo = FALSE, results = 'asis'}
+
+can_plot_before_after_imp <- TRUE
+d_combined <-
+  as.numeric(
+    as.matrix(
+      log10(quant_data_imp)
+      )
+    )
+d_original <-
+  as.numeric(
+    as.matrix(
+      log10(quant_data_imp[!is.na(quant_data)])
+      )
+    )
+
+if (sum(!is.na(d_original)) > 2) {
+  d_original <- density(d_original)
+} else {
+  can_plot_before_after_imp <- FALSE
+}
+if (can_plot_before_after_imp && sum(is.na(d_combined)) < 1) {
+  d_combined <- density(d_combined)
+} else {
+  can_plot_before_after_imp <- FALSE
+}
+
+if (sum(is.na(quant_data)) > 0) {
+  # There ARE missing values
+  d_imputed <-
+    as.numeric(
+      as.matrix(
+        log10(quant_data_imp[is.na(quant_data)])
+        )
+      )
+  if (can_plot_before_after_imp && sum(is.na(d_imputed)) < 1) {
+    d_imputed <- (density(d_imputed))
+  } else {
+    can_plot_before_after_imp <- FALSE
+  }
+} else {
+  # There are NO missing values
+  d_imputed <- d_combined
+}
+
+```
+
+```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'}
+# Collect the names of rows whose standard deviation (ignoring NA) is NA.
+# NOTE(review): a row whose standard deviation is exactly zero is NOT
+#   selected here, although the variable name and the messages below speak
+#   of zero variance - confirm which is intended.
+zero_sd_rownames <-
+  rownames(quant_data_imp)[
+    is.na(apply(quant_data_imp, 1, sd, na.rm = TRUE))
+  ]
+
+if (nrow(quant_data_imp) <= length(zero_sd_rownames)) {
+  stop("All peptides have zero standard deviation.  Cannot continue.")
+}
+if (length(zero_sd_rownames) > 0) {
+  cat(
+    sprintf(
+      "%d peptides with zero variance were removed from statistical consideration",
+      length(zero_sd_rownames)
+    )
+  )
+  # drop from df the rows whose names are listed in nms
+  zap_named_rows <- function(df, nms) {
+    df[!(row.names(df) %in% nms), ]
+  }
+  quant_data_imp <- zap_named_rows(quant_data_imp, zero_sd_rownames)
+  quant_data     <- zap_named_rows(quant_data,     zero_sd_rownames)
+  full_data      <- zap_named_rows(full_data,      zero_sd_rownames)
+}
+
+if (sum(is.na(quant_data)) > 0) {
+  cat("\\leavevmode\\newpage\n")
+  # data visualization
+  old_par <- par(
+    mai = par("mai") + c(0.5, 0, 0, 0)
+  )
+  # Copy quant data to x
+  x <- quant_data
+  # x gets to have values of:
+  #  - NA for observed values
+  #  - 1 for missing values
+  x[is.na(x)] <- 0
+  x[x > 1] <- NA
+  x[x == 0] <- 1
+  # Log-transform imputed data
+  # update variable because rows may have been eliminated from quant_data_imp
+  quant_data_imp_log10 <- log10(quant_data_imp)
+
+  write_debug_file(quant_data_imp_log10)
+
+  # Set blue_dots to log of quant data or NA for NA quant data
+  blue_dots <- log10(quant_data)
+  # Set red_dots to log of imputed data or NA for observed quant data
+  red_dots <- quant_data_imp_log10 * x
+
+  count_red <- sum(!is.na(red_dots))
+  count_blue <- sum(!is.na(blue_dots))
+  ylim_save <- ylim <- c(
+    min(red_dots, blue_dots, na.rm = TRUE),
+    max(red_dots, blue_dots, na.rm = TRUE)
+    )
+  show_stripchart <-
+    50 > (count_red + count_blue) / length(sample_name_matches)
+  if (show_stripchart) {
+    boxplot_sub <- "Light blue = data before imputation; Red = imputed data"
+  } else {
+    boxplot_sub <- ""
+  }
+
+  # Vertical plot
+  colnames(blue_dots) <- sample_name_matches
+  boxplot(
+      blue_dots
+    , las = 1 # "always horizontal"
+    , col = const_boxplot_fill
+    , ylim = ylim
+    , main = "Peptide intensities after eliminating unusable peptides"
+    , sub = boxplot_sub
+    , xlab = "Sample"
+    , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
+    )
+
+  if (show_stripchart) {
+    # Points
+    # ref: https://r-charts.com/distribution/add-points-boxplot/
+    # NA values are not plotted
+    stripchart(
+      blue_dots,                 # Data
+      method = "jitter",          # Random noise
+      jitter = const_stripchart_jitter,
+      pch = 19,                   # Pch symbols
+      cex = const_stripsmall_cex, # Size of symbols reduced
+      col = "lightblue",          # Color of the symbol
+      vertical = TRUE,            # Vertical mode
+      add = TRUE                  # Add it over
+      )
+    stripchart(
+      red_dots,                   # Data
+      method = "jitter",          # Random noise
+      jitter = const_stripchart_jitter,
+      pch = 19,                   # Pch symbols
+      cex = const_stripsmall_cex, # Size of symbols reduced
+      col = "red",                # Color of the symbol
+      vertical = TRUE,            # Vertical mode
+      add = TRUE                  # Add it over
+      )
+
+  }
+  if (TRUE) {
+    # show measured values in blue on left half-violin plot
+    cat("\\leavevmode\n\\quad\n\n\\quad\n\n")
+    vioplot::vioplot(
+      x = lapply(blue_dots, function(x) x[!is.na(x)]),
+      col = "lightblue1",
+      side = "left",
+      plotCentre = "line",
+      ylim = ylim_save,
+      main = "Distributions of observed and imputed data",
+      sub = "Light blue = observed data; Pink = imputed data",
+      xlab = "Sample",
+      ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
+      )
+    red_violins <- lapply(red_dots, function(x) x[!is.na(x)])
+    cols_to_delete <- c()
+    for (ix in seq_len(length(red_violins))) {
+      if (length(red_violins[[ix]]) < 1) {
+        cols_to_delete <- c(cols_to_delete, ix)
+      }
+    }
+    # destroy any unimputable columns
+    if (!is.null(cols_to_delete)) {
+      red_violins <- red_violins[-cols_to_delete]
+    }
+    # plot imputed values in red on right half-violin plot
+    vioplot::vioplot(
+      x = red_violins,
+      col = "lightpink1",
+      side = "right",
+      plotCentre = "line",
+      add = TRUE
+      )
+  }
+
+  par(old_par)
+
+  # density plot
+  cat("\\leavevmode\n\n\n\n\n\n\n")
+  if (can_plot_before_after_imp) {
+    ylim <- c(
+      0,
+      max(
+        if (is.list(d_combined)) d_combined$y else d_combined,
+        if (is.list(d_original)) d_original$y else d_original,
+        if (is.list(d_imputed)) d_imputed$y else d_imputed,
+        na.rm = TRUE
+      )
+    )
+    plot(
+      d_combined,
+      ylim = ylim,
+      sub =
+        paste(
+          "Blue = data before imputation; Red = imputed data;",
+          "Black = combined"
+          ),
+      main = "Density of peptide intensity before and after imputation",
+      xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"),
+      ylab = "Probability density"
+    )
+    lines(d_original, col = "blue")
+    lines(d_imputed, col = "red")
+  } else {
+    cat(
+      "There are too few points to plot the density of peptide intensity",
+      "before and after imputation."
+      )
+  }
+  cat("\\leavevmode\\newpage\n")
+}
+```
+
+# Perform Quantile Normalization
+
+The excellent `normalize.quantiles` function from
+*[the `preprocessCore` Bioconductor package](http://bioconductor.org/packages/release/bioc/html/preprocessCore.html)*
+performs "quantile normalization" as described by Bolstad *et al.* (2003),
+DOI *[10.1093/bioinformatics/19.2.185](https://doi.org/10.1093%2Fbioinformatics%2F19.2.185)*
+and *its supplementary material [http://bmbolstad.com/misc/normalize/normalize.html](http://bmbolstad.com/misc/normalize/normalize.html)*,
+i.e., it assumes that the goal is to detect
+subtle differences among grossly similar samples (having similar distributions)
+by equalizing intra-quantile quantitations.
+Unfortunately, one software library upon which it depends
+*[suffers from a concurrency defect](https://support.bioconductor.org/p/122925/#9135989)*
+that requires that a specific, non-concurrent version of the library be
+installed.  The installation command equivalent to what was used to install the library to produce the results presented here is:
+```
+    conda install bioconductor-preprocesscore openblas=0.3.3
+```
+
+
+<!--
+# Apply quantile normalization using preprocessCore::normalize.quantiles
+# ---
+# tool repository: http://bioconductor.org/packages/release/bioc/html/preprocessCore.html
+#   except this: https://support.bioconductor.org/p/122925/#9135989
+#   says to install it like this:
+#     ```
+#     BiocManager::install("preprocessCore", configure.args="--disable-threading", force = TRUE, lib=.libPaths()[1])
+#     ```
+# conda installation (necessary because of a bug in recent openblas):
+#   conda install bioconductor-preprocesscore openblas=0.3.3
+# ...
+# ---
+# normalize.quantiles {preprocessCore} --  Quantile Normalization
+#
+# Description:
+#   Using a normalization based upon quantiles, this function normalizes a
+#     matrix of probe level intensities.
+#
+#   THIS FUNCTIONS WILL HANDLE MISSING DATA (ie NA values), based on the
+#     assumption that the data is missing at random.
+#
+# Usage:
+#   normalize.quantiles(x, copy = TRUE, keep.names = FALSE)
+#
+# Arguments:
+#
+#   - x: A matrix of intensities where each column corresponds to a chip and each row is a probe.
+#
+#   - copy: Make a copy of matrix before normalizing. Usually safer to work with a copy,
+#       but in certain situations not making a copy of the matrix, but instead normalizing
+#       it in place will be more memory friendly.
+#
+#   - keep.names: Boolean option to preserve matrix row and column names in output.
+#
+# Details:
+#   This method is based upon the concept of a quantile-quantile plot extended to n dimensions.
+#     No special allowances are made for outliers. If you make use of quantile normalization
+#     please cite Bolstad et al, Bioinformatics (2003).
+#
+#   This functions will handle missing data (ie NA values), based on
+#     the assumption that the data is missing at random.
+#
+#   Note that the current implementation optimizes for better memory usage
+#     at the cost of some additional run-time.
+#
+# Value: A normalized matrix.
+#
+# Author: Ben Bolstad, bmbolstad.com
+#
+# References
+#
+#   - Bolstad, B (2001) Probe Level Quantile Normalization of High Density Oligonucleotide
+#       Array Data. Unpublished manuscript http://bmbolstad.com/stuff/qnorm.pdf
+#
+#   - Bolstad, B. M., Irizarry R. A., Astrand, M, and Speed, T. P. (2003) A Comparison of
+#       Normalization Methods for High Density Oligonucleotide Array Data Based on Bias
+#       and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185
+#       http://bmbolstad.com/misc/normalize/normalize.html
+# ...
+-->
+```{r echo = FALSE, results = 'asis'}
+
+# Quantile-normalize the imputed intensities across samples (columns);
+#   an empty table is passed through as a matrix without normalization.
+quant_data_imp_qn <- as.data.frame(
+  if (nrow(quant_data_imp) > 0) {
+    preprocessCore::normalize.quantiles(
+      as.matrix(quant_data_imp),
+      keep.names = TRUE
+    )
+  } else {
+    as.matrix(quant_data_imp)
+  }
+)
+write_debug_file(quant_data_imp_qn)
+
+# log10-transform the normalized intensities
+quant_data_imp_qn_log <- log10(quant_data_imp_qn)
+write_debug_file(quant_data_imp_qn_log)
+
+# Center and scale each row (peptide) of the log-transformed intensities
+quant_data_imp_qn_ls <- t(scale(t(quant_data_imp_qn_log)))
+
+# Keep only the rows for which any_nan() returns TRUE
+sel <- apply(quant_data_imp_qn_ls, 1, any_nan)
+quant_data_imp_qn_ls2 <- as.data.frame(quant_data_imp_qn_ls[which(sel), ])
+quant_data_imp_qn_ls <- as.data.frame(quant_data_imp_qn_ls)
+
+write_debug_file(quant_data_imp_qn_ls)
+write_debug_file(quant_data_imp_qn_ls2)
+
+# Assemble the data.frame consumed by the ANOVA analysis
+data_table_imp_qn_lt <- cbind(full_data[1:9], quant_data_imp_qn_log)
+```
+
+<!-- ACE insertion begin -->
+## Are normalized, imputed, log-transformed sample distributions similar?
+
+```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'}
+
+# Keep the unimputed log-intensities; they are plotted in a later section
+unimputed_quant_data_log <- quant_data_log
+
+# Replace exact zeros (which should never occur) with a tiny positive
+#   value so that the log10 transform below cannot produce -Inf
+quant_data_imp_qn[quant_data_imp_qn == 0] <- .000000001
+quant_data_log <- log10(quant_data_imp_qn)
+
+how_many_peptides <- nrow(quant_data_log)
+
+# Report the peptide count and, when there are peptides, draw one box
+#   per sample
+if (how_many_peptides <= 0) {
+  cat("There are no peptides to plot\n")
+} else {
+  cat(
+    sprintf("Intensities for %d peptides:\n\n\n", how_many_peptides)
+  )
+  cat("\n\n\n")
+
+  # Enlarge the bottom margins; the previous graphical parameters
+  #   are restored after plotting
+  old_par <- par(
+    mai = par("mai") + c(0.5, 0, 0, 0),
+    oma = par("oma") + c(0.5, 0, 0, 0)
+  )
+
+  # ref: https://r-charts.com/distribution/add-points-boxplot/
+  colnames(quant_data_log) <- sample_name_matches
+  boxplot(
+    quant_data_log,
+    las = 1,
+    col = const_boxplot_fill,
+    ylab = latex2exp::TeX("$log_{10}$(peptide intensity)"),
+    xlab = "Sample"
+  )
+  par(old_par)
+}
+
+# Emit blank lines after the figure
+cat("\n\n\n")
+
+```
+
+```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'}
+if (nrow(quant_data_log) < 2) {
+  cat("No density plot because there are fewer than two peptides to plot.\n\n")
+} else {
+  quant_data_log_stack <- stack(quant_data_log)
+  ggplot2::ggplot(quant_data_log_stack, ggplot2::aes(x = values)) +
+    ggplot2::geom_density(ggplot2::aes(group = ind, colour = ind), na.rm = TRUE) +
+    ggplot2::xlab(latex2exp::TeX("$log_{10}$(peptide intensity)")) +
+    ggplot2::ylab("Probability density")
+}
+```
+```{r echo = FALSE, results = 'asis'}
+cat("\\leavevmode\\newpage\n") # raw LaTeX page break (results = 'asis')
+```
+
+# ANOVA Analysis
+
+```{r, echo = FALSE}
+# connect_df pairs each phosphopeptide with the value in its first data
+#   column, to connect pre-ANOVA data to ANOVA results
+connect_df <- setNames(
+  data.frame(data_table_imp_qn_lt$Phosphopeptide,
+             data_table_imp_qn_lt[, first_data_column]),
+  c("Phosphopeptide", "Intensity")
+)
+```
+
+```{r echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
+count_of_treatment_levels <- length(levels(sample_treatment_levels))
+if (count_of_treatment_levels < 2) {
+  # Escape a string for LaTeX output: "$" becomes "\$" and each backslash
+  #   becomes "$\backslash$" (a placeholder shields these from re-escaping)
+  nuke_control_sequences <- function(s) {
+    held <- gsub("[\\]", "xyzzy_plugh", s)
+    escaped <- gsub("[$]", "\\\\$", held)
+    gsub("xyzzy_plugh", "$\\\\backslash$", escaped)
+  }
+  cat(
+    "ERROR!!!! Cannot perform ANOVA analysis",
+    "(see next page)\\newpage\n"
+  )
+  cat(
+    "ERROR: ANOVA analysis",
+    "requires two or more factor levels!\n\n\n"
+  )
+  # The rest of this branch prints the sample names and the parsing rules
+  cat("\n\n\n\n\n")
+  cat("Unparsed sample names are:\n\n\n",
+    "\\begin{quote}\n",
+    paste(names(quant_data_imp_qn_log), collapse = "\n\n\n"),
+    "\n\\end{quote}\n\n")
+  # Escape "$" and backslashes in the regex before printing it in \text{}
+  regex_sample_names <- nuke_control_sequences(regex_sample_names)
+
+  cat("\\leavevmode\n\n\n")
+  cat("Parsing rule for SampleNames is",
+    "\n\n\n",
+    "\\text{'",
+    regex_sample_names,
+    "'}\n\n\n",
+    sep = ""
+    )
+  # Show the sample names as parsed (sample_name_matches)
+  cat("\nParsed sample names are:\n",
+    "\\begin{quote}\n",
+    paste(sample_name_matches, collapse = "\n\n\n"),
+    "\n\\end{quote}\n\n")
+  # Escape the grouping regex likewise before printing it
+  regex_sample_grouping <- nuke_control_sequences(regex_sample_grouping)
+
+  cat("\\leavevmode\n\n\n")
+  cat("Parsing rule for SampleGrouping is",
+    "\n\n\n",
+    "\\text{'",
+    regex_sample_grouping,
+    "'}\n\n\n",
+    sep = ""
+    )
+  # Show what rx_match extracts from each parsed sample name
+  cat("\n\n\n")
+  cat("Sample group assignments are:\n",
+    "\\begin{quote}\n",
+    paste(regmatches(sample_name_matches, rx_match), collapse = "\n\n\n"),
+    "\n\\end{quote}\n\n")
+
+} else {
+
+  # Raw ANOVA p-value for each phosphopeptide (row) across all treatment
+  #   levels, followed by the FDR adjustment of those p-values
+  p_value_data_anova_ps <- apply(
+    X = quant_data_imp_qn_log,
+    MARGIN = 1,
+    FUN = anova_func,
+    grouping_factor = sample_treatment_levels,
+    one_way_f = one_way_all_categories
+  )
+  p_value_data_anova_ps_fdr <-
+    p.adjust(p = p_value_data_anova_ps, method = "fdr")
+
+  p_value_data <- data.frame(
+    phosphopeptide = full_data[, 1],
+    raw_anova_p = p_value_data_anova_ps,
+    fdr_adjusted_anova_p = p_value_data_anova_ps_fdr
+  )
+
+  # output ANOVA file to constructed filename,
+  #   e.g.    "Outputfile_pST_ANOVA_STEP5.txt"
+  #   becomes "Outputfile_pST_ANOVA_STEP5_FDR0.05.txt"
+
+  # Re-output both datasets, now with raw and adjusted p-value columns
+  #   appended after the first nine columns of full_data
+  metadata_plus_p <- cbind(full_data[1:9], p_value_data[, 2:3])
+
+  write.table(
+    x = cbind(metadata_plus_p, quant_data_imp),
+    file = imputed_data_filename,
+    quote = FALSE,
+    sep = "\t",
+    row.names = FALSE,
+    col.names = TRUE
+  )
+  write.table(
+    x = cbind(metadata_plus_p, quant_data_imp_qn_log),
+    file = imp_qn_lt_data_filenm,
+    quote = FALSE,
+    sep = "\t",
+    row.names = FALSE,
+    col.names = TRUE
+  )
+
+
+  p_value_data <-
+    p_value_data[order(p_value_data$fdr_adjusted_anova_p), ]
+  # Scan the FDR cutoffs in val_fdr, reporting the peptides passing each
+  first_page_suppress <- 1
+  number_of_peptides_found <- 0
+  cutoff <- val_fdr[1]
+  for (cutoff in val_fdr) {
+    if (number_of_peptides_found > 49) {
+      cat("\\leavevmode\n\n\n")
+      break
+      }
+    # (the check above ends the scan once draw_intensity_heatmap returned > 49)
+    # rows of p_value_data below this cutoff, and their log-intensities
+
+    filtered_p <-
+      p_value_data[
+        which(p_value_data$fdr_adjusted_anova_p < cutoff),
+        , drop = FALSE
+        ]
+    filtered_data_filtered <-
+      quant_data_imp_qn_log[
+        rownames(filtered_p),
+        , drop = FALSE
+        ]
+    filtered_data_filtered <-
+      filtered_data_filtered[
+        order(filtered_p$fdr_adjusted_anova_p),
+        , drop = FALSE
+        ]
+
+    # <!-- ACE insertion start -->
+    # Boxplot of the intensities passing the cutoff, or a "none found" note
+    if (nrow(filtered_p) && nrow(filtered_data_filtered) > 0) {
+      if (first_page_suppress == 1) {
+        first_page_suppress <- 0
+        } else {
+          cat("\\newpage\n")
+        }
+      if (nrow(filtered_data_filtered) > 1) {
+        subsection_header(sprintf(
+          "Intensity distributions for %d phosphopeptides whose adjusted p-value < %0.2f\n",
+          nrow(filtered_data_filtered),
+          cutoff
+        ))
+      } else {
+        subsection_header(sprintf(
+          "Intensity distribution for one phosphopeptide (%s) whose adjusted p-value < %0.2f\n",
+          rownames(filtered_data_filtered)[1],
+          cutoff
+        ))
+      }
+      cat("\n\n\n")
+      cat("\n\n\n")
+
+      old_oma <- par("oma")
+      old_par <- par(
+        mai = (par("mai") + c(0.7, 0, 0, 0)) * c(1, 1, 0.3, 1),
+        oma = old_oma * c(1, 1, 0.3, 1),
+        cex.main = 0.9,
+        cex.axis = 0.7,
+        fin = c(9, 7.25)
+        )
+      # ref: https://r-charts.com/distribution/add-points-boxplot/
+      # Vertical plot; a plotting error is printed rather than raised
+      colnames(filtered_data_filtered) <- sample_name_matches
+      tryCatch(
+        boxplot(
+          filtered_data_filtered,
+          main = "Imputed, normalized intensities", # no line plot
+          las = 1,
+          col = const_boxplot_fill,
+          ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
+        ),
+        error = function(e) print(e)
+      )
+      par(old_par)
+    } else {
+      cat(sprintf(
+        "%s < %0.2f\n\n\n\n\n",
+        "No peptides were found to have cutoff adjusted p-value",
+        cutoff
+      ))
+    }
+
+    if (nrow(filtered_data_filtered) > 0) {
+      # Add Phosphopeptide column to anova_filtered table
+      # The assumption here is that the first intensity is unique;
+      #   this is a hokey assumption but almost definitely will
+      #   be true in the real world unless there is a computation
+      #   error upstream.
+      anova_filtered_merge <- base::merge(
+        x = connect_df,
+        y = filtered_data_filtered,
+        by.x = "Intensity",
+        by.y = 1
+      )
+      anova_filtered_merge_order <- rownames(filtered_p)
+      # NOTE: below, 2:number_of_samples + 1 parses as (2:number_of_samples) + 1
+      anova_filtered <- data.frame(
+        ppep    = anova_filtered_merge$Phosphopeptide,
+        intense = anova_filtered_merge$Intensity,
+        data    = anova_filtered_merge[, 2:number_of_samples + 1]
+      )
+      colnames(anova_filtered) <-
+        c("Phosphopeptide", colnames(filtered_data_filtered))
+
+      # Merge qualitative columns into the ANOVA data
+      output_table <- data.frame(anova_filtered$Phosphopeptide)
+      output_table <- base::merge(
+        x = output_table,
+        y = data_table_imp_qn_lt,
+        by.x = "anova_filtered.Phosphopeptide",
+        by.y = "Phosphopeptide"
+      )
+
+      # Produce heatmap to visualize significance and the effect of imputation
+      # - one sprintf format per row: fixed-point if p > 0.0001, else scientific
+      anova_filtered_merge_format <- sapply(
+        X = filtered_p$fdr_adjusted_anova_p
+        ,
+        FUN = function(x) {
+          if (x > 0.0001)
+            paste0("(%0.", 1 + ceiling(-log10(x)), "f) %s")
+          else
+            paste0("(%0.4e) %s")
+        }
+      )
+      # Heading writer passed to draw_intensity_heatmap; FALSE if m has one row
+      cat_hm_heading <- function(m, cutoff) {
+        cat("\\newpage\n")
+        if (nrow(m) > intensity_hm_rows) {
+          subsection_header(
+            paste(
+              sprintf("Heatmap for the %d most-significant peptides",
+                intensity_hm_rows),
+              sprintf("whose adjusted p-value < %0.2f\n", cutoff)
+            )
+          )
+        } else {
+          if (nrow(m) == 1) {
+            return(FALSE)
+          } else {
+            subsection_header(
+              paste(
+                sprintf("Heatmap for %d usable peptides whose", nrow(m)),
+                sprintf("adjusted p-value < %0.2f\n", cutoff)
+              )
+            )
+          }
+        }
+        cat("\n\n\n")
+        cat("\n\n\n")
+        return(TRUE)
+      }
+
+      # construct matrix whose rownames read "(adjusted p-value) rowname"
+      m <-
+        as.matrix(unimputed_quant_data_log[anova_filtered_merge_order, ])
+      if (nrow(m) > 0) {
+        rownames_m <- rownames(m)
+        rownames(m) <- sapply(
+          X = seq_len(nrow(m))
+          ,
+          FUN = function(i) {
+            sprintf(
+              anova_filtered_merge_format[i],
+              filtered_p$fdr_adjusted_anova_p[i],
+              rownames_m[i]
+            )
+          }
+        )
+      }
+      # draw the heading and heatmap
+      if (nrow(m) > 0) {
+          number_of_peptides_found <-
+            draw_intensity_heatmap(
+              m                       = m,
+              cutoff                  = cutoff,
+              hm_heading_function     = cat_hm_heading,
+              hm_main_title           = "Unimputed, unnormalized log(intensities)",
+              suppress_row_dendrogram = FALSE
+            )
+      }
+    }
+}
+cat("\\leavevmode\n\n\n")
+```
+
+```{r sqlite, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
+
+if (count_of_treatment_levels > 1) {
+  # Prepare two-way contrasts with adjusted p-values
+  # Strategy:
+  # - use imputed, log-transformed data:
+  #   - remember this when computing log2(fold-change)
+  # - each contrast is between a combination of trt levels
+  # - for each contrast, compute samples that are members
+  # - compute one-way test:
+  #   - use `oneway.test` (Welch test) if numbers of samples
+  #     are not equivalent between trt levels
+  #   - otherwise, aov is fine but offers no advantage
+  # - adjust p-value; NOTE(review): the code below adjusts per contrast,
+  #   not over (# of pppeps)*(# of contrasts) tests - confirm intent
+
+  # Each contrast is between a combination of trt levels:
+  # - enumerate every pair of treatment levels, by level index;
+  #   each column of m2 defines one contrast
+  m2 <- combn(
+    x = seq_along(levels(sample_treatment_levels)),
+    m = 2,
+    simplify = TRUE
+  )
+  contrast_count <- ncol(m2)
+
+  # For each contrast, compute samples that are members
+  # - local function to construct a data.frame for one contrast:
+  #   - contrast: the contrast number (cntrst)
+  #   - level:    treatment level of each member sample
+  #   - label:    name of each member sample
+  f_m2 <- function(cntrst, lvl1, lvl2) {
+    # samples whose treatment level is either of the two contrasted levels
+    pair_levels <- levels(sample_treatment_levels)[c(lvl1, lvl2)]
+    is_member <- sample_treatment_levels %in% pair_levels
+    data.frame(
+      contrast = cntrst,
+      level = sample_treatment_levels[is_member],
+      label = sample_name_matches[is_member]
+    )
+  }
+
+  # - compute a df for each contrast
+  sample_level_dfs <- lapply(
+    X = seq_len(contrast_count),
+    FUN = function(i) f_m2(i, m2[1, i], m2[2, i])
+  )
+
+  # - compute a single df for all contrasts
+  combined_contrast_df <- Reduce(f = rbind, x = sample_level_dfs)
+
+  # - dispose objects to free resources
+  rm(sample_level_dfs)
+
+  # - connect to the SQLite DB at ksea_app_prep_db
+  db <- RSQLite::dbConnect(RSQLite::SQLite(), ksea_app_prep_db)
+  # - write the df to the DB for later join-per-contrast
+  RSQLite::dbWriteTable(
+    conn = db,
+    name = "contrast",
+    value = combined_contrast_df,
+    overwrite = TRUE
+  )
+
+  # Create UK (unique key): one row per (contrast, label)
+  ddl_exec(db, "
+    CREATE UNIQUE INDEX IF NOT EXISTS contrast__uk__idx
+      ON contrast(contrast, label);
+    "
+  )
+  # Create indexes for join
+  ddl_exec(db, "
+    -- index for join in contrast_ppep_smpl_qnlt on a.label < b.label
+    CREATE INDEX IF NOT EXISTS contrast__label__idx
+      ON contrast(label);
+    "
+  )
+  ddl_exec(db, "
+    -- index for joining two contrast_lvl_ppep_avg_quant on contrast
+    CREATE INDEX IF NOT EXISTS contrast__contrast__idx
+      ON contrast(contrast);
+    "
+  )
+  ddl_exec(db, "
+    -- index for joining two contrast_lvl_ppep_avg_quant on phophospep
+    CREATE INDEX IF NOT EXISTS contrast__level__idx
+      ON contrast(level);
+    "
+  )
+  # - dispose objects to free resources
+  rm(combined_contrast_df)
+
+  # Use imputed, log-transformed data
+  # - remember that this was done when computing log2(fold-change)
+  # - melt data matrix (one row per phosphopep and sample) for later joins
+  casted <- cbind(
+    data.frame(vrbl = rownames(quant_data_imp_qn_log)),
+    quant_data_imp_qn_log
+  )
+  quant_data_imp_qn_log_melted <- reshape2::melt(
+    casted,
+    id.vars = "vrbl"
+  )
+  colnames(quant_data_imp_qn_log_melted) <-
+    c("phosphopep", "sample", "quant")
+  # - dispose objects to free resources
+  rm(casted)
+
+  # - write the df to a DB for use in later join-per-contrast
+  RSQLite::dbWriteTable(
+    conn = db,
+    name = "ppep_smpl_qnlt",
+    value = quant_data_imp_qn_log_melted,
+    overwrite = TRUE
+  )
+  # Create UK (unique key): one row per (phosphopep, sample)
+  ddl_exec(db, "
+    CREATE UNIQUE INDEX IF NOT EXISTS ppep_smpl_qnlt__uk__idx
+      ON ppep_smpl_qnlt(phosphopep, sample);
+    "
+  )
+  # Create index for join
+  ddl_exec(db, "
+    -- index for join in contrast_ppep_smpl_qnlt
+    CREATE INDEX IF NOT EXISTS ppep_smpl_qnlt__sample__idx
+      ON ppep_smpl_qnlt(sample);
+    "
+  )
+  ddl_exec(db, "
+    -- index for joining two contrast_lvl_ppep_avg_quant on phopho.pep
+    CREATE INDEX IF NOT EXISTS ppep_smpl_qnlt__phosphopep__idx
+      ON ppep_smpl_qnlt(phosphopep);
+    "
+  )
+  # - dispose objects to free resources
+  rm(quant_data_imp_qn_log_melted)
+
+  # - drop views and derived tables if they exist
+  ddl_exec(db, "
+    -- drop view dependent on contrast_lvl_ppep_avg_quant
+    DROP VIEW IF EXISTS v_contrast_log2_fc;
+    "
+  )
+  ddl_exec(db, "
+    -- drop table dependent on contrast_ppep_smpl_qnlt
+    DROP TABLE IF EXISTS contrast_lvl_ppep_avg_quant;
+    "
+  )
+  ddl_exec(db, "
+    DROP TABLE IF EXISTS contrast_lvl_lvl_metadata;
+    "
+  )
+  ddl_exec(db, "
+    DROP VIEW IF EXISTS v_contrast_lvl_metadata;
+    "
+  )
+  ddl_exec(db, "
+    -- drop view dependent on contrast_ppep_smpl_qnlt
+    DROP VIEW IF EXISTS v_contrast_lvl_ppep_avg_quant;
+    "
+  )
+  ddl_exec(db, "
+    DROP VIEW IF EXISTS v_contrast_lvl_lvl;
+    "
+  )
+  ddl_exec(db, "
+    -- drop view upon which other views depend
+    DROP VIEW IF EXISTS contrast_ppep_smpl_qnlt;
+    "
+  )
+  # - create view joining each contrast's samples to their quant values
+  dml_no_rows_exec(db, "
+      -- view contrast_ppep_smpl_qnlt is used for each phopshopep to
+      --   compute p-value for test of trt effect for two trt levels
+      CREATE VIEW contrast_ppep_smpl_qnlt
+        AS
+      SELECT  contrast,
+              level,
+              phosphopep,
+              sample,
+              quant
+        FROM  contrast AS c,
+              ppep_smpl_qnlt AS q
+        WHERE q.sample = c.label
+        ORDER BY contrast, level, phosphopep
+      ;
+    "
+  )
+  # - create simplification views
+  dml_no_rows_exec(db, "
+      CREATE VIEW v_contrast_lvl_metadata
+        AS
+      SELECT  contrast,
+              level,
+              group_concat(label, ';') AS samples
+        FROM  contrast
+        GROUP BY contrast, level
+        /* view v_contrast_lvl_metadata is used
+           to simplify creation of table contrast_lvl_lvl_metadata */
+      ;
+    "
+  )
+  dml_no_rows_exec(db, "
+      CREATE VIEW v_contrast_lvl_ppep_avg_quant
+        AS
+      SELECT  contrast,
+              level,
+              phosphopep,
+              avg(quant)                AS avg_quant
+        FROM  contrast_ppep_smpl_qnlt
+        GROUP BY contrast, level, phosphopep
+        /* view v_contrast_lvl_ppep_avg_quant is used
+           to simplify view v_contrast_log2_fc */
+      ;
+    "
+  )
+
+  # - create contrast-metadata table
+  dml_no_rows_exec(db, "
+      CREATE TABLE contrast_lvl_lvl_metadata
+        AS
+      SELECT DISTINCT
+              a.contrast              AS ab_contrast,
+              a.level                 AS a_level,
+              b.level                 AS b_level,
+              a.samples               AS a_samples,
+              b.samples               AS b_samples,
+              'log2(level_'||a.level||'/level_'||b.level||')'
+                                      AS fc_description
+        FROM  v_contrast_lvl_metadata AS a,
+              v_contrast_lvl_metadata AS b
+        WHERE a.contrast = b.contrast
+          AND a.level > b.level
+        /* view v_contrast_lvl_lvl is used
+           to simplify view v_contrast_log2_fc */
+      ;
+    "
+  )
+  # - create view listing the level pair (a_level > b_level) of each contrast
+  dml_no_rows_exec(db, "
+      CREATE VIEW v_contrast_lvl_lvl
+        AS
+      SELECT DISTINCT
+              a.contrast AS ab_contrast,
+              a.level    AS a_level,
+              b.level    AS b_level
+        FROM  contrast   AS a,
+              contrast   AS b
+        WHERE a.contrast = b.contrast
+          AND a.level > b.level
+        /* view v_contrast_lvl_lvl is used
+           to simplify view v_contrast_log2_fc */
+      ;
+    "
+  )
+  # - create view to compute log2(fold-change) from the log10 averages
+  #   NOTE(review): log2_fc = b - a but fc_description says a/b - confirm
+  dml_no_rows_exec(db, "
+      CREATE VIEW v_contrast_log2_fc
+        AS
+      SELECT  ab.ab_contrast                         AS contrast,
+              m.a_level                              AS a_level,
+              c.avg_quant                            AS a_quant,
+              m.a_samples                            AS a_samples,
+              ab.b_level                             AS b_level,
+              d.avg_quant                            AS b_quant,
+              m.b_samples                            AS b_samples,
+              m.fc_description                       AS fc_description,
+              3.32193 * ( d.avg_quant - c.avg_quant) AS log2_fc,
+              d.phosphopep                           AS phosphopep
+        FROM  contrast_lvl_lvl_metadata                  AS m,
+              v_contrast_lvl_ppep_avg_quant              AS d,
+              v_contrast_lvl_lvl AS ab
+                INNER JOIN v_contrast_lvl_ppep_avg_quant AS c
+                  ON c.contrast = ab.ab_contrast
+                  AND c.level   = ab.a_level
+        WHERE d.contrast    = ab.ab_contrast
+          AND m.ab_contrast = ab.ab_contrast
+          AND d.level       = ab.b_level
+          AND d.phosphopep  = c.phosphopep
+        /* view to compute log2(fold-change) */
+        ;
+    "
+  )
+
+  # For each contrast (pair of treatment levels):
+  # - fetch the member samples' quant values and cast them to one row per
+  #   phosphopeptide, one column per sample
+  # - run the two-level test per phosphopeptide and FDR-adjust the p-values
+  # - save raw and adjusted p-values to table contrast_ppep_p_val
+  for (contrast in seq_len(contrast_count)) {
+    contrast_df <- sqldf::sqldf(
+      x = paste0("
+        SELECT level, phosphopep, sample, quant
+          FROM contrast_ppep_smpl_qnlt
+          WHERE contrast = ", contrast, "
+          ORDER BY phosphopep, level, sample
+      "),
+      connection = db
+    )
+    contrast_cast <- reshape2::dcast(
+      data = contrast_df,
+      formula = phosphopep ~ sample,
+      value.var = "quant"
+    )
+    contrast_cast_ncol <- ncol(contrast_cast)
+    contrast_cast_data <- contrast_cast[, 2:contrast_cast_ncol]
+    contrast_cast_samples <- colnames(contrast_cast_data)
+
+    # - order grouping_factor by order of sample columns of contrast_cast_data
+    #   (the query sorts by level, then sample; dcast orders columns by sample)
+    grouping_factor <- sqldf::sqldf(
+      x = paste0("
+        SELECT sample, level
+          FROM contrast_ppep_smpl_qnlt
+          WHERE contrast = ", contrast, "
+          ORDER BY phosphopep, level, sample
+          LIMIT ", contrast_cast_ncol - 1
+      ),
+      connection = db
+    )
+    rownames(grouping_factor) <- grouping_factor$sample
+    grouping_factor <-
+      grouping_factor[contrast_cast_samples, "level", drop = FALSE]
+    # - run the two-level (one-way) test
+    p_value_data_contrast_ps <-
+      apply(
+        X = contrast_cast_data,
+        MARGIN = 1, # apply to rows
+        FUN = anova_func,
+        grouping_factor =
+          as.factor(as.numeric(grouping_factor$level)), # anova_func arg2
+        one_way_f = one_way_two_categories, # anova_func arg3
+        simplify = TRUE # TRUE is the default for simplify
+        )
+    contrast_data_adj_p_values <- p.adjust(
+        p = p_value_data_contrast_ps,
+        method = "fdr",
+        n = length(p_value_data_contrast_ps) # this is the default, length(p)
+      )
+    # - collect the raw and adjusted p-values for this contrast
+    contrast_p_df <-
+      data.frame(
+        contrast = contrast,
+        phosphopep = contrast_cast$phosphopep,
+        p_value_raw = p_value_data_contrast_ps,
+        p_value_adj = contrast_data_adj_p_values
+      )
+    # - first contrast replaces any existing table; later contrasts append
+    db_write_table_overwrite <- (contrast < 2)
+    db_write_table_append <- !db_write_table_overwrite
+    RSQLite::dbWriteTable(
+      conn = db,
+      name = "contrast_ppep_p_val",
+      value = contrast_p_df,
+      overwrite = db_write_table_overwrite,
+      append = db_write_table_append
+      )
+    # Create UK for insert
+    ddl_exec(db, "
+      CREATE UNIQUE INDEX IF NOT EXISTS contrast_ppep_p_val__uk__idx
+        ON contrast_ppep_p_val(phosphopep, contrast);
+      "
+    )
+  }
+  # Perhaps this could be done more elegantly using unique keys
+  #   or creating the tables before saving data to them, but this
+  #   is fast and, if the database exists on disk rather than in
+  #   memory, it doesn't stress memory.
+  dml_no_rows_exec(db, "
+    CREATE TEMP table contrast_log2_fc
+      AS
+    SELECT *
+      FROM  v_contrast_log2_fc
+      ORDER BY contrast, phosphopep
+    ;
+    "
+  )
+  dml_no_rows_exec(db, "
+    CREATE TEMP table ppep_p_val
+      AS
+    SELECT  p_value_raw,
+            p_value_adj,
+            contrast   AS p_val_contrast,
+            phosphopep AS p_val_ppep
+      FROM  contrast_ppep_p_val
+      ORDER BY contrast, phosphopep
+    ;
+    "
+  )
+  dml_no_rows_exec(db, "
+    DROP TABLE IF EXISTS contrast_log2_fc_p_val
+    ;
+    "
+  ) # the join below pairs rows of the two TEMP tables by rowid
+  dml_no_rows_exec(db, "
+    CREATE TABLE contrast_log2_fc_p_val
+      AS
+    SELECT  a.*,
+            b.p_value_raw,
+            b.p_value_adj,
+            b.p_val_contrast,
+            b.p_val_ppep
+      FROM  contrast_log2_fc a, ppep_p_val b
+      WHERE a.rowid = b.rowid
+        AND a.phosphopep = b.p_val_ppep
+    ;
+    "
+  )
+  # Create UK (unique key): one row per (phosphopep, contrast)
+  ddl_exec(db, "
+    CREATE UNIQUE INDEX IF NOT EXISTS contrast_log2_fc_p_val__uk__idx
+      ON contrast_log2_fc_p_val(phosphopep, contrast);
+    "
+  )
+  # Create indices for future queries
+  ddl_exec(db, "
+    CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__contrast__idx
+      ON contrast_log2_fc_p_val(contrast);
+    "
+  )
+  ddl_exec(db, "
+    CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__phosphopep__idx
+      ON contrast_log2_fc_p_val(phosphopep);
+    "
+  )
+  ddl_exec(db, "
+    CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__p_value_raw__idx
+      ON contrast_log2_fc_p_val(p_value_raw);
+    "
+  )
+  ddl_exec(db, "
+    CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__p_value_adj__idx
+      ON contrast_log2_fc_p_val(p_value_adj);
+    "
+  )
+  dml_no_rows_exec(db, "
+    DROP VIEW IF EXISTS v_contrast_log2_fc_p_val
+    ;
+    "
+  )
+  dml_no_rows_exec(db, "
+      CREATE VIEW v_contrast_log2_fc_p_val
+        AS
+      SELECT  contrast,
+              a_level,
+              a_samples,
+              b_level,
+              b_samples,
+              a_quant,
+              b_quant,
+              fc_description,
+              log2_fc,
+              p_value_raw,
+              p_value_adj,
+              phosphopep
+        FROM  contrast_log2_fc_p_val
+        ORDER BY contrast, phosphopep
+        ;
+    "
+  )
+  ddl_exec(db, "
+    DROP TABLE IF EXISTS kseaapp_metadata
+    ;
+    "
+  )
+  dml_no_rows_exec(db, "
+    CREATE TABLE kseaapp_metadata
+      AS
+    WITH extended(deppep, ppep, gene_name, uniprot_id, phosphoresidue) AS (
+              SELECT DISTINCT
+                  deppep.seq,
+                  ppep.seq,
+                  GeneName||';',
+                  UniProtID||';',
+                  PhosphoResidue||';'
+              FROM
+                ppep, deppep, mrgfltr_metadata
+              WHERE
+                  mrgfltr_metadata.ppep_id = ppep.id
+                AND
+                  ppep.deppep_id = deppep.id
+        )
+    SELECT
+        ppep                                                  AS `ppep`,
+        SUBSTR(uniprot_id,     1, INSTR(uniprot_id,';') - 1 ) AS `Protein`,
+        SUBSTR(gene_name,      1, INSTR(gene_name,';')  - 1 ) AS `Gene`,
+        deppep                                                AS `Peptide`,
+        REPLACE(
+          REPLACE(
+            SUBSTR(phosphoresidue, 1, INSTR(phosphoresidue,';') - 1 ),
+            'p',
+            ''
+            ),
+          ', ',
+          ';'
+          )                                                   AS `Residue.Both`
+      FROM extended
+      ;
+    "
+  )
+  # Create index for join
+  ddl_exec(db, "
+    CREATE INDEX IF NOT EXISTS kseaapp_metadata__ppep__idx
+      ON kseaapp_metadata(ppep);
+    "
+  )
+  ddl_exec(db, "
+    DROP VIEW IF EXISTS v_kseaapp_contrast
+    ;
+    "
+  )
+  dml_no_rows_exec(db, "
+    CREATE VIEW v_kseaapp_contrast
+      AS
+    SELECT  a.*, b.Protein, b.Gene, b.Peptide, b.`Residue.Both`
+      FROM  v_contrast_log2_fc_p_val a, kseaapp_metadata b
+      WHERE b.ppep = a.phosphopep
+      ;
+      "
+  )
+  ddl_exec(db, "
+    DROP VIEW IF EXISTS v_kseaapp_input
+    ;
+    "
+  ) # the view below omits rows with Gene 'No_Gene_Name' or log2_fc of 0
+  dml_no_rows_exec(db, "
+    CREATE VIEW v_kseaapp_input
+      AS
+    SELECT  v.contrast,
+            v.phosphopep,
+            m.`Protein`,
+            m.`Gene`,
+            m.`Peptide`,
+            m.`Residue.Both`,
+            v.p_value_raw AS `p`,
+            v.log2_fc AS `FC`
+      FROM  kseaapp_metadata AS m,
+            v_contrast_log2_fc_p_val AS v
+      WHERE m.ppep = v.phosphopep
+        AND NOT m.`Gene` = 'No_Gene_Name'
+        AND NOT v.log2_fc = 0
+      ;
+      "
+  )
+}
+```
+
+```{r echo = FALSE, results = 'asis'}
+cat("\\newpage\n") # raw LaTeX page break (results = 'asis')
+```
+
+# KSEA Analysis
+
+Results of Kinase-Substrate Enrichment Analysis are presented here, if the substrates for any kinases are relatively enriched.   Enrichments are found by the CRAN `KSEAapp` package:
+
+- The package is available on CRAN, at https://cran.r-project.org/package=KSEAapp
+- The method used is described in Casado et al. (2013) [doi:10.1126/scisignal.2003573](https://doi.org/10.1126/scisignal.2003573) and Wiredja et al. (2017) [doi:10.1093/bioinformatics/btx415](https://doi.org/10.1093/bioinformatics/btx415).
+- An online alternative (supporting only analysis of human data) is available at [https://casecpb.shinyapps.io/ksea/](https://casecpb.shinyapps.io/ksea/).
+
+For each kinase, $i$, and each two-way contrast of treatments, $j$, an enrichment $z$-score is computed as:
+
+$$
+\text{kinase enrichment score}_{j,i} = \frac{(\overline{s}_{j,i} - \overline{p}_j)\sqrt{m_{j,i}}}{\delta_j}
+$$
+
+and fold-enrichment is computed as:
+
+$$
+\text{Enrichment}_{j,i} = \frac{\overline{s}_{j,i}}{\overline{p}_j}
+$$
+
+where:
+
+- $\overline{s}_{j,i}$ is the mean $\log_2 (|\text{fold-change}|)$ in intensities (for contrast $j$) of known substrates of the kinase $i$,
+- $\overline{p}_j$ is the mean $\log_2 (|\text{fold-change}|)$ of all phosphosites identified in contrast $j$,
+- $m_{j,i}$ is the total number of phosphosite substrates of kinase $i$ identified in contrast $j$, and
+- $\delta_j$ is the standard deviation of the $\log_2 (|\text{fold-change}|)$ for contrast $j$ across all phosphosites in the dataset.
+- Note that the absolute value of fold-change is used so that both increased and decreased substrates of a kinase will contribute to its enrichment score.
+
+$\text{FDR}_{j,i}$ is computed from the $p$-value for the $z$-score using the R `stats::p.adjust` function, applying the False Discovery Rate correction from Benjamini and Hochberg (1995) [doi:10.1111/j.2517-6161.1995.tb02031.x](https://doi.org/10.1111/j.2517-6161.1995.tb02031.x).
+
+Color intensity in heatmaps reflects magnitude of $z$-score for enrichment of respective kinase in respective contrast; hue reflects the sign of the $z$-score (blue, negative; red, positive).
+
+Asterisks in heatmaps reflect enrichments that are significant at `r ksea_cutoff_statistic` < `r ksea_cutoff_threshold`.
+
+- Kinase names are generally as presented at Phospho.ELM [http://phospho.elm.eu.org/kinases.html](http://phospho.elm.eu.org/kinases.html) (when available), although Phospho.ELM data are not yet incorporated into this analysis.
+- Kinase names having the suffix '(HPRD)' are as presented at [http://hprd.org/serine_motifs](http://hprd.org/serine_motifs) and [http://hprd.org/tyrosine_motifs](http://hprd.org/tyrosine_motifs) and are as originally reported in Amanchy et al., 2007 (doi: [10.1038/nbt0307-285](https://doi.org/10.1038/nbt0307-285)).
+- Kinase-substrate data were also taken from [http://networkin.science/download.shtml](http://networkin.science/download.shtml) and from PhosphoSitePlus [https://www.phosphosite.org/staticDownloads](https://www.phosphosite.org/staticDownloads).
+
+```{r ksea, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
+
+db <- RSQLite::dbConnect(RSQLite::SQLite(), ksea_app_prep_db)
+# (this connection is closed at the end of the 'parmlist' chunk)
+# -- drop site_metadata (if it exists) so that it can be rebuilt below
+ddl_exec(db, "
+DROP TABLE IF EXISTS site_metadata;
+")
+
+# -- define site_metadata; inserting a duplicate `full` value is silently ignored
+ddl_exec(db, "
+CREATE TABLE site_metadata(
+  id            INTEGER PRIMARY KEY
+, site_type_id  INTEGER REFERENCES site_type(id)
+, full          TEXT    UNIQUE ON CONFLICT IGNORE
+, abbrev        TEXT
+, pattern       TEXT
+, motif         TEXT
+);
+")
+
+# -- fill full, abbrev and site_type_id from ppep_gene_site (pattern and motif stay NULL)
+ddl_exec(db, "
+INSERT INTO site_metadata(full, abbrev, site_type_id)
+  SELECT  DISTINCT kinase_map, kinase_map, site_type_id
+    FROM  ppep_gene_site
+    ORDER BY kinase_map;
+")
+
+# -- drop the stand-in KSData view if it exists
+ddl_exec(db, "
+DROP VIEW IF EXISTS ks_data_v;
+")
+# -- create a view that stands in for KSEAapp::KSData: GENE, SUB_GENE, SUB_MOD_RSD
+# --   and Source come from ppep_gene_site_view; all other columns are constants
+ddl_exec(db, "
+CREATE VIEW IF NOT EXISTS ks_data_v
+AS
+SELECT
+  'NA' AS KINASE,
+  'NA' AS KIN_ACC_ID,
+  kinase_map AS GENE,
+  'NA' AS KIN_ORGANISM,
+  'NA' AS SUBSTRATE,
+  0 AS SUB_GENE_ID,
+  'NA' AS SUB_ACC_ID,
+  gene_names AS SUB_GENE,
+  'NA' AS SUB_ORGANISM,
+  phospho_peptide AS SUB_MOD_RSD,
+  0 AS SITE_GROUP_ID,
+  'NA' AS 'SITE_7AA',
+  2 AS networkin_score,
+  type_name AS Source
+FROM ppep_gene_site_view;
+")
+
+# One row per contrast (pair of treatment levels); each drives one KSEA run.
+contrast_metadata_df <-
+  sqldf::sqldf("select * from contrast_lvl_lvl_metadata", connection = db)
+# Parallel lists describing the contrasts that produced usable scores; they
+#   are indexed by next_index, which is NOT the contrast's row number.
+rslt <- new_env()
+rslt$score_list <- list()
+rslt$name_list  <- list()
+rslt$longname_list  <- list()
+
+ddl_exec(db, "
+  DROP TABLE IF EXISTS contrast_ksea_scores;
+  "
+)
+
+next_index <- 0
+err_na_subscr_df_const <-
+ "missing values are not allowed in subscripted assignments of data frames"
+
+# The kinase-substrate view is the same for every contrast, so it is read
+#   once here rather than once per iteration.  Reading it before the loop
+#   also defines pseudo_ksdata when there are no contrasts at all; the
+#   'enriched' chunk queries pseudo_ksdata unconditionally.
+pseudo_ksdata <- dbReadTable(db, "ks_data_v")
+
+for (i_cntrst in seq_len(nrow(contrast_metadata_df))) {
+  cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"]
+  cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"]
+  # NOTE(review): column 6 is taken by position; confirm it is fold-change
+  cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6]
+  contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level)
+  # the long label also lists each level's samples (";" becomes ", ")
+  contrast_longlabel <- sprintf(
+    "Trt %s {%s} -> Trt %s {%s}",
+    cntrst_b_level,
+    gsub(";", ", ", contrast_metadata_df[i_cntrst, "b_samples"], fixed = TRUE),
+    cntrst_a_level,
+    gsub(";", ", ", contrast_metadata_df[i_cntrst, "a_samples"], fixed = TRUE)
+  )
+  kseaapp_input <-
+    sqldf::sqldf(
+      x = sprintf("
+        SELECT `Protein`, `Gene`, `Peptide`, phosphopep AS `Residue.Both`, `p`, `FC`
+          FROM v_kseaapp_input
+          WHERE contrast = %d
+        ",
+        i_cntrst
+      ),
+    connection = db
+    )
+
+  # This hack is because SQL table has the log2-transformed values
+  kseaapp_input[, "FC"] <- 2 ** kseaapp_input[, "FC", drop = TRUE]
+  # NOTE(review): main_title/sub_title are not used in this loop; presumably
+  #   helpers defined elsewhere read them as globals -- confirm
+  main_title <- sprintf(
+    "Change from treatment %s to treatment %s",
+    cntrst_b_level,
+    cntrst_a_level
+  )
+  sub_title <- contrast_longlabel
+  # An error while scoring or printing is reported with str() and the loop
+  #   continues, so one failing contrast does not abort the report.
+  tryCatch(
+    expr = {
+        ksea_scores_rslt <-
+          ksea_scores(
+            ksdata           = pseudo_ksdata, # KSEAapp::KSData,
+            px               = kseaapp_input,
+            networkin        = TRUE,
+            networkin_cutoff = 2
+            )
+
+        # record and print only contrasts having at least one non-NaN FDR
+        if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) {
+          next_index <- 1 + next_index
+          rslt$score_list[[next_index]] <- ksea_scores_rslt
+          rslt$name_list[[next_index]] <- contrast_label
+          rslt$longname_list[[next_index]] <- contrast_longlabel
+          low_fdr_print(
+            rslt = rslt,
+            i_cntrst = i_cntrst,
+            i = next_index,
+            a_level = cntrst_a_level,
+            b_level = cntrst_b_level,
+            fold_change = cntrst_fold_change,
+            caption = contrast_longlabel
+            )
+        }
+      },
+    error = function(e) str(e)
+  )
+}
+
+plotted_kinases <- NULL
+# heatmaps are drawn only when more than one contrast produced scores
+if (length(rslt$score_list) > 1) {
+  # one heatmap per title; the title's position is the which_kinases code
+  for (i in seq_along(ksea_heatmap_titles)) {
+    hdr <- ksea_heatmap_titles[[i]]
+    which_kinases <- i
+
+    cat("\\clearpage\n\\begin{center}\n")
+    # The header used to be chosen by `if (i == const_ksea_astrsk_kinases)`,
+    #   but both branches made this same call, so it is made unconditionally.
+    subsection_header(hdr)
+    cat("\\end{center}\n")
+
+    plotted_kinases <- ksea_heatmap(
+      # the data frame outputs from the KSEA.Scores() function, in list format
+      score_list = rslt$score_list,
+      # a character vector of all the sample names for heatmap annotation:
+      # - the names must be in the same order as the data in score_list
+      # - please avoid long names, as they may get cropped in the final image
+      sample_labels = rslt$name_list,
+      # character string of either "p.value" or "FDR" indicating the data column
+      #   to use for marking statistically significant scores
+      stats = c("p.value", "FDR")[2],
+      # a numeric value between 0 and infinity indicating the min. number of
+      #   substrates a kinase must have to be included in the heatmap
+      m_cutoff = 1,
+      # a numeric value between 0 and 1 indicating the p-value/FDR cutoff
+      #   for indicating significant kinases in the heatmap
+      p_cutoff = 0.05,
+      # a binary input of TRUE or FALSE, indicating whether or not to perform
+      #   hierarchical clustering of the sample columns
+      sample_cluster = TRUE,
+      # a binary input of TRUE or FALSE, indicating whether or not to export
+      #   the heatmap as a .png image into the working directory
+      export = FALSE,
+      # additional arguments to gplots::heatmap.2, such as:
+      # - main: main title of plot
+      # - xlab: x-axis label
+      # - ylab: y-axis label
+      xlab = "Contrast",
+      ylab = "Kinase",
+      # print which kinases:
+      # - 1 : all kinases
+      # - 2 : significant kinases
+      # - 3 : non-significant kinases
+      which_kinases = which_kinases
+    )
+    cat("\\begin{center}\n")
+    cat("Color intensities reflects $z$-score magnitudes; hue reflects $z$-score sign.  Asterisks reflect significance.\n")
+    cat("\\end{center}\n")
+  } # end for (i in ...
+} # end if (length ...
+
+# Draw one bar plot per contrast that produced KSEA scores.
+# rslt$score_list holds only those contrasts, so its positions differ from
+#   the row numbers of contrast_metadata_df as soon as any contrast was
+#   skipped above.  Each entry is therefore mapped back to its metadata row
+#   through the short label that was stored with it in rslt$name_list.
+cntrst_labels <- sprintf(
+  "%s -> %s",
+  contrast_metadata_df[, "b_level"],
+  contrast_metadata_df[, "a_level"]
+)
+for (next_index in seq_along(rslt$score_list)) {
+  # i_cntrst: row of contrast_metadata_df; next_index: position in rslt lists
+  i_cntrst <- match(rslt$name_list[[next_index]], cntrst_labels)
+  if (is.na(i_cntrst)) {
+    # not expected; fall back to the former positional assumption
+    i_cntrst <- next_index
+  }
+  cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"]
+  cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"]
+  # NOTE(review): column 6 is taken by position; confirm it is fold-change
+  cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6]
+  # reuse the labels computed when the scores were stored
+  contrast_label <- rslt$name_list[[next_index]]
+  contrast_longlabel <- rslt$longname_list[[next_index]]
+  # NOTE(review): main_title/sub_title are not used in this loop; presumably
+  #   helpers defined elsewhere read them as globals -- confirm
+  main_title <- sprintf(
+    "Change from treatment %s to treatment %s",
+    cntrst_b_level,
+    cntrst_a_level
+  )
+  sub_title <- contrast_longlabel
+  # errors are reported via str() and do not stop the remaining plots
+  tryCatch(
+    expr = {
+        ksea_scores_rslt <- rslt$score_list[[next_index]]
+
+        if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) {
+          low_fdr_barplot(
+            rslt = rslt,
+            i_cntrst = i_cntrst,
+            i = next_index,
+            a_level = cntrst_a_level,
+            b_level = cntrst_b_level,
+            fold_change = cntrst_fold_change,
+            caption = contrast_longlabel
+            )
+        }
+      },
+    error = function(e) str(e)
+  )
+}
+```
+
+```{r enriched, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
+# The enriched kinases are the names held in ksea_asterisk_hash; for each
+#   substrate (ppep), build a label listing its enriched kinase-gene pairs.
+enriched_kinases <- data.frame(kinase = ls(ksea_asterisk_hash))
+all_enriched_substrates <- sqldf("
+  SELECT
+    gene AS kinase,
+    ppep,
+    '('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label
+  FROM (
+    SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep
+      FROM pseudo_ksdata
+      WHERE GENE IN (SELECT kinase FROM enriched_kinases)
+    )
+  GROUP BY ppep
+  ")  # NOTE(review): 'kinase' is a bare column under GROUP BY ppep -- confirm
+
+# Heading helper for the per-kinase substrate-enrichment heatmap.
+# Always emits a LaTeX page break; then, unless m has exactly one row (and
+#   does not exceed intensity_hm_rows), emits a subsection header naming the
+#   kinase and the KSEA cutoff, followed by blank lines.
+#   m        : only nrow(m) is used
+#   cut_args : provides $kinase, $statistic and $threshold
+# Returns FALSE when the header was skipped, TRUE otherwise.
+cat_enriched_heading <- function(m, cut_args) {
+  kinase <- cut_args$kinase
+  statistic <- cut_args$statistic
+  threshold <- cut_args$threshold
+  n_substrates <- nrow(m)
+  cat("\\newpage\n")
+
+  too_many <- n_substrates > intensity_hm_rows
+  if (!too_many && n_substrates == 1) {
+    return(FALSE)
+  }
+
+  # first half of the header depends on whether m exceeds intensity_hm_rows
+  if (too_many) {
+    lead_in <- sprintf(
+      "Lowest p-valued %d (of %d) enriched %s-substrates,",
+      intensity_hm_rows,
+      n_substrates,
+      kinase
+    )
+  } else {
+    lead_in <- sprintf(
+      "%d enriched %s-substrates,",
+      n_substrates,
+      kinase
+    )
+  }
+  # paste() inserts a single space between the two halves
+  cutoff_text <- sprintf(" KSEA %s < %0.2f\n", statistic, threshold)
+  subsection_header(paste(lead_in, cutoff_text))
+
+  # two groups of three newlines, exactly as before
+  cat("\n\n\n")
+  cat("\n\n\n")
+  TRUE
+}
+
+# Substrate heatmaps are disabled (`if (FALSE)`, so this loop never runs) pending a decision whether to eliminate them altogether
+if (FALSE)
+  for (kinase_name in sort(enriched_kinases$kinase)) {
+    enriched_substrates <-
+      all_enriched_substrates[
+        all_enriched_substrates$kinase == kinase_name,
+        ,
+        drop = FALSE
+        ]
+    # Get the intensity rows (selected by phosphopeptide) for this kinase's substrates
+    enriched_intensities <-
+      as.matrix(unimputed_quant_data_log[enriched_substrates$ppep, , drop = FALSE])
+    # Keep only the rows in which fewer than half of the columns are NA
+    na_counter <- is.na(enriched_intensities)
+    na_counts <- apply(na_counter, 1, sum)
+    enriched_intensities <-
+      enriched_intensities[na_counts < ncol(enriched_intensities) / 2, , drop = FALSE]
+    # Rename the rows with the display-name (the `label` column) for the heatmap
+    rownames(enriched_intensities) <-
+      sapply(
+        X = rownames(enriched_intensities),
+        FUN = function(rn) {
+          enriched_substrates[enriched_substrates$ppep == rn, "label"]
+        }
+      )
+    # Format as matrix for heatmap
+    m <- as.matrix(enriched_intensities)
+    # Draw the heading and heatmap; NOTE(review): `cutoff` is not defined in the visible part of this chunk -- confirm before re-enabling
+    if (nrow(m) > 0) {
+      cut_args <- new_env()
+      cut_args$cutoff <- cutoff
+      cut_args$kinase <- kinase_name
+      cut_args$statistic <- ksea_cutoff_statistic
+      cut_args$threshold <- ksea_cutoff_threshold
+      number_of_peptides_found <-
+        draw_intensity_heatmap(
+          m                       = m,
+          cutoff                  = cut_args,
+          hm_heading_function     = cat_enriched_heading,
+          hm_main_title
+            = "Unnormalized (zero-imputed) intensities of enriched kinase-substrates",
+          suppress_row_dendrogram = FALSE
+        )
+    }
+  }
+
+# Write output tabular files
+#
+# Get (kinase, ppep, label) tuples for enriched kinases, where label is the
+#   "; "-joined list of enriched kinases for that ppep; one row per pair.
+kinase_ppep_label <- sqldf("
+  WITH
+    t(ppep, label) AS
+      (
+        SELECT DISTINCT
+            SUB_MOD_RSD AS ppep,
+            group_concat(gene, '; ') AS label
+          FROM pseudo_ksdata
+          WHERE GENE IN (SELECT kinase FROM enriched_kinases)
+          GROUP BY ppep
+      ),
+    k(kinase, ppep_join) AS
+      (
+      SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep_join
+        FROM pseudo_ksdata
+        WHERE GENE IN (SELECT kinase FROM enriched_kinases)
+      )
+  SELECT k.kinase, t.ppep, t.label
+    FROM  t,  k
+    WHERE t.ppep = k.ppep_join
+    ORDER BY k.kinase, t.ppep
+  ")
+
+# Join metadata_plus_p to the KSEA labels and to the intensities in impish.
+impish <- cbind(rownames(quant_data_imp), quant_data_imp)
+colnames(impish)[1] <- "Phosphopeptide"
+data_table_imputed_sql <- "
+  SELECT DISTINCT
+    f.*,
+    k.label AS KSEA_enrichments,
+    q.*
+  FROM
+    metadata_plus_p f
+      LEFT JOIN kinase_ppep_label k
+        ON f.Phosphopeptide = k.ppep,
+    impish q
+  WHERE
+    f.Phosphopeptide = q.Phosphopeptide
+  "
+# DISTINCT is needed because kinase_ppep_label has one row per (kinase, ppep)
+#   pair, all carrying the same label; without it, a phosphopeptide having
+#   several enriched kinases would be written once per kinase.
+data_table_imputed <- sqldf(data_table_imputed_sql)
+# Drop column 13; NOTE(review): presumably the second 'Phosphopeptide' (from impish), assuming metadata_plus_p has 11 columns -- confirm
+data_table_imputed <-
+    data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))]
+# Output with imputed, un-normalized data
+write.table(
+    data_table_imputed
+  , file = imputed_data_filename
+  , sep = "\t"
+  , col.names = TRUE
+  , row.names = FALSE
+  , quote = FALSE
+  )
+
+# Output quantile normalized data (same query, with impish rebuilt)
+impish <- cbind(rownames(quant_data_imp_qn_log), quant_data_imp_qn_log)
+colnames(impish)[1] <- "Phosphopeptide"
+data_table_imputed <- sqldf(data_table_imputed_sql)
+# Drop column 13 again (same positional assumption as above)
+data_table_imputed <-
+    data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))]
+write.table(
+  data_table_imputed,
+  file = imp_qn_lt_data_filenm,
+  sep = "\t",
+  col.names = TRUE,
+  row.names = FALSE,
+  quote = FALSE
+)
+
+ppep_kinase <- sqldf("
+  SELECT DISTINCT k.ppep, k.kinase
+    FROM (
+      SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep
+        FROM pseudo_ksdata
+        WHERE GENE IN (SELECT kinase FROM enriched_kinases)
+      ) k
+    ORDER BY k.ppep, k.kinase
+  ")
+# Store the pairs and the ANOVA p-values; overwrite so that re-runs succeed.
+RSQLite::dbWriteTable(
+  conn = db,
+  name = "ksea_enriched_ks",
+  value = ppep_kinase,
+  overwrite = TRUE
+  )
+# (Without overwrite, dbWriteTable fails when the table already exists.)
+RSQLite::dbWriteTable(
+  conn = db,
+  name = "anova_signif",
+  value = p_value_data,
+  overwrite = TRUE
+  )
+
+ddl_exec(db, "
+    DROP VIEW IF EXISTS stats_metadata_v;
+    "
+)
+dml_no_rows_exec(db, "
+      CREATE VIEW stats_metadata_v
+        AS
+      SELECT DISTINCT  m.*,
+          p.raw_anova_p,
+          p.fdr_adjusted_anova_p,
+          kek.kinase AS ksea_enrichments
+        FROM
+          mrgfltr_metadata_view m
+            LEFT JOIN anova_signif p
+              ON m.phospho_peptide = p.phosphopeptide
+            LEFT JOIN ksea_enriched_ks kek
+              ON m.phospho_peptide = kek.ppep
+      ;
+    "
+)
+
+write.table(
+  dbReadTable(db, "stats_metadata_v"),
+  file = anova_ksea_mtdt_file,
+  sep = "\t",
+  col.names = TRUE,
+  row.names = FALSE,
+  quote = FALSE
+  )
+
+
+```
+
+```{r parmlist, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
+cat("\\leavevmode\n\n\n")
+
+# Write the input parameters to the report as a LaTeX table; names and values
+#   are wrapped in \verb@...@ (values first have "$" escaped through gsub).
+param_unlist <- unlist(as.list(params))
+param_df <- data.frame(
+  parameter = paste0("\\verb@", names(param_unlist), "@"),
+  value = paste0("\\verb@", gsub("$", "\\$", param_unlist, fixed = TRUE), "@")
+  )
+# NOTE(review): a name or value containing "@" would end \verb@...@ early -- confirm
+data_frame_latex(
+  x = param_df,
+  justification = "p{0.35\\linewidth} p{0.6\\linewidth}",
+  centered = TRUE,
+  caption = "Input parameters",
+  anchor = const_table_anchor_bp,
+  underscore_whack = FALSE
+  )
+
+# Write the same parameters to the script_parameter table of the SQLite output.
+# NOTE(review): the table is dropped first, so any rows stored under another `script` value are discarded -- confirm this is intended
+mqppep_anova_script_param_df <- data.frame(
+  script    = "mqppep_anova_script.Rmd",
+  parameter = names(param_unlist),
+  value     = param_unlist
+  )
+ddl_exec(db, "
+  DROP TABLE IF EXISTS script_parameter;
+  "
+)
+ddl_exec(db, "
+  CREATE TABLE IF NOT EXISTS script_parameter(
+    script    TEXT,
+    parameter TEXT,
+    value     ANY,
+    UNIQUE (script, parameter) ON CONFLICT REPLACE
+    )
+    ;
+  "
+)
+RSQLite::dbWriteTable(
+  conn = db,
+  name = "script_parameter",
+  value = mqppep_anova_script_param_df,
+  append = TRUE
+)
+
+# We are done with output; close the connection opened in the 'ksea' chunk
+RSQLite::dbDisconnect(db)
+```
+<!--
+There's gotta be a better way...
+
+loaded_packages_df <-  sessioninfo::package_info("loaded")
+loaded_packages_df[, "library"] <- as.character(loaded_packages_df$library)
+loaded_packages_df <- data.frame(
+  package = loaded_packages_df$package,
+  version = loaded_packages_df$loadedversion,
+  date    = loaded_packages_df$date
+  )
+data_frame_latex(
+  x = loaded_packages_df,
+  justification = "l | l l",
+  centered = FALSE,
+  caption = "Loaded R packages",
+  anchor = const_table_anchor_bp
+  )
+-->
diff -r 000000000000 -r ba62d93a9ef5 mqppep_mrgfltr.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mqppep_mrgfltr.py	Mon Jul 11 19:21:19 2022 +0000
@@ -0,0 +1,1551 @@
+#!/usr/bin/env python
+
+# Import the packages needed
+import argparse
+import operator  # for operator.itemgetter
+import os.path
+import re
+import shutil  # for shutil.copyfile(src, dest)
+import sqlite3 as sql
+import sys  # import the sys module for exc_info
+import time
+import traceback  # for formatting stack-trace
+from codecs import getreader as cx_getreader
+
+import numpy as np
+import pandas
+
+# global constants
+N_A = "N/A"
+
+
+# ref: https://stackoverflow.com/a/8915613/15509512
+#   answers: "How to handle exceptions in a list comprehensions"
+#   usage:
+#       from math import log
+#       eggs = [1,3,0,3,2]
+#       print([x for x in [catch(log, egg) for egg in eggs] if x is not None])
+#   producing:
+#       for <built-in function log>
+#         with args (0,)
+#         exception: math domain error
+#       [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453]
+def catch(func, *args, handle=lambda e: e, **kwargs):
+    """Return func(*args, **kwargs); if it raises, print the details and the
+    stack trace, then terminate via sys.exit(-1).  `handle` is unused."""
+    try:
+        return func(*args, **kwargs)
+    except Exception as e:
+        print("For %s" % str(func))
+        print("  with args %s" % str(args))
+        print("  caught exception: %s" % str(e))
+        (ty, va, tb) = sys.exc_info()
+        print("  stack trace: " + str(traceback.format_exception(ty, va, tb)))
+        sys.exit(-1)  # sys.exit is always available; bare exit() needs `site`
+
+
+def whine(func, *args, handle=lambda e: e, **kwargs):
+    """Return func(*args, **kwargs); if it raises, print a warning with the
+    details and the stack trace, and return None.  `handle` is unused."""
+    try:
+        return func(*args, **kwargs)
+    except Exception as err:
+        print("Warning: For " + str(func))
+        print("  with args " + str(args))
+        print("  caught exception: " + str(err))
+        print("  stack trace: " + str(traceback.format_exception(*sys.exc_info())))
+        return None
+
+
+def ppep_join(x):
+    """Join the entries of x that differ from N_A, separated by " | ".
+
+    Returns N_A when the joined text is empty (nothing left to report).
+    """
+    joined = " | ".join(entry for entry in x if N_A != entry)
+    return joined if joined != "" else N_A
+
+
+def melt_join(x):
+    # de-duplicate case-insensitively: first position kept, last spelling wins
+    canonical = dict((item.lower(), item) for item in x)
+    return " | ".join(canonical.values())
+
+
+def __main__():
+    # Parse Command Line
+    parser = argparse.ArgumentParser(
+        description="Phopsphoproteomic Enrichment Pipeline Merge and Filter."
+    )
+
+    # inputs:
+    #   Phosphopeptide data for experimental results, including the intensities
+    #   and the mapping to kinase domains, in tabular format.
+    parser.add_argument(
+        "--phosphopeptides",
+        "-p",
+        nargs=1,
+        required=True,
+        dest="phosphopeptides",
+        help="Phosphopeptide data for experimental results, including the intensities and the mapping to kinase domains, in tabular format",
+    )
+    #   UniProtKB/SwissProt DB input, SQLite
+    parser.add_argument(
+        "--ppep_mapping_db",
+        "-d",
+        nargs=1,
+        required=True,
+        dest="ppep_mapping_db",
+        help="UniProtKB/SwissProt SQLite Database",
+    )
+    #   species to limit records chosed from PhosPhositesPlus
+    parser.add_argument(
+        "--species",
+        "-x",
+        nargs=1,
+        required=False,
+        default=[],
+        dest="species",
+        help="limit PhosphoSitePlus records to indicated species (field may be empty)",
+    )
+
+    # outputs:
+    #   tabular output
+    parser.add_argument(
+        "--mrgfltr_tab",
+        "-o",
+        nargs=1,
+        required=True,
+        dest="mrgfltr_tab",
+        help="Tabular output file for results",
+    )
+    #   CSV output
+    parser.add_argument(
+        "--mrgfltr_csv",
+        "-c",
+        nargs=1,
+        required=True,
+        dest="mrgfltr_csv",
+        help="CSV output file for results",
+    )
+    #   SQLite output
+    parser.add_argument(
+        "--mrgfltr_sqlite",
+        "-S",
+        nargs=1,
+        required=True,
+        dest="mrgfltr_sqlite",
+        help="SQLite output file for results",
+    )
+
+    # "Make it so!" (parse the arguments)
+    options = parser.parse_args()
+    print("options: " + str(options))
+
+    # determine phosphopeptide ("upstream map") input tabular file access
+    if options.phosphopeptides is None:
+        exit('Argument "phosphopeptides" is required but not supplied')
+    try:
+        upstream_map_filename_tab = os.path.abspath(options.phosphopeptides[0])
+        input_file = open(upstream_map_filename_tab, "r")
+        input_file.close()
+    except Exception as e:
+        exit("Error parsing phosphopeptides argument: %s" % str(e))
+
+    # determine input SQLite access
+    if options.ppep_mapping_db is None:
+        exit('Argument "ppep_mapping_db" is required but not supplied')
+    try:
+        uniprot_sqlite = os.path.abspath(options.ppep_mapping_db[0])
+        input_file = open(uniprot_sqlite, "rb")
+        input_file.close()
+    except Exception as e:
+        exit("Error parsing ppep_mapping_db argument: %s" % str(e))
+
+    # copy input SQLite dataset to output SQLite dataset
+    if options.mrgfltr_sqlite is None:
+        exit('Argument "mrgfltr_sqlite" is required but not supplied')
+    try:
+        output_sqlite = os.path.abspath(options.mrgfltr_sqlite[0])
+        shutil.copyfile(uniprot_sqlite, output_sqlite)
+    except Exception as e:
+        exit("Error copying ppep_mapping_db to mrgfltr_sqlite: %s" % str(e))
+
+    # determine species to limit records from PSP_Regulatory_Sites
+    if options.species is None:
+        exit(
+            'Argument "species" is required (and may be empty) but not supplied'
+        )
+    try:
+        if len(options.species) > 0:
+            species = options.species[0]
+        else:
+            species = ""
+    except Exception as e:
+        exit("Error parsing species argument: %s" % str(e))
+
+    # determine tabular output destination
+    if options.mrgfltr_tab is None:
+        exit('Argument "mrgfltr_tab" is required but not supplied')
+    try:
+        output_filename_tab = os.path.abspath(options.mrgfltr_tab[0])
+        output_file = open(output_filename_tab, "w")
+        output_file.close()
+    except Exception as e:
+        exit("Error parsing mrgfltr_tab argument: %s" % str(e))
+
+    # determine CSV output destination
+    if options.mrgfltr_csv is None:
+        exit('Argument "mrgfltr_csv" is required but not supplied')
+    try:
+        output_filename_csv = os.path.abspath(options.mrgfltr_csv[0])
+        output_file = open(output_filename_csv, "w")
+        output_file.close()
+    except Exception as e:
+        exit("Error parsing mrgfltr_csv argument: %s" % str(e))
+
+    def mqpep_getswissprot():
+
+        #
+        # copied from Excel Output Script.ipynb BEGIN #
+        #
+
+        #  String Constants  #################
+        DEPHOSPHOPEP = "DephosphoPep"
+        DESCRIPTION = "Description"
+        FUNCTION_PHOSPHORESIDUE = (
+            "Function Phosphoresidue(PSP=PhosphoSitePlus.org)"
+        )
+        GENE_NAME = "Gene_Name"  # Gene Name from UniProtKB
+        ON_FUNCTION = (
+            "ON_FUNCTION"  # ON_FUNCTION column from PSP_Regulatory_Sites
+        )
+        ON_NOTES = "NOTES"  # NOTES column from PSP_Regulatory_Sites
+        ON_OTHER_INTERACT = "ON_OTHER_INTERACT"  # ON_OTHER_INTERACT column from PSP_Regulatory_Sites
+        ON_PROCESS = (
+            "ON_PROCESS"  # ON_PROCESS column from PSP_Regulatory_Sites
+        )
+        ON_PROT_INTERACT = "ON_PROT_INTERACT"  # ON_PROT_INTERACT column from PSP_Regulatory_Sites
+        PHOSPHOPEPTIDE = "Phosphopeptide"
+        PHOSPHOPEPTIDE_MATCH = "Phosphopeptide_match"
+        PHOSPHORESIDUE = "Phosphoresidue"
+        PUTATIVE_UPSTREAM_DOMAINS = "Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/Phosphatases/Binding Domains"
+        SEQUENCE = "Sequence"
+        SEQUENCE10 = "Sequence10"
+        SEQUENCE7 = "Sequence7"
+        SITE_PLUSMINUS_7AA_SQL = "SITE_PLUSMINUS_7AA"
+        UNIPROT_ID = "UniProt_ID"
+        UNIPROT_SEQ_AND_META_SQL = """
+            select    Uniprot_ID, Description, Gene_Name, Sequence,
+                      Organism_Name, Organism_ID, PE, SV
+                 from UniProtKB
+             order by Sequence, UniProt_ID
+        """
+        UNIPROT_UNIQUE_SEQ_SQL = """
+            select distinct Sequence
+                       from UniProtKB
+                   group by Sequence
+        """
+        PPEP_PEP_UNIPROTSEQ_SQL = """
+            select distinct phosphopeptide, peptide, sequence
+                       from uniprotkb_pep_ppep_view
+                   order by sequence
+        """
+        PPEP_MELT_SQL = """
+            SELECT DISTINCT
+                phospho_peptide AS 'p_peptide',
+                kinase_map AS 'characterization',
+                'X' AS 'X'
+            FROM ppep_gene_site_view
+        """
+        # CREATE TABLE PSP_Regulatory_site (
+        #   site_plusminus_7AA TEXT PRIMARY KEY ON CONFLICT IGNORE,
+        #   domain             TEXT,
+        #   ON_FUNCTION        TEXT,
+        #   ON_PROCESS         TEXT,
+        #   ON_PROT_INTERACT   TEXT,
+        #   ON_OTHER_INTERACT  TEXT,
+        #   notes              TEXT,
+        #   organism           TEXT
+        # );
+        PSP_REGSITE_SQL = """
+            SELECT DISTINCT
+              SITE_PLUSMINUS_7AA ,
+              DOMAIN             ,
+              ON_FUNCTION        ,
+              ON_PROCESS         ,
+              ON_PROT_INTERACT   ,
+              ON_OTHER_INTERACT  ,
+              NOTES              ,
+              ORGANISM
+            FROM PSP_Regulatory_site
+        """
+        PPEP_ID_SQL = """
+            SELECT
+                id AS 'ppep_id',
+                seq AS 'ppep_seq'
+            FROM ppep
+        """
+        MRGFLTR_DDL = """
+        DROP VIEW  IF EXISTS mrgfltr_metadata_view;
+        DROP TABLE IF EXISTS mrgfltr_metadata;
+        CREATE TABLE mrgfltr_metadata
+          ( ppep_id                 INTEGER REFERENCES ppep(id)
+          , Sequence10              TEXT
+          , Sequence7               TEXT
+          , GeneName                TEXT
+          , Phosphoresidue          TEXT
+          , UniProtID               TEXT
+          , Description             TEXT
+          , FunctionPhosphoresidue  TEXT
+          , PutativeUpstreamDomains TEXT
+          , PRIMARY KEY (ppep_id)            ON CONFLICT IGNORE
+          )
+          ;
+        CREATE VIEW mrgfltr_metadata_view AS
+          SELECT DISTINCT
+              ppep.seq             AS phospho_peptide
+            , Sequence10
+            , Sequence7
+            , GeneName
+            , Phosphoresidue
+            , UniProtID
+            , Description
+            , FunctionPhosphoresidue
+            , PutativeUpstreamDomains
+          FROM
+            ppep, mrgfltr_metadata
+          WHERE
+              mrgfltr_metadata.ppep_id = ppep.id
+          ORDER BY
+            ppep.seq
+            ;
+        """
+
+        CITATION_INSERT_STMT = """
+          INSERT INTO Citation (
+            ObjectName,
+            CitationData
+          ) VALUES (?,?)
+          """
+        CITATION_INSERT_PSP = 'PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words "PhosphoSitePlus(R), www.phosphosite.org" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926."'
+        CITATION_INSERT_PSP_REF = 'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298, https://doi.org/10.1093/nar/gkr1122'
+
+        MRGFLTR_METADATA_COLUMNS = [
+            "ppep_id",
+            "Sequence10",
+            "Sequence7",
+            "GeneName",
+            "Phosphoresidue",
+            "UniProtID",
+            "Description",
+            "FunctionPhosphoresidue",
+            "PutativeUpstreamDomains",
+        ]
+
+        #  String Constants (end) ############
+
+        class Error(Exception):
+            """Base class for exceptions in this module."""
+
+            pass
+
+        class PreconditionError(Error):
+            """Exception raised for errors in the input.
+
+            Attributes:
+                expression -- input expression in which the error occurred
+                message -- explanation of the error
+            """
+
+            def __init__(self, expression, message):
+                self.expression = expression
+                self.message = message
+
+        # start_time = time.clock() #timer
+        start_time = time.process_time()  # timer
+
+        # get keys from upstream tabular file using readline()
+        # ref: https://stackoverflow.com/a/16713581/15509512
+        #      answer to "Use codecs to read file with correct encoding"
+        file1_encoded = open(upstream_map_filename_tab, "rb")
+        file1 = cx_getreader("latin-1")(file1_encoded)
+
+        count = 0
+        upstream_map_p_peptide_list = []
+        re_tab = re.compile("^[^\t]*")
+        while True:
+            count += 1
+            # Get next line from file
+            line = file1.readline()
+            # if line is empty
+            # end of file is reached
+            if not line:
+                break
+            if count > 1:
+                m = re_tab.match(line)
+                upstream_map_p_peptide_list.append(m[0])
+        file1.close()
+        file1_encoded.close()
+
+        # Get the list of phosphopeptides with the p's that represent the phosphorylation sites removed
+        re_phos = re.compile("p")
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f pre-read-SwissProt [0.1]" % (end_time - start_time,),
+            file=sys.stderr,
+        )
+
+        # ----------- Get SwissProt data from SQLite database (start) -----------
+        # build UniProt sequence LUT and list of unique SwissProt sequences
+
+        # Open SwissProt SQLite database
+        conn = sql.connect(uniprot_sqlite)
+        cur = conn.cursor()
+
+        # Set up structures to hold SwissProt data
+
+        uniprot_Sequence_List = []
+        UniProtSeqLUT = {}
+
+        # Execute query for unique seqs without fetching the results yet
+        uniprot_unique_seq_cur = cur.execute(UNIPROT_UNIQUE_SEQ_SQL)
+
+        while 1:
+            batch = uniprot_unique_seq_cur.fetchmany(size=50)
+            if not batch:
+                # handle case where no records are returned
+                break
+            for row in batch:
+                Sequence = row[0]
+                UniProtSeqLUT[(Sequence, DESCRIPTION)] = []
+                UniProtSeqLUT[(Sequence, GENE_NAME)] = []
+                UniProtSeqLUT[(Sequence, UNIPROT_ID)] = []
+                UniProtSeqLUT[Sequence] = []
+
+        # Execute query for seqs and metadata without fetching the results yet
+        uniprot_seq_and_meta = cur.execute(UNIPROT_SEQ_AND_META_SQL)
+
+        while 1:
+            batch = uniprot_seq_and_meta.fetchmany(size=50)
+            if not batch:
+                # handle case where no records are returned
+                break
+            for (
+                UniProt_ID,
+                Description,
+                Gene_Name,
+                Sequence,
+                OS,
+                OX,
+                PE,
+                SV,
+            ) in batch:
+                uniprot_Sequence_List.append(Sequence)
+                UniProtSeqLUT[Sequence] = Sequence
+                UniProtSeqLUT[(Sequence, UNIPROT_ID)].append(UniProt_ID)
+                UniProtSeqLUT[(Sequence, GENE_NAME)].append(Gene_Name)
+                if OS != N_A:
+                    Description += " OS=" + OS
+                if OX != -1:
+                    Description += " OX=" + str(OX)
+                if Gene_Name != N_A:
+                    Description += " GN=" + Gene_Name
+                if PE != N_A:
+                    Description += " PE=" + PE
+                if SV != N_A:
+                    Description += " SV=" + SV
+                UniProtSeqLUT[(Sequence, DESCRIPTION)].append(Description)
+
+        # Close SwissProt SQLite database; clean up local variables
+        conn.close()
+        Sequence = ""
+        UniProt_ID = ""
+        Description = ""
+        Gene_Name = ""
+
+        # ----------- Get SwissProt data from SQLite database (finish) -----------
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f post-read-SwissProt [0.2]" % (end_time - start_time,),
+            file=sys.stderr,
+        )
+
+        # ----------- Get SwissProt data from SQLite database (start) -----------
+        # Open SwissProt SQLite database
+        conn = sql.connect(uniprot_sqlite)
+        cur = conn.cursor()
+
+        # Set up dictionary to aggregate results for phosphopeptides corresponding to dephosphopeptide
+        DephosphoPep_UniProtSeq_LUT = {}
+
+        # Set up dictionary to accumulate results
+        PhosphoPep_UniProtSeq_LUT = {}
+
+        # Execute query for tuples without fetching the results yet
+        ppep_pep_uniprotseq_cur = cur.execute(PPEP_PEP_UNIPROTSEQ_SQL)
+
+        while 1:
+            batch = ppep_pep_uniprotseq_cur.fetchmany(size=50)
+            if not batch:
+                # handle case where no records are returned
+                break
+            for (phospho_pep, dephospho_pep, sequence) in batch:
+                # record the phosphopeptide and the dephosphopeptide it maps to
+                PhosphoPep_UniProtSeq_LUT[phospho_pep] = phospho_pep
+                PhosphoPep_UniProtSeq_LUT[
+                    (phospho_pep, DEPHOSPHOPEP)
+                ] = dephospho_pep
+                if dephospho_pep not in DephosphoPep_UniProtSeq_LUT:
+                    DephosphoPep_UniProtSeq_LUT[dephospho_pep] = set()
+                    DephosphoPep_UniProtSeq_LUT[
+                        (dephospho_pep, DESCRIPTION)
+                    ] = []
+                    DephosphoPep_UniProtSeq_LUT[
+                        (dephospho_pep, GENE_NAME)
+                    ] = []
+                    DephosphoPep_UniProtSeq_LUT[
+                        (dephospho_pep, UNIPROT_ID)
+                    ] = []
+                    DephosphoPep_UniProtSeq_LUT[(dephospho_pep, SEQUENCE)] = []
+                DephosphoPep_UniProtSeq_LUT[dephospho_pep].add(phospho_pep)
+
+                if (
+                    sequence
+                    not in DephosphoPep_UniProtSeq_LUT[
+                        (dephospho_pep, SEQUENCE)
+                    ]
+                ):
+                    DephosphoPep_UniProtSeq_LUT[
+                        (dephospho_pep, SEQUENCE)
+                    ].append(sequence)
+                for phospho_pep in DephosphoPep_UniProtSeq_LUT[dephospho_pep]:
+                    if phospho_pep != phospho_pep:
+                        print(
+                            "phospho_pep:'%s' phospho_pep:'%s'"
+                            % (phospho_pep, phospho_pep)
+                        )
+                    if phospho_pep not in PhosphoPep_UniProtSeq_LUT:
+                        PhosphoPep_UniProtSeq_LUT[phospho_pep] = phospho_pep
+                        PhosphoPep_UniProtSeq_LUT[
+                            (phospho_pep, DEPHOSPHOPEP)
+                        ] = dephospho_pep
+                    r = list(
+                        zip(
+                            [s for s in UniProtSeqLUT[(sequence, UNIPROT_ID)]],
+                            [s for s in UniProtSeqLUT[(sequence, GENE_NAME)]],
+                            [
+                                s
+                                for s in UniProtSeqLUT[(sequence, DESCRIPTION)]
+                            ],
+                        )
+                    )
+                    # Sort by `UniProt_ID`
+                    #   ref: https://stackoverflow.com/a/4174955/15509512
+                    r = sorted(r, key=operator.itemgetter(0))
+                    # Get one tuple for each `phospho_pep`
+                    #   in DephosphoPep_UniProtSeq_LUT[dephospho_pep]
+                    for (upid, gn, desc) in r:
+                        # Append pseudo-tuple per UniProt_ID but only when it is not present
+                        if (
+                            upid
+                            not in DephosphoPep_UniProtSeq_LUT[
+                                (dephospho_pep, UNIPROT_ID)
+                            ]
+                        ):
+                            DephosphoPep_UniProtSeq_LUT[
+                                (dephospho_pep, UNIPROT_ID)
+                            ].append(upid)
+                            DephosphoPep_UniProtSeq_LUT[
+                                (dephospho_pep, DESCRIPTION)
+                            ].append(desc)
+                            DephosphoPep_UniProtSeq_LUT[
+                                (dephospho_pep, GENE_NAME)
+                            ].append(gn)
+
+        # Close SwissProt SQLite database; clean up local variables
+        conn.close()
+        # wipe local variables
+        phospho_pep = dephospho_pep = sequence = 0
+        upid = gn = desc = r = ""
+
+        # ----------- Get SwissProt data from SQLite database (finish) -----------
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f finished reading and decoding '%s' [0.4]"
+            % (end_time - start_time, upstream_map_filename_tab),
+            file=sys.stderr,
+        )
+
+        print(
+            "{:>10} unique upstream phosphopeptides tested".format(
+                str(len(upstream_map_p_peptide_list))
+            )
+        )
+
+        # Read in Upstream tabular file
+        # We are discarding the intensity data; so read it as text
+        upstream_data = pandas.read_table(
+            upstream_map_filename_tab, dtype="str", index_col=0
+        )
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f read Upstream Map from file [1g_1]"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        upstream_data.index = upstream_map_p_peptide_list
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f added index to Upstream Map [1g_2]"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # ########################################################################
+        # # trim upstream_data to include only the upstream map columns
+        # old_cols = upstream_data.columns.tolist()
+        # i = 0
+        # first_intensity = -1
+        # last_intensity = -1
+        # intensity_re = re.compile("Intensity.*")
+        # for col_name in old_cols:
+        #     m = intensity_re.match(col_name)
+        #     if m:
+        #         last_intensity = i
+        #         if first_intensity == -1:
+        #             first_intensity = i
+        #     i += 1
+        # # print('last intensity = %d' % last_intensity)
+        # col_PKCalpha = last_intensity + 2
+        #
+        # data_in_cols = [old_cols[0]] + old_cols[
+        #     first_intensity: last_intensity + 1
+        # ]
+        #
+        # if upstream_data.empty:
+        #     print("upstream_data is empty")
+        #     exit(0)
+        #
+        # data_in = upstream_data.copy(deep=True)[data_in_cols]
+        ########################################################################
+        # trim upstream_data to include only the upstream map columns
+        old_cols = upstream_data.columns.tolist()
+        i = 0
+        first_intensity = -1
+        last_intensity = -1
+        intensity_re = re.compile("Intensity.*")
+        for col_name in old_cols:
+            m = intensity_re.match(col_name)
+            if m:
+                last_intensity = i
+                if first_intensity == -1:
+                    first_intensity = i
+            i += 1
+        # print('last intensity = %d' % last_intensity)
+        col_PKCalpha = last_intensity + 2
+
+        data_in_cols = [old_cols[0]] + old_cols[
+            first_intensity - 1: last_intensity
+        ]
+        data_col_names = [old_cols[0]] + old_cols[
+            first_intensity: last_intensity + 1
+        ]
+
+        if upstream_data.empty:
+            print("upstream_data is empty")
+            exit(0)
+
+        data_in = upstream_data.copy(deep=True)[data_in_cols]
+        data_in.columns = data_col_names
+        print("data_in")
+        print(data_in)
+        ########################################################################
+
+        # Convert floating-point integers to int64 integers
+        #   ref: https://stackoverflow.com/a/68497603/15509512
+        data_in[list(data_in.columns[1:])] = (
+            data_in[list(data_in.columns[1:])]
+            .astype("float64")
+            .apply(np.int64)
+        )
+
+        # create another phosphopeptide column that will be used to join later;
+        #  MAY need to change depending on Phosphopeptide column position
+        # data_in[PHOSPHOPEPTIDE_MATCH] = data_in[data_in.columns.tolist()[0]]
+        data_in[PHOSPHOPEPTIDE_MATCH] = data_in.index
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f set data_in[PHOSPHOPEPTIDE_MATCH] [A]"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # Produce a dictionary of metadata for a single phosphopeptide.
+        #   This is a replacement of `UniProtInfo_subdict` in the original code.
+        def pseq_to_subdict(phospho_pep):
+            # Strip "p" markers from phosphopeptide to get the dephosphopeptide
+            dephospho_pep = re_phos.sub("", phospho_pep)
+
+            # Determine number of phosphoresidues in phosphopeptide
+            numps = len(phospho_pep) - len(dephospho_pep)
+
+            # Determine location(s) of phosphoresidue(s) in phosphopeptide
+            #   (used later for Phosphoresidue, Sequence7, and Sequence10)
+            ploc = []  # 0-based offsets in dephospho_pep of residues that followed a "p"
+            i = 0
+            p = phospho_pep
+            while i < numps:
+                ploc.append(p.find("p"))
+                p = p[: p.find("p")] + p[p.find("p") + 1:]
+                i += 1
+
+            # Establish nested dictionary
+            result = {}
+            result[SEQUENCE] = []
+            result[UNIPROT_ID] = []
+            result[DESCRIPTION] = []
+            result[GENE_NAME] = []
+            result[PHOSPHORESIDUE] = []
+            result[SEQUENCE7] = []
+            result[SEQUENCE10] = []
+
+            # Add stripped sequence to dictionary
+            result[SEQUENCE].append(dephospho_pep)
+
+            # Validate that the lookup tables cover this phosphopeptide;
+            #   each failed check below raises PreconditionError.
+            # Caller may elect to:
+            #     try:
+            #         ...
+            #     except PreconditionError as pe:
+            #         print("'{expression}': {message}".format(
+            #             expression=pe.expression,
+            #             message=pe.message,
+            #         ))
+            if phospho_pep not in PhosphoPep_UniProtSeq_LUT:
+                raise PreconditionError(
+                    phospho_pep,
+                    "no matching phosphopeptide found in PhosphoPep_UniProtSeq_LUT",
+                )
+            if dephospho_pep not in DephosphoPep_UniProtSeq_LUT:
+                raise PreconditionError(
+                    dephospho_pep,
+                    "dephosphorylated phosphopeptide not found in DephosphoPep_UniProtSeq_LUT",
+                )
+            if (
+                dephospho_pep != PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)]
+            ):
+                my_err_msg = "dephosphorylated phosphopeptide does not match "
+                my_err_msg += "PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = "
+                my_err_msg += PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)]
+                raise PreconditionError(dephospho_pep, my_err_msg)
+
+            result[SEQUENCE] = [dephospho_pep]
+            result[UNIPROT_ID] = DephosphoPep_UniProtSeq_LUT[
+                (dephospho_pep, UNIPROT_ID)
+            ]
+            result[DESCRIPTION] = DephosphoPep_UniProtSeq_LUT[
+                (dephospho_pep, DESCRIPTION)
+            ]
+            result[GENE_NAME] = DephosphoPep_UniProtSeq_LUT[
+                (dephospho_pep, GENE_NAME)
+            ]
+            if (dephospho_pep, SEQUENCE) not in DephosphoPep_UniProtSeq_LUT:
+                raise PreconditionError(
+                    dephospho_pep,
+                    "no matching phosphopeptide found in DephosphoPep_UniProtSeq_LUT",
+                )
+            UniProtSeqList = DephosphoPep_UniProtSeq_LUT[
+                (dephospho_pep, SEQUENCE)
+            ]
+            if len(UniProtSeqList) < 1:
+                print(
+                    "Skipping DephosphoPep_UniProtSeq_LUT[('%s',SEQUENCE)] because value has zero length"
+                    % dephospho_pep
+                )
+                # raise PreconditionError(
+                #     "DephosphoPep_UniProtSeq_LUT[('" + dephospho_pep + ",SEQUENCE)",
+                #      'value has zero length'
+                #      )
+            for UniProtSeq in UniProtSeqList:
+                i = 0
+                phosphoresidues = []
+                seq7s_set = set()
+                seq7s = []
+                seq10s_set = set()
+                seq10s = []
+                while i < len(ploc):
+                    start = UniProtSeq.find(dephospho_pep)
+                    # skip when dephospho_pep does not occur in this UniProt sequence
+                    if start < 0:
+                        i += 1
+                        continue
+                    psite = (
+                        start + ploc[i]
+                    )  # location of phosphoresidue on protein sequence
+
+                    # add Phosphoresidue
+                    phosphosite = "p" + str(UniProtSeq)[psite] + str(psite + 1)
+                    phosphoresidues.append(phosphosite)
+
+                    # Add Sequence7
+                    if psite < 7:  # phospho_pep at N terminus
+                        seq7 = str(UniProtSeq)[: psite + 8]
+                        if seq7[psite] == "S":  # if phosphoresidue is serine
+                            pres = "s"
+                        elif (
+                            seq7[psite] == "T"
+                        ):  # if phosphoresidue is threonine
+                            pres = "t"
+                        elif (
+                            seq7[psite] == "Y"
+                        ):  # if phosphoresidue is tyrosine
+                            pres = "y"
+                        else:  # if not pSTY
+                            pres = "?"
+                        seq7 = (
+                            seq7[:psite] + pres + seq7[psite + 1: psite + 8]
+                        )
+                        while (
+                            len(seq7) < 15
+                        ):  # add appropriate number of "_" to the front
+                            seq7 = "_" + seq7
+                    elif (
+                        len(UniProtSeq) - psite < 8
+                    ):  # phospho_pep at C terminus
+                        seq7 = str(UniProtSeq)[psite - 7:]
+                        if seq7[7] == "S":
+                            pres = "s"
+                        elif seq7[7] == "T":
+                            pres = "t"
+                        elif seq7[7] == "Y":
+                            pres = "y"
+                        else:
+                            pres = "?"
+                        seq7 = seq7[:7] + pres + seq7[8:]
+                        while (
+                            len(seq7) < 15
+                        ):  # add appropriate number of "_" to the back
+                            seq7 = seq7 + "_"
+                    else:
+                        seq7 = str(UniProtSeq)[psite - 7: psite + 8]
+                        pres = ""  # phosphoresidue
+                        if seq7[7] == "S":  # if phosphoresidue is serine
+                            pres = "s"
+                        elif seq7[7] == "T":  # if phosphoresidue is threonine
+                            pres = "t"
+                        elif seq7[7] == "Y":  # if phosphoresidue is tyrosine
+                            pres = "y"
+                        else:  # if not pSTY
+                            pres = "?"
+                        seq7 = seq7[:7] + pres + seq7[8:]
+                    if seq7 not in seq7s_set:
+                        seq7s.append(seq7)
+                        seq7s_set.add(seq7)
+
+                    # add Sequence10
+                    if psite < 10:  # phospho_pep at N terminus
+                        seq10 = (
+                            str(UniProtSeq)[:psite] + "p" + str(UniProtSeq)[psite: psite + 11]
+                        )
+                    elif (
+                        len(UniProtSeq) - psite < 11
+                    ):  # phospho_pep at C terminus
+                        seq10 = (
+                            str(UniProtSeq)[psite - 10: psite] + "p" + str(UniProtSeq)[psite:]
+                        )
+                    else:
+                        seq10 = str(UniProtSeq)[psite - 10: psite + 11]
+                        seq10 = seq10[:10] + "p" + seq10[10:]
+                    if seq10 not in seq10s_set:
+                        seq10s.append(seq10)
+                        seq10s_set.add(seq10)
+
+                    i += 1
+
+                result[PHOSPHORESIDUE].append(phosphoresidues)
+                result[SEQUENCE7].append(seq7s)
+                # result[SEQUENCE10] is a list of lists of strings
+                result[SEQUENCE10].append(seq10s)
+
+            r = list(
+                zip(
+                    result[UNIPROT_ID],
+                    result[GENE_NAME],
+                    result[DESCRIPTION],
+                    result[PHOSPHORESIDUE],
+                )
+            )
+            # Sort by `UniProt_ID`
+            #   ref: https://stackoverflow.com/a/4174955/15509512
+            s = sorted(r, key=operator.itemgetter(0))
+
+            result[UNIPROT_ID] = []
+            result[GENE_NAME] = []
+            result[DESCRIPTION] = []
+            result[PHOSPHORESIDUE] = []
+
+            for r in s:
+                result[UNIPROT_ID].append(r[0])
+                result[GENE_NAME].append(r[1])
+                result[DESCRIPTION].append(r[2])
+                result[PHOSPHORESIDUE].append(r[3])
+
+            # convert lists to strings in the dictionary
+            for key, value in result.items():
+                if key not in [PHOSPHORESIDUE, SEQUENCE7, SEQUENCE10]:
+                    result[key] = "; ".join(map(str, value))
+                elif key in [SEQUENCE10]:
+                    # result[SEQUENCE10] is a list of lists of strings
+                    joined_value = ""
+                    joined_set = set()
+                    sep = ""
+                    for valL in value:
+                        # valL is a list of strings
+                        for val in valL:
+                            # val is a string
+                            if val not in joined_set:
+                                joined_set.add(val)
+                                joined_value += sep + val
+                                sep = "; "
+                    # joined_value is a string
+                    result[key] = joined_value
+
+            newstring = "; ".join(
+                [", ".join(prez) for prez in result[PHOSPHORESIDUE]]
+            )
+            # #separate the isoforms in PHOSPHORESIDUE column with ";"
+            # oldstring = result[PHOSPHORESIDUE]
+            # oldlist = list(oldstring)
+            # newstring = ""
+            # i = 0
+            # for e in oldlist:
+            #     if e == ";":
+            #         if numps > 1:
+            #             if i%numps:
+            #                 newstring = newstring + ";"
+            #             else:
+            #                 newstring = newstring + ","
+            #         else:
+            #             newstring = newstring + ";"
+            #         i +=1
+            #     else:
+            #         newstring = newstring + e
+            result[PHOSPHORESIDUE] = newstring
+
+            # join distinct sequence7's with " | " (each e is one sequence7 string)
+            oldstring = result[SEQUENCE7]
+            oldlist = oldstring
+            newstring = ""
+            for ol in oldlist:
+                for e in ol:
+                    if e == ";":
+                        newstring = newstring + " |"
+                    elif len(newstring) > 0 and 1 > newstring.count(e):
+                        newstring = newstring + " | " + e
+                    elif 1 > newstring.count(e):
+                        newstring = newstring + e
+            result[SEQUENCE7] = newstring
+
+            return [phospho_pep, result]
+
+        # Construct list of [string, dictionary] lists
+        #   where the dictionary provides the SwissProt metadata
+        #   for a phosphopeptide
+        result_list = [
+            whine(pseq_to_subdict, psequence)
+            for psequence in data_in[PHOSPHOPEPTIDE_MATCH]
+        ]
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f added SwissProt annotations to phosphopeptides [B]"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # Construct dictionary from list of lists
+        #   ref: https://www.8bitavenue.com/how-to-convert-list-of-lists-to-dictionary-in-python/
+        UniProt_Info = {
+            result[0]: result[1]
+            for result in result_list
+            if result is not None
+        }
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f create dictionary mapping phosphopeptide to metadata dictionary [C]"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # cosmetic: add N_A to phosphopeptide rows with no hits
+        p_peptide_list = []
+        for key in UniProt_Info:
+            p_peptide_list.append(key)
+            for nestedKey in UniProt_Info[key]:
+                if UniProt_Info[key][nestedKey] == "":
+                    UniProt_Info[key][nestedKey] = N_A
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f performed cosmetic clean-up [D]" % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # convert UniProt_Info dictionary to dataframe
+        uniprot_df = pandas.DataFrame.transpose(
+            pandas.DataFrame.from_dict(UniProt_Info)
+        )
+
+        # reorder columns to match expected output file
+        uniprot_df[
+            PHOSPHOPEPTIDE
+        ] = uniprot_df.index  # make index a column too
+
+        cols = uniprot_df.columns.tolist()
+        # cols = [cols[-1]]+cols[4:6]+[cols[1]]+[cols[2]]+[cols[6]]+[cols[0]]
+        # uniprot_df = uniprot_df[cols]
+        uniprot_df = uniprot_df[
+            [
+                PHOSPHOPEPTIDE,
+                SEQUENCE10,
+                SEQUENCE7,
+                GENE_NAME,
+                PHOSPHORESIDUE,
+                UNIPROT_ID,
+                DESCRIPTION,
+            ]
+        ]
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f reordered columns to match expected output file [1]"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # concat to split then groupby to collapse
+        seq7_df = pandas.concat(
+            [
+                pandas.Series(row[PHOSPHOPEPTIDE], row[SEQUENCE7].split(" | "))
+                for _, row in uniprot_df.iterrows()
+            ]
+        ).reset_index()
+        seq7_df.columns = [SEQUENCE7, PHOSPHOPEPTIDE]
+
+        # --- -------------- begin read PSP_Regulatory_sites ---------------------------------
+        # read in PhosphoSitePlus Regulatory Sites dataset
+        # ----------- Get PhosphoSitePlus Regulatory Sites data from SQLite database (start) -----------
+        conn = sql.connect(uniprot_sqlite)
+        regsites_df = pandas.read_sql_query(PSP_REGSITE_SQL, conn)
+        # Close SwissProt SQLite database
+        conn.close()
+        # ... -------------- end read PSP_Regulatory_sites ------------------------------------
+
+        # keep only the entries for the requested species (when one is specified)
+        if len(species) > 0:
+            print(
+                'Limit PhosphoSitesPlus records to species "' + species + '"'
+            )
+            regsites_df = regsites_df[regsites_df.ORGANISM == species]
+
+        # merge the seq7 df with the regsites df based off of the sequence7
+        merge_df = seq7_df.merge(
+            regsites_df,
+            left_on=SEQUENCE7,
+            right_on=SITE_PLUSMINUS_7AA_SQL,
+            how="left",
+        )
+
+        # after merging df, select only the columns of interest;
+        #   note that PROTEIN is absent here
+        merge_df = merge_df[
+            [
+                PHOSPHOPEPTIDE,
+                SEQUENCE7,
+                ON_FUNCTION,
+                ON_PROCESS,
+                ON_PROT_INTERACT,
+                ON_OTHER_INTERACT,
+                ON_NOTES,
+            ]
+        ]
+        # combine column values of interest
+        #   into one FUNCTION_PHOSPHORESIDUE column
+        merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[ON_FUNCTION].str.cat(
+            merge_df[ON_PROCESS], sep="; ", na_rep=""
+        )
+        merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[
+            FUNCTION_PHOSPHORESIDUE
+        ].str.cat(merge_df[ON_PROT_INTERACT], sep="; ", na_rep="")
+        merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[
+            FUNCTION_PHOSPHORESIDUE
+        ].str.cat(merge_df[ON_OTHER_INTERACT], sep="; ", na_rep="")
+        merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[
+            FUNCTION_PHOSPHORESIDUE
+        ].str.cat(merge_df[ON_NOTES], sep="; ", na_rep="")
+
+        # remove the columns that were combined
+        merge_df = merge_df[
+            [PHOSPHOPEPTIDE, SEQUENCE7, FUNCTION_PHOSPHORESIDUE]
+        ]
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f merge regsite metadata [1a]" % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # cosmetic changes to Function Phosphoresidue column
+        fp_series = pandas.Series(merge_df[FUNCTION_PHOSPHORESIDUE])
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f more cosmetic changes [1b]" % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        i = 0
+        while i < len(fp_series):
+            # remove the extra ";" so that it looks more professional
+            if fp_series[i] == "; ; ; ; ":  # remove ; from empty hits
+                fp_series[i] = ""
+            while fp_series[i].endswith("; "):  # remove ; from the ends
+                fp_series[i] = fp_series[i][:-2]
+            while fp_series[i].startswith("; "):  # remove ; from the beginning
+                fp_series[i] = fp_series[i][2:]
+            fp_series[i] = fp_series[i].replace("; ; ; ; ", "; ")
+            fp_series[i] = fp_series[i].replace("; ; ; ", "; ")
+            fp_series[i] = fp_series[i].replace("; ; ", "; ")
+
+            # turn blanks into N_A to signify the info was searched for but cannot be found
+            if fp_series[i] == "":
+                fp_series[i] = N_A
+
+            i += 1
+        merge_df[FUNCTION_PHOSPHORESIDUE] = fp_series
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f cleaned up semicolons [1c]" % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # merge uniprot df with merge df
+        uniprot_regsites_merged_df = uniprot_df.merge(
+            merge_df,
+            left_on=PHOSPHOPEPTIDE,
+            right_on=PHOSPHOPEPTIDE,
+            how="left",
+        )
+
+        # collapse the merged df
+        uniprot_regsites_collapsed_df = pandas.DataFrame(
+            uniprot_regsites_merged_df.groupby(PHOSPHOPEPTIDE)[
+                FUNCTION_PHOSPHORESIDUE
+            ].apply(lambda x: ppep_join(x))
+        )
+        # .apply(lambda x: "%s" % ' | '.join(x)))
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f collapsed pandas dataframe [1d]" % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        uniprot_regsites_collapsed_df[
+            PHOSPHOPEPTIDE
+        ] = (
+            uniprot_regsites_collapsed_df.index
+        )  # add df index as its own column
+
+        # rename columns
+        uniprot_regsites_collapsed_df.columns = [
+            FUNCTION_PHOSPHORESIDUE,
+            "ppp",
+        ]
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f selected columns to be merged to uniprot_df [1e]"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # add columns based on Sequence7 matching site_+/-7_AA
+        uniprot_regsite_df = pandas.merge(
+            left=uniprot_df,
+            right=uniprot_regsites_collapsed_df,
+            how="left",
+            left_on=PHOSPHOPEPTIDE,
+            right_on="ppp",
+        )
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f added columns based on Sequence7 matching site_+/-7_AA [1f]"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        data_in.rename(
+            {"Protein description": PHOSPHOPEPTIDE},
+            axis="columns",
+            inplace=True,
+        )
+
+        # data_in.sort_values(PHOSPHOPEPTIDE_MATCH, inplace=True, kind='mergesort')
+        res2 = sorted(
+            data_in[PHOSPHOPEPTIDE_MATCH].tolist(), key=lambda s: s.casefold()
+        )
+        data_in = data_in.loc[res2]
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f sorting time [1f]" % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        print("old_cols[:col_PKCalpha]")
+        print(old_cols[:col_PKCalpha])
+        cols = [old_cols[0]] + old_cols[col_PKCalpha - 1:]
+        upstream_data = upstream_data[cols]
+        print("upstream_data.columns")
+        print(upstream_data.columns)
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f refactored columns for Upstream Map [1g]"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # #rename upstream columns in new list
+        # new_cols = []
+        # for name in cols:
+        #     if "_NetworKIN" in name:
+        #         name = name.split("_")[0]
+        #     if " motif" in name:
+        #         name = name.split(" motif")[0]
+        #     if " sequence " in name:
+        #         name = name.split(" sequence")[0]
+        #     if "_Phosida" in name:
+        #         name = name.split("_")[0]
+        #     if "_PhosphoSite" in name:
+        #         name = name.split("_")[0]
+        #     new_cols.append(name)
+
+        # rename upstream columns in new list
+        def col_rename(name):
+            if "_NetworKIN" in name:
+                name = name.split("_")[0]
+            if " motif" in name:
+                name = name.split(" motif")[0]
+            if " sequence " in name:
+                name = name.split(" sequence")[0]
+            if "_Phosida" in name:
+                name = name.split("_")[0]
+            if "_PhosphoSite" in name:
+                name = name.split("_")[0]
+            return name
+
+        new_cols = [col_rename(col) for col in cols]
+        upstream_data.columns = new_cols
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f renamed columns for Upstream Map [1h_1]"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # Create upstream_data_cast as a copy of upstream_data
+        #   but with first column substituted by the phosphopeptide sequence
+        upstream_data_cast = upstream_data.copy()
+        new_cols_cast = new_cols
+        new_cols_cast[0] = "p_peptide"
+        upstream_data_cast.columns = new_cols_cast
+        upstream_data_cast["p_peptide"] = upstream_data.index
+
+        # --- -------------- begin read upstream_data_melt ------------------------------------
+        # ----------- Get melted kinase mapping data from SQLite database (start) -----------
+        conn = sql.connect(uniprot_sqlite)
+        upstream_data_melt_df = pandas.read_sql_query(PPEP_MELT_SQL, conn)
+        # Close SwissProt SQLite database
+        conn.close()
+        upstream_data_melt = upstream_data_melt_df.copy()
+        upstream_data_melt.columns = ["p_peptide", "characterization", "X"]
+        upstream_data_melt["characterization"] = [
+            col_rename(s) for s in upstream_data_melt["characterization"]
+        ]
+
+        print(
+            "%0.6f upstream_data_melt_df initially has %d rows"
+            % (end_time - start_time, len(upstream_data_melt.axes[0])),
+            file=sys.stderr,
+        )
+        # ref: https://stackoverflow.com/a/27360130/15509512
+        #      e.g. df.drop(df[df.score < 50].index, inplace=True)
+        upstream_data_melt.drop(
+            upstream_data_melt[upstream_data_melt.X != "X"].index, inplace=True
+        )
+        print(
+            "%0.6f upstream_data_melt_df pre-dedup has %d rows"
+            % (end_time - start_time, len(upstream_data_melt.axes[0])),
+            file=sys.stderr,
+        )
+        # ----------- Get melted kinase mapping data from SQLite database (finish) -----------
+        # ... -------------- end read upstream_data_melt --------------------------------------
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f melted and minimized Upstream Map dataframe [1h_2]"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+        # ... end read upstream_data_melt
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f indexed melted Upstream Map [1h_2a]"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        upstream_delta_melt_LoL = upstream_data_melt.values.tolist()
+
+        melt_dict = {}
+        for key in upstream_map_p_peptide_list:
+            melt_dict[key] = []
+
+        for el in upstream_delta_melt_LoL:
+            (p_peptide, characterization, X) = tuple(el)
+            if p_peptide in melt_dict:
+                melt_dict[p_peptide].append(characterization)
+            else:
+                exit(
+                    'Phosphopeptide %s not found in ppep_mapping_db: "phopsphopeptides" and "ppep_mapping_db" must both originate from the same run of mqppep_kinase_mapping'
+                    % (p_peptide)
+                )
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f appended peptide characterizations [1h_2b]"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # for key in upstream_map_p_peptide_list:
+        #     melt_dict[key] = ' | '.join(melt_dict[key])
+
+        for key in upstream_map_p_peptide_list:
+            melt_dict[key] = melt_join(melt_dict[key])
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f concatenated multiple characterizations [1h_2c]"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # map_dict is a dictionary of dictionaries
+        map_dict = {}
+        for key in upstream_map_p_peptide_list:
+            map_dict[key] = {}
+            map_dict[key][PUTATIVE_UPSTREAM_DOMAINS] = melt_dict[key]
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f instantiated map dictionary [2]" % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # convert map_dict to dataframe
+        map_df = pandas.DataFrame.transpose(
+            pandas.DataFrame.from_dict(map_dict)
+        )
+        map_df["p-peptide"] = map_df.index  # make index a column too
+        cols_map_df = map_df.columns.tolist()
+        cols_map_df = [cols_map_df[1]] + [cols_map_df[0]]
+        map_df = map_df[cols_map_df]
+
+        # join map_df to uniprot_regsite_df
+        output_df = uniprot_regsite_df.merge(
+            map_df, how="left", left_on=PHOSPHOPEPTIDE, right_on="p-peptide"
+        )
+
+        output_df = output_df[
+            [
+                PHOSPHOPEPTIDE,
+                SEQUENCE10,
+                SEQUENCE7,
+                GENE_NAME,
+                PHOSPHORESIDUE,
+                UNIPROT_ID,
+                DESCRIPTION,
+                FUNCTION_PHOSPHORESIDUE,
+                PUTATIVE_UPSTREAM_DOMAINS,
+            ]
+        ]
+
+        # cols_output_prelim = output_df.columns.tolist()
+        #
+        # print("cols_output_prelim")
+        # print(cols_output_prelim)
+        #
+        # cols_output = cols_output_prelim[:8]+[cols_output_prelim[9]]+[cols_output_prelim[10]]
+        #
+        # print("cols_output with p-peptide")
+        # print(cols_output)
+        #
+        # cols_output = [col for col in cols_output if not col == "p-peptide"]
+        #
+        # print("cols_output")
+        # print(cols_output)
+        #
+        # output_df = output_df[cols_output]
+
+        # join output_df back to quantitative columns in data_in df
+        quant_cols = data_in.columns.tolist()
+        quant_cols = quant_cols[1:]
+        quant_data = data_in[quant_cols]
+
+        # ----------- Write merge/filter metadata to SQLite database (start) -----------
+        # Open output SQLite database
+        conn = sql.connect(output_sqlite)
+        cur = conn.cursor()
+
+        cur.executescript(MRGFLTR_DDL)
+
+        cur.execute(
+            CITATION_INSERT_STMT,
+            ("mrgfltr_metadata_view", CITATION_INSERT_PSP),
+        )
+        cur.execute(
+            CITATION_INSERT_STMT, ("mrgfltr_metadata", CITATION_INSERT_PSP)
+        )
+        cur.execute(
+            CITATION_INSERT_STMT,
+            ("mrgfltr_metadata_view", CITATION_INSERT_PSP_REF),
+        )
+        cur.execute(
+            CITATION_INSERT_STMT, ("mrgfltr_metadata", CITATION_INSERT_PSP_REF)
+        )
+
+        # Read ppep-to-sequence LUT
+        ppep_lut_df = pandas.read_sql_query(PPEP_ID_SQL, conn)
+        # write only metadata for merged/filtered records to SQLite
+        mrgfltr_metadata_df = output_df.copy()
+        # replace phosphopeptide seq with ppep.id
+        mrgfltr_metadata_df = ppep_lut_df.merge(
+            mrgfltr_metadata_df,
+            left_on="ppep_seq",
+            right_on=PHOSPHOPEPTIDE,
+            how="inner",
+        )
+        mrgfltr_metadata_df.drop(
+            columns=[PHOSPHOPEPTIDE, "ppep_seq"], inplace=True
+        )
+        # rename columns
+        mrgfltr_metadata_df.columns = MRGFLTR_METADATA_COLUMNS
+        mrgfltr_metadata_df.to_sql(
+            "mrgfltr_metadata",
+            con=conn,
+            if_exists="append",
+            index=False,
+            method="multi",
+        )
+
+        # Close output SQLite database
+        conn.close()
+        # ----------- Write merge/filter metadata to SQLite database (finish) -----------
+
+        output_df = output_df.merge(
+            quant_data,
+            how="right",
+            left_on=PHOSPHOPEPTIDE,
+            right_on=PHOSPHOPEPTIDE_MATCH,
+        )
+        output_cols = output_df.columns.tolist()
+        output_cols = output_cols[:-1]
+        output_df = output_df[output_cols]
+
+        # cosmetic changes to Upstream column
+        output_df[PUTATIVE_UPSTREAM_DOMAINS] = output_df[
+            PUTATIVE_UPSTREAM_DOMAINS
+        ].fillna(
+            ""
+        )  # fill the NaN with "" for those Phosphopeptides that got a "WARNING: Failed match for " in the upstream mapping
+        us_series = pandas.Series(output_df[PUTATIVE_UPSTREAM_DOMAINS])
+        i = 0
+        while i < len(us_series):
+            # turn blanks into N_A to signify the info was searched for but cannot be found
+            if us_series[i] == "":
+                us_series[i] = N_A
+            i += 1
+        output_df[PUTATIVE_UPSTREAM_DOMAINS] = us_series
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f establisheed output [3]" % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        (output_rows, output_cols) = output_df.shape
+
+        output_df = output_df.convert_dtypes(convert_integer=True)
+
+        # Output onto Final CSV file
+        output_df.to_csv(output_filename_csv, index=False)
+        output_df.to_csv(
+            output_filename_tab, quoting=None, sep="\t", index=False
+        )
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f wrote output [4]" % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        print(
+            "{:>10} phosphopeptides written to output".format(str(output_rows))
+        )
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f seconds of non-system CPU time were consumed"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # Rev. 7/1/2016
+        # Rev. 7/3/2016 : fill NaN in Upstream column to replace to N/A's
+        # Rev. 7/3/2016:  renamed Upstream column to PUTATIVE_UPSTREAM_DOMAINS
+        # Rev. 12/2/2021: Converted to Python from ipynb; use fast Aho-Corasick searching; \
+        #                read from SwissProt SQLite database
+        # Rev. 12/9/2021: Transfer code to Galaxy tool wrapper
+
+        #
+        # copied from Excel Output Script.ipynb END #
+        #
+
+    try:
+        catch(
+            mqpep_getswissprot,
+        )
+        exit(0)
+    except Exception as e:
+        exit("Internal error running mqpep_getswissprot(): %s" % (e))
+
+
+if __name__ == "__main__":
+    __main__()
diff -r 000000000000 -r ba62d93a9ef5 mqppep_preproc.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mqppep_preproc.xml	Mon Jul 11 19:21:19 2022 +0000
@@ -0,0 +1,540 @@
+<tool
+  id="mqppep_preproc"
+  name="MaxQuant Phosphopeptide Preprocessing"
+  version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@"
+  profile="21.05"
+  >
+    <description>
+        Prep phosphoproteomic MaxQuant output for statistical analysis.
+    </description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <edam_topics>
+        <edam_topic>topic_0121</edam_topic><!-- Proteomics -->
+        <edam_topic>topic_3520</edam_topic><!-- Proteomics experiment-->
+    </edam_topics>
+    <edam_operations>
+        <edam_operation>operation_0338</edam_operation><!-- Sequence database search -->
+        <edam_operation>operation_0361</edam_operation><!-- Sequence annotation -->
+        <edam_operation>operation_3434</edam_operation><!-- Conversion -->
+        <edam_operation>operation_3436</edam_operation><!-- Aggregation -->
+    </edam_operations>
+    <expand macro="requirements"/>
+    <command detect_errors="exit_code"><![CDATA[
+      echo '--- localization-filter step:'
+      && (
+        Rscript '$__tool_directory__/MaxQuantProcessingScript.R'
+        -i '$phosphoSites'
+        #if $pst_py_selector == "y"
+            --enriched Y
+        #else
+            --enriched ST
+        #end if
+        --phosphoCol '$phosphocol_script'
+        --startCol '$startcol_script'
+        --intervalCol $intervalCol
+        --localProbCutoff $localProbCutoff
+        --collapse_func $collapse_func
+        -o '$phosphoPepIntensities'
+        --locProbCutoffGraph $locProbCutoffGraph
+        --enrichGraph $enrichGraph
+        --locProbCutoffGraph_svg $locProbCutoffGraph_svg
+        --enrichGraph_svg $enrichGraph_svg
+        --filtered_data $filteredData_tabular
+        --quant_data $quantData_tabular
+      ) &&
+      echo '... end localization-filter.'
+      && (
+        echo '--- kinase-mapping step:'
+      ) && (
+        perl '$__tool_directory__/PhosphoPeptide_Upstream_Kinase_Mapping.pl'
+        -i '$phosphoPepIntensities'
+        -f '$protein_fasta'
+        -n '$networkin'
+        -m '$p_sty_motifs'
+        -p '$psp_kinase_substrate'
+        -r '$psp_regulatory_sites'
+        #if $pst_py_selector == "y"
+            -P y
+        #else
+            -P sty
+        #end if
+        -F $merge_function
+        -o '$mapped_phophopeptides'
+        -O '$melted_phophopeptide_map'
+        -D '$mqppep_output_sqlite'
+        -s '$species'
+      ) &&
+        echo '... end kinase-mapping.'
+      &&
+        echo '--- merge-and-filter step:'
+      && (
+        python '$__tool_directory__/mqppep_mrgfltr.py'
+        --phosphopeptides='$mapped_phophopeptides'
+        --ppep_mapping_db='$mqppep_output_sqlite'
+        --species='$species'
+        --mrgfltr_tab='$preproc_tab'
+        --mrgfltr_csv='$preproc_csv'
+        --mrgfltr_sqlite='$preproc_sqlite'
+      )
+      && echo '... end merge-and-filter.'
+    ]]></command>
+    <configfiles>
+        <configfile name="phosphocol_script">$phosphoCol
+        </configfile>
+        <configfile name="startcol_script">$startCol
+        </configfile>
+    </configfiles>
+    <inputs>
+        <param name="phosphoSites" type="data" format="tabular"
+               label="Phospho (STY)Sites.txt"
+               help="Tabular 'Phospho (STY)Sites.txt' produced by MaxQuant"
+               />
+        <param name="phosphoCol" type="text"
+               label="pattern for column 'Number of Phospho (STY)'"
+               help="PERL-compatible regular expression matching header of column having number of 'Phospho (STY)'"
+               value="^Number of Phospho [(]STY[)]$">
+            <sanitizer>
+                <valid initial="string.printable">
+                    <remove value="&apos;"/>
+                </valid>
+            </sanitizer>
+        </param>
+        <param name="startCol" type="text"
+               label="pattern for first column of intensity values"
+               help="PERL-compatible regular expression matching column header having first sample intensity"
+               value="^Intensity[^_]">
+            <sanitizer>
+                <valid initial="string.printable">
+                    <remove value="&apos;"/>
+                </valid>
+            </sanitizer>
+        </param>
+        <param name="intervalCol" type="integer" value="1" min="1"
+               label="Interval between the intensity column of samples"
+               help="E.g., 1 if subsequent column is next sample; 2 if next sample is two columns away, etc."/>
+        <param name="pst_py_selector" type="select"
+            label="Phosphopeptide enrichment type"
+            help="Were samples enriched for pS and pT, or were they enriched for pY instead?"
+            >
+            <option value="st" selected="true">pST</option>
+            <option value="y">pY</option>
+        </param>
+        <param name="collapse_func" type="select"
+               label="Intensity merge function"
+               help="When a peptide is multiply phosphorylated, how should intensities be merged? [default: sum]"
+               >
+            <option value="sum" selected="true">sum</option>
+            <option value="mean">average</option>
+        </param>
+        <param name="localProbCutoff" type="float" value="0.75" min="0" max="1.0"
+               label="Localization Probability Cutoff"
+               help="See help below for an explanation."
+               />
+        <param name="merge_function" type="select" label="intensity merge-function"
+               help="Specifies how intensities for identical phosphosites should be merged">
+            <option value="sum" selected="true">sum</option>
+            <option value="average">average</option>
+        </param>
+        <param name="protein_fasta" type="data" format="fasta" label="UniProtKB/SwissProt FASTA database"
+               help="Sequence database; supply the same FASTA file as you supplied to MaxQuant"
+               />
+        <param name="networkin" type="data" format="tabular" label="NetworKIN file"
+            help="NetworKIN file; see help section below"/>
+        <param name="p_sty_motifs" type="data" format="tabular" label="pSTY_Motifs file"
+            help="pS/pT/pY phosphorylation site motifs; see help section below"/>
+        <param name="psp_kinase_substrate" type="data" format="tabular" label="PSP_Kinase_Substrate_Dataset"
+            help="'Kinase-substrate dataset'; see help section below"/>
+        <param name="psp_regulatory_sites" type="data" format="tabular" label="PSP_Regulatory_sites"
+            help="'Regulatory sites'; see help section below"/>
+        <param name="species"
+               type="text"
+               value = "human"
+               label="filter to limit PhosphoSitePlus records to indicated species"
+               help="(field may be empty) [default: human].  If you supply this parameter, use the species identifier seen as a suffix in UniProtKB"
+               />
+    </inputs>
+    <outputs>
+      <!-- localization filter -->
+        <data name="phosphoPepIntensities"  format="tabular" label="${phosphoSites.name}.ppep_intensities" />
+        <data name="enrichGraph"            format="pdf"     label="${phosphoSites.name}.enrichment.pdf" />
+        <data name="locProbCutoffGraph"     format="pdf"     label="${phosphoSites.name}.locProbCutoff.pdf" />
+        <data name="enrichGraph_svg"        format="svg"     label="${phosphoSites.name}.enrichment.svg" />
+        <data name="locProbCutoffGraph_svg" format="svg"     label="${phosphoSites.name}.locProbCutoff.svg" />
+        <data name="filteredData_tabular"   format="tabular" label="${phosphoSites.name}.filteredData" />
+        <data name="quantData_tabular"      format="tabular" label="${phosphoSites.name}.quantData" />
+      <!-- upstream kinase mapping -->
+        <data name="mapped_phophopeptides" format="tabular" label="${phosphoSites.name}.ppep_intensities.ppep_map"/>
+        <data name="melted_phophopeptide_map" format="tabular" label="${phosphoSites.name}.ppep_intensities.melted"/>
+        <data name="mqppep_output_sqlite" format="sqlite" label="${phosphoSites.name}.ppep_intensities.ppep_mapping_sqlite"/>
+      <!-- merge and filter -->
+        <data name="preproc_tab"    format="tabular" label="${phosphoSites.name}.ppep_intensities.ppep_map.preproc_tab" />
+        <data name="preproc_csv"    format="csv"     label="${phosphoSites.name}.ppep_intensities.ppep_map.preproc_csv"  />
+        <data name="preproc_sqlite" format="sqlite"  label="${phosphoSites.name}.ppep_intensities.ppep_map.preproc_sqlite"  />
+    </outputs>
+    <tests>
+        <test>
+            <param name="phosphoSites" ftype="tabular" value="test_input_for_preproc.tabular" />
+            <param name="protein_fasta" ftype="fasta" value="test_swissprot.fasta" />
+            <param name="networkin" ftype="tabular" value="test_networkin.tabular" />
+            <param name="p_sty_motifs" ftype="tabular" value="pSTY_motifs.tabular" />
+            <param name="psp_kinase_substrate" ftype="tabular" value="test_kinase_substrate.tabular" />
+            <param name="psp_regulatory_sites" ftype="tabular" value="test_regulatory_sites.tabular" />
+            <param name="pst_py_selector" value="st"/>
+            <param name="merge_function"  value="sum"/>
+
+            <param name="phosphoCol" value="^Number of Phospho [(][STY][STY]*[)]$"/>
+            <param name="startCol" value="^Intensity[^_]"/>
+            <param name="intervalCol" value="1"/>
+            <param name="collapse_func" value="sum"/>
+            <param name="localProbCutoff" value="0.75"/>
+            <param name="species" value="human"/>
+
+            <output name="phosphoPepIntensities">
+                <assert_contents>
+                    <has_text text="Phosphopeptide" />
+                    <has_line_matching expression="AAAITDMADLEELSRLpSPLPPGpSPGSAAR.5416400.7101800.385280000.208060000.41426000.352400000" />
+                    <has_line_matching expression="pSQKQEEENPAEETGEEK.0.0.8765300.0.2355900.14706000" />
+                </assert_contents>
+            </output>
+
+            <output name="preproc_tab">
+                <assert_contents>
+                    <has_text text="SSRP1_HUMAN FACT complex subunit SSRP1" />
+                    <has_text text="AEBP2_HUMAN Isoform 2 of Zinc finger protein AEBP2" />
+                    <has_text text="molecular association, regulation" />
+                    <has_text text="cell cycle regulation" />
+                    <has_text text="PPP2CA(INDUCES)" />
+                    <has_text text="SNCA(DISRUPTS)" />
+                    <has_text text="CDK7" />
+                    <has_text text="CK1alpha" />
+                    <has_text text="CK2alpha" />
+                    <has_text text="DNAPK" />
+                    <has_text text="HIPK2" />
+                    <has_text text="IKKalpha" />
+                    <has_text text="PKCalpha" />
+                    <has_text text="PKCbeta" />
+                    <has_text text="PKC" />
+                    <has_text text="CK2a2" />
+                    <has_text text="CK2alpha" />
+                    <has_text text="Csnk2a1" />
+                </assert_contents>
+            </output>
+
+            <output name="melted_phophopeptide_map">
+                <assert_contents>
+                    <has_text text="CDK7" />
+                    <has_text text="CK1alpha" />
+                    <has_text text="CK2alpha" />
+                    <has_text text="DNAPK" />
+                    <has_text text="HIPK2" />
+                    <has_text text="IKKalpha" />
+                    <has_text text="PKCalpha" />
+                    <has_text text="PKCbeta" />
+                    <has_text text="PKC" />
+                    <has_text text="CK2a2" />
+                    <has_text text="CK2alpha" />
+                    <has_text text="Csnk2a1" />
+                </assert_contents>
+            </output>
+        </test>
+        <test>
+            <param name="phosphoSites" ftype="tabular" value="test_input_for_preproc.tabular" />
+            <param name="protein_fasta" ftype="fasta" value="test_swissprot.fasta" />
+            <param name="networkin" ftype="tabular" value="test_networkin.tabular" />
+            <param name="p_sty_motifs" ftype="tabular" value="pSTY_motifs.tabular" />
+            <param name="psp_kinase_substrate" ftype="tabular" value="test_kinase_substrate.tabular" />
+            <param name="psp_regulatory_sites" ftype="tabular" value="test_regulatory_sites.tabular" />
+            <param name="pst_py_selector" value="y"/>
+            <param name="merge_function"  value="sum"/>
+
+            <param name="phosphoCol" value="^Number of Phospho [(][STY][STY]*[)]$"/>
+            <param name="startCol" value="^Intensity[^_]"/>
+            <param name="intervalCol" value="1"/>
+            <param name="collapse_func" value="sum"/>
+            <param name="localProbCutoff" value="0.75"/>
+            <param name="species" value="human"/>
+
+            <output name="phosphoPepIntensities">
+                <assert_contents>
+                    <has_text text="Phosphopeptide" />
+                    <has_text text="pTYVDPFTpYEDPNQAVR" />
+                </assert_contents>
+            </output>
+
+            <output name="preproc_tab">
+                <assert_contents>
+                    <has_text text="pTYVDPFTpYEDPNQAVR" />
+                    <has_text text="EEKHLNQGVRpTYVDPFTYEDP" />
+                    <has_text text="GVRTYVDPFTpYEDPNQAVREF" />
+                    <has_text text="HLNQGVRtYVDPFTY" />
+                    <has_text text="TYVDPFTyEDPNQAV" />
+                    <has_text text="EPHA4" />
+                    <has_text text="pT595, pY602" />
+                    <has_text text="pT544, pY551" />
+                    <has_text text="P54764;" />
+                    <has_text text="P54764-2" />
+                </assert_contents>
+            </output>
+
+            <output name="melted_phophopeptide_map">
+                <assert_contents>
+                    <has_text text="EphA6" />
+                    <has_text text="EPHA4" />
+                    <has_text text="EphA4" />
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help><![CDATA[
+=========================================================
+Phosphoproteomic Enrichment Pipeline Preprocessing Steps
+=========================================================
+
+**Overview**
+
+Prior to statistical analysis, it is necessary to perform
+three steps to transform the MaxQuant output
+for phosphoproteome-enriched samples.
+
+**Workflow position**
+
+``upstream tool``
+      The input data file for this tool is the ``Phospho (STY)Sites.txt`` file that is produced:
+
+      - by the Galaxy "MaxQuant" (``maxquant``) tool
+      - or by the Galaxy "Maxquant (using mqpar.xml)" (``maxquant_mqpar``) tool
+      - or by the desktop version of MaxQuant.
+
+``downstream tool``
+  The "MaxQuant Phosphopeptide ANOVA" tool (``mqppep_anova``) consumes the ``merged/filtered`` output file ``preproc_tab`` that this tool produces.
+
+======================================================================
+Phosphoproteomic Enrichment Pipeline Localization-Probability Cut-Off
+======================================================================
+
+This step applies a "localization-probability cut-off" to each phosphopeptide.
+Higher values may reduce the number of peptides in the output.
+The default value of 0.75 reflects the text of [Cheng 2018]:
+
+    "For phosphopeptide identification, a localization probability cutoff is applied. This filter is performed to select for phosphopeptides with a high confidence (i.e., greater than 0.75) in phosphoresidue identification [Hogrebe 2018; Olsen 2006]. In other words, the summed probability of all other residues that could potentially contain the phospho-group is less than 0.25.  This cutoff could be raised to increase the stringency of the phosphopeptide selection. In regard to the number of identifications, the expected number of pY peptides is in the hundreds, while the expected number of pST peptides is in the high thousands. These values reflect previously observed phosphoproteome distribution where about 2%, 12%, and 86% of the phosphosites are pY, pT, and pS, respectively [Olsen 2006]."
+
+This tool wraps an R script, written by Larry Cheng, that performs the following (in order):
+
+1. Remove contaminant and reverse sequence rows
+2. Filters rows based on localization probability
+3. Extract the quantitative data
+4. Inserts a "p" before the phosphorylated residue(s) in each peptide sequence
+5. Merges (aggregating by "sum" or "average") multiply-phosphorylated peptides
+6. Filters output phosphopeptides based on enrichment
+7. Produces an output file (in tabular format) that contains the phosphopeptide (first column) and its (possibly merged) mass spectral intensity for each sample.
+
+Note that the "ProTeomiX Quality Control Report"
+[Bielow 2016] (available at `https://github.com/cbielow/PTXQC/
+<https://github.com/cbielow/PTXQC/>`_) is run by the Galaxy wrappers for MaxQuant,
+so it is omitted here even though it was included in Larry Cheng's original script.
+
+
+**Input dataset**
+
+``phosphoSites``
+    This is the ``Phospho (STY)Sites.txt`` file produced by MaxQuant.
+    If you use the desktop version of MaxQuant, you will find this file in the ``txt`` folder.
+
+**Output datasets**
+
+``ppep_intensities``
+  Data table (in tabular format) presenting, for each sample, the mass-spectral intensity of each phosphopeptide having localization probability greater than the cutoff.
+``enrichment.pdf``
+  Graph (in PDF format) presenting non-zero proportions of pS, pT, and pY among the phosphosites; note that a phosphopeptide may have multiple phosphosites.
+``locProbCutoff.pdf``
+  Graph (in PDF format) contrasting proportion of phosphopeptides above the localization probability cutoff with the proportion below.
+``enrichment.svg``
+  Enrichment graph (in downloadable "scalable vector graphics" format) for incorporation into documents.
+``locProbCutoff.svg``
+  Localization probability cutoff graph (in downloadable "scalable vector graphics" format) for incorporation into documents.
+``filteredData``
+  Data table (in tabular format) comprising rows of the ``phosphoSites`` input file that are not flagged as contaminants or reversed sequences.
+``quantData``
+  Data table (in tabular format) comprising rows of the ``filteredData`` file whose localization probability exceeds the **Localization Probability Cutoff** parameter.
+
+**Authors**
+
+``Nicholas A. Graham``
+  (`ORCiD 0000-0002-6811-1941 <https://orcid.org/0000-0002-6811-1941>`_) initiated the original script.
+
+``Larry C. Cheng``
+  (`ORCiD 0000-0002-6922-6433 <https://orcid.org/0000-0002-6922-6433>`_) updated the original script.
+
+``Arthur C. Eschenlauer``
+  (`ORCiD 0000-0002-2882-0508 <https://orcid.org/0000-0002-2882-0508>`_) adapted the script to run in Galaxy.
+
+``James E. Johnson``
+  (University of Minnesota Supercomputing Institute) adapted the script to run in Galaxy.
+
+
+=============================================================
+Phosphoproteomic Enrichment Pipeline Upstream Kinase Mapping
+=============================================================
+
+This step searches phosphopeptides against several databases for known or predicted sites.
+
+**Input databases**
+
+``networkin``
+    This table is the result of filtering the NetworKIN database [Linding 2007; Horn 2014] for cutoff score > 2.0.  The ENSEMBL data used to generate the file were from Ensembl, `ensembl.org <https://web.archive.org/web/20220308011159/http://useast.ensembl.org/index.html>`_ [Howe 2021].
+
+       *To generate this file:*
+
+       **(1)** Download the "precomputed data for all available kinase predictors against ENSEMBL"
+       (Available at the NetworKIN predictions link on the downloads page at https://web.archive.org/web/20200208000403/http://networkin.info/download/networkin_human_predictions_3.1.tsv.xz;  N.B.: "Commercial users are requested to contact the authors before using the data on the networkin.info website");
+
+       **(2)** Decompress the .tsv.xz file with "unxz" (from XZ Utils `https://tukaani.org/xz/ <https://tukaani.org/xz/>`_);
+
+       **(3)** Filter out the rows having "networkin_score" less than 2.0.
+
+       The result should be a tab-separated file with the following columns:
+
+           1. ``#substrate``
+           2. ``position``
+           3. ``id``
+           4. ``networkin_score``
+           5. ``tree``
+           6. ``netphorest_group``
+           7. ``netphorest_score``
+           8. ``string_identifier``
+           9. ``string_score``
+           10. ``substrate_name``
+           11. ``sequence``
+           12. ``string_path``
+
+
+``p_sty_motifs``
+  This database merges motif patterns from [Amanchy 2007] and Phosida [Gnad 2011].
+
+    The Amanchy data are adapted from `http://hprd.org/serine_motifs <http://hprd.org/serine_motifs>`_ and `http://hprd.org/tyrosine_motifs <http://hprd.org/tyrosine_motifs>`_ (both links cite the reference where each motif was published), and the patterns are translated into Perl regular expression format (`https://perldoc.perl.org/perlre <https://perldoc.perl.org/perlre>`_).
+
+    The Phosida data are adapted (translated to Perl-formatted regular expressions) from `http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx <http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx>`_ (this link cites the reference where each motif was published).
+
+      This file has three tab-separated columns (and no header):
+
+         1. column 1 is an (ignored) identifier
+         2. column 2 is a Perl regular expression
+         3. column 3 is a descriptor.
+
+      For two examples:
+
+      ``2<TAB>R.R..(pS|pT)<TAB>Akt kinase substrate motif (HPRD)``
+
+      ``10<TAB>R..(pS|pT)V<TAB>CAMK2_Phosida``
+
+``psp_kinase_substrate``
+  'Kinase-substrate dataset: experimentally determined substrates, sequences, cognate kinases, and metadata curated from the literature' [Hornbeck 2011].  This tabular-formatted file may be downloaded for non-commercial purposes as 'Kinase_Substrate_Dataset.gz' from `https://www.phosphosite.org/staticDownloads.action <https://www.phosphosite.org/staticDownloads.action>`_.
+
+      Data extracted from PhosphoSitePlus(R), created by Cell Signaling Technology Inc. PhosphoSitePlus is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (`https://creativecommons.org/licenses/by-nc-sa/3.0/ <https://creativecommons.org/licenses/by-nc-sa/3.0/>`_). Attribution must be given in written, oral and digital presentations to PhosphoSitePlus, www.phosphosite.org. Written documents should additionally cite:
+
+          Hornbeck PV, Kornhauser JM, Tkachev S, Zhang B, Skrzypek E, Murray B, Latham V, Sullivan M (2012) PhosphoSitePlus: a comprehensive resource for investigating the structure and function of experimentally determined post-translational modifications in man and mouse. Nucleic Acids Res. 40, D261-D270.; www.phosphosite.org.
+
+``psp_regulatory_sites``
+  'Regulatory sites: information curated from the literature about modification sites shown to regulate molecular functions, biological processes, and molecular interactions including protein-protein interactions' [Hornbeck 2011].  This tabular-formatted file may be downloaded for non-commercial purposes as 'Regulatory_sites.gz' from `https://www.phosphosite.org/staticDownloads.action <https://www.phosphosite.org/staticDownloads.action>`_.
+
+      Terms of use and citation are as for the ``psp_kinase_substrate`` file.
+
+**Output datasets**
+
+``ppep_map``
+  Data table (in tabular format, consumed by the merge/filter step) presenting, for each phosphopeptide, the kinase mappings,  the mass-spectral intensities for each sample, and the metadata from UniProtKB/SwissProt, phospho-sites, phospho-motifs, and regulatory sites.  Data in the columns marked "``Domain``", "``ON_...``", or "``..._PhosphoSite``" are available subject to the following terms:
+
+    "PhosphoSitePlus\ |reg| (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License(`https://creativecommons.org/licenses/by-nc-sa/3.0/ <https://creativecommons.org/licenses/by-nc-sa/3.0/>`_). When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words 'PhosphoSitePlus(R), www.phosphosite.org' must be included at appropriate places in the text or webpage, and (b) citation of [Hornbeck 2011 (`PMID: 25514926 <https://pubmed.ncbi.nlm.nih.gov/25514926>`_)] must be included in the bibliography."
+
+
+``melted``
+  Data table (in tabular format) presenting, for each phosphopeptide, the gene and one of the phospho-motifs or kinase-substrate sites.
+
+``ppep_mapping_sqlite``
+  SQLite database (consumed by the merge/filter step).
+
+**Authors**
+
+``Nicholas A. Graham``
+  (`ORCiD 0000-0002-6811-1941 <https://orcid.org/0000-0002-6811-1941>`_) wrote the original script.
+
+``Arthur C. Eschenlauer``
+  (`ORCiD 0000-0002-2882-0508 <https://orcid.org/0000-0002-2882-0508>`_) adapted the script to run in Galaxy.
+
+
+======================================================
+Phosphoproteomic Enrichment Pipeline Merge and Filter
+======================================================
+
+This step merges mapped metadata into metadata for phosphopeptides, filtering by species.
+
+**Input parameters**
+
+``species``
+  Limit PhosphoSitePlus to indicated species. Default: **human**
+
+**Output datasets**
+
+``preproc_tab``
+  Phosphopeptides annotated with SwissProt and phosphosite metadata, in tabular format.  This file is designed to be consumed by the downstream ANOVA tool.  Some data in the columns marked "PSP" are available subject to the following terms:
+
+    "PhosphoSitePlus\ |reg| (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License(`https://creativecommons.org/licenses/by-nc-sa/3.0/ <https://creativecommons.org/licenses/by-nc-sa/3.0/>`_). When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words 'PhosphoSitePlus(R), www.phosphosite.org' must be included at appropriate places in the text or webpage, and (b) citation of [Hornbeck 2011 (`PMID: 25514926 <https://pubmed.ncbi.nlm.nih.gov/25514926>`_)] must be included in the bibliography."
+
+``preproc_csv``
+  Phosphopeptides annotated with SwissProt and phosphosite metadata, in CSV format.
+
+``preproc_sqlite``
+  ``ppep_mapping_sqlite`` updated with annotations, in SQLite format.
+
+**Authors**
+
+``Nicholas A. Graham``
+  (`ORCiD 0000-0002-6811-1941 <https://orcid.org/0000-0002-6811-1941>`_) initiated the original script.
+
+``Larry C. Cheng``
+  (`ORCiD 0000-0002-6922-6433 <https://orcid.org/0000-0002-6922-6433>`_) updated the original script.
+
+``Arthur C. Eschenlauer``
+  (`ORCiD 0000-0002-2882-0508 <https://orcid.org/0000-0002-2882-0508>`_) adapted the script to run in Galaxy.
+
+.. |reg|    unicode:: U+000AE .. REGISTERED SIGN
+    ]]></help>
+    <citations>
+      <!-- upstream kinase mapping -->
+        <!-- Amanchy, R., Periaswamy, B., Mathivanan, S. et al. A curated compendium of phosphorylation motifs. PMID: 17344875 -->
+        <citation type="doi">10.1038/nbt0307-285</citation>
+        <!-- Aken 2016 "The Ensembl gene annotation system." PMID: 33137190 -->
+        <citation type="doi">10.1093/database/baw093</citation>
+      <!-- localization filter -->
+        <!-- Bielow_2016 "Proteomics Quality Control: Quality Control Software for MaxQuant Results" PMID:  26653327 -->
+        <citation type="doi">10.1021/acs.jproteome.5b00780</citation>
+      <!-- all three -->
+        <!-- Cheng 2018 "Phosphopeptide Enrichment ..." PMID: 30124664 -->
+        <citation type="doi">10.3791/57996</citation>
+      <!-- localization and upstream kinase mapping -->
+        <!-- Cox 2014 "Accurate proteome-wide label-free quantification ..." PMID: 24942700 -->
+        <citation type="doi">10.1074/mcp.M113.031591</citation>
+        <!-- Cox 2008 "MaxQuant enables high peptide identification rates ..." PMID: 19029910 -->
+        <citation type="doi">10.1038/nbt.1511</citation>
+      <!-- upstream kinase mapping -->
+        <!-- Gnad 2011 "PHOSIDA 2011: the posttranslational modification database." PMID: 21081558 -->
+        <citation type="doi">10.1093/nar/gkq1159</citation>
+      <!-- localization filter -->
+        <!-- Hogrebe_2018 "Benchmarking common quantification strategies for large-scale phosphoproteomics" PMID: 29535314 -->
+        <citation type="doi">10.1038/s41467-018-03309-6</citation>
+      <!-- upstream kinase mapping -->
+        <!-- Horn 2014 "KinomeXplorer: an integrated platform for kinome biology studies." PMID: 24874572 -->
+        <citation type="doi">10.1038/nmeth.2968</citation>
+          <!-- upstream kinase mapping and merge and filter -->
+        <!-- Hornbeck 2012 "PhosphoSitePlus: a comprehensive resource for investigating the structure and function of experimentally determined post-translational modifications in man and mouse." PMID: 22135298 -->
+        <citation type="doi">10.1093/nar/gkr1122</citation>
+      <!-- upstream kinase mapping -->
+        <!-- Linding 2007 "Systematic discovery of in vivo phosphorylation networks." PMID: 17570479 -->
+        <citation type="doi">10.1016/j.cell.2007.05.052</citation>
+      <!-- localization filter -->
+        <!-- Olsen_2006 "Global, in vivo, and site-specific phosphorylation dynamics in signaling networks" PMID: 17081983 -->
+        <citation type="doi">10.1016/j.cell.2006.09.026</citation>
+    </citations>
+</tool>
diff -r 000000000000 -r ba62d93a9ef5 search_ppep.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/search_ppep.py	Mon Jul 11 19:21:19 2022 +0000
@@ -0,0 +1,560 @@
+#!/usr/bin/env python
+# Search and memoize phosphopeptides in Swiss-Prot SQLite table UniProtKB
+
+import argparse
+import os.path
+import re
+import sqlite3
+import sys  # import the sys module for exc_info
+import time
+import traceback  # import the traceback module for format_exception
+from codecs import getreader as cx_getreader
+
+# For Aho-Corasick search for fixed set of substrings
+# - add_word
+# - make_automaton
+# - iter
+import ahocorasick
+
+
+# ref: https://stackoverflow.com/a/8915613/15509512
+#   answers: "How to handle exceptions in a list comprehensions"
+#   usage:
+#       from math import log
+#       eggs = [1,3,0,3,2]
+#       print([x for x in [catch(log, egg) for egg in eggs] if x is not None])
+#   producing:
+#       for <built-in function log>
+#         with args (0,)
+#         exception: math domain error
+#       [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453]
+def catch(func, *args, handle=lambda e: e, **kwargs):
+
+    try:
+        return func(*args, **kwargs)
+    except Exception as e:
+        print("For %s" % str(func))
+        print("  with args %s" % str(args))
+        print("  caught exception: %s" % str(e))
+        (ty, va, tb) = sys.exc_info()
+        print("  stack trace: " + str(traceback.format_exception(ty, va, tb)))
+        # exit(-1)
+        return None  # was handle(e)
+
+
+def __main__():
+
+    DROP_TABLES_SQL = """
+        DROP VIEW  IF EXISTS ppep_gene_site_view;
+        DROP VIEW  IF EXISTS uniprot_view;
+        DROP VIEW  IF EXISTS uniprotkb_pep_ppep_view;
+        DROP VIEW  IF EXISTS ppep_intensity_view;
+        DROP VIEW  IF EXISTS ppep_metadata_view;
+
+        DROP TABLE IF EXISTS sample;
+        DROP TABLE IF EXISTS ppep;
+        DROP TABLE IF EXISTS site_type;
+        DROP TABLE IF EXISTS deppep_UniProtKB;
+        DROP TABLE IF EXISTS deppep;
+        DROP TABLE IF EXISTS ppep_gene_site;
+        DROP TABLE IF EXISTS ppep_metadata;
+        DROP TABLE IF EXISTS ppep_intensity;
+    """
+
+    CREATE_TABLES_SQL = """
+        CREATE TABLE deppep
+          ( id INTEGER PRIMARY KEY
+          , seq TEXT UNIQUE                            ON CONFLICT IGNORE
+          )
+          ;
+        CREATE TABLE deppep_UniProtKB
+          ( deppep_id    INTEGER REFERENCES deppep(id) ON DELETE CASCADE
+          , UniProtKB_id TEXT REFERENCES UniProtKB(id) ON DELETE CASCADE
+          , pos_start    INTEGER
+          , pos_end      INTEGER
+          , PRIMARY KEY (deppep_id, UniProtKB_id, pos_start, pos_end)
+                                                       ON CONFLICT IGNORE
+          )
+          ;
+        CREATE TABLE ppep
+          ( id        INTEGER PRIMARY KEY
+          , deppep_id INTEGER REFERENCES deppep(id)    ON DELETE CASCADE
+          , seq       TEXT UNIQUE                      ON CONFLICT IGNORE
+          , scrubbed  TEXT
+          );
+        CREATE TABLE site_type
+          ( id        INTEGER PRIMARY KEY
+          , type_name TEXT UNIQUE                      ON CONFLICT IGNORE
+          );
+        CREATE INDEX idx_ppep_scrubbed on ppep(scrubbed)
+          ;
+        CREATE TABLE sample
+          ( id        INTEGER PRIMARY KEY
+          , name      TEXT UNIQUE                      ON CONFLICT IGNORE
+          )
+          ;
+        CREATE VIEW uniprot_view AS
+          SELECT DISTINCT
+              Uniprot_ID
+            , Description
+            , Organism_Name
+            , Organism_ID
+            , Gene_Name
+            , PE
+            , SV
+            , Sequence
+            , Description ||
+                CASE WHEN Organism_Name = 'N/A'
+                     THEN ''
+                     ELSE ' OS='|| Organism_Name
+                     END ||
+                CASE WHEN Organism_ID = -1
+                     THEN ''
+                     ELSE ' OX='|| Organism_ID
+                     END ||
+                CASE WHEN Gene_Name = 'N/A'
+                     THEN ''
+                     ELSE ' GN='|| Gene_Name
+                     END ||
+                CASE WHEN PE = 'N/A'
+                     THEN ''
+                     ELSE ' PE='|| PE
+                     END ||
+                CASE WHEN SV = 'N/A'
+                     THEN ''
+                     ELSE ' SV='|| SV
+                     END AS long_description
+            , Database
+          FROM UniProtKB
+          ;
+        CREATE VIEW uniprotkb_pep_ppep_view AS
+          SELECT   deppep_UniProtKB.UniprotKB_ID       AS accession
+                 , deppep_UniProtKB.pos_start          AS pos_start
+                 , deppep_UniProtKB.pos_end            AS pos_end
+                 , deppep.seq                          AS peptide
+                 , ppep.seq                            AS phosphopeptide
+                 , ppep.scrubbed                       AS scrubbed
+                 , uniprot_view.Sequence               AS sequence
+                 , uniprot_view.Description            AS description
+                 , uniprot_view.long_description       AS long_description
+                 , ppep.id                             AS ppep_id
+          FROM     ppep, deppep, deppep_UniProtKB, uniprot_view
+          WHERE    deppep.id = ppep.deppep_id
+          AND      deppep.id = deppep_UniProtKB.deppep_id
+          AND      deppep_UniProtKB.UniprotKB_ID = uniprot_view.Uniprot_ID
+          ORDER BY UniprotKB_ID, deppep.seq, ppep.seq
+          ;
+        CREATE TABLE ppep_gene_site
+          ( ppep_id         INTEGER REFERENCES ppep(id)
+          , gene_names      TEXT
+          , site_type_id    INTEGER REFERENCES site_type(id)
+          , kinase_map      TEXT
+          , PRIMARY KEY (ppep_id, kinase_map)          ON CONFLICT IGNORE
+          )
+          ;
+        CREATE VIEW ppep_gene_site_view AS
+          SELECT DISTINCT
+            ppep.seq   AS phospho_peptide
+          , ppep_id
+          , gene_names
+          , type_name
+          , kinase_map
+          FROM
+            ppep, ppep_gene_site, site_type
+          WHERE
+              ppep_gene_site.ppep_id = ppep.id
+            AND
+              ppep_gene_site.site_type_id = site_type.id
+          ORDER BY
+            ppep.seq
+            ;
+        CREATE TABLE ppep_metadata
+          ( ppep_id             INTEGER REFERENCES ppep(id)
+          , protein_description TEXT
+          , gene_name           TEXT
+          , FASTA_name          TEXT
+          , phospho_sites       TEXT
+          , motifs_unique       TEXT
+          , accessions          TEXT
+          , motifs_all_members  TEXT
+          , domain              TEXT
+          , ON_FUNCTION         TEXT
+          , ON_PROCESS          TEXT
+          , ON_PROT_INTERACT    TEXT
+          , ON_OTHER_INTERACT   TEXT
+          , notes               TEXT
+          , PRIMARY KEY (ppep_id)                      ON CONFLICT IGNORE
+          )
+          ;
+        CREATE VIEW ppep_metadata_view AS
+          SELECT DISTINCT
+              ppep.seq             AS phospho_peptide
+            , protein_description
+            , gene_name
+            , FASTA_name
+            , phospho_sites
+            , motifs_unique
+            , accessions
+            , motifs_all_members
+            , domain
+            , ON_FUNCTION
+            , ON_PROCESS
+            , ON_PROT_INTERACT
+            , ON_OTHER_INTERACT
+            , notes
+          FROM
+            ppep, ppep_metadata
+          WHERE
+              ppep_metadata.ppep_id = ppep.id
+          ORDER BY
+            ppep.seq
+            ;
+        CREATE TABLE ppep_intensity
+          ( ppep_id    INTEGER REFERENCES ppep(id)
+          , sample_id  INTEGER
+          , intensity  INTEGER
+          , PRIMARY KEY (ppep_id, sample_id)           ON CONFLICT IGNORE
+          )
+          ;
+        CREATE VIEW ppep_intensity_view AS
+          SELECT DISTINCT
+              ppep.seq             AS phospho_peptide
+            , sample.name          AS sample
+            , intensity
+          FROM
+            ppep, sample, ppep_intensity
+          WHERE
+              ppep_intensity.sample_id = sample.id
+            AND
+              ppep_intensity.ppep_id = ppep.id
+          ;
+    """
+
+    UNIPROT_SEQ_AND_ID_SQL = """
+        select    Sequence, Uniprot_ID
+             from UniProtKB
+    """
+
+    # Parse Command Line
+    parser = argparse.ArgumentParser(
+        description="Phopsphoproteomic Enrichment phosphopeptide SwissProt search (in place in SQLite DB)."
+    )
+
+    # inputs:
+    #   Phosphopeptide data for experimental results, including the intensities
+    #   and the mapping to kinase domains, in tabular format.
+    parser.add_argument(
+        "--phosphopeptides",
+        "-p",
+        nargs=1,
+        required=True,
+        dest="phosphopeptides",
+        help="Phosphopeptide data for experimental results, generated by the Phopsphoproteomic Enrichment Localization Filter tool",
+    )
+    parser.add_argument(
+        "--uniprotkb",
+        "-u",
+        nargs=1,
+        required=True,
+        dest="uniprotkb",
+        help="UniProtKB/Swiss-Prot data, converted from FASTA format by the Phopsphoproteomic Enrichment Kinase Mapping tool",
+    )
+    parser.add_argument(
+        "--schema",
+        action="store_true",
+        dest="db_schema",
+        help="show updated database schema",
+    )
+    parser.add_argument(
+        "--warn-duplicates",
+        action="store_true",
+        dest="warn_duplicates",
+        help="show warnings for duplicated sequences",
+    )
+    parser.add_argument(
+        "--verbose",
+        action="store_true",
+        dest="verbose",
+        help="show somewhat verbose program tracing",
+    )
+    # "Make it so!" (parse the arguments)
+    options = parser.parse_args()
+    if options.verbose:
+        print("options: " + str(options) + "\n")
+
+    # path to phosphopeptide (e.g., "outputfile_STEP2.txt") input tabular file
+    if options.phosphopeptides is None:
+        exit('Argument "phosphopeptides" is required but not supplied')
+    try:
+        f_name = os.path.abspath(options.phosphopeptides[0])
+    except Exception as e:
+        exit("Error parsing phosphopeptides argument: %s" % (e))
+
+    # path to SQLite input/output tabular file
+    if options.uniprotkb is None:
+        exit('Argument "uniprotkb" is required but not supplied')
+    try:
+        db_name = os.path.abspath(options.uniprotkb[0])
+    except Exception as e:
+        exit("Error parsing uniprotkb argument: %s" % (e))
+
+    # print("options.schema is %d" % options.db_schema)
+
+    # db_name = "demo/test.sqlite"
+    # f_name  = "demo/test_input.txt"
+
+    con = sqlite3.connect(db_name)
+    cur = con.cursor()
+    ker = con.cursor()
+
+    cur.executescript(DROP_TABLES_SQL)
+
+    # if options.db_schema:
+    #     print("\nAfter dropping tables/views that are to be created, schema is:")
+    #     cur.execute("SELECT * FROM sqlite_schema")
+    #     for row in cur.fetchall():
+    #         if row[4] is not None:
+    #             print("%s;" % row[4])
+
+    cur.executescript(CREATE_TABLES_SQL)
+
+    if options.db_schema:
+        print(
+            "\nAfter creating tables/views that are to be created, schema is:"
+        )
+        cur.execute("SELECT * FROM sqlite_schema")
+        for row in cur.fetchall():
+            if row[4] is not None:
+                print("%s;" % row[4])
+
+    def generate_ppep(f):
+        # get keys from upstream tabular file using readline()
+        # ref: https://stackoverflow.com/a/16713581/15509512
+        #      answer to "Use codecs to read file with correct encoding"
+        file1_encoded = open(f, "rb")
+        file1 = cx_getreader("latin-1")(file1_encoded)
+
+        count = 0
+        re_tab = re.compile("^[^\t]*")
+        re_quote = re.compile('"')
+        while True:
+            count += 1
+            # Get next line from file
+            line = file1.readline()
+            # if line is empty
+            # end of file is reached
+            if not line:
+                break
+            if count > 1:
+                m = re_tab.match(line)
+                m = re_quote.sub("", m[0])
+                yield m
+        file1.close()
+        file1_encoded.close()
+
+    # Build an Aho-Corasick automaton from a trie
+    # - ref:
+    #   - https://pypi.org/project/pyahocorasick/
+    #   - https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm
+    #   - https://en.wikipedia.org/wiki/Trie
+    auto = ahocorasick.Automaton()
+    re_phos = re.compile("p")
+    # scrub out unsearchable characters per section
+    #   "Match the p_peptides to the @sequences array:"
+    # of the original
+    #   PhosphoPeptide Upstream Kinase Mapping.pl
+    # which originally read
+    #   $tmp_p_peptide =~ s/#//g;
+    #   $tmp_p_peptide =~ s/\d//g;
+    #   $tmp_p_peptide =~ s/\_//g;
+    #   $tmp_p_peptide =~ s/\.//g;
+    #
+    re_scrub = re.compile("0-9_.#")
+    ppep_count = 0
+    for ppep in generate_ppep(f_name):
+        ppep_count += 1
+        add_to_trie = False
+        # print(ppep)
+        scrubbed = re_scrub.sub("", ppep)
+        deppep = re_phos.sub("", scrubbed)
+        if options.verbose:
+            print("deppep: %s; scrubbed: %s" % (deppep, scrubbed))
+        # print(deppep)
+        cur.execute("SELECT id FROM deppep WHERE seq = (?)", (deppep,))
+        if cur.fetchone() is None:
+            add_to_trie = True
+        cur.execute("INSERT INTO deppep(seq) VALUES (?)", (deppep,))
+        cur.execute("SELECT id FROM deppep WHERE seq = (?)", (deppep,))
+        deppep_id = cur.fetchone()[0]
+        if add_to_trie:
+            # print((deppep_id, deppep))
+            # Build the trie
+            auto.add_word(deppep, (deppep_id, deppep))
+        cur.execute(
+            "INSERT INTO ppep(seq, scrubbed, deppep_id) VALUES (?,?,?)",
+            (ppep, scrubbed, deppep_id),
+        )
+    # def generate_deppep():
+    #     cur.execute("SELECT seq FROM deppep")
+    #     for row in cur.fetchall():
+    #         yield row[0]
+    cur.execute("SELECT count(*) FROM (SELECT seq FROM deppep GROUP BY seq)")
+    for row in cur.fetchall():
+        deppep_count = row[0]
+
+    cur.execute(
+        "SELECT count(*) FROM (SELECT Sequence FROM UniProtKB GROUP BY Sequence)"
+    )
+    for row in cur.fetchall():
+        sequence_count = row[0]
+
+    print("%d phosphopeptides were read from input" % ppep_count)
+    print(
+        "%d corresponding dephosphopeptides are represented in input"
+        % deppep_count
+    )
+    # Look for cases where both Gene_Name and Sequence are identical
+    cur.execute(
+        """
+      SELECT Uniprot_ID, Gene_Name, Sequence
+      FROM   UniProtKB
+      WHERE  Sequence IN (
+        SELECT   Sequence
+        FROM     UniProtKB
+        GROUP BY Sequence, Gene_Name
+        HAVING   count(*) > 1
+        )
+      ORDER BY Sequence
+      """
+    )
+    duplicate_count = 0
+    old_seq = ""
+    for row in cur.fetchall():
+        if duplicate_count == 0:
+            print(
+                "\nEach of the following sequences is associated with several accession IDs (which are listed in the first column) but the same gene ID (which is listed in the second column)."
+            )
+        if row[2] != old_seq:
+            old_seq = row[2]
+            duplicate_count += 1
+            if options.warn_duplicates:
+                print("\n%s\t%s\t%s" % row)
+        else:
+            if options.warn_duplicates:
+                print("%s\t%s" % (row[0], row[1]))
+    if duplicate_count > 0:
+        print(
+            "\n%d sequences have duplicated accession IDs\n" % duplicate_count
+        )
+
+    print("%s accession sequences will be searched\n" % sequence_count)
+
+    # print(auto.dump())
+
+    # Convert the trie to an automaton (a finite-state machine)
+    auto.make_automaton()
+
+    # Execute query for seqs and metadata without fetching the results yet
+    uniprot_seq_and_id = cur.execute(UNIPROT_SEQ_AND_ID_SQL)
+    while 1:
+        batch = uniprot_seq_and_id.fetchmany(size=50)
+        if not batch:
+            break
+        for Sequence, UniProtKB_id in batch:
+            if Sequence is not None:
+                for end_index, (insert_order, original_value) in auto.iter(
+                    Sequence
+                ):
+                    ker.execute(
+                        """
+                      INSERT INTO deppep_UniProtKB
+                        (deppep_id,UniProtKB_id,pos_start,pos_end)
+                      VALUES (?,?,?,?)
+                      """,
+                        (
+                            insert_order,
+                            UniProtKB_id,
+                            1 + end_index - len(original_value),
+                            end_index,
+                        ),
+                    )
+            else:
+                raise ValueError(
+                    "UniProtKB_id %s, but Sequence is None: Check whether SwissProt file is missing sequence for this ID"
+                    % (UniProtKB_id,)
+                )
+    ker.execute(  # report: number of rows in the joined match view
+        """
+        SELECT   count(*) || ' accession-peptide-phosphopeptide combinations were found'
+        FROM     uniprotkb_pep_ppep_view
+        """
+    )
+    for row in ker.fetchall():
+        print(row[0])
+    # report: number of distinct accessions (only column 0 is printed)
+    ker.execute(
+        """
+      SELECT   count(*) || ' accession matches were found', count(*) AS accession_count
+      FROM     (
+        SELECT   accession
+        FROM     uniprotkb_pep_ppep_view
+        GROUP BY accession
+        )
+      """
+    )
+    for row in ker.fetchall():
+        print(row[0])
+    # report: number of distinct (dephosphorylated) peptides in the view
+    ker.execute(
+        """
+      SELECT   count(*) || ' peptide matches were found'
+      FROM     (
+        SELECT   peptide
+        FROM     uniprotkb_pep_ppep_view
+        GROUP BY peptide
+        )
+      """
+    )
+    for row in ker.fetchall():
+        print(row[0])
+    # report: number of distinct phosphopeptides (only column 0 is printed)
+    ker.execute(
+        """
+      SELECT   count(*) || ' phosphopeptide matches were found', count(*) AS phosphopeptide_count
+      FROM     (
+        SELECT   phosphopeptide
+        FROM     uniprotkb_pep_ppep_view
+        GROUP BY phosphopeptide
+        )
+      """
+    )
+    for row in ker.fetchall():
+        print(row[0])
+    # The placeholder rows below are added only after the counts above
+    # link peptides not found in sequence database to a dummy sequence-record
+    ker.execute(
+        """
+        INSERT INTO deppep_UniProtKB(deppep_id,UniProtKB_id,pos_start,pos_end)
+          SELECT id, 'No Uniprot_ID', 0, 0
+          FROM   deppep
+          WHERE  id NOT IN (SELECT deppep_id FROM deppep_UniProtKB)
+        """
+    )
+    # Persist the changes, then VACUUM (run after the commit), then close
+    con.commit()
+    ker.execute("vacuum")
+    con.close()
+
+
+if __name__ == "__main__":
+    wrap_start_time = time.perf_counter()
+    __main__()
+    wrap_stop_time = time.perf_counter()
+    # print(wrap_start_time)
+    # print(wrap_stop_time)
+    print(
+        "\nThe matching process took %d milliseconds to run.\n"
+        % ((wrap_stop_time - wrap_start_time) * 1000),
+    )
+
+# vim: sw=4 ts=4 et ai :
diff -r 000000000000 -r ba62d93a9ef5 test-data/alpha_levels.tabular
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/alpha_levels.tabular	Mon Jul 11 19:21:19 2022 +0000
@@ -0,0 +1,3 @@
+0.05
+0.1
+0.2
diff -r 000000000000 -r ba62d93a9ef5 test-data/pSTY_motifs.tabular
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/pSTY_motifs.tabular	Mon Jul 11 19:21:19 2022 +0000
@@ -0,0 +1,355 @@
+"counter"	"pcre"	"symbol"	"description"	"pubmed_id"	"classification"	"source"
+"1"	"R.R..(pS|pT)(F|L)"	"PKB_group"	"Akt kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8985174"	"kinase substrate"	"HPRD"
+"2"	"R.R..(pS|pT)"	"PKB_group"	"Akt kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=10945990"	"kinase substrate"	"HPRD"
+"3"	"GRART(S|T)pSFAE"	"PKB_group"	"Akt kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8524413"	"kinase substrate"	"HPRD"
+"4"	"(R|Q|K)(R|K|N|Q|P|H)(R|K)(R|S|T)(N|K|Q|H|D|P)pS(F|W|I|M|N|S)(S|T|H)(R|S|K)(S|T|P|Q)"	"PKB_group"	"Akt kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=15782149"	"kinase substrate"	"HPRD"
+"5"	"(R|K).(R|K)(S|T).pS"	"PKB_group"	"Akt kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=16273072"	"kinase substrate"	"HPRD"
+"6"	"(M|V|L|I|F)(R|K|H)...(pS|pT)...(M|V|L|I|F)"	"AMPK_group"	"AMP-activated protein kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=7902296,7698321"	"kinase substrate"	"HPRD"
+"7"	"(M|V|L|I)..(R|K|H).(pS|pT)...(M|V|L|I)"	"AMPK_group"	"AMP-activated protein kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=7902296"	"kinase substrate"	"HPRD"
+"8"	"(M|V|L|I|F)(R|K|H)..(pS|pT)...(M|V|L|I|F)"	"AMPK_group"	"AMP-activated protein kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=7698321"	"kinase substrate"	"HPRD"
+"9"	"(R|K).R..pS...(R|K)"	"AMPK_group"	"AMP-activated protein kinase 2 substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=7698321"	"kinase substrate"	"HPRD"
+"10"	"(P|L|I|M).(L|I|D|E)pSQ"	"ATM"	"ATM kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=10608806"	"kinase substrate"	"HPRD"
+"11"	"LpSQE"	"ATM"	"ATM kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=10801797,11544175"	"kinase substrate"	"HPRD"
+"12"	"pSQ"	"ATM"	"ATM kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=16273072"	"kinase substrate"	"HPRD"
+"13"	"(R|K|N)R.(pS|pT)(M|L|V|I)"	"Aurora A"	"Aurora-A kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=16083426"	"kinase substrate"	"HPRD"
+"14"	"(D|E)(pS|pT)..."	"GRK-2"	"b-Adrenergic Receptor kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=1645191"	"kinase substrate"	"HPRD"
+"15"	"HpSTSDD"	"BCKDK"	"Branched chain alpha-ketoacid dehydrogenase kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=3947057"	"kinase substrate"	"HPRD"
+"16"	"YRpSVDE"	"BCKDK"	"Branched chain alpha-ketoacid dehydrogenase kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=3947057"	"kinase substrate"	"HPRD"
+"17"	"(M|V|L|I|F).R..(pS|pT)...(M|V|L|I|F)"	"CaM-KI_group"	"Calmodulin-dependent protein kinase I substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=9452427,7698321,8022798"	"kinase substrate"	"HPRD"
+"18"	"(M|I|L|V|F|Y).R..(pS|pT)(M|I|L|V|F|Y)"	"CaM-KII_alpha"	"Calmodulin-dependent protein kinase II alpha substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=9452427"	"kinase substrate"	"HPRD"
+"19"	"R..(pS|pT)"	"CaM-KII_group"	"Calmodulin-dependent protein kinase II substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=1956339"	"kinase substrate"	"HPRD"
+"20"	"(K|F)(R|K)(Q|M)(Q|M|K|L|F)pS(F|I|M|L|V)(D|E|I)(L|M|K|I)(F|K)"	"CaM-KII_group"	"Calmodulin-dependent protein kinase II substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=8887677"	"kinase substrate"	"HPRD"
+"21"	"(M|V|L|I|F).(R|K)..(pS|pT).."	"CaM-KII_group"	"Calmodulin-dependent protein kinase II substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=8280084"	"kinase substrate"	"HPRD"
+"22"	"R..pS"	"CaM-KII_group"	"Calmodulin-dependent protein kinase II substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=16273072"	"kinase substrate"	"HPRD"
+"23"	"VPGKARKKpSSCQLL"	"CaM-KIV"	"Calmodulin-dependent protein kinase IV substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=1901412"	"kinase substrate"	"HPRD"
+"24"	"PLARTLpSVAGLP"	"CaM-KIV"	"Calmodulin-dependent protein kinase IV substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=1309765"	"kinase substrate"	"HPRD"
+"25"	"(M|I|L|V|F|Y).R..(pS|pT)"	"CaM-KIV"	"Calmodulin-dependent protein kinase IV substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=9452427"	"kinase substrate"	"HPRD"
+"26"	"E(F|E)D(T|A|G)GpSI(I|F|Y|G)(I|G|F)(F|G)(F|P|L)"	"CK1_delta|CK1_group"	"Casein Kinase I delta substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=8887677"	"kinase substrate"	"HPRD"
+"27"	"Y(Y|E)(D|Y)(A|D)(A|G)pSI(I|Y|F|G)(I|G|F)(F|G)(F|P|L)"	"CK1_group|CK1_gamma Q9HCP0"	"Casein Kinase I gamma substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=8887677"	"kinase substrate"	"HPRD"
+"28"	"pSP..(pS|pT)"	"CK1_group"	"Casein Kinase I substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=1956339"	"kinase substrate"	"HPRD"
+"29"	"(D|E)..(pS|pT)"	"CK1_group"	"Casein Kinase I substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=12925738"	"kinase substrate"	"HPRD"
+"30"	"(pS|pT)..(S|T)"	"CK1_group"	"Casein Kinase I substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=12925738"	"kinase substrate"	"HPRD"
+"31"	"(pS|pT)...(S|T)(M|L|V|I|F)"	"CK1_group"	"Casein Kinase I substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=12925738"	"kinase substrate"	"HPRD"
+"32"	"(E|D|A)(D|E)(E|D)(E|D)pS(E|D|A)(D|E|A)(E|D)(E|D)"	"CK2_group"	"Casein Kinase II substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=8887677"	"kinase substrate"	"HPRD"
+"33"	"pS.(E|pS|pT)"	"CK2_group"	"Casein Kinase II substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=1650349,3474230"	"kinase substrate"	"HPRD"
+"34"	"pS..(E|pS|pT)"	"CK2_group"	"Casein Kinase II substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=1650349,12925738"	"kinase substrate"	"HPRD"
+"35"	"(pS|pT)..(E|D)"	"CK2_group"	"Casein Kinase II substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=9272871"	"kinase substrate"	"HPRD"
+"36"	"pSD.E"	"CK2_group"	"Casein kinase II substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=16273072"	"kinase substrate"	"HPRD"
+"37"	"pS..(E|D)"	"CK2_group"	"Casein kinase II substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=16273072"	"kinase substrate"	"HPRD"
+"38"	"pS(D|E).(D|E).(D|E)"	"CK2_group"	"Casein Kinase II substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=16273072"	"kinase substrate"	"HPRD"
+"39"	"(D|E)pS(D|E).(D|E)"	"CK2_group"	"Casein Kinase II substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=16273072"	"kinase substrate"	"HPRD"
+"40"	"pS(D|E)(D|E)(D|E)"	"CK2_group"	"Casein Kinase II substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=16273072"	"kinase substrate"	"HPRD"
+"41"	"(pS|pT)..(D|E)"	"CK2_group"	"Casein Kinase II substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=2044770,2117608"	"kinase substrate"	"HPRD"
+"42"	"(pS|pT)..(E|D|pS|pY)"	"CK2_group"	"Casein Kinase II substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=7735314"	"kinase substrate"	"HPRD"
+"43"	"(S|E|P|G)(D|S|N|E|P)(E|D|G|Q|W)(Y|E|D|S|W|T)(W|E|D)pS(D|E)(D|E|W|N)(E|D)(E|D|N|Q)"	"CK2_group"	"Casein Kinase II substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=15782149"	"kinase substrate"	"HPRD"
+"44"	"(R|K)pSP(R|P)(R|K|H)"	"CDK1"	"Cdc2 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=12586835"	"kinase substrate"	"HPRD"
+"45"	"(pS|pT)P.(R|K)"	"CDK1"	"Cdc2 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=7874496,9003781"	"kinase substrate"	"HPRD"
+"46"	"HHH(R|K)pSPR(R|K)R"	"CDK1"	"Cdc2 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=7874496"	"kinase substrate"	"HPRD"
+"47"	"P.(pS|pT)PKK.KK"	"CDK1"	"Cdc2 like protein kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8407912"	"kinase substrate"	"HPRD"
+"48"	"(pS|pT)P.(R|K)"	"CDK1|CDK2|CDK4|CDK6"	"CDK1,2, 4, 6 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=12501191,10607671"	"kinase substrate"	"HPRD"
+"49"	"pSP.(R|K)."	"CDK_group"	"CDK kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=16273072"	"kinase substrate"	"HPRD"
+"50"	"PL(pS|pT)PIP(K|R|H)"	"CDK4"	"CDK4 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=9003781"	"kinase substrate"	"HPRD"
+"51"	"PL(pS|pT)P.(K|R|H)"	"CDK4"	"CDK4 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=9003781"	"kinase substrate"	"HPRD"
+"52"	"pTP.K"	"CDK5"	"CDK5 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=11684694"	"kinase substrate"	"HPRD"
+"53"	"(K|H|G)H(H|P)(K|G|H)pSP(R|K)(H|R|K)(R|H|K)"	"CDK5"	"CDK5 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8887677"	"kinase substrate"	"HPRD"
+"54"	"(pS|pT)PG(pS|pT)PGTP"	"CDK5"	"CDK5 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=9003781"	"kinase substrate"	"HPRD"
+"55"	"(M|I|L|V).(R|K)..(pS|pT)"	"CHK1"	"Chk1 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=10648819"	"kinase substrate"	"HPRD"
+"56"	"R..(pS|pT)..R"	"CLK1"	"CLK1 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=10954422"	"kinase substrate"	"HPRD"
+"57"	"(R|K).(R|K).(R|K).pS..R"	"CLK1"	"CLK1|CLK2 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=10480872"	"kinase substrate"	"HPRD"
+"58"	"R(R|H)(R|H)(R|E)RE(R|H)pSR(R|D)L"	"CLK1"	"CLK2 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=11827553"	"kinase substrate"	"HPRD"
+"59"	"KK.RRpT(L|V)."	"DMPK_group"	"DMPK1 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=12897125"	"kinase substrate"	"HPRD"
+"60"	"KKR.RpT(L|V)."	"DMPK_group"	"DMPK1 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=12897125"	"kinase substrate"	"HPRD"
+"61"	"(R|K).RR.(pS|pT)(L|V)."	"DMPK_group"	"DMPK1 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=12897125"	"kinase substrate"	"HPRD"
+"62"	"R..(pS|pT)(L|V)R"	"DMPK_group"	"DMPK1|DMPK2 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=10913253"	"kinase substrate"	"HPRD"
+"63"	".pSQ"	"DNA-PK"	"DNA dependent Protein kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=1751287"	"kinase substrate"	"HPRD"
+"64"	"P(pS|pT)."	"DNA-PK"	"DNA dependent Protein kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8086496"	"kinase substrate"	"HPRD"
+"65"	"R(R|K)R(E|R)R(E|A)(H|R)pSRR(R|D)(L|E)"	"CLK1"	"DOA/CDC-like kinase 2 substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=11827553"	"kinase substrate"	"HPRD"
+"66"	"(I|L|V|F|M)RR..(pS|pT)(I|L|M|V|F)"	"DCAMKL1"	"Doublecortin kinase-1 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=12590608"	"kinase substrate"	"HPRD"
+"67"	"E.pS.R..R"	"HRI|EIF2AK2|EIF2AK3"	"elF2 alpha kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8982275,1671834"	"kinase substrate"	"HPRD"
+"68"	"(T|P|S)(G|P|E|Y)(P|L|I)(L|M|P)pSP(G|P|F)(P|F|G|Y)(F|Y|I)"	"MAP2K1|MAP2K2|MAP2K_group"	"ERK1 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8887677"	"kinase substrate"	"HPRD"
+"69"	"pTEpY"	"MAP2K1|MAP2K2|MAP2K_group"	"ERK1 Kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=12646559"	"kinase substrate"	"HPRD"
+"70"	"P.(pS|pT)PP"	"MAP2K1|MAP2K2|MAP2K_group"	"ERK1,2 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=1939237"	"kinase substrate"	"HPRD"
+"71"	"..P.(pS|pT)PPP."	"MAP2K1|MAP2K2|MAP2K_group"	"ERK1,2 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=1939237"	"kinase substrate"	"HPRD"
+"72"	"P.(pS|pT)P"	"MAP2K1|MAP2K2|MAP2K_group"	"ERK1, ERK2 Kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=9792705"	"kinase substrate"	"HPRD"
+"73"	"pSP"	"MAP2K1|MAP2K2|MAP2K_group"	"ERK1, ERK2 Kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=16273072"	"kinase substrate"	"HPRD"
+"74"	"KpSPP"	"MAP2K1|MAP2K2|MAP2K_group|CDK5|GSK-3 (HPRD)"	"ERK1, ERK2, SAPK, CDK5 and GSK3 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=12586839"	"kinase substrate"	"HPRD"
+"75"	"(D|Y|W|E)(C)(P|S|C|E)(P|C|S|L|T|V)(L|M|T)pS(P|A)(T|S|G|R|C|F)(W|P|S)(W|F)"	"MAP2K1|MAP2K_group"	"ERK2 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=15782149"	"kinase substrate"	"HPRD"
+"76"	"..(pS|pT)E"	"GRK-1"	"G protein-coupled receptor kinase 1 substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=1645191"	"kinase substrate"	"HPRD"
+"77"	".(pS|pT)...(A|P|S|T)"	"GRK-1"	"G protein-coupled receptor kinase 1 substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=1645191"	"kinase substrate"	"HPRD"
+"78"	"(pS|pT)P.(K|R)"	"CDK2|MOD_CDK_SPxK_1"	"Growth associated histone HI kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=1956339"	"kinase substrate|ELM"	"HPRD"
+"79"	"(K|R)(pS|pT)P"	"GSK-3 (HPRD)|MAP2K1|MAP2K2|MAP2K_group|CDK5|MOD_ProDKin_1"	"Growth associated histone HI kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=1956339"	"kinase substrate|ELM"	"HPRD"
+"80"	"(pS|pT)P(K|R)"	"CDK2|MOD_CDK_SPK_2"	"Growth associated histone HI kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=1956339"	"kinase substrate|ELM"	"HPRD"
+"81"	"pS...pS"	"GSK-3 (HPRD)"	"GSK3 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=1956339,16141410"	"kinase substrate"	"HPRD"
+"82"	"P.pTP"	"GSK-3 (HPRD)|MAP2K1|MAP2K2|MAP2K_group|CDK5"	"GSK3, Erk1, Erk2 and CDK5 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=16377132"	"kinase substrate"	"HPRD"
+"83"	"R..pSPV"	"GSK-3 (HPRD)|MAP2K1|MAP2K2|MAP2K_group|CDK5"	"GSK-3, ERK1, ERK2, CDK5 substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=15358237"	"kinase substrate"	"HPRD"
+"84"	"K(pS|pT)P.K"	"GSK-3 (HPRD)|MAP2K1|MAP2K2|MAP2K_group|CDK5"	"GSK-3, ERK1, ERK2, CDK5 substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=9592082"	"kinase substrate"	"HPRD"
+"85"	"KpSP...K"	"GSK-3 (HPRD)|MAP2K1|MAP2K2|MAP2K_group|CDK5"	"GSK-3, ERK1, ERK2, CDK5 substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=9592082"	"kinase substrate"	"HPRD"
+"86"	"KpSP..K"	"GSK-3 (HPRD)|MAP2K1|MAP2K2|MAP2K_group|CDK5"	"GSK-3, ERK1, ERK2, CDK5 substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=9592082"	"kinase substrate"	"HPRD"
+"87"	"KpSP....K"	"GSK-3 (HPRD)|MAP2K1|MAP2K2|MAP2K_group|CDK5"	"GSK-3, ERK1, ERK2, CDK5 substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=9592082"	"kinase substrate"	"HPRD"
+"88"	"KpTPAKEE"	"GSK-3 (HPRD)|MAP2K1|MAP2K2|MAP2K_group|CDK5"	"GSK-3, ERK1, ERK2, CDK5 substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=9819213"	"kinase substrate"	"HPRD"
+"89"	"P.pSP"	"GSK-3 (HPRD)|MAP2K1|MAP2K2|MAP2K_group|CDK5"	"GSK-3, ERK1, ERK2, CDK5 substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=16020478"	"kinase substrate"	"HPRD"
+"90"	".(pS|pT)P"	"GSK-3 (HPRD)|MAP2K1|MAP2K2|MAP2K_group|CDK5"	"GSK-3, ERK1, ERK2, CDK5 substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=16020478"	"kinase substrate"	"HPRD"
+"91"	"..pSP"	"GSK-3 (HPRD)|MAP2K1|MAP2K2|MAP2K_group|CDK5"	"GSK-3, ERK1, ERK2, CDK5 substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=16020478"	"kinase substrate"	"HPRD"
+"93"	"GP(Q|M)pSPI"	"JNK_group"	"JNK1 Kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=15629715"	"kinase substrate"	"HPRD"
+"94"	"LRpT"	"LKB1"	"LKB1 Kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=14985505"	"kinase substrate"	"HPRD"
+"95"	"(R|K).R..pS"	"RSK-1|RSK-2|RSK_group"	"MAPKAPK1 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=7498520"	"kinase substrate"	"HPRD"
+"96"	"RRR.pS"	"RSK_group"	"MAPKAPK1 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=7498520"	"kinase substrate"	"HPRD"
+"97"	"(L|F|I)...R(Q|S|T)L(pS|pT)(M|L|I|V)"	"MAPKAPK2"	"MAPKAPK2 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=15807522"	"kinase substrate"	"HPRD"
+"98"	"..[^P].R..pS.."	"MAPKAPK2"	"MAPKAPK2 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8280084"	"kinase substrate"	"HPRD"
+"99"	"pS...(pS|pT)"	"MAPKAPK2"	"MAPKAPK2 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=15629715,8280084"	"kinase substrate"	"HPRD"
+"100"	"pT(G|P|E)pY"	"MAPK11|MAPK13|MAPK14"	"MAPK 11,13,14 Kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=9295308,7535770"	"kinase substrate"	"HPRD"
+"101"	"RRFGpS[^P]RRF"	"MEKK (HPRD)"	"MEKK kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=7874496"	"kinase substrate"	"HPRD"
+"102"	"RRFGpS(M|L|V|I|F)RR(M|L|V|I|F)"	"MEKK (HPRD)"	"MEKK kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=7874496"	"kinase substrate"	"HPRD"
+"103"	"KKR..pS.(R|K)(R|K)"	"MLCK_group"	"MLCK kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=7961752"	"kinase substrate"	"HPRD"
+"104"	"FpTY"	"mTOR"	"mTOR kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=14560963"	"kinase substrate"	"HPRD"
+"105"	"IRRLpSTRRR"	"NEK2"	"Nek 2 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8982275,7759549"	"kinase substrate"	"HPRD"
+"106"	"(R|N)(F|L|M)(R|K)(R|K)pS(R|I|V|M)(R|I|M|V)(M|I|F|V)(I|F|M)"	"NIMA (HPRD)"	"NIMA kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8887677,1416988"	"kinase substrate"	"HPRD"
+"107"	"FR.(pS|pT)"	"NIMA (HPRD)"	"NIMA kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=7759549,8120013"	"kinase substrate"	"HPRD"
+"108"	"RF(R|K)(R|K)pS(R|I)(R|I)MI"	"NIMA (HPRD)"	"NIMA kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8120013,8982275"	"kinase substrate"	"HPRD"
+"109"	"(R|K).R..(pS|pT)(M|L|V|I)"	"p70S6K"	"p70 Ribosomal S6 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=7498520,1737763"	"kinase substrate"	"HPRD"
+"110"	"VFLGFpTYVAP"	"p70S6K"	"p70 Ribosomal S6 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=7498520"	"kinase substrate"	"HPRD"
+"111"	"AKRRRLSpSLRA"	"PAK1"	"PAK1 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8051089"	"kinase substrate"	"HPRD"
+"112"	"VRKRpTLRRL"	"PAK1"	"PAK1 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8051089"	"kinase substrate"	"HPRD"
+"113"	"(R|K)(R|.).(pS|pT)"	"PAK2"	"PAK2 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=9405039"	"kinase substrate"	"HPRD"
+"114"	"F..F(pS|pT)(F|Y)"	"PDK-1"	"PDK1 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=11516946"	"kinase substrate"	"HPRD"
+"115"	"KRKQIpSVR"	"PHK_group"	"Phosphorylase kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8180216"	"kinase substrate"	"HPRD"
+"116"	"(F|M|K)(R|K)(M|R|Q|F)(M|F|L|I)pS(F|I|M|L)(F|R|K)(L|I)(F|L|I)"	"PHK_group"	"Phosphorylase kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8887677"	"kinase substrate"	"HPRD"
+"117"	"(K|R)..pS(V|I)"	"PHK_group"	"Phosphorylase kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=1956339"	"kinase substrate"	"HPRD"
+"118"	"(R|K)(R|K)(R|K).(pS|pT)."	"Pim1 (HPRD)"	"Pim1 kinase substrate sequence"	"https://pubmed.ncbi.nlm.nih.gov/?term=1416988"	"kinase substrate"	"HPRD"
+"119"	"(R|K)(R|K|A|Q|P)(R|K)(R|Q|H|N|Y)(P|H|K)pS(G|S|T)(P|S|G|Q|H|S|T)(S|P|Q|G|D)(T|S|P|G)"	"Pim2 (HPRD)"	"Pim2 kinase substrate sequence"	"https://pubmed.ncbi.nlm.nih.gov/?term=15782149"	"kinase substrate"	"HPRD"
+"120"	"RR.pS(M|I|L|V|F|Y)"	"PKA_group"	"PKA kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8463304,194899"	"kinase substrate"	"HPRD"
+"121"	"R.pS"	"PKA_group"	"PKA kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=1956339"	"kinase substrate"	"HPRD"
+"122"	"KR..pS"	"PKA_group"	"PKA kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=1956339"	"kinase substrate"	"HPRD"
+"123"	"R..pS"	"PKA_group"	"PKA kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=1956339"	"kinase substrate"	"HPRD"
+"124"	"(R|K).(pS|pT)"	"PKA_group"	"PKA kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=1956339"	"kinase substrate"	"HPRD"
+"125"	"K..(pS|pT)"	"PKA_group"	"PKA kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=1956339"	"kinase substrate"	"HPRD"
+"126"	"(R|K)(R|K).(pS|pT)"	"PKA_group"	"PKA kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=1956339"	"kinase substrate"	"HPRD"
+"127"	"K...(pS|pT)"	"PKA_group"	"PKA kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=1956339"	"kinase substrate"	"HPRD"
+"128"	"(pS|pT).(R|K)"	"PKA_group"	"PKA kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=1956339"	"kinase substrate"	"HPRD"
+"129"	"RRRRpSIIFI"	"PKA_group"	"PKA kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=7874496"	"kinase substrate"	"HPRD"
+"130"	"RR.pS"	"PKA_group"	"PKA kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8982275,1848111"	"kinase substrate"	"HPRD"
+"131"	"R(R|K).(pS|pT)(I|L|V|F|Y)(D|C|.).D"	"PKA_group"	"PKA kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=14679191"	"kinase substrate"	"HPRD"
+"132"	"RR.pS"	"PKA_group"	"PKA kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8163498"	"kinase substrate"	"HPRD"
+"133"	"RRR(R|N)pSII(F|D)"	"PKA_group"	"PKA kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8887677"	"kinase substrate"	"HPRD"
+"135"	"R(R|K).(pS|pT)[^P]"	"PKA_alpha|MOD_PKA_1"	"PKA, PKG kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=7654713"	"kinase substrate|ELM"	"HPRD"
+"136"	"ARKGpSLRQ"	"PKC_alpha"	"PKC alpha kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8887677"	"kinase substrate"	"HPRD"
+"137"	"R(R|F)RR(R|K)GpSF(R|K)(R|K)"	"PKC_alpha"	"PKC alpha kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8887677"	"kinase substrate"	"HPRD"
+"138"	"(L|R|F)(R|K)R(K|Q)GpS(F|M)KK.A"	"PKC_beta"	"PKC beta kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=12566450,10574945"	"kinase substrate"	"HPRD"
+"139"	"R.RKGpSF"	"PKC_delta"	"PKC delta kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8995387"	"kinase substrate"	"HPRD"
+"140"	"KRQGpSVRR"	"PKC_epsilon"	"PKC epsilon kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8995387"	"kinase substrate"	"HPRD"
+"141"	"R(K|E|R).pS"	"PKC_epsilon"	"PKC epsilon kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8995387"	"kinase substrate"	"HPRD"
+"142"	"AR..R(R|K)RpSFRR"	"PKC_eta"	"PKC eta kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8995387"	"kinase substrate"	"HPRD"
+"143"	"F..F(pS|pT)(F|Y)"	"PKC_group"	"PKC family kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8995387"	"kinase substrate"	"HPRD"
+"145"	".R..(pS|pT).R."	"PKC_group"	"PKC kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=2473066"	"kinase substrate"	"HPRD"
+"146"	"(pS|pT).(R|K)"	"PKC_group"	"PKC kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=1956339"	"kinase substrate"	"HPRD"
+"147"	"(R|K)..(pS|pT)"	"PKC_group"	"PKC kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=1956339"	"kinase substrate"	"HPRD"
+"148"	"(R|K)..(pS|pT).(R|K)"	"PKC_group"	"PKC kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=1956339"	"kinase substrate"	"HPRD"
+"149"	"(K|R).(pS|pT)"	"PKC_group"	"PKC kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=1956339"	"kinase substrate"	"HPRD"
+"150"	"(R|K).(pS|pT).(R|K)"	"PKC_group"	"PKC kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=1956339"	"kinase substrate"	"HPRD"
+"151"	"(L|V)(V|L|A)R(Q|K|E)MpS"	"PKD1"	"PKC mu kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8995387"	"kinase substrate"	"HPRD"
+"152"	"(R|F|W|M)(W|A|K|S)(R|S|K|H)(R|H|S|Q)(R|K|N|P|G|Q)pS(I|F|R|V|K|S|L|M)(K|M|R|S|T)(R|S|K|W)(R|K|G)"	"PKC_theta"	"PKC theta kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=15782149"	"kinase substrate"	"HPRD"
+"153"	"F.R..pS(F|M)(F|M)"	"PKC_zeta"	"PKC zeta kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8995387"	"kinase substrate"	"HPRD"
+"154"	"(L|V|I)(R|K|Q)(R|K)(R|K|T|Q|M)(N|K|R|L|M|H)pS(F|W|I|M|L|V)(S|N)(R|S|P|Y|W)(S|R|N|L)"	"PKD"	"PKD kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=15782149"	"kinase substrate"	"HPRD"
+"155"	"R(R|K).(pS|pT)[^P]"	"PKA_group|MOD_PKA_1"	"PKG kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=7654713"	"kinase substrate|ELM"	"HPRD"
+"156"	"R..(pS|pT).R..R"	"EIF2AK2"	"PKR kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=1671834"	"kinase substrate"	"HPRD"
+"157"	"(D|E).(pS|pT)(I|L|V|M).(D|E)"	"PLK1"	"Plk1 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=12738781"	"kinase substrate"	"HPRD"
+"158"	".pS..D.."	"PDHK1"	"Pyruvate dehydrogenase kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=3002277"	"kinase substrate"	"HPRD"
+"159"	"PLpTLP"	"RAF1"	"RAF1 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8376361"	"kinase substrate"	"HPRD"
+"160"	"PLLpTP"	"RAF1"	"RAF1 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8376361"	"kinase substrate"	"HPRD"
+"161"	"PLpTP"	"RAF1"	"RAF1 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8376361"	"kinase substrate"	"HPRD"
+"162"	"PpTLP"	"RAF1"	"RAF1 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8376361"	"kinase substrate"	"HPRD"
+"163"	"PLpTLP"	"RAF1"	"RAF1 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8376361"	"kinase substrate"	"HPRD"
+"164"	"PpTLP"	"RAF1"	"RAF1 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8376361"	"kinase substrate"	"HPRD"
+"165"	"LpTP"	"RAF1"	"RAF1 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8376361"	"kinase substrate"	"HPRD"
+"166"	"KKKKKK(pS|pT)..."	"TGF-beta (HPRD)"	"TGF beta receptor kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8524844"	"kinase substrate"	"HPRD"
+"167"	"(R|K|Q|N)(M|C|W)(R|T|S|N)(E|D|S|N)(R|K|E|D|N)pS(S|D|E)(S|GC|D)(SM|R|N)(N|H|S|R|C)"	"TGF-beta (HPRD)"	"TGF beta receptor kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=15782149"	"kinase substrate"	"HPRD"
+"168"	"RR..pS"	"DAPK3"	"ZIP kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=15001356"	"kinase substrate"	"HPRD"
+"169"	"KR.RpS"	"DAPK3"	"ZIP kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=15001356"	"kinase substrate"	"HPRD"
+"170"	"KRR.pT"	"DAPK3"	"ZIP kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=15001356"	"kinase substrate"	"HPRD"
+"171"	"pTEY"	"DUSP1 P28562"	"Dual specificity protein phosphatase 1 substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=16183637"	"phosphatase substrate"	"HPRD"
+"172"	"pT.pY"	"DUSP6 Q16828"	"Dual specificity protein phosphatase 6 substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=11432864"	"phosphatase substrate"	"HPRD"
+"173"	"RRA(pS|pT)VA"	"PKA_group|MOD_PKA_1"	"PP2A, PP2C substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=7508382,1653021,3027075"	"kinase substrate|ELM"	"HPRD"
+"174"	".R..pSVA"	"Calcineurin (HPRD)"	"PP2B substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=7508382"	"kinase substrate"	"HPRD"
+"175"	".pT.pY."	"Wip1 O15297"	"PP2C delta substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=15807522"	"kinase substrate"	"HPRD"
+"1"	"KCSpTWP"	"14-3-3 (HPRD)"	"14-3-3 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=12819209"	"domain binding"	"HPRD"
+"2"	"R..pS"	"14-3-3 (HPRD)"	"14-3-3 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=9524113,9341175"	"domain binding"	"HPRD"
+"3"	"R.R..pS.P"	"14-3-3 (HPRD)"	"14-3-3 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=9524113"	"domain binding"	"HPRD"
+"4"	"YpTV"	"14-3-3 (HPRD)"	"14-3-3 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=12196105"	"domain binding"	"HPRD"
+"5"	"RS.(pS|pT).P"	"14-3-3 (HPRD)"	"14-3-3 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=15139812"	"domain binding"	"HPRD"
+"6"	"R.(Y|F).pS.P"	"LIG_14-3-3_CanoR_1"	"14-3-3 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=9428519"	"domain binding|ELM"	"HPRD"
+"7"	"RPVSSAApSVY"	"14-3-3 (HPRD)"	"14-3-3 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=9524113"	"domain binding"	"HPRD"
+"8"	"pS(D|E)(D|E)E"	"BARD1 Q99728"	"BARD1 BRCT domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=14578343"	"domain binding"	"HPRD"
+"9"	"DpSG..pS"	"BTRC WD40"	"Beta-TrCP1 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=15070733,12820959"	"domain binding"	"HPRD"
+"10"	"pS(F|Y|H)(V|F|Y)(F|Y)"	"LIG_BRCT_BRCA1_1"	"BRCA1 BRCT domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=14578343"	"domain binding|ELM"	"HPRD"
+"11"	"(I|L)(I|L|P)pTP(R|K)"	"hCDC4 Q969H0"	"CDC4 WD40 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=11734846"	"domain binding"	"HPRD"
+"12"	"HFDpTYLI"	"LIG_FHA_1"	"Chk2 FHA domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=11106755,12049740"	"domain binding|ELM"	"HPRD"
+"13"	"(R|D|H)(L|Y)(L|M)(K|A)pT(Q|L|M|E|V)(K|L|I|R)"	"FHA (HPRD)"	"FHA domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=11106755"	"domain binding"	"HPRD"
+"14"	"S(pS|pT)."	"MDC1 FHA"	"MDC1 BRCT domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=14578343"	"domain binding"	"HPRD"
+"15"	"S(pS|pT)."	"PLK1 PBD"	"Plk1 PBD domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=15139812,12595692,14532005"	"domain binding"	"HPRD"
+"16"	"pSYII"	"RAD9 BRCT (HPRD)"	"RAD9 BRCT domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=14578343"	"domain binding"	"HPRD"
+"17"	"(pS|pT)P"	"DOC_WW_Pin1_4"	"WW domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=11607836,11248545,15139812,10037602"	"domain binding|ELM"	"HPRD"
+"1"	"pYM.M"	"Abl"	"Abl kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8227078"	"kinase substrate"	"HPRD"
+"2"	"EDAIpY"	"Abl"	"Abl kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8125961"	"kinase substrate"	"HPRD"
+"3"	".VIpYAAPF"	"Abl"	"Abl kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8578591"	"kinase substrate"	"HPRD"
+"4"	"EAIpYAAPF"	"Abl"	"Abl kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=11860343,7845468"	"kinase substrate"	"HPRD"
+"5"	"EEIpYEEpY"	"Abl"	"Abl kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=11860343"	"kinase substrate"	"HPRD"
+"6"	"E.IpY..P."	"Abl"	"Abl kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=11860343"	"kinase substrate"	"HPRD"
+"7"	"EEIpYYYVH"	"Abl"	"Abl kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=11860343"	"kinase substrate"	"HPRD"
+"8"	"ERIpYARTK"	"Abl"	"Abl kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=11860343"	"kinase substrate"	"HPRD"
+"9"	"AEV(I|V|L|F)pYAA(P|F)F"	"Abl"	"Abl kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=7845468"	"kinase substrate"	"HPRD"
+"10"	"pY...YY"	"ALK|PLCG1 SH2"	"ALK kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=15938644"	"kinase substrate"	"HPRD"
+"11"	"pY(D|E).(I|L|V|M)"	"ALK"	"ALK kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=16273072"	"kinase substrate"	"HPRD"
+"12"	"(D|E)..pY"	"ALK"	"ALK kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=16273072"	"kinase substrate"	"HPRD"
+"13"	"pY....(F|Y)"	"ALK"	"ALK kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=16273072"	"kinase substrate"	"HPRD"
+"14"	"EE(D|E)IpYFFFF"	"Csk"	"CSK kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=9425036"	"kinase substrate"	"HPRD"
+"15"	"...IpY(M|I|F)FFF"	"Csk"	"CSK kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8982275"	"kinase substrate"	"HPRD"
+"16"	"EEEEpYFELV"	"EGFR"	"EGFR kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8578591"	"kinase substrate"	"HPRD"
+"17"	"(E|D|R|A)(D|E)(D|E)(E|D|I)pY(F|V|I|E)(E|F|D)(L|I|F|V)V"	"EGFR"	"EGFR kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=7845468,8578591"	"kinase substrate"	"HPRD"
+"18"	".(D|E)pY."	"EGFR"	"EGFR kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8578591"	"kinase substrate"	"HPRD"
+"19"	"pYIPP"	"EGFR"	"EGFR kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=12522132"	"kinase substrate"	"HPRD"
+"20"	".(D|E)pY(I|L|V)"	"EGFR"	"EGFR kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8578591"	"kinase substrate"	"HPRD"
+"21"	"EEEIpYEEIE"	"Fes"	"Fes kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8578591"	"kinase substrate"	"HPRD"
+"22"	"(E|A|D)(E|A)(E|A)(I|E|V)pY(D|E)(D|E)(I|V|E)(E|I|V)"	"Fes"	"Fes kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=7845468"	"kinase substrate"	"HPRD"
+"23"	"EEEpYFFLF"	"FGFR (HPRD)"	"FGFR kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8578591"	"kinase substrate"	"HPRD"
+"24"	"A(E|A)EEpY(F|V)F(L|F|M|I|V)F"	"FGFR (HPRD)"	"FGFR kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=7845468,8578591"	"kinase substrate"	"HPRD"
+"25"	"ME(E|N)(I|V)pY(G|E)IFF"	"Fgr"	"Fgr kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8982275"	"kinase substrate"	"HPRD"
+"26"	"KSPGEpYVNIEFG"	"IGF1R|INSR"	"IGF1 receptor kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8530377"	"kinase substrate"	"HPRD"
+"27"	"pYM.M"	"INSR"	"Insulin receptor kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8227078,1312712"	"kinase substrate"	"HPRD"
+"28"	"EE(E|N|D)pY(M|F)(M|F)(M|F|I|E)(M|F)"	"INSR"	"Insulin receptor kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=7845468,8578591"	"kinase substrate"	"HPRD"
+"29"	".EEEpYMMMM"	"INSR"	"Insulin receptor kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8578591"	"kinase substrate"	"HPRD"
+"30"	"KKSRGDpYMTMQIG"	"INSR"	"Insulin receptor kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8530377,1312712"	"kinase substrate"	"HPRD"
+"31"	"KKKLPATGDpYMNMSPVGD"	"INSR"	"Insulin receptor kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8530377,1312712"	"kinase substrate"	"HPRD"
+"32"	"pY..(L|I|V)"	"JAK2"	"JAK2 kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=15143187"	"kinase substrate"	"HPRD"
+"33"	"pTPpY"	"MAP2K7|MAP2K6"	"JNK kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=11390361"	"kinase substrate"	"HPRD"
+"34"	".E.IpYGVLF"	"Lck"	"Lck kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8578591"	"kinase substrate"	"HPRD"
+"35"	"E.(I|V|L|F)pY(G|A)V(L|V|F|I)(F|L|V|I)"	"Lck"	"Lck kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=7845468"	"kinase substrate"	"HPRD"
+"36"	"DEEIpY(E|G)EL."	"Lyn"	"Lyn kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8982275"	"kinase substrate"	"HPRD"
+"37"	"(D|E).......(D|E)..pY..L.......Y..(L|I)"	"Lyn"	"Lyn kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=7526393,10452987"	"kinase substrate"	"HPRD"
+"38"	"EEEEpYVFI."	"PDGFR_group"	"PDGFR kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8578591"	"kinase substrate"	"HPRD"
+"39"	"(L|N)(R|I)TpY"	"PDGFR_group"	"PDGFR kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8843147"	"kinase substrate"	"HPRD"
+"40"	"(D|E)(D|E)(D|E)(D|E)pY(V|E|I)F(I|V|F)"	"PDGFR_group"	"PDGFR kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=7845468,8578591"	"kinase substrate"	"HPRD"
+"41"	"(D|E).......(D|E)..pY..L.......Y..(L|I)"	"SRC_group"	"Src family kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=7526393,10452987"	"kinase substrate"	"HPRD"
+"42"	"(I|V|L|S).pY..(L|I)"	"SRC_group"	"Src family kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=9469421"	"kinase substrate"	"HPRD"
+"43"	"pYM.M"	"SRC"	"Src kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8227078"	"kinase substrate"	"HPRD"
+"44"	"YIpYGSFK"	"SRC"	"Src kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=7558590"	"kinase substrate"	"HPRD"
+"45"	"EEEIpY(G|E)EFD"	"SRC"	"Src kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8578591"	"kinase substrate"	"HPRD"
+"46"	"D(D|E)(E|D|G)(I|V|L)pY(G|E)E(F|I)F"	"SRC"	"Src kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=7845468"	"kinase substrate"	"HPRD"
+"47"	"(D|E).......(D|E)..pY..L.......Y..(L|I)"	"SRC"	"Src kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=7612891,15173175"	"kinase substrate"	"HPRD"
+"48"	"(D|E)(D|E)(E|D|G)(I|V|L)pY(G|E|D)E(F|I|L|V)(D|E)"	"SRC"	"Src kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=7845468"	"kinase substrate"	"HPRD"
+"49"	"pY(A|G|S|T|D|E)"	"SRC"	"Src kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=16273072"	"kinase substrate"	"HPRD"
+"50"	"(E|D|pT|pY).pYEE"	"SYK"	"Syk kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8982275"	"kinase substrate"	"HPRD"
+"51"	"(D|E)pYpY(R|K)"	"PTP1B (HPRD)"	"PTP1B phosphatase substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=11694501,11163213"	"phosphatase substrate"	"HPRD"
+"52"	"EFpY(G|A)TY(G|A)"	"PTP1B (HPRD)"	"PTP1B phosphatase substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=14578355"	"phosphatase substrate"	"HPRD"
+"53"	"E(Y|F|D)pYM"	"PTP1B (HPRD)"	"PTP1B phosphatase substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=12237455"	"phosphatase substrate"	"HPRD"
+"54"	"(E|P)(M|L|I|V|F)pY(G|A).(M|L|I|V|F|Y)A"	"PTP1B (HPRD)"	"PTP1B phosphatase substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=14578355"	"phosphatase substrate"	"HPRD"
+"55"	"RD.Y.TDYpYR"	"PTP1B (HPRD)"	"PTP1B phosphatase substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=12237455"	"phosphatase substrate"	"HPRD"
+"56"	"E(F|D|Y)pY"	"PTP1B (HPRD)"	"PTP1B phosphatase substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=9527843,9843364,12237455"	"phosphatase substrate"	"HPRD"
+"57"	"DpYpYR"	"PTPN6 SH2|PTPN11 SH2"	"PTP1B, TC-PTP phosphatase substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=11163213"	"phosphatase substrate"	"HPRD"
+"58"	"(D|E)FpY(G|A)(F|Y)(A|G)"	"PTPRH SH2 (HPRD)"	"PTPRH phosphatase substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=14578355"	"phosphatase substrate"	"HPRD"
+"59"	"F(M|L|V|I)pY"	"PTPRJ SH2 (HPRD)"	"PTPRJ phosphatase substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=14578355"	"phosphatase substrate"	"HPRD"
+"60"	"(D|E).(L|I|V).pY..(L|I|V)"	"PTPN6 SH2"	"SHP1 phosphatase substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=10660565"	"phosphatase substrate"	"HPRD"
+"61"	"(D|E).(L|I|V)..pY..(L|I|V)"	"PTPN6 SH2"	"SHP1 phosphatase substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=10660565"	"phosphatase substrate"	"HPRD"
+"62"	"(D|E)(D|E)(D|E|L).pY..(F|M|L|V|I)(D|E)"	"PTPN6 SH2"	"SHP1 phosphatase substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=14699166"	"phosphatase substrate"	"HPRD"
+"63"	"(D|E).pY"	"PTPN6 SH2"	"SHP1 phosphatase substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=11994017"	"phosphatase substrate"	"HPRD"
+"64"	"(E|P)(F|I|L)pYA.(F|I|L|V)"	"PTPN6 SH2"	"SHP1 phosphatase substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=14578355"	"phosphatase substrate"	"HPRD"
+"65"	"pYIDL"	"PTPN11 SH2"	"SHP2 phosphatase substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=7504175,9756938"	"phosphatase substrate"	"HPRD"
+"66"	"pYASI"	"PTPN11 SH2"	"SHP2 phosphatase substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=16020478,9756938"	"phosphatase substrate"	"HPRD"
+"67"	"EFpYA.(V|I)G(R|K|H)S"	"PTPN11 SH2"	"SHP2 phosphatase substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=14578355"	"phosphatase substrate"	"HPRD"
+"68"	"(D|E)(D|E)...pYVA"	"TC-PTP SH2 (HPRD)"	"TC-PTP phosphatase substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=7678807"	"phosphatase substrate"	"HPRD"
+"69"	"(E|D|Y)pY"	"TC-PTP SH2 (HPRD)"	"TC-PTP phosphatase substrate motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=11352902,12237455"	"phosphatase substrate"	"HPRD"
+"1"	"pY(E|M|V)(N|V|I)"	"SH3BP2 SH2"	"3BP2 SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=7511210"	"domain binding"	"HPRD"
+"2"	"pYENP"	"ABL1 SH2"	"Abl SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=7511210,7680959"	"domain binding"	"HPRD"
+"3"	"pY..P"	"CRK SH2"	"Crk SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=11994738,11607838,7511210"	"domain binding"	"HPRD"
+"4"	"pYDHP"	"CRK SH2"	"Crk SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=11994738,7680959"	"domain binding"	"HPRD"
+"5"	"pY(T|A|S)(K|R|Q|N)(M|I|V|R)"	"Csk SH2"	"Csk SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=11994738"	"domain binding"	"HPRD"
+"6"	"pY(Y|I|V)N(F|L|I|V)"	"GRB2 SH2"	"Grb2 SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=7511210"	"domain binding"	"HPRD"
+"7"	"pYE.(V|I)"	"FES SH2"	"Fes SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=11994738,7511210"	"domain binding"	"HPRD"
+"8"	"pYEE(I|V)"	"FGR SH2"	"Fgr SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=7680959"	"domain binding"	"HPRD"
+"9"	"pYEDP"	"Fyn SH2"	"Fyn SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=8622893"	"domain binding"	"HPRD"
+"10"	"pY(M|I|L|V).(M|I|L|V)"	"FES SH2|SH3BP2 SH2|Csk SH2|GRB2 SH2|SYK SH2"	"GRB2, 3BP2, Csk, Fes, Syk C-terminal SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=7511210,7680959"	"domain binding"	"HPRD"
+"11"	"pY(Q|Y|V)N(Y|Q|F)"	"GRB2 SH2"	"Grb2 SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=7511210"	"domain binding"	"HPRD"
+"12"	"pY.N"	"GRB2 SH2"	"Grb2 SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=7511210,11994738"	"domain binding"	"HPRD"
+"13"	"(F|Y)pY(E|T|Y|S)N(I|L|V|P|T|Y|S)"	"GRB7 SH2|GRB10 SH2"	"GRB7, GRB10 SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=14679191"	"domain binding"	"HPRD"
+"14"	"pYF.(F|P|L|Y)"	"PTPN6 SH2"	"HCP  SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=7511210"	"domain binding"	"HPRD"
+"15"	"pY(A|E|V)(Y|F|E|S|N|V)(P|F|I|H)"	"ITK SH2"	"Itk SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=10636929"	"domain binding"	"HPRD"
+"16"	"pYDYV"	"Lck SH2|Src SH2"	"Lck and Src SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=16245368"	"domain binding"	"HPRD"
+"17"	"pYDEP"	"NCK SH2"	"Nck SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=7511210,7680959"	"domain binding"	"HPRD"
+"18"	"pYM.M"	"PIK3R1 SH2"	"PI3 Kinase p85 SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=7511210,11994738"	"domain binding"	"HPRD"
+"19"	"pY..M"	"PIK3R1 SH2"	"PI3 Kinase p85 SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=1380456"	"domain binding"	"HPRD"
+"20"	"pYMPMS"	"PIK3R1 SH2"	"PI3 Kinase p85 SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=16020478"	"domain binding"	"HPRD"
+"21"	"pY(L|I|V)E(L|I|V)"	"PLCG1 SH2|PTPN11 SH2"	"PLCgamma C and N-terminal SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=7511210,7680959"	"domain binding"	"HPRD"
+"22"	"pY..P"	"RASA_group SH2"	"RasGAP C-terminal SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=9233798"	"domain binding"	"HPRD"
+"23"	"pYILV.(M|L|I|V|P)"	"RASA_group SH2"	"RasGAP N-terminal SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=9233798"	"domain binding"	"HPRD"
+"24"	"TIpY..(V|I)"	"SH2D1A SH2|SH2D1B SH2"	"SAP and EAT2 SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=10549287"	"domain binding"	"HPRD"
+"25"	"pY(L|V)N(V|P)"	"GRB2 SH2|STAT3 SH2"	"Sem5 SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=7680959"	"domain binding"	"HPRD"
+"26"	"pY(T|V|I).L"	"SHB SH2"	"Shb SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=7537362"	"domain binding"	"HPRD"
+"27"	"pY(I|E|Y|L).(I|L|M)"	"SHC_group SH2|SHC1 SH2|SHC2 SH2"	"SHC SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=7511210"	"domain binding"	"HPRD"
+"28"	"(I|V|L|S).pY..(L|I)"	"PTPN11 SH2|PTPN6 SH2"	"SHIP2 SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=10789675"	"domain binding"	"HPRD"
+"29"	"(V|I|L).pYA.(L|V)"	"PTPN6 SH2"	"SHP1 C-terminal SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=11052678"	"domain binding"	"HPRD"
+"30"	"..pYYM(K|R)"	"PTPN6 SH2"	"SHP1 C-terminal SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=11052678"	"domain binding"	"HPRD"
+"31"	"L(Y|H)pY(M|F).(F|M)"	"PTPN6 SH2"	"SHP1 N-terminal SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=11052678"	"domain binding"	"HPRD"
+"32"	"L.pYA.L"	"PTPN6 SH2"	"SHP1 N-terminal SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=11052678"	"domain binding"	"HPRD"
+"33"	"(I|V).pY..(L|V)"	"PTPN6 SH2"	"SHP1 SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=9148918"	"domain binding"	"HPRD"
+"34"	"(V|I|L).pY(M|L|F).P"	"PTPN6 SH2|PTPN11 SH2"	"SHP1, SHP2 SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=16274240"	"domain binding"	"HPRD"
+"35"	"(T|V|I|Y).pY(A|S|T|V).(I|V|L)"	"PTPN11 SH2"	"SHP2 CSH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=16274240"	"domain binding"	"HPRD"
+"36"	"(I|L|V)(I|L|V)(I|L|V|F|T|Y)pY(T|I|L|V)(I|L)(I|L|V|P)"	"PTPN11 SH2"	"SHP2 C-terminal SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=14679191"	"domain binding"	"HPRD"
+"37"	"(H|F).V.(T|S|A)pY"	"PTPN11 SH2"	"SHP2 N-terminal SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=16142918"	"domain binding"	"HPRD"
+"38"	"(I|V|L).pY(F|M).P"	"PTPN11 SH2"	"SHP2 N-terminal SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=16274240"	"domain binding"	"HPRD"
+"39"	"pY(I|V).(I|V)"	"PTPN11 SH2"	"SHP2 N-terminal SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=7680959"	"domain binding"	"HPRD"
+"40"	"(I|L|V|M).pY(T|V|A).(I|V|L|F)"	"PTPN11 SH2"	"SHP2 N-terminal SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=16274240"	"domain binding"	"HPRD"
+"41"	"(I|V).pY(L|M|T)Y(A|P|T)SG"	"PTPN11 SH2"	"SHP2 N-terminal SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=16274240"	"domain binding"	"HPRD"
+"42"	"W(M|T|V)pY(Y|R)(I|L)."	"PTPN11 SH2"	"SHP2 N-terminal SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=11052678"	"domain binding"	"HPRD"
+"43"	"pYIPP"	"PLCG1 SH2|PTPN11 SH2"	"SHP2, PLCgamma SH2 domain binding motifs"	"https://pubmed.ncbi.nlm.nih.gov/?term=9516477"	"domain binding"	"HPRD"
+"44"	"pYM.M"	"PIK3R1 SH2|Src SH2"	"Src and Abl SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=8227078"	"domain binding"	"HPRD"
+"45"	"pY(R|K|H|Q|E|D)(R|K|H|Q|E|D)(I|P)"	"Src SH2|Fyn SH2|Lck SH2|FGR SH2|ABL1 SH2|CRK SH2|NCK SH2"	"Src, Fyn, Lck, Fgr, Abl, Crk, Nck SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=7680959"	"domain binding"	"HPRD"
+"46"	"PP.pY"	"Src SH2|Fyn SH2|Lck SH2|Csk SH2|NCK SH2|SHC1 SH2"	"Src, Fyn,Csk, Nck and SHC SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=11724572"	"domain binding"	"HPRD"
+"47"	"pYEEI"	"Src SH2|Fyn SH2|Lck SH2"	"Src,Lck and Fyn SH2 domains binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=8578591,7680959"	"domain binding"	"HPRD"
+"48"	"pY(D|E)(P|R)(R|P|Q)"	"STAT1 SH2"	"STAT1 SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=12591923"	"domain binding"	"HPRD"
+"49"	"pY..Q"	"STAT3 SH2"	"STAT3 SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=14966128"	"domain binding"	"HPRD"
+"50"	"pY(M|L|V|I|F)(P|R|K|H)Q"	"STAT3 SH2"	"STAT3 SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=12591923"	"domain binding"	"HPRD"
+"51"	"pY(Q|T|E)(E|Q)(L|I)"	"SYK SH2"	"Syk C-terminal SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=7511210"	"domain binding"	"HPRD"
+"52"	"pYTT(I|L|M)"	"SYK SH2"	"Syk N-terminal SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=8578591"	"domain binding"	"HPRD"
+"53"	"(D|E).......(D|E)..pY..L.......Y..(L|I)"	"SYK SH2|SHC_group SH2|Lyn SH2|ZAP70"	"Syk, ZAP-70, Shc, Lyn SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=10452987"	"domain binding"	"HPRD"
+"54"	"pYEN(F|I|V)"	"FES SH2|SH3BP2 SH2"	"Tensin SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=8578591"	"domain binding"	"HPRD"
+"55"	"pY(M|L|E)EP"	"VAV1 SH2|VAV2 SH2|VAV_group SH2"	"Vav SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=7511210"	"domain binding"	"HPRD"
+"56"	"pYESP"	"VAV1 SH2|VAV2 SH2|VAV_group SH2"	"Vav SH2 domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=9151714"	"domain binding"	"HPRD"
+"57"	"D(N|D).pY"	"CBL PTB"	"Cbl PTB domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=9407100,10078535"	"domain binding"	"HPRD"
+"58"	"N.LpY"	"DOK_group PTB"	"Dok1 PTB domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=14607833,12665569"	"domain binding"	"HPRD"
+"59"	"N..pY"	"FRS2 PTB"	"FRIP PTB domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=9697832"	"domain binding"	"HPRD"
+"60"	"NP.pY"	"SHC1 PTB"	"Shc PTB  domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=7542744,7541030"	"domain binding"	"HPRD"
+"61"	"DD.pY"	"SHB SH2"	"Shb PTB domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=9484780"	"domain binding"	"HPRD"
+"62"	"NP.pYF.R"	"ShcA PTB"	"ShcA PTB domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=8662772"	"domain binding"	"HPRD"
+"63"	"HN(M|L|V|I)(M|L|V|I|N)NP(S|T)pY"	"ShcC PTB"	"ShcC PTB domain binding motif"	"https://pubmed.ncbi.nlm.nih.gov/?term=8662772"	"domain binding"	"HPRD"
+"1"	"R.(pS|pT)"	"PKA_group"	"PKA"	"https://pubmed.ncbi.nlm.nih.gov/1956339"	"kinase substrate"	"Phosida"
+"2"	"R(R|K).(pS|pT)"	"PKA_group"	"PKA"	"https://pubmed.ncbi.nlm.nih.gov/1956339"	"kinase substrate"	"Phosida"
+"3"	"KR..(pS|pT)"	"PKA_group"	"PKA"	"https://pubmed.ncbi.nlm.nih.gov/1956339"	"kinase substrate"	"Phosida"
+"4"	"S..(pS|pT)"	"CK1_group"	"CK1"	"https://pubmed.ncbi.nlm.nih.gov/1956339"	"kinase substrate"	"Phosida"
+"5"	"(S|T)...pS"	"CK1_group"	"CK1"	"https://pubmed.ncbi.nlm.nih.gov/1956339"	"kinase substrate"	"Phosida"
+"6"	"(pS|pT)..E"	"CK2_group"	"CK2"	"https://pubmed.ncbi.nlm.nih.gov/1956339"	"kinase substrate"	"Phosida"
+"7"	"pS...S"	"GSK3"	"GSK3"	"https://pubmed.ncbi.nlm.nih.gov/2156841"	"kinase substrate"	"Phosida"
+"8"	"(pS|pT)P.(K|R)"	"CDK2"	"CDK2"	"https://pubmed.ncbi.nlm.nih.gov/1956339"	"kinase substrate"	"Phosida"
+"9"	"R..(pS|pT)"	"CaM-KII_group"	"CAMK2"	"https://pubmed.ncbi.nlm.nih.gov/1956339"	"kinase substrate"	"Phosida"
+"10"	"R..(pS|pT)V"	"CaM-KII_group"	"CAMK2"	"https://pubmed.ncbi.nlm.nih.gov/1956339"	"kinase substrate"	"Phosida"
+"11"	"P.(pS|pT)P"	"MAP2K_group"	"ERK/MAPK"	"https://pubmed.ncbi.nlm.nih.gov/8325833"	"kinase substrate"	"Phosida"
+"12"	"V.(pS|pT)P"	"MAP2K_group"	"ERK/MAPK"	"https://pubmed.ncbi.nlm.nih.gov/8325833"	"kinase substrate"	"Phosida"
+"13"	"PE(pS|pT)P"	"MAP2K_group"	"ERK/MAPK"	"https://pubmed.ncbi.nlm.nih.gov/8325833"	"kinase substrate"	"Phosida"
+"14"	"R(R|S|T).(pS|pT).(S|T)"	"PKB_group"	"PKB/AKT"	"https://pubmed.ncbi.nlm.nih.gov/15789031"	"kinase substrate"	"Phosida"
+"15"	"R.R..(pS|pT)"	"PKB_group"	"PKB/AKT"	"https://pubmed.ncbi.nlm.nih.gov/15789031"	"kinase substrate"	"Phosida"
+"16"	"R..(pS|pT).R"	"PKC_group"	"PKC"	"https://pubmed.ncbi.nlm.nih.gov/15782149"	"kinase substrate"	"Phosida"
+"17"	"(L|V|I).(R|K)..(pS|pT)"	"PKD"	"PKD"	"https://pubmed.ncbi.nlm.nih.gov/15782149"	"kinase substrate"	"Phosida"
+"18"	"(I|E|V)pY(E|G)(E|D|P|N)(I|V|L)"	"Lck"	"LCK"	"https://pubmed.ncbi.nlm.nih.gov/7845468"	"kinase substrate"	"Phosida"
+"19"	"(I|V|L)pY..(P|F)"	"ABL1"	"ABL"	"https://pubmed.ncbi.nlm.nih.gov/7845468"	"kinase substrate"	"Phosida"
+"20"	"(E|D)..pY..(D|E|A|G|S|T)"	"SRC_group"	"SRC"	"https://pubmed.ncbi.nlm.nih.gov/16273072"	"kinase substrate"	"Phosida"
+"21"	"pY..(I|L|V|M)"	"ALK"	"ALK"	"https://pubmed.ncbi.nlm.nih.gov/16273072"	"kinase substrate"	"Phosida"
+"22"	"(D|P|S|A|E|N).pY(V|L|D|E|I|N|P)"	"EGFR"	"EGFR"	"https://pubmed.ncbi.nlm.nih.gov/16381900"	"kinase substrate"	"Phosida"
+"23"	"(pS|pT)P.(K|R)"	"CDK1"	"CDK1"	"https://pubmed.ncbi.nlm.nih.gov/12501191"	"kinase substrate"	"Phosida"
+"24"	"(pS|pT)P(K|R)"	"CDK1"	"CDK1"	"https://pubmed.ncbi.nlm.nih.gov/12501191"	"kinase substrate"	"Phosida"
+"25"	"(R|K).(pS|pT)(I|L|V)"	"Aurora A"	"AURORA"	"https://pubmed.ncbi.nlm.nih.gov/12408861"	"kinase substrate"	"Phosida"
+"26"	"(R|K|N)R.(pS|pT)(M|L|V|I)"	"Aurora A"	"AURORA-A"	"https://pubmed.ncbi.nlm.nih.gov/16083426"	"kinase substrate"	"Phosida"
+"27"	"(D|E).(pS|pT)(V|I|L|M).(D|E)"	"PLK"	"PLK"	"https://pubmed.ncbi.nlm.nih.gov/12738781"	"kinase substrate"	"Phosida"
+"28"	"(E|D).(pS|pT)(F|L|I|Y|W|V|M)"	"PLK"	"PLK1"	"https://pubmed.ncbi.nlm.nih.gov/12738781"	"kinase substrate"	"Phosida"
+"29"	"L..(pS|pT)"	"NEK6"	"NEK6"	"https://pubmed.ncbi.nlm.nih.gov/12023960"	"kinase substrate"	"Phosida"
+"30"	"L.R..(pS|pT)"	"CHK1"	"CHK1/2"	"https://pubmed.ncbi.nlm.nih.gov/17464182"	"kinase substrate"	"Phosida"
+"31"	"(M|I|L|V).(R|K)..(pS|pT)"	"CHK1"	"CHK1"	"https://pubmed.ncbi.nlm.nih.gov/10648819"	"kinase substrate"	"Phosida"
+"32"	"F..F(pS|pT)(F|Y)"	"PDK1"	"PDK1"	"https://pubmed.ncbi.nlm.nih.gov/11516946"	"kinase substrate"	"Phosida"
+"33"	"(F|L|M)(R|K)(R|K)(pS|pT)"	"NIMA"	"NIMA"	"https://pubmed.ncbi.nlm.nih.gov/8887677"	"kinase substrate"	"Phosida"
diff -r 000000000000 -r ba62d93a9ef5 test-data/test_input_for_anova.sqlite
Binary file test-data/test_input_for_anova.sqlite has changed
diff -r 000000000000 -r ba62d93a9ef5 test-data/test_input_for_anova.tabular
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_input_for_anova.tabular	Mon Jul 11 19:21:19 2022 +0000
@@ -0,0 +1,24 @@
+Phosphopeptide	Sequence10	Sequence7	Gene_Name	Phosphoresidue	UniProt_ID	Description	Function Phosphoresidue(PSP=PhosphoSitePlus.org)	Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/Phosphatases/Binding Domains	Intensity.shL.1A	Intensity.shL.1B	Intensity.shL.1C	Intensity.shR.2A	Intensity.shR.2B	Intensity.shR.2C
+AAAAPDSRVpSEEENLK	MAAAAPDSRVpSEEENLKKTPK	AAPDSRVsEEENLKK	RRP15	pS11	Q9Y3B9	RRP15_HUMAN RRP15-like protein OS=Homo sapiens OX=9606 GN=RRP15 PE=1 SV=2	N/A	CK2alpha | BARD1 Q99728	38150000	39445000	56305000	55338000	7010600	70203000
+AAAITDMADLEELSRLpSPLPPGpSPGSAAR	MADLEELSRLpSPLPPGSPGSA; LSRLSPLPPGpSPGSAARGRAE	LEELSRLsPLPPGSP | LSPLPPGsPGSAARG	AEBP2; AEBP2	pS18, pS24; pS18, pS24	Q6ZN18; Q6ZN18-2	AEBP2_HUMAN Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2 PE=1 SV=2; AEBP2_HUMAN Isoform 2 of Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2	N/A	N/A	5416400	7101800	385280000	208060000	41426000	352400000
+ADALQAGASQFETpSAAK	LQAGASQFETpSAAKLKRKYWW	GASQFETsAAKLKRK	VAMP2; VAMP3	pS80; pS63	P63027; Q15836	VAMP2_HUMAN_Vesicle-associated membrane protein 2 OS=Homo sapiens OX=9606 GN=VAMP2 PE=1 SV=3; VAMP3_HUMAN_Vesicle-associated membrane protein 3 OS=Homo sapiens OX=9606 GN=VAMP3 PE=1 SV=3	N/A	PKD3 | PKCiota	44627000	41445000	69094000	42521000	5738000	61819000
+DQKLpSELDDR	DKVLERDQKLpSELDDRADALQ	LERDQKLsELDDRAD	VAMP1; VAMP1; VAMP1; VAMP2; VAMP3	pS63; pS63; pS63; pS61; pS44	P23763; P23763-2; P23763-3; P63027; Q15836	VAMP1_HUMAN_Vesicle-associated membrane protein 1 OS=Homo sapiens OX=9606 GN=VAMP1 PE=1 SV=1; VAMP1_HUMAN_Isoform 3 of Vesicle-associated membrane protein 1 OS=Homo sapiens OX=9606 GN=VAMP1; VAMP1_HUMAN_Isoform 2 of Vesicle-associated membrane protein 1 OS=Homo sapiens OX=9606 GN=VAMP1; VAMP2_HUMAN_Vesicle-associated membrane protein 2 OS=Homo sapiens OX=9606 GN=VAMP2 PE=1 SV=3; VAMP3_HUMAN_Vesicle-associated membrane protein 3 OS=Homo sapiens OX=9606 GN=VAMP3 PE=1 SV=3	N/A	CK2alpha | PKAbeta | PKAgamma | PKCiota | PDHK1	75542000	44814000	32924000	35016000	11023000	4669900
+EFVpSSDESSSGENK	SESFKSKEFVpSSDESSSGENK	FKSKEFVsSDESSSG	SSRP1	pS667	Q08945	SSRP1_HUMAN FACT complex subunit SSRP1 OS=Homo sapiens OX=9606 GN=SSRP1 PE=1 SV=1	N/A	CK2alpha | CK2a2 | CDK7 | GSK3	12562000	16302000	23000000	7857800	0	18830000
+EGMNPSYDEYADpSDEDQHDAYLER	MNPSYDEYADpSDEDQHDAYLE	SYDEYADsDEDQHDA	SSRP1	pS444	Q08945	SSRP1_HUMAN FACT complex subunit SSRP1 OS=Homo sapiens OX=9606 GN=SSRP1 PE=1 SV=1	N/A	CK2alpha | CK2a2 | CDK7 | CK1alpha | GRK-2 | PDHK1	0	0	0	0	0	0
+IGNEEpSDLEEACILPHpSPINVDK	DDEEKIGNEEpSDLEEACILPH; DLEEACILPHpSPINVDKRPIA	EKIGNEEsDLEEACI | EACILPHsPINVDKR	HERC2	pS1577, pS1588	O95714	HERC2_HUMAN E3 ubiquitin-protein ligase HERC2 OS=Homo sapiens OX=9606 GN=HERC2 PE=1 SV=2	N/A	CK2alpha | GRK-2 | DOC_WW_Pin1_4 | NEK6	167764000	121218000	155736000	140640000	83642000	128468000
+IRAEEEDLAAVPFLApSDNEEEEDEK	EDLAAVPFLApSDNEEEEDEKG	AAVPFLAsDNEEEED	HERC2	pS2928	O95714	HERC2_HUMAN E3 ubiquitin-protein ligase HERC2 OS=Homo sapiens OX=9606 GN=HERC2 PE=1 SV=2	N/A	CK2alpha	22562000	18225000	9119700	11689000	0	0
+KGLLApTpSGNDGTIR	VWCNKKGLLApTSGNDGTIRVW; WCNKKGLLATpSGNDGTIRVWN	NKKGLLAtSGNDGTI | KKGLLATsGNDGTIR	HERC1	pT3445, pS3446	Q15751	HERC1_HUMAN Probable E3 ubiquitin-protein ligase HERC1 OS=Homo sapiens OX=9606 GN=HERC1 PE=1 SV=2	N/A	N/A	7843600	0	241700000	0	0	10042600
+KpSSLVTSK	PTPQDLPQRKpSSLVTSKLAGG; PTPQDLPQRKpSSLVTSKLAG	QDLPQRKsSLVTSKL	ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA	pS108; pS108; pS124; pS131; pS104; pS104; pS120; pS124	O43768; O43768-2; O43768-3; O43768-4; O43768-5; O43768-6; O43768-7; O43768-9	ENSA_HUMAN Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA PE=1 SV=1; ENSA_HUMAN Isoform 2 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 3 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 4 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 5 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 6 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 7 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 9 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA	N/A	N/A	0	0	18629000	0	0	0
+KSpSLVTSK	TPQDLPQRKSpSLVTSKLAGGQ; TPQDLPQRKSpSLVTSKLAG	DLPQRKSsLVTSKLA	ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA	pS109; pS109; pS125; pS132; pS105; pS105; pS121; pS125	O43768; O43768-2; O43768-3; O43768-4; O43768-5; O43768-6; O43768-7; O43768-9	ENSA_HUMAN Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA PE=1 SV=1; ENSA_HUMAN Isoform 2 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 3 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 4 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 5 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 6 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 7 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 9 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA	molecular association, regulation; protein conformation; SNCA(DISRUPTS)	MDC1 FHA | GSK3 | PLK1 PBD	7090300	8341200	9691500	10030000	1675200	9952100
+LpSPNPWQEK	MLAVDIEDRLpSPNPWQEKREI	VDIEDRLsPNPWQEK	HERC2	pS3462	O95714	HERC2_HUMAN E3 ubiquitin-protein ligase HERC2 OS=Homo sapiens OX=9606 GN=HERC2 PE=1 SV=2	N/A	DOC_WW_Pin1_4	0	11706000	12495000	0	7273000	8877800
+NLLEDDpSDEEEDFFLR	SERRNLLEDDpSDEEEDFFLRG	RNLLEDDsDEEEDFF	VAMP4	pS30	O75379	VAMP4_HUMAN_Vesicle-associated membrane protein 4 OS=Homo sapiens OX=9606 GN=VAMP4 PE=1 SV=2	N/A	CK2alpha | GRK-2 | BARD1 Q99728 | Csnk2a1	1592100000	973800000	1011600000	1450300000	631970000	878760000
+pSQKQEEENPAEETGEEK	MpSQKQEEENPAE	______MsQKQEEEN	ENSA; ENSA; ENSA; ENSA; ENSA; ENSA	pS2; pS2; pS2; pS2; pS2; pS2	O43768; O43768-2; O43768-3; O43768-4; O43768-8; O43768-9	ENSA_HUMAN Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA PE=1 SV=1; ENSA_HUMAN Isoform 2 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 3 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 4 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 8 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 9 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA	N/A	N/A	0	0	8765300	0	2355900	14706000
+pTYVDPFTpYEDPNQAVR	EEKHLNQGVRpTYVDPFTYEDP; GVRTYVDPFTpYEDPNQAVREF	HLNQGVRtYVDPFTY | TYVDPFTyEDPNQAV	EPHA4; EPHA4	pT595, pY602; pT544, pY551	P54764; P54764-2	EPHA4_HUMAN Ephrin type-A receptor 4 OS=Homo sapiens OX=9606 GN=EPHA4 PE=1 SV=1; EPHA4_HUMAN Isoform 2 of Ephrin type-A receptor 4 OS=Homo sapiens OX=9606 GN=EPHA4	N/A	EPHA4 | EphA1 | EphA2 | EphA3 | EphA5 | EphA7 | EphA6 | Abl | EphA8 | Fgr | Yes | BLK | HCK | EphB6 | EphB3	725460	0	1651300	655850	646420	0
+QLSEpSFK	SKSSSRQLSEpSFKSKEFVSSD	SSRQLSEsFKSKEFV	SSRP1	pS659	Q08945	SSRP1_HUMAN FACT complex subunit SSRP1 OS=Homo sapiens OX=9606 GN=SSRP1 PE=1 SV=1	N/A	CK2a2 | CDK7 | PKCalpha | PKCbeta | DNAPK | NEK6	68201000	87774000	138300000	95357000	19966000	149110000
+RGpSLEMSSDGEPLSR	SSATSGGRRGpSLEMSSDGEPL	TSGGRRGsLEMSSDG	AEBP2; AEBP2	pS206; pS206	Q6ZN18; Q6ZN18-2	AEBP2_HUMAN Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2 PE=1 SV=2; AEBP2_HUMAN Isoform 2 of Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2	N/A	GSK3	19262000	11103000	19454000	0	1816900	22028000
+SDGpSLEDGDDVHR	IEDGGARSDGpSLEDGDDVHRA	GGARSDGsLEDGDDV	SERINC1	pS364	Q9NRX5	SERC1_HUMAN Serine incorporator 1 OS=Homo sapiens OX=9606 GN=SERINC1 PE=1 SV=1	N/A	PLK1 | PDHK1	31407000	17665000	20892000	23194000	5132400	54893000
+SEpSLTAESR	EGGGLMTRSEpSLTAESRLVHT	GLMTRSEsLTAESRL	HERC1	pS1491	Q15751	HERC1_HUMAN Probable E3 ubiquitin-protein ligase HERC1 OS=Homo sapiens OX=9606 GN=HERC1 PE=1 SV=2	N/A	GRK-2	11766000	13176000	20540000	16963000	4364700	21308000
+STGPTAATGpSNRR	MSTGPTAATGpSNRRLQQTQNQ	GPTAATGsNRRLQQT	VAMP3	pS11	Q15836	VAMP3_HUMAN_Vesicle-associated membrane protein 3 OS=Homo sapiens OX=9606 GN=VAMP3 PE=1 SV=3	N/A	PKCalpha | PKCbeta | PKCzeta	3057100	4718800	12052000	5047700	1070900	8333500
+TEDLEATpSEHFK	RNKTEDLEATpSEHFKTTSQKV	TEDLEATsEHFKTTS	VAMP8	pS55	Q9BV40	VAMP8_HUMAN_Vesicle-associated membrane protein 8 OS=Homo sapiens OX=9606 GN=VAMP8 PE=1 SV=1	activity, inhibited; abolish function in SNARE complex during mast cell secretion, reduces in vitro ensemble vesicle fusion	N/A	20400000	9738500	7862300	0	0	76518000
+TFWpSPELK	SSMNSIKTFWpSPELKKERVLR	NSIKTFWsPELKKER	ERC2	pS187	O15083	ERC2_HUMAN ERC protein 2 OS=Homo sapiens OX=9606 GN=ERC2 PE=1 SV=3	N/A	IKKalpha | IKKbeta | HIPK2 | DOC_WW_Pin1_4	29764000	20957000	24855000	30752000	8304800	23771000
+YFDpSGDYNMAK	CADEMQKYFDpSGDYNMAKAKM; RLQKGQKYFDpSGDYNMAKAKM; MKSVEQKYFDpSGDYNMAKAKM	EMQKYFDsGDYNMAK | KGQKYFDsGDYNMAK | VEQKYFDsGDYNMAK	ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA	pS67; pS67; pS83; pS90; pS63; pS63; pS79; pS83	O43768; O43768-2; O43768-3; O43768-4; O43768-5; O43768-6; O43768-7; O43768-9	ENSA_HUMAN Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA PE=1 SV=1; ENSA_HUMAN Isoform 2 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 3 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 4 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 5 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 6 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 7 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 9 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA	molecular association, regulation; cell cycle regulation; PPP2CA(INDUCES)	GRK-2	323250000	127970000	0	67123000	12790000	71378000
diff -r 000000000000 -r ba62d93a9ef5 test-data/test_input_for_preproc.tabular
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_input_for_preproc.tabular	Mon Jul 11 19:21:19 2022 +0000
@@ -0,0 +1,39 @@
+Proteins	Positions within proteins	Leading proteins	Protein	Fasta headers	Localization prob	Score diff	PEP	Score	Delta score	Score for localization	Localization prob shL.1A	Score diff shL.1A	PEP shL.1A	Score shL.1A	Localization prob shL.1B	Score diff shL.1B	PEP shL.1B	Score shL.1B	Localization prob shL.1C	Score diff shL.1C	PEP shL.1C	Score shL.1C	Localization prob shR.2A	Score diff shR.2A	PEP shR.2A	Score shR.2A	Localization prob shR.2B	Score diff shR.2B	PEP shR.2B	Score shR.2B	Localization prob shR.2C	Score diff shR.2C	PEP shR.2C	Score shR.2C	Diagnostic peak	Number of Phospho (STY)	Amino acid	Sequence window	Modification window	Peptide window coverage	Phospho (STY) Probabilities	Phospho (STY) Score diffs	Position in peptide	Charge	Mass error [ppm]	Identification type shL.1A	Identification type shL.1B	Identification type shL.1C	Identification type shR.2A	Identification type shR.2B	Identification type shR.2C	Intensity	Intensity___1	Intensity___2	Intensity___3	Ratio mod/base	Intensity shL.1A	Intensity shL.1B	Intensity shL.1C	Intensity shR.2A	Intensity shR.2B	Intensity shR.2C	Ratio mod/base shL.1A	Ratio mod/base shL.1B	Ratio mod/base shL.1C	Ratio mod/base shR.2A	Ratio mod/base shR.2B	Ratio mod/base shR.2C	Intensity shL.1A___1	Intensity shL.1A___2	Intensity shL.1A___3	Intensity shL.1B___1	Intensity shL.1B___2	Intensity shL.1B___3	Intensity shL.1C___1	Intensity shL.1C___2	Intensity shL.1C___3	Intensity shR.2A___1	Intensity shR.2A___2	Intensity shR.2A___3	Intensity shR.2B___1	Intensity shR.2B___2	Intensity shR.2B___3	Intensity shR.2C___1	Intensity shR.2C___2	Intensity shR.2C___3	Occupancy shL.1A	Occupancy ratioshL.1A	Occupancy error scale shL.1A	Occupancy shL.1B	Occupancy ratioshL.1B	Occupancy error scale shL.1B	Occupancy shL.1C	Occupancy ratioshL.1C	Occupancy error scale shL.1C	Occupancy shR.2A	Occupancy ratioshR.2A	Occupancy error scale shR.2A	Occupancy shR.2B	Occupancy ratioshR.2B	Occupancy error scale shR.2B	Occupancy shR.2C	Occupancy ratioshR.2C	Occupancy error 
scale shR.2C	Reverse	Potential contaminant	id	Protein group IDs	Positions	Position	Peptide IDs	Mod. peptide IDs	Evidence IDs	MS/MS IDs	Best localization evidence ID	Best localization MS/MS ID	Best localization raw file	Best localization scan number	Best score evidence ID	Best score MS/MS ID	Best score raw file	Best score scan number	Best PEP evidence ID	Best PEP MS/MS ID	Best PEP raw file	Best PEP scan number
+sp|O43768-2|ENSA_HUMAN;sp|O43768|ENSA_HUMAN;sp|O43768-9|ENSA_HUMAN;sp|O43768-3|ENSA_HUMAN;sp|O43768-4|ENSA_HUMAN;sp|O43768-6|ENSA_HUMAN;sp|O43768-5|ENSA_HUMAN;sp|O43768-7|ENSA_HUMAN	108;108;124;124;131;104;104;120	sp|O43768-2|ENSA_HUMAN	sp|O43768-2|ENSA_HUMAN		0.877317	8.54376	0.001041	110.11	55.028	110.11																										1	S	TGDHIPTPQDLPQRKSSLVTSKLAG______	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXXXXXXXPPPPPPPPXXXXXXXXX	KS(0.877)S(0.123)LVTSK	KS(8.54)S(-8.54)LVT(-58.58)S(-72.01)K	2	2	0.022801			By MS/MS				18629000	18629000	0	0		0	0	18629000	0	0	0							0	0	0	0	0	0	18629000	0	0	0	0	0	0	0	0	0	0	0																					700	529	108	108	12310;20039	13742;22688	99166	91729	99166	91729	QE05099	5593	99166	91729	QE05099	5593	99166	91729	QE05099	5593
+sp|O43768-2|ENSA_HUMAN;sp|O43768|ENSA_HUMAN;sp|O43768-9|ENSA_HUMAN;sp|O43768-3|ENSA_HUMAN;sp|O43768-4|ENSA_HUMAN;sp|O43768-6|ENSA_HUMAN;sp|O43768-5|ENSA_HUMAN;sp|O43768-7|ENSA_HUMAN	109;109;125;125;132;105;105;121	sp|O43768-2|ENSA_HUMAN	sp|O43768-2|ENSA_HUMAN		0.877764	9.23011	0.00135208	98.182	25.939	55.754																										1	S	GDHIPTPQDLPQRKSSLVTSKLAG_______	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXXXXXXPPPPPPPPXXXXXXXXXX	KS(0.105)S(0.878)LVT(0.015)S(0.002)K	KS(-9.23)S(9.23)LVT(-17.65)S(-25.69)K	3	2	-0.061619	By MS/MS	By MS/MS	By matching	By matching	By matching	By MS/MS	81973000	81973000	0	0		7090300	8341200	9691500	10030000	1675200	9952100							7090300	0	0	8341200	0	0	9691500	0	0	10030000	0	0	1675200	0	0	9952100	0	0																					701	529	109	109	12310;20039	13742;22688	99164;99165;99168;99169;160369;160370;160371;160372;160373;160374	91727;91728;91731;142479	99164	91727	QE05097	5219	99167	91730	QE05100	5516	99167	91730	QE05100	5516
+CON__P02662	46	CON__P02662	CON__P02662		0.99978	36.4544	1.10E-08	122.19	116.48	122.19																										2	S	VFGKEKVNELSKDIGSESTEDQAMEDIKQME	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;Phospho (STY);X;X;X;X;X;Oxidation (M);X;X;X;X;X;X;X	XXXXXXXXXXXXPPPPPPPPPPPPPPPPXXX	DIGS(1)ES(0.972)T(0.029)EDQAMEDIK	DIGS(36.45)ES(15.33)T(-15.33)EDQAMEDIK	4	2	0.56139	By MS/MS		By MS/MS			By MS/MS	49187000	0	49187000	0	NaN	16494000	0	20139000	0	0	12553000	NaN	NaN	NaN	NaN	NaN	NaN	0	16494000	0	0	0	0	0	20139000	0	0	0	0	0	0	0	0	12553000	0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN		+	2	14	46	46	3452	3862;3863	27864;27865;27866;27867	25820;25821;25822;25823	27865	25821	QE05099	36641	27865	25821	QE05099	36641	27865	25821	QE05099	36641
+CON__P02662	48	CON__P02662	CON__P02662		0.971522	15.3284	1.10E-08	122.19	116.48	122.19																										2	S	GKEKVNELSKDIGSESTEDQAMEDIKQMEAE	X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;Phospho (STY);X;X;X;X;X;Oxidation (M);X;X;X;X;X;X;X;X;X	XXXXXXXXXXPPPPPPPPPPPPPPPPXXXXX	DIGS(1)ES(0.972)T(0.029)EDQAMEDIK	DIGS(36.45)ES(15.33)T(-15.33)EDQAMEDIK	6	2	0.56139	By MS/MS		By MS/MS			By MS/MS	49187000	0	49187000	0	NaN	16494000	0	20139000	0	0	12553000	NaN	NaN	NaN	NaN	NaN	NaN	0	16494000	0	0	0	0	0	20139000	0	0	0	0	0	0	0	0	12553000	0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN		+	3	14	48	48	3452	3862;3863	27864;27865;27866;27867	25820;25821;25822;25823	27865	25821	QE05099	36641	27865	25821	QE05099	36641	27865	25821	QE05099	36641
+CON__P02662	115	CON__P02662	CON__P02662		1	50.1781	4.91E-07	124.08	88.205	50.178																										1	S	RLKKYKVPQLEIVPNSAEERLHSMKEGIHAQ	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXPPPPPPPPPPPPPPXXXXXXXXXXX	VPQLEIVPNS(1)AEER	VPQLEIVPNS(50.18)AEER	10	3	-0.26085	By MS/MS	By matching	By MS/MS	By matching	By matching	By MS/MS	228160000	228160000	0	0	NaN	36938000	3667100	7945800	0	2359500	8418700	NaN	NaN	NaN	NaN	NaN	NaN	36938000	0	0	3667100	0	0	7945800	0	0	0	0	0	2359500	0	0	8418700	0	0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN		+	4	14	115	115	23142	26196	185609;185610;185611;185612;185613;185614;185615	165233;165234;165235;165236	185612	165236	QE05102	41518	185610	165234	QE05097	41110	185610	165234	QE05097	41110
+sp|O43768-2|ENSA_HUMAN;sp|O43768|ENSA_HUMAN;sp|O43768-9|ENSA_HUMAN;sp|O43768-3|ENSA_HUMAN;sp|O43768-4|ENSA_HUMAN;sp|O43768-8|ENSA_HUMAN	2;2;2;2;2;2	sp|O43768-2|ENSA_HUMAN	sp|O43768-2|ENSA_HUMAN		1.0	73.249	3.69e-06	83.395	74.925	83.395																										1	S	______________MSQKQEEENPAEETGEE	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXXXXXXXXPPPPPPPPPPPPPPPP	S(1)QKQEEENPAEETGEEK	S(73.25)QKQEEENPAEET(-73.25)GEEK	1	2	-0.84902			By matching		By matching	By MS/MS	25828000	25828000	0	0		0	0	8765300	0	2355900	14706000							0	0	0	0	0	0	8765300	0	0	0	0	0	2355900	0	0	14706000	0	0																					702	529	2	2	19781	22398	158249;158250;158251	140920	158249	140920	QE05102	12907	158249	140920	QE05102	12907	158249	140920	QE05102	12907
+sp|O43768-2|ENSA_HUMAN;sp|O43768|ENSA_HUMAN;sp|O43768-9|ENSA_HUMAN;sp|O43768-3|ENSA_HUMAN;sp|O43768-4|ENSA_HUMAN;sp|O43768-6|ENSA_HUMAN;sp|O43768-5|ENSA_HUMAN;sp|O43768-7|ENSA_HUMAN;sp|P56211-2|ARP19_HUMAN;sp|P56211|ARP19_HUMAN	67;67;83;83;90;63;63;79;46;62	sp|O43768-2|ENSA_HUMAN;sp|P56211-2|ARP19_HUMAN	sp|O43768-2|ENSA_HUMAN		0.999907	42.1841	4.04e-05	77.894	72.756	77.894																										1	S	DFLMKRLQKGQKYFDSGDYNMAKAKMKNKQL;DFLRKRLQKGQKYFDSGDYNMAKAKMKNKQL	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXXXXXPPPPPPPPPPPXXXXXXXX	YFDS(1)GDYNMAK	Y(-44.9)FDS(42.18)GDY(-42.18)NMAK	4	2	0.090313	By MS/MS	By MS/MS		By matching	By MS/MS	By MS/MS	602510000	602510000	0	0		323250000	127970000	0	67123000	12790000	71378000							323250000	0	0	127970000	0	0	0	0	0	67123000	0	0	12790000	0	0	71378000	0	0																					703	529;2007	67;46	67	23817	26932	190543;190544;190545;190546;190547	169398;169399;169400;169401	190543	169398	QE05097	28697	190543	169398	QE05097	28697	190543	169398	QE05097	28697
+sp|O95714|HERC2_HUMAN;sp|Q9BVR0|HRC23_HUMAN	1577;304	sp|O95714|HERC2_HUMAN	sp|O95714|HERC2_HUMAN		1.0	100.152	1.12e-15	100.15	94.415	100.15																										2	S	KPESTDDEEKIGNEESDLEEACILPHSPINV	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X	XXXXXXXXXXPPPPPPPPPPPPPPPPPPPPP	IGNEES(1)DLEEACILPHS(1)PINVDK	IGNEES(100.15)DLEEACILPHS(100.15)PINVDK	6	3	-0.31776	By matching	By matching	By matching	By matching	By MS/MS	By MS/MS	398730000	0	398730000	0		83882000	60609000	77868000	70320000	41821000	64234000							0	83882000	0	0	60609000	0	0	77868000	0	0	70320000	0	0	41821000	0	0	64234000	0																					1295	867	1577	1577	11517	12858	93270;93271;93272;93273;93274;93275	86700;86701	93271	86701	QE05102	51298	93271	86701	QE05102	51298	93271	86701	QE05102	51298
+sp|O95714|HERC2_HUMAN;sp|Q9BVR0|HRC23_HUMAN	1588;315	sp|O95714|HERC2_HUMAN	sp|O95714|HERC2_HUMAN		1.0	100.152	1.12e-15	100.15	94.415	100.15																										2	S	GNEESDLEEACILPHSPINVDKRPIAIKSPK	X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	PPPPPPPPPPPPPPPPPPPPPPXXXXXXXXX	IGNEES(1)DLEEACILPHS(1)PINVDK	IGNEES(100.15)DLEEACILPHS(100.15)PINVDK	17	3	-0.31776	By matching	By matching	By matching	By matching	By MS/MS	By MS/MS	398730000	0	398730000	0		83882000	60609000	77868000	70320000	41821000	64234000							0	83882000	0	0	60609000	0	0	77868000	0	0	70320000	0	0	41821000	0	0	64234000	0																					1296	867	1588	1588	11517	12858	93270;93271;93272;93273;93274;93275	86700;86701	93271	86701	QE05102	51298	93271	86701	QE05102	51298	93271	86701	QE05102	51298
+sp|O95714|HERC2_HUMAN	2928	sp|O95714|HERC2_HUMAN	sp|O95714|HERC2_HUMAN		1.0	44.9549	6.81e-12	84.285	78.578	44.955																										1	S	IRAEEEDLAAVPFLASDNEEEEDEKGNSGSL	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	PPPPPPPPPPPPPPPPPPPPPPPPPXXXXXX	IRAEEEDLAAVPFLAS(1)DNEEEEDEK	IRAEEEDLAAVPFLAS(44.95)DNEEEEDEK	16	3	-0.24823	By MS/MS	By MS/MS	By matching	By matching			61597000	61597000	0	0		22562000	18225000	9119700	11689000	0	0							22562000	0	0	18225000	0	0	9119700	0	0	11689000	0	0	0	0	0	0	0	0																					1297	867	2928	2928	11904	13281	96043;96044;96045;96046	89048;89049	96044	89049	QE05098	52942	96043	89048	QE05097	52381	96043	89048	QE05097	52381
+sp|O95714|HERC2_HUMAN	1938	sp|O95714|HERC2_HUMAN	sp|O95714|HERC2_HUMAN		0.427104	0.0	4.17e-06	44.164	42.292	44.164																											S	KYDLKLAELPAAAQPSAEDSDTEDDSEAEQT	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXPPPPPPPPPPPPPPPPPPPPPPPPPP	LAELPAAAQPS(0.427)AEDS(0.427)DT(0.142)EDDS(0.003)EAEQTER	LAELPAAAQPS(0)AEDS(0)DT(-4.78)EDDS(-20.87)EAEQT(-37.92)ER	11	3	-1.2171							0	0	0	0		0	0	0	0	0	0							0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0																					1298	867	1938	1938	12395	13829			99721	92163	QE05099	31358	99721	92163	QE05099	31358	99721	92163	QE05099	31358
+sp|O95714|HERC2_HUMAN	1942	sp|O95714|HERC2_HUMAN	sp|O95714|HERC2_HUMAN		0.427104	0.0	4.17e-06	44.164	42.292	44.164																											S	KLAELPAAAQPSAEDSDTEDDSEAEQTERNI	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX	LAELPAAAQPS(0.427)AEDS(0.427)DT(0.142)EDDS(0.003)EAEQTER	LAELPAAAQPS(0)AEDS(0)DT(-4.78)EDDS(-20.87)EAEQT(-37.92)ER	15	3	-1.2171							0	0	0	0		0	0	0	0	0	0							0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0																					1299	867	1942	1942	12395	13829			99721	92163	QE05099	31358	99721	92163	QE05099	31358	99721	92163	QE05099	31358
+sp|O95714|HERC2_HUMAN	3462	sp|O95714|HERC2_HUMAN	sp|O95714|HERC2_HUMAN		1.0	41.1171	0.0267288	41.117	33.02	41.117																										1	S	NGEECMLAVDIEDRLSPNPWQEKREIVSSED	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXXXXXXXPPPPPPPPPXXXXXXXX	LS(1)PNPWQEK	LS(41.12)PNPWQEK	2	2	0.64603		By matching	By MS/MS		By matching	By matching	40352000	40352000	0	0		0	11706000	12495000	0	7273000	8877800							0	0	0	11706000	0	0	12495000	0	0	0	0	0	7273000	0	0	8877800	0	0																					1300	867	3462	3462	14140	15756	112737;112738;112739;112740	102778	112737	102778	QE05099	28079	112737	102778	QE05099	28079	112737	102778	QE05099	28079
+sp|Q08945|SSRP1_HUMAN	667	sp|Q08945|SSRP1_HUMAN	sp|Q08945|SSRP1_HUMAN		0.824557	6.72928	2.29e-05	88.385	80.253	88.385																										1	S	SSRQLSESFKSKEFVSSDESSSGENKSKKKR	X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXXXXXPPPPPPPPPPPPPPXXXXX	EFVS(0.825)S(0.175)DESSSGENK	EFVS(6.73)S(-6.73)DES(-34.1)S(-47.3)S(-52.91)GENK	4	2	-0.31453	By MS/MS	By MS/MS	By MS/MS	By MS/MS		By MS/MS	78553000	78553000	0	0		12562000	16302000	23000000	7857800	0	18830000							12562000	0	0	16302000	0	0	23000000	0	0	7857800	0	0	0	0	0	18830000	0	0																					3469	2387	667	667	6499	7276	53820;53821;53822;53823;53824	51145;51146;51147;51148;51149	53820	51145	QE05097	12983	53820	51145	QE05097	12983	53820	51145	QE05097	12983
+sp|Q08945|SSRP1_HUMAN	444	sp|Q08945|SSRP1_HUMAN	sp|Q08945|SSRP1_HUMAN		0.999939	44.165	7.94e-20	97.469	93.771	97.469																										1	S	GLKEGMNPSYDEYADSDEDQHDAYLERMKEE	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXPPPPPPPPPPPPPPPPPPPPPPPPXXXX	EGMNPSYDEYADS(1)DEDQHDAYLER	EGMNPS(-49.21)Y(-49.82)DEY(-44.17)ADS(44.17)DEDQHDAY(-90.19)LER	13	3	0.19918			By MS/MS			By MS/MS	0	0	0	0		0	0	0	0	0	0							0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0																					3470	2387	444	444	6658	7448	55048;55049	52320;52321	55048	52320	QE05099	31926	55048	52320	QE05099	31926	55048	52320	QE05099	31926
+sp|Q08945|SSRP1_HUMAN	659	sp|Q08945|SSRP1_HUMAN	sp|Q08945|SSRP1_HUMAN		0.999878	39.1416	0.00235198	117.7	65.216	117.7																										1	S	SRGSSSKSSSRQLSESFKSKEFVSSDESSSG	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X	XXXXXXXXXXXPPPPPPPXXXXXXXXXXXXX	QLSES(1)FK	QLS(-39.14)ES(39.14)FK	5	2	0.14738	By MS/MS	By MS/MS	By MS/MS	By MS/MS	By matching	By MS/MS	558700000	558700000	0	0		68201000	87774000	138300000	95357000	19966000	149110000							68201000	0	0	87774000	0	0	138300000	0	0	95357000	0	0	19966000	0	0	149110000	0	0																					3471	2387	659	659	16873	19002	134380;134381;134382;134383;134384;134385	120469;120470;120471;120472;120473	134381	120470	QE05098	17736	134381	120470	QE05098	17736	134381	120470	QE05098	17736
+sp|Q15751|HERC1_HUMAN	3446	sp|Q15751|HERC1_HUMAN	sp|Q15751|HERC1_HUMAN		0.999981	47.2167	0.0187791	47.548	7.8172	47.548																										2	S	VMTCVWCNKKGLLATSGNDGTIRVWNVTKKQ	X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXXPPPPPPPPPPPPPPXXXXXXXX	KGLLAT(1)S(1)GNDGTIR	KGLLAT(47.2)S(47.22)GNDGT(-47.2)IR	7	2	-0.95722	By matching		By MS/MS			By matching	129800000	0	129800000	0		3921800	0	120850000	0	0	5021300							0	3921800	0	0	0	0	0	120850000	0	0	0	0	0	0	0	0	5021300	0																					4421	2824	3446	3446	12194	13609	98227;98228;98229	90789	98227	90789	QE05099	12004	98227	90789	QE05099	12004	98227	90789	QE05099	12004
+sp|Q15751|HERC1_HUMAN	1491	sp|Q15751|HERC1_HUMAN	sp|Q15751|HERC1_HUMAN		0.9956	24.4686	0.000725254	80.245	41.065	80.245																										1	S	STSASEGGGLMTRSESLTAESRLVHTSPNYR	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXXXXXXPPPPPPPPPXXXXXXXXX	S(0.004)ES(0.996)LT(0.001)AESR	S(-24.47)ES(24.47)LT(-30.8)AES(-48.77)R	3	2	-0.02332	By matching	By MS/MS	By MS/MS	By MS/MS	By matching	By MS/MS	88117000	88117000	0	0		11766000	13176000	20540000	16963000	4364700	21308000							11766000	0	0	13176000	0	0	20540000	0	0	16963000	0	0	4364700	0	0	21308000	0	0																					4422	2824	1491	1491	18146	20455	144586;144587;144588;144589;144590;144591	129449;129450;129451;129452	144587	129450	QE05099	10286	144587	129450	QE05099	10286	144587	129450	QE05099	10286
+sp|Q15751|HERC1_HUMAN	1510	sp|Q15751|HERC1_HUMAN	sp|Q15751|HERC1_HUMAN		0.330689	0.0	7.97e-05	45.193	39.23	45.193																											S	ESRLVHTSPNYRLIKSRSESDLSQPESDEEG	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXXXXXXXXPPPPPPPPPPPPPPPP	S(0.331)RS(0.331)ES(0.331)DLS(0.008)QPESDEEGYALSGR	S(0)RS(0)ES(0)DLS(-16.27)QPES(-35.13)DEEGY(-44.24)ALS(-45.11)GR	1	3	0.88872							0	0	0	0		0	0	0	0	0	0							0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0																					4423	2824	1510	1510	19884	22510			159108	141525	QE05102	26609	159108	141525	QE05102	26609	159108	141525	QE05102	26609
+sp|Q15751|HERC1_HUMAN	1512	sp|Q15751|HERC1_HUMAN	sp|Q15751|HERC1_HUMAN		0.473289	2.22394	8.37e-06	56.783	53.982	56.783																											S	RLVHTSPNYRLIKSRSESDLSQPESDEEGYA	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXXXXXXPPPPPPPPPPPPPPPPPP	S(0.284)RS(0.473)ES(0.219)DLS(0.024)QPESDEEGYALSGR	S(-2.22)RS(2.22)ES(-3.34)DLS(-13.02)QPES(-39.32)DEEGY(-52.92)ALS(-56.34)GR	3	3	-0.16378							0	0	0	0		0	0	0	0	0	0							0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0																					4424	2824	1512	1512	19884	22510			159107	141524	QE05101	26243	159107	141524	QE05101	26243	159107	141524	QE05101	26243
+sp|Q15751|HERC1_HUMAN	1514	sp|Q15751|HERC1_HUMAN	sp|Q15751|HERC1_HUMAN		0.330689	0.0	7.97e-05	45.193	39.23	45.193																											S	VHTSPNYRLIKSRSESDLSQPESDEEGYALS	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX	S(0.331)RS(0.331)ES(0.331)DLS(0.008)QPESDEEGYALSGR	S(0)RS(0)ES(0)DLS(-16.27)QPES(-35.13)DEEGY(-44.24)ALS(-45.11)GR	5	3	0.88872							0	0	0	0		0	0	0	0	0	0							0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0																					4425	2824	1514	1514	19884	22510			159108	141525	QE05102	26609	159108	141525	QE05102	26609	159108	141525	QE05102	26609
+sp|Q6ZN18-2|AEBP2_HUMAN;sp|Q6ZN18|AEBP2_HUMAN	18;18	sp|Q6ZN18-2|AEBP2_HUMAN	sp|Q6ZN18-2|AEBP2_HUMAN		0.998316	27.7896	1.21e-62	181.56	176.76	181.56																										2	S	AAITDMADLEELSRLSPLPPGSPGSAARGRA	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X	PPPPPPPPPPPPPPPPPPPPPPPPPPPPXXX	AAAITDMADLEELS(0.002)RLS(0.998)PLPPGS(0.809)PGS(0.191)AAR	AAAIT(-99.88)DMADLEELS(-27.79)RLS(27.79)PLPPGS(6.28)PGS(-6.28)AAR	17	3	0.97551	By matching	By matching	By matching	By MS/MS	By MS/MS	By MS/MS	499850000	0	499850000	0		2708200	3550900	192640000	104030000	20713000	176200000							0	2708200	0	0	3550900	0	0	192640000	0	0	104030000	0	0	20713000	0	0	176200000	0																					5468	3335	18	18	28	35	264;265;266;267;268;269	236;237;238;239	264	236	QE05100	65231	264	236	QE05100	65231	264	236	QE05100	65231
+sp|Q6ZN18-2|AEBP2_HUMAN;sp|Q6ZN18|AEBP2_HUMAN	24;24	sp|Q6ZN18-2|AEBP2_HUMAN	sp|Q6ZN18-2|AEBP2_HUMAN		0.809237	6.27624	1.21e-62	181.56	176.76	181.56																										2	S	ADLEELSRLSPLPPGSPGSAARGRAEPPEEE	X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	PPPPPPPPPPPPPPPPPPPPPPXXXXXXXXX	AAAITDMADLEELS(0.002)RLS(0.998)PLPPGS(0.809)PGS(0.191)AAR	AAAIT(-99.88)DMADLEELS(-27.79)RLS(27.79)PLPPGS(6.28)PGS(-6.28)AAR	23	3	0.97551	By matching	By matching	By matching	By MS/MS	By MS/MS	By MS/MS	499850000	0	499850000	0		2708200	3550900	192640000	104030000	20713000	176200000							0	2708200	0	0	3550900	0	0	192640000	0	0	104030000	0	0	20713000	0	0	176200000	0																					5469	3335	24	24	28	35	264;265;266;267;268;269	236;237;238;239	264	236	QE05100	65231	264	236	QE05100	65231	264	236	QE05100	65231
+sp|Q6ZN18-2|AEBP2_HUMAN;sp|Q6ZN18|AEBP2_HUMAN	206;206	sp|Q6ZN18-2|AEBP2_HUMAN	sp|Q6ZN18-2|AEBP2_HUMAN		0.999982	48.3708	1.18e-09	128.05	118.25	128.05																										1	S	TGGGGSSATSGGRRGSLEMSSDGEPLSRMDS	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXXXXXXPPPPPPPPPPPPPPPXXX	RGS(1)LEMSSDGEPLSR	RGS(48.37)LEMS(-48.37)S(-54.13)DGEPLS(-99.69)R	3	2	-0.10602	By MS/MS	By MS/MS	By MS/MS		By matching	By MS/MS	73663000	73663000	0	0		19262000	11103000	19454000	0	1816900	22028000							19262000	0	0	11103000	0	0	19454000	0	0	0	0	0	1816900	0	0	22028000	0	0																					5470	3335	206	206	17255	19413	137099;137100;137101;137102;137103	122913;122914;122915;122916	137099	122913	QE05097	23240	137099	122913	QE05097	23240	137099	122913	QE05097	23240
+		REV__sp|P35908|K22E_HUMAN	REV__sp|P35908|K22E_HUMAN		1	71.692	0.00457965	71.692	14.102	71.692																										1	S		X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXXXPPPPPPPPPXXXXXXXXXXXX	IIKELS(1)DGR	IIKELS(71.69)DGR	6	2	2.0005	By matching	By MS/MS	By matching	By matching		By matching	431850000	431850000	0	0	NaN	103010000	67359000	64124000	74201000	0	55805000	NaN	NaN	NaN	NaN	NaN	NaN	103010000	0	0	67359000	0	0	64124000	0	0	74201000	0	0	0	0	0	55805000	0	0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	+	+	61	57	252	252	11589	12932	93729;93730;93731;93732;93733;93734	87100	93729	87100	QE05098	47490	93729	87100	QE05098	47490	93729	87100	QE05098	47490
+		REV__sp|Q9NSB4|KRT82_HUMAN	REV__sp|Q9NSB4|KRT82_HUMAN		1	45.368	0.0161156	45.368	28.697	45.368																										1	S		X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXXXXXPPPPPPPPPPXXXXXXXXX	VDGS(1)VCDLRR	VDGS(45.37)VCDLRR	4	2	0.77096	By matching	By matching	By matching	By matching	By matching	By MS/MS	1670400000	1670400000	0	0	NaN	218420000	241200000	328130000	240860000	52984000	294390000	NaN	NaN	NaN	NaN	NaN	NaN	218420000	0	0	241200000	0	0	328130000	0	0	240860000	0	0	52984000	0	0	294390000	0	0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	+	+	62	58	330	330	22307	25289	178961;178962;178963;178964;178965;178966;178967	159240	178961	159240	QE05102	16922	178961	159240	QE05102	16922	178961	159240	QE05102	16922
+		REV__sp|Q6S5H4-2|POTEB_HUMAN	REV__sp|Q6S5H4-2|POTEB_HUMAN		1	51.2862	0.045235	51.286	32.662	51.286																											S		X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXXXXXXPPPPPPPPPXXXXXXXXX	EVS(1)EIEELK	EVS(51.29)EIEELK	3	2	0.81181		By matching	By matching	By matching	By matching	By matching	50767000	50767000	0	0	0.044169	0	8469100	14247000	11062000	1262600	15726000	0	0.056281	0.030122	0.051456	0.037786	0.081346	0	0	0	8469100	0	0	14247000	0	0	11062000	0	0	1262600	0	0	15726000	0	0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	+		63	59	22	22	8166	9110	66515;66516;66517;66518;66519	61714;61715	66516	61715	QE05100	38402	66516	61715	QE05100	38402	66516	61715	QE05100	38402
+sp|Q8IUD2-4|RB6I2_HUMAN;sp|Q8IUD2-2|RB6I2_HUMAN;sp|Q8IUD2-3|RB6I2_HUMAN;sp|Q8IUD2|RB6I2_HUMAN;sp|Q8IUD2-5|RB6I2_HUMAN;sp|O15083|ERC2_HUMAN	191;191;191;191;191;187	sp|Q8IUD2-4|RB6I2_HUMAN	sp|Q8IUD2-4|RB6I2_HUMAN		0.999998	58.0663	0.00181554	89.827	67.799	89.827																										1	S	ESKLSSSMNSIKTFWSPELKKERALRKDEAS	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXXXXXPPPPPPPPXXXXXXXXXXX	TFWS(1)PELK	T(-58.07)FWS(58.07)PELK	4	2	0.075831	By MS/MS	By MS/MS	By MS/MS	By MS/MS	By MS/MS	By MS/MS	138400000	138400000	0	0		29764000	20957000	24855000	30752000	8304800	23771000							29764000	0	0	20957000	0	0	24855000	0	0	30752000	0	0	8304800	0	0	23771000	0	0																					6037	3584	191	191	21148	23984	169817;169818;169819;169820;169821;169822	151176;151177;151178;151179;151180;151181	169822	151181	QE05102	49176	169822	151181	QE05102	49176	169822	151181	QE05102	49176
+sp|Q9NRX5|SERC1_HUMAN	364	sp|Q9NRX5|SERC1_HUMAN	sp|Q9NRX5|SERC1_HUMAN		0.999996	54.0798	2.24e-16	159.22	148.1	159.22																										1	S	DESTLIEDGGARSDGSLEDGDDVHRAVDNER	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXXXXXPPPPPPPPPPPPPXXXXXX	SDGS(1)LEDGDDVHR	S(-54.08)DGS(54.08)LEDGDDVHR	4	2	0.64808	By MS/MS	By MS/MS	By matching	By MS/MS	By MS/MS	By MS/MS	222110000	222110000	0	0		31407000	17665000	20892000	23194000	5132400	54893000							31407000	0	0	17665000	0	0	20892000	0	0	23194000	0	0	5132400	0	0	54893000	0	0																					8729	5187	364	364	17793	20026	141355;141356;141357;141358;141359;141360;141361;141362;141363;141364;141365	126543;126544;126545;126546;126547;126548;126549	141361	126549	QE05102	10564	141361	126549	QE05102	10564	141361	126549	QE05102	10564
+sp|Q9Y3B9|RRP15_HUMAN	11	sp|Q9Y3B9|RRP15_HUMAN	sp|Q9Y3B9|RRP15_HUMAN		0.997432	25.8922	9.39e-31	175.33	139.7	175.33																										1	S	_____MAAAAPDSRVSEEENLKKTPKKKMKM	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXPPPPPPPPPPPPPPPPXXXXXXXXX	AAAAPDS(0.003)RVS(0.997)EEENLK	AAAAPDS(-25.89)RVS(25.89)EEENLK	10	2	-0.029697	By matching	By matching	By MS/MS	By MS/MS	By MS/MS	By MS/MS	266450000	266450000	0	0		38150000	39445000	56305000	55338000	7010600	70203000							38150000	0	0	39445000	0	0	56305000	0	0	55338000	0	0	7010600	0	0	70203000	0	0																					9895	5791	11	11	12	17	158;159;160;161;162;163	166;167;168;169	159	167	QE05100	23225	159	167	QE05100	23225	159	167	QE05100	23225
+sp|Q15751|HERC1_HUMAN	3445	sp|Q15751|HERC1_HUMAN	sp|Q15751|HERC1_HUMAN		0.999981	47.2024	0.0187791	47.548	7.8172	47.548																										2	T	RVMTCVWCNKKGLLATSGNDGTIRVWNVTKK	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXXXPPPPPPPPPPPPPPXXXXXXX	KGLLAT(1)S(1)GNDGTIR	KGLLAT(47.2)S(47.22)GNDGT(-47.2)IR	6	2	-0.95722	By matching		By MS/MS			By matching	129800000	0	129800000	0		3921800	0	120850000	0	0	5021300							0	3921800	0	0	0	0	0	120850000	0	0	0	0	0	0	0	0	5021300	0																					10983	2824	3445	3445	12194	13609	98227;98228;98229	90789	98227	90789	QE05099	12004	98227	90789	QE05099	12004	98227	90789	QE05099	12004
+sp|O75379|VAMP4_HUMAN	30	sp|O75379|VAMP4_HUMAN	sp|O75379|VAMP4_HUMAN		1	67.6437	1.44E-52	203.56	187.24	67.644																										1	S	TGSVKSERRNLLEDDSDEEEDFFLRGPSGPR	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXXPPPPPPPPPPPPPPPPPPPPPP	NLLEDDS(1)DEEEDFFLR	NLLEDDS(67.64)DEEEDFFLR	7	3	-0.051914	By MS/MS	By MS/MS	By MS/MS	By MS/MS	By MS/MS	By MS/MS	7929000000	7929000000	0	0	NaN	1592100000	973800000	1011600000	1450300000	631970000	878760000	NaN	NaN	NaN	NaN	NaN	NaN	1592100000	0	0	973800000	0	0	1011600000	0	0	1450300000	0	0	631970000	0	0	878760000	0	0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN			963	669	30	30	15558;15559	17538;17539	124829;124830;124831;124832;124833;124834;124835;124836;124837;124838;124839;124840;124841;124842;124843;124844;124845;124846	112951;112952;112953;112954;112955;112956;112957;112958;112959;112960;112961;112962;112963;112964;112965;112966;112967;112968;112969;112970;112971;112972	124840	112969	QE05102	57877	124833	112957	QE05099	57820	124833	112957	QE05099	57820
+sp|O95183|VAMP5_HUMAN	48	sp|O95183|VAMP5_HUMAN	sp|O95183|VAMP5_HUMAN		0.72657	5.36697	5.72E-05	79.514	55.133	79.514																										1	S	KLAELQQRSDQLLDMSSTFNKTTQNLAQKKC	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXPPPPPPPPPPPPPXXXXXXXXXX	SDQLLDMS(0.727)S(0.211)T(0.062)FNK	S(-64.13)DQLLDMS(5.37)S(-5.37)T(-10.67)FNK	8	2	-0.18713	By matching	By matching	By MS/MS	By matching	By matching	By matching	86590000	86590000	0	0	0.032027	17447000	15753000	20219000	14001000	6284700	12885000	0.028348	0.025719	0.032895	0.033925	0.083789	0.034516	17447000	0	0	15753000	0	0	20219000	0	0	14001000	0	0	6284700	0	0	12885000	0	0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN			1189	809	48	48	17891	20149	142427;142428;142429;142430;142431;142432	127454	142427	127454	QE05099	48504	142427	127454	QE05099	48504	142427	127454	QE05099	48504
+sp|Q15836|VAMP3_HUMAN;sp|P63027|VAMP2_HUMAN	63;80	sp|Q15836|VAMP3_HUMAN	sp|Q15836|VAMP3_HUMAN		0.920811	10.6555	1.81E-09	124.1	98.278	107.25																										1	S	DRADALQAGASQFETSAAKLKRKYWWKNCKM	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXPPPPPPPPPPPPPPPPPXXXXXXXXXXXX	ADALQAGASQFET(0.079)S(0.921)AAK	ADALQAGAS(-49.99)QFET(-10.66)S(10.66)AAK	14	2	0.23449	By MS/MS	By MS/MS	By MS/MS	By MS/MS	By matching	By MS/MS	265240000	265240000	0	0	0.036151	44627000	41445000	69094000	42521000	5738000	61819000	0.03226	0.028442	0.039791	0.036967	0.030963	0.043392	44627000	0	0	41445000	0	0	69094000	0	0	42521000	0	0	5738000	0	0	61819000	0	0	0.47624	0.90925	12.188	0.51677	1.0694	7.2217	NaN	NaN	NaN	0.81588	4.4311	19.209	NaN	NaN	NaN	0.4388	0.78189	5.9861			4442	2836	63	63	279	319	2297;2298;2299;2300;2301;2302	1992;1993;1994;1995;1996	2300	1995	QE05100	30086	2301	1996	QE05102	30007	2301	1996	QE05102	30007
+sp|Q15836|VAMP3_HUMAN;sp|P63027|VAMP2_HUMAN;sp|P23763-2|VAMP1_HUMAN;sp|P23763-3|VAMP1_HUMAN;sp|P23763|VAMP1_HUMAN	44;61;63;63;63	sp|Q15836|VAMP3_HUMAN	sp|Q15836|VAMP3_HUMAN		1	65.4951	2.36E-06	126.19	98.602	65.495																										1	S	MRVNVDKVLERDQKLSELDDRADALQAGASQ	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXXXXPPPPPPPPPPXXXXXXXXXX	DQKLS(1)ELDDR	DQKLS(65.5)ELDDR	5	3	-0.72518	By MS/MS	By MS/MS	By MS/MS	By MS/MS	By matching	By MS/MS	412950000	412950000	0	0	NaN	75542000	44814000	32924000	35016000	11023000	4669900	NaN	NaN	NaN	NaN	NaN	NaN	75542000	0	0	44814000	0	0	32924000	0	0	35016000	0	0	11023000	0	0	4669900	0	0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN			4443	2836	44	44	4530	5083	37093;37094;37095;37096;37097;37098;37099;37100;37101;37102;37103;37104	34712;34713;34714;34715;34716;34717;34718;34719	37100	34719	QE05102	18436	37093	34712	QE05097	18245	37093	34712	QE05097	18245
+sp|Q15836|VAMP3_HUMAN	11	sp|Q15836|VAMP3_HUMAN	sp|Q15836|VAMP3_HUMAN		0.97018	15.1316	0.000117365	79.652	72.041	79.652																										1	S	_____MSTGPTAATGSNRRLQQTQNQVDEVV	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXPPPPPPPPPPPPPXXXXXXXXXXXX	STGPTAAT(0.03)GS(0.97)NRR	S(-66.94)T(-63.48)GPT(-42.47)AAT(-15.13)GS(15.13)NRR	10	2	-0.15791	By matching	By matching	By MS/MS	By matching	By matching	By MS/MS	34280000	34280000	0	0	NaN	3057100	4718800	12052000	5047700	1070900	8333500	NaN	NaN	NaN	NaN	NaN	NaN	3057100	0	0	4718800	0	0	12052000	0	0	5047700	0	0	1070900	0	0	8333500	0	0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN			4444	2836	11	11	20280	22978	162490;162491;162492;162493;162494;162495	144222;144223	162490	144222	QE05099	7582	162490	144222	QE05099	7582	162490	144222	QE05099	7582
+sp|Q9BV40|VAMP8_HUMAN	55	sp|Q9BV40|VAMP8_HUMAN	sp|Q9BV40|VAMP8_HUMAN		0.959784	13.7778	3.78E-05	91.969	27.98	91.969																										1	S	NLEHLRNKTEDLEATSEHFKTTSQKVARKFW	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXPPPPPPPPPPPPXXXXXXXXXXX	TEDLEAT(0.04)S(0.96)EHFK	T(-83.18)EDLEAT(-13.78)S(13.78)EHFK	8	2	0.40785	By matching	By matching	By matching			By MS/MS	114520000	114520000	0	0	NaN	20400000	9738500	7862300	0	0	76518000	NaN	NaN	NaN	NaN	NaN	NaN	20400000	0	0	9738500	0	0	7862300	0	0	0	0	0	0	0	0	76518000	0	0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN			7902	4687	55	55	21013	23827	168874;168875;168876;168877	150433	168874	150433	QE05102	19524	168874	150433	QE05102	19524	168874	150433	QE05102	19524
+sp|P54764-2|EPHA4_HUMAN;sp|P54764|EPHA4_HUMAN	551;602	sp|P54764-2|EPHA4_HUMAN	sp|P54764-2|EPHA4_HUMAN		0.871707	6.48916	4.61E-08	65.374	58.758	65.374																									+	2	Y	KHLNQGVRTYVDPFTYEDPNQAVREFAKEID	X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXPPPPPPPPPPPPPPPPXXXXXXX	T(0.499)Y(0.501)VDPFT(0.128)Y(0.872)EDPNQAVR	T(0.85)Y(-0.85)VDPFT(-6.49)Y(6.49)EDPNQAVR	8	3	0.97415	By matching		By MS/MS	By matching	By matching		3679100	0	3679100	0	NaN	725460	0	1651300	655850	646420	0	NaN	NaN	NaN	NaN	NaN	NaN	0	725460	0	0	0	0	0	1651300	0	0	655850	0	0	646420	0	0	0	0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN			242	260	551	551	972	999	4968;4969;4970;4971	3421	4968	3421	QE04980	9557	4968	3421	QE04980	9557	4968	3421	QE04980	9557
diff -r 000000000000 -r ba62d93a9ef5 test-data/test_kinase_substrate.tabular
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_kinase_substrate.tabular	Mon Jul 11 19:21:19 2022 +0000
@@ -0,0 +1,5 @@
+GENE	KINASE	KIN_ACC_ID	KIN_ORGANISM	SUBSTRATE	SUB_GENE_ID	SUB_ACC_ID	SUB_GENE	SUB_ORGANISM	SUB_MOD_RSD	SITE_GRP_ID	SITE_+/-7_AA	DOMAIN	IN_VIVO_RXN	IN_VITRO_RXN	CST_CAT#
+Csnk2a1	CK2A1	Q60737	human	VAMP4	53330	O70480	Vamp4	human	S30	454285	RNLLEDDsDEEEDFF		 	X	
+EPHA2	EphA2	P29317	human	EphA2	1969	P29317	EPHA2	human	Y588	450859	QLkPLktyVDPHtyE	EphA2_TM	X	X	7423; 12677
+EPHA4	EphA4	P54764	human	EphA4	2043	P54764	EPHA4	human	Y596	450856	LNQGVRtyVDPFtyE	EphA2_TM		X	
+EPHA4	EphA4	P54764	human	EphA4	2043	P54764	EPHA4	human	Y602	450857	tyVDPFtyEDPNQAV	EphA2_TM		X	
diff -r 000000000000 -r ba62d93a9ef5 test-data/test_networkin.tabular
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_networkin.tabular	Mon Jul 11 19:21:19 2022 +0000
@@ -0,0 +1,101 @@
+#substrate	position	id	networkin_score	tree	netphorest_group	netphorest_score	string_identifier	string_score	substrate_name	sequence	string_path
+VAMP4 (ENSP00000236192)	30	CK2alpha	35.6396	KIN	CK2_group	0.5228	ENSP00000236192	0.85	VAMP4	LLEDDsDEEED	"ENSP00000217244, 0.68 ENSP00000236192"
+SSRP1 (ENSP00000278412)	444	CK2alpha	28.6345	KIN	CK2_group	0.3768	ENSP00000278412	0.874	SSRP1	DEYADsDEDQH	"ENSP00000217244, 0.6992 ENSP00000278412"
+SSRP1 (ENSP00000278412)	667	CK2alpha	22.2088	KIN	CK2_group	0.3168	ENSP00000278412	0.874	SSRP1	SKEFVsSDESS	"ENSP00000217244, 0.6992 ENSP00000278412"
+HERC2 (ENSP00000261609)	1577	CK2alpha	10.7686	KIN	CK2_group	0.5253	ENSP00000261609	0.4514	HERC2	IGNEEsDLEEA	"ENSP00000217244, 0.764 ENSP00000346659, 0.76 ENSP00000261609"
+HERC2 (ENSP00000261609)	2928	CK2alpha	10.7686	KIN	CK2_group	0.4698	ENSP00000261609	0.4514	HERC2	VPFLAsDNEEE	"ENSP00000217244, 0.764 ENSP00000346659, 0.76 ENSP00000261609"
+RRP15 (ENSP00000355899)	11	CK2alpha	8.5484	KIN	CK2_group	0.3566	ENSP00000355899	0.461	RRP15	PDSRVsEEENL	"ENSP00000217244, 0.3688 ENSP00000355899"
+SSRP1 (ENSP00000278412)	444	CK2a2	7.8435	KIN	CK2_group	0.3768	ENSP00000278412	0.615	SSRP1	DEYADsDEDQH	"ENSP00000262506, 0.492 ENSP00000278412"
+SSRP1 (ENSP00000278412)	667	CK2a2	7.7757	KIN	CK2_group	0.3168	ENSP00000278412	0.615	SSRP1	SKEFVsSDESS	"ENSP00000262506, 0.492 ENSP00000278412"
+VAMP2 (ENSP00000314214)	80	PKD3	6.9217	KIN	PKD_group	0.0744	ENSP00000314214	0.949	VAMP2	SQFETsAAKLK	"ENSP00000234179, 0.7592 ENSP00000314214"
+VAMP2 (ENSP00000314214)	61	CK2alpha	6.3122	KIN	CK2_group	0.3338	ENSP00000314214	0.4391	VAMP2	RDQKLsELDDR	"ENSP00000217244, 0.7992 ENSP00000222812, 0.7544 ENSP00000314214"
+VAMP1 (ENSP00000380148)	63	CK2alpha	6.1363	KIN	CK2_group	0.3338	ENSP00000380148	0.4364	VAMP1	RDQKLsELDDR	"ENSP00000217244, 0.7944 ENSP00000222812, 0.7544 ENSP00000380148"
+ERC1 (ENSP00000354158)	191	IKKalpha	5.3194	KIN	IKKalpha_IKKbeta_group	0.031	ENSP00000354158	0.96	ERC1	IKTFWsPELKK	"ENSP00000359424, 0.768 ENSP00000354158"
+ERC1 (ENSP00000354158)	191	IKKalpha	5.3194	KIN	IKKalpha_IKKbeta_group	0.031	ENSP00000354158	0.96	ERC1	IKTFWsPELKK	"ENSP00000359424, 0.768 ENSP00000354158"
+VAMP2 (ENSP00000314214)	61	PKAbeta	4.9293	KIN	PKA_group	0.1153	ENSP00000314214	0.8	VAMP2	RDQKLsELDDR	"ENSP00000359719, 0.64 ENSP00000314214"
+VAMP2 (ENSP00000314214)	61	PKAgamma	4.9293	KIN	PKA_group	0.1153	ENSP00000314214	0.8	VAMP2	RDQKLsELDDR	"ENSP00000366488, 0.64 ENSP00000314214"
+VAMP3 (ENSP00000054666)	44	CK2alpha	4.2842	KIN	CK2_group	0.3338	ENSP00000054666	0.4201	VAMP3	RDQKLsELDDR	"ENSP00000217244, 0.7992 ENSP00000317714, 0.6792 ENSP00000054666"
+VAMP2 (ENSP00000314214)	80	PKCiota	3.8971	KIN	PKC_group	0.0928	ENSP00000314214	0.899	VAMP2	SQFETsAAKLK	"ENSP00000295797, 0.7192 ENSP00000314214"
+SSRP1 (ENSP00000278412)	444	CDK7	3.6159	KIN	CDK7	0.0186	ENSP00000278412	0.903	SSRP1	DEYADsDEDQH	"ENSP00000256443, 0.7224 ENSP00000278412"
+SSRP1 (ENSP00000278412)	444	CK1alpha	3.3573	KIN	CK1_group	0.1264	ENSP00000278412	0.404	SSRP1	DEYADsDEDQH	"ENSP00000261798, 0.3232 ENSP00000278412"
+VAMP3 (ENSP00000054666)	11	PKCalpha	3.0633	KIN	PKC_group	0.4633	ENSP00000054666	0.3277	VAMP3	TAATGsNRRLQ	"ENSP00000284384, 0.6232 ENSP00000359025, 0.6352 ENSP00000054666"
+SSRP1 (ENSP00000278412)	659	PKCalpha	3.0524	KIN	PKC_group	0.4345	ENSP00000278412	0.237	SSRP1	RQLSEsFKSKE	"ENSP00000284384, 0.4552 ENSP00000351885, 0.76 ENSP00000278412"
+VAMP2 (ENSP00000314214)	61	PKCiota	2.7785	KIN	PKC_group	0.0463	ENSP00000314214	0.899	VAMP2	RDQKLsELDDR	"ENSP00000295797, 0.7192 ENSP00000314214"
+SSRP1 (ENSP00000278412)	659	CDK7	2.5961	KIN	CDK7	0.0104	ENSP00000278412	0.903	SSRP1	RQLSEsFKSKE	"ENSP00000256443, 0.7224 ENSP00000278412"
+SSRP1 (ENSP00000278412)	667	CDK7	2.5961	KIN	CDK7	0.0124	ENSP00000278412	0.903	SSRP1	SKEFVsSDESS	"ENSP00000256443, 0.7224 ENSP00000278412"
+ERC1 (ENSP00000354158)	191	IKKbeta	2.571	KIN	IKKalpha_IKKbeta_group	0.031	ENSP00000354158	0.946	ERC1	IKTFWsPELKK	"ENSP00000339151, 0.7568 ENSP00000354158"
+ERC1 (ENSP00000354158)	191	IKKbeta	2.571	KIN	IKKalpha_IKKbeta_group	0.031	ENSP00000354158	0.946	ERC1	IKTFWsPELKK	"ENSP00000339151, 0.7568 ENSP00000354158"
+SSRP1 (ENSP00000278412)	659	PKCbeta	2.4948	KIN	PKC_group	0.4345	ENSP00000278412	0.1743	SSRP1	RQLSEsFKSKE	"ENSP00000305355, 0.7976 ENSP00000366013, 0.7192 ENSP00000284811, 0.7448 ENSP00000278412"
+VAMP3 (ENSP00000054666)	11	PKCbeta	2.4948	KIN	PKC_group	0.4633	ENSP00000054666	0.2393	VAMP3	TAATGsNRRLQ	"ENSP00000305355, 0.512 ENSP00000348986, 0.7616 ENSP00000054666"
+SSRP1 (ENSP00000278412)	659	CK2a2	2.4345	KIN	CK2_group	0.0356	ENSP00000278412	0.615	SSRP1	RQLSEsFKSKE	"ENSP00000262506, 0.492 ENSP00000278412"
+ERC1 (ENSP00000354158)	191	HIPK2	2.2748	KIN	HIPK1_HIPK2_group	0.0463	ENSP00000354158	0.4159	ERC1	IKTFWsPELKK	"ENSP00000263551, 0.7696 ENSP00000286332, 0.7192 ENSP00000354158"
+VAMP3 (ENSP00000054666)	11	PKCzeta	2.0773	KIN	PKC_group	0.4633	ENSP00000054666	0.4263	VAMP3	TAATGsNRRLQ	"ENSP00000367830, 0.7688 ENSP00000320935, 0.796 ENSP00000054666"
+SSRP1 (ENSP00000278412)	659	DNAPK	2.0042	KIN	DNAPK	0.0584	ENSP00000278412	0.56	SSRP1	RQLSEsFKSKE	"ENSP00000313420, 0.448 ENSP00000278412"
+EPHA4 (ENSP00000386829)	602	EphA4	35.9325	KIN	Eph_group	0.1443	ENSP00000281821	1	EPHA4	VDPFTyEDPNQ	1 KIN
+EPHA4 (ENSP00000386829)	596	EphA4	35.921	KIN	Eph_group	0.1442	ENSP00000281821	1	EPHA4	QGVRTyVDPFT	1 KIN
+EPHA4 (ENSP00000386829)	779	EphA4	17.3679	KIN	Eph_group	0.0482	ENSP00000281821	1	EPHA4	DPEAAyTTRGG	1 KIN
+EPHA4 (ENSP00000386829)	798	EphA4	17.3679	KIN	Eph_group	0.0482	ENSP00000281821	1	EPHA4	PEAIAyRKFTS	1 KIN
+EPHA4 (ENSP00000386829)	928	EphA4	17.3679	KIN	Eph_group	0.0482	ENSP00000281821	1	EPHA4	IKMDRyKDNFT	1 KIN
+EPHA4 (ENSP00000386829)	602	EphA1	5.7706	KIN	Eph_group	0.1443	ENSP00000281821	0.907	EPHA4	VDPFTyEDPNQ	"ENSP00000275815, 0.7256 ENSP00000281821"
+EPHA4 (ENSP00000386829)	596	EphA1	5.7688	KIN	Eph_group	0.1442	ENSP00000281821	0.907	EPHA4	QGVRTyVDPFT	"ENSP00000275815, 0.7256 ENSP00000281821"
+EPHA4 (ENSP00000386829)	602	EphA2	5.7678	KIN	Eph_group	0.1443	ENSP00000281821	0.904	EPHA4	VDPFTyEDPNQ	"ENSP00000351209, 0.7232 ENSP00000281821"
+EPHA4 (ENSP00000386829)	602	EphA3	5.7678	KIN	Eph_group	0.1443	ENSP00000281821	0.904	EPHA4	VDPFTyEDPNQ	"ENSP00000337451, 0.7232 ENSP00000281821"
+EPHA4 (ENSP00000386829)	602	EphA5	5.7678	KIN	Eph_group	0.1443	ENSP00000281821	0.904	EPHA4	VDPFTyEDPNQ	"ENSP00000273854, 0.7232 ENSP00000281821"
+EPHA4 (ENSP00000386829)	602	EphA7	5.7678	KIN	Eph_group	0.1443	ENSP00000281821	0.904	EPHA4	VDPFTyEDPNQ	"ENSP00000358309, 0.7232 ENSP00000281821"
+EPHA4 (ENSP00000386829)	602	EphA6	5.7668	KIN	Eph_group	0.1443	ENSP00000281821	0.903	EPHA4	VDPFTyEDPNQ	"ENSP00000374323, 0.7224 ENSP00000281821"
+EPHA4 (ENSP00000386829)	596	EphA2	5.7659	KIN	Eph_group	0.1442	ENSP00000281821	0.904	EPHA4	QGVRTyVDPFT	"ENSP00000351209, 0.7232 ENSP00000281821"
+EPHA4 (ENSP00000386829)	596	EphA3	5.7659	KIN	Eph_group	0.1442	ENSP00000281821	0.904	EPHA4	QGVRTyVDPFT	"ENSP00000337451, 0.7232 ENSP00000281821"
+EPHA4 (ENSP00000386829)	596	EphA5	5.7659	KIN	Eph_group	0.1442	ENSP00000281821	0.904	EPHA4	QGVRTyVDPFT	"ENSP00000273854, 0.7232 ENSP00000281821"
+EPHA4 (ENSP00000386829)	596	EphA7	5.7659	KIN	Eph_group	0.1442	ENSP00000281821	0.904	EPHA4	QGVRTyVDPFT	"ENSP00000358309, 0.7232 ENSP00000281821"
+EPHA4 (ENSP00000386829)	596	EphA6	5.765	KIN	Eph_group	0.1442	ENSP00000281821	0.903	EPHA4	QGVRTyVDPFT	"ENSP00000374323, 0.7224 ENSP00000281821"
+EPHA4 (ENSP00000386829)	779	Abl	5.6735	KIN	Abl_group	0.0573	ENSP00000281821	0.806	EPHA4	DPEAAyTTRGG	"ENSP00000361423, 0.6448 ENSP00000281821"
+EPHA4 (ENSP00000386829)	602	EphA8	3.8493	KIN	Eph_group	0.1443	ENSP00000281821	0.576	EPHA4	VDPFTyEDPNQ	"ENSP00000166244, 0.7984 ENSP00000403005, 0.78 ENSP00000281821"
+EPHA4 (ENSP00000386829)	596	EphA8	3.8481	KIN	Eph_group	0.1442	ENSP00000281821	0.576	EPHA4	QGVRTyVDPFT	"ENSP00000166244, 0.7984 ENSP00000403005, 0.78 ENSP00000281821"
+EPHA4 (ENSP00000386829)	596	Abl	3.757	KIN	Abl_group	0.0432	ENSP00000281821	0.806	EPHA4	QGVRTyVDPFT	"ENSP00000361423, 0.6448 ENSP00000281821"
+EPHA4 (ENSP00000386829)	602	Fgr	3.5442	KIN	Src_group	0.0705	ENSP00000281821	0.902	EPHA4	VDPFTyEDPNQ	"ENSP00000363115, 0.7216 ENSP00000281821"
+EPHA4 (ENSP00000386829)	602	Yes	3.5442	KIN	Src_group	0.0705	ENSP00000281821	0.902	EPHA4	VDPFTyEDPNQ	"ENSP00000324740, 0.7216 ENSP00000281821"
+EPHA4 (ENSP00000386829)	602	BLK	3.5431	KIN	Src_group	0.0705	ENSP00000281821	0.9	EPHA4	VDPFTyEDPNQ	"ENSP00000259089, 0.72 ENSP00000281821"
+EPHA4 (ENSP00000386829)	779	Fgr	2.8234	KIN	Src_group	0.0583	ENSP00000281821	0.902	EPHA4	DPEAAyTTRGG	"ENSP00000363115, 0.7216 ENSP00000281821"
+EPHA4 (ENSP00000386829)	779	Yes	2.8234	KIN	Src_group	0.0583	ENSP00000281821	0.902	EPHA4	DPEAAyTTRGG	"ENSP00000324740, 0.7216 ENSP00000281821"
+EPHA4 (ENSP00000386829)	779	BLK	2.8225	KIN	Src_group	0.0583	ENSP00000281821	0.9	EPHA4	DPEAAyTTRGG	"ENSP00000259089, 0.72 ENSP00000281821"
+EPHA4 (ENSP00000386829)	779	EphA1	2.7892	KIN	Eph_group	0.0482	ENSP00000281821	0.907	EPHA4	DPEAAyTTRGG	"ENSP00000275815, 0.7256 ENSP00000281821"
+EPHA4 (ENSP00000386829)	798	EphA1	2.7892	KIN	Eph_group	0.0482	ENSP00000281821	0.907	EPHA4	PEAIAyRKFTS	"ENSP00000275815, 0.7256 ENSP00000281821"
+EPHA4 (ENSP00000386829)	928	EphA1	2.7892	KIN	Eph_group	0.0482	ENSP00000281821	0.907	EPHA4	IKMDRyKDNFT	"ENSP00000275815, 0.7256 ENSP00000281821"
+EPHA4 (ENSP00000386829)	779	EphA2	2.7878	KIN	Eph_group	0.0482	ENSP00000281821	0.904	EPHA4	DPEAAyTTRGG	"ENSP00000351209, 0.7232 ENSP00000281821"
+EPHA4 (ENSP00000386829)	779	EphA3	2.7878	KIN	Eph_group	0.0482	ENSP00000281821	0.904	EPHA4	DPEAAyTTRGG	"ENSP00000337451, 0.7232 ENSP00000281821"
+EPHA4 (ENSP00000386829)	779	EphA5	2.7878	KIN	Eph_group	0.0482	ENSP00000281821	0.904	EPHA4	DPEAAyTTRGG	"ENSP00000273854, 0.7232 ENSP00000281821"
+EPHA4 (ENSP00000386829)	779	EphA7	2.7878	KIN	Eph_group	0.0482	ENSP00000281821	0.904	EPHA4	DPEAAyTTRGG	"ENSP00000358309, 0.7232 ENSP00000281821"
+EPHA4 (ENSP00000386829)	798	EphA2	2.7878	KIN	Eph_group	0.0482	ENSP00000281821	0.904	EPHA4	PEAIAyRKFTS	"ENSP00000351209, 0.7232 ENSP00000281821"
+EPHA4 (ENSP00000386829)	798	EphA3	2.7878	KIN	Eph_group	0.0482	ENSP00000281821	0.904	EPHA4	PEAIAyRKFTS	"ENSP00000337451, 0.7232 ENSP00000281821"
+EPHA4 (ENSP00000386829)	798	EphA5	2.7878	KIN	Eph_group	0.0482	ENSP00000281821	0.904	EPHA4	PEAIAyRKFTS	"ENSP00000273854, 0.7232 ENSP00000281821"
+EPHA4 (ENSP00000386829)	798	EphA7	2.7878	KIN	Eph_group	0.0482	ENSP00000281821	0.904	EPHA4	PEAIAyRKFTS	"ENSP00000358309, 0.7232 ENSP00000281821"
+EPHA4 (ENSP00000386829)	928	EphA2	2.7878	KIN	Eph_group	0.0482	ENSP00000281821	0.904	EPHA4	IKMDRyKDNFT	"ENSP00000351209, 0.7232 ENSP00000281821"
+EPHA4 (ENSP00000386829)	928	EphA3	2.7878	KIN	Eph_group	0.0482	ENSP00000281821	0.904	EPHA4	IKMDRyKDNFT	"ENSP00000337451, 0.7232 ENSP00000281821"
+EPHA4 (ENSP00000386829)	928	EphA5	2.7878	KIN	Eph_group	0.0482	ENSP00000281821	0.904	EPHA4	IKMDRyKDNFT	"ENSP00000273854, 0.7232 ENSP00000281821"
+EPHA4 (ENSP00000386829)	928	EphA7	2.7878	KIN	Eph_group	0.0482	ENSP00000281821	0.904	EPHA4	IKMDRyKDNFT	"ENSP00000358309, 0.7232 ENSP00000281821"
+EPHA4 (ENSP00000386829)	779	EphA6	2.7874	KIN	Eph_group	0.0482	ENSP00000281821	0.903	EPHA4	DPEAAyTTRGG	"ENSP00000374323, 0.7224 ENSP00000281821"
+EPHA4 (ENSP00000386829)	798	EphA6	2.7874	KIN	Eph_group	0.0482	ENSP00000281821	0.903	EPHA4	PEAIAyRKFTS	"ENSP00000374323, 0.7224 ENSP00000281821"
+EPHA4 (ENSP00000386829)	928	EphA6	2.7874	KIN	Eph_group	0.0482	ENSP00000281821	0.903	EPHA4	IKMDRyKDNFT	"ENSP00000374323, 0.7224 ENSP00000281821"
+EPHA4 (ENSP00000386829)	596	Fgr	2.7541	KIN	Src_group	0.036	ENSP00000281821	0.902	EPHA4	QGVRTyVDPFT	"ENSP00000363115, 0.7216 ENSP00000281821"
+EPHA4 (ENSP00000386829)	596	Yes	2.7541	KIN	Src_group	0.036	ENSP00000281821	0.902	EPHA4	QGVRTyVDPFT	"ENSP00000324740, 0.7216 ENSP00000281821"
+EPHA4 (ENSP00000386829)	596	BLK	2.7532	KIN	Src_group	0.036	ENSP00000281821	0.9	EPHA4	QGVRTyVDPFT	"ENSP00000259089, 0.72 ENSP00000281821"
+EPHA4 (ENSP00000386829)	798	Fgr	2.7477	KIN	Src_group	0.0263	ENSP00000281821	0.902	EPHA4	PEAIAyRKFTS	"ENSP00000363115, 0.7216 ENSP00000281821"
+EPHA4 (ENSP00000386829)	798	Yes	2.7477	KIN	Src_group	0.0263	ENSP00000281821	0.902	EPHA4	PEAIAyRKFTS	"ENSP00000324740, 0.7216 ENSP00000281821"
+EPHA4 (ENSP00000386829)	928	Fgr	2.7472	KIN	Src_group	0.0257	ENSP00000281821	0.902	EPHA4	IKMDRyKDNFT	"ENSP00000363115, 0.7216 ENSP00000281821"
+EPHA4 (ENSP00000386829)	928	Yes	2.7472	KIN	Src_group	0.0257	ENSP00000281821	0.902	EPHA4	IKMDRyKDNFT	"ENSP00000324740, 0.7216 ENSP00000281821"
+EPHA4 (ENSP00000386829)	798	BLK	2.7468	KIN	Src_group	0.0263	ENSP00000281821	0.9	EPHA4	PEAIAyRKFTS	"ENSP00000259089, 0.72 ENSP00000281821"
+EPHA4 (ENSP00000386829)	928	BLK	2.7463	KIN	Src_group	0.0257	ENSP00000281821	0.9	EPHA4	IKMDRyKDNFT	"ENSP00000259089, 0.72 ENSP00000281821"
+EPHA4 (ENSP00000386829)	596	HCK	2.7098	KIN	Src_group	0.036	ENSP00000281821	0.899	EPHA4	QGVRTyVDPFT	"ENSP00000365012, 0.7192 ENSP00000281821"
+EPHA4 (ENSP00000386829)	602	HCK	2.7098	KIN	Src_group	0.0705	ENSP00000281821	0.899	EPHA4	VDPFTyEDPNQ	"ENSP00000365012, 0.7192 ENSP00000281821"
+EPHA4 (ENSP00000386829)	779	HCK	2.7098	KIN	Src_group	0.0583	ENSP00000281821	0.899	EPHA4	DPEAAyTTRGG	"ENSP00000365012, 0.7192 ENSP00000281821"
+EPHA4 (ENSP00000386829)	798	HCK	2.7098	KIN	Src_group	0.0263	ENSP00000281821	0.899	EPHA4	PEAIAyRKFTS	"ENSP00000365012, 0.7192 ENSP00000281821"
+EPHA4 (ENSP00000386829)	928	HCK	2.7098	KIN	Src_group	0.0257	ENSP00000281821	0.899	EPHA4	IKMDRyKDNFT	"ENSP00000365012, 0.7192 ENSP00000281821"
+EPHA4 (ENSP00000386829)	780	PKCalpha	2.5567	KIN	PKC_group	0.3699	ENSP00000281821	0.401	EPHA4	PEAAYtTRGGK	"ENSP00000284384, 0.7464 ENSP00000244007, 0.7784 ENSP00000281821"
+EPHA4 (ENSP00000386829)	780	PKCbeta	2.4948	KIN	PKC_group	0.3699	ENSP00000281821	0.3759	EPHA4	PEAAYtTRGGK	"ENSP00000305355, 0.7464 ENSP00000244007, 0.7296 ENSP00000281821"
+EPHA4 (ENSP00000386829)	602	Abl	2.1653	KIN	Abl_group	0.0221	ENSP00000281821	0.806	EPHA4	VDPFTyEDPNQ	"ENSP00000361423, 0.6448 ENSP00000281821"
+EPHA4 (ENSP00000386829)	798	Abl	2.1376	KIN	Abl_group	0.0221	ENSP00000281821	0.806	EPHA4	PEAIAyRKFTS	"ENSP00000361423, 0.6448 ENSP00000281821"
+EPHA4 (ENSP00000386829)	928	Abl	2.1099	KIN	Abl_group	0.0221	ENSP00000281821	0.806	EPHA4	IKMDRyKDNFT	"ENSP00000361423, 0.6448 ENSP00000281821"
+EPHA4 (ENSP00000386829)	602	EphB6	2.04	KIN	Eph_group	0.1443	ENSP00000281821	0.5258	EPHA4	VDPFTyEDPNQ	"ENSP00000376684, 0.7976 ENSP00000226091, 0.7976 ENSP00000281821"
+EPHA4 (ENSP00000386829)	596	EphB6	2.0393	KIN	Eph_group	0.1442	ENSP00000281821	0.5258	EPHA4	QGVRTyVDPFT	"ENSP00000376684, 0.7976 ENSP00000226091, 0.7976 ENSP00000281821"
+EPHA4 (ENSP00000386829)	602	EphB3	2.0282	KIN	Eph_group	0.1443	ENSP00000281821	0.5231	EPHA4	VDPFTyEDPNQ	"ENSP00000332118, 0.7976 ENSP00000226091, 0.7936 ENSP00000281821"
+EPHA4 (ENSP00000386829)	596	EphB3	2.0276	KIN	Eph_group	0.1442	ENSP00000281821	0.5231	EPHA4	QGVRTyVDPFT	"ENSP00000332118, 0.7976 ENSP00000226091, 0.7936 ENSP00000281821"
diff -r 000000000000 -r ba62d93a9ef5 test-data/test_regulatory_sites.tabular
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_regulatory_sites.tabular	Mon Jul 11 19:21:19 2022 +0000
@@ -0,0 +1,9 @@
+32017																				
+"PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words ""PhosphoSitePlus(R), www.phosphosite.org"" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: ""Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926."""																				
+																				
+GENE	PROTEIN	PROT_TYPE	ACC_ID	GENE_ID	HU_CHR_LOC	ORGANISM	MOD_RSD	SITE_GRP_ID	SITE_+/-7_AA	DOMAIN	ON_FUNCTION	ON_PROCESS	ON_PROT_INTERACT	ON_OTHER_INTERACT	PMIDs	LT_LIT	MS_LIT	MS_CST	NOTES	
+ENSA	ENSA	"Inhibitor; Protein phosphatase, regulatory subunit"	O43768	2029	1q21.3	human	S109-p	477819	DLPQRKSsLVTSKLA	Endosulfine	"molecular association, regulation; protein conformation"		SNCA(DISRUPTS)		18973346	1	34	50		
+VAMP8	VAMP8	"Membrane protein, integral; Vesicle"	Q9BV40	8673	2p11.2	human	S55-p	12738929	TEDLEATsEHFKTTS	Synaptobrevin	"activity, inhibited"				27402227	1	8	0	"abolish function in SNARE complex during mast cell secretion, reduces in vitro ensemble vesicle fusion"	
+ENSA	ENSA	"Inhibitor; Protein phosphatase, regulatory subunit"	O43768	2029	1q21.3	human	S67-p	455934	KGQKYFDsGDYNMAK	Endosulfine	"molecular association, regulation"	cell cycle regulation	PPP2CA(INDUCES)		27889260	3	56	47		
+Vamp4	VAMP4	"Membrane protein, integral; Vesicle"	O70480	53330	1 H2.1|1 70.29 cM	mouse	S30-p	454285	RNLLEDDsDEEEDFF		"molecular association, regulation; intracellular localization"		PACS-1(INDUCES)		14608369	1	64	10		
+EPHA4	EphA4	"EC 2.7.10.1; KINASE; Kinase, protein; Membrane protein, integral; Protein kinase, TK; Protein kinase, tyrosine (receptor)"	P54764	2043	2q36.1	human	Y602-p	450857	TYVDPFTyEDPNQAV	EphA2_TM	"molecular association, regulation"		Fyn(INDUCES)		8622893	6	16	155		
diff -r 000000000000 -r ba62d93a9ef5 test-data/test_swissprot.fasta
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_swissprot.fasta	Mon Jul 11 19:21:19 2022 +0000
@@ -0,0 +1,72 @@
+>sp|Q9Y3B9|RRP15_HUMAN RRP15-like protein OS=Homo sapiens OX=9606 GN=RRP15 PE=1 SV=2
+MAAAAPDSRVSEEENLKKTPKKKMKMVTGAVASVLEDEATDTSDSEGSCGSEKDHFYSDDDAIEADSEGDAEPCDKENENDGESSVGTNMGWADAMAKVLNKKTPESKPTILVKNKKLEKEKEKLKQERLEKIKQRDKRLEWEMMCRVKPDVVQDKETERNLQRIATRGVVQLFNAVQKHQKNVDEKVKEAGSSMRKRAKLISTVSKKDFISVLRGMDGSTNETASSRKKPKAKQTEVKSEEGPGWTILRDDFMMGASMKDWDKESDGPDDSRPESASDSDT
+>sp|Q08945|SSRP1_HUMAN FACT complex subunit SSRP1 OS=Homo sapiens OX=9606 GN=SSRP1 PE=1 SV=1
+MAETLEFNDVYQEVKGSMNDGRLRLSRQGIIFKNSKTGKVDNIQAGELTEGIWRRVALGHGLKLLTKNGHVYKYDGFRESEFEKLSDFFKTHYRLELMEKDLCVKGWNWGTVKFGGQLLSFDIGDQPVFEIPLSNVSQCTTGKNEVTLEFHQNDDAEVSLMEVRFYVPPTQEDGVDPVEAFAQNVLSKADVIQATGDAICIFRELQCLTPRGRYDIRIYPTFLHLHGKTFDYKIPYTTVLRLFLLPHKDQRQMFFVISLDPPIKQGQTRYHFLILLFSKDEDISLTLNMNEEEVEKRFEGRLTKNMSGSLYEMVSRVMKALVNRKITVPGNFQGHSGAQCITCSYKASSGLLYPLERGFIYVHKPPVHIRFDEISFVNFARGTTTTRSFDFEIETKQGTQYTFSSIEREEYGKLFDFVNAKKLNIKNRGLKEGMNPSYDEYADSDEDQHDAYLERMKEEGKIREENANDSSDDSGEETDESFNPGEEEEDVAEEFDSNASASSSSNEGDSDRDEKKRKQLKKAKMAKDRKSRKKPVEVKKGKDPNAPKRPMSAYMLWLNASREKIKSDHPGISITDLSKKAGEIWKGMSKEKKEEWDRKAEDARRDYEKAMKEYEGGRGESSKRDKSKKKKKVKVKMEKKSTPSRGSSSKSSSRQLSESFKSKEFVSSDESSSGENKSKKKRRRSEDSEEEELASTPPSSEDSASGSDE
+>sp|Q96SA4|SERC2_HUMAN Serine incorporator 2 OS=Homo sapiens OX=9606 GN=SERINC2 PE=2 SV=3
+MGACLGACSLLSCASCLCGSAPCILCSCCPASRNSTVSRLIFTFFLFLGVLVSIIMLSPGVESQLYKLPWVCEEGAGIPTVLQGHIDCGSLLGYRAVYRMCFATAAFFFFFTLLMLCVSSSRDPRAAIQNGFWFFKFLILVGLTVGAFYIPDGSFTNIWFYFGVVGSFLFILIQLVLLIDFAHSWNQRWLGKAEECDSRAWYAGLFFFTLLFYLLSIAAVALMFMYYTEPSGCHEGKVFISLNLTFCVCVSIAAVLPKVQDAQPNSGLLQASVITLYTMFVTWSALSSIPEQKCNPHLPTQLGNETVVAGPEGYETQWWDAPSIVGLIIFLLCTLFISLRSSDHRQVNSLMQTEECPPMLDATQQQQQVAACEGRAFDNEQDGVTYSYSFFHFCLVLASLHVMMTLTNWYKPGETRKMISTWTAVWVKICASWAGLLLYLWTLVAPLLLRNRDFS
+>sp|Q96SA4-2|SERC2_HUMAN Isoform 2 of Serine incorporator 2 OS=Homo sapiens OX=9606 GN=SERINC2
+MGAEGAPDFLSCPRVRRASCLCGSAPCILCSCCPASRNSTVSRLIFTFFLFLGVLVSIIMLSPGVESQLYKLPWVCEEGAGIPTVLQGHIDCGSLLGYRAVYRMCFATAAFFFFFTLLMLCVSSSRDPRAAIQNGFWFFKFLILVGLTVGAFYIPDGSFTNIWFYFGVVGSFLFILIQLVLLIDFAHSWNQRWLGKAEECDSRAWYAGLFFFTLLFYLLSIAAVALMFMYYTEPSGCHEGKVFISLNLTFCVCVSIAAVLPKVQDAQPNSGLLQASVITLYTMFVTWSALSSIPEQKCNPHLPTQLGNETVVAGPEGYETQWWDAPSIVGLIIFLLCTLFISLRSSDHRQVNSLMQTEECPPMLDATQQQQQVAACEGRAFDNEQDGVTYSYSFFHFCLVLASLHVMMTLTNWYKPGETRKMISTWTAVWVKICASWAGLLLYLWTLVAPLLLRNRDFS
+>sp|Q96SA4-3|SERC2_HUMAN Isoform 3 of Serine incorporator 2 OS=Homo sapiens OX=9606 GN=SERINC2
+MRSMRLREEESPGPSHTASCLCGSAPCILCSCCPASRNSTVSRLIFTFFLFLGVLVSIIMLSPGVESQLYKLPWVCEEGAGIPTVLQGHIDCGSLLGYRAVYRMCFATAAFFFFFTLLMLCVSSSRDPRAAIQNGFWFFKFLILVGLTVGAFYIPDGSFTNIWFYFGVVGSFLFILIQLVLLIDFAHSWNQRWLGKAEECDSRAWYAGLFFFTLLFYLLSIAAVALMFMYYTEPSGCHEGKVFISLNLTFCVCVSIAAVLPKVQDAQPNSGLLQASVITLYTMFVTWSALSSIPEQKCNPHLPTQLGNETVVAGPEGYETQWWDAPSIVGLIIFLLCTLFISLRSSDHRQVNSLMQTEECPPMLDATQQQQQVAACEGRAFDNEQDGVTYSYSFFHFCLVLASLHVMMTLTNWYKPGETRKMISTWTAVWVKICASWAGLLLYLWTLVAPLLLRNRDFS
+>sp|Q96SA4-4|SERC2_HUMAN Isoform 4 of Serine incorporator 2 OS=Homo sapiens OX=9606 GN=SERINC2
+MDGRMMRSMRLREEESPGPSHTASCLCGSAPCILCSCCPASRNSTVSRLIFTFFLFLGVLVSIIMLSPGVESQLYKLPWVCEEGAGIPTVLQGHIDCGSLLGYRAVYRMCFATAAFFFFFTLLMLCVSSSRDPRAAIQNGFWFFKFLILVGLTVGAFYIPDGSFTNIWFYFGVVGSFLFILIQLVLLIDFAHSWNQRWLGKAEECDSRAWYAGLFFFTLLFYLLSIAAVALMFMYYTEPSGCHEGKVFISLNLTFCVCVSIAAVLPKVQDAQPNSGLLQASVITLYTMFVTWSALSSIPEQKCNPHLPTQLGNETVVAGPEGYETQWWDAPSIVGLIIFLLCTLFISLRSSDHRQVNSLMQTEECPPMLDATQQQQQVAACEGRAFDNEQDGVTYSYSFFHFCLVLASLHVMMTLTNWYKPGETRKMISTWTAVWVKICASWAGLLLYLWTLVAPLLLRNRDFS
+>sp|Q9NRX5|SERC1_HUMAN Serine incorporator 1 OS=Homo sapiens OX=9606 GN=SERINC1 PE=1 SV=1
+MGSVLGLCSMASWIPCLCGSAPCLLCRCCPSGNNSTVTRLIYALFLLVGVCVACVMLIPGMEEQLNKIPGFCENEKGVVPCNILVGYKAVYRLCFGLAMFYLLLSLLMIKVKSSSDPRAAVHNGFWFFKFAAAIAIIIGAFFIPEGTFTTVWFYVGMAGAFCFILIQLVLLIDFAHSWNESWVEKMEEGNSRCWYAALLSATALNYLLSLVAIVLFFVYYTHPASCSENKAFISVNMLLCVGASVMSILPKIQESQPRSGLLQSSVITVYTMYLTWSAMTNEPETNCNPSLLSIIGYNTTSTVPKEGQSVQWWHAQGIIGLILFLLCVFYSSIRTSNNSQVNKLTLTSDESTLIEDGGARSDGSLEDGDDVHRAVDNERDGVTYSYSFFHFMLFLASLYIMMTLTNWYRYEPSREMKSQWTAVWVKISSSWIGIVLYVWTLVAPLVLTNRDFD
+>sp|O43768|ENSA_HUMAN Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA PE=1 SV=1
+MSQKQEEENPAEETGEEKQDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGQKYFDSGDYNMAKAKMKNKQLPSAGPDKNLVTGDHIPTPQDLPQRKSSLVTSKLAGGQVE
+>sp|O43768-2|ENSA_HUMAN Isoform 2 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA
+MSQKQEEENPAEETGEEKQDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGQKYFDSGDYNMAKAKMKNKQLPSAGPDKNLVTGDHIPTPQDLPQRKSSLVTSKLAG
+>sp|O43768-3|ENSA_HUMAN Isoform 3 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA
+MSQKQEEENPAEETGEEKQDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGDYKSLHWSVLLCADEMQKYFDSGDYNMAKAKMKNKQLPSAGPDKNLVTGDHIPTPQDLPQRKSSLVTSKLAGGQVE
+>sp|O43768-4|ENSA_HUMAN Isoform 4 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA
+MSQKQEEENPAEETGEEKQDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGVWGIASYPLSLGLKEVLRMKSVEQKYFDSGDYNMAKAKMKNKQLPSAGPDKNLVTGDHIPTPQDLPQRKSSLVTSKLAG
+>sp|O43768-5|ENSA_HUMAN Isoform 5 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA
+MAGGLGCDVCYWFVEDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGQKYFDSGDYNMAKAKMKNKQLPSAGPDKNLVTGDHIPTPQDLPQRKSSLVTSKLAGGQVE
+>sp|O43768-6|ENSA_HUMAN Isoform 6 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA
+MAGGLGCDVCYWFVEDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGQKYFDSGDYNMAKAKMKNKQLPSAGPDKNLVTGDHIPTPQDLPQRKSSLVTSKLAG
+>sp|O43768-7|ENSA_HUMAN Isoform 7 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA
+MAGGLGCDVCYWFVEDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGDYKSLHWSVLLCADEMQKYFDSGDYNMAKAKMKNKQLPSAGPDKNLVTGDHIPTPQDLPQRKSSLVTSKLAGGQVE
+>sp|O43768-8|ENSA_HUMAN Isoform 8 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA
+MSQKQEEENPAEETGEEKQDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGVWGIVSYPLSLELKEVLRMKSVEVLLDPFLEVLLLNRSRGEFEI
+>sp|O43768-9|ENSA_HUMAN Isoform 9 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA
+MSQKQEEENPAEETGEEKQDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGDYKSLHWSVLLCADEMQKYFDSGDYNMAKAKMKNKQLPSAGPDKNLVTGDHIPTPQDLPQRKSSLVTSKLAG
+>sp|Q15751|HERC1_HUMAN Probable E3 ubiquitin-protein ligase HERC1 OS=Homo sapiens OX=9606 GN=HERC1 PE=1 SV=2
+MATMIPPVKLKWLEHLNSSWITEDSESIATREGVAVLYSKLVSNKEVVPLPQQVLCLKGPQLPDFERESLSSDEQDHYLDALLSSQLALAKMVCSDSPFAGALRKRLLVLQRVFYALSNKYHDKGKVKQQQHSPESSSGSADVHSVSERPRSSTDALIEMGVRTGLSLLFALLRQSWMMPVSGPGLSLCNDVIHTAIEVVSSLPPLSLANESKIPPMGLDCLSQVTTFLKGVTIPNSGADTLGRRLASELLLGLAAQRGSLRYLLEWIEMALGASAVVHTMEKGKLLSSQEGMISFDCFMTILMQMRRSLGSSADRSQWREPTRTSDGLCSLYEAALCLFEEVCRMASDYSRTCASPDSIQTGDAPIVSETCEVYVWGSNSSHQLVEGTQEKILQPKLAPSFSDAQTIEAGQYCTFVISTDGSVRACGKGSYGRLGLGDSNNQSTLKKLTFEPHRSIKKVSSSKGSDGHTLAFTTEGEVFSWGDGDYGKLGHGNSSTQKYPKLIQGPLQGKVVVCVSAGYRHSAAVTEDGELYTWGEGDFGRLGHGDSNSRNIPTLVKDISNVGEVSCGSSHTIALSKDGRTVWSFGGGDNGKLGHGDTNRVYKPKVIEALQGMFIRKVCAGSQSSLALTSTGQVYAWGCGACLGCGSSEATALRPKLIEELAATRIVDVSIGDSHCLALSHDNEVYAWGNNSMGQCGQGNSTGPITKPKKVSGLDGIAIQQISAGTSHSLAWTALPRDRQVVAWHRPYCVDLEESTFSHLRSFLERYCDKINSEIPPLPFPSSREHHSFLKLCLKLLSNHLALALAGGVATSILGRQAGPLRNLLFRLMDSTVPDEIQEVVIETLSVGATMLLPPLRERMELLHSLLPQGPDRWESLSKGQRMQLDIILTSLQDHTHVASLLGYSSPSDAADLSSVCTGYGNLSDQPYGTQSCHPDTHLAEILMKTLLRNLGFYTDQAFGELEKNSDKFLLGTSSSENSQPAHLHELLCSLQKQLLAFCHINNISENSSSVALLHKHLQLLLPHATDIYSRSANLLKESPWNGSVGEKLRDVIYVSAAGSMLCQIVNSLLLLPVSVARPLLSYLLDLLPPLDCLNRLLPAADLLEDQELQWPLHGGPELIDPAGLPLPQPAQSWVWLVDLERTIALLIGRCLGGMLQGSPVSPEEQDTAYWMKTPLFSDGVEMDTPQLDKCMSCLLEVALSGNEEQKPFDYKLRPEIAVYVDLALGCSKEPARSLWISMQDYAVSKDWDSATLSNESLLDTVSRFVLAALLKHTNLLSQACGESRYQPGKHLSEVYRCVYKVRSRLLACKNLELIQTRSSSRDRWISENQDSADVDPQEHSFTRTIDEEAEMEEQAERDREEGHPEPEDEEEEREHEVMTAGKIFQCFLSAREVARSRDRDRMNSGAGSGARADDPPPQSQQERRVSTDLPEGQDVYTAACNSVIHRCALLILGVSPVIDELQKRREEGQLQQPSTSASEGGGLMTRSESLTAESRLVHTSPNYRLIKSRSESDLSQPESDEEGYALSGRRNVDLDLAASHRKRGPMHSQLESLSDSWARLKHSRDWLCNSSYSFESDFDLTKSLGVHTLIENVVSFVSGDVGNAPGFKEPEESMSTSPQASIIAMEQQQLRAELRLEALHQILVLLSGMEEKGSISLAGSRLSSGFQSSTLLTSVRLQFLAGCFGLGTVGHTGGKGESGRLHHYQDGIRAAKRNIQIEIQVAVHKIYQQLSATLERALQANKHHIEAQQRLLLVTVFALSVHYQPVDVSLAISTGLLNVLSQLCGTDTMLGQPLQLLPKTGVSQLSTALKVASTRLLQILAITTGTYADKLSPKVVQSLLDLLCSQLKNLLSQTGVLHMASFGEGEQEDGEEEEKKVDSSGETEKKDFRAALRKQHAAELHLGDFLVFLRRVVSSKAIQSKMASPKWTEVLLNIASQKCSSGIPLVGNLRTRLLALHVLEAVLPACESGVEDDQMAQIVERLFSLLSDCMWETPIAQ
AKHAIQIKEKEQEIKLQKQGELEEEDENLPIQEVSFDPEKAQCCLVENGQILTHGSGGKGYGLASTGVTSGCYQWKFYIVKENRGNEGTCVGVSRWPVHDFNHRTTSDMWLYRAYSGNLYHNGEQTLTLSSFTQGDFITCVLDMEARTISFGKNGEEPKLAFEDVDAAELYPCVMFYSSNPGEKVKICDMQMRGTPRDLLPGDPICSPVAAVLAEATIQLIRILHRTDRWTYCINKKMMERLHKIKICIKESGQKLKKSRSVQSREENEMREEKESKEEEKGKHTRHGLADLSELQLRTLCIEVWPVLAVIGGVDAGLRVGGRCVHKQTGRHATLLGVVKEGSTSAKVQWDEAEITISFPTFWSPSDTPLYNLEPCEPLPFDVARFRGLTASVLLDLTYLTGVHEDMGKQSTKRHEKKHRHESEEKGDVEQKPESESALDMRTGLTSDDVKSQSTTSSKSENEIASFSLDPTLPSVESQHQITEGKRKNHEHMSKNHDVAQSEIRAVQLSYLYLGAMKSLSALLGCSKYAELLLIPKVLAENGHNSDCASSPVVHEDVEMRAALQFLMRHMVKRAVMRSPIKRALGLADLERAQAMIYKLVVHGLLEDQFGGKIKQEIDQQAEESDPAQQAQTPVTTSPSASSTTSFMSSSLEDTTTATTPVTDTETVPASESPGVMPLSLLRQMFSSYPTTTVLPTRRAQTPPISSLPTSPSDEVGRRQSLTSPDSQSARPANRTALSDPSSRLSTSPPPPAIAVPLLEMGFSLRQIAKAMEATGARGEADAQNITVLAMWMIEHPGHEDEEEPQSGSTADSRPGAAVLGSGGKSNDPCYLQSPGDIPSADAAEMEEGFSESPDNLDHTENAASGSGPSARGRSAVTRRHKFDLAARTLLARAAGLYRSVQAHRNQSRREGISLQQDPGALYDFNLDEELEIDLDDEAMEAMFGQDLTSDNDILGMWIPEVLDWPTWHVCESEDREEVVVCELCECSVVSFNQHMKRNHPGCGRSANRQGYRSNGSYVDGWFGGECGSGNPYYLLCGTCREKYLAMKTKSKSTSSERYKGQAPDLIGKQDSVYEEDWDMLDVDEDEKLTGEEEFELLAGPLGLNDRRIVPEPVQFPDSDPLGASVAMVTATNSMEETLMQIGCHGSVEKSSSGRITLGEQAAALANPHDRVVALRRVTAAAQVLLARTMVMRALSLLSVSGSSCSLAAGLESLGLTDIRTLVRLMCLAAAGRAGLSTSPSAMASTSERSRGGHSKANKPISCLAYLSTAVGCLASNAPSAAKLLVQLCTQNLISAATGVNLTTVDDSIQRKFLPSFLRGIAEENKLVTSPNFVVTQALVALLADKGAKLRPNYDKSEVEKKGPLELANALAACCLSSRLSSQHRQWAAQQLVRTLAAHDRDNQTTLQTLADMGGDLRKCSFIKLEAHQNRVMTCVWCNKKGLLATSGNDGTIRVWNVTKKQYSLQQTCVFNRLEGDAEESLGSPSDPSFSPVSWSISGKYLAGALEKMVNIWQVNGGKGLVDIQPHWVSALAWPEEGPATAWSGESPELLLVGRMDGSLGLIEVVDVSTMHRRELEHCYRKDVSVTCIAWFSEDRPFAVGYFDGKLLLGTKEPLEKGGIVLIDAHKDTLISMKWDPTGHILMTCAKEDSVKLWGSISGCWCCLHSLCHPSIVNGIAWCRLPGKGSKLQLLMATGCQSGLVCVWRIPQDTTQTNVTSAEGWWEQESNCQDGYRKSSGAKCVYQLRGHITPVRTVAFSSDGLALVSGGLGGLMNIWSLRDGSVLQTVVIGSGAIQTTVWIPEVGVAACSNRSKDVLVVNCTAEWAAANHVLATCRTALKQQGVLGLNMAPCMRAFLERLPMMLQEQYAYEKPHVVCGDQLVHSPYMQCLASLAVGLHLDQLLCNPPVPPHHQNCLPDPASWNPNEWAWLECFSTTIKAAEALTNGAQFPESFTVPDLEPVPEDELVFLMDNSKWINGMDEQIMSWATSRPEDWHLGGKCDV
YLWGAGRHGQLAEAGRNVMVPAAAPSFSQAQQVICGQNCTFVIQANGTVLACGEGSYGRLGQGNSDDLHVLTVISALQGFVVTQLVTSCGSDGHSMALTESGEVFSWGDGDYGKLGHGNSDRQRRPRQIEALQGEEVVQMSCGFKHSAVVTSDGKLFTFGNGDYGRLGLGNTSNKKLPERVTALEGYQIGQVACGLNHTLAVSADGSMVWAFGDGDYGKLGLGNSTAKSSPQKIDVLCGIGIKKVACGTQFSVALTKDGHVYTFGQDRLIGLPEGRARNHNRPQQIPVLAGVIIEDVAVGAEHTLALASNGDVYAWGSNSEGQLGLGHTNHVREPTLVTGLQGKNVRQISAGRCHSAAWTAPPVPPRAPGVSVPLQLGLPDTVPPQYGALREVSIHTVRARLRLLYHFSDLMYSSWRLLNLSPNNQNSTSHYNAGTWGIVQGQLRPLLAPRVYTLPMVRSIGKTMVQGKNYGPQITVKRISTRGRKCKPIFVQIARQVVKLNASDLRLPSRAWKVKLVGEGADDAGGVFDDTITEMCQELETGIVDLLIPSPNATAEVGYNRDRFLFNPSACLDEHLMQFKFLGILMGVAIRTKKPLDLHLAPLVWKQLCCVPLTLEDLEEVDLLYVQTLNSILHIEDSGITEESFHEMIPLDSFVGQSADGKMVPIIPGGNSIPLTFSNRKEYVERAIEYRLHEMDRQVAAVREGMSWIVPVPLLSLLTAKQLEQMVCGMPEISVEVLKKVVRYREVDEQHQLVQWFWHTLEEFSNEERVLFMRFVSGRSRLPANTADISQRFQIMKVDRPYDSLPTSQTCFFQLRLPPYSSQLVMAERLRYAINNCRSIDMDNYMLSRNVDNAEGSDTDY
+>sp|O95714|HERC2_HUMAN E3 ubiquitin-protein ligase HERC2 OS=Homo sapiens OX=9606 GN=HERC2 PE=1 SV=2
+MPSESFCLAAQARLDSKWLKTDIQLAFTRDGLCGLWNEMVKDGEIVYTGTESTQNGELPPRKDDSVEPSGTKKEDLNDKEKKDEEETPAPIYRAKSILDSWVWGKQPDVNELKECLSVLVKEQQALAVQSATTTLSALRLKQRLVILERYFIALNRTVFQENVKVKWKSSGISLPPVDKKSSRPAGKGVEGLARVGSRAALSFAFAFLRRAWRSGEDADLCSELLQESLDALRALPEASLFDESTVSSVWLEVVERATRFLRSVVTGDVHGTPATKGPGSIPLQDQHLALAILLELAVQRGTLSQMLSAILLLLQLWDSGAQETDNERSAQGTSAPLLPLLQRFQSIICRKDAPHSEGDMHLLSGPLSPNESFLRYLTLPQDNELAIDLRQTAVVVMAHLDRLATPCMPPLCSSPTSHKGSLQEVIGWGLIGWKYYANVIGPIQCEGLANLGVTQIACAEKRFLILSRNGRVYTQAYNSDTLAPQLVQGLASRNIVKIAAHSDGHHYLALAATGEVYSWGCGDGGRLGHGDTVPLEEPKVISAFSGKQAGKHVVHIACGSTYSAAITAEGELYTWGRGNYGRLGHGSSEDEAIPMLVAGLKGLKVIDVACGSGDAQTLAVTENGQVWSWGDGDYGKLGRGGSDGCKTPKLIEKLQDLDVVKVRCGSQFSIALTKDGQVYSWGKGDNQRLGHGTEEHVRYPKLLEGLQGKKVIDVAAGSTHCLALTEDSEVHSWGSNDQCQHFDTLRVTKPEPAALPGLDTKHIVGIACGPAQSFAWSSCSEWSIGLRVPFVVDICSMTFEQLDLLLRQVSEGMDGSADWPPPQEKECVAVATLNLLRLQLHAAISHQVDPEFLGLGLGSILLNSLKQTVVTLASSAGVLSTVQSAAQAVLQSGWSVLLPTAEERARALSALLPCAVSGNEVNISPGRRFMIDLLVGSLMADGGLESALHAAITAEIQDIEAKKEAQKEKEIDEQEANASTFHRSRTPLDKDLINTGICESSGKQCLPLVQLIQQLLRNIASQTVARLKDVARRISSCLDFEQHSRERSASLDLLLRFQRLLISKLYPGESIGQTSDISSPELMGVGSLLKKYTALLCTHIGDILPVAASIASTSWRHFAEVAYIVEGDFTGVLLPELVVSIVLLLSKNAGLMQEAGAVPLLGGLLEHLDRFNHLAPGKERDDHEELAWPGIMESFFTGQNCRNNEEVTLIRKADLENHNKDGGFWTVIDGKVYDIKDFQTQSLTGNSILAQFAGEDPVVALEAALQFEDTRESMHAFCVGQYLEPDQEIVTIPDLGSLSSPLIDTERNLGLLLGLHASYLAMSTPLSPVEIECAKWLQSSIFSGGLQTSQIHYSYNEEKDEDHCSSPGGTPASKSRLCSHRRALGDHSQAFLQAIADNNIQDHNVKDFLCQIERYCRQCHLTTPIMFPPEHPVEEVGRLLLCCLLKHEDLGHVALSLVHAGALGIEQVKHRTLPKSVVDVCRVVYQAKCSLIKTHQEQGRSYKEVCAPVIERLRFLFNELRPAVCNDLSIMSKFKLLSSLPRWRRIAQKIIRERRKKRVPKKPESTDDEEKIGNEESDLEEACILPHSPINVDKRPIAIKSPKDKWQPLLSTVTGVHKYKWLKQNVQGLYPQSPLLSTIAEFALKEEPVDVEKMRKCLLKQLERAEVRLEGIDTILKLASKNFLLPSVQYAMFCGWQRLIPEGIDIGEPLTDCLKDVDLIPPFNRMLLEVTFGKLYAWAVQNIRNVLMDASAKFKELGIQPVPLQTITNENPSGPSLGTIPQARFLLVMLSMLTLQHGANNLDLLLNSGMLALTQTALRLIGPSCDNVEEDMNASAQGASATVLEETRKETAPVQLPVSGPELAAMMKIGTRVMRGVDWKWGDQDGPPPGLGRVIGELGEDGWIRVQWDTGSTNSYRMGKEGKYDLKLAELPAAAQPSAEDSDTEDDSEAEQTERNIHPTAMMFTSTINLLQTLCLSAGVHAEIMQSEATKTLCGLLRM
LVESGTTDKTSSPNRLVYREQHRSWCTLGFVRSIALTPQVCGALSSPQWITLLMKVVEGHAPFTATSLQRQILAVHLLQAVLPSWDKTERARDMKCLVEKLFDFLGSLLTTCSSDVPLLRESTLRRRRVRPQASLTATHSSTLAEEVVALLRTLHSLTQWNGLINKYINSQLRSITHSFVGRPSEGAQLEDYFPDSENPEVGGLMAVLAVIGGIDGRLRLGGQVMHDEFGEGTVTRITPKGKITVQFSDMRTCRVCPLNQLKPLPAVAFNVNNLPFTEPMLSVWAQLVNLAGSKLEKHKIKKSTKQAFAGQVDLDLLRCQQLKLYILKAGRALLSHQDKLRQILSQPAVQETGTVHTDDGAVVSPDLGDMSPEGPQPPMILLQQLLASATQPSPVKAIFDKQELEAAALAVCQCLAVESTHPSSPGFEDCSSSEATTPVAVQHIRPARVKRRKQSPVPALPIVVQLMEMGFSRRNIEFALKSLTGASGNASSLPGVEALVGWLLDHSDIQVTELSDADTVSDEYSDEEVVEDVDDAAYSMSTGAVVTESQTYKKRADFLSNDDYAVYVRENIQVGMMVRCCRAYEEVCEGDVGKVIKLDRDGLHDLNVQCDWQQKGGTYWVRYIHVELIGYPPPSSSSHIKIGDKVRVKASVTTPKYKWGSVTHQSVGVVKAFSANGKDIIVDFPQQSHWTGLLSEMELVPSIHPGVTCDGCQMFPINGSRFKCRNCDDFDFCETCFKTKKHNTRHTFGRINEPGQSAVFCGRSGKQLKRCHSSQPGMLLDSWSRMVKSLNVSSSVNQASRLIDGSEPCWQSSGSQGKHWIRLEIFPDVLVHRLKMIVDPADSSYMPSLVVVSGGNSLNNLIELKTININPSDTTVPLLNDCTEYHRYIEIAIKQCRSSGIDCKIHGLILLGRIRAEEEDLAAVPFLASDNEEEEDEKGNSGSLIRKKAAGLESAATIRTKVFVWGLNDKDQLGGLKGSKIKVPSFSETLSALNVVQVAGGSKSLFAVTVEGKVYACGEATNGRLGLGISSGTVPIPRQITALSSYVVKKVAVHSGGRHATALTVDGKVFSWGEGDDGKLGHFSRMNCDKPRLIEALKTKRIRDIACGSSHSAALTSSGELYTWGLGEYGRLGHGDNTTQLKPKMVKVLLGHRVIQVACGSRDAQTLALTDEGLVFSWGDGDFGKLGRGGSEGCNIPQNIERLNGQGVCQIECGAQFSLALTKSGVVWTWGKGDYFRLGHGSDVHVRKPQVVEGLRGKKIVHVAVGALHCLAVTDSGQVYAWGDNDHGQQGNGTTTVNRKPTLVQGLEGQKITRVACGSSHSVAWTTVDVATPSVHEPVLFQTARDPLGASYLGVPSDADSSAASNKISGASNSKPNRPSLAKILLSLDGNLAKQQALSHILTALQIMYARDAVVGALMPAAMIAPVECPSFSSAAPSDASAMASPMNGEECMLAVDIEDRLSPNPWQEKREIVSSEDAVTPSAVTPSAPSASARPFIPVTDDLGAASIIAETMTKTKEDVESQNKAAGPEPQALDEFTSLLIADDTRVVVDLLKLSVCSRAGDRGRDVLSAVLSGMGTAYPQVADMLLELCVTELEDVATDSQSGRLSSQPVVVESSHPYTDDTSTSGTVKIPGAEGLRVEFDRQCSTERRHDPLTVMDGVNRIVSVRSGREWSDWSSELRIPGDELKWKFISDGSVNGWGWRFTVYPIMPAAGPKELLSDRCVLSCPSMDLVTCLLDFRLNLASNRSIVPRLAASLAACAQLSALAASHRMWALQRLRKLLTTEFGQSININRLLGENDGETRALSFTGSALAALVKGLPEALQRQFEYEDPIVRGGKQLLHSPFFKVLVALACDLELDTLPCCAETHKWAWFRRYCMASRVAVALDKRTPLPRLFLDEVAKKIRELMADSENMDVLHESHDIFKREQDEQLVQWMNRRPDDWTLSAGGSGTIYGWGHNHRGQLGGIEGAKVKVPTPCEALATLRPVQLIGGEQTLFA
VTADGKLYATGYGAGGRLGIGGTESVSTPTLLESIQHVFIKKVAVNSGGKHCLALSSEGEVYSWGEAEDGKLGHGNRSPCDRPRVIESLRGIEVVDVAAGGAHSACVTAAGDLYTWGKGRYGRLGHSDSEDQLKPKLVEALQGHRVVDIACGSGDAQTLCLTDDDTVWSWGDGDYGKLGRGGSDGCKVPMKIDSLTGLGVVKVECGSQFSVALTKSGAVYTWGKGDYHRLGHGSDDHVRRPRQVQGLQGKKVIAIATGSLHCVCCTEDGEVYTWGDNDEGQLGDGTTNAIQRPRLVAALQGKKVNRVACGSAHTLAWSTSKPASAGKLPAQVPMEYNHLQEIPIIALRNRLLLLHHLSELFCPCIPMFDLEGSLDETGLGPSVGFDTLRGILISQGKEAAFRKVVQATMVRDRQHGPVVELNRIQVKRSRSKGGLAGPDGTKSVFGQMCAKMSSFGPDSLLLPHRVWKVKFVGESVDDCGGGYSESIAEICEELQNGLTPLLIVTPNGRDESGANRDCYLLSPAARAPVHSSMFRFLGVLLGIAIRTGSPLSLNLAEPVWKQLAGMSLTIADLSEVDKDFIPGLMYIRDNEATSEEFEAMSLPFTVPSASGQDIQLSSKHTHITLDNRAEYVRLAINYRLHEFDEQVAAVREGMARVVPVPLLSLFTGYELETMVCGSPDIPLHLLKSVATYKGIEPSASLIQWFWEVMESFSNTERSLFLRFVWGRTRLPRTIADFRGRDFVIQVLDKYNPPDHFLPESYTCFFLLKLPRYSCKQVLEEKLKYAIHFCKSIDTDDYARIALTGEPAADDSSDDSDNEDVDSFASDSTQDYLTGH
+>sp|Q6ZN18|AEBP2_HUMAN Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2 PE=1 SV=2
+MAAAITDMADLEELSRLSPLPPGSPGSAARGRAEPPEEEEEEEEEEEEAEAEAVAALLLNGGSGGGGGGGGGGVGGGEAETMSEPSPESASQAGEDEDEEEDDEEEEDESSSSGGGEEESSAESLVGSSGGSSSDETRSLSPGAASSSSGDGDGKEGLEEPKGPRGSQGGGGGGSSSSSVVSSGGDEGYGTGGGGSSATSGGRRGSLEMSSDGEPLSRMDSEDSISSTIMDVDSTISSGRSTPAMMNGQGSTTSSSKNIAYNCCWDQCQACFNSSPDLADHIRSIHVDGQRGGVFVCLWKGCKVYNTPSTSQSWLQRHMLTHSGDKPFKCVVGGCNASFASQGGLARHVPTHFSQQNSSKVSSQPKAKEESPSKAGMNKRRKLKNKRRRSLPRPHDFFDAQTLDAIRHRAICFNLSAHIESLGKGHSVVFHSTVIAKRKEDSGKIKLLLHWMPEDILPDVWVNESERHQLKTKVVHLSKLPKDTALLLDPNIYRTMPQKRLKRTLIRKVFNLYLSKQ
+>sp|Q6ZN18-2|AEBP2_HUMAN Isoform 2 of Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2
+MAAAITDMADLEELSRLSPLPPGSPGSAARGRAEPPEEEEEEEEEEEEAEAEAVAALLLNGGSGGGGGGGGGGVGGGEAETMSEPSPESASQAGEDEDEEEDDEEEEDESSSSGGGEEESSAESLVGSSGGSSSDETRSLSPGAASSSSGDGDGKEGLEEPKGPRGSQGGGGGGSSSSSVVSSGGDEGYGTGGGGSSATSGGRRGSLEMSSDGEPLSRMDSEDSISSTIMDVDSTISSGRSTPAMMNGQGSTTSSSKNIAYNCCWDQCQACFNSSPDLADHIRSIHVDGQRGGVFVCLWKGCKVYNTPSTSQSWLQRHMLTHSGDKPFKCVVGGCNASFASQGGLARHVPTHFSQQNSSKVSSQPKAKEESPSKAGMNKRRKLKNKRRRSLPRPHDFFDAQTLDAIRHRAICFNLSAHIESLGKGHSVVFHSTVIAKRKEDSGKIKLLLHWMPEDILPDVWVNESERHQLKTKVVHLSKLPKDTALLLDPNIYRTMPQKRLKR
+>sp|Q6ZN18-3|AEBP2_HUMAN Isoform 3 of Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2
+MYTRRYSSISSTIMDVDSTISSGRSTPAMMNGQGSTTSSSKNIAYNCCWDQCQACFNSSPDLADHIRSIHVDGQRGGVFVCLWKGCKVYNTPSTSQSWLQRHMLTHSGDKPFKCVVGGCNASFASQGGLARHVPTHFSQQNSSKVSSQPKAKEESPSKAGMNKRRKLKNKRRRSLPRPHDFFDAQTLDAIRHRAICFNLSAHIESLGKGHSVVFHSTVIAKRKEDSGKIKLLLHWMPEDILPDVWVNESERHQLKTKVVHLSKLPKDTALLLDPNIYRTMPQKRLKRTLIRKVFNLYLSKQ
+>sp|O15083|ERC2_HUMAN ERC protein 2 OS=Homo sapiens OX=9606 GN=ERC2 PE=1 SV=3
+MYGSARTITNLEGSPSRSPRLPRSPRLGHRRTSSGGGGGTGKTLSMENIQSLNAAYATSGPMYLSDHEGVASTTYPKGTMTLGRATNRAVYGGRVTAMGSSPNIASAGLSHTDVLSYTDQHGGLTGSSHHHHHQVPSMLRQVRDSTMLDLQAQLKELQRENDLLRKELDIKDSKLGSSMNSIKTFWSPELKKERVLRKEEAARMSVLKEQMRVSHEENQHLQLTIQALQDELRTQRDLNHLLQQESGNRGAEHFTIELTEENFRRLQAEHDRQAKELFLLRKTLEEMELRIETQKQTLNARDESIKKLLEMLQSKGLPSKSLEDDNERTRRMAEAESQVSHLEVILDQKEKENIHLREELHRRSQLQPEPAKTKALQTVIEMKDTKIASLERNIRDLEDEIQMLKANGVLNTEDREEEIKQIEVYKSHSKFMKTKIDQLKQELSKKESELLALQTKLETLSNQNSDCKQHIEVLKESLTAKEQRAAILQTEVDALRLRLEEKESFLNKKTKQLQDLTEEKGTLAGEIRDMKDMLEVKERKINVLQKKIENLQEQLRDKDKQLTNLKDRVKSLQTDSSNTDTALATLEEALSEKERIIERLKEQRERDDRERLEEIESFRKENKDLKEKVNALQAELTEKESSLIDLKEHASSLASAGLKRDSKLKSLEIAIEQKKEECSKLEAQLKKAHNIEDDSRMNPEFADQIKQLDKEASYYRDECGKAQAEVDRLLEILKEVENEKNDKDKKIAELESLTLRHMKDQNKKVANLKHNQQLEKKKNAQLLEEVRRREDSMADNSQHLQIEELMNALEKTRQELDATKARLASTQQSLAEKEAHLANLRIERRKQLEEILEMKQEALLAAISEKDANIALLELSASKKKKTQEEVMALKREKDRLVHQLKQQTQNRMKLMADNYDDDHHHYHHHHHHHHHRSPGRSQHSNHRPSPDQDDEEGIWA
+>sp|P23763|VAMP1_HUMAN_Vesicle-associated membrane protein 1 OS=Homo sapiens OX=9606 GN=VAMP1 PE=1 SV=1
+MSAPAQPPAEGTEGTAPGGGPPGPPPNMTSNRRLQQTQAQVEEVVDIIRVNVDKVLERDQKLSELDDRADALQAGASQFESSAAKLKRKYWWKNCKMMIMLGAICAIIVVVIVIYFFT
+>sp|P23763-3|VAMP1_HUMAN_Isoform 2 of Vesicle-associated membrane protein 1 OS=Homo sapiens OX=9606 GN=VAMP1
+MSAPAQPPAEGTEGTAPGGGPPGPPPNMTSNRRLQQTQAQVEEVVDIIRVNVDKVLERDQKLSELDDRADALQAGASQFESSAAKLKRKYWWKNCKMMIMLGAICAIIVVVIVSKYR
+>sp|P23763-2|VAMP1_HUMAN_Isoform 3 of Vesicle-associated membrane protein 1 OS=Homo sapiens OX=9606 GN=VAMP1
+MSAPAQPPAEGTEGTAPGGGPPGPPPNMTSNRRLQQTQAQVEEVVDIIRVNVDKVLERDQKLSELDDRADALQAGASQFESSAAKLKRKYWWKNCKMMIMLGAICAIIVVVIVRRD
+>sp|Q15836|VAMP3_HUMAN_Vesicle-associated membrane protein 3 OS=Homo sapiens OX=9606 GN=VAMP3 PE=1 SV=3
+MSTGPTAATGSNRRLQQTQNQVDEVVDIMRVNVDKVLERDQKLSELDDRADALQAGASQFETSAAKLKRKYWWKNCKMWAIGITVLVIFIIIIIVWVVSS
+>sp|P63027|VAMP2_HUMAN_Vesicle-associated membrane protein 2 OS=Homo sapiens OX=9606 GN=VAMP2 PE=1 SV=3
+MSATAATAPPAAPAGEGGPPAPPPNLTSNRRLQQTQAQVDEVVDIMRVNVDKVLERDQKLSELDDRADALQAGASQFETSAAKLKRKYWWKNLKMMIILGVICAIILIIIIVYFST
+>sp|O75379|VAMP4_HUMAN_Vesicle-associated membrane protein 4 OS=Homo sapiens OX=9606 GN=VAMP4 PE=1 SV=2
+MPPKFKRHLNDDDVTGSVKSERRNLLEDDSDEEEDFFLRGPSGPRFGPRNDKIKHVQNQVDEVIDVMQENITKVIERGERLDELQDKSESLSDNATAFSNRSKQLRRQMWWRGCKIKAIMALVAAILLLVIIILIVMKYRT
+>sp|O75379-2|VAMP4_HUMAN_Isoform 2 of Vesicle-associated membrane protein 4 OS=Homo sapiens OX=9606 GN=VAMP4
+MPPKFKRHLNDDDVTGSVKSERRNLLEDDSDEEEDFFLGPSGPRFGPRNDKIKHVQNQVDEVIDVMQENITKVIERGERLDELQDKSESLSDNATAFSNRSKQLRRQMWWRGCKIKAIMALVAAILLLVIIILIVMKYRT
+>sp|O95183|VAMP5_HUMAN_Vesicle-associated membrane protein 5 OS=Homo sapiens OX=9606 GN=VAMP5 PE=1 SV=1
+MAGIELERCQQQANEVTEIMRNNFGKVLERGVKLAELQQRSDQLLDMSSTFNKTTQNLAQKKCWENIRYRICVGLVVVGVLLIILIVLLVVFLPQSSDSSSAPRTQDAGIASGPGN
+>sp|P51809|VAMP7_HUMAN_Vesicle-associated membrane protein 7 OS=Homo sapiens OX=9606 GN=VAMP7 PE=1 SV=3
+MAILFAVVARGTTILAKHAWCGGNFLEVTEQILAKIPSENNKLTYSHGNYLFHYICQDRIVYLCITDDDFERSRAFNFLNEIKKRFQTTYGSRAQTALPYAMNSEFSSVLAAQLKHHSENKGLDKVMETQAQVDELKGIMVRNIDLVAQRGERLELLIDKTENLVDSSVTFKTTSRNLARAMCMKNLKLTIIIIIVSIVFIYIIVSPLCGGFTWPSCVKK
+>sp|P51809-2|VAMP7_HUMAN_Isoform 2 of Vesicle-associated membrane protein 7 OS=Homo sapiens OX=9606 GN=VAMP7
+MAILFAVVARGTTILAKHAWCGGNFLEVTEQILAKIPSENNKLTYSHGNYLFHYICQDRIVYLCITDDDFERSRAFNFLNEIKKRFQTTYGSRAQTALPYAMNSEFSSVLAAQLKHHSENKGLDKVMETQAQVDELKGIMVRNIVCHLQNYQQKSCSSHVYEEPQAHYYHHHRINCVHLYHCFTSLWWIYMAKLCEEIGKKKLPLTKDMREQGVKSNPCDSSLSHTDRWYLPVSSTLFSLFKILFHASRFIFVLSTSLFL
+>sp|P51809-3|VAMP7_HUMAN_Isoform 3 of Vesicle-associated membrane protein 7 OS=Homo sapiens OX=9606 GN=VAMP7
+MAILFAVVARGTTILAKHAWCGGNFLEDFERSRAFNFLNEIKKRFQTTYGSRAQTALPYAMNSEFSSVLAAQLKHHSENKGLDKVMETQAQVDELKGIMVRNIDLVAQRGERLELLIDKTENLVDSSVTFKTTSRNLARAMCMKNLKLTIIIIIVSIVFIYIIVSPLCGGFTWPSCVKK
+>sp|Q9BV40|VAMP8_HUMAN_Vesicle-associated membrane protein 8 OS=Homo sapiens OX=9606 GN=VAMP8 PE=1 SV=1
+MEEASEGGGNDRVRNLQSEVEGVKNIMTQNVERILARGENLEHLRNKTEDLEATSEHFKTTSQKVARKFWWKNVKMIVLICVIVFIIILFIVLFATGAFS
+>sp|P54764|EPHA4_HUMAN Ephrin type-A receptor 4 OS=Homo sapiens OX=9606 GN=EPHA4 PE=1 SV=1
+MAGIFYFALFSCLFGICDAVTGSRVYPANEVTLLDSRSVQGELGWIASPLEGGWEEVSIMDEKNTPIRTYQVCNVMEPSQNNWLRTDWITREGAQRVYIEIKFTLRDCNSLPGVMGTCKETFNLYYYESDNDKERFIRENQFVKIDTIAADESFTQVDIGDRIMKLNTEIRDVGPLSKKGFYLAFQDVGACIALVSVRVFYKKCPLTVRNLAQFPDTITGADTSSLVEVRGSCVNNSEEKDVPKMYCGADGEWLVPIGNCLCNAGHEERSGECQACKIGYYKALSTDATCAKCPPHSYSVWEGATSCTCDRGFFRADNDAASMPCTRPPSAPLNLISNVNETSVNLEWSSPQNTGGRQDISYNVVCKKCGAGDPSKCRPCGSGVHYTPQQNGLKTTKVSITDLLAHTNYTFEIWAVNGVSKYNPNPDQSVSVTVTTNQAAPSSIALVQAKEVTRYSVALAWLEPDRPNGVILEYEVKYYEKDQNERSYRIVRTAARNTDIKGLNPLTSYVFHVRARTAAGYGDFSEPLEVTTNTVPSRIIGDGANSTVLLVSVSGSVVLVVILIAAFVISRRRSKYSKAKQEADEEKHLNQGVRTYVDPFTYEDPNQAVREFAKEIDASCIKIEKVIGVGEFGEVCSGRLKVPGKREICVAIKTLKAGYTDKQRRDFLSEASIMGQFDHPNIIHLEGVVTKCKPVMIITEYMENGSLDAFLRKNDGRFTVIQLVGMLRGIGSGMKYLSDMSYVHRDLAARNILVNSNLVCKVSDFGMSRVLEDDPEAAYTTRGGKIPIRWTAPEAIAYRKFTSASDVWSYGIVMWEVMSYGERPYWDMSNQDVIKAIEEGYRLPPPMDCPIALHQLMLDCWQKERSDRPKFGQIVNMLDKLIRNPNSLKRTGTESSRPNTALLDPSSPEFSAVVSVGDWLQAIKMDRYKDNFTAAGYTTLEAVVHVNQEDLARIGITAITHQNKILSSVQAMRTQMQQMHGRMVPV
+>sp|P54764-2|EPHA4_HUMAN Isoform 2 of Ephrin type-A receptor 4 OS=Homo sapiens OX=9606 GN=EPHA4
+MKWEEVSIMDEKNTPIRTYQVCNVMEPSQNNWLRTDWITREGAQRVYIEIKFTLRDCNSLPGVMGTCKETFNLYYYESDNDKERFIRENQFVKIDTIAADESFTQVDIGDRIMKLNTEIRDVGPLSKKGFYLAFQDVGACIALVSVRVFYKKCPLTVRNLAQFPDTITGADTSSLVEVRGSCVNNSEEKDVPKMYCGADGEWLVPIGNCLCNAGHEERSGECQACKIGYYKALSTDATCAKCPPHSYSVWEGATSCTCDRGFFRADNDAASMPCTRPPSAPLNLISNVNETSVNLEWSSPQNTGGRQDISYNVVCKKCGAGDPSKCRPCGSGVHYTPQQNGLKTTKVSITDLLAHTNYTFEIWAVNGVSKYNPNPDQSVSVTVTTNQAAPSSIALVQAKEVTRYSVALAWLEPDRPNGVILEYEVKYYEKDQNERSYRIVRTAARNTDIKGLNPLTSYVFHVRARTAAGYGDFSEPLEVTTNTVPSRIIGDGANSTVLLVSVSGSVVLVVILIAAFVISRRRSKYSKAKQEADEEKHLNQGVRTYVDPFTYEDPNQAVREFAKEIDASCIKIEKVIGVGEFGEVCSGRLKVPGKREICVAIKTLKAGYTDKQRRDFLSEASIMGQFDHPNIIHLEGVVTKCKPVMIITEYMENGSLDAFLRKNDGRFTVIQLVGMLRGIGSGMKYLSDMSYVHRDLAARNILVNSNLVCKVSDFGMSRVLEDDPEAAYTTRGGKIPIRWTAPEAIAYRKFTSASDVWSYGIVMWEVMSYGERPYWDMSNQDVIKAIEEGYRLPPPMDCPIALHQLMLDCWQKERSDRPKFGQIVNMLDKLIRNPNSLKRTGTESSRPNTALLDPSSPEFSAVVSVGDWLQAIKMDRYKDNFTAAGYTTLEAVVHVNQEDLARIGITAITHQNKILSSVQAMRTQMQQMHGRMVPV
diff -r 000000000000 -r ba62d93a9ef5 workflow/ppenrich_suite_wf.ga
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/workflow/ppenrich_suite_wf.ga	Mon Jul 11 19:21:19 2022 +0000
@@ -0,0 +1,904 @@
+{
+    "a_galaxy_workflow": "true",
+    "annotation": "phosphoproteomic enrichment data pre-processing and ANOVA",
+    "creator": [
+        {
+            "class": "Person",
+            "identifier": "0000-0002-2882-0508",
+            "name": "Art Eschenlauer"
+        }
+    ],
+    "format-version": "0.1",
+    "license": "MIT",
+    "name": "ppenrich_suite_wf",
+    "steps": {
+        "0": {
+            "annotation": "The Phospho (STY)Sites.txt file produced by MaxQuant (found in the txt folder).",
+            "content_id": null,
+            "errors": null,
+            "id": 0,
+            "input_connections": {},
+            "inputs": [
+                {
+                    "description": "The Phospho (STY)Sites.txt file produced by MaxQuant (found in the txt folder).",
+                    "name": "Phospho (STY)Sites.txt"
+                }
+            ],
+            "label": "Phospho (STY)Sites.txt",
+            "name": "Input dataset",
+            "outputs": [],
+            "position": {
+                "bottom": 290.16561126708984,
+                "height": 82.1624984741211,
+                "left": 515.090576171875,
+                "right": 715.0874328613281,
+                "top": 208.00311279296875,
+                "width": 199.99685668945312,
+                "x": 515.090576171875,
+                "y": 208.00311279296875
+            },
+            "tool_id": null,
+            "tool_state": "{\"optional\": false, \"format\": [\"tabular\"], \"tag\": \"\"}",
+            "tool_version": null,
+            "type": "data_input",
+            "uuid": "c366566c-2a61-4918-b4ea-c1f565c4f2ca",
+            "workflow_outputs": []
+        },
+        "1": {
+            "annotation": "THIS IS pST BY DEFAULT.  Change if your data are enriched for pY.",
+            "content_id": null,
+            "errors": null,
+            "id": 1,
+            "input_connections": {},
+            "inputs": [
+                {
+                    "description": "THIS IS pST BY DEFAULT.  Change if your data are enriched for pY.",
+                    "name": "enrichmentType"
+                }
+            ],
+            "label": "enrichmentType",
+            "name": "Input parameter",
+            "outputs": [],
+            "position": {
+                "bottom": 375.7687225341797,
+                "height": 61.76249694824219,
+                "left": 531.1312255859375,
+                "right": 731.1280822753906,
+                "top": 314.0062255859375,
+                "width": 199.99685668945312,
+                "x": 531.1312255859375,
+                "y": 314.0062255859375
+            },
+            "tool_id": null,
+            "tool_state": "{\"restrictions\": [\"pST\", \"pY\"], \"parameter_type\": \"text\", \"optional\": false}",
+            "tool_version": null,
+            "type": "parameter_input",
+            "uuid": "5f31b776-9e2b-4f3a-a9e6-886ac2062e15",
+            "workflow_outputs": [
+                {
+                    "label": null,
+                    "output_name": "output",
+                    "uuid": "1ff7eb95-9dd3-4006-ab0b-03e4f84a1aa5"
+                }
+            ]
+        },
+        "2": {
+            "annotation": "Pattern matching columns that have peptide intensity data (PERL-compatible regular expression matching column label)",
+            "content_id": null,
+            "errors": null,
+            "id": 2,
+            "input_connections": {},
+            "inputs": [
+                {
+                    "description": "Pattern matching columns that have peptide intensity data (PERL-compatible regular expression matching column label)",
+                    "name": "Intensity-column pattern"
+                }
+            ],
+            "label": "Intensity-column pattern",
+            "name": "Input parameter",
+            "outputs": [],
+            "position": {
+                "bottom": 576.2812118530273,
+                "height": 102.56249237060547,
+                "left": 590.1468505859375,
+                "right": 790.1437072753906,
+                "top": 473.7187194824219,
+                "width": 199.99685668945312,
+                "x": 590.1468505859375,
+                "y": 473.7187194824219
+            },
+            "tool_id": null,
+            "tool_state": "{\"default\": \"^Intensity[^_]\", \"parameter_type\": \"text\", \"optional\": true}",
+            "tool_version": null,
+            "type": "parameter_input",
+            "uuid": "86505e43-20be-40f5-ad66-eeb3527c6a60",
+            "workflow_outputs": [
+                {
+                    "label": null,
+                    "output_name": "output",
+                    "uuid": "ebb65015-b681-4798-9504-c8c948f82fee"
+                }
+            ]
+        },
+        "3": {
+            "annotation": "Pattern extracting sample-names from names of columns that have peptide intensity data (PERL-compatible regular expression)",
+            "content_id": null,
+            "errors": null,
+            "id": 3,
+            "input_connections": {},
+            "inputs": [
+                {
+                    "description": "Pattern extracting sample-names from names of columns that have peptide intensity data (PERL-compatible regular expression)",
+                    "name": "Sample-extraction pattern"
+                }
+            ],
+            "label": "Sample-extraction pattern",
+            "name": "Input parameter",
+            "outputs": [],
+            "position": {
+                "bottom": 688.256217956543,
+                "height": 102.56249237060547,
+                "left": 606.2249755859375,
+                "right": 806.2218322753906,
+                "top": 585.6937255859375,
+                "width": 199.99685668945312,
+                "x": 606.2249755859375,
+                "y": 585.6937255859375
+            },
+            "tool_id": null,
+            "tool_state": "{\"default\": \"\\\\.\\\\d+[A-Z]$\", \"parameter_type\": \"text\", \"optional\": true}",
+            "tool_version": null,
+            "type": "parameter_input",
+            "uuid": "79f4b36c-dd9b-4d24-a9c8-e0084af50597",
+            "workflow_outputs": [
+                {
+                    "label": null,
+                    "output_name": "output",
+                    "uuid": "3f5f7c91-dc90-4e14-84d9-94db5e49a625"
+                }
+            ]
+        },
+        "4": {
+            "annotation": "Pattern extracting sample-group from the sample-names that are extracted by 'Sample-extraction pattern' (PERL-compatible regular expression)",
+            "content_id": null,
+            "errors": null,
+            "id": 4,
+            "input_connections": {},
+            "inputs": [
+                {
+                    "description": "Pattern extracting sample-group from the sample-names that are extracted by 'Sample-extraction pattern' (PERL-compatible regular expression)",
+                    "name": "Group-extraction pattern"
+                }
+            ],
+            "label": "Group-extraction pattern",
+            "name": "Input parameter",
+            "outputs": [],
+            "position": {
+                "bottom": 804.2999801635742,
+                "height": 102.56249237060547,
+                "left": 610.2562255859375,
+                "right": 810.2530822753906,
+                "top": 701.7374877929688,
+                "width": 199.99685668945312,
+                "x": 610.2562255859375,
+                "y": 701.7374877929688
+            },
+            "tool_id": null,
+            "tool_state": "{\"default\": \"\\\\d+\", \"parameter_type\": \"text\", \"optional\": true}",
+            "tool_version": null,
+            "type": "parameter_input",
+            "uuid": "67f4321c-9b08-4dd2-b448-813f6fdb1b6a",
+            "workflow_outputs": [
+                {
+                    "label": null,
+                    "output_name": "output",
+                    "uuid": "4abd7c2f-9614-4b08-8ea1-8c5c19d69b7c"
+                }
+            ]
+        },
+        "5": {
+            "annotation": "FASTA file of all human canonical isoforms, derived from Swiss-Prot (e.g., merge of https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot_varsplic.fasta.gz and https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz)",
+            "content_id": null,
+            "errors": null,
+            "id": 5,
+            "input_connections": {},
+            "inputs": [
+                {
+                    "description": "FASTA file of all human canonical isoforms, derived from Swiss-Prot (e.g., merge of https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot_varsplic.fasta.gz and https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz)",
+                    "name": "SwissProt_Human_Canonical_Isoform.fasta"
+                }
+            ],
+            "label": "SwissProt_Human_Canonical_Isoform.fasta",
+            "name": "Input dataset",
+            "outputs": [],
+            "position": {
+                "bottom": 1096.5749435424805,
+                "height": 102.56249237060547,
+                "left": 639.121826171875,
+                "right": 839.1186828613281,
+                "top": 994.012451171875,
+                "width": 199.99685668945312,
+                "x": 639.121826171875,
+                "y": 994.012451171875
+            },
+            "tool_id": null,
+            "tool_state": "{\"optional\": false, \"format\": [\"fasta\"], \"tag\": \"\"}",
+            "tool_version": null,
+            "type": "data_input",
+            "uuid": "870d3075-3ebb-4505-99a2-c3d01b51a86b",
+            "workflow_outputs": []
+        },
+        "6": {
+            "annotation": "Derived from https://networkin.info/download/networkin_human_predictions_3.1.tsv.xz (which is free for non-commercial use - for required citation, see https://networkin.info/)",
+            "content_id": null,
+            "errors": null,
+            "id": 6,
+            "input_connections": {},
+            "inputs": [
+                {
+                    "description": "Derived from https://networkin.info/download/networkin_human_predictions_3.1.tsv.xz (which is free for non-commercial use - for required citation, see https://networkin.info/)",
+                    "name": "NetworKIN_cutoffscore2.0.tabular"
+                }
+            ],
+            "label": "NetworKIN_cutoffscore2.0.tabular",
+            "name": "Input dataset",
+            "outputs": [],
+            "position": {
+                "bottom": 1227.581169128418,
+                "height": 102.56249237060547,
+                "left": 656.1561889648438,
+                "right": 856.1530456542969,
+                "top": 1125.0186767578125,
+                "width": 199.99685668945312,
+                "x": 656.1561889648438,
+                "y": 1125.0186767578125
+            },
+            "tool_id": null,
+            "tool_state": "{\"optional\": false, \"format\": [\"tabular\"], \"tag\": \"\"}",
+            "tool_version": null,
+            "type": "data_input",
+            "uuid": "0ecd2f07-9b2c-41c5-8bcf-fa45927f61ca",
+            "workflow_outputs": []
+        },
+        "7": {
+            "annotation": "Derived from http://hprd.org/serine_motifs, http://hprd.org/tyrosine_motifs, and http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx",
+            "content_id": null,
+            "errors": null,
+            "id": 7,
+            "input_connections": {},
+            "inputs": [
+                {
+                    "description": "Derived from http://hprd.org/serine_motifs, http://hprd.org/tyrosine_motifs, and http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx",
+                    "name": "pSTY_Motifs.tabular"
+                }
+            ],
+            "label": "pSTY_Motifs.tabular",
+            "name": "Input dataset",
+            "outputs": [],
+            "position": {
+                "bottom": 1336.2092514038086,
+                "height": 82.1624984741211,
+                "left": 673.1718139648438,
+                "right": 873.1686706542969,
+                "top": 1254.0467529296875,
+                "width": 199.99685668945312,
+                "x": 673.1718139648438,
+                "y": 1254.0467529296875
+            },
+            "tool_id": null,
+            "tool_state": "{\"optional\": false, \"format\": [\"tabular\"], \"tag\": \"\"}",
+            "tool_version": null,
+            "type": "data_input",
+            "uuid": "d8f605d8-4cf6-48dc-9ec5-ceda9f6ee4b2",
+            "workflow_outputs": []
+        },
+        "8": {
+            "annotation": "Derived from Kinase_Substrate_Dataset.gz found at https://www.phosphosite.org/staticDownloads (free for non-commercial use - see that link for citation.)",
+            "content_id": null,
+            "errors": null,
+            "id": 8,
+            "input_connections": {},
+            "inputs": [
+                {
+                    "description": "Derived from Kinase_Substrate_Dataset.gz found at https://www.phosphosite.org/staticDownloads (free for non-commercial use - see that link for citation.)",
+                    "name": "PSP_Kinase_Substrate_Dataset.tabular"
+                }
+            ],
+            "label": "PSP_Kinase_Substrate_Dataset.tabular",
+            "name": "Input dataset",
+            "outputs": [],
+            "position": {
+                "bottom": 1466.596794128418,
+                "height": 102.56249237060547,
+                "left": 673.1718139648438,
+                "right": 873.1686706542969,
+                "top": 1364.0343017578125,
+                "width": 199.99685668945312,
+                "x": 673.1718139648438,
+                "y": 1364.0343017578125
+            },
+            "tool_id": null,
+            "tool_state": "{\"optional\": false, \"format\": [\"tabular\"], \"tag\": \"\"}",
+            "tool_version": null,
+            "type": "data_input",
+            "uuid": "ed06b46c-d6b3-4d52-a6e6-fa5211da5a0a",
+            "workflow_outputs": []
+        },
+        "9": {
+            "annotation": "Derived from Regulatory_sites.gz found at https://www.phosphosite.org/staticDownloads (free for non-commercial use - see that link for citation.)",
+            "content_id": null,
+            "errors": null,
+            "id": 9,
+            "input_connections": {},
+            "inputs": [
+                {
+                    "description": "Derived from Regulatory_sites.gz found at https://www.phosphosite.org/staticDownloads (free for non-commercial use - see that link for citation.)",
+                    "name": "PSP_Regulatory_sites.tabular"
+                }
+            ],
+            "label": "PSP_Regulatory_sites.tabular",
+            "name": "Input dataset",
+            "outputs": [],
+            "position": {
+                "bottom": 1576.2092514038086,
+                "height": 82.1624984741211,
+                "left": 674.1561889648438,
+                "right": 874.1530456542969,
+                "top": 1494.0467529296875,
+                "width": 199.99685668945312,
+                "x": 674.1561889648438,
+                "y": 1494.0467529296875
+            },
+            "tool_id": null,
+            "tool_state": "{\"optional\": false, \"format\": [\"tabular\"], \"tag\": \"\"}",
+            "tool_version": null,
+            "type": "data_input",
+            "uuid": "47cf1ca8-315d-425f-bb32-0946cd866d5f",
+            "workflow_outputs": []
+        },
+        "10": {
+            "annotation": "List of alpha cutoff values for significance testing; text file having no header and a single line for each cutoff value.",
+            "content_id": null,
+            "errors": null,
+            "id": 10,
+            "input_connections": {},
+            "inputs": [
+                {
+                    "description": "List of alpha cutoff values for significance testing; text file having no header and a single line for each cutoff value.",
+                    "name": "alpha_levels.tabular"
+                }
+            ],
+            "label": "alpha_levels.tabular",
+            "name": "Input dataset",
+            "outputs": [],
+            "position": {
+                "bottom": 1835.699851989746,
+                "height": 82.1624984741211,
+                "left": 691.1249389648438,
+                "right": 891.1217956542969,
+                "top": 1753.537353515625,
+                "width": 199.99685668945312,
+                "x": 691.1249389648438,
+                "y": 1753.537353515625
+            },
+            "tool_id": null,
+            "tool_state": "{\"optional\": false, \"format\": [\"tabular\"], \"tag\": \"\"}",
+            "tool_version": null,
+            "type": "data_input",
+            "uuid": "5d66ff58-9c83-4edd-96c6-6132dc8377c7",
+            "workflow_outputs": []
+        },
+        "11": {
+            "annotation": "Transform the output of MaxQuant for phosphoproteome-enriched samples to prepare it for statistical analysis.",
+            "content_id": "mqppep_preproc",
+            "errors": null,
+            "id": 11,
+            "input_connections": {
+                "networkin": {
+                    "id": 6,
+                    "output_name": "output"
+                },
+                "p_sty_motifs": {
+                    "id": 7,
+                    "output_name": "output"
+                },
+                "phosphoSites": {
+                    "id": 0,
+                    "output_name": "output"
+                },
+                "protein_fasta": {
+                    "id": 5,
+                    "output_name": "output"
+                },
+                "psp_kinase_substrate": {
+                    "id": 8,
+                    "output_name": "output"
+                },
+                "psp_regulatory_sites": {
+                    "id": 9,
+                    "output_name": "output"
+                },
+                "pst_py_selector": {
+                    "id": 1,
+                    "output_name": "output"
+                },
+                "startCol": {
+                    "id": 2,
+                    "output_name": "output"
+                }
+            },
+            "inputs": [
+                {
+                    "description": "runtime parameter for tool MaxQuant Phosphopeptide Preprocessing",
+                    "name": "networkin"
+                },
+                {
+                    "description": "runtime parameter for tool MaxQuant Phosphopeptide Preprocessing",
+                    "name": "p_sty_motifs"
+                },
+                {
+                    "description": "runtime parameter for tool MaxQuant Phosphopeptide Preprocessing",
+                    "name": "phosphoSites"
+                },
+                {
+                    "description": "runtime parameter for tool MaxQuant Phosphopeptide Preprocessing",
+                    "name": "protein_fasta"
+                },
+                {
+                    "description": "runtime parameter for tool MaxQuant Phosphopeptide Preprocessing",
+                    "name": "psp_kinase_substrate"
+                },
+                {
+                    "description": "runtime parameter for tool MaxQuant Phosphopeptide Preprocessing",
+                    "name": "psp_regulatory_sites"
+                }
+            ],
+            "label": "Preprocess MaxQuant Phospho (STY)Sites",
+            "name": "MaxQuant Phosphopeptide Preprocessing",
+            "outputs": [
+                {
+                    "name": "phosphoPepIntensities",
+                    "type": "tabular"
+                },
+                {
+                    "name": "enrichGraph",
+                    "type": "pdf"
+                },
+                {
+                    "name": "locProbCutoffGraph",
+                    "type": "pdf"
+                },
+                {
+                    "name": "enrichGraph_svg",
+                    "type": "svg"
+                },
+                {
+                    "name": "locProbCutoffGraph_svg",
+                    "type": "svg"
+                },
+                {
+                    "name": "filteredData_tabular",
+                    "type": "tabular"
+                },
+                {
+                    "name": "quantData_tabular",
+                    "type": "tabular"
+                },
+                {
+                    "name": "mapped_phophopeptides",
+                    "type": "tabular"
+                },
+                {
+                    "name": "melted_phophopeptide_map",
+                    "type": "tabular"
+                },
+                {
+                    "name": "mqppep_output_sqlite",
+                    "type": "sqlite"
+                },
+                {
+                    "name": "preproc_tab",
+                    "type": "tabular"
+                },
+                {
+                    "name": "preproc_csv",
+                    "type": "csv"
+                },
+                {
+                    "name": "preproc_sqlite",
+                    "type": "sqlite"
+                }
+            ],
+            "position": {
+                "bottom": 1652.2499389648438,
+                "height": 956.231201171875,
+                "left": 1336.60302734375,
+                "right": 1536.5998840332031,
+                "top": 696.0187377929688,
+                "width": 199.99685668945312,
+                "x": 1336.60302734375,
+                "y": 696.0187377929688
+            },
+            "post_job_actions": {
+                "HideDatasetActionfilteredData_tabular": {
+                    "action_arguments": {},
+                    "action_type": "HideDatasetAction",
+                    "output_name": "filteredData_tabular"
+                },
+                "HideDatasetActionmapped_phophopeptides": {
+                    "action_arguments": {},
+                    "action_type": "HideDatasetAction",
+                    "output_name": "mapped_phophopeptides"
+                },
+                "HideDatasetActionmelted_phophopeptide_map": {
+                    "action_arguments": {},
+                    "action_type": "HideDatasetAction",
+                    "output_name": "melted_phophopeptide_map"
+                },
+                "HideDatasetActionmqppep_output_sqlite": {
+                    "action_arguments": {},
+                    "action_type": "HideDatasetAction",
+                    "output_name": "mqppep_output_sqlite"
+                },
+                "HideDatasetActionpreproc_csv": {
+                    "action_arguments": {},
+                    "action_type": "HideDatasetAction",
+                    "output_name": "preproc_csv"
+                },
+                "HideDatasetActionquantData_tabular": {
+                    "action_arguments": {},
+                    "action_type": "HideDatasetAction",
+                    "output_name": "quantData_tabular"
+                },
+                "RenameDatasetActionenrichGraph": {
+                    "action_arguments": {
+                        "newname": "#{phosphoSites}.enrichGraph_pdf"
+                    },
+                    "action_type": "RenameDatasetAction",
+                    "output_name": "enrichGraph"
+                },
+                "RenameDatasetActionenrichGraph_svg": {
+                    "action_arguments": {
+                        "newname": "#{phosphoSites}.enrichGraph_svg"
+                    },
+                    "action_type": "RenameDatasetAction",
+                    "output_name": "enrichGraph_svg"
+                },
+                "RenameDatasetActionfilteredData_tabular": {
+                    "action_arguments": {
+                        "newname": "#{phosphoSites}.filteredData"
+                    },
+                    "action_type": "RenameDatasetAction",
+                    "output_name": "filteredData_tabular"
+                },
+                "RenameDatasetActionlocProbCutoffGraph": {
+                    "action_arguments": {
+                        "newname": "#{phosphoSites}.locProbCutoffGraph_pdf"
+                    },
+                    "action_type": "RenameDatasetAction",
+                    "output_name": "locProbCutoffGraph"
+                },
+                "RenameDatasetActionlocProbCutoffGraph_svg": {
+                    "action_arguments": {
+                        "newname": "#{phosphoSites}.locProbCutoffGraph_svg"
+                    },
+                    "action_type": "RenameDatasetAction",
+                    "output_name": "locProbCutoffGraph_svg"
+                },
+                "RenameDatasetActionmapped_phophopeptides": {
+                    "action_arguments": {
+                        "newname": "#{phosphoSites}.ppep_map"
+                    },
+                    "action_type": "RenameDatasetAction",
+                    "output_name": "mapped_phophopeptides"
+                },
+                "RenameDatasetActionmelted_phophopeptide_map": {
+                    "action_arguments": {
+                        "newname": "#{phosphoSites}.melted"
+                    },
+                    "action_type": "RenameDatasetAction",
+                    "output_name": "melted_phophopeptide_map"
+                },
+                "RenameDatasetActionmqppep_output_sqlite": {
+                    "action_arguments": {
+                        "newname": "#{phosphoSites}.ppep_mapping_sqlite"
+                    },
+                    "action_type": "RenameDatasetAction",
+                    "output_name": "mqppep_output_sqlite"
+                },
+                "RenameDatasetActionphosphoPepIntensities": {
+                    "action_arguments": {
+                        "newname": "#{phosphoSites}.ppep_intensities"
+                    },
+                    "action_type": "RenameDatasetAction",
+                    "output_name": "phosphoPepIntensities"
+                },
+                "RenameDatasetActionpreproc_csv": {
+                    "action_arguments": {
+                        "newname": "#{phosphoSites}.preproc_csv"
+                    },
+                    "action_type": "RenameDatasetAction",
+                    "output_name": "preproc_csv"
+                },
+                "RenameDatasetActionpreproc_sqlite": {
+                    "action_arguments": {
+                        "newname": "#{phosphoSites}.preproc_sqlite"
+                    },
+                    "action_type": "RenameDatasetAction",
+                    "output_name": "preproc_sqlite"
+                },
+                "RenameDatasetActionpreproc_tab": {
+                    "action_arguments": {
+                        "newname": "#{phosphoSites}.preproc_tab"
+                    },
+                    "action_type": "RenameDatasetAction",
+                    "output_name": "preproc_tab"
+                },
+                "RenameDatasetActionquantData_tabular": {
+                    "action_arguments": {
+                        "newname": "#{phosphoSites}.quantData"
+                    },
+                    "action_type": "RenameDatasetAction",
+                    "output_name": "quantData_tabular"
+                }
+            },
+            "tool_id": "mqppep_preproc",
+            "tool_state": "{\"collapseFunc\": \"sum\", \"intervalCol\": \"1\", \"localProbCutoff\": \"0.75\", \"merge_function\": \"sum\", \"networkin\": {\"__class__\": \"RuntimeValue\"}, \"p_sty_motifs\": {\"__class__\": \"RuntimeValue\"}, \"phosphoCol\": \"^Number of Phospho [(]STY[)]$\", \"phosphoSites\": {\"__class__\": \"RuntimeValue\"}, \"protein_fasta\": {\"__class__\": \"RuntimeValue\"}, \"psp_kinase_substrate\": {\"__class__\": \"RuntimeValue\"}, \"psp_regulatory_sites\": {\"__class__\": \"RuntimeValue\"}, \"pst_py_selector\": {\"__class__\": \"ConnectedValue\"}, \"species\": \"human\", \"startCol\": {\"__class__\": \"ConnectedValue\"}, \"__page__\": null, \"__rerun_remap_job_id__\": null}",
+            "tool_version": null,
+            "type": "tool",
+            "uuid": "83ae6038-e871-4051-9544-181cdf2a5257",
+            "workflow_outputs": [
+                {
+                    "label": "locProbCutoffGraph_pdf",
+                    "output_name": "locProbCutoffGraph",
+                    "uuid": "c3840695-4deb-4347-94c4-1d60f0de3744"
+                },
+                {
+                    "label": "enrichGraph_svg",
+                    "output_name": "enrichGraph_svg",
+                    "uuid": "dff65302-dc37-4812-9ab1-10178d880412"
+                },
+                {
+                    "label": "locProbCutoffGraph_svg",
+                    "output_name": "locProbCutoffGraph_svg",
+                    "uuid": "b48535ab-ee39-44c3-bc37-5f4e79a147ee"
+                },
+                {
+                    "label": "preproc_tab",
+                    "output_name": "preproc_tab",
+                    "uuid": "ce6d767a-b24d-404c-9eeb-fa8f5156fa93"
+                },
+                {
+                    "label": "preproc_sqlite",
+                    "output_name": "preproc_sqlite",
+                    "uuid": "52c88bda-4863-47e1-afb0-46839fb1b601"
+                },
+                {
+                    "label": "ppep_intensities",
+                    "output_name": "phosphoPepIntensities",
+                    "uuid": "b1729d3e-b934-4e7e-a38f-23d963df3c22"
+                },
+                {
+                    "label": "enrichGraph_pdf",
+                    "output_name": "enrichGraph",
+                    "uuid": "72f605a1-a8a7-4e9e-99e8-0c1360303fc0"
+                }
+            ]
+        },
+        "12": {
+            "annotation": "Perform ANOVA. For imputing missing values, use median of non-missing values from the same treatment group.",
+            "content_id": "mqppep_anova",
+            "errors": null,
+            "id": 12,
+            "input_connections": {
+                "alpha_file": {
+                    "id": 10,
+                    "output_name": "output"
+                },
+                "input_file": {
+                    "id": 11,
+                    "output_name": "preproc_tab"
+                },
+                "sample_grouping_regex": {
+                    "id": 4,
+                    "output_name": "output"
+                },
+                "sample_names_regex": {
+                    "id": 3,
+                    "output_name": "output"
+                }
+            },
+            "inputs": [
+                {
+                    "description": "runtime parameter for tool MaxQuant Phosphopeptide ANOVA",
+                    "name": "alpha_file"
+                },
+                {
+                    "description": "runtime parameter for tool MaxQuant Phosphopeptide ANOVA",
+                    "name": "input_file"
+                }
+            ],
+            "label": "ANOVA group-median imputed",
+            "name": "MaxQuant Phosphopeptide ANOVA",
+            "outputs": [
+                {
+                    "name": "imputed_data_file",
+                    "type": "tabular"
+                },
+                {
+                    "name": "imp_qn_lt_file",
+                    "type": "tabular"
+                },
+                {
+                    "name": "report_file",
+                    "type": "pdf"
+                }
+            ],
+            "position": {
+                "bottom": 2246.653045654297,
+                "height": 347.1187438964844,
+                "left": 1028.184326171875,
+                "right": 1228.1811828613281,
+                "top": 1899.5343017578125,
+                "width": 199.99685668945312,
+                "x": 1028.184326171875,
+                "y": 1899.5343017578125
+            },
+            "post_job_actions": {
+                "RenameDatasetActionimp_qn_lt_file": {
+                    "action_arguments": {
+                        "newname": "#{input_file}.intensities_group-mean-imputed_QN_LT"
+                    },
+                    "action_type": "RenameDatasetAction",
+                    "output_name": "imp_qn_lt_file"
+                },
+                "RenameDatasetActionimputed_data_file": {
+                    "action_arguments": {
+                        "newname": "#{input_file}.intensities_group-mean-imputed"
+                    },
+                    "action_type": "RenameDatasetAction",
+                    "output_name": "imputed_data_file"
+                },
+                "RenameDatasetActionreport_file": {
+                    "action_arguments": {
+                        "newname": "#{input_file}.intensities_group-mean-imputed_report"
+                    },
+                    "action_type": "RenameDatasetAction",
+                    "output_name": "report_file"
+                }
+            },
+            "tool_id": "mqppep_anova",
+            "tool_state": "{\"alpha_file\": {\"__class__\": \"RuntimeValue\"}, \"imputation\": {\"imputation_method\": \"group-median\", \"__current_case__\": 0}, \"input_file\": {\"__class__\": \"RuntimeValue\"}, \"intensity_column_regex\": \"^Intensity[^_]\", \"sample_grouping_regex\": {\"__class__\": \"ConnectedValue\"}, \"sample_names_regex\": {\"__class__\": \"ConnectedValue\"}, \"__page__\": null, \"__rerun_remap_job_id__\": null}",
+            "tool_version": null,
+            "type": "tool",
+            "uuid": "71cbf127-f9d1-4a10-be61-53daea7ff1f6",
+            "workflow_outputs": [
+                {
+                    "label": "intensities_group-mean-imputed_QN_LT",
+                    "output_name": "imp_qn_lt_file",
+                    "uuid": "3ad9495d-0b38-4527-a0bf-7b2c62eb9dc9"
+                },
+                {
+                    "label": "intensities_group-mean-imputed",
+                    "output_name": "imputed_data_file",
+                    "uuid": "933baff0-3c19-4363-822c-2bce5d436ac1"
+                },
+                {
+                    "label": "intensities_group-mean-imputed_report",
+                    "output_name": "report_file",
+                    "uuid": "792cacc0-e202-44e4-9048-9e1186ea5ba9"
+                }
+            ]
+        },
+        "13": {
+            "annotation": "Perform ANOVA. For imputing missing values, create random values.",
+            "content_id": "mqppep_anova",
+            "errors": null,
+            "id": 13,
+            "input_connections": {
+                "alpha_file": {
+                    "id": 10,
+                    "output_name": "output"
+                },
+                "input_file": {
+                    "id": 11,
+                    "output_name": "preproc_tab"
+                },
+                "sample_grouping_regex": {
+                    "id": 4,
+                    "output_name": "output"
+                },
+                "sample_names_regex": {
+                    "id": 3,
+                    "output_name": "output"
+                }
+            },
+            "inputs": [
+                {
+                    "description": "runtime parameter for tool MaxQuant Phosphopeptide ANOVA",
+                    "name": "alpha_file"
+                },
+                {
+                    "description": "runtime parameter for tool MaxQuant Phosphopeptide ANOVA",
+                    "name": "input_file"
+                }
+            ],
+            "label": "MaxQuant Phosphopeptide ANOVA randomly imputed",
+            "name": "MaxQuant Phosphopeptide ANOVA",
+            "outputs": [
+                {
+                    "name": "imputed_data_file",
+                    "type": "tabular"
+                },
+                {
+                    "name": "imp_qn_lt_file",
+                    "type": "tabular"
+                },
+                {
+                    "name": "report_file",
+                    "type": "pdf"
+                }
+            ],
+            "position": {
+                "bottom": 2106.0374145507812,
+                "height": 367.51873779296875,
+                "left": 1399.153076171875,
+                "right": 1599.1499328613281,
+                "top": 1738.5186767578125,
+                "width": 199.99685668945312,
+                "x": 1399.153076171875,
+                "y": 1738.5186767578125
+            },
+            "post_job_actions": {
+                "RenameDatasetActionimp_qn_lt_file": {
+                    "action_arguments": {
+                        "newname": "#{input_file}.intensities_randomly-imputed_QN_LT"
+                    },
+                    "action_type": "RenameDatasetAction",
+                    "output_name": "imp_qn_lt_file"
+                },
+                "RenameDatasetActionimputed_data_file": {
+                    "action_arguments": {
+                        "newname": "#{input_file}.intensities_randomly-imputed"
+                    },
+                    "action_type": "RenameDatasetAction",
+                    "output_name": "imputed_data_file"
+                },
+                "RenameDatasetActionreport_file": {
+                    "action_arguments": {
+                        "newname": "#{input_file}.intensities_randomly-imputed_report"
+                    },
+                    "action_type": "RenameDatasetAction",
+                    "output_name": "report_file"
+                }
+            },
+            "tool_id": "mqppep_anova",
+            "tool_state": "{\"alpha_file\": {\"__class__\": \"RuntimeValue\"}, \"imputation\": {\"imputation_method\": \"random\", \"__current_case__\": 3, \"meanPercentile\": \"1\", \"sdPercentile\": \"1.0\"}, \"input_file\": {\"__class__\": \"RuntimeValue\"}, \"intensity_column_regex\": \"^Intensity[^_]\", \"sample_grouping_regex\": {\"__class__\": \"ConnectedValue\"}, \"sample_names_regex\": {\"__class__\": \"ConnectedValue\"}, \"__page__\": null, \"__rerun_remap_job_id__\": null}",
+            "tool_version": null,
+            "type": "tool",
+            "uuid": "e71562a7-c941-429d-99a8-e14721df3670",
+            "workflow_outputs": [
+                {
+                    "label": "intensities_randomly-imputed",
+                    "output_name": "imputed_data_file",
+                    "uuid": "e27c540b-07d0-496f-8b11-b4c1472dce12"
+                },
+                {
+                    "label": "intensities_randomly-imputed_report",
+                    "output_name": "report_file",
+                    "uuid": "abe2dbf4-956d-4625-a0e1-ad1c6c988a7c"
+                },
+                {
+                    "label": "intensities_randomly-imputed_QN_LT",
+                    "output_name": "imp_qn_lt_file",
+                    "uuid": "cb5b1d8f-905b-453a-a479-507e01a8f8f7"
+                }
+            ]
+        }
+    },
+    "tags": [
+        "ppenrich"
+    ],
+    "uuid": "234db768-520c-4eaa-a5be-061e3d858682",
+    "version": 2
+}