Mercurial > repos > eschen42 > mqppep_anova

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/MaxQuantProcessingScript.R	Mon Mar 07 19:05:01 2022 +0000
@@ -0,0 +1,500 @@
+#!/usr/bin/env Rscript
+
+# This is the implementation for the
+#   "MaxQuant Phosphopeptide Localization Probability Cutoff"
+#   Galaxy tool (mqppep_lclztn_filter)
+# It is adapted from the MaxQuant Processing Script written by Larry Cheng.
+
+# libraries
+library(optparse)
+library(data.table)
+library(stringr)
+library(ggplot2)
+#library(PTXQC)
+#require(PTXQC)
+#require(methods)
+
+# title: "MaxQuant Processing Script"
+# author: "Larry Cheng"
+# date: "February 19, 2018"
+#
+# # MaxQuant Processing Script
+# Takes MaxQuant Phospho (STY)sites.txt file as input and performs the following (in order):
+# 1) Runs the Proteomics Quality Control software
+# 2) Remove contaminant and reverse sequence rows
+# 3) Filters rows based on localization probability
+# 4) Extract the quantitative data
+# 5) Sequences phosphopeptides
+# 6) Merges multiply phosphorylated peptides
+# 7) Filters out phosphopeptides based on enrichment
+# The output file contains the phosphopeptide (first column) and the quantitative values for each sample
+#
+# ## Revision History
+# Rev. 2022-02-10 :wrap for inclusion in Galaxy
+# Rev. 2018-02-19 :break up analysis script into "MaxQuant Processing Script" and "Phosphopeptide Processing Script"
+# Rev. 2017-12-12 :added PTXQC
+#                  added additional plots and table outputs for quality control
+#                  allowed for more than 2 samples to be grouped together (up to 26 (eg, 1A, 1B, 1C, etc)):
+#                  regexSampleNames <- "\\.(\\d+)[A-Z]$"
+#                  converted from .r to .rmd file to knit report for quality control
+# Rev. 2016-09-11 :automated the FDR cutoffs; removed the option to data impute multiple times
+# Rev. 2016-09-09 :added filter to eliminate contaminant and reverse sequence rows
+# Rev. 2016-09-01 :moved the collapse step from after ANOVA filter to prior to preANOVA file output
+# Rev. 2016-08-22 :changed regexpression to regexSampleNames <- "\\.(\\d+)[AB]$" so that it looks at the end of string
+# Rev. 2016-08-05 :Removed vestigial line (ppeptides <- ....)
+# Rev. 2016-07-03 :Removed row names from the write.table() output for ANOVA and PreANOVA
+# Rev. 2016-06-25 :Set default Localization Probability cutoff to 0.75
+# Rev. 2016-06-23 :fixed a bug in filtering for pY enrichment by resetting the row numbers afterwards
+# Rev. 2016-06-21 :test18 + standardized the regexpression in protocol
+
+
+### FUNCTION DECLARATIONS begin ----------------------------------------------
+
+# Read the first line of the file at `filepath`
+# adapted from: https://stackoverflow.com/a/35761217/15509512
+# - filepath: path to a readable text file
+# Returns a character vector holding the first line without its line
+#   terminator; the vector has length zero when the file is empty.
+readFirstLine <- function(filepath) {
+  con <- file(filepath, "r")
+  # release the connection even when readLines() signals an error
+  on.exit(close(con), add = TRUE)
+  readLines(con, n = 1)
+}
+
+# Reorder a dataframe so that the named columns come last
+# - data: the dataframe
+# - move: a vector of column names, each of which is an element of names(data)
+# Returns `data` with the columns not listed in `move` first (in their
+#   existing order) followed by the `move` columns (in the order given).
+movetolast <- function(data, move) {
+  stay_in_front <- setdiff(names(data), move)
+  new_order <- c(stay_in_front, move)
+  data[new_order]
+}
+
+# Build the phosphopeptide string for one row (meant to be used with `apply`)
+# - df: a row indexable by column name; only the
+#       "Phospho (STY) Score diffs" entry is read.  That entry is a peptide
+#       sequence in which a residue may be followed by a parenthesized score
+#       difference, e.g. "AAS(12.3)PT(-5.1)K".
+# Returns a length-one character vector: the bare peptide sequence with a "p"
+#   inserted before every residue whose score difference is greater than zero,
+#   e.g. "AApSPTK".
+phosphopeptide_func <- function(df) {
+
+  # characters that make up an embedded score difference
+  score_chars <- c("-", ".", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9")
+  # characters that are never copied into the peptide sequence
+  non_residue_chars <- c(" ", "(", ")", score_chars)
+
+  score_diff_chars <-
+    strsplit(as.character(df["Phospho (STY) Score diffs"]), "")[[1]]
+
+  phosphopeptide <- ""
+  n_residues <- 0 # number of residues copied so far
+  phosphopositions <- vector() # residue positions that are phosphorylated
+  score_diff <- "" # text of the score currently being read
+
+  # pass 1: separate the residues from the scores and note which residues
+  #   carry a positive score
+  for (ch in score_diff_chars) {
+    if (!any(ch == non_residue_chars)) {
+      phosphopeptide <- paste0(phosphopeptide, ch)
+      n_residues <- n_residues + 1
+    }
+    if (any(ch == score_chars)) {
+      score_diff <- paste0(score_diff, ch)
+    }
+    if (ch == ")") {
+      # a closing parenthesis ends the score of the most recent residue
+      score_diff <- as.numeric(score_diff)
+      #only consider a phosphoresidue if score_diff > 0
+      if (score_diff > 0) {
+        phosphopositions <- append(phosphopositions, n_residues)
+      }
+      score_diff <- ""
+    }
+  }
+
+  # pass 2: insert a "p" in front of each phosphorylated residue; every
+  #   earlier insertion has shifted the residue one place to the right
+  for (k in seq_along(phosphopositions)) {
+    site <- phosphopositions[k] + (k - 1)
+    phosphopeptide <- paste0(
+      substr(phosphopeptide, 0, site - 1),
+      "p",
+      substr(phosphopeptide, site, nchar(phosphopeptide))
+    )
+  }
+
+  return(phosphopeptide)
+}
+
+### FUNCTION DECLARATIONS end ------------------------------------------------
+
+
+### EXTRACT ARGUMENTS begin --------------------------------------------------
+
+# parse options
+# None of the options declares a default; the "intended value" notes below
+#   record what the caller is expected to supply.
+# NOTE: --phosphoCol and --startCol take the PATH of a file whose first line
+#   holds the regular expression (the script reads them with readFirstLine),
+#   not the regular expression itself.
+option_list <- list(
+  make_option(
+    c("-i", "--input"),
+    action = "store",
+    type = "character",
+    help = "A MaxQuant Phospho (STY)Sites.txt"
+  ),
+  make_option(
+    c("-o", "--output"),
+    action = "store",
+    type = "character",
+    help = "path to output file"
+  ),
+  make_option(
+    c("-E", "--enrichGraph"),
+    action = "store",
+    type = "character",
+    help = "path to enrichment graph PDF"
+  ),
+  make_option(
+    c("-F", "--enrichGraph_svg"),
+    action = "store",
+    type = "character",
+    help = "path to enrichment graph SVG"
+  ),
+  make_option(
+    c("-L", "--locProbCutoffGraph"),
+    action = "store",
+    type = "character",
+    help = "path to location-probability cutoff graph PDF"
+  ),
+  make_option(
+    c("-M", "--locProbCutoffGraph_svg"),
+    action = "store",
+    type = "character",
+    help = "path to location-probability cutoff graph SVG"
+  ),
+  make_option(
+    c("-e", "--enriched"),
+    action = "store",
+    type = "character",
+    help = "pY or pST enriched samples (ie, 'Y' or 'ST')"
+  ),
+  # intended value: file containing "^Number of Phospho [(]STY[)]$"
+  make_option(
+    c("-p", "--phosphoCol"),
+    action = "store",
+    type = "character",
+    help = "path to file whose first line is a PERL-compatible regular expression matching header of column having number of 'Phospho (STY)'"
+  ),
+  # intended value: file containing "^Intensity[^_]"
+  make_option(
+    c("-s", "--startCol"),
+    action = "store",
+    type = "character",
+    help = "path to file whose first line is a PERL-compatible regular expression matching column header having first sample intensity"
+  ),
+  # intended value: 1
+  make_option(
+    c("-I", "--intervalCol"),
+    action = "store",
+    type = "integer",
+    help = "Column interval between the Intensities of samples (eg, 1 if subsequent column; 2 if every other column)"
+  ),
+  # intended value: 0.75
+  make_option(
+    c("-l", "--localProbCutoff"),
+    action = "store",
+    type = "double",
+    help = "Localization Probability Cutoff"
+  ),
+  # intended value: "sum"
+  make_option(
+    c("-f", "--collapse_func"),
+    action = "store",
+    type = "character",
+    help = "merge identical phosphopeptides by ('sum' or 'average') the intensities"
+  ),
+  # intended value: "filteredData.txt"
+  make_option(
+    c("-r", "--filtered_data"),
+    action = "store",
+    type = "character",
+    help = "filteredData.txt"
+  ),
+  # intended value: "quantData.txt"
+  make_option(
+    c("-q", "--quant_data"),
+    action = "store",
+    type = "character",
+    help = "quantData.txt"
+  )
+)
+args <- parse_args(OptionParser(option_list = option_list))
+# Check parameter values
+
+### EXTRACT ARGUMENTS end ----------------------------------------------------
+
+
+### EXTRACT PARAMETERS from arguments begin ----------------------------------
+
+# Validate the input path before anything else reads it
+if (is.null(args$input) || !file.exists(args$input)) {
+  stop(paste("File", args$input, "does not exist"))
+}
+
+# Header-matching patterns.  Each option names a file whose first line is the
+#   pattern; the built-in pattern is used when the option is omitted.
+phosphoColPattern <- "^Number of Phospho [(][STY][STY]*[)]$"
+startColPattern <- "^Intensity[^_]"
+if (!is.null(args$phosphoCol)) {
+  phosphoColPattern <- readFirstLine(args$phosphoCol)
+}
+if (!is.null(args$startCol)) {
+  startColPattern <- readFirstLine(args$startCol)
+}
+
+# Debugging aids (disabled).  They were formerly bracketed by sink() calls, one
+#   of which was never undone and left stdout diverted to stderr for the rest
+#   of the script; no diversion is installed any more.
+#ACE print(paste("phosphoColPattern", phosphoColPattern))
+#ACE print(paste("startColPattern", startColPattern))
+
+inputFilename <- args$input
+filteredFilename <- args$filtered_data
+quantFilename <- args$quant_data
+intervalCol <- as.integer(args$intervalCol)
+# a missing, NA, or non-positive interval would make the column scan below
+#   fail obscurely or never terminate
+if (length(intervalCol) != 1 || is.na(intervalCol) || intervalCol < 1) {
+  stop("intervalCol must be a positive integer")
+}
+
+# Column headers of the input file, in file order
+firstLine <- readFirstLine(inputFilename)
+columnHeaders <- unlist(strsplit(x = firstLine, split = "\t", fixed = TRUE))
+#ACE print("columnHeaders")
+#ACE print(columnHeaders)
+
+
+# Indices of all headers that match the intensity pattern
+intensityHeaderCols <- grep(pattern = startColPattern, x = columnHeaders, perl = TRUE)
+if (length(intensityHeaderCols) == 0) {
+  # stop() reports the message on stderr
+  stop(paste("Found no intensity columns matching pattern:", startColPattern))
+}
+
+
+# Index of the first header that matches the phospho-count pattern
+phosphoCol <- grep(pattern = phosphoColPattern, x = columnHeaders, perl = TRUE)[1]
+if (is.na(phosphoCol)) {
+  stop(paste("Found no 'number of phospho sites' columns matching pattern:", phosphoColPattern))
+}
+
+
+# Collect the sample intensity columns: starting from the first matching
+#   header, step `intervalCol` entries through the matches and keep going while
+#   each one lies exactly `intervalCol` columns after the previous one.
+intensityCols <- intensityHeaderCols[1]
+this_column <- 1
+repeat {
+  this_column <- this_column + intervalCol
+  # the matches are exhausted; indexing further would yield NA
+  if (this_column > length(intensityHeaderCols)) break
+  next_col <- intensityHeaderCols[this_column]
+  # a gap in the spacing ends the run of sample columns
+  if (next_col != intensityCols[length(intensityCols)] + intervalCol) break
+  intensityCols <- c(intensityCols, next_col)
+}
+
+startCol <- intensityCols[1]
+numSamples <- length(intensityCols)
+
+outputfilename <- args$output
+enrichGraphFilename <- args$enrichGraph
+locProbCutoffGraphFilename <- args$locProbCutoffGraph
+enrichGraphFilename_svg <- args$enrichGraph_svg
+locProbCutoffGraphFilename_svg <- args$locProbCutoffGraph_svg
+
+localProbCutoff <- args$localProbCutoff
+enriched <- args$enriched
+collapse_FUN <- args$collapse_func
+
+### EXTRACT PARAMETERS from arguments end ------------------------------------
+
+
+# Proteomics Quality Control for MaxQuant Results
+#  (Bielow C et al. J Proteome Res. 2016 PMID: 26653327)
+# is run by the Galaxy MaxQuant wrapper and need not be invoked here.
+
+
+# Read the MaxQuant table, drop contaminant and reverse-sequence rows, and
+#   save what remains
+# ---
+fullData <- read.table(
+  file = inputFilename,
+  sep = "\t",
+  header = TRUE,
+  quote = ""
+)
+
+# A row is dropped when its Proteins entry carries any contaminant marker, or
+#   when its Protein entry carries the reverse-sequence marker (REV__ rows are
+#   blank in the first column, Proteins, so Protein is tested instead)
+filteredData <- subset(
+  fullData,
+  !grepl("CON__", Proteins) &
+    !grepl("_MYCOPLASMA", Proteins) &
+    !grepl("CONTAMINANT_", Proteins) &
+    !grepl("REV__", Protein)
+)
+write.table(
+  filteredData,
+  file = filteredFilename,
+  sep = "\t",
+  quote = FALSE,
+  col.names = TRUE,
+  row.names = FALSE
+)
+# ...
+
+
+# Filter out data with localization probability below localProbCutoff
+# ---
+# Keep the rows whose localization probability is at least the cutoff.
+#   which() discards comparisons that evaluate to NA; indexing with the bare
+#   logical vector would instead emit an all-NA row for each of them.
+locProbFilteredData <- filteredData[which(filteredData$Localization.prob >= localProbCutoff), ]
+# ...
+
+
+# Localization probability -- visualize locprob cutoff
+# ---
+# Percentage of the filtered rows that passed / failed the cutoff
+n_filtered <- nrow(filteredData)
+n_passed <- nrow(locProbFilteredData)
+cutoff_text <- toString(localProbCutoff)
+locProbGraphData <- data.frame(
+  group = c(paste0(">", cutoff_text), paste0("<", cutoff_text)),
+  value = c(
+    n_passed / n_filtered * 100,
+    (n_filtered - n_passed) / n_filtered * 100
+  )
+)
+
+# Single stacked bar split by group
+gigi <- ggplot(locProbGraphData, aes(x = "", y = value, fill = group))
+gigi <- gigi + geom_bar(width = 0.5, stat = "identity", color = "black")
+gigi <- gigi + labs(
+  x = NULL,
+  y = "percent",
+  title = "Phosphopeptides partitioned by localization-probability cutoff"
+)
+gigi <- gigi +
+  scale_fill_discrete(name = "phosphopeptide\nlocalization-\nprobability")
+gigi <- gigi + theme_minimal()
+gigi <- gigi + theme(
+  legend.position = "right",
+  legend.title = element_text(),
+  plot.title = element_text(hjust = 0.5),
+  plot.subtitle = element_text(hjust = 0.5),
+  plot.title.position = "plot"
+)
+
+# Render the same plot once as PDF and once as SVG
+pdf(locProbCutoffGraphFilename)
+print(gigi)
+dev.off()
+svg(locProbCutoffGraphFilename_svg)
+print(gigi)
+dev.off()
+# ...
+
+
+# Extract quantitative values from filtered data
+# ---
+# Take `numSamples` columns, beginning at `startCol` and spaced `intervalCol`
+#   apart.  drop = FALSE keeps the result a data frame (with its column name)
+#   even when there is only one sample.
+quantData <- locProbFilteredData[, seq(from = startCol, by = intervalCol, length.out = numSamples), drop = FALSE]
+# ...
+
+
+# Generate Phosphopeptide Sequence
+#   for latest version of MaxQuant (Version 1.5.3.30)
+# ---
+# Annotation columns: the first eight columns of the input followed by the
+#   eight columns that begin at the phospho-count column; the quantitative
+#   columns are appended after them.
+annotation_cols <- c(1:8, phosphoCol + 0:7)
+dataTable <- data.frame(locProbFilteredData[, annotation_cols], quantData)
+annotation_names <- c(
+  "Proteins",
+  "Positions within proteins",
+  "Leading proteins",
+  "Protein",
+  "Protein names",
+  "Gene names",
+  "Fasta headers",
+  "Localization prob",
+  "Number of Phospho (STY)",
+  "Amino Acid",
+  "Sequence window",
+  "Modification window",
+  "Peptide window coverage",
+  "Phospho (STY) Probabilities",
+  "Phospho (STY) Score diffs",
+  "Position in peptide"
+)
+colnames(dataTable) <- c(annotation_names, colnames(quantData))
+# Apply 'phosphopeptide_func' to every row (MARGIN 1) to obtain the
+#   phosphopeptide sequence for that row.
+dataTable$Phosphopeptide <- apply(X = dataTable, MARGIN = 1, FUN = phosphopeptide_func)
+# Put the quant data columns at the right end of the data.frame
+dataTable <- movetolast(dataTable, c(colnames(quantData)))
+# ...
+
+
+# Write quantitative values for debugging purposes
+# ---
+# Sequence window in the first column, sample intensities after it
+sequence_window <- dataTable[["Sequence window"]]
+quantWrite <- cbind(sequence_window, quantData)
+colnames(quantWrite)[1] <- "Sequence.Window"
+write.table(
+  quantWrite,
+  file = quantFilename,
+  sep = "\t",
+  quote = FALSE,
+  col.names = TRUE,
+  row.names = FALSE
+)
+# ...
+
+
+# Make new data frame containing only Phosphopeptides to be mapped to quant data (merge_df)
+# ---
+# setDT(..., keep.rownames=TRUE) turns dataTable into a data.table and exposes
+#   its row names as a column named "rn"; merge_df pairs that row index (as an
+#   integer) with the phosphopeptide built for the row.
+dataTable <- setDT(dataTable, keep.rownames=TRUE) #row name will be used to map
+merge_df <- data.frame(as.integer(dataTable$rn), dataTable$Phosphopeptide) #row index to merge data frames
+colnames(merge_df) <- c("rn", "Phosphopeptide")
+# ...
+
+
+# Add Phosphopeptide column to quant columns for quality control checking
+# ---
+# The quant columns get the same "rn" row index, are joined to merge_df on it,
+#   and the index is then discarded, leaving Phosphopeptide plus one column
+#   per sample.
+quantData_qc <- as.data.frame(quantData)
+setDT(quantData_qc, keep.rownames=TRUE) #will use to match rowname to data
+quantData_qc$rn <- as.integer(quantData_qc$rn)
+quantData_qc <- merge(merge_df,quantData_qc, by="rn")
+quantData_qc$rn <- NULL #remove rn column
+# ...
+
+
+# Collapse multiphosphorylated peptides
+# ---
+# Rows sharing a Phosphopeptide are merged by applying the collapse function
+#   to each sample column.  The option is documented as 'sum' or 'average',
+#   but R has no function named "average", so that choice is mapped to "mean";
+#   "sum" is used when the option was not supplied.
+collapse_fn_name <-
+  if (is.null(collapse_FUN)) {
+    "sum"
+  } else if (identical(collapse_FUN, "average")) {
+    "mean"
+  } else {
+    collapse_FUN
+  }
+quantData_qc_collapsed <- aggregate(. ~ Phosphopeptide, quantData_qc, FUN = collapse_fn_name)
+# ...
+
+
+# Compute (as string) % of phosphopeptides that are multiphosphorylated (for use in next step)
+# ---
+# rows merged away by the collapse step, relative to twice the row count
+#   before collapsing, formatted as a percentage with one decimal place
+n_before_collapse <- nrow(quantData_qc)
+n_after_collapse <- nrow(quantData_qc_collapsed)
+pct_multiphos <- (n_before_collapse - n_after_collapse) / (2 * n_before_collapse)
+pct_multiphos <- sprintf("%0.1f%s", 100 * pct_multiphos, "%")
+# ...
+
+
+# Compute and visualize breakdown of pY, pS, and pT before enrichment filter
+# ---
+# rows of the collapsed table whose phosphopeptide contains `tag`
+rows_with_tag <- function(tag) {
+  quantData_qc_collapsed[str_detect(quantData_qc_collapsed$Phosphopeptide, tag), ]
+}
+pY_data <- rows_with_tag("pY")
+pS_data <- rows_with_tag("pS")
+pT_data <- rows_with_tag("pT")
+
+pY_num <- nrow(pY_data)
+pS_num <- nrow(pS_data)
+pT_num <- nrow(pT_data)
+
+# Visualize enrichment; residue types with no peptides are left out
+enrichGraphData <- data.frame(
+  group = c("pY", "pS", "pT"),
+  value = c(pY_num, pS_num, pT_num)
+)
+enrichGraphData <- enrichGraphData[enrichGraphData$value > 0, ]
+
+# Plot pie chart with legend
+# start: https://stackoverflow.com/a/62522478/15509512
+# refine: https://www.statology.org/ggplot-pie-chart/
+# colors: https://colorbrewer2.org/#type=diverging&scheme=BrBG&n=8
+slices <- enrichGraphData$value
+phosphoresidue <- enrichGraphData$group
+pct <- round(100 * slices / sum(slices))
+lbls <- paste0(enrichGraphData$group, "\n", pct, "%\n(", slices, ")")
+# centre of each slice: everything before the slice plus half of the slice
+slc_ctr <- cumsum(pct) - pct / 2.0
+lbl_y <- 100 - slc_ctr
+df <- data.frame(
+  slices,
+  pct,
+  lbls,
+  phosphoresidue = factor(phosphoresidue, levels = phosphoresidue)
+)
+
+gigi <- ggplot(df, aes(x = 1, y = pct, fill = phosphoresidue))
+gigi <- gigi + geom_col(position = "stack", orientation = "x")
+gigi <- gigi + geom_text(aes(x = 1, y = lbl_y, label = lbls), col = "black")
+# polar coordinates turn the single stacked column into a pie
+gigi <- gigi + coord_polar(theta = "y", direction = -1)
+gigi <- gigi + labs(
+  x = NULL,
+  y = NULL,
+  title = "Percentages (and counts) of phosphosites, by type of residue",
+  caption = sprintf("Roughly %s of peptides have multiple phosphosites.", pct_multiphos)
+)
+gigi <- gigi + labs(x = NULL, y = NULL, fill = NULL)
+gigi <- gigi + theme_classic()
+gigi <- gigi + theme(
+  legend.position = "right",
+  axis.line = element_blank(),
+  axis.text = element_blank(),
+  axis.ticks = element_blank(),
+  plot.title = element_text(hjust = 0.5),
+  plot.subtitle = element_text(hjust = 0.5),
+  plot.caption = element_text(hjust = 0.5),
+  plot.title.position = "plot"
+)
+gigi <- gigi +
+  scale_fill_manual(breaks = phosphoresidue, values = c("#c7eae5", "#f6e8c3", "#dfc27d"))
+
+# Render the same plot once as PDF and once as SVG
+pdf(enrichGraphFilename)
+print(gigi)
+dev.off()
+svg(enrichGraphFilename_svg)
+print(gigi)
+dev.off()
+# ...
+
+
+# Filter phosphopeptides by enrichment
+# --
+# 'Y' keeps peptides containing pY; 'ST' keeps peptides containing pS or pT.
+#   Any other value (including a missing one) is a usage error and stops the
+#   script here, instead of failing later on an undefined result.
+if (identical(enriched, "Y")) {
+  keep_row <- str_detect(quantData_qc_collapsed$Phosphopeptide, "pY")
+} else if (identical(enriched, "ST")) {
+  keep_row <-
+    str_detect(quantData_qc_collapsed$Phosphopeptide, "pS") |
+    str_detect(quantData_qc_collapsed$Phosphopeptide, "pT")
+} else {
+  stop("Error in enriched variable. Set to either 'Y' or 'ST'")
+}
+quantData_qc_enrichment <- quantData_qc_collapsed[keep_row, ]
+# ...
+
+
+# Write phosphopeptides filtered by enrichment
+# --
+write.table(quantData_qc_enrichment, file = outputfilename, sep = "\t", quote = FALSE, row.names = FALSE)
+# ...
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/PhosphoPeptide_Upstream_Kinase_Mapping.pl	Mon Mar 07 19:05:01 2022 +0000
@@ -0,0 +1,2124 @@
+#!/usr/local/bin/perl
+###############################################################################################################################
+#    perl Kinase_enrichment_analysis_complete_v0.pl
+#
+#    Nick Graham, USC
+#    2016-02-27
+#
+#    Built from scripts written by NG at UCLA in Tom Graeber's lab:
+#        CombinePhosphoSites.pl
+#        Retrieve_p_motifs.pl
+#        NetworKIN_Motif_Finder_v7.pl
+#
+#    Given a list of phospho-peptides, find protein information and upstream kinases.
+#    Output file can be used for KS enrichment score calculations using Enrichment_Score4Directory.pl
+#
+#    Updated 2022-01-13, Art Eschenlauer, UMN on behalf of Justin Drake's lab:
+#        Added warnings and used strict;
+#        fixed some code paths resulting in more NetworKIN matches;
+#        applied Aho-Corasick algorithm (via external Python script because Perl implementation was still too slow)
+#        to speed up "Match the non_p_peptides to the @sequences array";
+#        added support for SQLite-formatted UniProtKB/Swiss-Prot data as an alternative to FASTA-formatted data;
+#        added support for SQLite output in addition to tabular files.
+#
+#
+###############################################################################################################################
+
+use strict;
+use warnings;
+
+use Getopt::Std;
+use DBD::SQLite::Constants qw/:file_open/;
+use DBI qw(:sql_types);
+use File::Copy;
+use File::Basename;
+use POSIX qw(strftime);
+use Time::HiRes qw(gettimeofday);
+#use Data::Dump qw(dump);
+
+# File-level state shared by the rest of the script.  Everything is declared
+# up front because the script runs under "use strict".
+
+# Cleared by the -a command-line option.
+# NOTE(review): presumably selects the external Python search helper mentioned
+# in the file header -- confirm where it is read.
+my $USE_SEARCH_PPEP_PY = 1;
+
+# directory that holds this script
+my $dirname = dirname(__FILE__);
+# command-line options as filled in by getopts
+my %opts;
+# values taken from the -i, -F, -D, -o, -O and -P options
+my ($file_in, $average_or_sum, $db_out, $file_out, $file_melt, $phospho_type);
+my $dbtype;
+# values taken from the -f, -n, -m, -p and -r options
+my ($fasta_in, $networkin_in, $motifs_in, $PSP_Kinase_Substrate_in, $PSP_Regulatory_Sites_in);
+# @samples / %sample_id_lut: sample names from the input header and their
+#   column numbers; %data: per-phosphopeptide intensities; %n: number of input
+#   rows seen per phosphopeptide
+my (@samples, %sample_id_lut, %ppep_id_lut, %data, @tmp_data, %n);
+my $line = 0;
+my @failed_match = ("Failed match");
+my @failed_matches;
+my (%all_data);
+# phosphopeptides with and without their "p" markers (parallel arrays)
+my (@p_peptides, @non_p_peptides);
+my @parsed_fasta;
+my (@accessions, @names, @sequences, @databases, $database);
+my ($dbfile, $dbh, $stmth);
+my @col_names;
+my (%matched_sequences, %accessions,     %names,     %sites,   );
+my (@tmp_matches,       @tmp_accessions, @tmp_names, @tmp_sites);
+my (%p_residues, @tmp_p_residues, @p_sites, $left, $right, %p_motifs, @tmp_motifs_array, $tmp_motif, $tmp_site, %residues);
+my (@kinases_observed, $kinases);
+my (@kinases_observed_lbl, @phosphosites_observed_lbl);
+my ($p_sequence_kinase, $p_sequence, $kinase);
+my (@motif_sequence, %motif_type, %motif_count);
+my (@kinases_PhosphoSite, $kinases_PhosphoSite);
+my ($p_sequence_kinase_PhosphoSite, $p_sequence_PhosphoSite, $kinase_PhosphoSite);
+my (%regulatory_sites_PhosphoSite_hash);
+#ACE my %psp_regsite_protein;
+my (%domain, %ON_FUNCTION, %ON_PROCESS, %ON_PROT_INTERACT, %ON_OTHER_INTERACT, %notes, %organism);
+my (%unique_motifs);
+my ($kinase_substrate_NetworKIN_matches, $kinase_motif_matches, $kinase_substrate_PhosphoSite_matches);
+my %psp_regsite_protein_2;
+# NOTE(review): %N_PROT_INTERACT sits among the "_2" hashes and may be a typo
+# for another name -- confirm before relying on it.
+my (%domain_2, %ON_FUNCTION_2, %ON_PROCESS_2, %ON_PROT_INTERACT_2, %N_PROT_INTERACT, %ON_OTHER_INTERACT_2, %notes_2, %organism_2);
+my @timeData;
+my $PhosphoSitePlusCitation;
+my %site_description;
+
+my %kinase_substrate_NetworKIN_matches;
+my %kinase_motif_matches;
+my $regulatory_sites_PhosphoSite;
+my ($seq_plus5aa, $seq_plus7aa, %seq_plus7aa_2);
+my %kinase_substrate_PhosphoSite_matches;
+my @formatted_sequence;
+my $pSTY_sequence;
+my $i;
+my @a;
+# set to 0 once the -f FASTA file has been accepted
+my $use_sqlite;
+# 1 when -v was given, otherwise 0
+my $verbose;
+
+##########
+## opts ##
+##########
+  ## input files
+    # i : path to input file, e.g., 'outputfile_STEP2.txt'
+    # f : path to UniProtKB/SwissProt FASTA
+    # s : optional species argument
+    # n : path to NetworKIN_201612_cutoffscore2.0.txt
+    # m : path to pSTY_Motifs.txt
+    # p : path to 2017-03_PSP_Kinase_Substrate_Dataset.txt
+    # r : path to 2017-03_PSP_Regulatory_sites.txt
+  ## options
+    # P : phospho_type
+    # F : function
+    # v : verbose output
+  ## output files
+    # o : path to output file
+    # O : path to "melted" output file
+    # D : path to output SQLite file
+
+# Print the help text to STDERR and terminate the program.
+# Takes no arguments (empty prototype); $0 in the text is interpolated with
+# the name the script was invoked as.  Never returns: it ends with exit.
+sub usage()
+    {
+        print STDERR <<"EOH";
+    This program given a list of phospho-peptides, finds protein information and upstream kinases.
+    usage: $0 [-hvd] -f FASTA_file
+     -h : this (help) message
+     -v : slightly verbose
+     -a : use SQLite less
+     ## input files
+     -i : path to input file, e.g., 'outputfile_STEP2.txt'
+     -f : path to UniProtDB/SwissProt FASTA
+     -s : optional species filter argument for PSP records; defaults to 'human'
+     -n : path to NetworKIN_201612_cutoffscore2.0.txt
+     -m : path to pSTY_Motifs.txt
+     -p : path to 2017-03_PSP_Kinase_Substrate_Dataset.txt
+     -r : path to 2017-03_PSP_Regulatory_sites.txt
+     ## options
+     -P : phospho_type
+     -F : function
+     ## output files
+     -o : path to output file
+     -O : path to "melted" output file
+     -D : path to output SQLite file
+    example: $0
+EOH
+        # exit with the default status
+        exit;
+    }
+
+# Return the current local time as "YYYY-MM-DDTHH:MM:SS.mmm".
+# Takes no arguments.  The whole-second part and the millisecond part both
+# come from the same gettimeofday reading, so they cannot disagree when the
+# clock ticks over to the next second between two separate clock reads.
+sub format_localtime_iso8601 {
+    # ref: https://perldoc.perl.org/Time::HiRes
+    my ($seconds, $microseconds) = gettimeofday;
+    # ref: https://pubs.opengroup.org/onlinepubs/9699919799/functions/strftime.html
+    return strftime("%Y-%m-%dT%H:%M:%S", localtime($seconds)) . sprintf(".%03d", int($microseconds / 1000));
+}
+
+# Rewrite the "pS"/"pT"/"pY" markers of a sequence in single-letter form.
+#   $formatted_sequence : sequence containing pS, pT and/or pY markers
+#   $phospho_type       : 'y'   -> pS becomes S, pT becomes T, pY becomes y
+#                         'sty' -> pS becomes s, pT becomes t, pY becomes y
+#                         any other value leaves the sequence untouched
+# Returns the rewritten sequence; the caller's string is not modified.
+sub replace_pSpTpY {
+    my ($formatted_sequence, $phospho_type) = @_;
+    # replacement for the residue letter that follows a "p"
+    my %single_letter =
+          $phospho_type eq 'y'   ? (S => 'S', T => 'T', Y => 'y')
+        : $phospho_type eq "sty" ? (S => 's', T => 't', Y => 'y')
+        :                          ();
+    if (%single_letter) {
+        $formatted_sequence =~ s/p([STY])/$single_letter{$1}/g;
+    }
+    return $formatted_sequence;
+}
+
+# Rewrite the text held in $_ in place, mirroring a series of sed commands.
+# Takes no arguments (empty prototype): every match and substitution below has
+# no explicit target and therefore acts on $_, which the caller must set.
+#   1) when " GN=", " PE=" or " SV=" is absent, a "<key>=N/A" placeholder is
+#      appended after the preceding OX=, GN= or PE= field respectively;
+#   2) a leading "sp" plus one more character is removed;
+#   3) every "|" and the first " OS=", " OX=", " GN=", " PE=" and " SV=" are
+#      replaced by a tab.
+# $s only receives the result of each substitution; because the last statement
+# is such an assignment, the sub's return value is the result of the final
+# (" SV=") substitution.
+sub pseudo_sed()
+{
+    # Comments give the sed equivalent
+    my $s;
+    # / GN=/!{ s:\(OX=[^ \t]*\):\1 GN=N/A:; };
+    unless (m / GN=/s)
+    {
+        $s = s :(OX=[^ \t]*):${1} GN=N/A:s;
+    }
+    # / PE=/!{ s:\(GN=[^ \t]*\):\1 PE=N/A:; };
+    unless (m / PE=/s)
+    {
+        $s = s :(GN=[^ \t]*):${1} PE=N/A:s;
+    }
+    # / SV=/!{ s:\(PE=[^ \t]*\):\1 SV=N/A:; };
+    unless (m / SV=/s)
+    {
+        $s = s :(PE=[^ \t]*):${1} SV=N/A:s;
+    }
+    # s/^sp.//;
+    $s = s /^sp.//s;
+    # s/[|]/\t/g;
+    $s = s /[|]/\t/sg;
+    # s/ OS=/\t/;
+    $s = s / OS=/\t/s;
+    # s/ OX=/\t/;
+    $s = s / OX=/\t/s;
+    # s/ GN=/\t/;
+    $s = s / GN=/\t/s;
+    # s/ PE=/\t/;
+    $s = s / PE=/\t/s;
+    # s/ SV=/\t/;
+    $s = s / SV=/\t/s;
+} # sub pseudo_sed
+
+# Parse the command line into %opts, then copy each option into its variable.
+# Required input files are checked in the order i, f, n, m, p, r and the
+# script dies at the first one that is missing or does not exist.
+getopts('i:f:s:n:m:p:r:P:F:o:O:D:hva', \%opts) ;
+
+#ACE print %opts; #ACE
+#ACE print "\n"; #ACE
+
+# -h : show the help text (usage() exits)
+usage() if exists($opts{'h'});
+# -a : clear the $USE_SEARCH_PPEP_PY switch
+$USE_SEARCH_PPEP_PY = 0 if exists($opts{'a'});
+# -v : verbose flag, always set to 1 or 0
+$verbose = exists($opts{'v'}) ? 1 : 0;
+
+# -i : input data file (required)
+die('Input File not found') unless exists($opts{'i'}) && -e $opts{'i'};
+$file_in = $opts{'i'};
+
+# -f : FASTA file (required); sequence data is never read from SQLite
+die('FASTA not found') unless exists($opts{'f'}) && -e $opts{'f'};
+$fasta_in = $opts{'f'};
+$use_sqlite = 0;
+#ACE  if (exists($opts{'s'}) && -e $opts{'s'}) {
+#ACE      $use_sqlite = 1;
+#ACE      $dbfile = $opts{'s'};
+#ACE  } elsif (!exists($opts{'f'}) || !-e $opts{'f'}) {
+#ACE      die('Neither input FASTA file nor input SQLite file was found');
+#ACE  } else {
+#ACE      $use_sqlite = 0;
+#ACE      $fasta_in = $opts{'f'};
+#ACE  }
+
+# -s : species filter; 'human' when absent or empty
+my $species;
+if (exists($opts{'s'}) && $opts{'s'} ne '') {
+    $species = $opts{'s'};
+    print "'-s' option is '$species'\n";
+} else {
+    $species = 'human';
+}
+print "species filter is '$species'\n";
+
+# -n, -m, -p, -r : reference data files (all required)
+die('Input NetworKIN File not found') unless exists($opts{'n'}) && -e $opts{'n'};
+$networkin_in = $opts{'n'};
+
+die('Input pSTY_Motifs File not found') unless exists($opts{'m'}) && -e $opts{'m'};
+$motifs_in = $opts{'m'};
+
+die('Input PSP_Kinase_Substrate_Dataset File not found') unless exists($opts{'p'}) && -e $opts{'p'};
+$PSP_Kinase_Substrate_in = $opts{'p'};
+
+die('Input PSP_Regulatory_sites File not found') unless exists($opts{'r'}) && -e $opts{'r'};
+$PSP_Regulatory_Sites_in = $opts{'r'};
+
+# -P, -F : processing choices, with their fallbacks
+$phospho_type   = exists($opts{'P'}) ? $opts{'P'} : "sty";
+$average_or_sum = exists($opts{'F'}) ? $opts{'F'} : "sum";
+
+# -D, -O, -o : output paths, with their fallbacks
+$db_out    = exists($opts{'D'}) ? $opts{'D'} : "db_out.sqlite";
+$file_melt = exists($opts{'O'}) ? $opts{'O'} : "output_melt.tsv";
+$file_out  = exists($opts{'o'}) ? $opts{'o'} : "output.tsv";
+
+
+###############################################################################################################################
+# Print the relevant file names to the screen
+###############################################################################################################################
+# print "\nData file:  $data_in\nFASTA file:  $fasta_in\nSpecies:  $species\nOutput file:  $motifs_out\n\n";
+# Echo the effective parameters to standard output.
+print "\n--- parameters:\n";
+print "Data file:  $file_in\nAverage or sum identical p-sites?  $average_or_sum\nOutput file:  $file_out\nMelted map:  $file_melt\n";
+# The NetworKIN line interpolates $networkin_in; it used to print the literal
+# text "networkin_in" because the sigil was missing.
+if ($use_sqlite == 0) {
+  print "Motifs file:  $motifs_in\nNetworKIN file:  $networkin_in\nPhosphosite kinase substrate data:  $PSP_Kinase_Substrate_in\nPhosphosite regulatory site data:  $PSP_Regulatory_Sites_in\nUniProtKB/SwissProt FASTA file:  $fasta_in\nOutput SQLite file: $db_out\n";
+} else {
+  print "Motifs file:  $motifs_in\nNetworKIN file:  $networkin_in\nPhosphosite kinase substrate data:  $PSP_Kinase_Substrate_in\nPhosphosite regulatory site data:  $PSP_Regulatory_Sites_in\nUniProtKB/SwissProt SQLIte file:  $dbfile\nOutput SQLite file: $db_out\n";
+}
+print "...\n\n";
+
+# Only 'y' and 'sty' are accepted as phospho-type.
+print "Phospho-residues(s) = $phospho_type\n\n";
+if ($phospho_type ne 'y' && $phospho_type ne 'sty') {
+    die "\nUsage error:\nYou must choose a phospho-type, either y or sty\n\n";
+}
+
+###############################################################################################################################
+# read the input data file
+# average or sum identical phospho-sites, depending on the value of $average_or_sum
+###############################################################################################################################
+
+# Three-argument open: $file_in is always taken as the name of a file to read,
+# whatever characters it contains.
+open (IN, '<', $file_in) or die "I couldn't find the input file:  $file_in\n";
+
+die "\n\nScript died: You must choose either average or sum for \$average_or_sum\n\n" if (($average_or_sum ne "sum") && ($average_or_sum ne "average")) ;
+
+
+$line = 0;
+
+# Read the tab-separated input: the first line names the samples, every other
+# line is a phosphopeptide followed by one value per sample.
+#   @samples, %sample_id_lut : sample names and their column numbers
+#   %data                    : phosphopeptide => array ref of per-sample values
+#   %n                       : phosphopeptide => number of rows merged so far
+# Rows that repeat a phosphopeptide are merged into %data by summing or by a
+# running average, depending on $average_or_sum.
+while (<IN>) {
+    chomp;
+    my @x = split(/\t/);
+    # strip carriage returns, newlines and double quotes from every field
+    for my $n (0 .. $#x) {$x[$n] =~ s/\r//g; $x[$n]  =~ s/\n//g; $x[$n]  =~ s/\"//g;}
+
+    # Read in the samples
+    if ($line == 0) {
+        for my $n (1 .. $#x) {
+            push (@samples, $x[$n]);
+            $sample_id_lut{$x[$n]} = $n;
+        }
+        $line++;
+    } else {
+        my $ppep = $x[0];
+        # check whether we have already seen a phospho-peptide
+        if (exists($data{$ppep})) {
+            my $merged = $data{$ppep};
+            if ($average_or_sum eq "sum") {        # add the data
+                for my $k (0 .. $#{$merged}) { $merged->[$k] = $merged->[$k] + $x[$k+1]; }
+                # NOTE(review): the sum used to be written to %all_data only,
+                # leaving %data unchanged; %all_data is still filled in case
+                # code outside this block reads it -- confirm and remove.
+                @{$all_data{$ppep}} = @{$merged};
+
+            } elsif ($average_or_sum eq "average") {        # average the data
+                # running mean over the $n{$ppep} rows already merged plus
+                # this row's value for the same sample
+                for my $k (0 .. $#{$merged}) {
+                    $merged->[$k] = ( $merged->[$k]*$n{$ppep} + $x[$k+1] ) / ($n{$ppep} + 1);
+                }
+                $n{$ppep}++;
+            }
+        }
+        # if the phospho-sequence has not been seen, save the data
+        else {
+            for my $k (1 .. $#x) { push(@{$data{$ppep}}, $x[$k]); }
+            $n{$ppep} = 1;
+        }
+    }
+}
+close(IN);
+
+
+###############################################################################################################################
+# Search the FASTA database for phospho-sites and motifs
+#
+# based on Retrieve_p_peptide_motifs_v2.pl
+###############################################################################################################################
+
+
+###############################################################################################################################
+#
+#    Read in the Data file:
+#        1) make @p_peptides array as in the original file
+#        2) make @non_p_peptides array w/o residue modifications (p, #, other)
+#
+###############################################################################################################################
+
+# For every peptide key of %data, append two parallel entries:
+#   @p_peptides     gets the key with each lowercase s/t/y rewritten as pS/pT/pY;
+#   @non_p_peptides gets that same string with every 'p' removed.
+for my $data_key (keys %data) {
+    (my $marked = $data_key) =~ s/([sty])/"p" . uc($1)/ge;
+    push(@p_peptides, $marked);
+    (my $unmarked = $marked) =~ tr/p//d;
+    push(@non_p_peptides, $unmarked);
+}
+
+if ($use_sqlite == 0) {
+  ###############################################################################################################################
+  #
+  #    Read in the UniProtKB/Swiss-Prot data from FASTA; save to @sequences array and SQLite output database
+  #
+  ###############################################################################################################################
+
+  # e.g.
+  #   >sp|Q9Y3B9|RRP15_HUMAN RRP15-like protein OS=Homo sapiens OX=9606 GN=RRP15 PE=1 SV=2
+  #   MAAAAPDSRVSEEENLKKTPKKKMKMVTGAVASVLEDEATDTSDSEGSCGSEKDHFYSDD
+  #   DAIEADSEGDAEPCDKENENDGESSVGTNMGWADAMAKVLNKKTPESKPTILVKNKKLEK
+  #   EKEKLKQERLEKIKQRDKRLEWEMMCRVKPDVVQDKETERNLQRIATRGVVQLFNAVQKH
+  #   QKNVDEKVKEAGSSMRKRAKLISTVSKKDFISVLRGMDGSTNETASSRKKPKAKQTEVKS
+  #   EEGPGWTILRDDFMMGASMKDWDKESDGPDDSRPESASDSDT
+  # accession: Q9Y3B9
+  # name: RRP15_HUMAN RRP15-like protein OS=Homo sapiens OX=9606 GN=RRP15 PE=1 SV=2
+  # sequence: MAAAAPDSRVSEEENLKKTPKKKMKMVTGAVASVLEDEATDTSDSEGSCGSEKDHFYSDD DAIEADSEGDAEPCDKENENDGESSVGTNMGWADAMAKVLNKKTPESKPTILVKNKKLEK EKEKLKQERLEKIKQRDKRLEWEMMCRVKPDVVQDKETERNLQRIATRGVVQLFNAVQKH QKNVDEKVKEAGSSMRKRAKLISTVSKKDFISVLRGMDGSTNETASSRKKPKAKQTEVKS EEGPGWTILRDDFMMGASMKDWDKESDGPDDSRPESASDSDT
+
+  # Three-argument open: the path is always treated as a plain file name, never
+  #   as a mode prefix or a pipe, whatever characters it contains.
+  open (IN1, "<", $fasta_in) or die "I couldn't find $fasta_in\n";
+  print "Reading FASTA file $fasta_in\n";
+  # ref: https://perldoc.perl.org/perlsyn#Compound-Statements
+  #      "If the condition expression of a while statement is based on any of
+  #      a group of iterative expression types then it gets some magic treatment.
+  #      The affected iterative expression types are readline, the <FILEHANDLE>
+  #      input operator, readdir, glob, the <PATTERN> globbing operator, and
+  #      `each`. If the condition expression is one of these expression types,
+  #      then the value yielded by the iterative operator will be implicitly
+  #      assigned to `$_`."
+  while (<IN1>) {
+    chomp;
+    # ref: https://perldoc.perl.org/functions/split#split-/PATTERN/,EXPR
+    #      "If only PATTERN is given, EXPR defaults to $_."
+    my (@x) = split(/\|/);
+    for my $i (0 .. $#x) {
+      $x[$i] =~ s/\r//g; $x[$i]  =~ s/\n//g; $x[$i]  =~ s/\"//g; }
+    if ($x[0] =~ /^>/) {
+      # header line: database | accession | name-and-description
+      $x[0] =~ s/\>//g;
+      push (@databases, $x[0]);
+      push (@accessions, $x[1]);
+      push (@names, $x[2]);
+      #ACE print "names $x[2]\n";
+      #ACE print "--- $_\n";
+      # pseudo_sed() is defined elsewhere in this file; presumably it rewrites the
+      #   header held in $_ into tab-separated fields -- TODO confirm
+      pseudo_sed();
+      # terminate the header fields with a tab so that sequence text appended
+      #   below becomes the next tab-separated field
+      s/$/\t/;
+      push (@parsed_fasta, $_);
+    } elsif ($x[0] =~ /^\w/) {
+      # sequence line: append to the sequence of the most recent header
+      if (defined $sequences[$#accessions]) {
+        $sequences[$#accessions] = $sequences[$#accessions].$x[0];
+      } else {
+        $sequences[$#accessions] = $x[0];
+      }
+      $parsed_fasta[$#accessions] = $parsed_fasta[$#accessions].$x[0];
+    }
+    #ACE print "... '$parsed_fasta[$#accessions]'\n";
+  }
+  close IN1;
+  print "Done Reading FASTA file $fasta_in\n";
+  $dbfile = $db_out;
+  print "Begin writing $dbfile at " . format_localtime_iso8601() . "\n";
+  # Fail with the driver's message when the database cannot be opened, instead of
+  #   failing later on an undefined handle.
+  $dbh = DBI->connect("dbi:SQLite:$dbfile", undef, undef)
+    or die "Could not open SQLite database $dbfile for writing: $DBI::errstr\n";
+  # Turn AutoCommit off so that all of the inserts share one transaction;
+  #   the saved setting is restored after the last insert.
+  my $auto_commit = $dbh->{AutoCommit};
+  print "auto_commit was $auto_commit and is now 0\n" if ($verbose);
+  $dbh->{AutoCommit} = 0;
+
+  # begin DDL-to-SQLite
+  # ---
+  $stmth = $dbh->prepare("
+    DROP TABLE IF EXISTS UniProtKB;
+    ");
+  $stmth->execute();
+
+  $stmth = $dbh->prepare("
+  CREATE TABLE UniProtKB (
+    Uniprot_ID TEXT PRIMARY KEY ON CONFLICT IGNORE,
+    Description TEXT,
+    Organism_Name TEXT,
+    Organism_ID INTEGER,
+    Gene_Name TEXT,
+    PE TEXT,
+    SV TEXT,
+    Sequence TEXT,
+    Database TEXT
+  )
+  ");
+  $stmth->execute();
+  $stmth = $dbh->prepare("
+  CREATE UNIQUE INDEX idx_uniq_UniProtKB_0 on UniProtKB(Uniprot_ID);
+  ");
+  $stmth->execute();
+  $stmth = $dbh->prepare("
+  CREATE INDEX idx_UniProtKB_0 on UniProtKB(Gene_Name);
+  ");
+  $stmth->execute();
+  # ...
+  # end DDL-to-SQLite
+
+  # insert all rows
+  # begin store-to-SQLite "UniProtKB" table
+  # ---
+  $stmth = $dbh->prepare("
+  INSERT INTO UniProtKB (
+    Uniprot_ID,
+    Description,
+    Organism_Name,
+    Organism_ID,
+    Gene_Name,
+    PE,
+    SV,
+    Sequence,
+    Database
+  ) VALUES (?,?,?,?,?,?,?,?,?)
+  ");
+  my $row_count = 1;
+  my $row_string;
+  my @row;
+  # Consume @parsed_fasta from the end; the index of the last remaining element
+  #   also selects the matching entry of @databases.
+  while ( scalar @parsed_fasta > 0 ) {
+      $database = $databases[$#parsed_fasta];
+      #### print "parsed_fasta[-1]: " . $parsed_fasta[$#parsed_fasta] . "\n";
+      $row_string = pop(@parsed_fasta);
+      #### print "row_string: $row_string\n";
+      @row = (split /\t/, $row_string);
+      # placeholders are numbered from 1; field $row[$i] feeds placeholder $i
+      for $i (1..3,5..8) {
+          $stmth->bind_param($i, $row[$i]);
+      }
+      $stmth->bind_param(9, $database);
+      $stmth->bind_param(4, $row[4], { TYPE => SQL_INTEGER });
+      if (not $stmth->execute()) {
+          # a method call is not interpolated inside a double-quoted string,
+          #   so the error text has to be concatenated
+          print "Error in row $row_count: " . $stmth->errstr . "\n";
+      }
+      $row_count += 1;
+  }
+  # ...
+  # end store-to-SQLite "UniProtKB" table
+
+  print "begin commit at " . format_localtime_iso8601() . "\n";
+  # restoring AutoCommit from off to on commits the pending transaction (see DBI docs)
+  $dbh->{AutoCommit} = $auto_commit;
+  print "auto_commit is now $auto_commit\n" if ($verbose);
+  $dbh->disconnect if ( defined $dbh );
+  print "Finished writing $dbfile at " . format_localtime_iso8601() . "\n\n";
+  $dbtype = "FASTA";
+}
+
+if ($use_sqlite == 1) {
+  ###############################################################################################################################
+  #
+  #    Read in the UniProtKB/Swiss-Prot data from SQLite; save to @sequences array
+  #
+  ###############################################################################################################################
+
+  # The input database is copied to the output path first; the rows are then
+  #   read from the input copy, which is opened read-only.
+  copy($dbfile, $db_out) or die "Copy $dbfile to $db_out failed: $!";
+
+  # https://metacpan.org/pod/DBD::SQLite#Read-Only-Database
+  # Fail with the driver's message when the database cannot be opened, instead of
+  #   failing later on an undefined handle.
+  $dbh = DBI->connect("dbi:SQLite:$dbfile", undef, undef, {
+    sqlite_open_flags => SQLITE_OPEN_READONLY,
+  }) or die "Could not open SQLite database $dbfile read-only: $DBI::errstr\n";
+  print "DB connection $dbh is to $dbfile\n";
+
+  # Uniprot_ID, Description, Organism_Name, Organism_ID, Gene_Name, PE, SV, Sequence
+  # The second selected column rebuilds a FASTA-style description, leaving out
+  #   the GN/PE/SV parts whose stored value is 'N/A'.
+  $stmth = $dbh->prepare("
+  SELECT Uniprot_ID
+  , Description || ' OS=' || Organism_Name || ' OX=' || Organism_ID
+    || CASE WHEN Gene_Name = 'N/A' THEN '' ELSE ' GN='|| Gene_Name END
+    || CASE WHEN PE = 'N/A' THEN '' ELSE ' PE='|| PE        END
+    || CASE WHEN SV = 'N/A' THEN '' ELSE ' SV='|| SV        END
+    AS Description
+  , Sequence
+  , Database
+  FROM
+    UniProtKB
+  ");
+  $stmth->execute();
+  @col_names = @{$stmth->{NAME}};
+  print "\nColumn names selected from UniProtKB SQLite table: " . join(", ", @col_names) . "\n\n" if ($verbose);
+  # Fill the parallel arrays; the sequence is stored at the index of the
+  #   accession that was just pushed.
+  while (my @row = $stmth->fetchrow_array) {
+    push (@names,              $row[1]); # redacted Description
+    push (@accessions,         $row[0]); # Uniprot_ID
+    $sequences[$#accessions] = $row[2];  # Sequence
+    push (@databases,          $row[3]); # Database (should be 'sp')
+  }
+
+  $dbh->disconnect if ( defined $dbh );
+
+  print "Done Reading UniProtKB/Swiss-Prot file $dbfile\n\n";
+  $dbtype = "SQLite";
+}
+
+# $#accessions is the index of the last element; the number of records read
+#   is the length of the array.
+print scalar(@accessions) . " accessions were read from the UniProtKB/Swiss-Prot $dbtype file\n";
+
+@timeData = localtime(time);
+print "\n--- Start search at " . format_localtime_iso8601() ."\n";
+
+print "    --> Calling 'search_ppep' script\n\n";
+# Run search_ppep.py with the list form of system() so that no shell is involved
+#   and paths containing spaces or shell metacharacters are passed through intact.
+# An unset CONDA_PREFIX yields "/bin/python", as the shell expansion did before.
+my @search_ppep_cmd = (
+  (defined $ENV{CONDA_PREFIX} ? $ENV{CONDA_PREFIX} : "") . "/bin/python",
+  "$dirname/search_ppep.py",
+  "-u", $db_out,
+  "-p", $file_in,
+);
+push(@search_ppep_cmd, "--verbose") if ($verbose);
+$i = system(@search_ppep_cmd);
+if ($i) {
+  # system() returns the raw wait status: -1 when the program could not be
+  #   started, the signal number in the low seven bits, the exit code in the
+  #   high byte.
+  my $search_outcome;
+  if ($i == -1) {
+    $search_outcome = "could not be started: $!";
+  } elsif ($i & 127) {
+    $search_outcome = "was terminated by signal " . ($i & 127);
+  } else {
+    $search_outcome = "exited with exit code " . ($i >> 8);
+  }
+  print "@search_ppep_cmd\n  $search_outcome\n";
+  die "Search failed for phosphopeptides in SwissProt/SQLite file.";
+}
+print "    <-- Returned from 'search_ppep' script\n";
+
+@timeData = localtime(time);
+print "... Finished search at " . format_localtime_iso8601() ."\n\n";
+
+
+###############################################################################################################################
+#
+#    Match the non_p_peptides to the @sequences array:
+#        1) Format the motifs +/- 10 residues around the phospho-site
+#        2) Print the original data plus the phospho-motif to the output file
+#
+###############################################################################################################################
+
+#ACE print OUT "$headers\tFormatted Motifs\tUnique Motifs\tPhospho-site(s)\tAccessions(s)\tName(s)\n";
+
+print "--- Match the non_p_peptides to the \@sequences array:\n";
+
+# Report which lookup strategy is in effect.
+print "Find the matching protein sequence(s) for the peptide using "
+  . ($USE_SEARCH_PPEP_PY ? "SQLite" : "slow search")
+  . "\n";
+
+# Open the output database read-only for the queries that follow;
+#   see https://metacpan.org/pod/DBD::SQLite#Read-Only-Database
+my %readonly_attr = ( sqlite_open_flags => SQLITE_OPEN_READONLY );
+$dbh = DBI->connect("dbi:SQLite:$db_out", undef, undef, { %readonly_attr });
+print "DB connection $dbh is to $db_out\n";
+
+# CREATE VIEW uniprotid_pep_ppep AS
+#   SELECT   deppep_UniProtKB.UniprotKB_ID       AS accession
+#          , deppep.seq                          AS peptide
+#          , ppep.seq                            AS phosphopeptide
+#          , UniProtKB.Sequence                  AS sequence
+#          , UniProtKB.Description               AS description
+#   FROM     ppep, deppep, deppep_UniProtKB, UniProtKB
+#   WHERE    deppep.id = ppep.deppep_id
+#   AND      deppep.id = deppep_UniProtKB.deppep_id
+#   AND      deppep_UniProtKB.UniprotKB_ID = UniProtKB.Uniprot_ID
+#   ORDER BY UniprotKB_ID, deppep.seq, ppep.seq;
+
+# Map each phosphopeptide to the number of rows that uniprotkb_pep_ppep_view
+#   holds for it.
+my %ppep_to_count_lut;
+print "start select peptide counts " . format_localtime_iso8601() . "\n";
+$stmth = $dbh->prepare("
+    SELECT DISTINCT
+      phosphopeptide
+    , count(*) as i
+    FROM
+      uniprotkb_pep_ppep_view
+    GROUP BY
+      phosphopeptide
+    ORDER BY
+      phosphopeptide
+");
+if (not $stmth->execute()) {
+    # a method call is not interpolated inside a double-quoted string,
+    #   so the error text has to be concatenated
+    die "Error fetching peptide counts: " . $stmth->errstr . "\n";
+}
+while (my @row = $stmth->fetchrow_array) {
+  $ppep_to_count_lut{$row[0]} = $row[1];
+  #print "\$ppep_to_count_lut{$row[0]} = $ppep_to_count_lut{$row[0]}\n";
+}
+
+# Positions of the columns in the rows selected from uniprotkb_pep_ppep_view:
+#   0 accession, 1 peptide, 2 sequence, 3 description, 4 phosphopeptide,
+#   5 long_description, 6 pos_start, 7 pos_end, 8 scrubbed, 9 ppep_id
+my (
+    $COL_ACCESSION,
+    $COL_PEPTIDE,
+    $COL_SEQUENCE,
+    $COL_DESCRIPTION,
+    $COL_PHOSPHOPEPTIDE,
+    $COL_LONG_DESCRIPTION,
+    $COL_POS_START,
+    $COL_POS_END,
+    $COL_SCRUBBED,
+    $COL_PPEP_ID,
+) = (0 .. 9);
+
+# Read every row of uniprotkb_pep_ppep_view, ordered by phosphopeptide, and
+#   collect per-phosphopeptide lists of matched sequences, accessions, names
+#   and start positions.  The row count obtained beforehand for each
+#   phosphopeptide (%ppep_to_count_lut) tells when its last row has been read.
+my %ppep_to_row_lut;
+print "start select all records without qualification " . format_localtime_iso8601() . "\n";
+$stmth = $dbh->prepare("
+    SELECT DISTINCT
+      accession
+    , peptide
+    , sequence
+    , description
+    , phosphopeptide
+    , long_description
+    , pos_start
+    , pos_end
+    , scrubbed
+    , ppep_id
+    FROM
+      uniprotkb_pep_ppep_view
+    ORDER BY
+      phosphopeptide
+");
+if (not $stmth->execute()) {
+    # a method call is not interpolated inside a double-quoted string,
+    #   so the error text has to be concatenated
+    die "Error fetching all records without qualification: " . $stmth->errstr . "\n";
+}
+my $current_ppep;
+my $counter = 0;
+my $former_ppep = "";
+@tmp_matches = ();
+@tmp_accessions = ();
+@tmp_names = ();
+@tmp_sites = ();
+while (my @row = $stmth->fetchrow_array) {
+    # Identify phosphopeptide for current row;
+    #   it is an error for it to change when the counter is not zero.
+    $current_ppep = $row[$COL_PHOSPHOPEPTIDE];
+
+    # when counter is zero, prepare for a new phosphopeptide
+    if (not $current_ppep eq $former_ppep) {
+        die "counter is $counter instead of zero" if ($counter != 0);
+        $ppep_id_lut{$current_ppep} = $row[$COL_PPEP_ID];
+        print "next phosphpepetide: $current_ppep; id: $ppep_id_lut{$current_ppep}\n" if ($verbose);
+        # number of rows expected for this phosphopeptide
+        $counter = $ppep_to_count_lut{$current_ppep};
+        @tmp_matches = ();
+        @tmp_accessions = ();
+        @tmp_names = ();
+        @tmp_sites = ();
+    }
+
+    if ($USE_SEARCH_PPEP_PY) {
+        push(@tmp_matches,    $row[ $COL_SEQUENCE         ]);
+        push(@tmp_accessions, $row[ $COL_ACCESSION        ]);
+        push(@tmp_names,      $row[ $COL_LONG_DESCRIPTION ]);
+        push(@tmp_sites,      $row[ $COL_POS_START        ]);
+    }
+
+    # Prepare counter and phosphopeptide tracker for next row
+    #ACE print "counter: $counter; phosphpepetide: $current_ppep\n";
+    $former_ppep = $current_ppep;
+    $counter -= 1;
+
+    # Set trackers for later use after last instance of current phosphopeptide
+    if ($counter == 0) {
+        if ($USE_SEARCH_PPEP_PY) {
+            # store copies, because the @tmp_* arrays are reused for the next phosphopeptide
+            $matched_sequences{$current_ppep} = [ @tmp_matches ];
+            $accessions{       $current_ppep} = [ @tmp_accessions ];
+            $names{            $current_ppep} = [ @tmp_names ];
+            $sites{            $current_ppep} = [ @tmp_sites ];
+        }
+    }
+}
+
+
+print "end select all records without qualification " . format_localtime_iso8601() . "\n";
+
+# Slow-search fallback: when search_ppep.py results are not used, locate each
+#   de-phosphorylated peptide in every protein sequence with index() and record
+#   the matching sequences, accessions, names and zero-based start positions.
+for my $j (0 .. $#p_peptides) {
+
+    my $site;
+    my $match = 0;
+    @tmp_matches = ();
+    @tmp_accessions = ();
+    @tmp_names = ();
+    @tmp_sites = ();
+
+    #Find the matching protein sequence(s) for the peptide using slow search
+    unless ($USE_SEARCH_PPEP_PY) {
+        for my $k (0 .. $#sequences) {
+            $site = index($sequences[$k], $non_p_peptides[$j]);
+            if ($site != -1) {
+                push(@tmp_matches, $sequences[$k]);
+                push(@tmp_accessions, $accessions[$k]);
+                push(@tmp_names, $names[$k]);
+                push(@tmp_sites, $site);
+                # print "Non-phosphpeptide $non_p_peptides[$j] matched accession $accessions[$k] ($names[$k]) at site $site\n";
+                # count only the sequences that actually contain the peptide, so
+                #   that the "no match" test below can succeed
+                $match++;
+            }
+        }
+        if ($match == 0) {    # Check to see if no match was found.  Skip to next if no match found.
+            print "Warning:  Failed match for $p_peptides[$j]\n";
+            $matched_sequences{$p_peptides[$j]} = \@failed_match;
+            push(@failed_matches,$p_peptides[$j]);
+            next;
+        } else {
+            $matched_sequences{$p_peptides[$j]} = [ @tmp_matches ];
+            $accessions{$p_peptides[$j]} = [ @tmp_accessions ];
+            $names{$p_peptides[$j]} = [ @tmp_names ];
+            $sites{$p_peptides[$j]} = [ @tmp_sites ];
+        }
+    }
+
+} # end for my $j (0 .. $#p_peptides)
+
+print "... Finished match the non_p_peptides at " . format_localtime_iso8601() ."\n\n";
+
+print "--- Match the p_peptides to the \@sequences array:\n";
+
+# Set of peptides that failed to match, for exact-membership tests.
+# (grep with a bare scalar as its first argument tests the truth of that
+#   scalar for every list element, so it is true for any peptide as soon as
+#   the list is non-empty.)
+my %failed_match_lut = map { $_ => 1 } @failed_matches;
+
+# For every matched phosphopeptide and every protein it matched, locate the
+#   phospho-residues in the protein sequence and format the surrounding motif
+#   (up to 10 residues on either side).
+for my $peptide_to_match ( keys %matched_sequences ) {
+    if (exists $failed_match_lut{$peptide_to_match}) {
+        print "Failed to match peptide $peptide_to_match\n";
+        next;
+    }
+    my @matches = @{$matched_sequences{$peptide_to_match}};
+    @tmp_motifs_array = ();
+    for my $i (0 .. $#matches) {
+        #ACE print "Matching $peptide_to_match to match $i\n";
+        #ACE print "\$sites{\$peptide_to_match}[\$i] $sites{$peptide_to_match}[$i]\n";
+
+        # Find the location of the phospho-site in the sequence(s)
+        $tmp_site = 0; my $offset = 0;
+        my $tmp_p_peptide = $peptide_to_match;
+        #ACE print "peptide_to_match: $peptide_to_match at position $sites{$peptide_to_match}[$i] in sequence $matched_sequences{$peptide_to_match}[$i]\n";
+        $tmp_p_peptide =~ s/#//g; $tmp_p_peptide =~ s/\d//g; $tmp_p_peptide =~ s/\_//g; $tmp_p_peptide =~ s/\.//g;
+        #ACE print "tmp_p_peptide: $tmp_p_peptide\n";
+
+        # Find all phosphorylated residues in the p_peptide:
+        #   each 'p' found is removed before the next search, so the recorded
+        #   position is that of the phospho-residue in the unmodified peptide
+        @p_sites = ();
+        while ($tmp_site != -1) {
+            $tmp_site = index($tmp_p_peptide, 'p', $offset);
+            if ($tmp_site != -1) {push (@p_sites, $tmp_site);}
+            $offset = $tmp_site + 1;
+            $tmp_p_peptide =~ s/p//;
+        }
+        @tmp_p_residues = ();
+        for my $l (0 .. $#p_sites) {
+            next if not defined $sites{$peptide_to_match}[$i];
+
+            # position of the phospho-residue in the protein sequence (zero-based)
+            push (@tmp_p_residues, $p_sites[$l] + $sites{$peptide_to_match}[$i]);
+
+            # Match the sequences around the phospho residues to find the motifs
+            my ($desired_residues_L, $desired_residues_R);
+            if ($tmp_p_residues[0] - 10 < 0) {    #check to see if there are fewer than 10 residues left of the first p-site
+                # eg, XXXpYXX want $desired_residues_L = 3, $p_residues[0] = 3
+                $desired_residues_L = $tmp_p_residues[0];
+            }
+            else {
+                $desired_residues_L = 10;
+            }
+            my $seq_length = length($matched_sequences{$peptide_to_match}[$i]);
+            if ($tmp_p_residues[$#tmp_p_residues] + 10 > $seq_length) {    #check to see if there are fewer than 10 residues right of the last p-site
+                $desired_residues_R = $seq_length - ($tmp_p_residues[$#tmp_p_residues] + 1);
+                # eg, XXXpYXX want $desired_residues_R = 2, $seq_length = 6, $p_residues[$#p_residues] = 3
+                # print "Line 170:  seq_length = $seq_length\tp_residue = $p_residues[$#p_residues]\n";
+            }
+            else {
+                $desired_residues_R = 10;
+            }
+
+            my $total_length = $desired_residues_L + $tmp_p_residues[$#tmp_p_residues] - $tmp_p_residues[0] + $desired_residues_R + 1;
+            my $arg2 = $tmp_p_residues[0] - $desired_residues_L;
+            my $arg1 = $matched_sequences{$peptide_to_match}[$i];
+
+            if (length($arg1) > $arg2 + $total_length - 1) {
+                $tmp_motif = substr($arg1, $arg2, $total_length);
+                #ACE print "tmp_motif = $tmp_motif\ti = $i\tpeptide_to_match = $peptide_to_match\tmatched_sequences{peptide_to_match}[i] = $matched_sequences{$peptide_to_match}[$i]\targ2 = $arg2\targ3 = $total_length\n";
+
+                # Put the "p" back in front of the appropriate phospho-residue(s).
+                my (@tmp_residues, $tmp_position);
+                for my $m (0 .. $#p_sites) {
+                    # print "Line 183: $p_sites[$m]\n";
+                    if ($m == 0) {
+                        $tmp_position = $desired_residues_L;
+                    } else {
+                        $tmp_position = $desired_residues_L + $p_sites[$m] - $p_sites[0];
+                    }
+                    #ACE print "Line 431: p_sites = $p_sites[$m]\ttmp_position = $tmp_position\ttmp_motif = $tmp_motif\n";
+                    if ($tmp_position < length($tmp_motif) + 1) {
+                        push (@tmp_residues, substr($tmp_motif, $tmp_position, 1));
+                        # mark the residue in lowercase first; it is expanded to pS/pT/pY below
+                        if ($tmp_residues[$m] eq "S") {substr($tmp_motif, $tmp_position, 1, "s");}
+                        if ($tmp_residues[$m] eq "T") {substr($tmp_motif, $tmp_position, 1, "t");}
+                        if ($tmp_residues[$m] eq "Y") {substr($tmp_motif, $tmp_position, 1, "y");}
+                    }
+                }
+
+                $tmp_motif =~ s/s/pS/g; $tmp_motif =~ s/t/pT/g; $tmp_motif =~ s/y/pY/g;
+
+                # Comment out on 8.10.13 to remove the numbers from motifs
+                # (the residue numbers written around the motif are one-based)
+                my $left_residue = $tmp_p_residues[0] - $desired_residues_L+1;
+                my $right_residue = $tmp_p_residues[$#tmp_p_residues] + $desired_residues_R+1;
+                $tmp_motif = $left_residue."-[ ".$tmp_motif." ]-".$right_residue;
+                push(@tmp_motifs_array, $tmp_motif);
+                $residues{$peptide_to_match}{$i} = [ @tmp_residues ];
+                $p_residues{$peptide_to_match}{$i} = [ @tmp_p_residues ];
+            }
+        }
+        $p_motifs{$peptide_to_match} = [ @tmp_motifs_array ];
+    }  # end for my $i (0 .. $#matches)       ### this bracket could be in the wrong place
+}
+
+print "... Finished match the p_peptides to the \@sequences array at " . format_localtime_iso8601() ."\n\n";
+
+###############################################################################################################################
+#
+#  Annotate the peptides with the NetworKIN predictions and HPRD / Phosida kinase motifs
+#
+###############################################################################################################################
+
+
+print "--- Reading various site data:\n";
+
+###############################################################################################################################
+#
+#    Read the NetworKIN_predictions file:
+#        1) make a "kinases_observed" array
+#        2) annotate the phospho-substrates with the appropriate kinase
+#
+###############################################################################################################################
+# Read the tab-separated NetworKIN predictions: column 2 is the kinase and
+#   column 10 the phospho-sequence (eg, REEILsEMKKV).  Every distinct kinase
+#   is recorded once in $kinases / @kinases_observed, and every distinct
+#   "<sequence>_<kinase>" pair once in $p_sequence_kinase.
+my $SITE_KINASE_SUBSTRATE = 1;
+$site_description{$SITE_KINASE_SUBSTRATE} = "NetworKIN";
+
+# Three-argument open: the path is always treated as a plain file name.
+open (IN1, "<", $networkin_in) or die "I couldn't find $networkin_in\n";
+print "Reading the NetworKIN data:  $networkin_in\n";
+while (<IN1>) {
+    chomp;
+    my (@x) = split(/\t/);
+    # scrub carriage returns, line feeds and double quotes from every field
+    for my $field (@x) {
+        $field =~ s/[\r\n\"]//g;
+    }
+    # skip blank or truncated lines, which carry no kinase column
+    next if (!defined $x[2]);
+    next if ($x[0] eq "#substrate");
+    unless (exists ($kinases -> {$x[2]})) {
+        $kinases -> {$x[2]} = $x[2];
+        push (@kinases_observed, $x[2]);
+    }
+    my $tmp = (defined $x[10] ? $x[10] : "")."_".$x[2];    #eg, REEILsEMKKV_PKCalpha
+    unless (exists($p_sequence_kinase -> {$tmp})) {
+        $p_sequence_kinase -> {$tmp} = $tmp;
+    }
+}
+close IN1;
+
+###############################################################################################################################
+#
+#    Read the Kinase motifs file:
+#        1) make a "motif_sequence" array
+#
+###############################################################################################################################
+
+# file format (tab separated):
+#   x[0] = primary key (character), e.g., '17' or '23a'
+#   x[1] = pattern (egrep pattern), e.g., '(M|I|L|V|F|Y).R..(pS|pT)'
+#   x[2] = description, e.g., 'PKA_Phosida' or '14-3-3 domain binding motif (HPRD)' or 'Akt kinase substrate motif (HPRD & Phosida)'
+
+# Read the tab-separated kinase-motif file (key, pattern, description).
+#   %motif_type maps each pattern to its description(s), joined with " & "
+#   when the same pattern occurs more than once; @motif_sequence lists the
+#   distinct patterns in file order; %motif_count starts at zero for each.
+my $SITE_MOTIF = 2;
+$site_description{$SITE_MOTIF} = "motif";
+
+# Three-argument open: the path is always treated as a plain file name.
+open (IN2, "<", $motifs_in) or die "I couldn't find $motifs_in\n";
+print "Reading the Motifs file:  $motifs_in\n";
+
+while (<IN2>) {
+    chomp;
+    my (@x) = split(/\t/);
+    # scrub carriage returns, line feeds and double quotes from the fields
+    #   that are present (at most the first three)
+    my $last_field = ($#x < 2) ? $#x : 2;
+    for my $i (0 .. $last_field) {
+        $x[$i] =~ s/[\r\n\"]//g;
+    }
+    # Skip blank or truncated lines: an empty pattern would otherwise be
+    #   registered as a motif.
+    next if (!defined $x[1] || $x[1] eq "");
+    my $description = defined $x[2] ? $x[2] : "";
+    if (exists ($motif_type{$x[1]})) {
+        $motif_type{$x[1]} = $motif_type{$x[1]}." & ".$description;
+    } else {
+        $motif_type{$x[1]} = $description;
+        $motif_count{$x[1]} = 0;
+        push (@motif_sequence, $x[1]);
+    }
+}
+close (IN2);
+
+
+###############################################################################################################################
+#  6.28.2011
+#    Read PSP_Kinase_Substrate data:
+#        1) make a "kinases_PhosphoSite" array
+#        2) annotate the phospho-substrates with the appropriate kinase
+#
+#  Columns:
+#     (0) GENE
+#     (1) KINASE
+#     (2) KIN_ACC_ID
+#     (3) KIN_ORGANISM
+#     (4) SUBSTRATE
+#     (5) SUB_GENE_ID
+#     (6) SUB_ACC_ID
+#     (7) SUB_GENE
+#     (8) SUB_ORGANISM
+#     (9) SUB_MOD_RSD
+#     (10) SITE_GRP_ID
+#     (11) SITE_+/-7_AA
+#     (12) DOMAIN
+#     (13) IN_VIVO_RXN
+#     (14) IN_VITRO_RXN
+#     (15) CST_CAT#
+###############################################################################################################################
+
+# Read the PhosphoSite kinase-substrate table, keeping only the records whose
+#   KIN_ORGANISM (column 3) and SUB_ORGANISM (column 8) both equal $species.
+#   Every distinct GENE (column 0) is recorded once in $kinases_PhosphoSite /
+#   @kinases_PhosphoSite, and every distinct "<SITE_+/-7_AA>_<GENE>" pair once
+#   in $p_sequence_kinase_PhosphoSite.
+my $SITE_PHOSPHOSITE = 3;
+$site_description{$SITE_PHOSPHOSITE} = "PhosphoSite";
+
+
+$line = 0;
+
+# Three-argument open: the path is always treated as a plain file name.
+open (IN3, "<", $PSP_Kinase_Substrate_in) or die "I couldn't find $PSP_Kinase_Substrate_in\n";
+print "Reading the PhosphoSite Kinase-Substrate data:  $PSP_Kinase_Substrate_in\n";
+
+while (<IN3>) {
+    chomp;
+    my (@x) = split(/\t/);
+    # scrub carriage returns, line feeds and double quotes from every field
+    for my $field (@x) {
+        $field =~ s/[\r\n\"]//g;
+    }
+    # the first line read (line 0) is not processed
+    next if ($line == 0);
+    # species filter: both organism columns must be present and match
+    #ACE FUE if (($species eq $species) && ($species eq $species)) {
+    next unless (defined $x[3] && defined $x[8]);
+    next unless (($species eq $x[3]) && ($species eq $x[8]));
+    #ACE print "KIN_ORGANISM is '$x[3]' and SUB_ORGANISM is '$x[8]', line: $line\n";
+    unless (exists ($kinases_PhosphoSite -> {$x[0]})) {
+        $kinases_PhosphoSite -> {$x[0]} = $x[0];
+        push (@kinases_PhosphoSite, $x[0]);
+    }
+    # Replace the superfluous lower case s, t and y:
+    #   every lowercase s/t/y is upper-cased except the one at offset 7,
+    #   the centre of the +/-7 window
+    $x[11] = "" if (!defined $x[11]);
+    my @lowercase = ('s','t','y');
+    my @uppercase = ('S','T','Y');
+    for my $k (0 .. 2) {
+        my $offset = 0;
+        while ((my $site = index($x[11], $lowercase[$k], $offset)) != -1) {
+            if ($site != 7) {substr($x[11], $site, 1, $uppercase[$k]);}
+            $offset = $site + 1;
+        }
+    }
+    my $tmp = $x[11]."_".$x[0];        #eg, RTPGRPLsSYGMDSR_PAK2
+    unless (exists($p_sequence_kinase_PhosphoSite -> {$tmp})) {
+        $p_sequence_kinase_PhosphoSite -> {$tmp} = $tmp;
+    }
+} continue {
+    # runs for every line, including those left early with 'next'
+    $line++;
+}
+close IN3;
+
+
+###############################################################################################################################
+#  Read PhosphoSite regulatory site data:
+#        1) make a "regulatory_sites_PhosphoSite" hash
+#
+#  Columns:
+#    (0)  GENE
+#    (1)  PROTEIN           --> #ACE %psp_regsite_protein
+#    (2)  PROT_TYPE
+#    (3)  ACC_ID
+#    (4)  GENE_ID
+#    (5)  HU_CHR_LOC
+#    (6)  ORGANISM          --> %organism
+#    (7)  MOD_RSD
+#    (8)  SITE_GRP_ID
+#    (9)  SITE_+/-7_AA      --> %regulatory_sites_PhosphoSite_hash
+#    (10) DOMAIN            --> %domain
+#    (11) ON_FUNCTION       --> %ON_FUNCTION
+#    (12) ON_PROCESS        --> %ON_PROCESS
+#    (13) ON_PROT_INTERACT  --> %ON_PROT_INTERACT
+#    (14) ON_OTHER_INTERACT --> %ON_OTHER_INTERACT
+#    (15) PMIDs
+#    (16) LT_LIT
+#    (17) MS_LIT
+#    (18) MS_CST
+#    (19) NOTES             --> %notes
+###############################################################################################################################
+
+
+# Reopen the output database for modification, with AutoCommit switched off;
+#   the previous AutoCommit setting is kept in $auto_commit.
+$dbh = DBI->connect("dbi:SQLite:$db_out", undef, undef);
+my $auto_commit = $dbh->{AutoCommit};
+$dbh->{AutoCommit} = 0;
+print "DB connection $dbh is to $db_out, opened for modification\n";
+
+# $stmth = $dbh->prepare("
+# CREATE UNIQUE INDEX idx_PSP_Regulatory_site_0
+#   ON PSP_Regulatory_site(site_plusminus_7AA);
+# ");
+# $stmth->execute();
+
+# DDL for the Citation table
+my $citation_sql;
+$citation_sql = "
+CREATE TABLE IF NOT EXISTS Citation (
+  ObjectName TEXT REFERENCES sqlite_schema(name) ON DELETE CASCADE,
+  CitationData TEXT,
+  PRIMARY KEY (ObjectName, CitationData) ON CONFLICT IGNORE
+)
+";
+
+# Create the tables below when they do not exist yet, regardless of whether
+#   SwissProt input was FASTA or SQLite; the statements run in this order:
+#     1) partial PSP_Regulatory_site table
+#     2) partial PSP_Regulatory_site LUT (ppep_regsite_LUT)
+#     3) Citation table
+for my $ddl (
+    "
+CREATE TABLE IF NOT EXISTS PSP_Regulatory_site (
+  SITE_PLUSMINUS_7AA TEXT PRIMARY KEY ON CONFLICT IGNORE,
+  DOMAIN             TEXT,
+  ON_FUNCTION        TEXT,
+  ON_PROCESS         TEXT,
+  ON_PROT_INTERACT   TEXT,
+  ON_OTHER_INTERACT  TEXT,
+  NOTES              TEXT,
+  ORGANISM           TEXT,
+  PROTEIN            TEXT
+)
+",
+    "
+CREATE TABLE IF NOT EXISTS ppep_regsite_LUT
+( ppep_id            INTEGER REFERENCES ppep(id)
+, site_plusminus_7AA TEXT    REFERENCES PSP_Regulatory_site(site_plusminus_7AA)
+, PRIMARY KEY (ppep_id, site_plusminus_7AA) ON CONFLICT IGNORE
+);
+",
+    $citation_sql,
+) {
+    $stmth = $dbh->prepare($ddl);
+    $stmth->execute();
+}
+
+
+# Open the PhosphoSite regulatory-sites file for the read loop that follows.
+# Three-argument open: the path is always treated as a plain file name, never
+#   as a mode prefix or a pipe.
+open (IN4, "<", $PSP_Regulatory_Sites_in) or die "I couldn't find $PSP_Regulatory_Sites_in\n";
+print "Reading the PhosphoSite regulatory site data:  $PSP_Regulatory_Sites_in\n";
+
+#ACE $i = system("head -n 4 $PSP_Regulatory_Sites_in");
+
+# starts at -1 because the loop increments $line before using it
+$line = -1;
+while (<IN4>) {
+    $line++;
+    chomp;
+    if ($_ =~ m/PhosphoSitePlus/) {
+        #$PhosphoSitePlusCitation = ($_ =~ s/PhosphoSitePlus/FooBar/g);
+        $PhosphoSitePlusCitation = $_;
+        $PhosphoSitePlusCitation =~ s/\t//g;
+        $PhosphoSitePlusCitation =~ s/\r//g;
+        $PhosphoSitePlusCitation =~ s/\n//g;
+        $PhosphoSitePlusCitation =~ s/""/"/g;
+        $PhosphoSitePlusCitation =~ s/^"//g;
+        $PhosphoSitePlusCitation =~ s/"$//g;
+        print "$PhosphoSitePlusCitation\n";
+        next;
+    }
+    my (@x) = split(/\t/);
+    for my $i (0 .. $#x) {
+        $x[$i] =~ s/\r//g; $x[$i]  =~ s/\n//g; $x[$i]  =~ s/\"//g;
+    }
+    my $found_GENE=0;
+    #ACE print STDERR "line $line: $_\n";
+    if ( (not exists($x[0])) ) {
+        next;
+    }
+    elsif ( ($x[0] eq "GENE") ) {
+        $found_GENE=1;
+        next;
+    }
+    if ( (not exists($x[9])) || ($x[9] eq "") ) {
+        if (exists($x[8]) && (not $x[8] eq "")) {
+            die "$PSP_Regulatory_Sites_in line $line has no SITE_+/-7_AA: $_\n";
+        } else {
+            if ( (not exists($x[1])) || (not $x[1] eq "") ) {
+                print "$PSP_Regulatory_Sites_in line $line (".length($_)." characters) has no SITE_+/-7_AA: $_\n"
+                  if $found_GENE==1;
+            }
+            next;
+        }
+    }
+    elsif ($line != 0) {
+        #ACE print "PSPReg $line: $_\n" if ($x[9] eq 'KGQKYFDsGDYNMAK');
+        #ACE FUE if ($species ne $species) {
+        if ($species ne $x[6]) {
+            # Do nothing - this record was filtered out by the species filter
+            #ACE print "PSP_regsite line rejected: $line\n";
+        }
+        elsif (!exists($regulatory_sites_PhosphoSite_hash{$x[9]})) {
+            #ACE print "testing \$domain{\$x[9]} for \$regulatory_sites_PhosphoSite_hash{$x[9]}\n" if ($x[9] eq 'KGQKYFDsGDYNMAK'); #ACE
+            if (!defined $domain{$x[9]} || $domain{$x[9]} eq "") {
+                #ACE print "adding found \$regulatory_sites_PhosphoSite_hash{$x[9]}\n" if ($x[9] eq 'KGQKYFDsGDYNMAK'); #ACE
+                $regulatory_sites_PhosphoSite_hash{$x[9]} = $x[9];
+                $domain{$x[9]} = $x[10];
+                #ACE $psp_regsite_protein{$x[9]} = $x[1];
+                $ON_FUNCTION{$x[9]} = $x[11];
+                $ON_PROCESS{$x[9]} = $x[12];
+                $ON_PROT_INTERACT{$x[9]} = $x[13];
+                $ON_OTHER_INTERACT{$x[9]} = $x[14];
+                $notes{$x[9]} = $x[19];
+                $organism{$x[9]} = $x[6];
+            }
+        }
+        else {
+            # The site in $x[9] has been seen before: merge the annotations of this
+            #   record into the stored ones.  Per the column list above, DOMAIN is
+            #   column 10, ON_FUNCTION column 11 and ON_PROCESS column 12.
+
+            # $domain
+            my $incoming_domain = defined $x[10] ? $x[10] : "";
+            if (!defined $domain{$x[9]} || $domain{$x[9]} eq "") {
+                if ($incoming_domain ne "") {
+                  # store the DOMAIN value itself (not a lookup of %domain keyed by it)
+                  $domain{$x[9]} = $incoming_domain;
+                  }
+                else {
+                  # do nothing
+                  #ACE print "WARNING line $line - no domain or 7aa for:  GENE $x[0]   PROTEIN $x[1]   PROT_TYPE $x[2]   ACC_ID $x[3]   GENE_ID $x[4]   HU_CHR_LOC $x[5]   ORGANISM $x[6]   MOD_RSD $x[7]   SITE_GRP_ID $x[8]   SITE_+/-7_AA $x[9]   DOMAIN $x[10]\n";
+                  #ACE print "$_\n";
+                  }
+            }
+            else {
+                #ACE print "Checking $domain{$x[9]} =~ /$x[10]/\n";
+                # literal substring test: the domain text is data, not a regular
+                #   expression, and an empty value must not be appended
+                if ($incoming_domain eq "" || index($domain{$x[9]}, $incoming_domain) != -1) {
+                  # do nothing
+                  }
+                else {
+                  $domain{$x[9]} = $domain{$x[9]}." / ".$incoming_domain;
+                  #print "INFO line $line - compound domain for 7aa:  GENE $x[0]   PROTEIN $x[1]   PROT_TYPE $x[2]   ACC_ID $x[3]   GENE_ID $x[4]   HU_CHR_LOC $x[5]   ORGANISM $x[6]   MOD_RSD $x[7]   SITE_GRP_ID $x[8]   SITE_+/-7_AA $x[9]   DOMAIN $domain{$x[9]}\n";
+                  }
+            }
+
+            # $ON_FUNCTION (column 11)
+            if (!defined $ON_FUNCTION{$x[9]} || $ON_FUNCTION{$x[9]} eq "") {
+                $ON_FUNCTION{$x[9]} = $x[11];
+            } elsif (!defined $x[11] || $x[11] eq "") {
+                # do nothing
+            }
+            else {
+                $ON_FUNCTION{$x[9]} = $ON_FUNCTION{$x[9]}." / ".$x[11];
+            }
+
+            # $ON_PROCESS (column 12)
+            if (!defined $ON_PROCESS{$x[9]} || $ON_PROCESS{$x[9]} eq "") {
+                $ON_PROCESS{$x[9]} = $x[12];
+            } elsif (!defined $x[12] || $x[12] eq "") {
+                # do nothing
+            }
+            else {
+                $ON_PROCESS{$x[9]} = $ON_PROCESS{$x[9]}." / ".$x[12];
+            }
+
+            # $ON_PROT_INTERACT
+            if (!defined $ON_PROT_INTERACT{$x[9]}  || $ON_PROT_INTERACT{$x[9]} eq "") {
+                $ON_PROT_INTERACT{$x[9]} = $ON_PROT_INTERACT{$x[10]};
+            } elsif ($x[10] eq "") {
+                # do nothing
+            }
+            else {
+                $ON_PROT_INTERACT{$x[9]} = $ON_PROT_INTERACT{$x[9]}." / ".$x[10];
+            }
+
+            # $ON_OTHER_INTERACT
+            if (!defined $ON_OTHER_INTERACT{$x[9]} || $ON_OTHER_INTERACT{$x[9]} eq "") {
+                $ON_OTHER_INTERACT{$x[9]} = $ON_OTHER_INTERACT{$x[10]};
+            } elsif ($x[10] eq "") {
+                # do nothing
+            }
+            else {
+                $ON_OTHER_INTERACT{$x[9]} = $ON_OTHER_INTERACT{$x[9]}." / ".$x[10];
+            }
+
+            # $notes
+            if (!defined $notes{$x[9]} || $notes{$x[9]} eq "") {
+                $notes{$x[9]} = $notes{$x[10]};
+            } elsif ($x[10] eq "") {
+                # do nothing
+            }
+            else {
+                $notes{$x[9]} = $notes{$x[9]}." / ".$x[10];
+            }
+
+            # $organism
+            if (!defined $organism{$x[9]} || $organism{$x[9]} eq "") {
+                $organism{$x[9]} = $organism{$x[10]};
+            } elsif ($x[10] eq "") {
+                # do nothing
+            }
+            else {
+                $organism{$x[9]} = $organism{$x[9]}." / ".$x[10];
+            }
+        }
+    }
+}
+close IN4;
+
+print "... Finished reading various site data at " . format_localtime_iso8601() ."\n\n";
+
+# Statement used by add_citation() to record which citation applies to which
+# database object.
+$stmth = $dbh->prepare("
+INSERT INTO Citation (
+  ObjectName,
+  CitationData
+) VALUES (?,?)
+");
+
+# add_citation($cit_table, $cit_text, $cit_label)
+#   Insert one (ObjectName, CitationData) row into the Citation table.
+#     $cit_table - name of the table or view the citation applies to
+#     $cit_text  - the citation text to store
+#     $cit_label - short tag used only in the error message
+#   A failed insert is reported on STDOUT and does not stop the script.
+#   Uses the file-level $stmth, so it must be called while $stmth still holds
+#   the Citation INSERT prepared above.
+sub add_citation {
+    my ($cit_table, $cit_text, $cit_label) = @_;
+    $stmth->bind_param(1, $cit_table);
+    $stmth->bind_param(2, $cit_text);
+    if (not $stmth->execute()) {
+        # A method call is not interpolated inside a double-quoted string, so
+        # errstr has to be concatenated to get the actual DBI error text.
+        print "Error writing $cit_label cit for table $cit_table: " . $stmth->errstr . "\n";
+    }
+}
+my ($citation_text, $citation_table);
+
+# Citation plan: each entry is a citation text followed by the
+# [ table-or-view name, error-message label ] pairs it is recorded against.
+# Entries and pairs are listed in the order in which the rows are inserted.
+my @citation_plan = (
+    # PSP regulatory or kinase/substrate site
+    [
+        'PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words "PhosphoSitePlus(R), www.phosphosite.org" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926."',
+        [ "PSP_Regulatory_site", "PSP_Kinase_Substrate" ],
+        [ "psp_gene_site",       "PSP_Kinase_Substrate" ],
+        [ "psp_gene_site_view",  "PSP_Regulatory_site"  ],
+    ],
+    [
+        'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298, https://doi.org/10.1093/nar/gkr1122',
+        [ "PSP_Regulatory_site", "PSP_Regulatory_site"  ],
+        [ "psp_gene_site",       "PSP_Kinase_Substrate" ],
+        [ "psp_gene_site_view",  "PSP_Kinase_Substrate" ],
+    ],
+    # NetworKIN site
+    [
+        'Linding, 2007, "Systematic discovery of in vivo phosphorylation networks.", https://pubmed.ncbi.nlm.nih.gov/17570479, https://doi.org/10.1016/j.cell.2007.05.052',
+        [ "psp_gene_site",      "NetworkKIN" ],
+        [ "psp_gene_site_view", "NetworkKIN" ],
+    ],
+    [
+        'Horn, 2014, "KinomeXplorer: an integrated platform for kinome biology studies.", https://pubmed.ncbi.nlm.nih.gov/24874572, https://doi.org/10.1038/nmeth.296',
+        [ "psp_gene_site",      "NetworkKIN" ],
+        [ "psp_gene_site_view", "NetworkKIN" ],
+    ],
+    [
+        'Aken, 2016, "The Ensembl gene annotation system.", https://pubmed.ncbi.nlm.nih.gov/33137190, https://doi.org/10.1093/database/baw093',
+        [ "psp_gene_site",      "NetworkKIN" ],
+        [ "psp_gene_site_view", "NetworkKIN" ],
+    ],
+    # pSTY motifs
+    [
+        'Amanchy, 2007, "A curated compendium of phosphorylation motifs.", https://pubmed.ncbi.nlm.nih.gov/17344875, https://doi.org/10.1038/nbt0307-285',
+        [ "psp_gene_site",      "Amanchy_pSTY_motifs" ],
+        [ "psp_gene_site_view", "Amanchy_pSTY_motifs" ],
+    ],
+    [
+        'Gnad, 2011, "PHOSIDA 2011: the posttranslational modification database.", https://pubmed.ncbi.nlm.nih.gov/21081558, https://doi.org/10.1093/nar/gkq1159',
+        [ "psp_gene_site",      "Phosida_pSTY_motifs" ],
+        [ "psp_gene_site_view", "Phosida_pSTY_motifs" ],
+    ],
+);
+
+# Walk the plan, writing one Citation row per (text, table) combination.
+foreach my $plan_entry (@citation_plan) {
+    my ($text, @targets) = @{$plan_entry};
+    $citation_text = $text;
+    foreach my $target (@targets) {
+        my $label;
+        ($citation_table, $label) = @{$target};
+        add_citation($citation_table, $citation_text, $label);
+    }
+}
+
+
+
+###############################################################################################################################
+#
+#    Read the data file:
+#        1) find sequences that match the NetworKIN predictions
+#        2) find motifs that match the observed sequences
+#
+###############################################################################################################################
+
+print "--- Find sequences that match the NetworKIN predictions and find motifs that match observed sequences\n";
+
+my $ppep_regsite_LUT_stmth;  # INSERT handle for (ppep_id, site_plusminus_7AA) rows
+$ppep_regsite_LUT_stmth = $dbh->prepare("
+  INSERT INTO ppep_regsite_LUT (
+    ppep_id,
+    site_plusminus_7AA
+  ) VALUES (?,?)
+");
+
+my ($start_seconds, $start_microseconds) = gettimeofday;  # start of the interval reported later as the average search time
+
+foreach my $peptide (keys %data) {
+    # Step 1: collect this $peptide's phospho-motifs, strip the residue-number decoration, and keep each distinct motif once in @{$unique_motifs{$peptide}}
+    my @all_motifs = ();
+    my $have_all_motifs = 0;  # becomes 1 once at least one matched-sequence index has been visited
+    for my $i (0 .. $#{ $matched_sequences{$peptide} } ) {  # one motif slot per matched sequence
+        my $tmp_motif = $p_motifs{$peptide}[$i];  # may be undefined; undefined entries are tolerated below
+        push(@all_motifs, $tmp_motif);
+        $have_all_motifs = 1;
+    }
+    if ($have_all_motifs == 1) {
+        for my $j (0 .. $#all_motifs) {
+            if (defined $all_motifs[$j]) {
+                $all_motifs[$j] =~ s/\d+-\[\s//;  # drop the first "<digits>-[ " prefix, e.g. "1308-[ "
+                $all_motifs[$j] =~ s/\s\]\-\d+//;  # drop the first " ]-<digits>" suffix, e.g. " ]-1329"
+            }
+        }
+    }
+    my %seen = ();  # motifs already pushed for this peptide
+    if ($have_all_motifs == 1) {
+        foreach my $a (@all_motifs) {
+            if (defined $a) {
+                if (exists($seen{$a})) {
+                    next;  # duplicate motif: skipped, and the verbose print below is skipped too
+                } else {
+                    push(@{$unique_motifs{$peptide}}, $a);  # first occurrence wins, so first-seen order is kept
+                    $seen{$a} = 1;
+                }
+            }
+            print "push(\@{\$unique_motifs{$peptide}}, $a);\n" if ($verbose);  # also reached when $a is undefined
+        }
+    }
+
+    # Step 2: count the phospho-sites; only the FIRST unique motif (index 0) is inspected
+    my $number_pY = 0;
+    my $number_pSTY = 0;
+    if ($phospho_type eq 'y') {
+        if (defined(${$unique_motifs{$peptide}}[0])) {
+            while (${$unique_motifs{$peptide}}[0] =~ /pY/g) {  # one iteration per "pY" occurrence
+                $number_pY++;
+            }
+        }
+    }
+    if ($phospho_type eq 'sty') {
+        print "looking for unique_motifs for $peptide\n" if ($verbose);
+        if (defined(${$unique_motifs{$peptide}}[0])) {
+            while (${$unique_motifs{$peptide}}[0] =~ /(pS|pT|pY)/g) {  # one iteration per pS, pT or pY occurrence
+                $number_pSTY++;
+               print "We have found $number_pSTY unique_motifs for $peptide\n" if ($verbose);
+            }
+        }
+    }
+
+
+    # search each of the unique motifs for matches
+    print "searching $#{$unique_motifs{$peptide}} motifs for peptide $peptide\n" if ($verbose);
+    for my $i (0 .. $#{$unique_motifs{$peptide}}) {
+        print "\$i = $i; peptide = $peptide; unique_motif = ${$unique_motifs{$peptide}}[$i]\n" if ($verbose);
+        my $tmp_motif = ${$unique_motifs{$peptide}}[$i];
+        print "   --- matching unique motif $tmp_motif for peptide  $peptide at " . format_localtime_iso8601() ."\n" if ($verbose);
+        my $formatted_sequence;
+        if (($number_pY == 1) || ($number_pSTY == 1)) {
+            my $seq_plus5aa = "";
+            my $seq_plus7aa = "";
+            #ACE print "tmp_motif is $tmp_motif before replacement\n";
+            $formatted_sequence = &replace_pSpTpY($tmp_motif, $phospho_type);
+            print "       a #pY $number_pY; #pSTY $number_pSTY; matching formatted motif $formatted_sequence for peptide  $peptide at " . format_localtime_iso8601() ."\n" if ($verbose);
+            #ACE print "formatted_sequence is $formatted_sequence after replacement\n";
+            if ($phospho_type eq 'y') {
+                $seq_plus5aa = (split(/(\w{0,5}y\w{0,5})/, $formatted_sequence))[1];
+                $seq_plus7aa = (split(/(\w{0,7}y\w{0,7})/, $formatted_sequence))[1];
+            }
+            elsif ($phospho_type eq "sty") {
+                $seq_plus5aa = (split(/(\w{0,5}(s|t|y)\w{0,5})/, $formatted_sequence))[1];
+                $seq_plus7aa = (split(/(\w{0,7}(s|t|y)\w{0,7})/, $formatted_sequence))[1];
+            }
+
+            if (defined $seq_plus7aa) {
+                # Record the (phosphopeptide id, +/-7 residue window) pair in the
+                # ppep_regsite_LUT table.  A failed insert is reported and the
+                # search continues.
+                $ppep_regsite_LUT_stmth->bind_param( 1, $ppep_id_lut{$peptide} );
+                $ppep_regsite_LUT_stmth->bind_param( 2, $seq_plus7aa             );
+                if (not $ppep_regsite_LUT_stmth->execute()) {
+                    # errstr is concatenated because a method call is not
+                    # interpolated inside a double-quoted string.
+                    print "Error writing tuple ($ppep_id_lut{$peptide},$seq_plus7aa) for peptide $peptide to ppep_regsite_LUT: " . $ppep_regsite_LUT_stmth->errstr . "\n";
+                }
+            }
+            #ACE print "seq_plus5aa is $seq_plus5aa \n";
+            #ACE print "seq_plus7aa is $seq_plus7aa \n";
+            for my $i (0 .. $#kinases_observed) {
+                if (defined $seq_plus5aa) {
+                    my $tmp = $seq_plus5aa."_".$kinases_observed[$i];    #eg, should be PGRPLsSYGMD_PKCalpha
+                    if (exists($p_sequence_kinase -> {$tmp})) {
+                        #ACE print($tmp."\t");
+                        #ACE print(($p_sequence_kinase -> {$tmp})."\n"); #ACE
+                        $kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$i]} = "X"; #ACE
+                    }
+                }
+            }
+            for my $i (0 .. $#motif_sequence) {
+                if ($peptide =~ /$motif_sequence[$i]/) {
+                    $kinase_motif_matches{$peptide}{$motif_sequence[$i]} = "X";
+                    #ACE print "\$kinase_motif_matches{$peptide}{$motif_sequence[$i]} = 'X'; $motif_type{$motif_sequence[$i]}\n"; #ACE
+                }
+            }
+            for my $i (0 .. $#kinases_PhosphoSite) {
+                if (defined $seq_plus7aa) {
+                    my $tmp = $seq_plus7aa."_".$kinases_PhosphoSite[$i];    #eg, should be RTPGRPLsSYGMDSR_PAK2
+                    if (exists($p_sequence_kinase_PhosphoSite -> {$tmp})) {
+                        $kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]} = "X";
+                    }
+                }
+            }
+            #ACE print "checking for existence of \$regulatory_sites_PhosphoSite_hash{$seq_plus7aa}\n"; #ACE
+            if (exists($regulatory_sites_PhosphoSite_hash{$seq_plus7aa})) {
+                #ACE print "found regulatory_sites_PhosphoSite_hash{$seq_plus7aa}\n"; #ACE
+                $seq_plus7aa_2{$peptide} = $seq_plus7aa;
+                $domain_2{$peptide} = $domain{$seq_plus7aa};
+                #ACE $psp_regsite_protein_2{$peptide} = $psp_regsite_protein{$seq_plus7aa};
+                $ON_FUNCTION_2{$peptide} = $ON_FUNCTION{$seq_plus7aa};
+                $ON_PROCESS_2{$peptide} = $ON_PROCESS{$seq_plus7aa};
+                $ON_PROT_INTERACT_2{$peptide} = $ON_PROT_INTERACT{$seq_plus7aa};
+                $ON_OTHER_INTERACT_2{$peptide} = $ON_OTHER_INTERACT{$seq_plus7aa};
+                $notes_2{$peptide} = $notes{$seq_plus7aa};
+                $organism_2{$peptide} = $organism{$seq_plus7aa};
+            } else {
+                #ACE print "c not found \$regulatory_sites_PhosphoSite_hash{{$seq_plus7aa}\n"; #ACE
+            }
+        }
+        elsif (($number_pY > 1) || ($number_pSTY > 1)) {  #eg, if $x[4] is 1308-[ VIYFQAIEEVpYpYDHLRSAAKKR ]-1329 and $number_pY == 2
+            $formatted_sequence = $tmp_motif;
+            #ACE print "formatted_sequence is $formatted_sequence \n";
+            $seq_plus5aa = "";
+            $seq_plus7aa = "";
+            #Create the sequences with only one phosphorylation site
+            #eg, 1308-[ VIYFQAIEEVpYpYDHLRSAAKKR ]-1329, which becomes  1308-[ VIYFQAIEEVpYYDHLRSAAKKR ]-1329  and  1308-[ VIYFQAIEEVYpYDHLRSAAKKR ]-1329
+
+            my (@sites, $offset, $next_p_site);  # @sites: string offsets of every "p" in $tmp_motif
+            $sites[0] = index($tmp_motif, "p");  # first "p" (this branch is entered only when more than one site was counted)
+            $offset = $sites[0] + 1;
+            $next_p_site = 0;  # any value other than -1, so the loop body runs at least once
+            while ($next_p_site != -1) {
+                $next_p_site = index($tmp_motif, "p", $offset);  # -1 when no further "p" exists
+                if ($next_p_site != -1) {
+                    push (@sites, $next_p_site);
+                }
+                $offset = $next_p_site+1;  # resume the search just past the "p" that was found
+            }
+
+            my @pSTY_sequences;  # entry $n is $tmp_motif with only the $n-th "p" retained
+            for my $n (0 .. $#sites) {
+                $pSTY_sequences[$n] = $tmp_motif;
+                for (my $m = $#sites; $m >= 0; $m--) {  # delete from right to left so the earlier offsets stay valid
+                    if ($m != $n) {substr($pSTY_sequences[$n], $sites[$m], 1) = "";}  # remove the "p" at every other site
+                }
+            }
+
+            my @formatted_sequences;
+            for my $k (0 .. $#sites) {
+                #ACE print "pSTY_sequences[k] is $pSTY_sequences[$k] before replacement\n";
+                $formatted_sequences[$k] = &replace_pSpTpY($pSTY_sequences[$k], $phospho_type);
+                #ACE print "formatted_sequences[k] is $formatted_sequences[$k] \n";
+            }
+
+            for my $k (0 .. $#formatted_sequences) {
+                print "       b #pY $number_pY; #pSTY $number_pSTY; matching formatted motif $formatted_sequences[$k] for peptide  $peptide at " . format_localtime_iso8601() ."\n" if ($verbose);
+                #ACE print "formatted_sequences[k] for phosphotype $phospho_type is $formatted_sequences[$k] \n";
+                if ($phospho_type eq 'y') {
+                    $seq_plus5aa = (split(/(\w{0,5}y\w{0,5})/, $formatted_sequences[$k]))[1];
+                    $seq_plus7aa = (split(/(\w{0,7}y\w{0,7})/, $formatted_sequences[$k]))[1];
+                }
+                elsif ($phospho_type eq "sty") {
+                    $seq_plus5aa = (split(/(\w{0,5}(s|t|y)\w{0,5})/, $formatted_sequences[$k]))[1];
+                    $seq_plus7aa = (split(/(\w{0,7}(s|t|y)\w{0,7})/, $formatted_sequences[$k]))[1];
+                }
+                #ACE print "seq_plus5aa is $seq_plus5aa \n";
+                #ACE print "seq_plus7aa is $seq_plus7aa \n";
+                for my $i (0 .. $#kinases_observed) {
+                    my $tmp = $seq_plus5aa."_".$kinases_observed[$i];    #eg, should look like REEILsEMKKV_PKCalpha
+                    #ACE print "seq_plus5aa._.kinases_observed[i] is $tmp\n"; #ACE
+                    if (exists($p_sequence_kinase -> {$tmp})) {
+                        $kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$i]} = "X";
+                        #ACE print "$tmp matched\n";
+                    }
+                }
+                $pSTY_sequence = $formatted_sequences[$k];
+                #ACE print "trying pSTY_sequence $pSTY_sequence \n";
+                for my $i (0 .. $#motif_sequence) {
+                    if ($pSTY_sequence =~ /$motif_sequence[$i]/) {
+                        #ACE print "match for pSTY_sequence $pSTY_sequence was $motif_sequence[$i]\n";
+                        $kinase_motif_matches{$peptide}{$motif_sequence[$i]} = "X";
+                    }
+                }
+                for my $i (0 .. $#kinases_PhosphoSite) {
+                    my $tmp = $seq_plus7aa."_".$kinases_PhosphoSite[$i];    #eg, should be RTPGRPLsSYGMDSR_PAK2
+                    #print "seq_plus7aa._.kinases_PhosphoSite[i] is $tmp";
+                    if (exists($p_sequence_kinase_PhosphoSite -> {$tmp})) {
+                        $kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]} = "X";
+                    #ACE print "$tmp matched \n";
+                    }
+                }
+                if (exists($regulatory_sites_PhosphoSite -> {$seq_plus7aa})) {
+                    # NOTE(review): the single-site branch tests
+                    # %regulatory_sites_PhosphoSite_hash whereas this branch tests the
+                    # hash reference $regulatory_sites_PhosphoSite - confirm that the
+                    # reference is populated with the same keys.
+                    $seq_plus7aa_2{$peptide} = $seq_plus7aa;
+
+                    # Fold the annotation of this +/-7 window into the per-peptide
+                    # values; several windows of one multiply phosphorylated peptide
+                    # are joined with " / ".  Every field follows the same rule, so
+                    # notes and organism are no longer overwritten with the latest
+                    # window's value right after being merged.
+                    for my $merge (
+                        [ \%domain_2,            \%domain            ],
+                        [ \%ON_FUNCTION_2,       \%ON_FUNCTION       ],
+                        [ \%ON_PROCESS_2,        \%ON_PROCESS        ],
+                        [ \%ON_PROT_INTERACT_2,  \%ON_PROT_INTERACT  ],
+                        [ \%ON_OTHER_INTERACT_2, \%ON_OTHER_INTERACT ],
+                        [ \%notes_2,             \%notes             ],
+                        [ \%organism_2,          \%organism          ],
+                    ) {
+                        my ($by_peptide, $by_site) = @$merge;
+                        my $incoming = $by_site->{$seq_plus7aa};
+
+                        if (!defined $by_peptide->{$peptide} || $by_peptide->{$peptide} eq "") {
+                            # nothing accumulated yet: take the window's value as-is
+                            # (it may itself be undefined or empty)
+                            $by_peptide->{$peptide} = $incoming;
+                        }
+                        elsif (defined $incoming && $incoming ne "") {
+                            $by_peptide->{$peptide} = $by_peptide->{$peptide}." / ".$incoming;
+                        }
+                        # otherwise the window has nothing to add
+                    }
+                } else {
+                    #ACE print "d not found \$regulatory_sites_PhosphoSite_hash{{$seq_plus7aa}}\n";
+                } # if (exists($regulatory_sites_PhosphoSite -> {$seq_plus7aa}))
+            } # for my $k (0 .. $#formatted_sequences)
+        } # if/else number of phosphosites
+    } # for each motif i # for my $i (0 .. $#{$unique_motifs{$peptide}})
+} # for each $peptide
+
+# Report the average time spent per phosphopeptide in the search loop above.
+my ($end_seconds, $end_microseconds) = gettimeofday;
+
+my $delta_seconds = $end_seconds - $start_seconds;
+my $delta_microseconds = $end_microseconds - $start_microseconds;
+$delta_microseconds += 1000000 * $delta_seconds;
+my $key_count = keys(%data);
+# With no phosphopeptides in %data the division would die with
+# "Illegal division by zero", so an average of 0 is reported in that case.
+print sprintf("Average search time is %d microseconds per phopshopeptide\n", ($key_count > 0 ? $delta_microseconds / $key_count : 0));
+
+($start_seconds, $start_microseconds) = gettimeofday;
+
+print "Writing PSP_Regulatory_site records\n";
+
+#ACE $stmth = $dbh->prepare("
+#ACE     INSERT INTO PSP_Regulatory_site (
+#ACE       DOMAIN,
+#ACE       ON_FUNCTION,
+#ACE       ON_PROCESS,
+#ACE       ON_PROT_INTERACT,
+#ACE       ON_OTHER_INTERACT,
+#ACE       NOTES,
+#ACE       SITE_PLUSMINUS_7AA,
+#ACE       ORGANISM,
+#ACE       PROTEIN
+#ACE     ) VALUES (?,?,?,?,?,?,?,?,?)
+#ACE     ");
+
+# Re-point the file-level $stmth at the PSP_Regulatory_site INSERT.
+$stmth = $dbh->prepare("
+    INSERT INTO PSP_Regulatory_site (
+      DOMAIN,
+      ON_FUNCTION,
+      ON_PROCESS,
+      ON_PROT_INTERACT,
+      ON_OTHER_INTERACT,
+      NOTES,
+      SITE_PLUSMINUS_7AA,
+      ORGANISM
+    ) VALUES (?,?,?,?,?,?,?,?)
+    ");
+
+# Write one PSP_Regulatory_site row for every peptide whose merged DOMAIN
+# value is defined and non-empty.
+foreach my $peptide (keys %data) {
+    if (exists($domain_2{$peptide}) and (defined $domain_2{$peptide}) and (not $domain_2{$peptide} eq "") ) {
+        #ACE print "writing domain $domain_2{$peptide} for regulatory site(s) $seq_plus7aa_2{$peptide}\n"; #ACE
+        $stmth->bind_param(1, $domain_2{$peptide});
+        $stmth->bind_param(2, $ON_FUNCTION_2{$peptide});
+        $stmth->bind_param(3, $ON_PROCESS_2{$peptide});
+        $stmth->bind_param(4, $ON_PROT_INTERACT_2{$peptide});
+        $stmth->bind_param(5, $ON_OTHER_INTERACT_2{$peptide});
+        $stmth->bind_param(6, $notes_2{$peptide});
+        $stmth->bind_param(7, $seq_plus7aa_2{$peptide});
+        $stmth->bind_param(8, $organism_2{$peptide});
+        #ACE $stmth->bind_param(9, $psp_regsite_protein_2{$peptide});
+        if (not $stmth->execute()) {
+            # errstr is concatenated because a method call is not interpolated
+            # inside a double-quoted string.
+            print "Error writing PSP_Regulatory_site for one regulatory site with peptide '$domain_2{$peptide}': " . $stmth->errstr . "\n";
+        } else {
+            #ACE print "added domain for $domain_2{$peptide}\n";
+        }
+    } elsif (exists($domain_2{$peptide}) and (not defined $domain_2{$peptide})) {
+        # the key was created during the search but no DOMAIN value was found
+        print "\$domain_2{$peptide} is undefined\n";  #ACE
+    }
+}
+
+$dbh->{AutoCommit} = $auto_commit;  # put back the AutoCommit value held in $auto_commit
+# auto_commit implicitly finishes stmth, apparently # $stmth->finish;
+$dbh->disconnect if ( defined $dbh );  # NOTE(review): the assignment above already dereferences $dbh, so this guard comes too late to protect it
+
+
+($end_seconds, $end_microseconds) = gettimeofday;  # end of the PSP_Regulatory_site write interval
+
+$delta_seconds = $end_seconds - $start_seconds;
+$delta_microseconds = $end_microseconds - $start_microseconds;
+$delta_microseconds += 1000000 * $delta_seconds;  # total elapsed time expressed in microseconds
+$key_count = keys(%data);  # recomputed here but not used by the report below
+print sprintf("Write time is %d microseconds\n", ($delta_microseconds));
+
+print "... Finished find sequences that match the NetworKIN predictions and find motifs that match observed sequences at " . format_localtime_iso8601() ."\n\n";
+
+###############################################################################################################################
+#
+# Print to the output file
+#
+###############################################################################################################################
+# Open both report files for writing.  The three-argument form of open keeps
+# the path from being parsed for mode characters, and $! tells the user why an
+# open failed.
+open (OUT, '>', $file_out) || die "could not open the fileout: $file_out: $!";
+open (MELT, '>', $file_melt) || die "could not open the fileout: $file_melt: $!";
+
+# print the header info
+print MELT "phospho_peptide\tgene_names\tsite_type\tkinase_map\n";
+print OUT "p-peptide\tProtein description\tGene name(s)\tFASTA name\tPhospho-sites\tUnique phospho-motifs, no residue numbers\tAccessions\tPhospho-motifs for all members of protein group with residue numbers\t";
+
+# print the PhosphoSite regulatory data
+print OUT "Domain\tON_FUNCTION\tON_PROCESS\tON_PROT_INTERACT\tON_OTHER_INTERACT\tPhosphoSite notes\t";
+
+# print the sample names
+for my $i (0 .. $#samples) { print OUT "$samples[$i]\t"; }
+
+# print the kinases and groups; the column labels are also remembered in
+# @kinases_observed_lbl and @phosphosites_observed_lbl
+for my $i (0 .. $#kinases_observed) {
+    my $temp = $kinases_observed[$i]."_NetworKIN";
+    print OUT "$temp\t";
+    push(@kinases_observed_lbl, $temp);
+}
+for my $i (0 .. $#motif_sequence) {
+    print OUT "$motif_type{$motif_sequence[$i]} ($motif_sequence[$i])\t";
+}
+for my $i (0 .. $#kinases_PhosphoSite) {
+    my $temp = $kinases_PhosphoSite[$i]."_PhosphoSite";
+    # the last PhosphoSite kinase column terminates the header line
+    if ($i < $#kinases_PhosphoSite) { print OUT "$temp\t"; }
+    if ($i == $#kinases_PhosphoSite) { print OUT "$temp\n"; }
+    push(@phosphosites_observed_lbl, $temp);
+}
+
+# begin DDL-to-SQLite
+# ---
+# Reconnect to the SQLite database and prepare the INSERT statements used while
+# the output rows are written.  Stop with the DBI error text if the connection
+# cannot be made, rather than failing later on an undefined handle.
+$dbh = DBI->connect("dbi:SQLite:$db_out", undef, undef)
+    or die "could not connect to SQLite database $db_out: $DBI::errstr\n";
+# remember the AutoCommit setting, then turn it off for the inserts that follow
+$auto_commit = $dbh->{AutoCommit};
+$dbh->{AutoCommit} = 0;
+print "DB connection $dbh is to $db_out, opened for modification\n";
+
+# sample: (id, name)
+my $sample_stmth;
+$sample_stmth = $dbh->prepare("
+  INSERT INTO sample (
+    id,
+    name
+  ) VALUES (?,?)
+");
+
+# ppep_intensity: (ppep_id, sample_id, intensity)
+my $ppep_intensity_stmth;
+$ppep_intensity_stmth = $dbh->prepare("
+  INSERT INTO ppep_intensity (
+    ppep_id,
+    sample_id,
+    intensity
+  ) VALUES (?,?,?)
+");
+
+# site_type: (id, type_name)
+my $site_type_stmth;
+$site_type_stmth = $dbh->prepare("
+  insert into site_type (
+    id,
+    type_name
+  ) values (?,?)
+");
+
+# ppep_gene_site: (ppep_id, gene_names, kinase_map, site_type_id)
+my $ppep_gene_site_stmth;
+$ppep_gene_site_stmth = $dbh->prepare("
+  insert into ppep_gene_site (
+    ppep_id,
+    gene_names,
+    kinase_map,
+    site_type_id
+  ) values (?,?,?,?)
+");
+
+# ppep_metadata: fourteen columns describing one phosphopeptide
+my $ppep_metadata_stmth;
+$ppep_metadata_stmth = $dbh->prepare("
+  INSERT INTO ppep_metadata
+    ( ppep_id
+    , protein_description
+    , gene_name
+    , FASTA_name
+    , phospho_sites
+    , motifs_unique
+    , accessions
+    , motifs_all_members
+    , domain
+    , ON_FUNCTION
+    , ON_PROCESS
+    , ON_PROT_INTERACT
+    , ON_OTHER_INTERACT
+    , notes
+  ) VALUES (
+    ?,?,?,?,?,?,?
+  , ?,?,?,?,?,?,?
+  )
+");
+# end DDL-to-SQLite
+# ...
+
+# begin store-to-SQLite "sample" table
+# ---
+# %sample_id_lut maps name -> ID
+# Write one "sample" row per entry of %sample_id_lut (sample name -> id).
+# A failed insert is reported and the remaining samples are still written.
+for my $sample_name (keys %sample_id_lut) {
+    $sample_stmth->bind_param( 2, $sample_name                 );
+    $sample_stmth->bind_param( 1, $sample_id_lut{$sample_name} );
+    if (not $sample_stmth->execute()) {
+        # errstr is concatenated because a method call is not interpolated
+        # inside a double-quoted string.
+        print "Error writing tuple ($sample_name,$sample_id_lut{$sample_name}): " . $sample_stmth->errstr . "\n";
+    }
+}
+# end store-to-SQLite "sample" table
+# ...
+
+# begin store-to-SQLite "site_type" table
+# ---
+# add_site_type($site_type_id, $site_type_type_name)
+#   Insert one (id, type_name) row into the site_type table through
+#   $site_type_stmth.  Dies with the DBI error text when the insert fails.
+sub add_site_type {
+    my ($site_type_id, $site_type_type_name) = @_;
+    $site_type_stmth->bind_param( 2, $site_type_type_name );
+    $site_type_stmth->bind_param( 1, $site_type_id        );
+    if (not $site_type_stmth->execute()) {
+        # errstr is concatenated because a method call is not interpolated
+        # inside a double-quoted string.
+        die "Error writing tuple ($site_type_id,$site_type_type_name): " . $site_type_stmth->errstr . "\n";
+    }
+}
+# register the three site types together with their descriptions
+add_site_type($SITE_KINASE_SUBSTRATE, $site_description{$SITE_KINASE_SUBSTRATE});
+add_site_type($SITE_MOTIF, $site_description{$SITE_MOTIF});
+add_site_type($SITE_PHOSPHOSITE, $site_description{$SITE_PHOSPHOSITE});
+# end store-to-SQLite "site_type" table
+# ...
+
+foreach my $peptide (sort(keys %data)) {
+    # Skip peptides listed in @failed_matches.  grep needs a test on $_ here:
+    # the former grep($peptide, @failed_matches) evaluated the (true) peptide
+    # string for every element, so one failed match caused every peptide to be
+    # skipped.
+    # NOTE(review): assumes @failed_matches holds peptide strings - confirm.
+    next if (grep { $_ eq $peptide } @failed_matches);
+    my $ppep_id = $ppep_id_lut{$peptide};
+    my @ppep_metadata = ();
+    my @ppep_intensity = ();
+    my @gene = ();
+    my $gene_names;
+    my $j;
+    # Print the peptide itself
+    #   column 1: p-peptide
+    print OUT "$peptide\t";
+    push (@ppep_metadata, $ppep_id);
+    push (@ppep_intensity, $peptide);
+
+    # skip over failed matches
+    if ($matched_sequences{$peptide} eq "Failed match") {
+        print OUT "Sequence not found in FASTA database\tNA\tNA\tNA\tNA\tNA\tNA\t";
+    } else {
+        my @description = ();
+        my %seen = ();
+        # Print just the protein description
+        for $i (0 .. $#{$names{$peptide}}) {
+            my $long_name = $names{$peptide}[$i];
+            my @naming_parts = split(/\sOS/, $long_name);
+            my @front_half = split(/\s/, $naming_parts[0]);
+            push(@description, join(" ", @front_half[1..($#front_half)]));
+        }
+        # column 2: Protein description
+        print OUT join(" /// ", @description), "\t";
+        push (@ppep_metadata, join(" /// ", @description));
+
+        # Print just the gene name
+        for $i (0 .. $#{$names{$peptide}}) {
+            my $tmp_gene = $names{$peptide}[$i];
+            $tmp_gene =~ s/^.*GN=//;
+            $tmp_gene =~ s/\s.*//;
+            if (!exists($seen{$tmp_gene})) {
+                push(@gene, $tmp_gene);
+                $seen{$tmp_gene} = $tmp_gene;
+            }
+        }
+        # column 3: Gene name(s)
+        $gene_names = join(" /// ", @gene);
+        print OUT $gene_names, "\t";
+        push (@ppep_metadata, join(" /// ", @gene));
+
+        # print the entire names
+        # column 4: FASTA name
+        print OUT join(" /// ", @{$names{$peptide}}), "\t";
+        push (@ppep_metadata, join(" /// ", @{$names{$peptide}}));
+
+        # Print the phospho-residues
+        # column 5:
+        my $tmp_for_insert = "";
+        for my $i (0 .. $#{ $matched_sequences{$peptide} } ) {
+            if ($i < $#{ $matched_sequences{$peptide} }) {
+                if (defined $p_residues{$peptide}{$i}) {
+                    @tmp_p_residues = @{$p_residues{$peptide}{$i}};
+                    for $j (0 .. $#tmp_p_residues) {
+                        if ($j < $#tmp_p_residues) {
+                            my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1;        # added 12.05.2012 for Justin's data
+                            print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing, ";
+                            $tmp_for_insert .= "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing, ";
+                        }
+                        elsif ($j == $#tmp_p_residues) {
+                            my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1;        # added 12.05.2012 for Justin's data
+                            print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing /// ";
+                            $tmp_for_insert .= "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing /// ";
+                        }
+                    }
+                }
+            }
+            elsif ($i == $#{ $matched_sequences{$peptide} }) {
+                if (defined $p_residues{$peptide}{$i}) {
+                    @tmp_p_residues = @{$p_residues{$peptide}{$i}};
+                    for my $j (0 .. $#tmp_p_residues) {
+                        if ($j < $#tmp_p_residues) {
+                            my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1;        # added 12.05.2012 for Justin's data
+                            print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing, ";
+                            $tmp_for_insert .= "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing, ";
+                        }
+                        elsif ($j == $#tmp_p_residues) {
+                            my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1;        # added 12.05.2012 for Justin's data
+                            print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing\t";
+                            $tmp_for_insert .= "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing";
+                        }
+                    }
+                } else {
+                  print OUT "\t";
+                }
+            }
+        }
+        push (@ppep_metadata, $tmp_for_insert);
+
+        # Print the UNIQUE phospho-motifs
+        # Column 6:
+        print OUT join(" /// ", @{$unique_motifs{$peptide}}), "\t";
+        push (@ppep_metadata, join(" /// ", @{$unique_motifs{$peptide}}));
+
+        # Print the accessions
+        # Column 7:
+        if (defined $accessions{$peptide}) {
+            print OUT join(" /// ", @{$accessions{$peptide}}), "\t";
+            push (@ppep_metadata, join(" /// ", @{$accessions{$peptide}}));
+        } else {
+            print OUT "\t";
+            push (@ppep_metadata, "");
+        }
+
+        # print ALL motifs with residue numbers
+        # Column 8:
+        if (defined $p_motifs{$peptide}) {
+            print OUT join(" /// ", @{$p_motifs{$peptide}}), "\t";
+            push (@ppep_metadata, join(" /// ", @{$p_motifs{$peptide}}));
+        } else {
+            print OUT "\t";
+            push (@ppep_metadata, "");
+        }
+
+    }
+
+    # Print the PhosphoSite regulatory data
+
+    if (defined $domain_2{$peptide})            { print OUT "$domain_2{$peptide}\t";            } else { print OUT "\t"; }
+    if (defined $ON_FUNCTION_2{$peptide})       { print OUT "$ON_FUNCTION_2{$peptide}\t";       } else { print OUT "\t"; }
+    if (defined $ON_PROCESS_2{$peptide})        { print OUT "$ON_PROCESS_2{$peptide}\t";        } else { print OUT "\t"; }
+    if (defined $ON_PROT_INTERACT_2{$peptide})  { print OUT "$ON_PROT_INTERACT_2{$peptide}\t";  } else { print OUT "\t"; }
+    if (defined $ON_OTHER_INTERACT_2{$peptide}) { print OUT "$ON_OTHER_INTERACT_2{$peptide}\t"; } else { print OUT "\t"; }
+    if (defined $notes_2{$peptide})             { print OUT "$notes_2{$peptide}\t";             } else { print OUT "\t"; }
+
+    if (defined $domain_2{$peptide})            { push (@ppep_metadata, $domain_2{$peptide});            } else { push(@ppep_metadata, ""); }
+    if (defined $ON_FUNCTION_2{$peptide})       { push (@ppep_metadata, $ON_FUNCTION_2{$peptide});       } else { push(@ppep_metadata, ""); }
+    if (defined $ON_PROCESS_2{$peptide})        { push (@ppep_metadata, $ON_PROCESS_2{$peptide});        } else { push(@ppep_metadata, ""); }
+    if (defined $ON_PROT_INTERACT_2{$peptide})  { push (@ppep_metadata, $ON_PROT_INTERACT_2{$peptide});  } else { push(@ppep_metadata, ""); }
+    if (defined $ON_OTHER_INTERACT_2{$peptide}) { push (@ppep_metadata, $ON_OTHER_INTERACT_2{$peptide}); } else { push(@ppep_metadata, ""); }
+    if (defined $notes_2{$peptide})             { push (@ppep_metadata, $notes_2{$peptide});             } else { push(@ppep_metadata, ""); }
+
+    # begin store-to-SQLite "ppep_metadata" table
+    # ---
+    for $i (1..14) {
+        #ACE print "\$ppep_metadata_stmth->bind_param($i, " . $ppep_metadata[$i-1] . ")\n";
+        $ppep_metadata_stmth->bind_param($i, $ppep_metadata[$i-1]);
+    }
+    if (not $ppep_metadata_stmth->execute()) {
+        print "Error writing ppep_metadata row for phosphopeptide $ppep_metadata[$i]: $ppep_metadata_stmth->errstr\n";
+    }
+    # ...
+    # end store-to-SQLite "ppep_metadata" table
+
+    # Print the data
+    @tmp_data = ();
+    foreach (@{$data{$peptide}}) {
+        push(@tmp_data, $_);
+    }
+    print OUT join("\t", @tmp_data), "\t";
+
+    # begin store-to-SQLite "ppep_intensity" table
+    # ---
+    # commit the sample intensities
+    $i = 0;
+    foreach (@{$data{$peptide}}) {
+        my $intense = $_;
+        $ppep_intensity_stmth->bind_param( 1, $ppep_id                     );
+        $ppep_intensity_stmth->bind_param( 2, $sample_id_lut{$samples[$i]} );
+        $ppep_intensity_stmth->bind_param( 3, $intense                     );
+        #ACE print "insert ($peptide, $samples[$i], $intense)\n";
+        if (not $ppep_intensity_stmth->execute()) {
+            print "Error writing tuple ($peptide,$samples[$i],$intense): $ppep_intensity_stmth->errstr\n";
+        }
+        $i += 1;
+    }
+    # ...
+    # end store-to-SQLite "ppep_intensity" table
+
+    # print the kinase-substrate data
+    for my $i (0 .. $#kinases_observed) {
+        if (exists($kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$i]})) {
+            print OUT "X\t";
+            my $NetworKIN_label = $kinases_observed[$i]."_NetworKIN";
+            print MELT "$peptide\t$gene_names\t$site_description{$SITE_KINASE_SUBSTRATE}\t$NetworKIN_label\n";
+            # begin store-to-SQLite "ppep_gene_site" table
+            # ---
+            $ppep_gene_site_stmth->bind_param(1, $ppep_id);               # ppep_gene_site.ppep_id
+            $ppep_gene_site_stmth->bind_param(2, $gene_names);            # ppep_gene_site.gene_names
+            $ppep_gene_site_stmth->bind_param(3, $NetworKIN_label);       # ppep_gene_site.kinase_map
+            $ppep_gene_site_stmth->bind_param(4, $SITE_KINASE_SUBSTRATE); # ppep_gene_site.site_type_id
+            if (not $ppep_gene_site_stmth->execute()) {
+                print "Error writing tuple ($peptide,$gene_names,$kinases_observed[$i]): $ppep_gene_site_stmth->errstr\n";
+            }
+            # ...
+            # end store-to-SQLite "ppep_gene_site" table
+        }
+        else { print OUT "\t";}
+    }
+    #ACE my %wrote_motif = {};
+    my %wrote_motif;
+    my $motif_parts_0;
+    for my $i (0 .. $#motif_sequence) {
+        if (exists($kinase_motif_matches{$peptide}{$motif_sequence[$i]})) {
+            print OUT "X\t";
+            #ACE my @motif_parts = split(/ motif /, $motif_type{$motif_sequence[$i]});
+            $motif_parts_0 = $motif_type{$motif_sequence[$i]}." ".$motif_sequence[$i];
+            my $key = "$peptide\t$gene_names\t$motif_parts_0";
+            if (!exists($wrote_motif{$key})) {
+                $wrote_motif{$key} = $key;
+                print MELT "$peptide\t$gene_names\t$site_description{$SITE_MOTIF}\t$motif_parts_0\n";
+                # print "Line 657: i is $i\t$kinase_motif_matches{$peptide}{$motif_sequence[$i]}\n";            #debug
+                # begin store-to-SQLite "ppep_gene_site" table
+                # ---
+                $ppep_gene_site_stmth->bind_param(1, $ppep_id);        # ppep_gene_site.ppep_id
+                $ppep_gene_site_stmth->bind_param(2, $gene_names);     # ppep_gene_site.gene_names
+                $ppep_gene_site_stmth->bind_param(3, $motif_parts_0); # ppep_gene_site.kinase_map
+                $ppep_gene_site_stmth->bind_param(4, $SITE_MOTIF);     # ppep_gene_site.site_type_id
+                if (not $ppep_gene_site_stmth->execute()) {
+                    print "Error writing tuple ($peptide,$gene_names,$motif_parts_0): $ppep_gene_site_stmth->errstr\n";
+                }
+                # ...
+                # end store-to-SQLite "ppep_gene_site" table
+            }
+        }
+        else { print OUT "\t";}
+    }
+    for my $i (0 .. $#kinases_PhosphoSite) {
+        if (exists($kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]})) {
+            print MELT "$peptide\t$gene_names\t$site_description{$SITE_PHOSPHOSITE}\t$phosphosites_observed_lbl[$i]\n";
+            if ($i < $#kinases_PhosphoSite) {
+                print OUT "X\t";
+            }
+            else {
+                print OUT "X\n";
+            }
+            # begin store-to-SQLite "ppep_gene_site" table
+            # ---
+            $ppep_gene_site_stmth->bind_param(1, $ppep_id);                       # ppep_gene_site.ppep_id
+            $ppep_gene_site_stmth->bind_param(2, $gene_names);                    # ppep_gene_site.gene_names
+            $ppep_gene_site_stmth->bind_param(3, $phosphosites_observed_lbl[$i]); # ppep_gene_site.kinase_map
+            $ppep_gene_site_stmth->bind_param(4, $SITE_PHOSPHOSITE);              # ppep_gene_site.site_type_id
+            if (not $ppep_gene_site_stmth->execute()) {
+                print "Error writing tuple ($peptide,$gene_names,$phosphosites_observed_lbl[$i]): $ppep_gene_site_stmth->errstr\n";
+            }
+            # ...
+            # end store-to-SQLite "ppep_gene_site" table
+        }
+        else {
+            if ($i < $#kinases_PhosphoSite) {
+                print OUT "\t";
+            }
+            elsif ($i == $#kinases_PhosphoSite) {
+                print OUT "\n";
+            }
+        }
+    }
+}
+
+# Buffered write errors (for example a full disk) are reported only when the
+# handle is closed, so check the result instead of discarding it.
+close(OUT)  or warn "Error closing OUT: $!\n";
+close(MELT) or warn "Error closing MELT: $!\n";
+$ppep_gene_site_stmth->finish;
+print "begin DB commit at " . format_localtime_iso8601() . "\n";
+# Test the handle before it is dereferenced, not afterwards.
+# NOTE(review): restoring AutoCommit presumably commits the pending inserts
+# (see the message printed above) - confirm against the DBI driver in use.
+if ( defined $dbh ) {
+    $dbh->{AutoCommit} = $auto_commit;
+    $dbh->disconnect;
+}
+
+print "\nFinished writing output at " . format_localtime_iso8601() ."\n\n";
+
+###############################################################################################################################
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Mon Mar 07 19:05:01 2022 +0000
@@ -0,0 +1,4 @@
+<macros>
+    <!-- version tokens; mqppep_anova.xml combines them into its tool "version" attribute -->
+    <token name="@TOOL_VERSION@">0.1.0</token>
+    <token name="@VERSION_SUFFIX@">0</token>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mqppep_anova.R	Mon Mar 07 19:05:01 2022 +0000
@@ -0,0 +1,191 @@
+#!/usr/bin/env Rscript
+# libraries
+library(optparse)
+library(data.table)
+library(stringr)
+#library(ggplot2)
+#library(PTXQC)
+#require(PTXQC)
+#require(methods)
+# bioconductor-preprocesscore
+#  - libopenblas
+#  - r-data.table
+#  - r-rmarkdown
+#  - r-ggplot2
+#  - texlive-core
+
+# ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285
+
+# parse options
+# Every option simply stores its value, so a local helper supplies the shared
+# `action = "store"` argument; only the flags, default, type and help text
+# differ from one option to the next.
+option_list <- local({
+  stored_option <- function(flags, default, type, help) {
+    make_option(
+      flags,
+      action = "store",
+      default = default,
+      type = type,
+      help = help
+    )
+  }
+  list(
+    # <param name="inputFilename" type="data" format="tabular" label="Phosphopeptide Intensities" help="First column label 'Phosphopeptide'; sample-intensities must begin in column 10 and must have column labels to match argument regexSampleNames"/>
+    stored_option(
+      c("-i", "--inputFile"), NA, "character",
+      "Phosphopeptide Intensities sparse input file path"
+    ),
+    stored_option(
+      c("-a", "--alphaFile"), NA, "character",
+      "List of alpha cutoff values for significance testing; path to text file having one column and no header"
+    ),
+    stored_option(
+      c("-f", "--firstDataColumn"), "10", "character",
+      "First column of intensity values"
+    ),
+    # imputationMethod <- c("group-median","median","mean","random")[1]
+    stored_option(
+      c("-m", "--imputationMethod"), "group-median", "character",
+      "Method for missing-value imputation, one of c('group-median','median','mean','random')"
+    ),
+    stored_option(
+      c("-p", "--meanPercentile"), 3, "integer",
+      "Mean percentile for randomly generated imputed values; range [1,99]"
+    ),
+    stored_option(
+      c("-d", "--sdPercentile"), 3, "double",
+      "Adjustment value for standard deviation of randomly generated imputed values; real"
+    ),
+    stored_option(
+      c("-s", "--regexSampleNames"), "\\.(\\d+)[A-Z]$", "character",
+      "Regular expression extracting sample-names"
+    ),
+    stored_option(
+      c("-g", "--regexSampleGrouping"), "(\\d+)", "character",
+      "Regular expression extracting sample-group from an extracted sample-name"
+    ),
+    # <data name="imputed_data_file" format="tabular" label="${input_file.name}.intensities_${imputation.imputation_method}-imputed_QN_LT" ></data>
+    stored_option(
+      c("-o", "--imputedDataFile"), "output_imputed.tsv", "character",
+      "Imputed Phosphopeptide Intensities output file path"
+    ),
+    # <data name="report_file" format="html" label="report (download/unzip to view)" ></data>
+    stored_option(
+      c("-r", "--reportFile"), "QuantDataProcessingScript.html", "character",
+      "HTML report file path"
+    )
+  )
+})
+args <- parse_args(OptionParser(option_list = option_list))
+# Check parameter values
+
+# Return `path` unchanged when it names an existing file; otherwise stop with
+# a message naming the missing input.  is.na() is tested first because the
+# option default is NA (i.e. the option was not supplied).
+require_file <- function(path, description) {
+  if (is.na(path) || !file.exists(path)) {
+    stop(paste(description, path, "does not exist"))
+  }
+  path
+}
+
+# Resolve a regular-expression argument and strip surrounding blanks, tabs and
+# newlines.  The Galaxy wrapper passes the path of a config file that holds the
+# pattern, whereas the option defaults are literal patterns; a value that does
+# not name an existing file is therefore taken to be the pattern itself.
+read_regex_arg <- function(value) {
+  pattern <- if (file.exists(value)) readChar(value, 1000) else value
+  pattern <- gsub('^[ \t\n]*', '', pattern)
+  gsub('[ \t\n]*$', '', pattern)
+}
+
+inputFile <- require_file(args$inputFile, "Input file")
+# the alpha file is read unconditionally by the R Markdown report
+alphaFile <- require_file(args$alphaFile, "Alpha cutoff file")
+firstDataColumn <- args$firstDataColumn
+imputationMethod <- args$imputationMethod
+meanPercentile <- args$meanPercentile
+sdPercentile <- args$sdPercentile
+
+regexSampleNames <- read_regex_arg(args$regexSampleNames)
+cat(regexSampleNames)
+cat('\n')
+
+regexSampleGrouping <- read_regex_arg(args$regexSampleGrouping)
+cat(regexSampleGrouping)
+cat('\n')
+
+imputedDataFilename <- args$imputedDataFile
+reportFileName <- args$reportFile
+
+# Echo the effective settings to the job log.  str() prints its argument
+# itself, so it does not need to be wrapped in cat().
+print("args is:")
+str(args)
+
+print("regexSampleNames is:")
+str(regexSampleNames)
+
+print("regexSampleGrouping is:")
+str(regexSampleGrouping)
+
+# from: https://github.com/molgenis/molgenis-pipelines/wiki/How-to-source-another_file.R-from-within-your-R-script
+# Return the directory that holds this .R script, or NULL when it cannot be
+# determined (may be needed to source other files in the same directory).
+LocationOfThisScript <- function() {
+  # Case 1: the file is being source()d.  Walk the calling frames and remember
+  # the file of the last frame found whose function is base::source.
+  sourced_path <- NULL
+  for (frame_offset in -(1:sys.nframe())) {
+    if (identical(sys.function(frame_offset), base::source)) {
+      sourced_path <- normalizePath(sys.frame(frame_offset)$ofile)
+    }
+  }
+  if (!is.null(sourced_path)) {
+    return(dirname(sourced_path))
+  }
+
+  # Case 2: the file was named on the command line.  Only the arguments that
+  # precede the trailing (user) arguments are examined.
+  all_args <- commandArgs(trailingOnly = FALSE)
+  n_leading <- length(all_args) - length(commandArgs(trailingOnly = TRUE))
+  leading_args <- all_args[seq.int(from = 1, length.out = n_leading)]
+  file_values <- gsub("^(?:--file=(.*)|.*)$", "\\1", leading_args)
+  file_values <- file_values[file_values != ""]
+
+  # If multiple --file arguments are given, R uses the last one
+  if (length(file_values) > 0) {
+    return(dirname(tail(file_values, 1)))
+  }
+
+  # Neither case applies. Maybe we are in an R GUI?
+  NULL
+}
+
+# Directory holding this script; the R Markdown template is looked up there.
+# LocationOfThisScript() returns NULL when it cannot determine the location,
+# so fall back to the current working directory rather than building the
+# template path from NULL.
+script.dir <- LocationOfThisScript()
+if (is.null(script.dir)) {
+  script.dir <- getwd()
+}
+
+# values handed to the `params` of the R Markdown document
+rmarkdown_params <- list(
+  inputFile = inputFile,
+  alphaFile = alphaFile,
+  firstDataColumn = firstDataColumn,
+  imputationMethod = imputationMethod,
+  meanPercentile = meanPercentile,
+  sdPercentile = sdPercentile,
+  regexSampleNames = regexSampleNames,
+  regexSampleGrouping = regexSampleGrouping,
+  imputedDataFilename = imputedDataFilename
+)
+
+str(rmarkdown_params)
+
+# BUG
+# Must render as HTML for the time being until this issue is resolved:
+#   https://github.com/conda-forge/texlive-core-feedstock/issues/19
+# for reason:
+#   "The following dependencies are not available in conda"
+# reported here:
+#   https://github.com/ami-iit/bipedal-locomotion-framework/pull/457/commits/e98ccef8c8cb63e207df36628192af6ce22feb13
+
+# freeze the random number generator so the same results will be produced from run to run
+set.seed(28571)
+
+rmarkdown::render(
+  input = file.path(script.dir, "mqppep_anova_script.Rmd"),
+  output_format = rmarkdown::html_document(pandoc_args = "--self-contained"),
+  output_file = reportFileName,
+  params = rmarkdown_params
+)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mqppep_anova.xml	Mon Mar 07 19:05:01 2022 +0000
@@ -0,0 +1,219 @@
+<tool id="mqppep_anova" name="MaxQuant Phosphopeptide ANOVA" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" python_template_version="3.5">
+    <description>Perform ANOVA on merged and filtered data from phospho-peptide enrichment/MaxQuant pipeline</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <requirements>
+        <requirement type="package" version="1.7.1">r-optparse</requirement>
+        <requirement type="package" version="1.4.0">r-stringr</requirement>
+        <requirement type="package" version="1.14.2">r-data.table</requirement>
+        <requirement type="package" version="3.3.5">r-ggplot2</requirement>
+        <requirement type="package" version="1.56.0">bioconductor-preprocesscore</requirement>
+        <requirement type="package" version="0.3.3" >openblas</requirement>
+        <requirement type="package" version="2.11"  >r-rmarkdown</requirement>
+        <requirement type="package" version="0.4.0" >r-sass</requirement>
+        <requirement type="package"                 >texlive-core</requirement>
+
+    </requirements>
+    <!-- Rscript -e 'rmarkdown::render("QuantDataProcessingScript.Rmd")' -->
+    <command detect_errors="exit_code"><![CDATA[
+cat '$sample_names_regex_f'; cat '$sample_grouping_regex_f';
+Rscript '$__tool_directory__/mqppep_anova.R'
+--inputFile '$input_file'
+--alphaFile '$alpha_file'
+--firstDataColumn '$first_data_column'
+--imputationMethod '$imputation.imputation_method'
+## meanPercentile and sdPercentile exist only in the "random" branch of the
+## "imputation" conditional, so they are addressed through $imputation
+#if str($imputation.imputation_method) == 'random':
+  --meanPercentile '$imputation.meanPercentile'
+  --sdPercentile   '$imputation.sdPercentile'
+#end if
+--regexSampleNames '$sample_names_regex_f'
+--regexSampleGrouping '$sample_grouping_regex_f'
+--imputedDataFile '$imputed_data_file'
+--reportFile '$report_file'
+    ]]></command>
+    <configfiles>
+      <configfile name="sample_names_regex_f">
+        $sample_names_regex
+      </configfile>
+      <configfile name="sample_grouping_regex_f">
+        $sample_grouping_regex
+      </configfile>
+    </configfiles>
+    <inputs>
+        <param name="input_file" type="data" format="tabular" label="Filtered Phosphopeptide Intensities"
+               help="[input_file] Phosphopeptide intensities filtered for minimal quality.  First column label 'Phosphopeptide'; sample-intensities must begin in column 10 and must have column labels to match argument [sample_names_regex]"
+        />
+        <param name="alpha_file" type="data" format="tabular" label="alpha cutoff level"
+               help="[alpha_file] List of alpha cutoff values for significance testing; text file having one column and no header"
+        />
+        <param name="first_data_column" type="text" value="Intensity"
+               label="First data column"
+               help="[first_data_column] First column having intensity values (integer or PERL-compatible regular expression matching column label)"
+        />
+        <!-- imputation_method <- c("group-median","median","mean","random")[1] -->
+        <conditional name="imputation">
+            <param name="imputation_method" type="select" label="Imputation Method"
+                   help="[imputation_method] Impute missing values by (1) using median for each sample-group; (2) using median across all samples; (3) using mean across all samples; or (4) using randomly generated values having same std. dev. as across all samples (with mean specified by [meanPercentile])"
+            >
+                <option value="random" selected="true">random</option>
+                <option value="group-median">group-median</option>
+                <option value="median">median</option>
+                <option value="mean">mean</option>
+            </param>
+            <when value="group-median" />
+            <when value="median" />
+            <when value="mean" />
+            <when value="random">
+                <param name="meanPercentile" type="integer" value="1" min="1" max="99"
+                       label="Mean percentile for random values"
+                       help="[meanPercentile] Percentile center of random values; range [1,99]"
+                />
+                <param name="sdPercentile" type="float" value="0.2"
+                       label="Percentile std. dev. for random values"
+                       help="[sdPercentile] Standard deviation adjustment-factor for random values; real number.  (1.0 means SD equal to the SD for the entire data set.)"
+                />
+            </when>
+        </conditional>
+        <param name="sample_names_regex" type="text" value="\.(\d+)[A-Z]$"
+               help="[sample_names_regex] PERL-compatible regular expression extracting sample-names from the name of a spectrum file (without extension)"
+               label="Sample-extraction regex">
+          <sanitizer>
+            <valid initial="string.printable">
+              <remove value="&apos;"/>
+            </valid>
+          </sanitizer>
+        </param>
+        <param name="sample_grouping_regex" type="text" value="(\d+)"
+               help="[sample_grouping_regex] PERL-compatible regular expression extracting sample-group from each sample-name (i.e., extracted by previous regex pattern)"
+               label="Group-extraction regex">
+          <sanitizer>
+            <valid initial="string.printable">
+              <remove value="&apos;"/>
+            </valid>
+          </sanitizer>
+        </param>
+    </inputs>
+    <outputs>
+        <data name="imputed_data_file" format="tabular" label="${input_file.name}.intensities_${imputation.imputation_method}-imputed_QN_LT" ></data>
+        <data name="report_file" format="html" label="${input_file.name}.report (download/unzip to view)" ></data>
+    </outputs>
+    <tests>
+        <test>
+            <param name="input_file" ftype="tabular" value="test_input_for_anova.tabular"/>
+            <param name="alpha_file" ftype="tabular" value="alpha_levels.tabular"/>
+            <param name="first_data_column" value="10"/>
+            <param name="imputation_method" value="group-median"/>
+            <param name="sample_names_regex" value="\.\d+[A-Z]$"/>
+            <param name="sample_grouping_regex" value="\d+"/>
+            <output name="imputed_data_file">
+                <assert_contents>
+                    <has_text text="Phosphopeptide" />
+                    <has_text text="AAAAAAAGDpSDpSWDADAFSVEDPVRK" />
+                    <has_text text="23574000" />
+                    <has_text text="pSESELIDELSEDFDR" />
+                </assert_contents>
+            </output>
+        </test>
+        <test>
+            <param name="input_file" ftype="tabular" value="test_input_for_anova.tabular"/>
+            <param name="alpha_file" ftype="tabular" value="alpha_levels.tabular"/>
+            <param name="first_data_column" value="10"/>
+            <param name="imputation_method" value="random"/>
+            <param name="sample_names_regex" value="\.\d+[A-Z]$"/>
+            <param name="sample_grouping_regex" value="\d+"/>
+            <output name="imputed_data_file">
+                <assert_contents>
+                    <has_text text="Phosphopeptide" />
+                    <has_text text="AAAAAAAGDpSDpSWDADAFSVEDPVRK" />
+                    <has_text text="997800000" />
+                    <has_text text="pSESELIDELSEDFDR" />
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help><![CDATA[
+===========================================
+Phosphoproteomic Enrichment Pipeline ANOVA
+===========================================
+
+**Input files**
+
+``input_file``
+  Phosphopeptides annotated with SwissProt and phosphosite metadata (in tabular format).
+  This is the output from the "Phosphoproteomic Enrichment Pipeline Merge and Filter"
+  (``mqppep_mrgflt``) tool.
+
+``alpha_file``
+  List of alpha cutoff values for significance testing; text file having one column and no header.  For example:
+
+::
+
+  0.2
+  0.1
+  0.05
+
+**Input parameters**
+
+``first_data_column``
+  First column of ``input_file`` having intensity values (integer or PERL-compatible regular expression matching column label). Default: **Intensity**
+
+``imputation_method``
+  Impute missing values by:
+
+    1. using median for each sample-group;
+    2. using median across all samples;
+    3. using mean across all samples; or
+    4. using randomly generated values where:
+
+      - ``meanPercentile`` specifies the percentile among non-missing values to be used as mean of random values, and
+      - ``sdPercentile`` specifies the factor to be multiplied by the standard deviation among the non-missing values (across all samples) to determine the standard deviation of random values.
+
+``sample_names_regex``
+  PERL-compatible regular expression extracting the sample-name from the name of a column of intensities (from ``input_file``) for one sample.
+
+    - For example, ``"\.\d+[A-Z]$"`` applied to ``Intensity.splunge.10A`` would produce ``.10A``
+    - Note that *this is case sensitive* by default.
+
+``sample_grouping_regex``
+  PERL-compatible regular expression extracting the sample-grouping from the sample-name that was extracted with ``sample_names_regex`` from a column of intensities (from ``input_file``).
+
+    - For example, ``"\d+"`` applied to ``.10A`` would produce ``10``
+    - Note that *this is case sensitive* by default.
+
+
+**Outputs**
+
+``intensities_*-imputed_QN_LT``
+  Phosphopeptide MS intensities where missing values have been **imputed** by the chosen method, quantile-normalized (**QN**), and log10-transformed (**LT**), in tabular format.
+
+``report_file``
+  (download/unzip to view) Summary report for normalization, imputation, and ANOVA.
+  This dataset is displayed in Galaxy as having a datatype of ``html``,
+  but it is in fact a zip file that contains
+  an HTML file.  Please download and unzip it locally to view the report.
+  Ideally this report would be a PDF, but there is an issue
+  `(linked here)
+  <https://github.com/conda-forge/texlive-core-feedstock/issues/19>`_
+  that needs to be resolved first.
+
+**Authors**
+
+``Larry C. Cheng``
+  (`ORCiD 0000-0002-6922-6433 <https://orcid.org/0000-0002-6922-6433>`_) wrote the original script.
+
+``Arthur C. Eschenlauer``
+  (`ORCiD 0000-0002-2882-0508 <https://orcid.org/0000-0002-2882-0508>`_) adapted the script to run in Galaxy.
+
+===================================
+PERL-compatible regular expressions
+===================================
+
+Note that the PERL-compatible regular expressions accepted by this tool are documented at https://rdrr.io/r/base/regex.html
+
+    ]]></help>
+    <citations>
+        <!-- Cheng_2018 "Phosphopeptide Enrichment ..." PMID: 30124664 -->
+        <citation type="doi">10.3791/57996</citation>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mqppep_anova_script.Rmd	Mon Mar 07 19:05:01 2022 +0000
@@ -0,0 +1,657 @@
+---
+title: "Quant Data Processing Script"
+author: "Larry Cheng; Art Eschenlauer"
+date: "May 28, 2018; Nov 16, 2021"
+output:
+  html_document: default
+  pdf_document: default
+params:
+  inputFile: "Upstream_Map_pST_outputfile_STEP4.txt"
+  alphaFile: "alpha_levels.txt"
+  firstDataColumn: "Intensity"
+  imputationMethod: !r c("group-median","median","mean","random")[4]
+  meanPercentile: 1
+  sdPercentile: 0.2
+  regexSampleNames: "\\.(\\d+)[A-Z]$"
+  regexSampleGrouping: "(\\d+)"
+  imputedDataFilename: "Upstream_Map_pST_outputfile_STEP4_QN_LT.txt"
+---
+```{r setup, include=FALSE}
+# ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285
+knitr::opts_chunk$set(echo = FALSE, fig.dim=c(9,10))
+```
+
+## Purpose:
+Perform imputation of missing values, quantile normalization, and ANOVA.
+
+<!--
+## Variables to change for each input file
+-->
+```{r include = FALSE}
+# Path of the tab-separated input table (it is read with read.table in a later chunk)
+inputFile <- params$inputFile
+
+# First data column: a column number, or else a pattern that a later chunk matches against the column names (ideally this could be detected via regexSampleNames, but for now leave it as is).
+firstDataColumn <- params$firstDataColumn
+FDC_is_integer <- TRUE
+firstDataColumn <- withCallingHandlers(
+    as.integer(firstDataColumn)
+  , warning = function(w) FDC_is_integer <<- FALSE # a warning during coercion flags the value as non-integer
+  )
+if (FALSE == FDC_is_integer) {
+  firstDataColumn <- params$firstDataColumn # not an integer: restore the value as supplied
+}
+
+# FDR cutoffs for the ANOVA filter, taken from the first column of the headerless, tab-separated alphaFile (since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05)
+valFDR <- read.table(file = params$alphaFile, sep = "\t", header=F, quote="")[,1]
+
+# Filename to which the imputed, quantile-normalized, log-transformed data are written
+imputedDataFilename <- params$imputedDataFilename
+
+#ANOVA data filename
+```
+
+```{r include = FALSE}
+#Imputation method, should be one of c("random","group-median","median","mean")
+imputationMethod <- params$imputationMethod
+
+#Selection of percentile of logvalue data to set the mean for random number generation when using random imputation
+meanPercentile <- params$meanPercentile / 100.0
+
+#deviation adjustment-factor for random values; real number.
+sdPercentile <- params$sdPercentile
+
+#Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$"
+regexSampleNames <- params$regexSampleNames
+
+#Regular expression to extract Sample Grouping from Sample Name (if error occurs, compare sampleNumbers and tempMatches to see if groupings/pairs line up)
+# e.g., "(\\d+)"
+regexSampleGrouping <- params$regexSampleGrouping
+
+```
+
+
+```{r include = FALSE}
+### FUNCTIONS
+
+# One-way ANOVA of the values in x against groupingFactor; returns the p-value of the grouping term
+anovaFunc <- function(x, groupingFactor) {
+  fit <- aov(as.numeric(x) ~ groupingFactor)
+  # the first "Pr(>F)" entry of the summary table belongs to groupingFactor
+  summary(fit)[[1]][["Pr(>F)"]][1]
+}
+```
+
+
+
+### Checking that log-transformed sample distributions are similar:
+```{r echo=FALSE}
+
+library(data.table)
+
+# read.table reads a file in table format and creates a data frame from it.
+#   - note that `quote=""` means that quotation marks are treated literally.
+fullData <- read.table(file = inputFile, sep = "\t", header=T, quote="", check.names=FALSE)
+print(colnames(fullData))
+#head(fullData)
+
+if (!FDC_is_integer) {
+  # firstDataColumn holds a pattern: replace it with the index of the first matching column name
+  dataColumnIndices <- grep(firstDataColumn, names(fullData), perl=TRUE)
+  str(dataColumnIndices)
+  if (length(dataColumnIndices) == 0) {
+    stop(paste("failed to convert firstDataColumn:", firstDataColumn))
+  }
+  firstDataColumn <- dataColumnIndices[1]
+}
+
+quantData0 <- fullData[firstDataColumn:length(fullData)]
+quantData <- fullData[firstDataColumn:length(fullData)]
+quantData[quantData==0] <- NA  #replace 0 with NA
+quantDataLog <- log10(quantData)
+
+rownames(quantDataLog) <- fullData$Phosphopeptide
+
+summary(quantDataLog)
+
+#data visualization
+old_par <- par(
+  mai=par("mai") + c(0.5,0,0,0)
+)
+boxplot(
+  quantDataLog
+, las=2
+)
+par(old_par)
+
+quantDataLog_stack <- stack(quantDataLog)
+```
+
+```{r echo = FALSE, fig.align="left", fig.dim=c(9,5)}
+library(ggplot2)
+ggplot(quantDataLog_stack, aes(x=values)) + geom_density(aes(group=ind, colour=ind))
+```
+
+### Globally, are phosphopeptide intensities approximately unimodal?
+```{r echo = FALSE,fig.align="left", fig.dim=c(9,5)}
+
+# ref for bquote particularly and plotting math expressions generally:
+#   https://www.r-bloggers.com/2018/03/math-notation-for-r-plot-titles-expression-and-bquote/
+
+# flatten the log-intensity table once; logvalues keeps only the finite (i.e., non-missing) entries
+all_logvalues <- as.numeric(as.matrix(quantDataLog))
+fin <- is.finite(all_logvalues)
+logvalues <- all_logvalues[fin]
+plot(
+  density(logvalues)
+, xlab = bquote(log[10](intensity))
+, main = bquote("Smoothed estimated probability density vs." ~ log[10](intensity))
+)
+hist(
+  x = all_logvalues
+, xlab = bquote(log[10](intensity))
+, main = bquote("Frequency vs." ~ log[10](intensity))
+, breaks = 100
+)
+```
+
+<!--
+## Impute missing values
+-->
+
+### Distribution of standard deviations of phosphopeptides, ignoring missing values:
+
+```{r echo = FALSE, fig.align="left", fig.dim=c(9,5)}
+#determine quantile
+q1 <- quantile(logvalues, probs = meanPercentile)[1]
+
+# standard deviation of the finite entries of x, scaled by the global sdPercentile
+sd_finite <- function(x) {
+  finite_x <- x[is.finite(x)]
+  sd(finite_x) * sdPercentile
+}
+sds <- apply(quantDataLog, 1, sd_finite) # 1 = row of matrix (ie, phosphopeptide)
+plot(
+  density(sds, na.rm=T)
+, main="Smoothed estimated probability density vs. std. deviation"
+, sub="(probability estimation made with Gaussian smoothing)"
+)
+
+m1 <- median(sds, na.rm=T) #sd to be used is the median sd
+
+```
+
+
+
+<!--
+The number of missing values are:
+-->
+```{r echo=FALSE}
+#Determine number of cells to impute
+temp <- quantData[is.na(quantData)]
+
+#Determine number of values to impute
+NoToImpute <- length(temp)
+```
+
+<!--
+% of values that are missing:
+-->
+```{r echo=FALSE}
+pct_missing_values <- length(temp)/(length(logvalues)+length(temp)) * 100
+```
+
+<!--
+First few rows of data before imputation:
+-->
+## Impute missing values
+```{r echo = FALSE}
+
+#ACE start segment: trt-median based imputation
+# prep for trt-median based imputation
+
+# Assuming that regexSampleNames <- "\\.(\\d+)[A-Z]$"
+#   get factors -> group runs (samples) by ignoring terminal [A-Z] in sample names
+# regexpr(pattern, text, ignore.case = FALSE, perl = FALSE, fixed = FALSE, useBytes = FALSE)
+m <- regexpr(regexSampleNames, names(quantData), perl=TRUE)
+tempMatches <- regmatches(names(quantData), m)
+print("Extracted sample names")
+print(tempMatches)
+m2 <- regexpr(regexSampleGrouping, tempMatches, perl=TRUE)
+sampleNumbers <- as.factor(regmatches(tempMatches, m2))
+print("Factor levels")
+print(sampleNumbers)
+
+```
+```{r echo = FALSE}
+
+# Report the size of the problem before imputation:
+#   number of peptides (rows), number of missing cells, and percentage of cells missing
+cat(
+  sprintf(
+    "Before imputation, there are:\n %d peptides\n %d missing values (%2.0f%s)"
+  , nrow(quantData)
+  , sum(is.na(quantData))
+  , pct_missing_values
+  , "%"
+  )
+)
+
+```
+```{r echo = FALSE}
+
+#Impute data
+quantDataImputed <- quantData
+
+# Identify which values are missing and need to be imputed
+ind <- which(is.na(quantDataImputed), arr.ind=TRUE)
+
+```
+```{r echo = FALSE}
+
+# Apply imputation; goodRows marks the peptides (rows) that are left without any missing value
+switch(
+  imputationMethod
+, "group-median"={
+    cat("Imputation method: substitute missing value with median peptide-intensity for sample-group\n")
+    sampleLevelIntegers <- as.integer(sampleNumbers)
+    for (i in seq_along(levels(sampleNumbers))) {
+      levelCols <- i == sampleLevelIntegers
+      # use a matrix so the (row, col) index addresses single cells; df[ind, cols] would read both index columns as row numbers
+      groupData <- as.matrix(quantDataImputed[, levelCols, drop=FALSE])
+      groupMedians <- apply(groupData, 1, median, na.rm=TRUE)
+      missingCells <- which(is.na(groupData), arr.ind=TRUE)
+      groupData[missingCells] <- groupMedians[missingCells[, 1]]
+      quantDataImputed[, levelCols] <- groupData
+    }
+    goodRows <- !is.na(rowMeans(quantDataImputed))
+  }
+, "median"={
+    cat("Imputation method: substitute missing value with median peptide-intensity across all sample classes\n")
+    quantDataImputed[ind] <- apply(quantDataImputed, 1, median, na.rm=TRUE)[ind[,1]]
+    goodRows <- !is.na(rowMeans(quantDataImputed))
+  }
+, "mean"={
+    cat("Imputation method: substitute missing value with mean peptide-intensity across all sample classes\n")
+    quantDataImputed[ind] <- apply(quantDataImputed, 1, mean, na.rm=TRUE)[ind[,1]]
+    goodRows <- !is.na(rowMeans(quantDataImputed))
+  }
+, "random"={
+    cat(sprintf("Imputation method: substitute missing value with random intensity N ~ (%0.2f, %0.2f)\n", q1, m1))
+    quantDataImputed[is.na(quantDataImputed)] <- 10^rnorm(NoToImpute, mean= q1, sd = m1)
+    goodRows <- !is.na(rowMeans(quantDataImputed))
+  }
+, stop(paste("unrecognized imputationMethod:", imputationMethod)) # default: fail here rather than later on a missing goodRows
+)
+
+```
+```{r echo = FALSE}
+
+#Determine number of cells to impute
+temp <- quantDataImputed[is.na(quantDataImputed)]
+cat(
+  sprintf(
+    "After imputation, there are:\n  %d missing values\n  %d usable peptides\n  %d peptides with too many missing values for further analysis"
+  , sum(is.na(quantDataImputed[goodRows,]))
+  , sum(goodRows)
+  , sum(!goodRows)
+  )
+)
+```
+```{r echo = FALSE}
+
+
+# Zap rows where imputation was ineffective
+fullData         <- fullData        [goodRows, ]
+quantData        <- quantData       [goodRows, ]
+quantDataImputed <- quantDataImputed[goodRows, ]
+
+```
+```{r echo = FALSE}
+
+d_combined <- (density(as.numeric(as.matrix(log10(quantDataImputed)))))
+d_original <- density(as.numeric(as.matrix(log10(quantDataImputed[!is.na(quantData)]))))
+
+```
+```{r echo = FALSE}
+
+if (sum(is.na(quantData)) > 0) {
+  # There ARE missing values
+  d_imputed <- (density(as.numeric(as.matrix(log10(quantDataImputed[is.na(quantData)])))))
+} else {
+  # There are NO missing values
+  d_imputed <- d_combined
+}
+
+```
+
+<!-- ```{r echo = FALSE, fig.cap = "Blue =  Data before imputation; Red = Imputed data"} -->
+```{r echo = FALSE, fig.dim=c(9,5)}
+ylim <- c(0, max(d_combined$y, d_original$y, d_imputed$y))
+plot(
+  d_combined
+, ylim = ylim
+, sub = "Blue = data before imputation; Red = imputed data"
+, main = "Density vs. log10(intensity) before and after imputation"
+)
+lines(d_original, col="blue")
+lines(d_imputed, col="red")
+```
+
+## Perform Quantile Normalization
+```{r echo=FALSE}
+library(preprocessCore)
+# Apply quantile normalization using preprocessCore::normalize.quantiles
+# ---
+# tool repository: http://bioconductor.org/packages/release/bioc/html/preprocessCore.html
+#   except this: https://support.bioconductor.org/p/122925/#9135989
+#   says to install it like this:
+#     ```
+#     BiocManager::install("preprocessCore", configure.args="--disable-threading", force = TRUE,lib=.libPaths()[1])
+#     ```
+# conda installation (necessary because of a bug in recent openblas):
+#   conda install bioconductor-preprocesscore openblas=0.3.3
+# ...
+# ---
+# normalize.quantiles {preprocessCore}	--  Quantile Normalization
+#
+# Description:
+#   Using a normalization based upon quantiles, this function normalizes a matrix of probe level intensities.
+#
+# Usage:
+#   normalize.quantiles(x,copy=TRUE, keep.names=FALSE)
+#
+# Arguments:
+#
+#   - x: A matrix of intensities where each column corresponds to a chip and each row is a probe.
+#
+#   - copy: Make a copy of matrix before normalizing. Usually safer to work with a copy,
+#       but in certain situations not making a copy of the matrix, but instead normalizing
+#       it in place will be more memory friendly.
+#
+#   - keep.names: Boolean option to preserve matrix row and column names in output.
+#
+# Details:
+#   This method is based upon the concept of a quantile-quantile plot extended to n dimensions.
+#     No special allowances are made for outliers. If you make use of quantile normalization
+#     please cite Bolstad et al, Bioinformatics (2003).
+#
+#   This functions will handle missing data (ie NA values), based on
+#     the assumption that the data is missing at random.
+#
+#   Note that the current implementation optimizes for better memory usage
+#     at the cost of some additional run-time.
+#
+# Value: A normalized matrix.
+#
+# Author: Ben Bolstad, bmbolstad.com
+#
+# References
+#
+#   - Bolstad, B (2001) Probe Level Quantile Normalization of High Density Oligonucleotide
+#       Array Data. Unpublished manuscript http://bmbolstad.com/stuff/qnorm.pdf
+#
+#   - Bolstad, B. M., Irizarry R. A., Astrand, M, and Speed, T. P. (2003) A Comparison of
+#       Normalization Methods for High Density Oligonucleotide Array Data Based on Bias
+#       and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185
+#       http://bmbolstad.com/misc/normalize/normalize.html
+# ...
+
+# Quantile-normalize the imputed intensities and put the result back into a data frame;
+#   the column names are copied over from the un-normalized data
+quantDataImputed.qn <- as.data.frame(
+  normalize.quantiles(as.matrix(quantDataImputed))
+)
+names(quantDataImputed.qn) <- names(quantDataImputed)
+
+# log10-transform the normalized intensities
+quantDataImputed_QN_log <- log10(quantDataImputed.qn)
+# label the rows with the first column of fullData
+rownames(quantDataImputed_QN_log) <- fullData[,1]
+
+quantDataImputed.qn.LS = t(scale(t(log10(quantDataImputed.qn))))
+anyNaN <- function (x) { # despite its name: TRUE when x holds NO NaN (and no NA), otherwise FALSE
+  !anyNA(x) # anyNA() is TRUE for NaN as well as NA, so no comparison with the string "NaN" is needed
+}
+sel = apply(quantDataImputed.qn.LS, 1, anyNaN)
+quantDataImputed.qn.LS2 <- quantDataImputed.qn.LS[which(sel),]
+quantDataImputed.qn.LS2 = as.data.frame(quantDataImputed.qn.LS2)
+
+#output quantile normalized data to imputedDataFilename with "_QN_LT" inserted before a literal, trailing ".txt" (appended when there is none)
+dataTableImputed_QN_LT <- cbind(fullData[1:9], quantDataImputed_QN_log)
+write.table(dataTableImputed_QN_LT, file = paste0(sub("\\.txt$", "", imputedDataFilename), "_QN_LT.txt"), sep = "\t", col.names=TRUE, row.names=FALSE)
+
+```
+
+<!-- ACE insertion begin -->
+### Checking that normalized, imputed, log-transformed sample distributions are similar:
+
+```{r echo=FALSE}
+#library(data.table)
+
+#Save unimputed quantDataLog for plotting below
+unimputedQuantDataLog <- quantDataLog
+
+#Log10 transform (after preparing for zero values, which should never happen...)
+quantDataImputed.qn[quantDataImputed.qn == 0] <- .000000001
+quantDataLog <- log10(quantDataImputed.qn)
+
+summary(quantDataLog)
+
+#Output quantile-normalized log-transformed dataset with imputed, normalized data
+
+dataTableImputed <- cbind(fullData[1:9], quantDataLog)
+write.table(
+    dataTableImputed
+  , file=imputedDataFilename
+  , sep="\t"
+  , col.names=TRUE
+  , row.names=FALSE
+  , quote=FALSE
+  )
+
+
+
+#data visualization
+old_par <- par(
+  mai=par("mai") + c(0.5,0,0,0)
+, oma=par("oma") + c(0.5,0,0,0)
+)
+boxplot(
+  quantDataLog
+, las=2
+)
+par(old_par)
+```
+
+```{r echo=FALSE, fig.dim=c(9,5)}
+quantDataLog_stack <- stack(quantDataLog)
+ggplot(quantDataLog_stack, aes(x=values)) + geom_density(aes(group=ind, colour=ind))
+```
+
+## Perform ANOVA filters
+
+```{r,echo=FALSE}
+#Make new data frame containing only Phosphopeptides to connect preANOVA to ANOVA (connect_df)
+connect_df <- data.frame(
+    dataTableImputed_QN_LT$Phosphopeptide
+  , dataTableImputed_QN_LT[,firstDataColumn]
+  )
+colnames(connect_df) <- c("Phosphopeptide","Intensity")
+```
+
+```{r echo=FALSE, fig.dim=c(9,10)}
+# Get factors -> group replicates (as indicated by terminal letter) by the preceding digits
+#   For example, group .1A .1B .1C into group 1; .2A .2B .2C, into group 2; etc..
+m <- regexpr(regexSampleNames, names(quantDataImputed_QN_log), perl=TRUE)
+#ACE str(m)
+tempMatches <- regmatches(names(quantDataImputed_QN_log), m)
+#ACE str(tempMatches)
+numSamples <- length(tempMatches)
+#ACE str(numSamples)
+m2 <- regexpr(regexSampleGrouping, tempMatches, perl=TRUE)
+#ACE str(m2)
+#ACE str(regmatches(tempMatches, m2))
+sampleNumbers <- as.factor(regmatches(tempMatches, m2))
+#ACE str(sampleNumbers)
+
+if (length(levels(sampleNumbers))<2) {
+  cat("ERROR!!!! Cannot perform ANOVA analysis because it requires two or more factor levels\n")
+  cat("Unparsed sample names are:\n")
+  print(names(quantDataImputed_QN_log))
+  cat(sprintf("Parsing rule for SampleNames is '%s'\n", regexSampleNames))
+  cat("Parsed names are:\n")
+  print(tempMatches)
+  cat(sprintf("Parsing rule for SampleGrouping is '%s'\n", regexSampleGrouping))
+  cat("Sample group assignments are:\n")
+  print(regmatches(tempMatches, m2))
+} else {
+  pValueData.anovaPs <- apply(quantDataImputed_QN_log, 1, anovaFunc, groupingFactor=sampleNumbers)
+
+  pValueData.anovaPs.FDR <- p.adjust(pValueData.anovaPs, method="fdr")
+  pValueData <- data.frame(
+    phosphopeptide = fullData[,1]
+  , rawANOVAp = pValueData.anovaPs
+  , FDRadjustedANOVAp = pValueData.anovaPs.FDR
+  )
+  #ACE rownames(pValueData) <- fullData[,1]
+  # output ANOVA file to constructed filename,
+  #   e.g.    "Outputfile_pST_ANOVA_STEP5.txt"
+  #   becomes "Outpufile_pST_ANOVA_STEP5_FDR0.05.txt"
+
+  #Re-output quantile-normalized log-transformed dataset with imputed, normalized data to include p-values
+
+  dataTableImputed <- cbind(fullData[1:9], pValueData[,2:3], quantDataLog)
+  write.table(
+      dataTableImputed
+    , file=imputedDataFilename
+    , sep="\t"
+    , col.names=TRUE
+    , row.names=FALSE
+    , quote=FALSE
+    )
+
+
+  pValueData <- pValueData[order(pValueData$FDRadjustedANOVAp),]
+
+  cutoff <- valFDR[1]
+  for (cutoff in valFDR){ #loop through FDR cutoffs
+
+    filtered_p <- pValueData[which(pValueData$FDRadjustedANOVAp < cutoff),, drop = FALSE]
+    filteredData.filtered <- quantDataImputed_QN_log[rownames(filtered_p),, drop = FALSE]
+    filteredData.filtered <- filteredData.filtered[order(filtered_p$FDRadjustedANOVAp),, drop = FALSE]
+
+    # <!-- ACE insertion start -->
+    old_oma <- par("oma")
+    old_par <- par(
+      mai=(par("mai") + c(0.7,0,0,0)) * c(1,1,0.3,1)
+    , oma=old_oma * c(1,1,0.3,1)
+    , cex.main=0.9
+    , cex.axis=0.7
+    )
+
+    if (nrow(filteredData.filtered) > 0) {
+      boxplot(
+        filteredData.filtered
+      , main = sprintf("Imputed, normalized intensities where adjusted p-value < %0.2f", cutoff)
+      # no line plot , main = ""
+      , las = 2
+      # , ylim = c(5.5,10)
+      , ylab = expression(log[10](intensity))
+      )
+    } else {
+      cat(sprintf("No peptides were found to have cutoff adjusted p-value < %0.2f\n", cutoff))
+    }
+    par(old_par)
+
+    #Add Phosphopeptide column to ANOVA filtered table
+    ANOVA.filtered_merge <- merge(
+        x = connect_df
+      , y = filteredData.filtered
+      , by.x="Intensity"
+      , by.y=1
+      )
+    ANOVA.filtered_merge.order <- rownames(filtered_p)
+
+    ANOVA.filtered_merge.format <- sapply(
+      X = filtered_p$FDRadjustedANOVAp
+    , FUN = function(x) {
+        if (x > 0.0001)
+          paste0("(%0.",1+ceiling(-log10(x)),"f) %s")
+        else
+          paste0("(%0.4e) %s")
+        }
+    )
+
+    #ANOVA.filtered_merge.format <- paste0("(%0.",1+ceiling(-log10(filtered_p$FDRadjustedANOVAp)),"f) %s")
+
+    ANOVA.filtered <- data.table(
+        ANOVA.filtered_merge$Phosphopeptide
+      , ANOVA.filtered_merge$Intensity
+      , ANOVA.filtered_merge[, 2:numSamples+1]
+      )
+    colnames(ANOVA.filtered) <- c("Phosphopeptide", colnames(filteredData.filtered))
+
+    # merge qualitative columns into the ANOVA data
+    output_table <- data.frame(ANOVA.filtered$Phosphopeptide)
+    output_table <- merge(
+        x = output_table
+      , y = dataTableImputed_QN_LT
+      , by.x = "ANOVA.filtered.Phosphopeptide"
+      , by.y="Phosphopeptide"
+      )
+
+    #Produce heatmap to visualize significance and the effect of imputation
+    m <- as.matrix(unimputedQuantDataLog[ANOVA.filtered_merge.order,])
+    if (nrow(m) > 0) {
+      rownames_m <- rownames(m)
+      rownames(m) <- sapply(
+          X = 1:nrow(m)
+        , FUN = function(i) {
+            sprintf(
+              ANOVA.filtered_merge.format[i]
+            , filtered_p$FDRadjustedANOVAp[i]
+            , rownames_m[i]
+            )
+          }
+        )
+      margins <- c(
+        max(nchar(colnames(m))) * 10 / 16 # col
+      , max(nchar(rownames(m))) * 5 / 16 # row
+      )
+      how_many_peptides <- min(50, nrow(m))
+
+      # setting via par(name=value) returns the previous value as a list that par(op) can restore (a queried bare number cannot be restored)
+      op <- par(cex.main=0.6)
+      try(
+        if (nrow(m) > 1) {
+          heatmap(
+            m[how_many_peptides:1,]
+          , Rowv = NA
+          , Colv = NA
+          , cexRow = 0.7
+          , cexCol = 0.8
+          , scale="row"
+          , margins = margins
+          , main = "Heatmap of unimputed, unnormalized intensities"
+          , xlab = ""
+          # , main = bquote(
+          #     .( how_many_peptides )
+          #       ~ " peptides with adjusted p-value <"
+          #       ~ .(sprintf("%0.2f", cutoff))
+          #     )
+          )
+        }
+      )
+      #ACE fig_dim knitr::opts_chunk$set(fig.dim = fig_dim)
+      par(op)
+    }
+
+  }
+}
+```
+
+## Peptide IDs, etc.
+
+See output files.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mqppep_mrgfltr.py	Mon Mar 07 19:05:01 2022 +0000
@@ -0,0 +1,1337 @@
+#!/usr/bin/env python
+
+# Import the packages needed
+import argparse
+import os.path
+import sys
+
+import pandas
+import re
+import time
+import sqlite3 as sql
+from codecs import getreader as cx_getreader
+import sys
+import numpy as np
+
+#   for sorting list of lists using operator.itemgetter
+import operator
+
+#   for formatting stack-trace
+import traceback
+
+#   for Aho-Corasick search for fixed set of substrings
+import ahocorasick
+import operator
+import hashlib
+
+# for shutil.copyfile(src, dest)
+import shutil
+
+# global constants
+N_A = 'N/A'
+
+# ref: https://stackoverflow.com/a/8915613/15509512
+#   answers: "How to handle exceptions in a list comprehensions"
+#   usage:
+#       from math import log
+#       eggs = [1,3,0,3,2]
+#       print([x for x in [catch(log, egg) for egg in eggs] if x is not None])
+#   producing:
+#       for <built-in function log>
+#         with args (0,)
+#         exception: math domain error
+#       [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453]
+def catch(func, *args, handle=lambda e : e, **kwargs):
+    # Return func(*args, **kwargs); on any Exception, print a report and exit with status -1.
+    # `handle` is kept for backward compatibility; it is not used.
+    try:
+        return func(*args, **kwargs)
+    except Exception as e:
+        print("For %s" % str(func))
+        print("  with args %s" % str(args))
+        print("  caught exception: %s" % str(e))
+        print("  stack trace: " + str(traceback.format_exception(type(e), e, e.__traceback__)))
+        sys.exit(-1)  # sys.exit does not depend on the `site`-provided exit() builtin
+
+def ppep_join(x):
+    # join the entries of x that differ from N_A with ' | '; yield N_A when the joined text is empty
+    kept = [item for item in x if item != N_A]
+    joined = ' | '.join(kept)
+    if joined == "":
+        return N_A
+    return joined
+
+def melt_join(x):
+    # case-insensitive de-duplication: the last spelling seen wins, at the position of the first sighting
+    unique = dict((key.lower(), key) for key in x)
+    return ' | '.join(unique.values())
+
+def __main__():
+    # Parse Command Line
+    parser = argparse.ArgumentParser(
+        description='Phopsphoproteomic Enrichment Pipeline Merge and Filter.'
+        )
+
+    # inputs:
+    #   Phosphopeptide data for experimental results, including the intensities
+    #   and the mapping to kinase domains, in tabular format.
+    parser.add_argument(
+        '--phosphopeptides', '-p',
+        nargs=1,
+        required=True,
+        dest='phosphopeptides',
+        help='Phosphopeptide data for experimental results, including the intensities and the mapping to kinase domains, in tabular format'
+        )
+    #   UniProtKB/SwissProt DB input, SQLite
+    parser.add_argument(
+        '--ppep_mapping_db', '-d',
+        nargs=1,
+        required=True,
+        dest='ppep_mapping_db',
+        help='UniProtKB/SwissProt SQLite Database'
+        )
+    #ACE #   PhosPhositesPlus DB input, csv
+    #ACE parser.add_argument(
+    #ACE     '--psp_regulatory_sites', '-s',
+    #ACE     nargs=1,
+    #ACE     required=True,
+    #ACE     dest='psp_regulatory_sites_csv',
+    #ACE     help='PhosphoSitesPlus Regulatory Sites, in CSV format including three-line header'
+    #ACE     )
+    #   species to limit records chosed from PhosPhositesPlus
+    parser.add_argument(
+        '--species', '-x',
+        nargs=1,
+        required=False,
+        default=[],
+        dest='species',
+        help='limit PhosphoSitePlus records to indicated species (field may be empty)'
+        )
+
+    # outputs:
+    #   tabular output
+    parser.add_argument(
+        '--mrgfltr_tab', '-o',
+        nargs=1,
+        required=True,
+        dest='mrgfltr_tab',
+        help='Tabular output file for results'
+        )
+    #   CSV output
+    parser.add_argument(
+        '--mrgfltr_csv', '-c',
+        nargs=1,
+        required=True,
+        dest='mrgfltr_csv',
+        help='CSV output file for results'
+        )
+    #   SQLite output
+    parser.add_argument(
+        '--mrgfltr_sqlite', '-S',
+        nargs=1,
+        required=True,
+        dest='mrgfltr_sqlite',
+        help='SQLite output file for results'
+        )
+
+    # "Make it so!" (parse the arguments)
+    options = parser.parse_args()
+    print("options: " + str(options))
+
+    # determine phosphopeptide ("upstream map") input tabular file access
+    if options.phosphopeptides is None:
+        exit('Argument "phosphopeptides" is required but not supplied')
+    try:
+        upstream_map_filename_tab = os.path.abspath(options.phosphopeptides[0])
+        input_file = open(upstream_map_filename_tab, 'r')
+        input_file.close()
+    except Exception as e:
+        exit('Error parsing phosphopeptides argument: %s' % str(e))
+
+    # determine input SQLite access
+    if options.ppep_mapping_db is None:
+        exit('Argument "ppep_mapping_db" is required but not supplied')
+    try:
+        uniprot_sqlite = os.path.abspath(options.ppep_mapping_db[0])
+        input_file = open(uniprot_sqlite, 'rb')
+        input_file.close()
+    except Exception as e:
+        exit('Error parsing ppep_mapping_db argument: %s' % str(e))
+
+    # copy input SQLite dataset to output SQLite dataset
+    if options.mrgfltr_sqlite is None:
+        exit('Argument "mrgfltr_sqlite" is required but not supplied')
+    try:
+        output_sqlite = os.path.abspath(options.mrgfltr_sqlite[0])
+        shutil.copyfile(uniprot_sqlite, output_sqlite)
+    except Exception as e:
+        exit('Error copying ppep_mapping_db to mrgfltr_sqlite: %s' % str(e))
+
+    #ACE # determine psp_regulatory_sites CSV access
+    #ACE if options.psp_regulatory_sites_csv is None:
+    #ACE     exit('Argument "psp_regulatory_sites_csv" is required but not supplied')
+    #ACE #ACE print('options.psp_regulatory_sites_csv: ' + options.psp_regulatory_sites_csv)
+    #ACE try:
+    #ACE     phosphosite_filename_csv = os.path.abspath(options.psp_regulatory_sites_csv[0])
+    #ACE     input_file = open(phosphosite_filename_csv, 'r')
+    #ACE     input_file.close()
+    #ACE except Exception as e:
+    #ACE     exit('Error parsing psp_regulatory_sites_csv argument: %s' % str(e))
+    #ACE print('phosphosite_filename_csv: ' + phosphosite_filename_csv)
+
+    # determine species to limit records from PSP_Regulatory_Sites
+    if options.species is None:
+        exit('Argument "species" is required (and may be empty) but not supplied')
+    try:
+        if len(options.species) > 0:
+            species = options.species[0]
+        else:
+            species = ''
+    except Exception as e:
+        exit('Error parsing species argument: %s' % str(e))
+
+    # determine tabular output destination
+    if options.mrgfltr_tab is None:
+        exit('Argument "mrgfltr_tab" is required but not supplied')
+    try:
+        output_filename_tab = os.path.abspath(options.mrgfltr_tab[0])
+        output_file = open(output_filename_tab, 'w')
+        output_file.close()
+    except Exception as e:
+        exit('Error parsing mrgfltr_tab argument: %s' % str(e))
+
+    # determine CSV output destination
+    if options.mrgfltr_csv is None:
+        exit('Argument "mrgfltr_csv" is required but not supplied')
+    try:
+        output_filename_csv = os.path.abspath(options.mrgfltr_csv[0])
+        output_file = open(output_filename_csv, 'w')
+        output_file.close()
+    except Exception as e:
+        exit('Error parsing mrgfltr_csv argument: %s' % str(e))
+
+
+    def mqpep_getswissprot():
+
+        ###############################################
+        # copied from Excel Output Script.ipynb BEGIN #
+        ###############################################
+
+        ###########  String Constants  #################
+        DEPHOSPHOPEP = 'DephosphoPep'
+        DESCRIPTION = 'Description'
+        FUNCTION_PHOSPHORESIDUE = 'Function Phosphoresidue(PSP=PhosphoSitePlus.org)'
+        GENE_NAME = 'Gene_Name'                     # Gene Name from UniProtKB
+        ON_FUNCTION = 'ON_FUNCTION'                 # ON_FUNCTION column from PSP_Regulatory_Sites
+        ON_NOTES = 'NOTES'                          # NOTES column from PSP_Regulatory_Sites
+        ON_OTHER_INTERACT = 'ON_OTHER_INTERACT'     # ON_OTHER_INTERACT column from PSP_Regulatory_Sites
+        ON_PROCESS = 'ON_PROCESS'                   # ON_PROCESS column from PSP_Regulatory_Sites
+        ON_PROT_INTERACT = 'ON_PROT_INTERACT'       # ON_PROT_INTERACT column from PSP_Regulatory_Sites
+        PHOSPHOPEPTIDE = 'Phosphopeptide'
+        PHOSPHOPEPTIDE_MATCH = 'Phosphopeptide_match'
+        PHOSPHORESIDUE = 'Phosphoresidue'
+        PUTATIVE_UPSTREAM_DOMAINS = 'Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/Phosphatases/Binding Domains'
+        SEQUENCE = 'Sequence'
+        SEQUENCE10 = 'Sequence10'
+        SEQUENCE7 = 'Sequence7'
+        SITE_PLUSMINUS_7AA = 'SITE_+/-7_AA'
+        SITE_PLUSMINUS_7AA_SQL = 'SITE_PLUSMINUS_7AA'
+        UNIPROT_ID = 'UniProt_ID'
+        UNIPROT_SEQ_AND_META_SQL = '''
+            select    Uniprot_ID, Description, Gene_Name, Sequence,
+                      Organism_Name, Organism_ID, PE, SV
+                 from UniProtKB
+             order by Sequence, UniProt_ID
+        '''
+        UNIPROT_UNIQUE_SEQ_SQL = '''
+            select distinct Sequence
+                       from UniProtKB
+                   group by Sequence
+        '''
+        PPEP_PEP_UNIPROTSEQ_SQL = '''
+            select distinct phosphopeptide, peptide, sequence
+                       from uniprotkb_pep_ppep_view
+                   order by sequence
+        '''
+        PPEP_MELT_SQL = '''
+            SELECT DISTINCT
+                phospho_peptide AS 'p_peptide',
+                kinase_map AS 'characterization',
+                'X' AS 'X'
+            FROM ppep_gene_site_view
+        '''
+        # CREATE TABLE PSP_Regulatory_site (
+        #   site_plusminus_7AA TEXT PRIMARY KEY ON CONFLICT IGNORE,
+        #   domain             TEXT,
+        #   ON_FUNCTION        TEXT,
+        #   ON_PROCESS         TEXT,
+        #   ON_PROT_INTERACT   TEXT,
+        #   ON_OTHER_INTERACT  TEXT,
+        #   notes              TEXT,
+        #   organism           TEXT
+        # );
+        PSP_REGSITE_SQL = '''
+            SELECT DISTINCT
+              SITE_PLUSMINUS_7AA ,
+              DOMAIN             ,
+              ON_FUNCTION        ,
+              ON_PROCESS         ,
+              ON_PROT_INTERACT   ,
+              ON_OTHER_INTERACT  ,
+              NOTES              ,
+              ORGANISM
+            FROM PSP_Regulatory_site
+        '''
+        PPEP_ID_SQL ='''
+            SELECT
+                id AS 'ppep_id',
+                seq AS 'ppep_seq'
+            FROM ppep
+        '''
+        MRGFLTR_DDL ='''
+        DROP VIEW  IF EXISTS mrgfltr_metadata_view;
+        DROP TABLE IF EXISTS mrgfltr_metadata;
+        CREATE TABLE mrgfltr_metadata
+          ( ppep_id                 INTEGER REFERENCES ppep(id)
+          , Sequence10              TEXT
+          , Sequence7               TEXT
+          , GeneName                TEXT
+          , Phosphoresidue          TEXT
+          , UniProtID               TEXT
+          , Description             TEXT
+          , FunctionPhosphoresidue  TEXT
+          , PutativeUpstreamDomains TEXT
+          , PRIMARY KEY (ppep_id)            ON CONFLICT IGNORE
+          )
+          ;
+        CREATE VIEW mrgfltr_metadata_view AS
+          SELECT DISTINCT
+              ppep.seq             AS phospho_peptide
+            , Sequence10
+            , Sequence7
+            , GeneName
+            , Phosphoresidue
+            , UniProtID
+            , Description
+            , FunctionPhosphoresidue
+            , PutativeUpstreamDomains
+          FROM
+            ppep, mrgfltr_metadata
+          WHERE
+              mrgfltr_metadata.ppep_id = ppep.id
+          ORDER BY
+            ppep.seq
+            ;
+        '''
+
+        CITATION_INSERT_STMT = '''
+          INSERT INTO Citation (
+            ObjectName,
+            CitationData
+          ) VALUES (?,?)
+          '''
+        CITATION_INSERT_PSP = 'PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words "PhosphoSitePlus(R), www.phosphosite.org" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926."'
+        CITATION_INSERT_PSP_REF = 'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298, https://doi.org/10.1093/nar/gkr1122'
+
+        MRGFLTR_METADATA_COLUMNS = [
+            'ppep_id',
+            'Sequence10',
+            'Sequence7',
+            'GeneName',
+            'Phosphoresidue',
+            'UniProtID',
+            'Description',
+            'FunctionPhosphoresidue',
+            'PutativeUpstreamDomains'
+            ]
+
+        ###########  String Constants (end) ############
+
+        class Error(Exception):
+            """Common ancestor of the exceptions defined by this module."""
+
+        class PreconditionError(Error):
+            """Signals that a required condition on the input did not hold.
+
+            Attributes:
+                expression -- the input value for which the check failed
+                message -- human-readable explanation of the failure
+            """
+
+            def __init__(self, expression, message):
+                # keep both pieces available to handlers as attributes
+                self.expression, self.message = expression, message
+
+        #start_time = time.clock() #timer
+        start_time = time.process_time() #timer
+
+        #get keys from upstream tabular file using readline()
+        # ref: https://stackoverflow.com/a/16713581/15509512
+        #      answer to "Use codecs to read file with correct encoding"
+        file1_encoded = open(upstream_map_filename_tab, 'rb')
+        file1 = cx_getreader("latin-1")(file1_encoded)
+
+        count = 0
+        upstream_map_p_peptide_list = []
+        re_tab = re.compile('^[^\t]*')
+        while True:
+            count += 1
+            # Get next line from file
+            line = file1.readline()
+            # if line is empty
+            # end of file is reached
+            if not line:
+                break
+            if count > 1:
+                m = re_tab.match(line)
+                upstream_map_p_peptide_list.append(m[0])
+        file1.close()
+        file1_encoded.close()
+
+        # Get the list of phosphopeptides with the p's that represent the phosphorylation sites removed
+        re_phos = re.compile('p')
+        dephospho_peptide_list = [ re_phos.sub('',foo) for foo in upstream_map_p_peptide_list ]
+
+        end_time = time.process_time() #timer
+        print("%0.6f pre-read-SwissProt [0.1]" % (end_time - start_time,), file=sys.stderr)
+
+        ## ----------- Get SwissProt data from SQLite database (start) -----------
+        # build UniProt sequence LUT and list of unique SwissProt sequences
+
+        # Open SwissProt SQLite database
+        conn = sql.connect(uniprot_sqlite)
+        cur  = conn.cursor()
+
+        # Set up structures to hold SwissProt data
+
+        uniprot_Sequence_List = []
+        UniProtSeqLUT = {}
+
+        # Execute query for unique seqs without fetching the results yet
+        uniprot_unique_seq_cur = cur.execute(UNIPROT_UNIQUE_SEQ_SQL)
+
+        while batch := uniprot_unique_seq_cur.fetchmany(size=50):
+            if None == batch:
+                # handle case where no records are returned
+                break
+            for row in batch:
+                Sequence = row[0]
+                UniProtSeqLUT[(Sequence,DESCRIPTION)] = []
+                UniProtSeqLUT[(Sequence,GENE_NAME)  ] = []
+                UniProtSeqLUT[(Sequence,UNIPROT_ID) ] = []
+                UniProtSeqLUT[ Sequence               ] = []
+
+        # Execute query for seqs and metadata without fetching the results yet
+        uniprot_seq_and_meta = cur.execute(UNIPROT_SEQ_AND_META_SQL)
+
+        while batch := uniprot_seq_and_meta.fetchmany(size=50):
+            if None == batch:
+                  # handle case where no records are returned
+                break
+            for UniProt_ID, Description, Gene_Name, Sequence, OS, OX, PE, SV in batch:
+                uniprot_Sequence_List.append(Sequence)
+                UniProtSeqLUT[Sequence] = Sequence
+                UniProtSeqLUT[(Sequence,UNIPROT_ID) ].append(UniProt_ID)
+                UniProtSeqLUT[(Sequence,GENE_NAME)  ].append(Gene_Name)
+                if OS != N_A:
+                    Description += ' OS=' + OS
+                if OX != N_A:
+                    Description += ' OX=' + str(int(OX))
+                if Gene_Name != N_A:
+                    Description += ' GN=' + Gene_Name
+                if PE != N_A:
+                    Description += ' PE=' + PE
+                if SV != N_A:
+                    Description += ' SV=' + SV
+                UniProtSeqLUT[(Sequence,DESCRIPTION)].append(Description)
+
+        # Close SwissProt SQLite database; clean up local variables
+        conn.close()
+        Sequence = ''
+        UniProt_ID = ''
+        Description = ''
+        Gene_Name = ''
+
+        ## ----------- Get SwissProt data from SQLite database (finish) -----------
+
+        end_time = time.process_time() #timer
+        print("%0.6f post-read-SwissProt [0.2]" % (end_time - start_time,), file=sys.stderr)
+
+        ## ----------- Get SwissProt data from SQLite database (start) -----------
+        # build PhosphoPep_UniProtSeq_LUT and PhosphoPep_UniProtSeq_LUT
+        #ACE_temp pepSeqList = list( zip(pepList, dephosphPepList, [seq]*len(pepList)) )
+
+        # Open SwissProt SQLite database
+        conn = sql.connect(uniprot_sqlite)
+        cur  = conn.cursor()
+
+        # Set up dictionary to aggregate results for phosphopeptides correspounding to dephosphoeptide
+        DephosphoPep_UniProtSeq_LUT = {}
+
+        # Set up dictionary to accumulate results
+        PhosphoPep_UniProtSeq_LUT = {}
+
+        # Execute query for tuples without fetching the results yet
+        ppep_pep_uniprotseq_cur = cur.execute(PPEP_PEP_UNIPROTSEQ_SQL)
+
+        while batch := ppep_pep_uniprotseq_cur.fetchmany(size=50):
+            if None == batch:
+                # handle case where no records are returned
+                break
+            for (phospho_pep, dephospho_pep, sequence) in batch:
+                #do interesting stuff here...
+                PhosphoPep_UniProtSeq_LUT[phospho_pep]                  = phospho_pep
+                PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = dephospho_pep
+                if dephospho_pep not in DephosphoPep_UniProtSeq_LUT:
+                    DephosphoPep_UniProtSeq_LUT[dephospho_pep] = set()
+                    DephosphoPep_UniProtSeq_LUT[(dephospho_pep,DESCRIPTION)]  = []
+                    DephosphoPep_UniProtSeq_LUT[(dephospho_pep,GENE_NAME)]    = []
+                    DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)]   = []
+                    DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)]     = []
+                DephosphoPep_UniProtSeq_LUT[dephospho_pep].add(phospho_pep)
+
+                #ACE print("ppep:'%s' dephospho_pep:'%s' sequence:'%s'" % (phospho_pep, dephospho_pep, sequence))
+                if sequence not in DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)]:
+                    DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)].append(sequence)
+                for phospho_pep in DephosphoPep_UniProtSeq_LUT[dephospho_pep]:
+                    if phospho_pep != phospho_pep:
+                        print("phospho_pep:'%s' phospho_pep:'%s'" % (phospho_pep, phospho_pep))
+                    if phospho_pep not in PhosphoPep_UniProtSeq_LUT:
+                        PhosphoPep_UniProtSeq_LUT[phospho_pep]                  = phospho_pep
+                        PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = dephospho_pep
+                    r = list(zip(
+                       [s for s in UniProtSeqLUT[(sequence,UNIPROT_ID)]],
+                       [s for s in UniProtSeqLUT[(sequence,GENE_NAME)]],
+                       [s for s in UniProtSeqLUT[(sequence,DESCRIPTION)]]
+                       ))
+                    # Sort by `UniProt_ID`
+                    #   ref: https://stackoverflow.com/a/4174955/15509512
+                    r = sorted(r, key=operator.itemgetter(0))
+                    # Get one tuple for each `phospho_pep`
+                    #   in DephosphoPep_UniProtSeq_LUT[dephospho_pep]
+                    for (upid, gn, desc) in r:
+                        # Append pseudo-tuple per UniProt_ID but only when it is not present
+                        if upid not in DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)]:
+                            DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)].append(upid)
+                            DephosphoPep_UniProtSeq_LUT[(dephospho_pep,DESCRIPTION)].append(desc)
+                            DephosphoPep_UniProtSeq_LUT[(dephospho_pep,GENE_NAME)].append(gn)
+
+        # Close SwissProt SQLite database; clean up local variables
+        conn.close()
+        # wipe local variables
+        phospho_pep = dephospho_pep = sequence = 0
+        upid = gn = desc = r = ''
+
+        ## ----------- Get SwissProt data from SQLite database (finish) -----------
+
+        end_time = time.process_time() #timer
+        print("%0.6f finished reading and decoding '%s' [0.4]" % (end_time - start_time,upstream_map_filename_tab), file=sys.stderr)
+
+        print('{:>10} unique upstream phosphopeptides tested'.format(str(len(upstream_map_p_peptide_list))))
+
+        #Read in Upstream tabular file
+        # We are discarding the intensity data; so read it as text
+        upstream_data = pandas.read_table(
+            upstream_map_filename_tab,
+            dtype='str',
+            index_col = 0
+            )
+
+        end_time = time.process_time() #timer
+        print("%0.6f read Upstream Map from file [1g_1]" % (end_time - start_time,), file=sys.stderr) #timer
+
+        upstream_data.index = upstream_map_p_peptide_list
+
+
+        end_time = time.process_time() #timer
+        print("%0.6f added index to Upstream Map [1g_2]" % (end_time - start_time,), file=sys.stderr) #timer
+
+
+        #trim upstream_data to include only the upstream map columns
+        old_cols = upstream_data.columns.tolist()
+        i = 0
+        first_intensity = -1
+        last_intensity  = -1
+        intensity_re = re.compile('Intensity.*')
+        for col_name in old_cols:
+            m = intensity_re.match(col_name)
+            if m:
+                last_intensity = i
+                if first_intensity == -1:
+                    first_intensity = i
+            i += 1
+        #print('last intensity = %d' % last_intensity)
+        col_PKCalpha = last_intensity + 2
+        col_firstIntensity = first_intensity
+
+        data_in_cols = [old_cols[0]] + old_cols[first_intensity:last_intensity+1]
+
+        if upstream_data.empty:
+            print("upstream_data is empty")
+            exit(0)
+
+        data_in = upstream_data.copy(deep=True)[data_in_cols]
+
+        # Convert floating-point integers to int64 integers
+        #   ref: https://stackoverflow.com/a/68497603/15509512
+        data_in[list(data_in.columns[1:])] = data_in[
+            list(data_in.columns[1:])].astype('float64').apply(np.int64)
+
+        #create another phosphopeptide column that will be used to join later;
+        #  MAY need to change depending on Phosphopeptide column position
+        #data_in[PHOSPHOPEPTIDE_MATCH] = data_in[data_in.columns.tolist()[0]]
+        data_in[PHOSPHOPEPTIDE_MATCH] = data_in.index
+
+
+
+
+        end_time = time.process_time() #timer
+        print("%0.6f set data_in[PHOSPHOPEPTIDE_MATCH] [A]" % (end_time - start_time,), file=sys.stderr) #timer
+
+        # Produce a dictionary of metadata for a single phosphopeptide.
+        #   This is a replacement of `UniProtInfo_subdict` in the original code.
+        def pseq_to_subdict(phospho_pep):
+            #ACE print("calling pseq_to_subdict, %s" % phospho_pep);
+            # Strip "p" from phosphopeptide sequence
+            dephospho_pep = re_phos.sub('',phospho_pep)
+
+            # Determine number of phosphoresidues in phosphopeptide
+            numps = len(phospho_pep) - len(dephospho_pep)
+
+            # Determine location(s) of phosphoresidue(s) in phosphopeptide
+            #   (used later for Phosphoresidue, Sequence7, and Sequence10)
+            ploc = [] #list of p locations
+            i = 0
+            p = phospho_pep
+            while i < numps:
+                ploc.append(p.find("p"))
+                p = p[:p.find("p")] + p[p.find("p")+1:]
+                i +=1
+
+
+            # Establish nested dictionary
+            result = {}
+            result[SEQUENCE] = []
+            result[UNIPROT_ID] = []
+            result[DESCRIPTION] = []
+            result[GENE_NAME] = []
+            result[PHOSPHORESIDUE] = []
+            result[SEQUENCE7] = []
+            result[SEQUENCE10] = []
+
+            # Add stripped sequence to dictionary
+            result[SEQUENCE].append(dephospho_pep)
+
+            # Locate dephospho_pep in DephosphoPep_UniProtSeq_LUT
+            dephos = DephosphoPep_UniProtSeq_LUT[dephospho_pep]
+
+            # Locate phospho_pep in PhosphoPep_UniProtSeq_LUT
+            ### Caller may elect to:
+            ## try:
+            ##     ...
+            ## except PreconditionError as pe:
+            ##     print("'{expression}': {message}".format(
+            ##             expression = pe.expression,
+            ##             message = pe.message))
+            ##             )
+            ##         )
+            if dephospho_pep not in DephosphoPep_UniProtSeq_LUT:
+                raise PreconditionError( dephospho_pep,
+                    'dephosphorylated phosphopeptide not found in DephosphoPep_UniProtSeq_LUT'
+                    )
+            if phospho_pep not in PhosphoPep_UniProtSeq_LUT:
+                raise PreconditionError( dephospho_pep,
+                    'no matching phosphopeptide found in PhosphoPep_UniProtSeq_LUT'
+                    )
+            if dephospho_pep != PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)]:
+                raise PreconditionError( dephospho_pep,
+                    "dephosphorylated phosphopeptide does not match " +
+                    "PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = " +
+                    PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)]
+                    )
+            result[SEQUENCE] = [dephospho_pep]
+            result[UNIPROT_ID] = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)]
+            result[DESCRIPTION] = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,DESCRIPTION)]
+            result[GENE_NAME] = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,GENE_NAME)]
+            if (dephospho_pep,SEQUENCE) not in DephosphoPep_UniProtSeq_LUT:
+               raise PreconditionError( dephospho_pep,
+                    'no matching phosphopeptide found in DephosphoPep_UniProtSeq_LUT'
+                    )
+            UniProtSeqList = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)]
+            if len (UniProtSeqList) < 1:
+               print("Skipping DephosphoPep_UniProtSeq_LUT[('%s',SEQUENCE)] because value has zero length" % dephospho_pep)
+               # raise PreconditionError(
+               #     "DephosphoPep_UniProtSeq_LUT[('" + dephospho_pep + ",SEQUENCE)",
+               #      'value has zero length'
+               #      )
+            for UniProtSeq in UniProtSeqList:
+                i = 0
+                phosphoresidues = []
+                seq7s_set = set()
+                seq7s = []
+                seq10s_set = set()
+                seq10s = []
+                while i < len(ploc):
+                    start = UniProtSeq.find(dephospho_pep)
+                    psite = start+ploc[i] #location of phosphoresidue on protein sequence
+
+                    #add Phosphoresidue
+                    phosphosite = "p"+str(UniProtSeq)[psite]+str(psite+1)
+                    phosphoresidues.append(phosphosite)
+
+                    #Add Sequence7
+                    if psite < 7: #phospho_pep at N terminus
+                        seq7 = str(UniProtSeq)[:psite+8]
+                        if seq7[psite] == "S": #if phosphosresidue is serine
+                            pres = "s"
+                        elif seq7[psite] == "T": #if phosphosresidue is threonine
+                            pres = "t"
+                        elif seq7[psite] == "Y": #if phosphoresidue is tyrosine
+                            pres = "y"
+                        else: # if not pSTY
+                            pres = "?"
+                        seq7 = seq7[:psite] + pres + seq7[psite+1:psite+8]
+                        while len(seq7) < 15: #add appropriate number of "_" to the front
+                            seq7 = "_" + seq7
+                    elif len(UniProtSeq) - psite < 8: #phospho_pep at C terminus
+                        seq7 = str(UniProtSeq)[psite-7:]
+                        if seq7[7] == "S":
+                            pres = "s"
+                        elif seq7[7] == "T":
+                            pres = "t"
+                        elif seq7[7] == "Y":
+                            pres = "y"
+                        else:
+                            pres = "?"
+                        seq7 = seq7[:7] + pres + seq7[8:]
+                        while len(seq7) < 15: #add appropriate number of "_" to the back
+                            seq7 = seq7 + "_"
+                    else:
+                        seq7 = str(UniProtSeq)[psite-7:psite+8]
+                        pres = "" #phosphoresidue
+                        if seq7[7] == "S": #if phosphosresidue is serine
+                            pres = "s"
+                        elif seq7[7] == "T": #if phosphosresidue is threonine
+                            pres = "t"
+                        elif seq7[7] == "Y": #if phosphoresidue is tyrosine
+                            pres = "y"
+                        else: # if not pSTY
+                            pres = "?"
+                        seq7 = seq7[:7] + pres + seq7[8:]
+                    if seq7 not in seq7s_set:
+                        seq7s.append(seq7)
+                        seq7s_set.add(seq7)
+
+                    #add Sequence10
+                    if psite < 10: #phospho_pep at N terminus
+                        seq10 = str(UniProtSeq)[:psite] + "p" + str(UniProtSeq)[psite:psite+11]
+                    elif len(UniProtSeq) - psite < 11: #phospho_pep at C terminus
+                        seq10 = str(UniProtSeq)[psite-10:psite] + "p" + str(UniProtSeq)[psite:]
+                    else:
+                        seq10 = str(UniProtSeq)[psite-10:psite+11]
+                        seq10 = seq10[:10] + "p" + seq10[10:]
+                    if seq10 not in seq10s_set:
+                        seq10s.append(seq10)
+                        seq10s_set.add(seq10)
+
+                    i+=1
+
+                result[PHOSPHORESIDUE].append(phosphoresidues)
+                result[SEQUENCE7].append(seq7s)
+                # result[SEQUENCE10] is a list of lists of strings
+                result[SEQUENCE10].append(seq10s)
+
+
+
+
+            r = list(zip(
+               result[UNIPROT_ID],
+               result[GENE_NAME],
+               result[DESCRIPTION],
+               result[PHOSPHORESIDUE]
+               ))
+            # Sort by `UniProt_ID`
+            #   ref: https://stackoverflow.com//4174955/15509512
+            s = sorted(r, key=operator.itemgetter(0))
+
+            result[UNIPROT_ID] = []
+            result[GENE_NAME] = []
+            result[DESCRIPTION] = []
+            result[PHOSPHORESIDUE] = []
+
+            for r in s:
+                result[UNIPROT_ID].append(r[0])
+                result[GENE_NAME].append(r[1])
+                result[DESCRIPTION].append(r[2])
+                result[PHOSPHORESIDUE].append(r[3])
+
+
+            #convert lists to strings in the dictionary
+            for key,value in result.items():
+                if key not in [PHOSPHORESIDUE, SEQUENCE7, SEQUENCE10]:
+                    result[key] = '; '.join(map(str, value))
+                elif key in [SEQUENCE10]:
+                    # result[SEQUENCE10] is a list of lists of strings
+                    joined_value = ''
+                    joined_set = set()
+                    sep = ''
+                    for valL in value:
+                        # valL is a list of strings
+                        for val in valL:
+                            # val is a string
+                            if val not in joined_set:
+                                joined_set.add(val)
+                                #joined_value += sep + '; '.join(map(str, val))
+                                joined_value += sep + val
+                                sep = '; '
+                    # joined_value is a string
+                    result[key] = joined_value
+
+
+            newstring = '; '.join(
+                [', '.join(l) for l in result[PHOSPHORESIDUE]]
+                )
+            ### #separate the isoforms in PHOSPHORESIDUE column with ";"
+            ### oldstring = result[PHOSPHORESIDUE]
+            ### oldlist = list(oldstring)
+            ### newstring = ""
+            ### i = 0
+            ### for e in oldlist:
+            ###     if e == ";":
+            ###         if numps > 1:
+            ###             if i%numps:
+            ###                 newstring = newstring + ";"
+            ###             else:
+            ###                 newstring = newstring + ","
+            ###         else:
+            ###             newstring = newstring + ";"
+            ###         i +=1
+            ###     else:
+            ###         newstring = newstring + e
+            result[PHOSPHORESIDUE] = newstring
+
+
+            #separate sequence7's by |
+            oldstring = result[SEQUENCE7]
+            oldlist = oldstring
+            newstring = ""
+            for l in oldlist:
+              for e in l:
+                if e == ";":
+                    newstring = newstring + " |"
+                elif len(newstring) > 0 and 1 > newstring.count(e):
+                    newstring = newstring + " | " + e
+                elif 1 > newstring.count(e):
+                    newstring = newstring + e
+            result[SEQUENCE7] = newstring
+
+
+            return [phospho_pep, result]
+
+        # Construct list of [string, dictionary] lists
+        #   where the dictionary provides the SwissProt metadata for a phosphopeptide
+        result_list = [
+            catch(pseq_to_subdict,psequence)
+            for psequence
+            in data_in[PHOSPHOPEPTIDE_MATCH]
+            ]
+
+
+        end_time = time.process_time() #timer
+        print("%0.6f added SwissProt annotations to phosphopeptides [B]" % (end_time - start_time,), file=sys.stderr) #timer
+
+        # Construct dictionary from list of lists
+        #   ref: https://www.8bitavenue.com/how-to-convert-list-of-lists-to-dictionary-in-python/
+        UniProt_Info = {
+            result[0]:result[1]
+            for result
+            in result_list
+            if result is not None
+            }
+
+
+        end_time = time.process_time() #timer
+        print("%0.6f create dictionary mapping phosphopeptide to metadata dictionary [C]" % (end_time - start_time,), file=sys.stderr) #timer
+
+        #cosmetic: add N_A to phosphopeptide rows with no hits
+        p_peptide_list = []
+        for key in UniProt_Info:
+            p_peptide_list.append(key)
+            for nestedKey in UniProt_Info[key]:
+                if UniProt_Info[key][nestedKey] == "":
+                    UniProt_Info[key][nestedKey] = N_A
+
+        end_time = time.process_time() #timer
+        print("%0.6f performed cosmetic clean-up [D]" % (end_time - start_time,), file=sys.stderr) #timer
+
+        #convert UniProt_Info dictionary to dataframe
+        uniprot_df = pandas.DataFrame.transpose(pandas.DataFrame.from_dict(UniProt_Info))
+
+        #reorder columns to match expected output file
+        uniprot_df[PHOSPHOPEPTIDE] = uniprot_df.index #make index a column too
+
+
+        cols = uniprot_df.columns.tolist()
+        #cols = [cols[-1]]+cols[4:6]+[cols[1]]+[cols[2]]+[cols[6]]+[cols[0]]
+        #uniprot_df = uniprot_df[cols]
+        uniprot_df = uniprot_df[[
+            PHOSPHOPEPTIDE,
+            SEQUENCE10,
+            SEQUENCE7,
+            GENE_NAME,
+            PHOSPHORESIDUE,
+            UNIPROT_ID,
+            DESCRIPTION
+            ]]
+
+
+        end_time = time.process_time() #timer
+        print("%0.6f reordered columns to match expected output file [1]" % (end_time - start_time,), file=sys.stderr) #timer
+
+        #concat to split then groupby to collapse
+        seq7_df = pandas.concat([pandas.Series(row[PHOSPHOPEPTIDE], row[SEQUENCE7].split(' | '))
+                            for _, row in uniprot_df.iterrows()]).reset_index()
+        seq7_df.columns = [SEQUENCE7,PHOSPHOPEPTIDE]
+
+        # --- -------------- begin read PSP_Regulatory_sites ---------------------------------
+        #read in PhosphoSitePlus Regulatory Sites dataset
+        #ACE if (True):
+        ## ----------- Get PhosphoSitePlus Regulatory Sites data from SQLite database (start) -----------
+        conn = sql.connect(uniprot_sqlite)
+        regsites_df = pandas.read_sql_query(PSP_REGSITE_SQL, conn)
+        # Close SwissProt SQLite database
+        conn.close()
+        #ACE # Array indexes are zero-based
+        #ACE #   ref: https://en.wikipedia.org/wiki/Python_(programming_language)
+        #ACE RENAME_COLS = [ 'SITE_PLUSMINUS_7AA', 'DOMAIN', 'ON_FUNCTION', 'ON_PROCESS', 'ON_PROT_INTERACT'
+        #ACE               , 'ON_OTHER_INTERACT' , 'NOTES' , 'ORGANISM']
+        #ACE with pandas.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
+        #ACE     print(regsites_df)
+        ## ----------- Get PhosphoSitePlus Regulatory Sites data from SQLite database (finish) -----------
+        #ACE else:
+        #ACE     regsites_df = pandas.read_csv(phosphosite_filename_csv, header=3,skiprows=1-3)
+        #ACE     SITE_PLUSMINUS_7AA_SQL = SITE_PLUSMINUS_7AA
+        #ACE     #ACE # Array indexes are zero-based
+        #ACE     #ACE #   ref: https://en.wikipedia.org/wiki/Python_(programming_language)
+        #ACE     #ACE RENAME_COLS = [ 'GENE'          , 'PROTEIN'    , 'PROT_TYPE' , 'ACC_ID'          , 'GENE_ID'
+        #ACE     #ACE               , 'HU_CHR_LOC'    , 'ORGANISM'   , 'MOD_RSD'   , 'SITE_GRP_ID'     , 'SITE_+/-7_AA'
+        #ACE     #ACE               , 'DOMAIN'        , 'ON_FUNCTION', 'ON_PROCESS', 'ON_PROT_INTERACT', 'ON_OTHER_INTERACT'
+        #ACE     #ACE               , 'PMIDs'         , 'LT_LIT'     , 'MS_LIT'    , 'MS_CST'          , 'NOTES'
+        #ACE     #ACE               ]
+        #ACE     #ACE REGSITE_COL_SITE7AA = 9
+        #ACE     #ACE REGSITE_COL_PROTEIN = 1
+        #ACE     #ACE REGSITE_COL_DOMAIN = 10
+        #ACE     #ACE REGSITE_COL_PMIDs = 15
+
+        # ... -------------- end read PSP_Regulatory_sites ------------------------------------
+
+
+        #keep only the human entries in dataframe
+        if len(species) > 0:
+            print('Limit PhosphoSitesPlus records to species "' + species + '"')
+            regsites_df = regsites_df[regsites_df.ORGANISM == species]
+
+        #merge the seq7 df with the regsites df based off of the sequence7
+        merge_df = seq7_df.merge(regsites_df, left_on=SEQUENCE7, right_on=SITE_PLUSMINUS_7AA_SQL, how='left')
+        #ACE print(merge_df.columns.tolist()) #ACE
+
+        #after merging df, select only the columns of interest - note that PROTEIN is absent here
+        merge_df = merge_df[[PHOSPHOPEPTIDE,SEQUENCE7,ON_FUNCTION,ON_PROCESS, ON_PROT_INTERACT,ON_OTHER_INTERACT,ON_NOTES]]
+        #ACE print(merge_df.columns.tolist()) #ACE
+        #combine column values of interest into one FUNCTION_PHOSPHORESIDUE column"
+        merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[ON_FUNCTION].str.cat(merge_df[ON_PROCESS], sep="; ", na_rep="")
+        merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[FUNCTION_PHOSPHORESIDUE].str.cat(merge_df[ON_PROT_INTERACT], sep="; ", na_rep="")
+        merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[FUNCTION_PHOSPHORESIDUE].str.cat(merge_df[ON_OTHER_INTERACT], sep="; ", na_rep="")
+        merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[FUNCTION_PHOSPHORESIDUE].str.cat(merge_df[ON_NOTES], sep="; ", na_rep="")
+
+        #remove the columns that were combined
+        merge_df = merge_df[[PHOSPHOPEPTIDE,SEQUENCE7,FUNCTION_PHOSPHORESIDUE]]
+
+        #ACE print(merge_df) #ACE
+        #ACE print(merge_df.columns.tolist()) #ACE
+
+        end_time = time.process_time() #timer
+        print("%0.6f merge regsite metadata [1a]" % (end_time - start_time,), file=sys.stderr) #timer
+
+        #cosmetic changes to Function Phosphoresidue column
+        fp_series = pandas.Series(merge_df[FUNCTION_PHOSPHORESIDUE])
+
+        end_time = time.process_time() #timer
+        print("%0.6f more cosmetic changes [1b]" % (end_time - start_time,), file=sys.stderr) #timer
+
+        i = 0
+        while i < len(fp_series):
+            #remove the extra ";" so that it looks more professional
+            if fp_series[i] == "; ; ; ; ": #remove ; from empty hits
+                fp_series[i] = ""
+            while fp_series[i].endswith("; "): #remove ; from the ends
+                fp_series[i] = fp_series[i][:-2]
+            while fp_series[i].startswith("; "): #remove ; from the beginning
+                fp_series[i] = fp_series[i][2:]
+            fp_series[i] = fp_series[i].replace("; ; ; ; ", "; ")
+            fp_series[i] = fp_series[i].replace("; ; ; ", "; ")
+            fp_series[i] = fp_series[i].replace("; ; ", "; ")
+
+            #turn blanks into N_A to signify the info was searched for but cannot be found
+            if fp_series[i] == "":
+                fp_series[i] = N_A
+
+            i += 1
+        merge_df[FUNCTION_PHOSPHORESIDUE] = fp_series
+
+        end_time = time.process_time() #timer
+        print("%0.6f cleaned up semicolons [1c]" % (end_time - start_time,), file=sys.stderr) #timer
+
+        #merge uniprot df with merge df
+        uniprot_regsites_merged_df = uniprot_df.merge(merge_df, left_on=PHOSPHOPEPTIDE, right_on=PHOSPHOPEPTIDE,how="left")
+
+        #collapse the merged df
+        uniprot_regsites_collapsed_df = pandas.DataFrame(
+            uniprot_regsites_merged_df
+            .groupby(PHOSPHOPEPTIDE)[FUNCTION_PHOSPHORESIDUE]
+            .apply(lambda x: ppep_join(x)))
+            #.apply(lambda x: "%s" % ' | '.join(x)))
+
+
+        end_time = time.process_time() #timer
+        print("%0.6f collapsed pandas dataframe [1d]" % (end_time - start_time,), file=sys.stderr) #timer
+
+        uniprot_regsites_collapsed_df[PHOSPHOPEPTIDE] = uniprot_regsites_collapsed_df.index #add df index as its own column
+
+
+        #rename columns
+        uniprot_regsites_collapsed_df.columns = [FUNCTION_PHOSPHORESIDUE, 'ppp']
+
+
+
+        #select columns to be merged to uniprot_df
+        #ACE cols = regsites_df.columns.tolist()
+        #ACE print(cols) #ACE
+        #ACE if len(cols) > 8:
+        #ACE     cols = [cols[9]]+[cols[1]]+cols[10:15]
+        #ACE     #ACE cols = [cols[9]]+[cols[1]]+cols[10:15]
+        #ACE     print(cols) #ACE
+        #ACE regsite_merge_df = regsites_df[cols]
+
+        end_time = time.process_time() #timer
+        print("%0.6f selected columns to be merged to uniprot_df [1e]" % (end_time - start_time,), file=sys.stderr) #timer
+
+        #add columns based on Sequence7 matching site_+/-7_AA
+        uniprot_regsite_df = pandas.merge(
+            left=uniprot_df,
+            right=uniprot_regsites_collapsed_df,
+            how='left',
+            left_on=PHOSPHOPEPTIDE,
+            right_on='ppp')
+
+        end_time = time.process_time() #timer
+        print("%0.6f added columns based on Sequence7 matching site_+/-7_AA [1f]" % (end_time - start_time,), file=sys.stderr) #timer
+
+        data_in.rename(
+            {'Protein description': PHOSPHOPEPTIDE},
+            axis='columns',
+            inplace=True
+            )
+
+
+
+        sort_start_time = time.process_time() #timer
+
+        #data_in.sort_values(PHOSPHOPEPTIDE_MATCH, inplace=True, kind='mergesort')
+        res2 = sorted(data_in[PHOSPHOPEPTIDE_MATCH].tolist(), key = lambda s: s.casefold())
+        data_in = data_in.loc[res2]
+
+        end_time = time.process_time() #timer
+        print("%0.6f sorting time [1f]" % (end_time - start_time,), file=sys.stderr) #timer
+
+
+
+        cols = [old_cols[0]] + old_cols[col_PKCalpha-1:]
+        upstream_data = upstream_data[cols]
+
+        end_time = time.process_time() #timer
+        print("%0.6f refactored columns for Upstream Map [1g]" % (end_time - start_time,), file=sys.stderr) #timer
+
+
+        #### #rename upstream columns in new list
+        #### new_cols = []
+        #### for name in cols:
+        ####     if "_NetworKIN" in name:
+        ####         name = name.split("_")[0]
+        ####     if " motif" in name:
+        ####         name = name.split(" motif")[0]
+        ####     if " sequence " in name:
+        ####         name = name.split(" sequence")[0]
+        ####     if "_Phosida" in name:
+        ####         name = name.split("_")[0]
+        ####     if "_PhosphoSite" in name:
+        ####         name = name.split("_")[0]
+        ####     new_cols.append(name)
+
+        #rename upstream columns in new list
+        def col_rename(name):
+            if "_NetworKIN" in name:
+                name = name.split("_")[0]
+            if " motif" in name:
+                name = name.split(" motif")[0]
+            if " sequence " in name:
+                name = name.split(" sequence")[0]
+            if "_Phosida" in name:
+                name = name.split("_")[0]
+            if "_PhosphoSite" in name:
+                name = name.split("_")[0]
+            return name
+
+        new_cols = [col_rename(col) for col in cols]
+        upstream_data.columns = new_cols
+
+
+
+        end_time = time.process_time() #timer
+        print("%0.6f renamed columns for Upstream Map [1h_1]" % (end_time - start_time,), file=sys.stderr) #timer
+
+
+        # Create upstream_data_cast as a copy of upstream_data
+        #   but with first column substituted by the phosphopeptide sequence
+        upstream_data_cast = upstream_data.copy()
+        new_cols_cast = new_cols
+        new_cols_cast[0] = 'p_peptide'
+        upstream_data_cast.columns = new_cols_cast
+        upstream_data_cast['p_peptide'] = upstream_data.index
+        new_cols_cast0 = new_cols_cast[0]
+
+        # --- -------------- begin read upstream_data_melt ------------------------------------
+        ## ----------- Get melted kinase mapping data from SQLite database (start) -----------
+        conn = sql.connect(uniprot_sqlite)
+        upstream_data_melt_df = pandas.read_sql_query(PPEP_MELT_SQL, conn)
+        # Close SwissProt SQLite database
+        conn.close()
+        upstream_data_melt = upstream_data_melt_df.copy()
+        upstream_data_melt.columns = ['p_peptide', 'characterization', 'X']
+        upstream_data_melt['characterization'] = [
+                col_rename(s)
+                for s in upstream_data_melt['characterization']
+                ]
+
+        print('%0.6f upstream_data_melt_df initially has %d rows' %
+              (end_time - start_time, len(upstream_data_melt.axes[0]))
+              , file=sys.stderr)
+        # ref: https://stackoverflow.com/a/27360130/15509512
+        #      e.g. df.drop(df[df.score < 50].index, inplace=True)
+        upstream_data_melt.drop(
+            upstream_data_melt[upstream_data_melt.X != 'X'].index,
+            inplace = True
+            )
+        print('%0.6f upstream_data_melt_df pre-dedup has %d rows' %
+              (end_time - start_time, len(upstream_data_melt.axes[0]))
+              , file=sys.stderr)
+        #ACE with pandas.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
+        #ACE     print(upstream_data_melt)
+        ## ----------- Get melted kinase mapping data from SQLite database (finish) -----------
+        # ... -------------- end read upstream_data_melt --------------------------------------
+
+        end_time = time.process_time() #timer
+        print("%0.6f melted and minimized Upstream Map dataframe [1h_2]" % (end_time - start_time,), file=sys.stderr) #timer
+        # ... end read upstream_data_melt
+
+        upstream_data_melt_index = upstream_data_melt.index
+        upstream_data_melt_p_peptide = upstream_data_melt['p_peptide']
+
+        end_time = time.process_time() #timer
+        print("%0.6f indexed melted Upstream Map [1h_2a]" % (end_time - start_time,), file=sys.stderr) #timer
+
+        upstream_delta_melt_LoL = upstream_data_melt.values.tolist()
+
+        melt_dict = {}
+        for key in upstream_map_p_peptide_list:
+            melt_dict[key] = []
+
+        for el in upstream_delta_melt_LoL:
+            (p_peptide, characterization, X) = tuple(el)
+            if p_peptide in melt_dict:
+                melt_dict[p_peptide].append(characterization)
+            else:
+                exit('Phosphopeptide %s not found in ppep_mapping_db: "phopsphopeptides" and "ppep_mapping_db" must both originate from the same run of mqppep_kinase_mapping' % (p_peptide))
+
+
+        end_time = time.process_time() #timer
+        print("%0.6f appended peptide characterizations [1h_2b]" % (end_time - start_time,), file=sys.stderr) #timer
+
+
+        # for key in upstream_map_p_peptide_list:
+        #     melt_dict[key] = ' | '.join(melt_dict[key])
+
+        for key in upstream_map_p_peptide_list:
+            melt_dict[key] = melt_join(melt_dict[key])
+
+        end_time = time.process_time() #timer
+        print("%0.6f concatenated multiple characterizations [1h_2c]" % (end_time - start_time,), file=sys.stderr) #timer
+
+        # map_dict is a dictionary of dictionaries
+        map_dict = {}
+        for key in upstream_map_p_peptide_list:
+            map_dict[key] = {}
+            map_dict[key][PUTATIVE_UPSTREAM_DOMAINS] = melt_dict[key]
+
+
+        end_time = time.process_time() #timer
+        print("%0.6f instantiated map dictionary [2]" % (end_time - start_time,), file=sys.stderr) #timer
+
+        #convert map_dict to dataframe
+        map_df = pandas.DataFrame.transpose(pandas.DataFrame.from_dict(map_dict))
+        map_df["p-peptide"] = map_df.index #make index a column too
+        cols_map_df = map_df.columns.tolist()
+        cols_map_df = [cols_map_df[1]] + [cols_map_df[0]]
+        map_df = map_df[cols_map_df]
+
+        #join map_df to uniprot_regsite_df
+        output_df = uniprot_regsite_df.merge(
+            map_df,
+            how="left",
+            left_on=PHOSPHOPEPTIDE,
+            right_on="p-peptide")
+
+        output_df = output_df[
+            [  PHOSPHOPEPTIDE, SEQUENCE10, SEQUENCE7, GENE_NAME, PHOSPHORESIDUE,
+                UNIPROT_ID, DESCRIPTION, FUNCTION_PHOSPHORESIDUE,
+                PUTATIVE_UPSTREAM_DOMAINS
+                ]
+            ]
+
+
+        # cols_output_prelim = output_df.columns.tolist()
+        #
+        # print("cols_output_prelim")
+        # print(cols_output_prelim)
+        #
+        # cols_output = cols_output_prelim[:8]+[cols_output_prelim[9]]+[cols_output_prelim[10]]
+        #
+        # print("cols_output with p-peptide")
+        # print(cols_output)
+        #
+        # cols_output = [col for col in cols_output if not col == "p-peptide"]
+        #
+        # print("cols_output")
+        # print(cols_output)
+        #
+        # output_df = output_df[cols_output]
+
+        #join output_df back to quantitative columns in data_in df
+        quant_cols = data_in.columns.tolist()
+        quant_cols = quant_cols[1:]
+        quant_data = data_in[quant_cols]
+
+        ## ----------- Write merge/filter metadata to SQLite database (start) -----------
+        # Open SwissProt SQLite database
+        conn = sql.connect(output_sqlite)
+        cur  = conn.cursor()
+
+        cur.executescript(MRGFLTR_DDL)
+
+        cur.execute(
+            CITATION_INSERT_STMT,
+            ('mrgfltr_metadata_view', CITATION_INSERT_PSP)
+            )
+        cur.execute(
+            CITATION_INSERT_STMT,
+            ('mrgfltr_metadata', CITATION_INSERT_PSP)
+            )
+        cur.execute(
+            CITATION_INSERT_STMT,
+            ('mrgfltr_metadata_view', CITATION_INSERT_PSP_REF)
+            )
+        cur.execute(
+            CITATION_INSERT_STMT,
+            ('mrgfltr_metadata', CITATION_INSERT_PSP_REF)
+            )
+
+        # Read ppep-to-sequence LUT
+        ppep_lut_df = pandas.read_sql_query(PPEP_ID_SQL, conn)
+        #ACE ppep_lut_df.info(verbose=True)
+        # write only metadata for merged/filtered records to SQLite
+        mrgfltr_metadata_df = output_df.copy()
+        # replace phosphopeptide seq with ppep.id
+        mrgfltr_metadata_df = ppep_lut_df.merge(
+            mrgfltr_metadata_df,
+            left_on='ppep_seq',
+            right_on=PHOSPHOPEPTIDE,
+            how='inner'
+            )
+        mrgfltr_metadata_df.drop(
+            columns=[PHOSPHOPEPTIDE, 'ppep_seq'],
+            inplace=True
+            )
+        #rename columns
+        mrgfltr_metadata_df.columns = MRGFLTR_METADATA_COLUMNS
+        #ACE mrgfltr_metadata_df.info(verbose=True)
+        mrgfltr_metadata_df.to_sql(
+            'mrgfltr_metadata',
+            con=conn,
+            if_exists='append',
+            index=False,
+            method='multi'
+            )
+
+        # Close SwissProt SQLite database
+        conn.close()
+        ## ----------- Write merge/filter metadata to SQLite database (finish) -----------
+
+        output_df = output_df.merge(quant_data, how="right", left_on=PHOSPHOPEPTIDE, right_on=PHOSPHOPEPTIDE_MATCH)
+        output_cols = output_df.columns.tolist()
+        output_cols = output_cols[:-1]
+        output_df = output_df[output_cols]
+
+        #cosmetic changes to Upstream column
+        output_df[PUTATIVE_UPSTREAM_DOMAINS] = output_df[PUTATIVE_UPSTREAM_DOMAINS].fillna("") #fill the NaN with "" for those Phosphopeptides that got a "WARNING: Failed match for " in the upstream mapping
+        us_series = pandas.Series(output_df[PUTATIVE_UPSTREAM_DOMAINS])
+        i = 0
+        while i < len(us_series):
+            #turn blanks into N_A to signify the info was searched for but cannot be found
+            if us_series[i] == "":
+                us_series[i] = N_A
+            i += 1
+        output_df[PUTATIVE_UPSTREAM_DOMAINS] = us_series
+
+        end_time = time.process_time() #timer
+        print("%0.6f establisheed output [3]" % (end_time - start_time,), file=sys.stderr) #timer
+
+        (output_rows, output_cols) = output_df.shape
+
+        #output_df = output_df[cols].convert_dtypes(infer_objects=True, convert_string=True, convert_integer=True, convert_boolean=True, convert_floating=True)
+        output_df = output_df.convert_dtypes(convert_integer=True)
+
+
+        #Output onto Final CSV file
+        output_df.to_csv(output_filename_csv, index=False)
+        output_df.to_csv(output_filename_tab, quoting=None, sep='\t', index=False)
+
+        end_time = time.process_time() #timer
+        print("%0.6f wrote output [4]" % (end_time - start_time,), file=sys.stderr) #timer
+
+        print('{:>10} phosphopeptides written to output'.format(str(output_rows)))
+
+        end_time = time.process_time() #timer
+        print("%0.6f seconds of non-system CPU time were consumed" % (end_time - start_time,) , file=sys.stderr) #timer
+
+
+        #Rev. 7/1/2016
+        #Rev. 7/3/2016 : fill NaN in Upstream column to replace to N/A's
+        #Rev. 7/3/2016:  renamed Upstream column to PUTATIVE_UPSTREAM_DOMAINS
+        #Rev. 12/2/2021: Converted to Python from ipynb; use fast Aho-Corasick searching; \
+        #                read from SwissProt SQLite database
+        #Rev. 12/9/2021: Transfer code to Galaxy tool wrapper
+
+        #############################################
+        # copied from Excel Output Script.ipynb END #
+        #############################################
+
+    try:
+        catch(mqpep_getswissprot,)
+        exit(0)
+    except Exception as e:
+        exit('Internal error running mqpep_getswissprot(): %s' % (e))
+
+# Run the tool only when this file is executed as a script (not on import).
+if __name__ == "__main__":
+    __main__()
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/search_ppep.py	Mon Mar 07 19:05:01 2022 +0000
@@ -0,0 +1,512 @@
+#!/usr/bin/env python
+# Search and memoize phosphopeptides in Swiss-Prot SQLite table UniProtKB
+
+import argparse
+import os.path
+import sqlite3
+import re
+from codecs import getreader as cx_getreader
+import time
+
+# For Aho-Corasick search for fixed set of substrings
+# - add_word
+# - make_automaton
+# - iter
+import ahocorasick
+# Support map over auto.iter(...)
+# - itemgetter
+import operator
+#import hashlib
+
+# ref: https://stackoverflow.com/a/8915613/15509512
+#   answers: "How to handle exceptions in a list comprehensions"
+#   usage:
+#       from math import log
+#       eggs = [1,3,0,3,2]
+#       print([x for x in [catch(log, egg) for egg in eggs] if x is not None])
+#   producing:
+#       for <built-in function log>
+#         with args (0,)
+#         exception: math domain error
+#       [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453]
+def catch(func, *args, handle=lambda e : e, **kwargs):
+    """Call ``func(*args, **kwargs)``, reporting errors instead of raising.
+
+    Args:
+        func: Callable to invoke.
+        *args: Positional arguments forwarded to ``func``.
+        handle: Retained for backward compatibility; it is never called.
+        **kwargs: Keyword arguments forwarded to ``func``.
+
+    Returns:
+        The result of ``func``, or ``None`` when ``func`` raised an
+        ``Exception`` (the exception and its stack trace are printed).
+    """
+    # Imported locally because this file has no file-level import of these
+    # modules; without them the handler itself would die with a NameError.
+    import sys
+    import traceback
+    try:
+        return func(*args, **kwargs)
+    except Exception as e:
+        print("For %s" % str(func))
+        print("  with args %s" % str(args))
+        print("  caught exception: %s" % str(e))
+        (ty, va, tb) = sys.exc_info()
+        print("  stack trace: " + str(traceback.format_exception(ty, va, tb)))
+        return None # was handle(e)
+
+def __main__():
+    """Search UniProtKB sequences for the input phosphopeptides.
+
+    Parses the command line, drops and re-creates the peptide-related tables
+    and views in the SQLite database, loads the phosphopeptides from the
+    tabular input, locates their dephosphorylated sequences within the
+    UniProtKB sequences, records the match positions, and prints summary
+    counts.
+    """
+    # Getter for element 1 of a sequence.
+    # NOTE(review): ITEM_GETTER is not referenced again in this function;
+    # confirm whether it can be removed.
+    ITEM_GETTER = operator.itemgetter(1)
+
+    # Script that removes the views and tables that CREATE_TABLES_SQL creates;
+    # it is executed first so that creation never collides with leftovers.
+    DROP_TABLES_SQL = '''
+        DROP VIEW  IF EXISTS ppep_gene_site_view;
+        DROP VIEW  IF EXISTS uniprot_view;
+        DROP VIEW  IF EXISTS uniprotkb_pep_ppep_view;
+        DROP VIEW  IF EXISTS ppep_intensity_view;
+        DROP VIEW  IF EXISTS ppep_metadata_view;
+
+        DROP TABLE IF EXISTS sample;
+        DROP TABLE IF EXISTS ppep;
+        DROP TABLE IF EXISTS site_type;
+        DROP TABLE IF EXISTS deppep_UniProtKB;
+        DROP TABLE IF EXISTS deppep;
+        DROP TABLE IF EXISTS ppep_gene_site;
+        DROP TABLE IF EXISTS ppep_metadata;
+        DROP TABLE IF EXISTS ppep_intensity;
+    '''
+
+    # Schema script executed on every run:
+    #   deppep            - distinct dephosphorylated peptide sequences
+    #   deppep_UniProtKB  - start/end positions of a deppep within a
+    #                       UniProtKB sequence
+    #   ppep              - phosphopeptides, each linked to its deppep
+    #   site_type, sample - lookup tables of unique names
+    #   ppep_gene_site, ppep_metadata, ppep_intensity
+    #                     - per-phosphopeptide annotation and intensity data
+    #   *_view            - joins of the above (and of UniProtKB) for reporting
+    # The UniProtKB table is referenced but not created by this script.
+    # The "ON CONFLICT IGNORE" clauses turn duplicate inserts into no-ops.
+    CREATE_TABLES_SQL = '''
+        CREATE TABLE deppep
+          ( id INTEGER PRIMARY KEY
+          , seq TEXT UNIQUE                            ON CONFLICT IGNORE
+          )
+          ;
+        CREATE TABLE deppep_UniProtKB
+          ( deppep_id    INTEGER REFERENCES deppep(id) ON DELETE CASCADE
+          , UniProtKB_id TEXT REFERENCES UniProtKB(id) ON DELETE CASCADE
+          , pos_start    INTEGER
+          , pos_end      INTEGER
+          , PRIMARY KEY (deppep_id, UniProtKB_id, pos_start, pos_end)
+                                                       ON CONFLICT IGNORE
+          )
+          ;
+        CREATE TABLE ppep
+          ( id        INTEGER PRIMARY KEY
+          , deppep_id INTEGER REFERENCES deppep(id)    ON DELETE CASCADE
+          , seq       TEXT UNIQUE                      ON CONFLICT IGNORE
+          , scrubbed  TEXT
+          );
+        CREATE TABLE site_type
+          ( id        INTEGER PRIMARY KEY
+          , type_name TEXT UNIQUE                      ON CONFLICT IGNORE
+          );
+        CREATE INDEX idx_ppep_scrubbed on ppep(scrubbed)
+          ;
+        CREATE TABLE sample
+          ( id        INTEGER PRIMARY KEY
+          , name      TEXT UNIQUE                      ON CONFLICT IGNORE
+          )
+          ;
+        CREATE VIEW uniprot_view AS
+          SELECT DISTINCT
+              Uniprot_ID
+            , Description
+            , Organism_Name
+            , Organism_ID
+            , Gene_Name
+            , PE
+            , SV
+            , Sequence
+            , Description || ' OS=' ||
+                Organism_Name || ' OX=' || Organism_ID ||
+                CASE WHEN Gene_Name = 'N/A' THEN '' ELSE ' GN='|| Gene_Name END ||
+                CASE WHEN PE = 'N/A' THEN '' ELSE ' PE='|| PE END ||
+                CASE WHEN SV = 'N/A' THEN '' ELSE ' SV='|| SV END
+                                                       AS long_description
+            , Database
+          FROM UniProtKB
+          ;
+        CREATE VIEW uniprotkb_pep_ppep_view AS
+          SELECT   deppep_UniProtKB.UniprotKB_ID       AS accession
+                 , deppep_UniProtKB.pos_start          AS pos_start
+                 , deppep_UniProtKB.pos_end            AS pos_end
+                 , deppep.seq                          AS peptide
+                 , ppep.seq                            AS phosphopeptide
+                 , ppep.scrubbed                       AS scrubbed
+                 , uniprot_view.Sequence               AS sequence
+                 , uniprot_view.Description            AS description
+                 , uniprot_view.long_description       AS long_description
+                 , ppep.id                             AS ppep_id
+          FROM     ppep, deppep, deppep_UniProtKB, uniprot_view
+          WHERE    deppep.id = ppep.deppep_id
+          AND      deppep.id = deppep_UniProtKB.deppep_id
+          AND      deppep_UniProtKB.UniprotKB_ID = uniprot_view.Uniprot_ID
+          ORDER BY UniprotKB_ID, deppep.seq, ppep.seq
+          ;
+        CREATE TABLE ppep_gene_site
+          ( ppep_id         INTEGER REFERENCES ppep(id)
+          , gene_names      TEXT
+          , site_type_id    INTEGER REFERENCES site_type(id)
+          , kinase_map      TEXT
+          , PRIMARY KEY (ppep_id, kinase_map)          ON CONFLICT IGNORE
+          )
+          ;
+        CREATE VIEW ppep_gene_site_view AS
+          SELECT DISTINCT
+            ppep.seq   AS phospho_peptide
+          , ppep_id
+          , gene_names
+          , type_name
+          , kinase_map
+          FROM
+            ppep, ppep_gene_site, site_type
+          WHERE
+              ppep_gene_site.ppep_id = ppep.id
+            AND
+              ppep_gene_site.site_type_id = site_type.id
+          ORDER BY
+            ppep.seq
+            ;
+        CREATE TABLE ppep_metadata
+          ( ppep_id             INTEGER REFERENCES ppep(id)
+          , protein_description TEXT
+          , gene_name           TEXT
+          , FASTA_name          TEXT
+          , phospho_sites       TEXT
+          , motifs_unique       TEXT
+          , accessions          TEXT
+          , motifs_all_members  TEXT
+          , domain              TEXT
+          , ON_FUNCTION         TEXT
+          , ON_PROCESS          TEXT
+          , ON_PROT_INTERACT    TEXT
+          , ON_OTHER_INTERACT   TEXT
+          , notes               TEXT
+          , PRIMARY KEY (ppep_id)                      ON CONFLICT IGNORE
+          )
+          ;
+        CREATE VIEW ppep_metadata_view AS
+          SELECT DISTINCT
+              ppep.seq             AS phospho_peptide
+            , protein_description
+            , gene_name
+            , FASTA_name
+            , phospho_sites
+            , motifs_unique
+            , accessions
+            , motifs_all_members
+            , domain
+            , ON_FUNCTION
+            , ON_PROCESS
+            , ON_PROT_INTERACT
+            , ON_OTHER_INTERACT
+            , notes
+          FROM
+            ppep, ppep_metadata
+          WHERE
+              ppep_metadata.ppep_id = ppep.id
+          ORDER BY
+            ppep.seq
+            ;
+        CREATE TABLE ppep_intensity
+          ( ppep_id    INTEGER REFERENCES ppep(id)
+          , sample_id  INTEGER
+          , intensity  INTEGER
+          , PRIMARY KEY (ppep_id, sample_id)           ON CONFLICT IGNORE
+          )
+          ;
+        CREATE VIEW ppep_intensity_view AS
+          SELECT DISTINCT
+              ppep.seq             AS phospho_peptide
+            , sample.name          AS sample
+            , intensity
+          FROM
+            ppep, sample, ppep_intensity
+          WHERE
+              ppep_intensity.sample_id = sample.id
+            AND
+              ppep_intensity.ppep_id = ppep.id
+          ;
+    '''
+
+    # Query returning every (Sequence, Uniprot_ID) pair stored in UniProtKB.
+    UNIPROT_SEQ_AND_ID_SQL = '''
+        select    Sequence, Uniprot_ID
+             from UniProtKB
+    '''
+
+    # Parse Command Line
+    # (user-facing texts below spell "Phosphoproteomic" correctly; they
+    # previously read "Phopsphoproteomic")
+    parser = argparse.ArgumentParser(
+        description='Phosphoproteomic Enrichment phosphopeptide SwissProt search (in place in SQLite DB).'
+        )
+
+    # inputs:
+    #   Phosphopeptide data for experimental results, including the intensities
+    #   and the mapping to kinase domains, in tabular format.
+    #   nargs=1 stores the value as a one-element list, which is why the
+    #   parsed options are later read with index [0].
+    parser.add_argument(
+        '--phosphopeptides', '-p',
+        nargs=1,
+        required=True,
+        dest='phosphopeptides',
+        help='Phosphopeptide data for experimental results, generated by the Phosphoproteomic Enrichment Localization Filter tool'
+        )
+    #   SQLite database holding the UniProtKB table; updated in place.
+    parser.add_argument(
+        '--uniprotkb', '-u',
+        nargs=1,
+        required=True,
+        dest='uniprotkb',
+        help='UniProtKB/Swiss-Prot data, converted from FASTA format by the Phosphoproteomic Enrichment Kinase Mapping tool'
+        )
+    # optional reporting switches (all default to False)
+    parser.add_argument(
+        '--schema',
+        action='store_true',
+        dest='db_schema',
+        help='show updated database schema'
+        )
+    parser.add_argument(
+        '--warn-duplicates',
+        action='store_true',
+        dest='warn_duplicates',
+        help='show warnings for duplicated sequences'
+        )
+    parser.add_argument(
+        '--verbose',
+        action='store_true',
+        dest='verbose',
+        help='show somewhat verbose program tracing'
+        )
+    # "Make it so!" (parse the arguments)
+    options = parser.parse_args()
+    if options.verbose:
+        print("options: " + str(options) + "\n")
+
+    # Absolute path of the phosphopeptide input tabular file
+    # (e.g., "outputfile_STEP2.txt"); the option value is a one-element list.
+    phosphopeptides_arg = options.phosphopeptides
+    if phosphopeptides_arg is None:
+        exit('Argument "phosphopeptides" is required but not supplied')
+    try:
+        f_name = os.path.abspath(phosphopeptides_arg[0])
+    except Exception as e:
+        exit('Error parsing phosphopeptides argument: %s' % (e))
+
+    # Absolute path of the SQLite database that is read and updated in place
+    uniprotkb_arg = options.uniprotkb
+    if uniprotkb_arg is None:
+        exit('Argument "uniprotkb" is required but not supplied')
+    try:
+        db_name = os.path.abspath(uniprotkb_arg[0])
+    except Exception as e:
+        exit('Error parsing uniprotkb argument: %s' % (e))
+
+    # print("options.schema is %d" % options.db_schema)
+
+    # db_name = "demo/test.sqlite"
+    # f_name  = "demo/test_input.txt"
+
+    # Open the SQLite database. Two cursors share the connection: "cur" for
+    # general statements and "ker" as a second, independent cursor.
+    con = sqlite3.connect(db_name)
+    cur = con.cursor()
+    ker = con.cursor()
+
+    # Remove tables/views left over from a previous run before re-creating them
+    cur.executescript(DROP_TABLES_SQL)
+
+    # if options.db_schema:
+    #     print("\nAfter dropping tables/views that are to be created, schema is:")
+    #     cur.execute("SELECT sql FROM sqlite_master")
+    #     for row in cur.fetchall():
+    #         if row[0] is not None:
+    #             print("%s;" % row[0])
+
+    cur.executescript(CREATE_TABLES_SQL)
+
+    if options.db_schema:
+        print("\nAfter creating tables/views that are to be created, schema is:")
+        # Query "sqlite_master" rather than its alias "sqlite_schema": the
+        # alias is only recognized by SQLite 3.33.0 and later, whereas the
+        # original name is accepted by every version. Selecting the "sql"
+        # column by name also avoids depending on its position in the row.
+        cur.execute("SELECT sql FROM sqlite_master")
+        for (schema_sql,) in cur.fetchall():
+            # automatically created indexes have no SQL text
+            if schema_sql is not None:
+                print("%s;" % schema_sql)
+
+    def generate_ppep(f):
+        """Yield the first tab-delimited field of each data line of a file.
+
+        The file is decoded as latin-1. The first line (the header) is
+        skipped, and double-quote characters are removed from each yielded
+        field.
+
+        Args:
+            f: Path of the tabular phosphopeptide file.
+
+        Yields:
+            str: Text preceding the first tab of each line after the header.
+        """
+        # ref: https://stackoverflow.com/a/16713581/15509512
+        #      answer to "Use codecs to read file with correct encoding"
+        re_tab = re.compile('^[^\t]*')
+        re_quote = re.compile('"')
+        # The "with" block guarantees that the file is closed even when the
+        # generator is abandoned before exhaustion or an exception escapes.
+        with open(f, 'rb') as file1_encoded:
+            file1 = cx_getreader("latin-1")(file1_encoded)
+            # Iterating the reader calls readline() until it returns ''.
+            for count, line in enumerate(file1, start=1):
+                if count > 1:
+                    yield re_quote.sub('', re_tab.match(line)[0])
+
+    # Build an Aho-Corasick automaton from a trie
+    # - ref:
+    #   - https://pypi.org/project/pyahocorasick/
+    #   - https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm
+    #   - https://en.wikipedia.org/wiki/Trie
+    auto = ahocorasick.Automaton()
+    re_phos = re.compile('p')
+    # scrub out unsearchable characters per section
+    #   "Match the p_peptides to the @sequences array:"
+    # of the original
+    #   PhosphoPeptide Upstream Kinase Mapping.pl
+    # which originally read
+    #   $tmp_p_peptide =~ s/#//g;
+    #   $tmp_p_peptide =~ s/\d//g;
+    #   $tmp_p_peptide =~ s/\_//g;
+    #   $tmp_p_peptide =~ s/\.//g;
+    #
+    # The pattern must be a character class: without the brackets it would
+    # match only the literal text "0-9_", any one character, then "#", so
+    # nothing would ever be scrubbed.
+    re_scrub = re.compile(r'[0-9_.#]')
+    ppep_count = 0
+    for ppep in generate_ppep(f_name):
+        ppep_count += 1
+        # "scrubbed" keeps the phosphorylation marks ("p"); "deppep" is the
+        # bare sequence that is searched for within the UniProtKB sequences.
+        scrubbed = re_scrub.sub('',ppep)
+        deppep = re_phos.sub('',scrubbed)
+        if options.verbose:
+            print("deppep: %s; scrubbed: %s" % (deppep,scrubbed))
+        # A dephosphopeptide goes into the trie only the first time it is seen
+        cur.execute("SELECT id FROM deppep WHERE seq = (?)", (deppep,))
+        add_to_trie = cur.fetchone() is None
+        # Duplicate sequences are ignored by the table's UNIQUE constraint,
+        # so the id is looked up again after the insert.
+        cur.execute("INSERT INTO deppep(seq) VALUES (?)", (deppep,))
+        cur.execute("SELECT id FROM deppep WHERE seq = (?)", (deppep,))
+        deppep_id = cur.fetchone()[0]
+        if add_to_trie:
+            # Build the trie; the stored value carries the database id
+            auto.add_word(deppep, (deppep_id, deppep))
+        cur.execute(
+            "INSERT INTO ppep(seq, scrubbed, deppep_id) VALUES (?,?,?)",
+            (ppep, scrubbed, deppep_id)
+            )
+    # def generate_deppep():
+    #     cur.execute("SELECT seq FROM deppep")
+    #     for row in cur.fetchall():
+    #         yield row[0]
+
+    # Number of distinct dephosphopeptide sequences that were stored
+    # (each count(*) query returns exactly one row)
+    cur.execute("SELECT count(*) FROM (SELECT seq FROM deppep GROUP BY seq)")
+    deppep_count = cur.fetchone()[0]
+
+    # Number of distinct protein sequences available for searching
+    cur.execute("SELECT count(*) FROM (SELECT Sequence FROM UniProtKB GROUP BY Sequence)")
+    sequence_count = cur.fetchone()[0]
+
+    print("%d phosphopeptides were read from input" % ppep_count)
+    print("%d corresponding dephosphopeptides are represented in input" % deppep_count)
+    # Report sequences that occur more than once with the same gene name
+    cur.execute('''
+      SELECT Uniprot_ID, Gene_Name, Sequence
+      FROM   UniProtKB
+      WHERE  Sequence IN (
+        SELECT   Sequence
+        FROM     UniProtKB
+        GROUP BY Sequence, Gene_Name
+        HAVING   count(*) > 1
+        )
+      ORDER BY Sequence
+      ''')
+    duplicate_count = 0
+    old_seq = ''
+    # Rows arrive ordered by sequence, so a change of sequence starts a group
+    for accession, gene_name, sequence in cur.fetchall():
+        if duplicate_count == 0:
+            print("\nEach of the following sequences is associated with several accession IDs (which are listed in the first column) but the same gene ID (which is listed in the second column).")
+        starts_new_group = sequence != old_seq
+        if starts_new_group:
+            old_seq = sequence
+            duplicate_count += 1
+        # Per-row detail is printed only on request
+        if options.warn_duplicates:
+            if starts_new_group:
+                print("\n%s\t%s\t%s" % (accession, gene_name, sequence))
+            else:
+                print("%s\t%s" % (accession, gene_name))
+    if duplicate_count > 0:
+        print("\n%d sequences have duplicated accession IDs\n" % duplicate_count)
+
+    print("%s accession sequences will be searched\n" % sequence_count)
+
+    #print(auto.dump())
+
+    # Finalize the trie as an automaton (a finite-state machine)
+    auto.make_automaton()
+
+    # Start the query for sequences and accessions, then consume it in
+    # batches of 50 rows. The inserts go through the second cursor ("ker")
+    # because executing on "cur" would discard its pending result set.
+    uniprot_seq_and_id = cur.execute(UNIPROT_SEQ_AND_ID_SQL)
+    while True:
+        batch = uniprot_seq_and_id.fetchmany(size=50)
+        if not batch:
+            break
+        for Sequence, UniProtKB_id in batch:
+            if Sequence is None:
+                raise ValueError("UniProtKB_id %s, but Sequence is None: Check whether SwissProt file is missing sequence for this ID" % (UniProtKB_id,))
+            # Each hit carries the index at which the match ends together
+            # with the (deppep_id, deppep) pair stored in the trie.
+            for end_index, (insert_order, original_value) in auto.iter(Sequence):
+                match_start = 1 + end_index - len(original_value)
+                ker.execute('''
+                      INSERT INTO deppep_UniProtKB
+                        (deppep_id,UniProtKB_id,pos_start,pos_end)
+                      VALUES (?,?,?,?)
+                      ''', (insert_order, UniProtKB_id, match_start, end_index)
+                    )
+    # Summary statistics; every query below yields exactly one row whose
+    # first column is the message to print.
+    ker.execute("""
+        SELECT   count(*) || ' accession-peptide-phosphopeptide combinations were found'
+        FROM     uniprotkb_pep_ppep_view
+        """
+        )
+    print(ker.fetchone()[0])
+
+    ker.execute("""
+      SELECT   count(*) || ' accession matches were found', count(*) AS accession_count
+      FROM     (
+        SELECT   accession
+        FROM     uniprotkb_pep_ppep_view
+        GROUP BY accession
+        )
+      """
+      )
+    (message, accession_count) = ker.fetchone()
+    print(message)
+
+    ker.execute("""
+      SELECT   count(*) || ' peptide matches were found'
+      FROM     (
+        SELECT   peptide
+        FROM     uniprotkb_pep_ppep_view
+        GROUP BY peptide
+        )
+      """
+      )
+    print(ker.fetchone()[0])
+
+    ker.execute("""
+      SELECT   count(*) || ' phosphopeptide matches were found', count(*) AS phosphopeptide_count
+      FROM     (
+        SELECT   phosphopeptide
+        FROM     uniprotkb_pep_ppep_view
+        GROUP BY phosphopeptide
+        )
+      """
+      )
+    (message, phosphopeptide_count) = ker.fetchone()
+    print(message)
+
+    # Persist the changes, compact the database file, and release it
+    con.commit()
+    ker.execute('vacuum')
+    con.close()
+
+if __name__ == "__main__":
+    # Time the whole run with the performance counter and report it in
+    # whole milliseconds ("%d" truncates the fractional part).
+    wrap_start_time = time.perf_counter()
+    __main__()
+    wrap_stop_time = time.perf_counter()
+    elapsed_ms = (wrap_stop_time - wrap_start_time) * 1000
+    print("\nThe matching process took %d milliseconds to run.\n" % elapsed_ms)
+
+ # vim: sw=4 ts=4 et ai :
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/alpha_levels.tabular	Mon Mar 07 19:05:01 2022 +0000
@@ -0,0 +1,3 @@
+0.05
+0.1
+0.2
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/pSTY_motifs.tabular	Mon Mar 07 19:05:01 2022 +0000
@@ -0,0 +1,196 @@
+1	((E|D|A)(D|E)(E|D)(E|D)pS(E|D|A)(D|E|A)(E|D)(E|D))|(pS.(E|pS|pT))|(pS..(E|pS|pT))|((pS|pT)..(E|D))|(pS(D|E).(D|E).(D|E))|((D|E)pS(D|E).(D|E))|(pS(D|E)(D|E)(D|E))|((pS|pT)..(D|E))|((pS|pT)..(E|D|pS|pY))|((S|E|P|G)(D|S|N|E|P)(E|D|G|Q|W)(Y|E|D|S|W|T)(W|E|D)pS(D|E)(D|E|W|N)(E|D)(E|D|N|Q))	Casein Kinase II substrate motif (HPRD)
+2	((L|F|I)...R(Q|S|T)L(pS|pT)(M|L|I|V))|(..B.R..pS..)|(pS...(pS|pT))	MAPKAPK2 kinase substrate motif (HPRD)
+3	((M|V|L|I|F)(R|K|H)...(pS|pT)...(M|V|L|I|F))|((M|V|L|I)..(R|K|H).(pS|pT)...(M|V|L|I))|((M|V|L|I|F)(R|K|H)..(pS|pT)...(M|V|L|I|F))	AMP-activated protein kinase substrate motif (HPRD)
+4	((P|L|I|M).(L|I|D|E)pSQ)|(LpSQE)|(pSQ)	ATM kinase substrate motif (HPRD)
+5	((R|K).R..(pS|pT)(M|L|V|I))|(VFLGFpTYVAP)	p70 Ribosomal S6 kinase substrate motif (HPRD)
+6	((R|K).R..pS)|(RRR.pS)	MAPKAPK1 kinase substrate motif (HPRD)
+7	((R|K)pSP(R|P)(R|K|H))|((pS|pT)P.(R|K))|(HHH(R|K)pSPR(R|K)R)	Cdc2 kinase substrate motif (HPRD)
+8	((R|N)(F|L|M)(R|K)(R|K)pS(R|I|V|M)(R|I|M|V)(M|I|F|V)(I|F|M))|(FR.(pS|pT))|(RF(R|K)(R|K)pS(R|I)(R|I)MI)	NIMA kinase substrate motif (HPRD)
+9	((pS|pT)P.(K|R))|((K|R)(pS|pT)P)|((pS|pT)P(K|R))	Growth associated histone HI kinase substrate motif (HPRD)
+10	(..(pS|pT)E)|(.(pS|pT)...(A|P|S|T))	G protein-coupled receptor kinase 1 substrate motif (HPRD)
+11	(.R..(pS|pT).R.)|((pS|pT).(R|K))|((R|K)..(pS|pT))|((R|K)..(pS|pT).(R|K))|((K|R).(pS|pT))|((R|K).(pS|pT).(R|K))	PKC kinase substrate motif (HPRD)
+12	(.pSQ)|(P(pS|pT).)	DNA dependent Protein kinase substrate motif (HPRD)
+13	(AKRRRLSpSLRA)|(VRKRpTLRRL)	PAK1 kinase substrate motif (HPRD)
+14	(ARKGpSLRQ)|(R(R|F)RR(R|K)GpSF(R|K)(R|K))	PKC alpha kinase substrate motif (HPRD)
+15	(HpSTSDD)|(YRpSVDE)	Branched chain alpha-ketoacid dehydrogenase kinase substrate motif (HPRD)
+16	(KCSpTWP)|(R..pS)|(R.R..pS.P)|(YpTV)|(RS.(pS|pT).P)|(R.(Y|F).pS.P)|(RPVSSAApSVY)	14-3-3 domain binding motif (HPRD)
+17	(KK.RRpT(L|V).)|(KKR.RpT(L|V).)|((R|K).RR.(pS|pT)(L|V).)	DMPK1 kinase substrate motif (HPRD)
+18	(KKKKKK(pS|pT)...)|((R|K|Q|N)(M|C|W)(R|T|S|N)(E|D|S|N)(R|K|E|D|N)pS(S|D|E)(S|GC|D)(SM|R|N)(N|H|S|R|C))	TGF beta receptor kinase substrate motif (HPRD)
+19	(KRKQIpSVR)|((F|M|K)(R|K)(M|R|Q|F)(M|F|L|I)pS(F|I|M|L)(F|R|K)(L|I)(F|L|I))|((K|R)..pS(V|I))	Phosphorylase kinase substrate motif (HPRD)
+20	(KRQGpSVRR)|(R(K|E|R).pS)	PKC epsilon kinase substrate motif (HPRD)
+21	(P.(pS|pT)P)|(pSP)	ERK1, ERK2 Kinase substrate motif (HPRD)
+22	(P.(pS|pT)PP)|(..P.(pS|pT)PPP.)	ERK1,2 kinase substrate motif (HPRD)
+23	(PL(pS|pT)PIP(K|R|H))|(PL(pS|pT)P.(K|R|H))	CDK4 kinase substrate motif (HPRD)
+24	(PLpTLP)|(PLLpTP)|(PLpTP)|(PpTLP)|(PLpTLP)|(PpTLP)|(LpTP)	RAF1 kinase substrate motif (HPRD)
+25	(R..(pS|pT))|((K|F)(R|K)(Q|M)(Q|M|K|L|F)pS(F|I|M|L|V)(D|E|I)(L|M|K|I)(F|K))|((M|V|L|I|F).(R|K)..(pS|pT)..)|(R..pS)	Calmodulin-dependent protein kinase II substrate motif (HPRD)
+26	(R..pSPV)|(K(pS|pT)P.K)|(KpSP...K)|(KpSP..K)|(KpSP....K)|(KpTPAKEE)|(P.pSP)|(.(pS|pT)P)|(..pSP)	GSK-3, ERK1, ERK2, CDK5 substrate motif (HPRD)
+27	(R.R..(pS|pT)(F|L))|(R.R..(pS|pT))|(GRART(S|T)pSFAE)|((R|Q|K)(R|K|N|Q|P|H)(R|K)(R|S|T)(N|K|Q|H|D|P)pS(F|W|I|M|N|S)(S|T|H)(R|S|K)(S|T|P|Q))|((R|K).(R|K)(S|T).pS)	Akt kinase substrate motif (HPRD)
+28	(RR..pS)|(KR.RpS)|(KRR.pT)	ZIP kinase substrate motif (HPRD)
+29	(RR.pS(M|I|L|V|F|Y))|(R.pS)|(KR..pS)|(R..pS)|((R|K).(pS|pT))|(K..(pS|pT))|((R|K)(R|K).(pS|pT))|(K...(pS|pT))|((pS|pT).(R|K))|(RRRRpSIIFI)|(RR.pS)|(R(R|K).(pS|pT)(I|L|V|F|Y)(D|C|.).D)|(RR.pS)|(RRR(R|N)pSII(F|D))|((R|C|P|K)(R|A|P)(R|K)(R|K|S)(N|L|S|M|P)Ps(I|L|V|C)(S|P|H|Q)(S|W|Q)(S|L|G))	PKA kinase substrate motif (HPRD)
+30	(RRFGpSBRRF)|(RRFGpS(M|L|V|I|F)RR(M|L|V|I|F))	MEKK kinase substrate motif (HPRD)
+31	(VPGKARKKpSSCQLL)|(PLARTLpSVAGLP)|((M|I|L|V|F|Y).R..(pS|pT))	Calmodulin-dependent protein kinase IV substrate motif (HPRD)
+32	(pSD.E)|(pS..(E|D))	Casein kinase II substrate motif (HPRD)
+33	(pSP..(pS|pT))|((D|E)..(pS|pT))|((pS|pT)..(S|T))|((pS|pT)...(S|T)(M|L|V|I|F))	Casein Kinase I substrate motif (HPRD)
+34	(pTP.K)|((K|H|G)H(H|P)(K|G|H)pSP(R|K)(H|R|K)(R|H|K))|((pS|pT)PG(pS|pT)PGTP)	CDK5 kinase substrate motif (HPRD)
+35	(R|K).R..pS...(R|K)	AMP-activated protein kinase 2 substrate motif (HPRD)
+36	(R|K|N)R.(pS|pT)(M|L|V|I)	Aurora-A kinase substrate motif (HPRD)
+37	(D|E)(pS|pT)...	b-Adrenergic Receptor kinase substrate motif (HPRD)
+38	(M|V|L|I|F).R..(pS|pT)...(M|V|L|I|F)	Calmodulin-dependent protein kinase I substrate motif (HPRD)
+39	(M|I|L|V|F|Y).R..(pS|pT)(M|I|L|V|F|Y)	Calmodulin-dependent protein kinase II alpha substrate motif (HPRD)
+40	E(F|E)D(T|A|G)GpSI(I|F|Y|G)(I|G|F)(F|G)(F|P|L)	Casein Kinase I delta substrate motif (HPRD)
+41	Y(Y|E)(D|Y)(A|D)(A|G)pSI(I|Y|F|G)(I|G|F)(F|G)(F|P|L)	Casein Kinase I gamma substrate motif (HPRD)
+42	P.(pS|pT)PKK.KK	Cdc2 like protein kinase substrate motif (HPRD)
+43	(pS|pT)P.(R|K)	CDK1,2, 4, 6 kinase substrate motif (HPRD)
+44	pSP.(R|K).	CDK kinase substrate motif (HPRD)
+45	(M|I|L|V).(R|K)..(pS|pT)	Chk1 kinase substrate motif (HPRD)
+46	R..(pS|pT)..R	CLK1 kinase substrate motif (HPRD)
+47	(R|K).(R|K).(R|K).pS..R	CLK1,2 kinase substrate motif (HPRD)
+48	R(R|H)(R|H)(R|E)RE(R|H)pSR(R|D)L	CLK2 kinase substrate motif (HPRD)
+49	R..(pS|pT)(L|V)R	DMPK1,2 kinase substrate motif (HPRD)
+50	R(R|K)R(E|R)R(E|A)(H|R)pSRR(R|D)(L|E)	DOA/CDC-like kinase 2 substrate motif (HPRD)
+51	(I|L|V|F|M)RR..(pS|pT)(I|L|M|V|F)	Doublecortin kinase-1 kinase substrate motif (HPRD)
+52	E.pS.R..R	elF2 alpha kinase substrate motif (HPRD)
+53	(T|P|S)(G|P|E|Y)(P|L|I)(L|M|P)pSP(G|P|F)(P|F|G|Y)(F|Y|I)	ERK1 kinase substrate motif (HPRD)
+54	pTEpY	ERK1 Kinase substrate motif (HPRD)
+55	KpSPP	ERK1, ERK2, SAPK, CDK5 and GSK3 kinase substrate motif (HPRD)
+56	(D|Y|W|E)(C)(P|S|C|E)(P|C|S|L|T|V)(L|M|T)pS(P|A)(T|S|G|R|C|F)(W|P|S)(W|F)	ERK2 kinase substrate motif (HPRD)
+57	pS...pS	GSK3 kinase substrate motif (HPRD)
+58	P.pTP	GSK3, Erk1, Erk2 and CDK5 kinase motif (HPRD)
+59	(M|L|V|I|F)(R|K|H)..pS...(M|L|V|I|F)	HMGCoA Reductase kinase substrate motif (HPRD)
+60	GP(Q|M)pSPI	JNK1 Kinase substrate motif (HPRD)
+61	LRpT	LKB1 Kinase substrate motif (HPRD)
+62	pT(G|P|E)pY	MAPK 11,13,14 Kinase substrate motif (HPRD)
+63	KKR..pS.(R|K)(R|K)	MLCK kinase substrate motif (HPRD)
+64	FpTY	mTOR kinase substrate motif (HPRD)
+65	IRRLpSTRRR	Nek 2 kinase substrate motif (HPRD)
+66	(R|K)(R|.).(pS|pT)	PAK2 kinase substrate motif (HPRD)
+67	F..F(pS|pT)(F|Y)	PDK1 kinase substrate motif (HPRD)
+68	(R|K)(R|K)(R|K).(pS|pT).	Pim1 kinase substrate sequence (HPRD)
+69	(R|K)(R|K|A|Q|P)(R|K)(R|Q|H|N|Y)(P|H|K)pS(G|S|T)(P|S|G|Q|H|S|T)(S|P|Q|G|D)(T|S|P|G)	Pim2 kinase substrate sequence (HPRD)
+70	R(R|K).(pS|pT)B	PKA, PKG kinase substrate motif (HPRD)
+71	(L|R|F)(R|K)R(K|Q)GpS(F|M)KK.A	PKC beta kinase substrate motif (HPRD)
+72	R.RKGpSF	PKC delta kinase substrate motif (HPRD)
+73	AR..R(R|K)RpSFRR	PKC eta kinase substrate motif (HPRD)
+74	F..F(pS|pT)(F|Y)	PKC family kinase substrate motif (HPRD)
+75	RRRK(G|K)SF(R|K)(R|K)KA	PKC gamma kinase substrate motif (HPRD)
+76	(L|V)(V|L|A)R(Q|K|E)MpS	PKC mu kinase substrate motif (HPRD)
+77	(R|F|W|M)(W|A|K|S)(R|S|K|H)(R|H|S|Q)(R|K|N|P|G|Q)pS(I|F|R|V|K|S|L|M)(K|M|R|S|T)(R|S|K|W)(R|K|G)	PKC theta kinase substrate motif (HPRD)
+78	F.R..pS(F|M)(F|M)	PKC zeta kinase substrate motif (HPRD)
+79	(L|V|I)(R|K|Q)(R|K)(R|K|T|Q|M)(N|K|R|L|M|H)pS(F|W|I|M|L|V)(S|N)(R|S|P|Y|W)(S|R|N|L)	PKD kinase substrate motif (HPRD)
+80	R(R|K).(pS|pT)B	PKG kinase substrate motif (HPRD)
+81	R..(pS|pT).R..R	PKR kinase substrate motif (HPRD)
+82	(D|E).(pS|pT)(I|L|V|M).(D|E)	Plk1 kinase substrate motif (HPRD)
+83	.pS..D..	Pyruvate dehydrogenase kinase substrate motif (HPRD)
+84	pTEY	Dual specificity protein phosphatase 1 substrate motif (HPRD)
+85	pT.pY	Dual specificity protein phosphatase 6 substrate motif (HPRD)
+86	RRA(pS|pT)VA	PP2A, PP2C substrate motif (HPRD)
+87	.R..pSVA	PP2B substrate motif (HPRD)
+88	.pT.pY.	PP2C delta substrate motif (HPRD)
+89	pS(D|E)(D|E)E	BARD1 BRCT domain binding motif (HPRD)
+90	DpSG..pS	Beta-TrCP1 domain binding motif (HPRD)
+91	pS(F|Y|H)(V|F|Y)(F|Y)	BRCA1 BRCT domain binding motif (HPRD)
+92	(I|L)(I|L|P)pTP(R|K)	CDC4 WD40 domain binding motif (HPRD)
+93	HFDpTYLI	Chk2 FHA domain binding motif (HPRD)
+94	(R|D|H)(L|Y)(L|M)(K|A)pT(Q|L|M|E|V)(K|L|I|R)	FHA domain binding motif (HPRD)
+95	S(pS|pT).	MDC1 BRCT domain binding motif (HPRD)
+96	S(pS|pT).	Plk1 PBD domain binding motif (HPRD)
+97	pSYII	RAD9 BRCT domain binding motif (HPRD)
+98	(pS|pT)P	WW domain binding motif (HPRD)
+99	((pS|pT)P.(K|R))|((pS|pT)P(K|R))	CDK1_Phosida
+100	(P.(pS|pT)P)|(V.(pS|pT)P)|(PE(pS|pT)P)	ERK/MAPK_Phosida
+101	(R(R|S|T).(pS|pT).(S|T))|(R.R..(pS|pT))	PKB/AKT_Phosida
+102	(R.(pS|pT))|(R(R|K).(pS|pT))|(KR..(pS|pT))	PKA_Phosida
+103	(R..(pS|pT))|(R..(pS|pT)V)	CAMK2_Phosida
+104	(S..(pS|pT))|((S|T)...pS)	CK1_Phosida
+105	(pS|pT)..E	CK2_Phosida
+106	pS...S	GSK3_Phosida
+107	(pS|pT)P.(K|R)	CDK2_Phosida
+108	R..(pS|pT).R	PKC_Phosida
+109	(L|V|I).(R|K)..(pS|pT)	PKD_Phosida
+110	(I|E|V)pY(E|G)(E|D|P|N)(I|V|L)	LCK_Phosida
+111	(I|V|L)pY..(P|F)	ABL_Phosida
+112	(E|D)..pY..(D|E|A|G|S|T)	SRC_Phosida
+113	pY..(I|L|V|M)	ALK_Phosida
+114	(D|P|S|A|E|N).pY(V|L|D|E|I|N|P)	EGFR_Phosida
+115	(R|K).(pS|pT)(I|L|V)	AURORA_Phosida
+116	(R|K|N)R.(pS|pT)(M|L|V|I)	AURORA-A_Phosida
+117	(D|E).(pS|pT)(V|I|L|M).(D|E)	PLK_Phosida
+118	(E|D).(pS|pT)(F|L|I|Y|W|V|M)	PLK1_Phosida
+119	L..(pS|pT)	NEK6_Phosida
+120	L.R..(pS|pT)	CHK1/2_Phosida
+121	(M|I|L|V).(R|K)..(pS|pT)	CHK1_Phosida
+122	F..F(pS|pT)(F|Y)	PDK1_Phosida
+123	(F|L|M)(R|K)(R|K)(pS|pT)	NIMA_Phosida
+124	((D|E)(D|E)...pYVA)|((E|D|Y)pY)	TC-PTP phosphatase substrate motif (HPRD)
+125	((D|E).(L|I|V).pY..(L|I|V))|((D|E).(L|I|V)..pY..(L|I|V))|((D|E)(D|E)(D|E|L).pY..(F|M|L|V|I)(D|E))|((D|E).pY)|((E|P)(F|I|L)pYA.(F|I|L|V))	SHP1 phosphatase substrate motif (HPRD)
+126	((D|E).......(D|E)..pY..L.......Y..(L|I))|((I|V|L|S).pY..(L|I))	Src family kinase substrate motif (HPRD)
+127	((D|E)pYpY(R|K))|(EFpY(G|A)TY(G|A))|(E(Y|F|D)pYM)|((E|P)(M|L|I|V|F)pY(G|A).(M|L|I|V|F|Y)A)|(RD.Y.TDYpYR)|(E(F|D|Y)pY)	PTP1B phosphatase substrate motif (HPRD)
+128	((H|F).V.(T|S|A)pY)|((I|V|L).pY(F|M).P)|(pY(I|V).(I|V))|((I|L|V|M).pY(T|V|A).(I|V|L|F))|((I|V).pY(L|M|T)Y(A|P|T)SG)|(W(M|T|V)pY(Y|R)(I|L).)	SHP2 N-terminal SH2 domain binding motif (HPRD)
+129	((V|I|L).pYA.(L|V))|(..pYYM(K|R))	SHP1 C-terminal SH2 domain binding motif (HPRD)
+130	(.E.IpYGVLF)|(E.(I|V|L|F)pY(G|A)V(L|V|F|I)(F|L|V|I))	Lck kinase substrate motif (HPRD)
+131	(DEEIpY(E|G)EL.)|((D|E).......(D|E)..pY..L.......Y..(L|I))	Lyn kinase substrate motif (HPRD)
+132	(EE(D|E)IpYFFFF)|(...IpY(M|I|F)FFF)	CSK kinase substrate motif (HPRD)
+133	(EEEEpYFELV)|((E|D|R|A)(D|E)(D|E)(E|D|I)pY(F|V|I|E)(E|F|D)(L|I|F|V)V)|(.(D|E)pY.)|(pYIPP)|(.(D|E)pY(I|L|V))	EGFR kinase substrate motif (HPRD)
+134	(EEEEpYVFI.)|((L|N)(R|I)TpY)|((D|E)(D|E)(D|E)(D|E)pY(V|E|I)F(I|V|F))	PDGFR kinase substrate motif (HPRD)
+135	(EEEIpYEEIE)|((E|A|D)(E|A)(E|A)(I|E|V)pY(D|E)(D|E)(I|V|E)(E|I|V))	Fes kinase substrate motif (HPRD)
+136	(EEEpYFFLF)|(A(E|A)EEpY(F|V)F(L|F|M|I|V)F)	FGFR kinase substrate motif (HPRD)
+137	(L(Y|H)pY(M|F).(F|M))|(L.pYA.L)	SHP1 N-terminal SH2 domain binding motif (HPRD)
+138	(pY(M|L|E)EP)|(pYESP)	Vav SH2 domain binding motif (HPRD)
+139	(pY(Y|I|V)N(F|L|I|V))|(pY(Q|Y|V)N(Y|Q|F))|(pY.N)	Grb2 SH2 domain binding motif (HPRD)
+140	(pY..P)|(pYDHP)	Crk SH2 domain binding motif (HPRD)
+141	(pY..Q)|(pY(M|L|V|I|F)(P|R|K|H)Q)	STAT3 SH2 domain binding motif (HPRD)
+142	(pY..YY)|(pY(D|E).(I|L|V|M))|((D|E)..pY)|(pY....(F|Y))	ALK kinase substrate motif (HPRD)
+143	(pYIDL)|(pYASI)|(EFpYA.(V|I)G(R|K|H)S)	SHP2 phosphatase substrate motif (HPRD)
+144	(pYM.M)|(EDAIpY)|(.VIpYAAPF)|(EAIpYAAPF)|(EEIpYEEpY)|(E.IpY..P.)|(EEIpYYYVH)|(ERIpYARTK)|(AEV(I|V|L|F)pYAA(P|F)F)	Abl kinase substrate motif (HPRD)
+145	(pYM.M)|(EE(E|N|D)pY(M|F)(M|F)(M|F|I|E)(M|F))|(.EEEpYMMMM)|(KKSRGDpYMTMQIG)|(KKKLPATGDpYMNMSPVGD)	Insulin receptor kinase substrate motif (HPRD)
+146	(pYM.M)|(YIpYGSFK)|(EEEIpY(G|E)EFD)|(D(D|E)(E|D|G)(I|V|L)pY(G|E)E(F|I)F)|((D|E).......(D|E)..pY..L.......Y..(L|I))|((D|E)(D|E)(E|D|G)(I|V|L)pY(G|E|D)E(F|I|L|V)(D|E))|(pY(A|G|S|T|D|E))	Src kinase substrate motif (HPRD)
+147	(pYM.M)|(pY..M)|(pYMPMS)	PI3 Kinase p85 SH2 domain binding motif (HPRD)
+148	ME(E|N)(I|V)pY(G|E)IFF	Fgr kinase substrate motif (HPRD)
+149	KKKSPGEpYVNIEFG	IGF1 receptor kinase substrate motif (HPRD)
+150	pY..(L|I|V)	JAK2 kinase substrate motif (HPRD)
+151	pTPpY	JNK kinase substrate motif (HPRD)
+152	(E|D|pT|pY).pYEE	Syk kinase substrate motif (HPRD)
+153	DpYpYR	PTP1B, TC-PTP phosphatase substrate motif (HPRD)
+154	(D|E)FpY(G|A)(F|Y)(A|G)	PTPRH phosphatase substrate motif (HPRD)
+155	F(M|L|V|I)pY	PTPRJ phosphatase substrate motif (HPRD)
+156	pY(E|M|V)(N|V|I)	3BP2 SH2 domain binding motif (HPRD)
+157	pYENP	Abl SH2 domain binding motif (HPRD)
+158	pY(T|A|S)(K|R|Q|N)(M|I|V|R)	Csk SH2 domain binding motif (HPRD)
+159	pYE.(V|I)	Fes SH2 domain binding motif (HPRD)
+160	pYEE(I|V)	Fgr SH2 domain binding motif (HPRD)
+161	pYEDP	Fyn SH2 domain binding motif (HPRD)
+162	pY(M|I|L|V).(M|I|L|V)	GRB2, 3BP2, Csk, Fes, Syk C-terminal SH2 domain binding motif (HPRD)
+163	(F|Y)pY(E|T|Y|S)N(I|L|V|P|T|Y|S)	GRB7, GRB10 SH2 domain binding motif (HPRD)
+164	pYF.(F|P|L|Y)	HCP SH2 domain binding motif (HPRD)
+165	pY(A|E|V)(Y|F|E|S|N|V)(P|F|I|H)	Itk SH2 domain binding motif (HPRD)
+166	pYDYV	Lck and Src SH2 domain binding motif (HPRD)
+167	pYDEP	Nck SH2 domain binding motif (HPRD)
+168	pY(L|I|V)E(L|I|V)	PLCgamma C and N-terminal SH2 domain binding motif (HPRD)
+169	pY..P	RasGAP C-terminal SH2 domain binding motif (HPRD)
+170	pYILV.(M|L|I|V|P)	RasGAP N-terminal SH2 domain binding motif (HPRD)
+171	TIpY..(V|I)	SAP and EAT2 SH2 domain binding motif (HPRD)
+172	pY(L|V)N(V|P)	Sem5 SH2 domain binding motif (HPRD)
+173	pY(T|V|I).L	Shb SH2 domain binding motif (HPRD)
+174	pY(I|E|Y|L).(I|L|M)	SHC SH2 domain binding motif (HPRD)
+175	(I|V|L|S).pY..(L|I)	SHIP2 SH2 domain binding motif (HPRD)
+176	(I|V).pY..(L|V)	SHP1 SH2 domain binding motif (HPRD)
+177	(V|I|L).pY(M|L|F).P	SHP1, SHP2 SH2 domain binding motif (HPRD)
+178	(T|V|I|Y).pY(A|S|T|V).(I|V|L)	SHP2 CSH2 domain binding motif (HPRD)
+179	(I|L|V)(I|L|V)(I|L|V|F|T|Y)pY(T|I|L|V)(I|L)(I|L|V|P)	SHP2 C-terminal SH2 domain binding motif (HPRD)
+180	pYIPP	SHP2, PLCgamma SH2 domain binding motif (HPRD)
+181	pYM.M	Src and Abl SH2 domain binding motif (HPRD)
+182	pY(R|K|H|Q|E|D)(R|K|H|Q|E|D)(I|P)	Src, Fyn, Lck, Fgr, Abl, Crk, Nck SH2 domain binding motif (HPRD)
+183	PP.pY	Src, Fyn,Csk, Nck and SHC SH2 domain binding motif (HPRD)
+184	pYEEI	Src,Lck and Fyn SH2 domains binding motif (HPRD)
+185	pY(D|E)(P|R)(R|P|Q)	STAT1 SH2 domain binding motif (HPRD)
+186	pY(Q|T|E)(E|Q)(L|I)	Syk C-terminal SH2 domain binding motif (HPRD)
+187	pYTT(I|L|M)	Syk N-terminal SH2 domain binding motif (HPRD)
+188	(D|E).......(D|E)..pY..L.......Y..(L|I)	Syk, ZAP-70, Shc, Lyn SH2 domain binding motif (HPRD)
+189	pYEN(F|I|V)	Tensin SH2 domain binding motif (HPRD)
+190	D(N|D).pY	Cbl PTB domain binding motif (HPRD)
+191	N.LpY	Dok1 PTB domain binding motif (HPRD)
+192	N..pY	FRIP PTB domain binding motif (HPRD)
+193	NP.pY	Shc PTB domain binding motif (HPRD)
+194	DD.pY	Shb PTB domain binding motif (HPRD)
+195	NP.pYF.R	ShcA PTB domain binding motif (HPRD)
+196	HN(M|L|V|I)(M|L|V|I|N)NP(S|T)pY	ShcC PTB domain binding motif (HPRD)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_input_for_anova.tabular	Mon Mar 07 19:05:01 2022 +0000
@@ -0,0 +1,23 @@
+Phosphopeptide	Sequence10	Sequence7	Gene_Name	Phosphoresidue	UniProt_ID	Description	Function Phosphoresidue(PSP=PhosphoSitePlus.org)	Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/Phosphatases/Binding Domains	Intensity.shL.1A	Intensity.shL.1B	Intensity.shL.1C	Intensity.shR.2A	Intensity.shR.2B	Intensity.shR.2C
+AAAAPDSRVpSEEENLK	MAAAAPDSRVpSEEENLKKTPK	AAPDSRVsEEENLKK	RRP15	pS11	Q9Y3B9	RRP15_HUMAN RRP15-like protein OS=Homo sapiens OX=9606 GN=RRP15 PE=1 SV=2	N/A	CK2alpha | Casein kinase II substrate | G protein-coupled receptor kinase 1 substrate | PKC kinase substrate | PKA kinase substrate | BARD1 BRCT domain binding | PKA | CK1 | CK2	38150000	39445000	56305000	55338000	7010600	70203000
+AAAITDMADLEELSRLpSPLPPGpSPGSAAR	MADLEELSRLpSPLPPGSPGSA; LSRLSPLPPGpSPGSAARGRAE	LEELSRLsPLPPGSP | LSPLPPGsPGSAARG	AEBP2; AEBP2	pS18, pS24; pS18, pS24	Q6ZN18; Q6ZN18-2	AEBP2_HUMAN Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2 PE=1 SV=2; AEBP2_HUMAN Isoform 2 of Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2	N/A	N/A	5416400	7101800	385280000	208060000	41426000	352400000
+ADALQAGASQFETpSAAK	LQAGASQFETpSAAKLKRKYWW	GASQFETsAAKLKRK	VAMP2; VAMP3	pS80; pS63	P63027; Q15836	VAMP2_HUMAN_Vesicle-associated membrane protein 2 OS=Homo sapiens OX=9606 GN=VAMP2 PE=1 SV=3; VAMP3_HUMAN_Vesicle-associated membrane protein 3 OS=Homo sapiens OX=9606 GN=VAMP3 PE=1 SV=3	N/A	PKD3 | PKCiota	44627000	41445000	69094000	42521000	5738000	61819000
+DQKLpSELDDR	DKVLERDQKLpSELDDRADALQ	LERDQKLsELDDRAD	VAMP1; VAMP1; VAMP1; VAMP2; VAMP3	pS63; pS63; pS63; pS61; pS44	P23763; P23763-2; P23763-3; P63027; Q15836	VAMP1_HUMAN_Vesicle-associated membrane protein 1 OS=Homo sapiens OX=9606 GN=VAMP1 PE=1 SV=1; VAMP1_HUMAN_Isoform 3 of Vesicle-associated membrane protein 1 OS=Homo sapiens OX=9606 GN=VAMP1; VAMP1_HUMAN_Isoform 2 of Vesicle-associated membrane protein 1 OS=Homo sapiens OX=9606 GN=VAMP1; VAMP2_HUMAN_Vesicle-associated membrane protein 2 OS=Homo sapiens OX=9606 GN=VAMP2 PE=1 SV=3; VAMP3_HUMAN_Vesicle-associated membrane protein 3 OS=Homo sapiens OX=9606 GN=VAMP3 PE=1 SV=3	N/A	CK2alpha | PKAbeta | PKAgamma | PKCiota | Casein kinase II substrate | G protein-coupled receptor kinase 1 substrate | PKC kinase substrate | PKA kinase substrate | Pyruvate dehydrogenase kinase substrate	75542000	44814000	32924000	35016000	11023000	4669900
+EFVpSSDESSSGENK	SESFKSKEFVpSSDESSSGENK	FKSKEFVsSDESSSG	SSRP1	pS667	Q08945	SSRP1_HUMAN FACT complex subunit SSRP1 OS=Homo sapiens OX=9606 GN=SSRP1 PE=1 SV=1	N/A	CK2alpha | CK2a2 | CDK7 | Casein kinase II substrate | G protein-coupled receptor kinase 1 substrate | Casein Kinase I substrate | CK2 | GSK3	12562000	16302000	23000000	7857800	0	18830000
+EGMNPSYDEYADpSDEDQHDAYLER	MNPSYDEYADpSDEDQHDAYLE	SYDEYADsDEDQHDA	SSRP1	pS444	Q08945	SSRP1_HUMAN FACT complex subunit SSRP1 OS=Homo sapiens OX=9606 GN=SSRP1 PE=1 SV=1	N/A	CK2alpha | CK2a2 | CDK7 | CK1alpha | Casein kinase II substrate | b-Adrenergic Receptor kinase substrate | Pyruvate dehydrogenase kinase substrate	0	0	0	0	0	0
+IGNEEpSDLEEACILPHpSPINVDK	DDEEKIGNEEpSDLEEACILPH; DLEEACILPHpSPINVDKRPIA	EKIGNEEsDLEEACI | EACILPHsPINVDKR	HERC2	pS1577, pS1588	O95714	HERC2_HUMAN E3 ubiquitin-protein ligase HERC2 OS=Homo sapiens OX=9606 GN=HERC2 PE=1 SV=2	N/A	CK2alpha | Casein kinase II substrate | ERK1, ERK2 Kinase substrate | GSK-3, ERK1, ERK2, CDK5 substrate | b-Adrenergic Receptor kinase substrate | WW domain binding | ERK/MAPK | CK2 | NEK6	167764000	121218000	155736000	140640000	83642000	128468000
+IRAEEEDLAAVPFLApSDNEEEEDEK	EDLAAVPFLApSDNEEEEDEKG	AAVPFLAsDNEEEED	HERC2	pS2928	O95714	HERC2_HUMAN E3 ubiquitin-protein ligase HERC2 OS=Homo sapiens OX=9606 GN=HERC2 PE=1 SV=2	N/A	CK2alpha | Casein kinase II substrate | CK2	22562000	18225000	9119700	11689000	0	0
+KGLLApTpSGNDGTIR	VWCNKKGLLApTSGNDGTIRVW; WCNKKGLLATpSGNDGTIRVWN	NKKGLLAtSGNDGTI | KKGLLATsGNDGTIR	HERC1	pT3445, pS3446	Q15751	HERC1_HUMAN Probable E3 ubiquitin-protein ligase HERC1 OS=Homo sapiens OX=9606 GN=HERC1 PE=1 SV=2	N/A	N/A	7843600	0	241700000	0	0	10042600
+KpSSLVTSK	PTPQDLPQRKpSSLVTSKLAGG; PTPQDLPQRKpSSLVTSKLAG	QDLPQRKsSLVTSKL	ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA	pS108; pS108; pS124; pS131; pS104; pS104; pS120; pS124	O43768; O43768-2; O43768-3; O43768-4; O43768-5; O43768-6; O43768-7; O43768-9	ENSA_HUMAN Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA PE=1 SV=1; ENSA_HUMAN Isoform 2 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 3 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 4 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 5 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 6 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 7 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 9 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA	N/A	G protein-coupled receptor kinase 1 substrate	0	0	18629000	0	0	0
+KSpSLVTSK	TPQDLPQRKSpSLVTSKLAGGQ; TPQDLPQRKSpSLVTSKLAG	DLPQRKSsLVTSKLA	ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA	pS109; pS109; pS125; pS132; pS105; pS105; pS121; pS125	O43768; O43768-2; O43768-3; O43768-4; O43768-5; O43768-6; O43768-7; O43768-9	ENSA_HUMAN Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA PE=1 SV=1; ENSA_HUMAN Isoform 2 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 3 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 4 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 5 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 6 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 7 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 9 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA	molecular association, regulation; protein conformation; SNCA(DISRUPTS)	G protein-coupled receptor kinase 1 substrate | PKC kinase substrate | PKA kinase substrate | Casein Kinase I substrate | MDC1 BRCT domain binding | GSK3 | AURORA	7090300	8341200	9691500	10030000	1675200	9952100
+LpSPNPWQEK	MLAVDIEDRLpSPNPWQEKREI	VDIEDRLsPNPWQEK	HERC2	pS3462	O95714	HERC2_HUMAN E3 ubiquitin-protein ligase HERC2 OS=Homo sapiens OX=9606 GN=HERC2 PE=1 SV=2	N/A	ERK1, ERK2 Kinase substrate | GSK-3, ERK1, ERK2, CDK5 substrate | WW domain binding	0	11706000	12495000	0	7273000	8877800
+NLLEDDpSDEEEDFFLR	SERRNLLEDDpSDEEEDFFLRG	RNLLEDDsDEEEDFF	VAMP4	pS30	O75379	VAMP4_HUMAN_Vesicle-associated membrane protein 4 OS=Homo sapiens OX=9606 GN=VAMP4 PE=1 SV=2	N/A	CK2alpha | Casein kinase II substrate | Casein Kinase I substrate | b-Adrenergic Receptor kinase substrate | BARD1 BRCT domain binding | CK2 | Csnk2a1	1592100000	973800000	1011600000	1450300000	631970000	878760000
+pSQKQEEENPAEETGEEK	MpSQKQEEENPAE	______MsQKQEEEN	ENSA; ENSA; ENSA; ENSA; ENSA; ENSA	pS2; pS2; pS2; pS2; pS2; pS2	O43768; O43768-2; O43768-3; O43768-4; O43768-8; O43768-9	ENSA_HUMAN Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA PE=1 SV=1; ENSA_HUMAN Isoform 2 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 3 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 4 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 8 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 9 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA	N/A	ATM kinase substrate | PKC kinase substrate | PKA kinase substrate	0	0	8765300	0	2355900	14706000
+QLSEpSFK	SKSSSRQLSEpSFKSKEFVSSD	SSRQLSEsFKSKEFV	SSRP1	pS659	Q08945	SSRP1_HUMAN FACT complex subunit SSRP1 OS=Homo sapiens OX=9606 GN=SSRP1 PE=1 SV=1	N/A	CK2a2 | CDK7 | PKCalpha | PKCbeta | DNAPK | PKC kinase substrate | PKA kinase substrate | NEK6	68201000	87774000	138300000	95357000	19966000	149110000
+RGpSLEMSSDGEPLSR	SSATSGGRRGpSLEMSSDGEPL	TSGGRRGsLEMSSDG	AEBP2; AEBP2	pS206; pS206	Q6ZN18; Q6ZN18-2	AEBP2_HUMAN Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2 PE=1 SV=2; AEBP2_HUMAN Isoform 2 of Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2	N/A	Casein Kinase II substrate | G protein-coupled receptor kinase 1 substrate | PKC kinase substrate | PKA kinase substrate | PKA | GSK3 | AURORA	19262000	11103000	19454000	0	1816900	22028000
+SDGpSLEDGDDVHR	IEDGGARSDGpSLEDGDDVHRA	GGARSDGsLEDGDDV	SERINC1	pS364	Q9NRX5	SERC1_HUMAN Serine incorporator 1 OS=Homo sapiens OX=9606 GN=SERINC1 PE=1 SV=1	N/A	Casein kinase II substrate | Plk1 kinase substrate | Pyruvate dehydrogenase kinase substrate | CK1 | PLK | PLK1	31407000	17665000	20892000	23194000	5132400	54893000
+SEpSLTAESR	EGGGLMTRSEpSLTAESRLVHT	GLMTRSEsLTAESRL	HERC1	pS1491	Q15751	HERC1_HUMAN Probable E3 ubiquitin-protein ligase HERC1 OS=Homo sapiens OX=9606 GN=HERC1 PE=1 SV=2	N/A	b-Adrenergic Receptor kinase substrate	11766000	13176000	20540000	16963000	4364700	21308000
+STGPTAATGpSNRR	MSTGPTAATGpSNRRLQQTQNQ	GPTAATGsNRRLQQT	VAMP3	pS11	Q15836	VAMP3_HUMAN_Vesicle-associated membrane protein 3 OS=Homo sapiens OX=9606 GN=VAMP3 PE=1 SV=3	N/A	PKCalpha | PKCbeta | PKCzeta | PKC kinase substrate | PKA kinase substrate	3057100	4718800	12052000	5047700	1070900	8333500
+TEDLEATpSEHFK	RNKTEDLEATpSEHFKTTSQKV	TEDLEATsEHFKTTS	VAMP8	pS55	Q9BV40	VAMP8_HUMAN_Vesicle-associated membrane protein 8 OS=Homo sapiens OX=9606 GN=VAMP8 PE=1 SV=1	activity, inhibited; abolish function in SNARE complex during mast cell secretion, reduces in vitro ensemble vesicle fusion	G protein-coupled receptor kinase 1 substrate | Casein Kinase I substrate	20400000	9738500	7862300	0	0	76518000
+TFWpSPELK	SSMNSIKTFWpSPELKKERVLR	NSIKTFWsPELKKER	ERC2	pS187	O15083	ERC2_HUMAN ERC protein 2 OS=Homo sapiens OX=9606 GN=ERC2 PE=1 SV=3	N/A	IKKalpha | IKKbeta | HIPK2 | Casein Kinase II substrate | ERK1, ERK2 Kinase substrate | GSK-3, ERK1, ERK2, CDK5 substrate | WW domain binding	29764000	20957000	24855000	30752000	8304800	23771000
+YFDpSGDYNMAK	CADEMQKYFDpSGDYNMAKAKM; RLQKGQKYFDpSGDYNMAKAKM; MKSVEQKYFDpSGDYNMAKAKM	EMQKYFDsGDYNMAK | KGQKYFDsGDYNMAK | VEQKYFDsGDYNMAK	ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA	pS67; pS67; pS83; pS90; pS63; pS63; pS79; pS83	O43768; O43768-2; O43768-3; O43768-4; O43768-5; O43768-6; O43768-7; O43768-9	ENSA_HUMAN Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA PE=1 SV=1; ENSA_HUMAN Isoform 2 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 3 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 4 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 5 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 6 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 7 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 9 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA	molecular association, regulation; cell cycle regulation; PPP2CA(INDUCES)	b-Adrenergic Receptor kinase substrate	323250000	127970000	0	67123000	12790000	71378000
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_input_for_preproc.tabular	Mon Mar 07 19:05:01 2022 +0000
@@ -0,0 +1,38 @@
+Proteins	Positions within proteins	Leading proteins	Protein	Fasta headers	Localization prob	Score diff	PEP	Score	Delta score	Score for localization	Localization prob shL.1A	Score diff shL.1A	PEP shL.1A	Score shL.1A	Localization prob shL.1B	Score diff shL.1B	PEP shL.1B	Score shL.1B	Localization prob shL.1C	Score diff shL.1C	PEP shL.1C	Score shL.1C	Localization prob shR.2A	Score diff shR.2A	PEP shR.2A	Score shR.2A	Localization prob shR.2B	Score diff shR.2B	PEP shR.2B	Score shR.2B	Localization prob shR.2C	Score diff shR.2C	PEP shR.2C	Score shR.2C	Diagnostic peak	Number of Phospho (STY)	Amino acid	Sequence window	Modification window	Peptide window coverage	Phospho (STY) Probabilities	Phospho (STY) Score diffs	Position in peptide	Charge	Mass error [ppm]	Identification type shL.1A	Identification type shL.1B	Identification type shL.1C	Identification type shR.2A	Identification type shR.2B	Identification type shR.2C	Intensity	Intensity___1	Intensity___2	Intensity___3	Ratio mod/base	Intensity shL.1A	Intensity shL.1B	Intensity shL.1C	Intensity shR.2A	Intensity shR.2B	Intensity shR.2C	Ratio mod/base shL.1A	Ratio mod/base shL.1B	Ratio mod/base shL.1C	Ratio mod/base shR.2A	Ratio mod/base shR.2B	Ratio mod/base shR.2C	Intensity shL.1A___1	Intensity shL.1A___2	Intensity shL.1A___3	Intensity shL.1B___1	Intensity shL.1B___2	Intensity shL.1B___3	Intensity shL.1C___1	Intensity shL.1C___2	Intensity shL.1C___3	Intensity shR.2A___1	Intensity shR.2A___2	Intensity shR.2A___3	Intensity shR.2B___1	Intensity shR.2B___2	Intensity shR.2B___3	Intensity shR.2C___1	Intensity shR.2C___2	Intensity shR.2C___3	Occupancy shL.1A	Occupancy ratioshL.1A	Occupancy error scale shL.1A	Occupancy shL.1B	Occupancy ratioshL.1B	Occupancy error scale shL.1B	Occupancy shL.1C	Occupancy ratioshL.1C	Occupancy error scale shL.1C	Occupancy shR.2A	Occupancy ratioshR.2A	Occupancy error scale shR.2A	Occupancy shR.2B	Occupancy ratioshR.2B	Occupancy error scale shR.2B	Occupancy shR.2C	Occupancy ratioshR.2C	Occupancy error 
scale shR.2C	Reverse	Potential contaminant	id	Protein group IDs	Positions	Position	Peptide IDs	Mod. peptide IDs	Evidence IDs	MS/MS IDs	Best localization evidence ID	Best localization MS/MS ID	Best localization raw file	Best localization scan number	Best score evidence ID	Best score MS/MS ID	Best score raw file	Best score scan number	Best PEP evidence ID	Best PEP MS/MS ID	Best PEP raw file	Best PEP scan number
+sp|O43768-2|ENSA_HUMAN;sp|O43768|ENSA_HUMAN;sp|O43768-9|ENSA_HUMAN;sp|O43768-3|ENSA_HUMAN;sp|O43768-4|ENSA_HUMAN;sp|O43768-6|ENSA_HUMAN;sp|O43768-5|ENSA_HUMAN;sp|O43768-7|ENSA_HUMAN	108;108;124;124;131;104;104;120	sp|O43768-2|ENSA_HUMAN	sp|O43768-2|ENSA_HUMAN		0.877317	8.54376	0.001041	110.11	55.028	110.11																										1	S	TGDHIPTPQDLPQRKSSLVTSKLAG______	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXXXXXXXPPPPPPPPXXXXXXXXX	KS(0.877)S(0.123)LVTSK	KS(8.54)S(-8.54)LVT(-58.58)S(-72.01)K	2	2	0.022801			By MS/MS				18629000	18629000	0	0		0	0	18629000	0	0	0							0	0	0	0	0	0	18629000	0	0	0	0	0	0	0	0	0	0	0																					700	529	108	108	12310;20039	13742;22688	99166	91729	99166	91729	QE05099	5593	99166	91729	QE05099	5593	99166	91729	QE05099	5593
+sp|O43768-2|ENSA_HUMAN;sp|O43768|ENSA_HUMAN;sp|O43768-9|ENSA_HUMAN;sp|O43768-3|ENSA_HUMAN;sp|O43768-4|ENSA_HUMAN;sp|O43768-6|ENSA_HUMAN;sp|O43768-5|ENSA_HUMAN;sp|O43768-7|ENSA_HUMAN	109;109;125;125;132;105;105;121	sp|O43768-2|ENSA_HUMAN	sp|O43768-2|ENSA_HUMAN		0.877764	9.23011	0.00135208	98.182	25.939	55.754																										1	S	GDHIPTPQDLPQRKSSLVTSKLAG_______	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXXXXXXPPPPPPPPXXXXXXXXXX	KS(0.105)S(0.878)LVT(0.015)S(0.002)K	KS(-9.23)S(9.23)LVT(-17.65)S(-25.69)K	3	2	-0.061619	By MS/MS	By MS/MS	By matching	By matching	By matching	By MS/MS	81973000	81973000	0	0		7090300	8341200	9691500	10030000	1675200	9952100							7090300	0	0	8341200	0	0	9691500	0	0	10030000	0	0	1675200	0	0	9952100	0	0																					701	529	109	109	12310;20039	13742;22688	99164;99165;99168;99169;160369;160370;160371;160372;160373;160374	91727;91728;91731;142479	99164	91727	QE05097	5219	99167	91730	QE05100	5516	99167	91730	QE05100	5516
+CON__P02662	46	CON__P02662	CON__P02662		0.99978	36.4544	1.10E-08	122.19	116.48	122.19																										2	S	VFGKEKVNELSKDIGSESTEDQAMEDIKQME	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;Phospho (STY);X;X;X;X;X;Oxidation (M);X;X;X;X;X;X;X	XXXXXXXXXXXXPPPPPPPPPPPPPPPPXXX	DIGS(1)ES(0.972)T(0.029)EDQAMEDIK	DIGS(36.45)ES(15.33)T(-15.33)EDQAMEDIK	4	2	0.56139	By MS/MS		By MS/MS			By MS/MS	49187000	0	49187000	0	NaN	16494000	0	20139000	0	0	12553000	NaN	NaN	NaN	NaN	NaN	NaN	0	16494000	0	0	0	0	0	20139000	0	0	0	0	0	0	0	0	12553000	0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN		+	2	14	46	46	3452	3862;3863	27864;27865;27866;27867	25820;25821;25822;25823	27865	25821	QE05099	36641	27865	25821	QE05099	36641	27865	25821	QE05099	36641
+CON__P02662	48	CON__P02662	CON__P02662		0.971522	15.3284	1.10E-08	122.19	116.48	122.19																										2	S	GKEKVNELSKDIGSESTEDQAMEDIKQMEAE	X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;Phospho (STY);X;X;X;X;X;Oxidation (M);X;X;X;X;X;X;X;X;X	XXXXXXXXXXPPPPPPPPPPPPPPPPXXXXX	DIGS(1)ES(0.972)T(0.029)EDQAMEDIK	DIGS(36.45)ES(15.33)T(-15.33)EDQAMEDIK	6	2	0.56139	By MS/MS		By MS/MS			By MS/MS	49187000	0	49187000	0	NaN	16494000	0	20139000	0	0	12553000	NaN	NaN	NaN	NaN	NaN	NaN	0	16494000	0	0	0	0	0	20139000	0	0	0	0	0	0	0	0	12553000	0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN		+	3	14	48	48	3452	3862;3863	27864;27865;27866;27867	25820;25821;25822;25823	27865	25821	QE05099	36641	27865	25821	QE05099	36641	27865	25821	QE05099	36641
+CON__P02662	115	CON__P02662	CON__P02662		1	50.1781	4.91E-07	124.08	88.205	50.178																										1	S	RLKKYKVPQLEIVPNSAEERLHSMKEGIHAQ	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXPPPPPPPPPPPPPPXXXXXXXXXXX	VPQLEIVPNS(1)AEER	VPQLEIVPNS(50.18)AEER	10	3	-0.26085	By MS/MS	By matching	By MS/MS	By matching	By matching	By MS/MS	228160000	228160000	0	0	NaN	36938000	3667100	7945800	0	2359500	8418700	NaN	NaN	NaN	NaN	NaN	NaN	36938000	0	0	3667100	0	0	7945800	0	0	0	0	0	2359500	0	0	8418700	0	0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN		+	4	14	115	115	23142	26196	185609;185610;185611;185612;185613;185614;185615	165233;165234;165235;165236	185612	165236	QE05102	41518	185610	165234	QE05097	41110	185610	165234	QE05097	41110
+sp|O43768-2|ENSA_HUMAN;sp|O43768|ENSA_HUMAN;sp|O43768-9|ENSA_HUMAN;sp|O43768-3|ENSA_HUMAN;sp|O43768-4|ENSA_HUMAN;sp|O43768-8|ENSA_HUMAN	2;2;2;2;2;2	sp|O43768-2|ENSA_HUMAN	sp|O43768-2|ENSA_HUMAN		1.0	73.249	3.69e-06	83.395	74.925	83.395																										1	S	______________MSQKQEEENPAEETGEE	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXXXXXXXXPPPPPPPPPPPPPPPP	S(1)QKQEEENPAEETGEEK	S(73.25)QKQEEENPAEET(-73.25)GEEK	1	2	-0.84902			By matching		By matching	By MS/MS	25828000	25828000	0	0		0	0	8765300	0	2355900	14706000							0	0	0	0	0	0	8765300	0	0	0	0	0	2355900	0	0	14706000	0	0																					702	529	2	2	19781	22398	158249;158250;158251	140920	158249	140920	QE05102	12907	158249	140920	QE05102	12907	158249	140920	QE05102	12907
+sp|O43768-2|ENSA_HUMAN;sp|O43768|ENSA_HUMAN;sp|O43768-9|ENSA_HUMAN;sp|O43768-3|ENSA_HUMAN;sp|O43768-4|ENSA_HUMAN;sp|O43768-6|ENSA_HUMAN;sp|O43768-5|ENSA_HUMAN;sp|O43768-7|ENSA_HUMAN;sp|P56211-2|ARP19_HUMAN;sp|P56211|ARP19_HUMAN	67;67;83;83;90;63;63;79;46;62	sp|O43768-2|ENSA_HUMAN;sp|P56211-2|ARP19_HUMAN	sp|O43768-2|ENSA_HUMAN		0.999907	42.1841	4.04e-05	77.894	72.756	77.894																										1	S	DFLMKRLQKGQKYFDSGDYNMAKAKMKNKQL;DFLRKRLQKGQKYFDSGDYNMAKAKMKNKQL	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXXXXXPPPPPPPPPPPXXXXXXXX	YFDS(1)GDYNMAK	Y(-44.9)FDS(42.18)GDY(-42.18)NMAK	4	2	0.090313	By MS/MS	By MS/MS		By matching	By MS/MS	By MS/MS	602510000	602510000	0	0		323250000	127970000	0	67123000	12790000	71378000							323250000	0	0	127970000	0	0	0	0	0	67123000	0	0	12790000	0	0	71378000	0	0																					703	529;2007	67;46	67	23817	26932	190543;190544;190545;190546;190547	169398;169399;169400;169401	190543	169398	QE05097	28697	190543	169398	QE05097	28697	190543	169398	QE05097	28697
+sp|O95714|HERC2_HUMAN;sp|Q9BVR0|HRC23_HUMAN	1577;304	sp|O95714|HERC2_HUMAN	sp|O95714|HERC2_HUMAN		1.0	100.152	1.12e-15	100.15	94.415	100.15																										2	S	KPESTDDEEKIGNEESDLEEACILPHSPINV	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X	XXXXXXXXXXPPPPPPPPPPPPPPPPPPPPP	IGNEES(1)DLEEACILPHS(1)PINVDK	IGNEES(100.15)DLEEACILPHS(100.15)PINVDK	6	3	-0.31776	By matching	By matching	By matching	By matching	By MS/MS	By MS/MS	398730000	0	398730000	0		83882000	60609000	77868000	70320000	41821000	64234000							0	83882000	0	0	60609000	0	0	77868000	0	0	70320000	0	0	41821000	0	0	64234000	0																					1295	867	1577	1577	11517	12858	93270;93271;93272;93273;93274;93275	86700;86701	93271	86701	QE05102	51298	93271	86701	QE05102	51298	93271	86701	QE05102	51298
+sp|O95714|HERC2_HUMAN;sp|Q9BVR0|HRC23_HUMAN	1588;315	sp|O95714|HERC2_HUMAN	sp|O95714|HERC2_HUMAN		1.0	100.152	1.12e-15	100.15	94.415	100.15																										2	S	GNEESDLEEACILPHSPINVDKRPIAIKSPK	X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	PPPPPPPPPPPPPPPPPPPPPPXXXXXXXXX	IGNEES(1)DLEEACILPHS(1)PINVDK	IGNEES(100.15)DLEEACILPHS(100.15)PINVDK	17	3	-0.31776	By matching	By matching	By matching	By matching	By MS/MS	By MS/MS	398730000	0	398730000	0		83882000	60609000	77868000	70320000	41821000	64234000							0	83882000	0	0	60609000	0	0	77868000	0	0	70320000	0	0	41821000	0	0	64234000	0																					1296	867	1588	1588	11517	12858	93270;93271;93272;93273;93274;93275	86700;86701	93271	86701	QE05102	51298	93271	86701	QE05102	51298	93271	86701	QE05102	51298
+sp|O95714|HERC2_HUMAN	2928	sp|O95714|HERC2_HUMAN	sp|O95714|HERC2_HUMAN		1.0	44.9549	6.81e-12	84.285	78.578	44.955																										1	S	IRAEEEDLAAVPFLASDNEEEEDEKGNSGSL	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	PPPPPPPPPPPPPPPPPPPPPPPPPXXXXXX	IRAEEEDLAAVPFLAS(1)DNEEEEDEK	IRAEEEDLAAVPFLAS(44.95)DNEEEEDEK	16	3	-0.24823	By MS/MS	By MS/MS	By matching	By matching			61597000	61597000	0	0		22562000	18225000	9119700	11689000	0	0							22562000	0	0	18225000	0	0	9119700	0	0	11689000	0	0	0	0	0	0	0	0																					1297	867	2928	2928	11904	13281	96043;96044;96045;96046	89048;89049	96044	89049	QE05098	52942	96043	89048	QE05097	52381	96043	89048	QE05097	52381
+sp|O95714|HERC2_HUMAN	1938	sp|O95714|HERC2_HUMAN	sp|O95714|HERC2_HUMAN		0.427104	0.0	4.17e-06	44.164	42.292	44.164																											S	KYDLKLAELPAAAQPSAEDSDTEDDSEAEQT	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXPPPPPPPPPPPPPPPPPPPPPPPPPP	LAELPAAAQPS(0.427)AEDS(0.427)DT(0.142)EDDS(0.003)EAEQTER	LAELPAAAQPS(0)AEDS(0)DT(-4.78)EDDS(-20.87)EAEQT(-37.92)ER	11	3	-1.2171							0	0	0	0		0	0	0	0	0	0							0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0																					1298	867	1938	1938	12395	13829			99721	92163	QE05099	31358	99721	92163	QE05099	31358	99721	92163	QE05099	31358
+sp|O95714|HERC2_HUMAN	1942	sp|O95714|HERC2_HUMAN	sp|O95714|HERC2_HUMAN		0.427104	0.0	4.17e-06	44.164	42.292	44.164																											S	KLAELPAAAQPSAEDSDTEDDSEAEQTERNI	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX	LAELPAAAQPS(0.427)AEDS(0.427)DT(0.142)EDDS(0.003)EAEQTER	LAELPAAAQPS(0)AEDS(0)DT(-4.78)EDDS(-20.87)EAEQT(-37.92)ER	15	3	-1.2171							0	0	0	0		0	0	0	0	0	0							0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0																					1299	867	1942	1942	12395	13829			99721	92163	QE05099	31358	99721	92163	QE05099	31358	99721	92163	QE05099	31358
+sp|O95714|HERC2_HUMAN	3462	sp|O95714|HERC2_HUMAN	sp|O95714|HERC2_HUMAN		1.0	41.1171	0.0267288	41.117	33.02	41.117																										1	S	NGEECMLAVDIEDRLSPNPWQEKREIVSSED	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXXXXXXXPPPPPPPPPXXXXXXXX	LS(1)PNPWQEK	LS(41.12)PNPWQEK	2	2	0.64603		By matching	By MS/MS		By matching	By matching	40352000	40352000	0	0		0	11706000	12495000	0	7273000	8877800							0	0	0	11706000	0	0	12495000	0	0	0	0	0	7273000	0	0	8877800	0	0																					1300	867	3462	3462	14140	15756	112737;112738;112739;112740	102778	112737	102778	QE05099	28079	112737	102778	QE05099	28079	112737	102778	QE05099	28079
+sp|Q08945|SSRP1_HUMAN	667	sp|Q08945|SSRP1_HUMAN	sp|Q08945|SSRP1_HUMAN		0.824557	6.72928	2.29e-05	88.385	80.253	88.385																										1	S	SSRQLSESFKSKEFVSSDESSSGENKSKKKR	X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXXXXXPPPPPPPPPPPPPPXXXXX	EFVS(0.825)S(0.175)DESSSGENK	EFVS(6.73)S(-6.73)DES(-34.1)S(-47.3)S(-52.91)GENK	4	2	-0.31453	By MS/MS	By MS/MS	By MS/MS	By MS/MS		By MS/MS	78553000	78553000	0	0		12562000	16302000	23000000	7857800	0	18830000							12562000	0	0	16302000	0	0	23000000	0	0	7857800	0	0	0	0	0	18830000	0	0																					3469	2387	667	667	6499	7276	53820;53821;53822;53823;53824	51145;51146;51147;51148;51149	53820	51145	QE05097	12983	53820	51145	QE05097	12983	53820	51145	QE05097	12983
+sp|Q08945|SSRP1_HUMAN	444	sp|Q08945|SSRP1_HUMAN	sp|Q08945|SSRP1_HUMAN		0.999939	44.165	7.94e-20	97.469	93.771	97.469																										1	S	GLKEGMNPSYDEYADSDEDQHDAYLERMKEE	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXPPPPPPPPPPPPPPPPPPPPPPPPXXXX	EGMNPSYDEYADS(1)DEDQHDAYLER	EGMNPS(-49.21)Y(-49.82)DEY(-44.17)ADS(44.17)DEDQHDAY(-90.19)LER	13	3	0.19918			By MS/MS			By MS/MS	0	0	0	0		0	0	0	0	0	0							0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0																					3470	2387	444	444	6658	7448	55048;55049	52320;52321	55048	52320	QE05099	31926	55048	52320	QE05099	31926	55048	52320	QE05099	31926
+sp|Q08945|SSRP1_HUMAN	659	sp|Q08945|SSRP1_HUMAN	sp|Q08945|SSRP1_HUMAN		0.999878	39.1416	0.00235198	117.7	65.216	117.7																										1	S	SRGSSSKSSSRQLSESFKSKEFVSSDESSSG	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X	XXXXXXXXXXXPPPPPPPXXXXXXXXXXXXX	QLSES(1)FK	QLS(-39.14)ES(39.14)FK	5	2	0.14738	By MS/MS	By MS/MS	By MS/MS	By MS/MS	By matching	By MS/MS	558700000	558700000	0	0		68201000	87774000	138300000	95357000	19966000	149110000							68201000	0	0	87774000	0	0	138300000	0	0	95357000	0	0	19966000	0	0	149110000	0	0																					3471	2387	659	659	16873	19002	134380;134381;134382;134383;134384;134385	120469;120470;120471;120472;120473	134381	120470	QE05098	17736	134381	120470	QE05098	17736	134381	120470	QE05098	17736
+sp|Q15751|HERC1_HUMAN	3446	sp|Q15751|HERC1_HUMAN	sp|Q15751|HERC1_HUMAN		0.999981	47.2167	0.0187791	47.548	7.8172	47.548																										2	S	VMTCVWCNKKGLLATSGNDGTIRVWNVTKKQ	X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXXPPPPPPPPPPPPPPXXXXXXXX	KGLLAT(1)S(1)GNDGTIR	KGLLAT(47.2)S(47.22)GNDGT(-47.2)IR	7	2	-0.95722	By matching		By MS/MS			By matching	129800000	0	129800000	0		3921800	0	120850000	0	0	5021300							0	3921800	0	0	0	0	0	120850000	0	0	0	0	0	0	0	0	5021300	0																					4421	2824	3446	3446	12194	13609	98227;98228;98229	90789	98227	90789	QE05099	12004	98227	90789	QE05099	12004	98227	90789	QE05099	12004
+sp|Q15751|HERC1_HUMAN	1491	sp|Q15751|HERC1_HUMAN	sp|Q15751|HERC1_HUMAN		0.9956	24.4686	0.000725254	80.245	41.065	80.245																										1	S	STSASEGGGLMTRSESLTAESRLVHTSPNYR	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXXXXXXPPPPPPPPPXXXXXXXXX	S(0.004)ES(0.996)LT(0.001)AESR	S(-24.47)ES(24.47)LT(-30.8)AES(-48.77)R	3	2	-0.02332	By matching	By MS/MS	By MS/MS	By MS/MS	By matching	By MS/MS	88117000	88117000	0	0		11766000	13176000	20540000	16963000	4364700	21308000							11766000	0	0	13176000	0	0	20540000	0	0	16963000	0	0	4364700	0	0	21308000	0	0																					4422	2824	1491	1491	18146	20455	144586;144587;144588;144589;144590;144591	129449;129450;129451;129452	144587	129450	QE05099	10286	144587	129450	QE05099	10286	144587	129450	QE05099	10286
+sp|Q15751|HERC1_HUMAN	1510	sp|Q15751|HERC1_HUMAN	sp|Q15751|HERC1_HUMAN		0.330689	0.0	7.97e-05	45.193	39.23	45.193																											S	ESRLVHTSPNYRLIKSRSESDLSQPESDEEG	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXXXXXXXXPPPPPPPPPPPPPPPP	S(0.331)RS(0.331)ES(0.331)DLS(0.008)QPESDEEGYALSGR	S(0)RS(0)ES(0)DLS(-16.27)QPES(-35.13)DEEGY(-44.24)ALS(-45.11)GR	1	3	0.88872							0	0	0	0		0	0	0	0	0	0							0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0																					4423	2824	1510	1510	19884	22510			159108	141525	QE05102	26609	159108	141525	QE05102	26609	159108	141525	QE05102	26609
+sp|Q15751|HERC1_HUMAN	1512	sp|Q15751|HERC1_HUMAN	sp|Q15751|HERC1_HUMAN		0.473289	2.22394	8.37e-06	56.783	53.982	56.783																											S	RLVHTSPNYRLIKSRSESDLSQPESDEEGYA	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXXXXXXPPPPPPPPPPPPPPPPPP	S(0.284)RS(0.473)ES(0.219)DLS(0.024)QPESDEEGYALSGR	S(-2.22)RS(2.22)ES(-3.34)DLS(-13.02)QPES(-39.32)DEEGY(-52.92)ALS(-56.34)GR	3	3	-0.16378							0	0	0	0		0	0	0	0	0	0							0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0																					4424	2824	1512	1512	19884	22510			159107	141524	QE05101	26243	159107	141524	QE05101	26243	159107	141524	QE05101	26243
+sp|Q15751|HERC1_HUMAN	1514	sp|Q15751|HERC1_HUMAN	sp|Q15751|HERC1_HUMAN		0.330689	0.0	7.97e-05	45.193	39.23	45.193																											S	VHTSPNYRLIKSRSESDLSQPESDEEGYALS	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX	S(0.331)RS(0.331)ES(0.331)DLS(0.008)QPESDEEGYALSGR	S(0)RS(0)ES(0)DLS(-16.27)QPES(-35.13)DEEGY(-44.24)ALS(-45.11)GR	5	3	0.88872							0	0	0	0		0	0	0	0	0	0							0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0																					4425	2824	1514	1514	19884	22510			159108	141525	QE05102	26609	159108	141525	QE05102	26609	159108	141525	QE05102	26609
+sp|Q6ZN18-2|AEBP2_HUMAN;sp|Q6ZN18|AEBP2_HUMAN	18;18	sp|Q6ZN18-2|AEBP2_HUMAN	sp|Q6ZN18-2|AEBP2_HUMAN		0.998316	27.7896	1.21e-62	181.56	176.76	181.56																										2	S	AAITDMADLEELSRLSPLPPGSPGSAARGRA	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X	PPPPPPPPPPPPPPPPPPPPPPPPPPPPXXX	AAAITDMADLEELS(0.002)RLS(0.998)PLPPGS(0.809)PGS(0.191)AAR	AAAIT(-99.88)DMADLEELS(-27.79)RLS(27.79)PLPPGS(6.28)PGS(-6.28)AAR	17	3	0.97551	By matching	By matching	By matching	By MS/MS	By MS/MS	By MS/MS	499850000	0	499850000	0		2708200	3550900	192640000	104030000	20713000	176200000							0	2708200	0	0	3550900	0	0	192640000	0	0	104030000	0	0	20713000	0	0	176200000	0																					5468	3335	18	18	28	35	264;265;266;267;268;269	236;237;238;239	264	236	QE05100	65231	264	236	QE05100	65231	264	236	QE05100	65231
+sp|Q6ZN18-2|AEBP2_HUMAN;sp|Q6ZN18|AEBP2_HUMAN	24;24	sp|Q6ZN18-2|AEBP2_HUMAN	sp|Q6ZN18-2|AEBP2_HUMAN		0.809237	6.27624	1.21e-62	181.56	176.76	181.56																										2	S	ADLEELSRLSPLPPGSPGSAARGRAEPPEEE	X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	PPPPPPPPPPPPPPPPPPPPPPXXXXXXXXX	AAAITDMADLEELS(0.002)RLS(0.998)PLPPGS(0.809)PGS(0.191)AAR	AAAIT(-99.88)DMADLEELS(-27.79)RLS(27.79)PLPPGS(6.28)PGS(-6.28)AAR	23	3	0.97551	By matching	By matching	By matching	By MS/MS	By MS/MS	By MS/MS	499850000	0	499850000	0		2708200	3550900	192640000	104030000	20713000	176200000							0	2708200	0	0	3550900	0	0	192640000	0	0	104030000	0	0	20713000	0	0	176200000	0																					5469	3335	24	24	28	35	264;265;266;267;268;269	236;237;238;239	264	236	QE05100	65231	264	236	QE05100	65231	264	236	QE05100	65231
+sp|Q6ZN18-2|AEBP2_HUMAN;sp|Q6ZN18|AEBP2_HUMAN	206;206	sp|Q6ZN18-2|AEBP2_HUMAN	sp|Q6ZN18-2|AEBP2_HUMAN		0.999982	48.3708	1.18e-09	128.05	118.25	128.05																										1	S	TGGGGSSATSGGRRGSLEMSSDGEPLSRMDS	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXXXXXXPPPPPPPPPPPPPPPXXX	RGS(1)LEMSSDGEPLSR	RGS(48.37)LEMS(-48.37)S(-54.13)DGEPLS(-99.69)R	3	2	-0.10602	By MS/MS	By MS/MS	By MS/MS		By matching	By MS/MS	73663000	73663000	0	0		19262000	11103000	19454000	0	1816900	22028000							19262000	0	0	11103000	0	0	19454000	0	0	0	0	0	1816900	0	0	22028000	0	0																					5470	3335	206	206	17255	19413	137099;137100;137101;137102;137103	122913;122914;122915;122916	137099	122913	QE05097	23240	137099	122913	QE05097	23240	137099	122913	QE05097	23240
+		REV__sp|P35908|K22E_HUMAN	REV__sp|P35908|K22E_HUMAN		1	71.692	0.00457965	71.692	14.102	71.692																										1	S		X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXXXPPPPPPPPPXXXXXXXXXXXX	IIKELS(1)DGR	IIKELS(71.69)DGR	6	2	2.0005	By matching	By MS/MS	By matching	By matching		By matching	431850000	431850000	0	0	NaN	103010000	67359000	64124000	74201000	0	55805000	NaN	NaN	NaN	NaN	NaN	NaN	103010000	0	0	67359000	0	0	64124000	0	0	74201000	0	0	0	0	0	55805000	0	0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	+	+	61	57	252	252	11589	12932	93729;93730;93731;93732;93733;93734	87100	93729	87100	QE05098	47490	93729	87100	QE05098	47490	93729	87100	QE05098	47490
+		REV__sp|Q9NSB4|KRT82_HUMAN	REV__sp|Q9NSB4|KRT82_HUMAN		1	45.368	0.0161156	45.368	28.697	45.368																										1	S		X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXXXXXPPPPPPPPPPXXXXXXXXX	VDGS(1)VCDLRR	VDGS(45.37)VCDLRR	4	2	0.77096	By matching	By matching	By matching	By matching	By matching	By MS/MS	1670400000	1670400000	0	0	NaN	218420000	241200000	328130000	240860000	52984000	294390000	NaN	NaN	NaN	NaN	NaN	NaN	218420000	0	0	241200000	0	0	328130000	0	0	240860000	0	0	52984000	0	0	294390000	0	0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	+	+	62	58	330	330	22307	25289	178961;178962;178963;178964;178965;178966;178967	159240	178961	159240	QE05102	16922	178961	159240	QE05102	16922	178961	159240	QE05102	16922
+		REV__sp|Q6S5H4-2|POTEB_HUMAN	REV__sp|Q6S5H4-2|POTEB_HUMAN		1	51.2862	0.045235	51.286	32.662	51.286																											S		X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXXXXXXPPPPPPPPPXXXXXXXXX	EVS(1)EIEELK	EVS(51.29)EIEELK	3	2	0.81181		By matching	By matching	By matching	By matching	By matching	50767000	50767000	0	0	0.044169	0	8469100	14247000	11062000	1262600	15726000	0	0.056281	0.030122	0.051456	0.037786	0.081346	0	0	0	8469100	0	0	14247000	0	0	11062000	0	0	1262600	0	0	15726000	0	0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	+		63	59	22	22	8166	9110	66515;66516;66517;66518;66519	61714;61715	66516	61715	QE05100	38402	66516	61715	QE05100	38402	66516	61715	QE05100	38402
+sp|Q8IUD2-4|RB6I2_HUMAN;sp|Q8IUD2-2|RB6I2_HUMAN;sp|Q8IUD2-3|RB6I2_HUMAN;sp|Q8IUD2|RB6I2_HUMAN;sp|Q8IUD2-5|RB6I2_HUMAN;sp|O15083|ERC2_HUMAN	191;191;191;191;191;187	sp|Q8IUD2-4|RB6I2_HUMAN	sp|Q8IUD2-4|RB6I2_HUMAN		0.999998	58.0663	0.00181554	89.827	67.799	89.827																										1	S	ESKLSSSMNSIKTFWSPELKKERALRKDEAS	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXXXXXPPPPPPPPXXXXXXXXXXX	TFWS(1)PELK	T(-58.07)FWS(58.07)PELK	4	2	0.075831	By MS/MS	By MS/MS	By MS/MS	By MS/MS	By MS/MS	By MS/MS	138400000	138400000	0	0		29764000	20957000	24855000	30752000	8304800	23771000							29764000	0	0	20957000	0	0	24855000	0	0	30752000	0	0	8304800	0	0	23771000	0	0																					6037	3584	191	191	21148	23984	169817;169818;169819;169820;169821;169822	151176;151177;151178;151179;151180;151181	169822	151181	QE05102	49176	169822	151181	QE05102	49176	169822	151181	QE05102	49176
+sp|Q9NRX5|SERC1_HUMAN	364	sp|Q9NRX5|SERC1_HUMAN	sp|Q9NRX5|SERC1_HUMAN		0.999996	54.0798	2.24e-16	159.22	148.1	159.22																										1	S	DESTLIEDGGARSDGSLEDGDDVHRAVDNER	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXXXXXPPPPPPPPPPPPPXXXXXX	SDGS(1)LEDGDDVHR	S(-54.08)DGS(54.08)LEDGDDVHR	4	2	0.64808	By MS/MS	By MS/MS	By matching	By MS/MS	By MS/MS	By MS/MS	222110000	222110000	0	0		31407000	17665000	20892000	23194000	5132400	54893000							31407000	0	0	17665000	0	0	20892000	0	0	23194000	0	0	5132400	0	0	54893000	0	0																					8729	5187	364	364	17793	20026	141355;141356;141357;141358;141359;141360;141361;141362;141363;141364;141365	126543;126544;126545;126546;126547;126548;126549	141361	126549	QE05102	10564	141361	126549	QE05102	10564	141361	126549	QE05102	10564
+sp|Q9Y3B9|RRP15_HUMAN	11	sp|Q9Y3B9|RRP15_HUMAN	sp|Q9Y3B9|RRP15_HUMAN		0.997432	25.8922	9.39e-31	175.33	139.7	175.33																										1	S	_____MAAAAPDSRVSEEENLKKTPKKKMKM	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXPPPPPPPPPPPPPPPPXXXXXXXXX	AAAAPDS(0.003)RVS(0.997)EEENLK	AAAAPDS(-25.89)RVS(25.89)EEENLK	10	2	-0.029697	By matching	By matching	By MS/MS	By MS/MS	By MS/MS	By MS/MS	266450000	266450000	0	0		38150000	39445000	56305000	55338000	7010600	70203000							38150000	0	0	39445000	0	0	56305000	0	0	55338000	0	0	7010600	0	0	70203000	0	0																					9895	5791	11	11	12	17	158;159;160;161;162;163	166;167;168;169	159	167	QE05100	23225	159	167	QE05100	23225	159	167	QE05100	23225
+sp|Q15751|HERC1_HUMAN	3445	sp|Q15751|HERC1_HUMAN	sp|Q15751|HERC1_HUMAN		0.999981	47.2024	0.0187791	47.548	7.8172	47.548																										2	T	RVMTCVWCNKKGLLATSGNDGTIRVWNVTKK	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXXXPPPPPPPPPPPPPPXXXXXXX	KGLLAT(1)S(1)GNDGTIR	KGLLAT(47.2)S(47.22)GNDGT(-47.2)IR	6	2	-0.95722	By matching		By MS/MS			By matching	129800000	0	129800000	0		3921800	0	120850000	0	0	5021300							0	3921800	0	0	0	0	0	120850000	0	0	0	0	0	0	0	0	5021300	0																					10983	2824	3445	3445	12194	13609	98227;98228;98229	90789	98227	90789	QE05099	12004	98227	90789	QE05099	12004	98227	90789	QE05099	12004
+sp|O75379|VAMP4_HUMAN	30	sp|O75379|VAMP4_HUMAN	sp|O75379|VAMP4_HUMAN		1	67.6437	1.44E-52	203.56	187.24	67.644																										1	S	TGSVKSERRNLLEDDSDEEEDFFLRGPSGPR	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXXPPPPPPPPPPPPPPPPPPPPPP	NLLEDDS(1)DEEEDFFLR	NLLEDDS(67.64)DEEEDFFLR	7	3	-0.051914	By MS/MS	By MS/MS	By MS/MS	By MS/MS	By MS/MS	By MS/MS	7929000000	7929000000	0	0	NaN	1592100000	973800000	1011600000	1450300000	631970000	878760000	NaN	NaN	NaN	NaN	NaN	NaN	1592100000	0	0	973800000	0	0	1011600000	0	0	1450300000	0	0	631970000	0	0	878760000	0	0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN			963	669	30	30	15558;15559	17538;17539	124829;124830;124831;124832;124833;124834;124835;124836;124837;124838;124839;124840;124841;124842;124843;124844;124845;124846	112951;112952;112953;112954;112955;112956;112957;112958;112959;112960;112961;112962;112963;112964;112965;112966;112967;112968;112969;112970;112971;112972	124840	112969	QE05102	57877	124833	112957	QE05099	57820	124833	112957	QE05099	57820
+sp|O95183|VAMP5_HUMAN	48	sp|O95183|VAMP5_HUMAN	sp|O95183|VAMP5_HUMAN		0.72657	5.36697	5.72E-05	79.514	55.133	79.514																										1	S	KLAELQQRSDQLLDMSSTFNKTTQNLAQKKC	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXPPPPPPPPPPPPPXXXXXXXXXX	SDQLLDMS(0.727)S(0.211)T(0.062)FNK	S(-64.13)DQLLDMS(5.37)S(-5.37)T(-10.67)FNK	8	2	-0.18713	By matching	By matching	By MS/MS	By matching	By matching	By matching	86590000	86590000	0	0	0.032027	17447000	15753000	20219000	14001000	6284700	12885000	0.028348	0.025719	0.032895	0.033925	0.083789	0.034516	17447000	0	0	15753000	0	0	20219000	0	0	14001000	0	0	6284700	0	0	12885000	0	0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN			1189	809	48	48	17891	20149	142427;142428;142429;142430;142431;142432	127454	142427	127454	QE05099	48504	142427	127454	QE05099	48504	142427	127454	QE05099	48504
+sp|Q15836|VAMP3_HUMAN;sp|P63027|VAMP2_HUMAN	63;80	sp|Q15836|VAMP3_HUMAN	sp|Q15836|VAMP3_HUMAN		0.920811	10.6555	1.81E-09	124.1	98.278	107.25																										1	S	DRADALQAGASQFETSAAKLKRKYWWKNCKM	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXPPPPPPPPPPPPPPPPPXXXXXXXXXXXX	ADALQAGASQFET(0.079)S(0.921)AAK	ADALQAGAS(-49.99)QFET(-10.66)S(10.66)AAK	14	2	0.23449	By MS/MS	By MS/MS	By MS/MS	By MS/MS	By matching	By MS/MS	265240000	265240000	0	0	0.036151	44627000	41445000	69094000	42521000	5738000	61819000	0.03226	0.028442	0.039791	0.036967	0.030963	0.043392	44627000	0	0	41445000	0	0	69094000	0	0	42521000	0	0	5738000	0	0	61819000	0	0	0.47624	0.90925	12.188	0.51677	1.0694	7.2217	NaN	NaN	NaN	0.81588	4.4311	19.209	NaN	NaN	NaN	0.4388	0.78189	5.9861			4442	2836	63	63	279	319	2297;2298;2299;2300;2301;2302	1992;1993;1994;1995;1996	2300	1995	QE05100	30086	2301	1996	QE05102	30007	2301	1996	QE05102	30007
+sp|Q15836|VAMP3_HUMAN;sp|P63027|VAMP2_HUMAN;sp|P23763-2|VAMP1_HUMAN;sp|P23763-3|VAMP1_HUMAN;sp|P23763|VAMP1_HUMAN	44;61;63;63;63	sp|Q15836|VAMP3_HUMAN	sp|Q15836|VAMP3_HUMAN		1	65.4951	2.36E-06	126.19	98.602	65.495																										1	S	MRVNVDKVLERDQKLSELDDRADALQAGASQ	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXXXXPPPPPPPPPPXXXXXXXXXX	DQKLS(1)ELDDR	DQKLS(65.5)ELDDR	5	3	-0.72518	By MS/MS	By MS/MS	By MS/MS	By MS/MS	By matching	By MS/MS	412950000	412950000	0	0	NaN	75542000	44814000	32924000	35016000	11023000	4669900	NaN	NaN	NaN	NaN	NaN	NaN	75542000	0	0	44814000	0	0	32924000	0	0	35016000	0	0	11023000	0	0	4669900	0	0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN			4443	2836	44	44	4530	5083	37093;37094;37095;37096;37097;37098;37099;37100;37101;37102;37103;37104	34712;34713;34714;34715;34716;34717;34718;34719	37100	34719	QE05102	18436	37093	34712	QE05097	18245	37093	34712	QE05097	18245
+sp|Q15836|VAMP3_HUMAN	11	sp|Q15836|VAMP3_HUMAN	sp|Q15836|VAMP3_HUMAN		0.97018	15.1316	0.000117365	79.652	72.041	79.652																										1	S	_____MSTGPTAATGSNRRLQQTQNQVDEVV	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXPPPPPPPPPPPPPXXXXXXXXXXXX	STGPTAAT(0.03)GS(0.97)NRR	S(-66.94)T(-63.48)GPT(-42.47)AAT(-15.13)GS(15.13)NRR	10	2	-0.15791	By matching	By matching	By MS/MS	By matching	By matching	By MS/MS	34280000	34280000	0	0	NaN	3057100	4718800	12052000	5047700	1070900	8333500	NaN	NaN	NaN	NaN	NaN	NaN	3057100	0	0	4718800	0	0	12052000	0	0	5047700	0	0	1070900	0	0	8333500	0	0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN			4444	2836	11	11	20280	22978	162490;162491;162492;162493;162494;162495	144222;144223	162490	144222	QE05099	7582	162490	144222	QE05099	7582	162490	144222	QE05099	7582
+sp|Q9BV40|VAMP8_HUMAN	55	sp|Q9BV40|VAMP8_HUMAN	sp|Q9BV40|VAMP8_HUMAN		0.959784	13.7778	3.78E-05	91.969	27.98	91.969																										1	S	NLEHLRNKTEDLEATSEHFKTTSQKVARKFW	X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X	XXXXXXXXPPPPPPPPPPPPXXXXXXXXXXX	TEDLEAT(0.04)S(0.96)EHFK	T(-83.18)EDLEAT(-13.78)S(13.78)EHFK	8	2	0.40785	By matching	By matching	By matching			By MS/MS	114520000	114520000	0	0	NaN	20400000	9738500	7862300	0	0	76518000	NaN	NaN	NaN	NaN	NaN	NaN	20400000	0	0	9738500	0	0	7862300	0	0	0	0	0	0	0	0	76518000	0	0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN			7902	4687	55	55	21013	23827	168874;168875;168876;168877	150433	168874	150433	QE05102	19524	168874	150433	QE05102	19524	168874	150433	QE05102	19524
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_kinase_substrate.tabular	Mon Mar 07 19:05:01 2022 +0000
@@ -0,0 +1,2 @@
+GENE	KINASE	KIN_ACC_ID	KIN_ORGANISM	SUBSTRATE	SUB_GENE_ID	SUB_ACC_ID	SUB_GENE	SUB_ORGANISM	SUB_MOD_RSD	SITE_GRP_ID	SITE_+/-7_AA	DOMAIN	IN_VIVO_RXN	IN_VITRO_RXN	CST_CAT#
+Csnk2a1	CK2A1	Q60737	human	VAMP4	53330	O70480	Vamp4	human	S30	454285	RNLLEDDsDEEEDFF		 	X
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_networkin.tabular	Mon Mar 07 19:05:01 2022 +0000
@@ -0,0 +1,33 @@
+#substrate	position	id	networkin_score	tree	netphorest_group	netphorest_score	string_identifier	string_score	substrate_name	sequence	string_path
+VAMP4 (ENSP00000236192)	30	CK2alpha	35.6396	KIN	CK2_group	0.5228	ENSP00000236192	0.85	VAMP4	LLEDDsDEEED	"ENSP00000217244, 0.68 ENSP00000236192"
+SSRP1 (ENSP00000278412)	444	CK2alpha	28.6345	KIN	CK2_group	0.3768	ENSP00000278412	0.874	SSRP1	DEYADsDEDQH	"ENSP00000217244, 0.6992 ENSP00000278412"
+SSRP1 (ENSP00000278412)	667	CK2alpha	22.2088	KIN	CK2_group	0.3168	ENSP00000278412	0.874	SSRP1	SKEFVsSDESS	"ENSP00000217244, 0.6992 ENSP00000278412"
+HERC2 (ENSP00000261609)	1577	CK2alpha	10.7686	KIN	CK2_group	0.5253	ENSP00000261609	0.4514	HERC2	IGNEEsDLEEA	"ENSP00000217244, 0.764 ENSP00000346659, 0.76 ENSP00000261609"
+HERC2 (ENSP00000261609)	2928	CK2alpha	10.7686	KIN	CK2_group	0.4698	ENSP00000261609	0.4514	HERC2	VPFLAsDNEEE	"ENSP00000217244, 0.764 ENSP00000346659, 0.76 ENSP00000261609"
+RRP15 (ENSP00000355899)	11	CK2alpha	8.5484	KIN	CK2_group	0.3566	ENSP00000355899	0.461	RRP15	PDSRVsEEENL	"ENSP00000217244, 0.3688 ENSP00000355899"
+SSRP1 (ENSP00000278412)	444	CK2a2	7.8435	KIN	CK2_group	0.3768	ENSP00000278412	0.615	SSRP1	DEYADsDEDQH	"ENSP00000262506, 0.492 ENSP00000278412"
+SSRP1 (ENSP00000278412)	667	CK2a2	7.7757	KIN	CK2_group	0.3168	ENSP00000278412	0.615	SSRP1	SKEFVsSDESS	"ENSP00000262506, 0.492 ENSP00000278412"
+VAMP2 (ENSP00000314214)	80	PKD3	6.9217	KIN	PKD_group	0.0744	ENSP00000314214	0.949	VAMP2	SQFETsAAKLK	"ENSP00000234179, 0.7592 ENSP00000314214"
+VAMP2 (ENSP00000314214)	61	CK2alpha	6.3122	KIN	CK2_group	0.3338	ENSP00000314214	0.4391	VAMP2	RDQKLsELDDR	"ENSP00000217244, 0.7992 ENSP00000222812, 0.7544 ENSP00000314214"
+VAMP1 (ENSP00000380148)	63	CK2alpha	6.1363	KIN	CK2_group	0.3338	ENSP00000380148	0.4364	VAMP1	RDQKLsELDDR	"ENSP00000217244, 0.7944 ENSP00000222812, 0.7544 ENSP00000380148"
+ERC1 (ENSP00000354158)	191	IKKalpha	5.3194	KIN	IKKalpha_IKKbeta_group	0.031	ENSP00000354158	0.96	ERC1	IKTFWsPELKK	"ENSP00000359424, 0.768 ENSP00000354158"
+ERC1 (ENSP00000354158)	191	IKKalpha	5.3194	KIN	IKKalpha_IKKbeta_group	0.031	ENSP00000354158	0.96	ERC1	IKTFWsPELKK	"ENSP00000359424, 0.768 ENSP00000354158"
+VAMP2 (ENSP00000314214)	61	PKAbeta	4.9293	KIN	PKA_group	0.1153	ENSP00000314214	0.8	VAMP2	RDQKLsELDDR	"ENSP00000359719, 0.64 ENSP00000314214"
+VAMP2 (ENSP00000314214)	61	PKAgamma	4.9293	KIN	PKA_group	0.1153	ENSP00000314214	0.8	VAMP2	RDQKLsELDDR	"ENSP00000366488, 0.64 ENSP00000314214"
+VAMP3 (ENSP00000054666)	44	CK2alpha	4.2842	KIN	CK2_group	0.3338	ENSP00000054666	0.4201	VAMP3	RDQKLsELDDR	"ENSP00000217244, 0.7992 ENSP00000317714, 0.6792 ENSP00000054666"
+VAMP2 (ENSP00000314214)	80	PKCiota	3.8971	KIN	PKC_group	0.0928	ENSP00000314214	0.899	VAMP2	SQFETsAAKLK	"ENSP00000295797, 0.7192 ENSP00000314214"
+SSRP1 (ENSP00000278412)	444	CDK7	3.6159	KIN	CDK7	0.0186	ENSP00000278412	0.903	SSRP1	DEYADsDEDQH	"ENSP00000256443, 0.7224 ENSP00000278412"
+SSRP1 (ENSP00000278412)	444	CK1alpha	3.3573	KIN	CK1_group	0.1264	ENSP00000278412	0.404	SSRP1	DEYADsDEDQH	"ENSP00000261798, 0.3232 ENSP00000278412"
+VAMP3 (ENSP00000054666)	11	PKCalpha	3.0633	KIN	PKC_group	0.4633	ENSP00000054666	0.3277	VAMP3	TAATGsNRRLQ	"ENSP00000284384, 0.6232 ENSP00000359025, 0.6352 ENSP00000054666"
+SSRP1 (ENSP00000278412)	659	PKCalpha	3.0524	KIN	PKC_group	0.4345	ENSP00000278412	0.237	SSRP1	RQLSEsFKSKE	"ENSP00000284384, 0.4552 ENSP00000351885, 0.76 ENSP00000278412"
+VAMP2 (ENSP00000314214)	61	PKCiota	2.7785	KIN	PKC_group	0.0463	ENSP00000314214	0.899	VAMP2	RDQKLsELDDR	"ENSP00000295797, 0.7192 ENSP00000314214"
+SSRP1 (ENSP00000278412)	659	CDK7	2.5961	KIN	CDK7	0.0104	ENSP00000278412	0.903	SSRP1	RQLSEsFKSKE	"ENSP00000256443, 0.7224 ENSP00000278412"
+SSRP1 (ENSP00000278412)	667	CDK7	2.5961	KIN	CDK7	0.0124	ENSP00000278412	0.903	SSRP1	SKEFVsSDESS	"ENSP00000256443, 0.7224 ENSP00000278412"
+ERC1 (ENSP00000354158)	191	IKKbeta	2.571	KIN	IKKalpha_IKKbeta_group	0.031	ENSP00000354158	0.946	ERC1	IKTFWsPELKK	"ENSP00000339151, 0.7568 ENSP00000354158"
+ERC1 (ENSP00000354158)	191	IKKbeta	2.571	KIN	IKKalpha_IKKbeta_group	0.031	ENSP00000354158	0.946	ERC1	IKTFWsPELKK	"ENSP00000339151, 0.7568 ENSP00000354158"
+SSRP1 (ENSP00000278412)	659	PKCbeta	2.4948	KIN	PKC_group	0.4345	ENSP00000278412	0.1743	SSRP1	RQLSEsFKSKE	"ENSP00000305355, 0.7976 ENSP00000366013, 0.7192 ENSP00000284811, 0.7448 ENSP00000278412"
+VAMP3 (ENSP00000054666)	11	PKCbeta	2.4948	KIN	PKC_group	0.4633	ENSP00000054666	0.2393	VAMP3	TAATGsNRRLQ	"ENSP00000305355, 0.512 ENSP00000348986, 0.7616 ENSP00000054666"
+SSRP1 (ENSP00000278412)	659	CK2a2	2.4345	KIN	CK2_group	0.0356	ENSP00000278412	0.615	SSRP1	RQLSEsFKSKE	"ENSP00000262506, 0.492 ENSP00000278412"
+ERC1 (ENSP00000354158)	191	HIPK2	2.2748	KIN	HIPK1_HIPK2_group	0.0463	ENSP00000354158	0.4159	ERC1	IKTFWsPELKK	"ENSP00000263551, 0.7696 ENSP00000286332, 0.7192 ENSP00000354158"
+VAMP3 (ENSP00000054666)	11	PKCzeta	2.0773	KIN	PKC_group	0.4633	ENSP00000054666	0.4263	VAMP3	TAATGsNRRLQ	"ENSP00000367830, 0.7688 ENSP00000320935, 0.796 ENSP00000054666"
+SSRP1 (ENSP00000278412)	659	DNAPK	2.0042	KIN	DNAPK	0.0584	ENSP00000278412	0.56	SSRP1	RQLSEsFKSKE	"ENSP00000313420, 0.448 ENSP00000278412"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_regulatory_sites.tabular	Mon Mar 07 19:05:01 2022 +0000
@@ -0,0 +1,8 @@
+32017
+"PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words ""PhosphoSitePlus(R), www.phosphosite.org"" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: ""Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926."""
+
+GENE	PROTEIN	PROT_TYPE	ACC_ID	GENE_ID	HU_CHR_LOC	ORGANISM	MOD_RSD	SITE_GRP_ID	SITE_+/-7_AA	DOMAIN	ON_FUNCTION	ON_PROCESS	ON_PROT_INTERACT	ON_OTHER_INTERACT	PMIDs	LT_LIT	MS_LIT	MS_CST	NOTES
+ENSA	ENSA	"Inhibitor; Protein phosphatase, regulatory subunit"	O43768	2029	1q21.3	human	S109-p	477819	DLPQRKSsLVTSKLA	Endosulfine	"molecular association, regulation; protein conformation"		SNCA(DISRUPTS)		18973346	1	34	50
+VAMP8	VAMP8	"Membrane protein, integral; Vesicle"	Q9BV40	8673	2p11.2	human	S55-p	12738929	TEDLEATsEHFKTTS	Synaptobrevin	"activity, inhibited"				27402227	1	8	0	"abolish function in SNARE complex during mast cell secretion, reduces in vitro ensemble vesicle fusion"
+ENSA	ENSA	"Inhibitor; Protein phosphatase, regulatory subunit"	O43768	2029	1q21.3	human	S67-p	455934	KGQKYFDsGDYNMAK	Endosulfine	"molecular association, regulation"	cell cycle regulation	PPP2CA(INDUCES)		27889260	3	56	47
+Vamp4	VAMP4	"Membrane protein, integral; Vesicle"	O70480	53330	1 H2.1|1 70.29 cM	mouse	S30-p	454285	RNLLEDDsDEEEDFF		"molecular association, regulation; intracellular localization"		PACS-1(INDUCES)		14608369	1	64	10
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_swissprot.fasta	Mon Mar 07 19:05:01 2022 +0000
@@ -0,0 +1,68 @@
+>sp|Q9Y3B9|RRP15_HUMAN RRP15-like protein OS=Homo sapiens OX=9606 GN=RRP15 PE=1 SV=2
+MAAAAPDSRVSEEENLKKTPKKKMKMVTGAVASVLEDEATDTSDSEGSCGSEKDHFYSDDDAIEADSEGDAEPCDKENENDGESSVGTNMGWADAMAKVLNKKTPESKPTILVKNKKLEKEKEKLKQERLEKIKQRDKRLEWEMMCRVKPDVVQDKETERNLQRIATRGVVQLFNAVQKHQKNVDEKVKEAGSSMRKRAKLISTVSKKDFISVLRGMDGSTNETASSRKKPKAKQTEVKSEEGPGWTILRDDFMMGASMKDWDKESDGPDDSRPESASDSDT
+>sp|Q08945|SSRP1_HUMAN FACT complex subunit SSRP1 OS=Homo sapiens OX=9606 GN=SSRP1 PE=1 SV=1
+MAETLEFNDVYQEVKGSMNDGRLRLSRQGIIFKNSKTGKVDNIQAGELTEGIWRRVALGHGLKLLTKNGHVYKYDGFRESEFEKLSDFFKTHYRLELMEKDLCVKGWNWGTVKFGGQLLSFDIGDQPVFEIPLSNVSQCTTGKNEVTLEFHQNDDAEVSLMEVRFYVPPTQEDGVDPVEAFAQNVLSKADVIQATGDAICIFRELQCLTPRGRYDIRIYPTFLHLHGKTFDYKIPYTTVLRLFLLPHKDQRQMFFVISLDPPIKQGQTRYHFLILLFSKDEDISLTLNMNEEEVEKRFEGRLTKNMSGSLYEMVSRVMKALVNRKITVPGNFQGHSGAQCITCSYKASSGLLYPLERGFIYVHKPPVHIRFDEISFVNFARGTTTTRSFDFEIETKQGTQYTFSSIEREEYGKLFDFVNAKKLNIKNRGLKEGMNPSYDEYADSDEDQHDAYLERMKEEGKIREENANDSSDDSGEETDESFNPGEEEEDVAEEFDSNASASSSSNEGDSDRDEKKRKQLKKAKMAKDRKSRKKPVEVKKGKDPNAPKRPMSAYMLWLNASREKIKSDHPGISITDLSKKAGEIWKGMSKEKKEEWDRKAEDARRDYEKAMKEYEGGRGESSKRDKSKKKKKVKVKMEKKSTPSRGSSSKSSSRQLSESFKSKEFVSSDESSSGENKSKKKRRRSEDSEEEELASTPPSSEDSASGSDE
+>sp|Q96SA4|SERC2_HUMAN Serine incorporator 2 OS=Homo sapiens OX=9606 GN=SERINC2 PE=2 SV=3
+MGACLGACSLLSCASCLCGSAPCILCSCCPASRNSTVSRLIFTFFLFLGVLVSIIMLSPGVESQLYKLPWVCEEGAGIPTVLQGHIDCGSLLGYRAVYRMCFATAAFFFFFTLLMLCVSSSRDPRAAIQNGFWFFKFLILVGLTVGAFYIPDGSFTNIWFYFGVVGSFLFILIQLVLLIDFAHSWNQRWLGKAEECDSRAWYAGLFFFTLLFYLLSIAAVALMFMYYTEPSGCHEGKVFISLNLTFCVCVSIAAVLPKVQDAQPNSGLLQASVITLYTMFVTWSALSSIPEQKCNPHLPTQLGNETVVAGPEGYETQWWDAPSIVGLIIFLLCTLFISLRSSDHRQVNSLMQTEECPPMLDATQQQQQVAACEGRAFDNEQDGVTYSYSFFHFCLVLASLHVMMTLTNWYKPGETRKMISTWTAVWVKICASWAGLLLYLWTLVAPLLLRNRDFS
+>sp|Q96SA4-2|SERC2_HUMAN Isoform 2 of Serine incorporator 2 OS=Homo sapiens OX=9606 GN=SERINC2
+MGAEGAPDFLSCPRVRRASCLCGSAPCILCSCCPASRNSTVSRLIFTFFLFLGVLVSIIMLSPGVESQLYKLPWVCEEGAGIPTVLQGHIDCGSLLGYRAVYRMCFATAAFFFFFTLLMLCVSSSRDPRAAIQNGFWFFKFLILVGLTVGAFYIPDGSFTNIWFYFGVVGSFLFILIQLVLLIDFAHSWNQRWLGKAEECDSRAWYAGLFFFTLLFYLLSIAAVALMFMYYTEPSGCHEGKVFISLNLTFCVCVSIAAVLPKVQDAQPNSGLLQASVITLYTMFVTWSALSSIPEQKCNPHLPTQLGNETVVAGPEGYETQWWDAPSIVGLIIFLLCTLFISLRSSDHRQVNSLMQTEECPPMLDATQQQQQVAACEGRAFDNEQDGVTYSYSFFHFCLVLASLHVMMTLTNWYKPGETRKMISTWTAVWVKICASWAGLLLYLWTLVAPLLLRNRDFS
+>sp|Q96SA4-3|SERC2_HUMAN Isoform 3 of Serine incorporator 2 OS=Homo sapiens OX=9606 GN=SERINC2
+MRSMRLREEESPGPSHTASCLCGSAPCILCSCCPASRNSTVSRLIFTFFLFLGVLVSIIMLSPGVESQLYKLPWVCEEGAGIPTVLQGHIDCGSLLGYRAVYRMCFATAAFFFFFTLLMLCVSSSRDPRAAIQNGFWFFKFLILVGLTVGAFYIPDGSFTNIWFYFGVVGSFLFILIQLVLLIDFAHSWNQRWLGKAEECDSRAWYAGLFFFTLLFYLLSIAAVALMFMYYTEPSGCHEGKVFISLNLTFCVCVSIAAVLPKVQDAQPNSGLLQASVITLYTMFVTWSALSSIPEQKCNPHLPTQLGNETVVAGPEGYETQWWDAPSIVGLIIFLLCTLFISLRSSDHRQVNSLMQTEECPPMLDATQQQQQVAACEGRAFDNEQDGVTYSYSFFHFCLVLASLHVMMTLTNWYKPGETRKMISTWTAVWVKICASWAGLLLYLWTLVAPLLLRNRDFS
+>sp|Q96SA4-4|SERC2_HUMAN Isoform 4 of Serine incorporator 2 OS=Homo sapiens OX=9606 GN=SERINC2
+MDGRMMRSMRLREEESPGPSHTASCLCGSAPCILCSCCPASRNSTVSRLIFTFFLFLGVLVSIIMLSPGVESQLYKLPWVCEEGAGIPTVLQGHIDCGSLLGYRAVYRMCFATAAFFFFFTLLMLCVSSSRDPRAAIQNGFWFFKFLILVGLTVGAFYIPDGSFTNIWFYFGVVGSFLFILIQLVLLIDFAHSWNQRWLGKAEECDSRAWYAGLFFFTLLFYLLSIAAVALMFMYYTEPSGCHEGKVFISLNLTFCVCVSIAAVLPKVQDAQPNSGLLQASVITLYTMFVTWSALSSIPEQKCNPHLPTQLGNETVVAGPEGYETQWWDAPSIVGLIIFLLCTLFISLRSSDHRQVNSLMQTEECPPMLDATQQQQQVAACEGRAFDNEQDGVTYSYSFFHFCLVLASLHVMMTLTNWYKPGETRKMISTWTAVWVKICASWAGLLLYLWTLVAPLLLRNRDFS
+>sp|Q9NRX5|SERC1_HUMAN Serine incorporator 1 OS=Homo sapiens OX=9606 GN=SERINC1 PE=1 SV=1
+MGSVLGLCSMASWIPCLCGSAPCLLCRCCPSGNNSTVTRLIYALFLLVGVCVACVMLIPGMEEQLNKIPGFCENEKGVVPCNILVGYKAVYRLCFGLAMFYLLLSLLMIKVKSSSDPRAAVHNGFWFFKFAAAIAIIIGAFFIPEGTFTTVWFYVGMAGAFCFILIQLVLLIDFAHSWNESWVEKMEEGNSRCWYAALLSATALNYLLSLVAIVLFFVYYTHPASCSENKAFISVNMLLCVGASVMSILPKIQESQPRSGLLQSSVITVYTMYLTWSAMTNEPETNCNPSLLSIIGYNTTSTVPKEGQSVQWWHAQGIIGLILFLLCVFYSSIRTSNNSQVNKLTLTSDESTLIEDGGARSDGSLEDGDDVHRAVDNERDGVTYSYSFFHFMLFLASLYIMMTLTNWYRYEPSREMKSQWTAVWVKISSSWIGIVLYVWTLVAPLVLTNRDFD
+>sp|O43768|ENSA_HUMAN Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA PE=1 SV=1
+MSQKQEEENPAEETGEEKQDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGQKYFDSGDYNMAKAKMKNKQLPSAGPDKNLVTGDHIPTPQDLPQRKSSLVTSKLAGGQVE
+>sp|O43768-2|ENSA_HUMAN Isoform 2 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA
+MSQKQEEENPAEETGEEKQDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGQKYFDSGDYNMAKAKMKNKQLPSAGPDKNLVTGDHIPTPQDLPQRKSSLVTSKLAG
+>sp|O43768-3|ENSA_HUMAN Isoform 3 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA
+MSQKQEEENPAEETGEEKQDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGDYKSLHWSVLLCADEMQKYFDSGDYNMAKAKMKNKQLPSAGPDKNLVTGDHIPTPQDLPQRKSSLVTSKLAGGQVE
+>sp|O43768-4|ENSA_HUMAN Isoform 4 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA
+MSQKQEEENPAEETGEEKQDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGVWGIASYPLSLGLKEVLRMKSVEQKYFDSGDYNMAKAKMKNKQLPSAGPDKNLVTGDHIPTPQDLPQRKSSLVTSKLAG
+>sp|O43768-5|ENSA_HUMAN Isoform 5 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA
+MAGGLGCDVCYWFVEDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGQKYFDSGDYNMAKAKMKNKQLPSAGPDKNLVTGDHIPTPQDLPQRKSSLVTSKLAGGQVE
+>sp|O43768-6|ENSA_HUMAN Isoform 6 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA
+MAGGLGCDVCYWFVEDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGQKYFDSGDYNMAKAKMKNKQLPSAGPDKNLVTGDHIPTPQDLPQRKSSLVTSKLAG
+>sp|O43768-7|ENSA_HUMAN Isoform 7 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA
+MAGGLGCDVCYWFVEDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGDYKSLHWSVLLCADEMQKYFDSGDYNMAKAKMKNKQLPSAGPDKNLVTGDHIPTPQDLPQRKSSLVTSKLAGGQVE
+>sp|O43768-8|ENSA_HUMAN Isoform 8 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA
+MSQKQEEENPAEETGEEKQDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGVWGIVSYPLSLELKEVLRMKSVEVLLDPFLEVLLLNRSRGEFEI
+>sp|O43768-9|ENSA_HUMAN Isoform 9 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA
+MSQKQEEENPAEETGEEKQDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGDYKSLHWSVLLCADEMQKYFDSGDYNMAKAKMKNKQLPSAGPDKNLVTGDHIPTPQDLPQRKSSLVTSKLAG
+>sp|Q15751|HERC1_HUMAN Probable E3 ubiquitin-protein ligase HERC1 OS=Homo sapiens OX=9606 GN=HERC1 PE=1 SV=2
+MATMIPPVKLKWLEHLNSSWITEDSESIATREGVAVLYSKLVSNKEVVPLPQQVLCLKGPQLPDFERESLSSDEQDHYLDALLSSQLALAKMVCSDSPFAGALRKRLLVLQRVFYALSNKYHDKGKVKQQQHSPESSSGSADVHSVSERPRSSTDALIEMGVRTGLSLLFALLRQSWMMPVSGPGLSLCNDVIHTAIEVVSSLPPLSLANESKIPPMGLDCLSQVTTFLKGVTIPNSGADTLGRRLASELLLGLAAQRGSLRYLLEWIEMALGASAVVHTMEKGKLLSSQEGMISFDCFMTILMQMRRSLGSSADRSQWREPTRTSDGLCSLYEAALCLFEEVCRMASDYSRTCASPDSIQTGDAPIVSETCEVYVWGSNSSHQLVEGTQEKILQPKLAPSFSDAQTIEAGQYCTFVISTDGSVRACGKGSYGRLGLGDSNNQSTLKKLTFEPHRSIKKVSSSKGSDGHTLAFTTEGEVFSWGDGDYGKLGHGNSSTQKYPKLIQGPLQGKVVVCVSAGYRHSAAVTEDGELYTWGEGDFGRLGHGDSNSRNIPTLVKDISNVGEVSCGSSHTIALSKDGRTVWSFGGGDNGKLGHGDTNRVYKPKVIEALQGMFIRKVCAGSQSSLALTSTGQVYAWGCGACLGCGSSEATALRPKLIEELAATRIVDVSIGDSHCLALSHDNEVYAWGNNSMGQCGQGNSTGPITKPKKVSGLDGIAIQQISAGTSHSLAWTALPRDRQVVAWHRPYCVDLEESTFSHLRSFLERYCDKINSEIPPLPFPSSREHHSFLKLCLKLLSNHLALALAGGVATSILGRQAGPLRNLLFRLMDSTVPDEIQEVVIETLSVGATMLLPPLRERMELLHSLLPQGPDRWESLSKGQRMQLDIILTSLQDHTHVASLLGYSSPSDAADLSSVCTGYGNLSDQPYGTQSCHPDTHLAEILMKTLLRNLGFYTDQAFGELEKNSDKFLLGTSSSENSQPAHLHELLCSLQKQLLAFCHINNISENSSSVALLHKHLQLLLPHATDIYSRSANLLKESPWNGSVGEKLRDVIYVSAAGSMLCQIVNSLLLLPVSVARPLLSYLLDLLPPLDCLNRLLPAADLLEDQELQWPLHGGPELIDPAGLPLPQPAQSWVWLVDLERTIALLIGRCLGGMLQGSPVSPEEQDTAYWMKTPLFSDGVEMDTPQLDKCMSCLLEVALSGNEEQKPFDYKLRPEIAVYVDLALGCSKEPARSLWISMQDYAVSKDWDSATLSNESLLDTVSRFVLAALLKHTNLLSQACGESRYQPGKHLSEVYRCVYKVRSRLLACKNLELIQTRSSSRDRWISENQDSADVDPQEHSFTRTIDEEAEMEEQAERDREEGHPEPEDEEEEREHEVMTAGKIFQCFLSAREVARSRDRDRMNSGAGSGARADDPPPQSQQERRVSTDLPEGQDVYTAACNSVIHRCALLILGVSPVIDELQKRREEGQLQQPSTSASEGGGLMTRSESLTAESRLVHTSPNYRLIKSRSESDLSQPESDEEGYALSGRRNVDLDLAASHRKRGPMHSQLESLSDSWARLKHSRDWLCNSSYSFESDFDLTKSLGVHTLIENVVSFVSGDVGNAPGFKEPEESMSTSPQASIIAMEQQQLRAELRLEALHQILVLLSGMEEKGSISLAGSRLSSGFQSSTLLTSVRLQFLAGCFGLGTVGHTGGKGESGRLHHYQDGIRAAKRNIQIEIQVAVHKIYQQLSATLERALQANKHHIEAQQRLLLVTVFALSVHYQPVDVSLAISTGLLNVLSQLCGTDTMLGQPLQLLPKTGVSQLSTALKVASTRLLQILAITTGTYADKLSPKVVQSLLDLLCSQLKNLLSQTGVLHMASFGEGEQEDGEEEEKKVDSSGETEKKDFRAALRKQHAAELHLGDFLVFLRRVVSSKAIQSKMASPKWTEVLLNIASQKCSSGIPLVGNLRTRLLALHVLEAVLPACESGVEDDQMAQIVERLFSLLSDCMWETPIAQ
AKHAIQIKEKEQEIKLQKQGELEEEDENLPIQEVSFDPEKAQCCLVENGQILTHGSGGKGYGLASTGVTSGCYQWKFYIVKENRGNEGTCVGVSRWPVHDFNHRTTSDMWLYRAYSGNLYHNGEQTLTLSSFTQGDFITCVLDMEARTISFGKNGEEPKLAFEDVDAAELYPCVMFYSSNPGEKVKICDMQMRGTPRDLLPGDPICSPVAAVLAEATIQLIRILHRTDRWTYCINKKMMERLHKIKICIKESGQKLKKSRSVQSREENEMREEKESKEEEKGKHTRHGLADLSELQLRTLCIEVWPVLAVIGGVDAGLRVGGRCVHKQTGRHATLLGVVKEGSTSAKVQWDEAEITISFPTFWSPSDTPLYNLEPCEPLPFDVARFRGLTASVLLDLTYLTGVHEDMGKQSTKRHEKKHRHESEEKGDVEQKPESESALDMRTGLTSDDVKSQSTTSSKSENEIASFSLDPTLPSVESQHQITEGKRKNHEHMSKNHDVAQSEIRAVQLSYLYLGAMKSLSALLGCSKYAELLLIPKVLAENGHNSDCASSPVVHEDVEMRAALQFLMRHMVKRAVMRSPIKRALGLADLERAQAMIYKLVVHGLLEDQFGGKIKQEIDQQAEESDPAQQAQTPVTTSPSASSTTSFMSSSLEDTTTATTPVTDTETVPASESPGVMPLSLLRQMFSSYPTTTVLPTRRAQTPPISSLPTSPSDEVGRRQSLTSPDSQSARPANRTALSDPSSRLSTSPPPPAIAVPLLEMGFSLRQIAKAMEATGARGEADAQNITVLAMWMIEHPGHEDEEEPQSGSTADSRPGAAVLGSGGKSNDPCYLQSPGDIPSADAAEMEEGFSESPDNLDHTENAASGSGPSARGRSAVTRRHKFDLAARTLLARAAGLYRSVQAHRNQSRREGISLQQDPGALYDFNLDEELEIDLDDEAMEAMFGQDLTSDNDILGMWIPEVLDWPTWHVCESEDREEVVVCELCECSVVSFNQHMKRNHPGCGRSANRQGYRSNGSYVDGWFGGECGSGNPYYLLCGTCREKYLAMKTKSKSTSSERYKGQAPDLIGKQDSVYEEDWDMLDVDEDEKLTGEEEFELLAGPLGLNDRRIVPEPVQFPDSDPLGASVAMVTATNSMEETLMQIGCHGSVEKSSSGRITLGEQAAALANPHDRVVALRRVTAAAQVLLARTMVMRALSLLSVSGSSCSLAAGLESLGLTDIRTLVRLMCLAAAGRAGLSTSPSAMASTSERSRGGHSKANKPISCLAYLSTAVGCLASNAPSAAKLLVQLCTQNLISAATGVNLTTVDDSIQRKFLPSFLRGIAEENKLVTSPNFVVTQALVALLADKGAKLRPNYDKSEVEKKGPLELANALAACCLSSRLSSQHRQWAAQQLVRTLAAHDRDNQTTLQTLADMGGDLRKCSFIKLEAHQNRVMTCVWCNKKGLLATSGNDGTIRVWNVTKKQYSLQQTCVFNRLEGDAEESLGSPSDPSFSPVSWSISGKYLAGALEKMVNIWQVNGGKGLVDIQPHWVSALAWPEEGPATAWSGESPELLLVGRMDGSLGLIEVVDVSTMHRRELEHCYRKDVSVTCIAWFSEDRPFAVGYFDGKLLLGTKEPLEKGGIVLIDAHKDTLISMKWDPTGHILMTCAKEDSVKLWGSISGCWCCLHSLCHPSIVNGIAWCRLPGKGSKLQLLMATGCQSGLVCVWRIPQDTTQTNVTSAEGWWEQESNCQDGYRKSSGAKCVYQLRGHITPVRTVAFSSDGLALVSGGLGGLMNIWSLRDGSVLQTVVIGSGAIQTTVWIPEVGVAACSNRSKDVLVVNCTAEWAAANHVLATCRTALKQQGVLGLNMAPCMRAFLERLPMMLQEQYAYEKPHVVCGDQLVHSPYMQCLASLAVGLHLDQLLCNPPVPPHHQNCLPDPASWNPNEWAWLECFSTTIKAAEALTNGAQFPESFTVPDLEPVPEDELVFLMDNSKWINGMDEQIMSWATSRPEDWHLGGKCDV
YLWGAGRHGQLAEAGRNVMVPAAAPSFSQAQQVICGQNCTFVIQANGTVLACGEGSYGRLGQGNSDDLHVLTVISALQGFVVTQLVTSCGSDGHSMALTESGEVFSWGDGDYGKLGHGNSDRQRRPRQIEALQGEEVVQMSCGFKHSAVVTSDGKLFTFGNGDYGRLGLGNTSNKKLPERVTALEGYQIGQVACGLNHTLAVSADGSMVWAFGDGDYGKLGLGNSTAKSSPQKIDVLCGIGIKKVACGTQFSVALTKDGHVYTFGQDRLIGLPEGRARNHNRPQQIPVLAGVIIEDVAVGAEHTLALASNGDVYAWGSNSEGQLGLGHTNHVREPTLVTGLQGKNVRQISAGRCHSAAWTAPPVPPRAPGVSVPLQLGLPDTVPPQYGALREVSIHTVRARLRLLYHFSDLMYSSWRLLNLSPNNQNSTSHYNAGTWGIVQGQLRPLLAPRVYTLPMVRSIGKTMVQGKNYGPQITVKRISTRGRKCKPIFVQIARQVVKLNASDLRLPSRAWKVKLVGEGADDAGGVFDDTITEMCQELETGIVDLLIPSPNATAEVGYNRDRFLFNPSACLDEHLMQFKFLGILMGVAIRTKKPLDLHLAPLVWKQLCCVPLTLEDLEEVDLLYVQTLNSILHIEDSGITEESFHEMIPLDSFVGQSADGKMVPIIPGGNSIPLTFSNRKEYVERAIEYRLHEMDRQVAAVREGMSWIVPVPLLSLLTAKQLEQMVCGMPEISVEVLKKVVRYREVDEQHQLVQWFWHTLEEFSNEERVLFMRFVSGRSRLPANTADISQRFQIMKVDRPYDSLPTSQTCFFQLRLPPYSSQLVMAERLRYAINNCRSIDMDNYMLSRNVDNAEGSDTDY
+>sp|O95714|HERC2_HUMAN E3 ubiquitin-protein ligase HERC2 OS=Homo sapiens OX=9606 GN=HERC2 PE=1 SV=2
+MPSESFCLAAQARLDSKWLKTDIQLAFTRDGLCGLWNEMVKDGEIVYTGTESTQNGELPPRKDDSVEPSGTKKEDLNDKEKKDEEETPAPIYRAKSILDSWVWGKQPDVNELKECLSVLVKEQQALAVQSATTTLSALRLKQRLVILERYFIALNRTVFQENVKVKWKSSGISLPPVDKKSSRPAGKGVEGLARVGSRAALSFAFAFLRRAWRSGEDADLCSELLQESLDALRALPEASLFDESTVSSVWLEVVERATRFLRSVVTGDVHGTPATKGPGSIPLQDQHLALAILLELAVQRGTLSQMLSAILLLLQLWDSGAQETDNERSAQGTSAPLLPLLQRFQSIICRKDAPHSEGDMHLLSGPLSPNESFLRYLTLPQDNELAIDLRQTAVVVMAHLDRLATPCMPPLCSSPTSHKGSLQEVIGWGLIGWKYYANVIGPIQCEGLANLGVTQIACAEKRFLILSRNGRVYTQAYNSDTLAPQLVQGLASRNIVKIAAHSDGHHYLALAATGEVYSWGCGDGGRLGHGDTVPLEEPKVISAFSGKQAGKHVVHIACGSTYSAAITAEGELYTWGRGNYGRLGHGSSEDEAIPMLVAGLKGLKVIDVACGSGDAQTLAVTENGQVWSWGDGDYGKLGRGGSDGCKTPKLIEKLQDLDVVKVRCGSQFSIALTKDGQVYSWGKGDNQRLGHGTEEHVRYPKLLEGLQGKKVIDVAAGSTHCLALTEDSEVHSWGSNDQCQHFDTLRVTKPEPAALPGLDTKHIVGIACGPAQSFAWSSCSEWSIGLRVPFVVDICSMTFEQLDLLLRQVSEGMDGSADWPPPQEKECVAVATLNLLRLQLHAAISHQVDPEFLGLGLGSILLNSLKQTVVTLASSAGVLSTVQSAAQAVLQSGWSVLLPTAEERARALSALLPCAVSGNEVNISPGRRFMIDLLVGSLMADGGLESALHAAITAEIQDIEAKKEAQKEKEIDEQEANASTFHRSRTPLDKDLINTGICESSGKQCLPLVQLIQQLLRNIASQTVARLKDVARRISSCLDFEQHSRERSASLDLLLRFQRLLISKLYPGESIGQTSDISSPELMGVGSLLKKYTALLCTHIGDILPVAASIASTSWRHFAEVAYIVEGDFTGVLLPELVVSIVLLLSKNAGLMQEAGAVPLLGGLLEHLDRFNHLAPGKERDDHEELAWPGIMESFFTGQNCRNNEEVTLIRKADLENHNKDGGFWTVIDGKVYDIKDFQTQSLTGNSILAQFAGEDPVVALEAALQFEDTRESMHAFCVGQYLEPDQEIVTIPDLGSLSSPLIDTERNLGLLLGLHASYLAMSTPLSPVEIECAKWLQSSIFSGGLQTSQIHYSYNEEKDEDHCSSPGGTPASKSRLCSHRRALGDHSQAFLQAIADNNIQDHNVKDFLCQIERYCRQCHLTTPIMFPPEHPVEEVGRLLLCCLLKHEDLGHVALSLVHAGALGIEQVKHRTLPKSVVDVCRVVYQAKCSLIKTHQEQGRSYKEVCAPVIERLRFLFNELRPAVCNDLSIMSKFKLLSSLPRWRRIAQKIIRERRKKRVPKKPESTDDEEKIGNEESDLEEACILPHSPINVDKRPIAIKSPKDKWQPLLSTVTGVHKYKWLKQNVQGLYPQSPLLSTIAEFALKEEPVDVEKMRKCLLKQLERAEVRLEGIDTILKLASKNFLLPSVQYAMFCGWQRLIPEGIDIGEPLTDCLKDVDLIPPFNRMLLEVTFGKLYAWAVQNIRNVLMDASAKFKELGIQPVPLQTITNENPSGPSLGTIPQARFLLVMLSMLTLQHGANNLDLLLNSGMLALTQTALRLIGPSCDNVEEDMNASAQGASATVLEETRKETAPVQLPVSGPELAAMMKIGTRVMRGVDWKWGDQDGPPPGLGRVIGELGEDGWIRVQWDTGSTNSYRMGKEGKYDLKLAELPAAAQPSAEDSDTEDDSEAEQTERNIHPTAMMFTSTINLLQTLCLSAGVHAEIMQSEATKTLCGLLRM
LVESGTTDKTSSPNRLVYREQHRSWCTLGFVRSIALTPQVCGALSSPQWITLLMKVVEGHAPFTATSLQRQILAVHLLQAVLPSWDKTERARDMKCLVEKLFDFLGSLLTTCSSDVPLLRESTLRRRRVRPQASLTATHSSTLAEEVVALLRTLHSLTQWNGLINKYINSQLRSITHSFVGRPSEGAQLEDYFPDSENPEVGGLMAVLAVIGGIDGRLRLGGQVMHDEFGEGTVTRITPKGKITVQFSDMRTCRVCPLNQLKPLPAVAFNVNNLPFTEPMLSVWAQLVNLAGSKLEKHKIKKSTKQAFAGQVDLDLLRCQQLKLYILKAGRALLSHQDKLRQILSQPAVQETGTVHTDDGAVVSPDLGDMSPEGPQPPMILLQQLLASATQPSPVKAIFDKQELEAAALAVCQCLAVESTHPSSPGFEDCSSSEATTPVAVQHIRPARVKRRKQSPVPALPIVVQLMEMGFSRRNIEFALKSLTGASGNASSLPGVEALVGWLLDHSDIQVTELSDADTVSDEYSDEEVVEDVDDAAYSMSTGAVVTESQTYKKRADFLSNDDYAVYVRENIQVGMMVRCCRAYEEVCEGDVGKVIKLDRDGLHDLNVQCDWQQKGGTYWVRYIHVELIGYPPPSSSSHIKIGDKVRVKASVTTPKYKWGSVTHQSVGVVKAFSANGKDIIVDFPQQSHWTGLLSEMELVPSIHPGVTCDGCQMFPINGSRFKCRNCDDFDFCETCFKTKKHNTRHTFGRINEPGQSAVFCGRSGKQLKRCHSSQPGMLLDSWSRMVKSLNVSSSVNQASRLIDGSEPCWQSSGSQGKHWIRLEIFPDVLVHRLKMIVDPADSSYMPSLVVVSGGNSLNNLIELKTININPSDTTVPLLNDCTEYHRYIEIAIKQCRSSGIDCKIHGLILLGRIRAEEEDLAAVPFLASDNEEEEDEKGNSGSLIRKKAAGLESAATIRTKVFVWGLNDKDQLGGLKGSKIKVPSFSETLSALNVVQVAGGSKSLFAVTVEGKVYACGEATNGRLGLGISSGTVPIPRQITALSSYVVKKVAVHSGGRHATALTVDGKVFSWGEGDDGKLGHFSRMNCDKPRLIEALKTKRIRDIACGSSHSAALTSSGELYTWGLGEYGRLGHGDNTTQLKPKMVKVLLGHRVIQVACGSRDAQTLALTDEGLVFSWGDGDFGKLGRGGSEGCNIPQNIERLNGQGVCQIECGAQFSLALTKSGVVWTWGKGDYFRLGHGSDVHVRKPQVVEGLRGKKIVHVAVGALHCLAVTDSGQVYAWGDNDHGQQGNGTTTVNRKPTLVQGLEGQKITRVACGSSHSVAWTTVDVATPSVHEPVLFQTARDPLGASYLGVPSDADSSAASNKISGASNSKPNRPSLAKILLSLDGNLAKQQALSHILTALQIMYARDAVVGALMPAAMIAPVECPSFSSAAPSDASAMASPMNGEECMLAVDIEDRLSPNPWQEKREIVSSEDAVTPSAVTPSAPSASARPFIPVTDDLGAASIIAETMTKTKEDVESQNKAAGPEPQALDEFTSLLIADDTRVVVDLLKLSVCSRAGDRGRDVLSAVLSGMGTAYPQVADMLLELCVTELEDVATDSQSGRLSSQPVVVESSHPYTDDTSTSGTVKIPGAEGLRVEFDRQCSTERRHDPLTVMDGVNRIVSVRSGREWSDWSSELRIPGDELKWKFISDGSVNGWGWRFTVYPIMPAAGPKELLSDRCVLSCPSMDLVTCLLDFRLNLASNRSIVPRLAASLAACAQLSALAASHRMWALQRLRKLLTTEFGQSININRLLGENDGETRALSFTGSALAALVKGLPEALQRQFEYEDPIVRGGKQLLHSPFFKVLVALACDLELDTLPCCAETHKWAWFRRYCMASRVAVALDKRTPLPRLFLDEVAKKIRELMADSENMDVLHESHDIFKREQDEQLVQWMNRRPDDWTLSAGGSGTIYGWGHNHRGQLGGIEGAKVKVPTPCEALATLRPVQLIGGEQTLFA
VTADGKLYATGYGAGGRLGIGGTESVSTPTLLESIQHVFIKKVAVNSGGKHCLALSSEGEVYSWGEAEDGKLGHGNRSPCDRPRVIESLRGIEVVDVAAGGAHSACVTAAGDLYTWGKGRYGRLGHSDSEDQLKPKLVEALQGHRVVDIACGSGDAQTLCLTDDDTVWSWGDGDYGKLGRGGSDGCKVPMKIDSLTGLGVVKVECGSQFSVALTKSGAVYTWGKGDYHRLGHGSDDHVRRPRQVQGLQGKKVIAIATGSLHCVCCTEDGEVYTWGDNDEGQLGDGTTNAIQRPRLVAALQGKKVNRVACGSAHTLAWSTSKPASAGKLPAQVPMEYNHLQEIPIIALRNRLLLLHHLSELFCPCIPMFDLEGSLDETGLGPSVGFDTLRGILISQGKEAAFRKVVQATMVRDRQHGPVVELNRIQVKRSRSKGGLAGPDGTKSVFGQMCAKMSSFGPDSLLLPHRVWKVKFVGESVDDCGGGYSESIAEICEELQNGLTPLLIVTPNGRDESGANRDCYLLSPAARAPVHSSMFRFLGVLLGIAIRTGSPLSLNLAEPVWKQLAGMSLTIADLSEVDKDFIPGLMYIRDNEATSEEFEAMSLPFTVPSASGQDIQLSSKHTHITLDNRAEYVRLAINYRLHEFDEQVAAVREGMARVVPVPLLSLFTGYELETMVCGSPDIPLHLLKSVATYKGIEPSASLIQWFWEVMESFSNTERSLFLRFVWGRTRLPRTIADFRGRDFVIQVLDKYNPPDHFLPESYTCFFLLKLPRYSCKQVLEEKLKYAIHFCKSIDTDDYARIALTGEPAADDSSDDSDNEDVDSFASDSTQDYLTGH
+>sp|Q6ZN18|AEBP2_HUMAN Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2 PE=1 SV=2
+MAAAITDMADLEELSRLSPLPPGSPGSAARGRAEPPEEEEEEEEEEEEAEAEAVAALLLNGGSGGGGGGGGGGVGGGEAETMSEPSPESASQAGEDEDEEEDDEEEEDESSSSGGGEEESSAESLVGSSGGSSSDETRSLSPGAASSSSGDGDGKEGLEEPKGPRGSQGGGGGGSSSSSVVSSGGDEGYGTGGGGSSATSGGRRGSLEMSSDGEPLSRMDSEDSISSTIMDVDSTISSGRSTPAMMNGQGSTTSSSKNIAYNCCWDQCQACFNSSPDLADHIRSIHVDGQRGGVFVCLWKGCKVYNTPSTSQSWLQRHMLTHSGDKPFKCVVGGCNASFASQGGLARHVPTHFSQQNSSKVSSQPKAKEESPSKAGMNKRRKLKNKRRRSLPRPHDFFDAQTLDAIRHRAICFNLSAHIESLGKGHSVVFHSTVIAKRKEDSGKIKLLLHWMPEDILPDVWVNESERHQLKTKVVHLSKLPKDTALLLDPNIYRTMPQKRLKRTLIRKVFNLYLSKQ
+>sp|Q6ZN18-2|AEBP2_HUMAN Isoform 2 of Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2
+MAAAITDMADLEELSRLSPLPPGSPGSAARGRAEPPEEEEEEEEEEEEAEAEAVAALLLNGGSGGGGGGGGGGVGGGEAETMSEPSPESASQAGEDEDEEEDDEEEEDESSSSGGGEEESSAESLVGSSGGSSSDETRSLSPGAASSSSGDGDGKEGLEEPKGPRGSQGGGGGGSSSSSVVSSGGDEGYGTGGGGSSATSGGRRGSLEMSSDGEPLSRMDSEDSISSTIMDVDSTISSGRSTPAMMNGQGSTTSSSKNIAYNCCWDQCQACFNSSPDLADHIRSIHVDGQRGGVFVCLWKGCKVYNTPSTSQSWLQRHMLTHSGDKPFKCVVGGCNASFASQGGLARHVPTHFSQQNSSKVSSQPKAKEESPSKAGMNKRRKLKNKRRRSLPRPHDFFDAQTLDAIRHRAICFNLSAHIESLGKGHSVVFHSTVIAKRKEDSGKIKLLLHWMPEDILPDVWVNESERHQLKTKVVHLSKLPKDTALLLDPNIYRTMPQKRLKR
+>sp|Q6ZN18-3|AEBP2_HUMAN Isoform 3 of Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2
+MYTRRYSSISSTIMDVDSTISSGRSTPAMMNGQGSTTSSSKNIAYNCCWDQCQACFNSSPDLADHIRSIHVDGQRGGVFVCLWKGCKVYNTPSTSQSWLQRHMLTHSGDKPFKCVVGGCNASFASQGGLARHVPTHFSQQNSSKVSSQPKAKEESPSKAGMNKRRKLKNKRRRSLPRPHDFFDAQTLDAIRHRAICFNLSAHIESLGKGHSVVFHSTVIAKRKEDSGKIKLLLHWMPEDILPDVWVNESERHQLKTKVVHLSKLPKDTALLLDPNIYRTMPQKRLKRTLIRKVFNLYLSKQ
+>sp|O15083|ERC2_HUMAN ERC protein 2 OS=Homo sapiens OX=9606 GN=ERC2 PE=1 SV=3
+MYGSARTITNLEGSPSRSPRLPRSPRLGHRRTSSGGGGGTGKTLSMENIQSLNAAYATSGPMYLSDHEGVASTTYPKGTMTLGRATNRAVYGGRVTAMGSSPNIASAGLSHTDVLSYTDQHGGLTGSSHHHHHQVPSMLRQVRDSTMLDLQAQLKELQRENDLLRKELDIKDSKLGSSMNSIKTFWSPELKKERVLRKEEAARMSVLKEQMRVSHEENQHLQLTIQALQDELRTQRDLNHLLQQESGNRGAEHFTIELTEENFRRLQAEHDRQAKELFLLRKTLEEMELRIETQKQTLNARDESIKKLLEMLQSKGLPSKSLEDDNERTRRMAEAESQVSHLEVILDQKEKENIHLREELHRRSQLQPEPAKTKALQTVIEMKDTKIASLERNIRDLEDEIQMLKANGVLNTEDREEEIKQIEVYKSHSKFMKTKIDQLKQELSKKESELLALQTKLETLSNQNSDCKQHIEVLKESLTAKEQRAAILQTEVDALRLRLEEKESFLNKKTKQLQDLTEEKGTLAGEIRDMKDMLEVKERKINVLQKKIENLQEQLRDKDKQLTNLKDRVKSLQTDSSNTDTALATLEEALSEKERIIERLKEQRERDDRERLEEIESFRKENKDLKEKVNALQAELTEKESSLIDLKEHASSLASAGLKRDSKLKSLEIAIEQKKEECSKLEAQLKKAHNIEDDSRMNPEFADQIKQLDKEASYYRDECGKAQAEVDRLLEILKEVENEKNDKDKKIAELESLTLRHMKDQNKKVANLKHNQQLEKKKNAQLLEEVRRREDSMADNSQHLQIEELMNALEKTRQELDATKARLASTQQSLAEKEAHLANLRIERRKQLEEILEMKQEALLAAISEKDANIALLELSASKKKKTQEEVMALKREKDRLVHQLKQQTQNRMKLMADNYDDDHHHYHHHHHHHHHRSPGRSQHSNHRPSPDQDDEEGIWA
+>sp|P23763|VAMP1_HUMAN_Vesicle-associated membrane protein 1 OS=Homo sapiens OX=9606 GN=VAMP1 PE=1 SV=1
+MSAPAQPPAEGTEGTAPGGGPPGPPPNMTSNRRLQQTQAQVEEVVDIIRVNVDKVLERDQKLSELDDRADALQAGASQFESSAAKLKRKYWWKNCKMMIMLGAICAIIVVVIVIYFFT
+>sp|P23763-3|VAMP1_HUMAN_Isoform 2 of Vesicle-associated membrane protein 1 OS=Homo sapiens OX=9606 GN=VAMP1
+MSAPAQPPAEGTEGTAPGGGPPGPPPNMTSNRRLQQTQAQVEEVVDIIRVNVDKVLERDQKLSELDDRADALQAGASQFESSAAKLKRKYWWKNCKMMIMLGAICAIIVVVIVSKYR
+>sp|P23763-2|VAMP1_HUMAN_Isoform 3 of Vesicle-associated membrane protein 1 OS=Homo sapiens OX=9606 GN=VAMP1
+MSAPAQPPAEGTEGTAPGGGPPGPPPNMTSNRRLQQTQAQVEEVVDIIRVNVDKVLERDQKLSELDDRADALQAGASQFESSAAKLKRKYWWKNCKMMIMLGAICAIIVVVIVRRD
+>sp|Q15836|VAMP3_HUMAN_Vesicle-associated membrane protein 3 OS=Homo sapiens OX=9606 GN=VAMP3 PE=1 SV=3
+MSTGPTAATGSNRRLQQTQNQVDEVVDIMRVNVDKVLERDQKLSELDDRADALQAGASQFETSAAKLKRKYWWKNCKMWAIGITVLVIFIIIIIVWVVSS
+>sp|P63027|VAMP2_HUMAN_Vesicle-associated membrane protein 2 OS=Homo sapiens OX=9606 GN=VAMP2 PE=1 SV=3
+MSATAATAPPAAPAGEGGPPAPPPNLTSNRRLQQTQAQVDEVVDIMRVNVDKVLERDQKLSELDDRADALQAGASQFETSAAKLKRKYWWKNLKMMIILGVICAIILIIIIVYFST
+>sp|O75379|VAMP4_HUMAN_Vesicle-associated membrane protein 4 OS=Homo sapiens OX=9606 GN=VAMP4 PE=1 SV=2
+MPPKFKRHLNDDDVTGSVKSERRNLLEDDSDEEEDFFLRGPSGPRFGPRNDKIKHVQNQVDEVIDVMQENITKVIERGERLDELQDKSESLSDNATAFSNRSKQLRRQMWWRGCKIKAIMALVAAILLLVIIILIVMKYRT
+>sp|O75379-2|VAMP4_HUMAN_Isoform 2 of Vesicle-associated membrane protein 4 OS=Homo sapiens OX=9606 GN=VAMP4
+MPPKFKRHLNDDDVTGSVKSERRNLLEDDSDEEEDFFLGPSGPRFGPRNDKIKHVQNQVDEVIDVMQENITKVIERGERLDELQDKSESLSDNATAFSNRSKQLRRQMWWRGCKIKAIMALVAAILLLVIIILIVMKYRT
+>sp|O95183|VAMP5_HUMAN_Vesicle-associated membrane protein 5 OS=Homo sapiens OX=9606 GN=VAMP5 PE=1 SV=1
+MAGIELERCQQQANEVTEIMRNNFGKVLERGVKLAELQQRSDQLLDMSSTFNKTTQNLAQKKCWENIRYRICVGLVVVGVLLIILIVLLVVFLPQSSDSSSAPRTQDAGIASGPGN
+>sp|P51809|VAMP7_HUMAN_Vesicle-associated membrane protein 7 OS=Homo sapiens OX=9606 GN=VAMP7 PE=1 SV=3
+MAILFAVVARGTTILAKHAWCGGNFLEVTEQILAKIPSENNKLTYSHGNYLFHYICQDRIVYLCITDDDFERSRAFNFLNEIKKRFQTTYGSRAQTALPYAMNSEFSSVLAAQLKHHSENKGLDKVMETQAQVDELKGIMVRNIDLVAQRGERLELLIDKTENLVDSSVTFKTTSRNLARAMCMKNLKLTIIIIIVSIVFIYIIVSPLCGGFTWPSCVKK
+>sp|P51809-2|VAMP7_HUMAN_Isoform 2 of Vesicle-associated membrane protein 7 OS=Homo sapiens OX=9606 GN=VAMP7
+MAILFAVVARGTTILAKHAWCGGNFLEVTEQILAKIPSENNKLTYSHGNYLFHYICQDRIVYLCITDDDFERSRAFNFLNEIKKRFQTTYGSRAQTALPYAMNSEFSSVLAAQLKHHSENKGLDKVMETQAQVDELKGIMVRNIVCHLQNYQQKSCSSHVYEEPQAHYYHHHRINCVHLYHCFTSLWWIYMAKLCEEIGKKKLPLTKDMREQGVKSNPCDSSLSHTDRWYLPVSSTLFSLFKILFHASRFIFVLSTSLFL
+>sp|P51809-3|VAMP7_HUMAN_Isoform 3 of Vesicle-associated membrane protein 7 OS=Homo sapiens OX=9606 GN=VAMP7
+MAILFAVVARGTTILAKHAWCGGNFLEDFERSRAFNFLNEIKKRFQTTYGSRAQTALPYAMNSEFSSVLAAQLKHHSENKGLDKVMETQAQVDELKGIMVRNIDLVAQRGERLELLIDKTENLVDSSVTFKTTSRNLARAMCMKNLKLTIIIIIVSIVFIYIIVSPLCGGFTWPSCVKK
+>sp|Q9BV40|VAMP8_HUMAN_Vesicle-associated membrane protein 8 OS=Homo sapiens OX=9606 GN=VAMP8 PE=1 SV=1
+MEEASEGGGNDRVRNLQSEVEGVKNIMTQNVERILARGENLEHLRNKTEDLEATSEHFKTTSQKVARKFWWKNVKMIVLICVIVFIIILFIVLFATGAFS
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/workflow/ppenrich_suite_wf.ga	Mon Mar 07 19:05:01 2022 +0000
@@ -0,0 +1,653 @@
+{
+    "a_galaxy_workflow": "true",
+    "annotation": "phosphoproteomic enrichment data pre-processing and ANOVA",
+    "creator": [
+        {
+            "class": "Person",
+            "identifier": "0000-0002-2882-0508",
+            "name": "Art Eschenlauer"
+        }
+    ],
+    "format-version": "0.1",
+    "license": "MIT",
+    "name": "ppenrich_suite_wf",
+    "steps": {
+        "0": {
+            "annotation": "The Phospho (STY)Sites.txt file produced by MaxQuant (found in the txt folder).",
+            "content_id": null,
+            "errors": null,
+            "id": 0,
+            "input_connections": {},
+            "inputs": [
+                {
+                    "description": "The Phospho (STY)Sites.txt file produced by MaxQuant (found in the txt folder).",
+                    "name": "Phospho (STY)Sites.txt"
+                }
+            ],
+            "label": "Phospho (STY)Sites.txt",
+            "name": "Input dataset",
+            "outputs": [],
+            "position": {
+                "bottom": 257.06666564941406,
+                "height": 81.39999389648438,
+                "left": 339.95001220703125,
+                "right": 539.9500122070312,
+                "top": 175.6666717529297,
+                "width": 200,
+                "x": 339.95001220703125,
+                "y": 175.6666717529297
+            },
+            "tool_id": null,
+            "tool_state": "{\"optional\": false, \"format\": [\"tabular\"], \"tag\": null}",
+            "tool_version": null,
+            "type": "data_input",
+            "uuid": "002d55e6-29a5-426d-9248-70ec33424b15",
+            "workflow_outputs": []
+        },
+        "1": {
+            "annotation": "FASTA file of all human canonical isoforms, derived from Swiss-Prot (e.g., merge of https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot_varsplic.fasta.gz and https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz)",
+            "content_id": null,
+            "errors": null,
+            "id": 1,
+            "input_connections": {},
+            "inputs": [
+                {
+                    "description": "FASTA file of all human canonical isoforms, derived from Swiss-Prot (e.g., merge of https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot_varsplic.fasta.gz and https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz)",
+                    "name": "SwissProt_Human_Canonical_Isoform.fasta"
+                }
+            ],
+            "label": "SwissProt_Human_Canonical_Isoform.fasta",
+            "name": "Input dataset",
+            "outputs": [],
+            "position": {
+                "bottom": 411.4666748046875,
+                "height": 101.79998779296875,
+                "left": 379.95001220703125,
+                "right": 579.9500122070312,
+                "top": 309.66668701171875,
+                "width": 200,
+                "x": 379.95001220703125,
+                "y": 309.66668701171875
+            },
+            "tool_id": null,
+            "tool_state": "{\"optional\": false, \"format\": [\"fasta\"], \"tag\": null}",
+            "tool_version": null,
+            "type": "data_input",
+            "uuid": "8f079dcc-1843-47cd-b4dc-1830e4466430",
+            "workflow_outputs": []
+        },
+        "2": {
+            "annotation": "Derived from https://networkin.info/download/networkin_human_predictions_3.1.tsv.xz (which is free for non-commercial use - for required citation, see https://networkin.info/)",
+            "content_id": null,
+            "errors": null,
+            "id": 2,
+            "input_connections": {},
+            "inputs": [
+                {
+                    "description": "Derived from https://networkin.info/download/networkin_human_predictions_3.1.tsv.xz (which is free for non-commercial use - for required citation, see https://networkin.info/)",
+                    "name": "NetworKIN_cutoffscore2.0.tabular"
+                }
+            ],
+            "label": "NetworKIN_cutoffscore2.0.tabular",
+            "name": "Input dataset",
+            "outputs": [],
+            "position": {
+                "bottom": 573.4666748046875,
+                "height": 101.79998779296875,
+                "left": 418.95001220703125,
+                "right": 618.9500122070312,
+                "top": 471.66668701171875,
+                "width": 200,
+                "x": 418.95001220703125,
+                "y": 471.66668701171875
+            },
+            "tool_id": null,
+            "tool_state": "{\"optional\": false, \"format\": [\"tabular\"], \"tag\": null}",
+            "tool_version": null,
+            "type": "data_input",
+            "uuid": "dc894a94-97a3-40ff-811e-01b30d498478",
+            "workflow_outputs": []
+        },
+        "3": {
+            "annotation": "Derived from http://hprd.org/serine_motifs, http://hprd.org/tyrosine_motifs, and http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx",
+            "content_id": null,
+            "errors": null,
+            "id": 3,
+            "input_connections": {},
+            "inputs": [
+                {
+                    "description": "Derived from http://hprd.org/serine_motifs, http://hprd.org/tyrosine_motifs, and http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx",
+                    "name": "pSTY_Motifs.tabular"
+                }
+            ],
+            "label": "pSTY_Motifs.tabular",
+            "name": "Input dataset",
+            "outputs": [],
+            "position": {
+                "bottom": 726.0666809082031,
+                "height": 81.39999389648438,
+                "left": 459.95001220703125,
+                "right": 659.9500122070312,
+                "top": 644.6666870117188,
+                "width": 200,
+                "x": 459.95001220703125,
+                "y": 644.6666870117188
+            },
+            "tool_id": null,
+            "tool_state": "{\"optional\": false, \"format\": [\"tabular\"], \"tag\": null}",
+            "tool_version": null,
+            "type": "data_input",
+            "uuid": "6fc936ad-0b52-484f-a051-73c1776fdeb0",
+            "workflow_outputs": []
+        },
+        "4": {
+            "annotation": "Derived from Kinase_Substrate_Dataset.gz found at https://www.phosphosite.org/staticDownloads (free for non-commercial use - see that link for citation.)",
+            "content_id": null,
+            "errors": null,
+            "id": 4,
+            "input_connections": {},
+            "inputs": [
+                {
+                    "description": "Derived from Kinase_Substrate_Dataset.gz found at https://www.phosphosite.org/staticDownloads (free for non-commercial use - see that link for citation.)",
+                    "name": "PSP_Kinase_Substrate_Dataset.tabular"
+                }
+            ],
+            "label": "PSP_Kinase_Substrate_Dataset.tabular",
+            "name": "Input dataset",
+            "outputs": [],
+            "position": {
+                "bottom": 894.4666748046875,
+                "height": 101.79998779296875,
+                "left": 503.95001220703125,
+                "right": 703.9500122070312,
+                "top": 792.6666870117188,
+                "width": 200,
+                "x": 503.95001220703125,
+                "y": 792.6666870117188
+            },
+            "tool_id": null,
+            "tool_state": "{\"optional\": false, \"format\": [\"tabular\"], \"tag\": null}",
+            "tool_version": null,
+            "type": "data_input",
+            "uuid": "22b77482-2339-4b45-8fc6-d39f7175131b",
+            "workflow_outputs": []
+        },
+        "5": {
+            "annotation": "Derived from Regulatory_sites.gz found at https://www.phosphosite.org/staticDownloads (free for non-commercial use - see that link for citation.)",
+            "content_id": null,
+            "errors": null,
+            "id": 5,
+            "input_connections": {},
+            "inputs": [
+                {
+                    "description": "Derived from Regulatory_sites.gz found at https://www.phosphosite.org/staticDownloads (free for non-commercial use - see that link for citation.)",
+                    "name": "PSP_Regulatory_sites.tabular"
+                }
+            ],
+            "label": "PSP_Regulatory_sites.tabular",
+            "name": "Input dataset",
+            "outputs": [],
+            "position": {
+                "bottom": 1041.0666809082031,
+                "height": 81.39999389648438,
+                "left": 535.9500122070312,
+                "right": 735.9500122070312,
+                "top": 959.6666870117188,
+                "width": 200,
+                "x": 535.9500122070312,
+                "y": 959.6666870117188
+            },
+            "tool_id": null,
+            "tool_state": "{\"optional\": false, \"format\": [\"tabular\"], \"tag\": null}",
+            "tool_version": null,
+            "type": "data_input",
+            "uuid": "3d97a902-1408-403c-b82e-ddb6ca6a7d47",
+            "workflow_outputs": []
+        },
+        "6": {
+            "annotation": "List of alpha cutoff values for significance testing; text file having no header and a single line for each cutoff value.",
+            "content_id": null,
+            "errors": null,
+            "id": 6,
+            "input_connections": {},
+            "inputs": [
+                {
+                    "description": "List of alpha cutoff values for significance testing; text file having no header and a single line for each cutoff value.",
+                    "name": "alpha_levels.tabular"
+                }
+            ],
+            "label": "alpha_levels.tabular",
+            "name": "Input dataset",
+            "outputs": [],
+            "position": {
+                "bottom": 1210.5666198730469,
+                "height": 81.39999389648438,
+                "left": 562.9500122070312,
+                "right": 762.9500122070312,
+                "top": 1129.1666259765625,
+                "width": 200,
+                "x": 562.9500122070312,
+                "y": 1129.1666259765625
+            },
+            "tool_id": null,
+            "tool_state": "{\"optional\": false, \"format\": [\"tabular\"], \"tag\": null}",
+            "tool_version": null,
+            "type": "data_input",
+            "uuid": "7b5eab97-7dad-4b0e-81eb-22aac39dd5b6",
+            "workflow_outputs": []
+        },
+        "7": {
+            "annotation": "",
+            "content_id": "mqppep_preproc",
+            "errors": null,
+            "id": 7,
+            "input_connections": {
+                "networkin": {
+                    "id": 2,
+                    "output_name": "output"
+                },
+                "p_sty_motifs": {
+                    "id": 3,
+                    "output_name": "output"
+                },
+                "phosphoSites": {
+                    "id": 0,
+                    "output_name": "output"
+                },
+                "protein_fasta": {
+                    "id": 1,
+                    "output_name": "output"
+                },
+                "psp_kinase_substrate": {
+                    "id": 4,
+                    "output_name": "output"
+                },
+                "psp_regulatory_sites": {
+                    "id": 5,
+                    "output_name": "output"
+                }
+            },
+            "inputs": [],
+            "label": null,
+            "name": "MaxQuant Phosphopeptide Preprocessing",
+            "outputs": [
+                {
+                    "name": "phosphoPepIntensities",
+                    "type": "tabular"
+                },
+                {
+                    "name": "enrichGraph",
+                    "type": "pdf"
+                },
+                {
+                    "name": "locProbCutoffGraph",
+                    "type": "pdf"
+                },
+                {
+                    "name": "enrichGraph_svg",
+                    "type": "svg"
+                },
+                {
+                    "name": "locProbCutoffGraph_svg",
+                    "type": "svg"
+                },
+                {
+                    "name": "filteredData_tabular",
+                    "type": "tabular"
+                },
+                {
+                    "name": "quantData_tabular",
+                    "type": "tabular"
+                },
+                {
+                    "name": "mapped_phophopeptides",
+                    "type": "tabular"
+                },
+                {
+                    "name": "melted_phophopeptide_map",
+                    "type": "tabular"
+                },
+                {
+                    "name": "mqppep_output_sqlite",
+                    "type": "sqlite"
+                },
+                {
+                    "name": "preproc_tab",
+                    "type": "tabular"
+                },
+                {
+                    "name": "preproc_csv",
+                    "type": "csv"
+                },
+                {
+                    "name": "preproc_sqlite",
+                    "type": "sqlite"
+                }
+            ],
+            "position": {
+                "bottom": 1186.6000366210938,
+                "height": 812.933349609375,
+                "left": 945.4500122070312,
+                "right": 1145.4500122070312,
+                "top": 373.66668701171875,
+                "width": 200,
+                "x": 945.4500122070312,
+                "y": 373.66668701171875
+            },
+            "post_job_actions": {
+                "RenameDatasetActionenrichGraph": {
+                    "action_arguments": {
+                        "newname": "#{phosphoSites}.enrichGraph_pdf"
+                    },
+                    "action_type": "RenameDatasetAction",
+                    "output_name": "enrichGraph"
+                },
+                "RenameDatasetActionenrichGraph_svg": {
+                    "action_arguments": {
+                        "newname": "#{phosphoSites}.enrichGraph_svg"
+                    },
+                    "action_type": "RenameDatasetAction",
+                    "output_name": "enrichGraph_svg"
+                },
+                "RenameDatasetActionfilteredData_tabular": {
+                    "action_arguments": {
+                        "newname": "#{phosphoSites}.filteredData"
+                    },
+                    "action_type": "RenameDatasetAction",
+                    "output_name": "filteredData_tabular"
+                },
+                "RenameDatasetActionlocProbCutoffGraph": {
+                    "action_arguments": {
+                        "newname": "#{phosphoSites}.locProbCutoffGraph_pdf"
+                    },
+                    "action_type": "RenameDatasetAction",
+                    "output_name": "locProbCutoffGraph"
+                },
+                "RenameDatasetActionlocProbCutoffGraph_svg": {
+                    "action_arguments": {
+                        "newname": "#{phosphoSites}.locProbCutoffGraph_svg"
+                    },
+                    "action_type": "RenameDatasetAction",
+                    "output_name": "locProbCutoffGraph_svg"
+                },
+                "RenameDatasetActionmapped_phophopeptides": {
+                    "action_arguments": {
+                        "newname": "#{phosphoSites}.ppep_map"
+                    },
+                    "action_type": "RenameDatasetAction",
+                    "output_name": "mapped_phophopeptides"
+                },
+                "RenameDatasetActionmelted_phophopeptide_map": {
+                    "action_arguments": {
+                        "newname": "#{phosphoSites}.melted"
+                    },
+                    "action_type": "RenameDatasetAction",
+                    "output_name": "melted_phophopeptide_map"
+                },
+                "RenameDatasetActionmqppep_output_sqlite": {
+                    "action_arguments": {
+                        "newname": "#{phosphoSites}.ppep_mapping_sqlite"
+                    },
+                    "action_type": "RenameDatasetAction",
+                    "output_name": "mqppep_output_sqlite"
+                },
+                "RenameDatasetActionphosphoPepIntensities": {
+                    "action_arguments": {
+                        "newname": "#{phosphoSites}.ppep_intensities"
+                    },
+                    "action_type": "RenameDatasetAction",
+                    "output_name": "phosphoPepIntensities"
+                },
+                "RenameDatasetActionpreproc_csv": {
+                    "action_arguments": {
+                        "newname": "#{phosphoSites}.preproc_csv"
+                    },
+                    "action_type": "RenameDatasetAction",
+                    "output_name": "preproc_csv"
+                },
+                "RenameDatasetActionpreproc_sqlite": {
+                    "action_arguments": {
+                        "newname": "#{phosphoSites}.preproc_sqlite"
+                    },
+                    "action_type": "RenameDatasetAction",
+                    "output_name": "preproc_sqlite"
+                },
+                "RenameDatasetActionpreproc_tab": {
+                    "action_arguments": {
+                        "newname": "#{phosphoSites}.preproc_tab"
+                    },
+                    "action_type": "RenameDatasetAction",
+                    "output_name": "preproc_tab"
+                },
+                "RenameDatasetActionquantData_tabular": {
+                    "action_arguments": {
+                        "newname": "#{phosphoSites}.quantData"
+                    },
+                    "action_type": "RenameDatasetAction",
+                    "output_name": "quantData_tabular"
+                }
+            },
+            "tool_id": "mqppep_preproc",
+            "tool_state": "{\"collapseFunc\": \"sum\", \"enriched\": \"ST\", \"intervalCol\": \"1\", \"localProbCutoff\": \"0.75\", \"merge_function\": \"sum\", \"networkin\": {\"__class__\": \"ConnectedValue\"}, \"p_sty_motifs\": {\"__class__\": \"ConnectedValue\"}, \"phosphoCol\": \"^Number of Phospho [(]STY[)]$\", \"phosphoSites\": {\"__class__\": \"ConnectedValue\"}, \"phospho_type\": \"sty\", \"protein_fasta\": {\"__class__\": \"ConnectedValue\"}, \"psp_kinase_substrate\": {\"__class__\": \"ConnectedValue\"}, \"psp_regulatory_sites\": {\"__class__\": \"ConnectedValue\"}, \"species\": \"human\", \"startCol\": \"^Intensity[^_]\", \"__page__\": null, \"__rerun_remap_job_id__\": null}",
+            "tool_version": "0.1.0+galaxy0",
+            "type": "tool",
+            "uuid": "235b1a2e-ccc0-4c91-bb91-bbf4d272c870",
+            "workflow_outputs": [
+                {
+                    "label": "ppep_intensities",
+                    "output_name": "phosphoPepIntensities",
+                    "uuid": "92fd4e27-5d4b-4e9f-b3ad-6bdad53bb93d"
+                },
+                {
+                    "label": "enrichGraph_pdf",
+                    "output_name": "enrichGraph",
+                    "uuid": "4c1d5590-f8ba-421c-858c-4c026691b52e"
+                },
+                {
+                    "label": "locProbCutoffGraph_pdf",
+                    "output_name": "locProbCutoffGraph",
+                    "uuid": "66a79534-6372-4937-bcf2-8644be985eea"
+                },
+                {
+                    "label": "enrichGraph_svg",
+                    "output_name": "enrichGraph_svg",
+                    "uuid": "5e713d9c-1868-423b-be9a-25c0486e1472"
+                },
+                {
+                    "label": "locProbCutoffGraph_svg",
+                    "output_name": "locProbCutoffGraph_svg",
+                    "uuid": "4621ea21-ae90-4547-a68f-30dfc7857368"
+                },
+                {
+                    "label": "filteredData",
+                    "output_name": "filteredData_tabular",
+                    "uuid": "bb26d0fb-6f19-43c7-80ef-1cf81aa09ee8"
+                },
+                {
+                    "label": "quantData",
+                    "output_name": "quantData_tabular",
+                    "uuid": "20efe04f-2700-4af0-92c6-0830a42d8e75"
+                },
+                {
+                    "label": "ppep_map",
+                    "output_name": "mapped_phophopeptides",
+                    "uuid": "037e2b97-8fc8-436d-bcc3-af5ee685b752"
+                },
+                {
+                    "label": "melted_phosphopeptide_map",
+                    "output_name": "melted_phophopeptide_map",
+                    "uuid": "c3e5de84-2659-45eb-81a6-edef6037d8aa"
+                },
+                {
+                    "label": "ppep_mapping_sqlite",
+                    "output_name": "mqppep_output_sqlite",
+                    "uuid": "a1a4f827-1f1f-4175-ae51-c238f9e1f248"
+                },
+                {
+                    "label": "preproc_tab",
+                    "output_name": "preproc_tab",
+                    "uuid": "b22b4b56-9395-4f6d-945e-0089e8897069"
+                },
+                {
+                    "label": "preproc_csv",
+                    "output_name": "preproc_csv",
+                    "uuid": "54be90f9-1158-4686-af42-43d021088300"
+                },
+                {
+                    "label": "preproc_sqlite",
+                    "output_name": "preproc_sqlite",
+                    "uuid": "33663f9c-b718-4bdd-acc9-087c76bea678"
+                }
+            ]
+        },
+        "8": {
+            "annotation": "Perform ANOVA. For imputing missing values, use median of non-missing values from the same treatment group.",
+            "content_id": "mqppep_anova",
+            "errors": null,
+            "id": 8,
+            "input_connections": {
+                "alpha_file": {
+                    "id": 6,
+                    "output_name": "output"
+                },
+                "input_file": {
+                    "id": 7,
+                    "output_name": "preproc_tab"
+                }
+            },
+            "inputs": [],
+            "label": "MaxQuant Phosphopeptide ANOVA group-median imputed",
+            "name": "MaxQuant Phosphopeptide ANOVA",
+            "outputs": [
+                {
+                    "name": "imputed_data_file",
+                    "type": "tabular"
+                },
+                {
+                    "name": "report_file",
+                    "type": "html"
+                }
+            ],
+            "position": {
+                "bottom": 1488.0999603271484,
+                "height": 254.93333435058594,
+                "left": 1202.949951171875,
+                "right": 1402.949951171875,
+                "top": 1233.1666259765625,
+                "width": 200,
+                "x": 1202.949951171875,
+                "y": 1233.1666259765625
+            },
+            "post_job_actions": {
+                "RenameDatasetActionimputed_data_file": {
+                    "action_arguments": {
+                        "newname": "#{input_file}.intensities_group-median-imputed_QN_LT"
+                    },
+                    "action_type": "RenameDatasetAction",
+                    "output_name": "imputed_data_file"
+                },
+                "RenameDatasetActionreport_file": {
+                    "action_arguments": {
+                        "newname": "#{input_file}.intensities_group-median-imputed_report (download/unzip to view)"
+                    },
+                    "action_type": "RenameDatasetAction",
+                    "output_name": "report_file"
+                }
+            },
+            "tool_id": "mqppep_anova",
+            "tool_state": "{\"alpha_file\": {\"__class__\": \"ConnectedValue\"}, \"first_data_column\": \"Intensity\", \"imputation\": {\"imputation_method\": \"group-median\", \"__current_case__\": 0}, \"input_file\": {\"__class__\": \"ConnectedValue\"}, \"sample_grouping_regex\": \"(\\\\d+)\", \"sample_names_regex\": \"\\\\.(\\\\d+)[A-Z]$\", \"__page__\": null, \"__rerun_remap_job_id__\": null}",
+            "tool_version": "0.1.0+galaxy0",
+            "type": "tool",
+            "uuid": "2257286b-6f9a-45c1-90a3-bf5b972959d5",
+            "workflow_outputs": [
+                {
+                    "label": "intensities_group-mean-imputed_QN_LT",
+                    "output_name": "imputed_data_file",
+                    "uuid": "8e7317c6-95e9-4454-b4d7-31b4de6167a8"
+                },
+                {
+                    "label": "intensities_group-mean-imputed_report",
+                    "output_name": "report_file",
+                    "uuid": "dfe9b34e-1f3e-4971-8382-41178104e253"
+                }
+            ]
+        },
+        "9": {
+            "annotation": "Perform ANOVA. For imputing missing values, create random values.",
+            "content_id": "mqppep_anova",
+            "errors": null,
+            "id": 9,
+            "input_connections": {
+                "alpha_file": {
+                    "id": 6,
+                    "output_name": "output"
+                },
+                "input_file": {
+                    "id": 7,
+                    "output_name": "preproc_tab"
+                }
+            },
+            "inputs": [],
+            "label": "MaxQuant Phosphopeptide ANOVA randomly imputed",
+            "name": "MaxQuant Phosphopeptide ANOVA",
+            "outputs": [
+                {
+                    "name": "imputed_data_file",
+                    "type": "tabular"
+                },
+                {
+                    "name": "report_file",
+                    "type": "html"
+                }
+            ],
+            "position": {
+                "bottom": 1325.0999603271484,
+                "height": 254.93333435058594,
+                "left": 1452.949951171875,
+                "right": 1652.949951171875,
+                "top": 1070.1666259765625,
+                "width": 200,
+                "x": 1452.949951171875,
+                "y": 1070.1666259765625
+            },
+            "post_job_actions": {
+                "RenameDatasetActionimputed_data_file": {
+                    "action_arguments": {
+                        "newname": "#{input_file}.intensities_randomly-imputed_QN_LT"
+                    },
+                    "action_type": "RenameDatasetAction",
+                    "output_name": "imputed_data_file"
+                },
+                "RenameDatasetActionreport_file": {
+                    "action_arguments": {
+                        "newname": "#{input_file}.intensities_randomly-imputed_report (download/unzip to view)"
+                    },
+                    "action_type": "RenameDatasetAction",
+                    "output_name": "report_file"
+                }
+            },
+            "tool_id": "mqppep_anova",
+            "tool_state": "{\"alpha_file\": {\"__class__\": \"ConnectedValue\"}, \"first_data_column\": \"Intensity\", \"imputation\": {\"imputation_method\": \"random\", \"__current_case__\": 3, \"meanPercentile\": \"1\", \"sdPercentile\": \"0.2\"}, \"input_file\": {\"__class__\": \"ConnectedValue\"}, \"sample_grouping_regex\": \"(\\\\d+)\", \"sample_names_regex\": \"\\\\.(\\\\d+)[A-Z]$\", \"__page__\": null, \"__rerun_remap_job_id__\": null}",
+            "tool_version": "0.1.0+galaxy0",
+            "type": "tool",
+            "uuid": "9516971c-8532-4797-8bf9-4655ff104dbd",
+            "workflow_outputs": [
+                {
+                    "label": "intensities_randomly-imputed_QN_LT",
+                    "output_name": "imputed_data_file",
+                    "uuid": "8ceda029-d5fd-4d75-a2b3-ac582bb137c3"
+                },
+                {
+                    "label": "intensities_randomly-imputed_report",
+                    "output_name": "report_file",
+                    "uuid": "84bedf25-c15b-4cc7-97e0-92f746e89f9c"
+                }
+            ]
+        }
+    },
+    "tags": [
+        "ppenrich"
+    ],
+    "uuid": "ac7bf2d1-89fe-4bf6-920a-d5508842d3f9",
+    "version": 7
+}
\ No newline at end of file