Mercurial > repos > eschen42 > mqppep_anova
changeset 12:4deacfee76ef draft
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit e87d28ea433cc26db7fe44768685d08c06f7a0d0"
author | eschen42 |
---|---|
date | Tue, 15 Mar 2022 18:17:55 +0000 |
parents | 254ab97c6a2c |
children | b41a077af3aa |
files | MaxQuantProcessingScript.R PhosphoPeptide_Upstream_Kinase_Mapping.pl macros.xml mqppep_anova.xml mqppep_anova_script.Rmd mqppep_mrgfltr.py repository_dependencies.xml search_ppep.py test-data/pSTY_motifs.tabular test-data/test_input_for_preproc.tabular test-data/test_kinase_substrate.tabular test-data/test_networkin.tabular test-data/test_regulatory_sites.tabular test-data/test_swissprot.fasta workflow/ppenrich_suite_wf.ga |
diffstat | 15 files changed, 108 insertions(+), 5956 deletions(-) [+] |
line wrap: on
line diff
--- a/MaxQuantProcessingScript.R Tue Mar 15 12:44:40 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,705 +0,0 @@ -#!/usr/bin/env Rscript - -# This is the implementation for the -# "MaxQuant Phosphopeptide Localization Probability Cutoff" -# Galaxy tool (mqppep_lclztn_filter) -# It is adapted from the MaxQuant Processing Script written by Larry Cheng. - -# libraries -library(optparse) -library(data.table) -library(stringr) -library(ggplot2) - -# title: "MaxQuant Processing Script" -# author: "Larry Cheng" -# date: "February 19, 2018" -# -# # MaxQuant Processing Script -# Takes MaxQuant Phospho (STY)sites.txt file as input -# and performs the following (in order): -# 1) Runs the Proteomics Quality Control software -# 2) Remove contaminant and reverse sequence rows -# 3) Filters rows based on localization probability -# 4) Extract the quantitative data -# 5) Sequences phosphopeptides -# 6) Merges multiply phosphorylated peptides -# 7) Filters out phosphopeptides based on enrichment -# The output file contains the phosphopeptide (first column) -# and the quantitative values for each sample. -# -# ## Revision History -# Rev. 2022-02-10 :wrap for inclusion in Galaxy -# Rev. 2018-02-19 :break up analysis script into "MaxQuant Processing Script" -# and "Phosphopeptide Processing Script" -# Rev. 2017-12-12 :added PTXQC -# added additional plots and table outputs for quality control -# allowed for more than 2 samples to be grouped together -# (up to 26 (eg, 1A, 1B, 1C, etc)) -# converted from .r to .rmd file to knit report -# for quality control -# Rev. 2016-09-11 :automated the FDR cutoffs; removed the option to data -# impute multiple times -# Rev. 2016-09-09 :added filter to eliminate contaminant & reverse sequence rows -# Rev. 2016-09-01 :moved the collapse step from after ANOVA filter to prior to -# preANOVA file output -# Rev. 2016-08-22 :use regexSampleNames <- "\\.(\\d + )[AB]$" -# so that it looks at the end of string -# Rev. 
2016-08-05 :Removed vestigial line (ppeptides <- ....) -# Rev. 2016-07-03 :Removed row names from the write.table() output for -# ANOVA and PreANOVA -# Rev. 2016-06-25 :Set default Localization Probability cutoff to 0.75 -# Rev. 2016-06-23 :fixed a bug in filtering for pY enrichment by resetting -# the row numbers afterwards -# Rev. 2016-06-21 :test18 + standardized the regexpression in protocol - - -### FUNCTION DECLARATIONS begin ---------------------------------------------- - -# Read first line of file at filePath -# adapted from: https://stackoverflow.com/a/35761217/15509512 -read_first_line <- function(filepath) { - con <- file(filepath, "r") - line <- readLines(con, n = 1) - close(con) - return(line) -} - -# Move columns to the end of dataframe -# - data: the dataframe -# - move: a vector of column names, each of which is an element of names(data) -movetolast <- function(data, move) { - data[c(setdiff(names(data), move), move)] -} - -# Generate phosphopeptide and build list when applied -phosphopeptide_func <- function(df) { - # generate peptide sequence and list of phosphopositions - phosphoprobsequence <- - strsplit(as.character(df["Phospho (STY) Score diffs"]), "")[[1]] - output <- vector() - phosphopeptide <- "" - counter <- 0 # keep track of position in peptide - phosphopositions <- - vector() # keep track of phosphorylation positions in peptide - score_diff <- "" - for (chara in phosphoprobsequence) { - # build peptide sequence - if (!( - chara == " " | - chara == "(" | - chara == ")" | - chara == "." | - chara == "-" | - chara == "0" | - chara == "1" | - chara == "2" | - chara == "3" | - chara == "4" | - chara == "5" | - chara == "6" | - chara == "7" | - chara == "8" | - chara == "9") - ) { - phosphopeptide <- paste(phosphopeptide, chara, sep = "") - counter <- counter + 1 - } - # generate score_diff - if (chara == "-" | - chara == "." 
| - chara == "0" | - chara == "1" | - chara == "2" | - chara == "3" | - chara == "4" | - chara == "5" | - chara == "6" | - chara == "7" | - chara == "8" | - chara == "9" - ) { - score_diff <- paste(score_diff, chara, sep = "") - } - # evaluate score_diff - if (chara == ")") { - score_diff <- as.numeric(score_diff) - # only consider a phosphoresidue if score_diff > 0 - if (score_diff > 0) { - phosphopositions <- append(phosphopositions, counter) - } - score_diff <- "" - } - } - - # generate phosphopeptide sequence (ie, peptide sequence with "p"'s) - counter <- 1 - phosphoposition_correction1 <- - -1 # used to correct phosphosposition as "p"'s - # are inserted into the phosphopeptide string - phosphoposition_correction2 <- - 0 # used to correct phosphosposition as "p"'s - # are inserted into the phosphopeptide string - while (counter <= length(phosphopositions)) { - phosphopeptide <- - paste( - substr( - phosphopeptide, - 0, - phosphopositions[counter] + phosphoposition_correction1 - ), - "p", - substr( - phosphopeptide, - phosphopositions[counter] + phosphoposition_correction2, - nchar(phosphopeptide) - ), - sep = "" - ) - counter <- counter + 1 - phosphoposition_correction1 <- phosphoposition_correction1 + 1 - phosphoposition_correction2 <- phosphoposition_correction2 + 1 - } - # building phosphopeptide list - output <- append(output, phosphopeptide) - return(output) -} - -### FUNCTION DECLARATIONS end ------------------------------------------------ - - -### EXTRACT ARGUMENTS begin -------------------------------------------------- - -# parse options -option_list <- list( - make_option( - c("-i", "--input"), - action = "store", - type = "character", - help = "A MaxQuant Phospho (STY)Sites.txt" - ) - , - make_option( - c("-o", "--output"), - action = "store", - type = "character", - help = "path to output file" - ) - , - make_option( - c("-E", "--enrichGraph"), - action = "store", - type = "character", - help = "path to enrichment graph PDF" - ) - , - make_option( 
- c("-F", "--enrichGraph_svg"), - action = "store", - type = "character", - help = "path to enrichment graph SVG" - ) - , - make_option( - c("-L", "--locProbCutoffGraph"), - action = "store", - type = "character", - help = "path to location-proability cutoff graph PDF" - ) - , - make_option( - c("-M", "--locProbCutoffGraph_svg"), - action = "store", - type = "character", - help = "path to location-proability cutoff graph SVG" - ) - , - make_option( - c("-e", "--enriched"), - action = "store", - type = "character", - help = "pY or pST enriched samples (ie, 'Y' or 'ST')" - ) - # default = "^Number of Phospho [(]STY[)]$", - , - make_option( - c("-p", "--phosphoCol"), - action = "store", - type = "character", - help = paste0("PERL-compatible regular expression matching", - " header of column having number of 'Phospho (STY)'") - ) - # default = "^Intensity[^_]", - , - make_option( - c("-s", "--startCol"), - action = "store", - type = "character", - help = paste0("PERL-compatible regular expression matching", - " header of column having first sample intensity") - ) - # default = 1, - , - make_option( - c("-I", "--intervalCol"), - action = "store", - type = "integer", - help = paste0("Column interval between the Intensities of samples", - " (eg, 1 if subsequent column; 2 if every other column") - ) - # default = 0.75, - , - make_option( - c("-l", "--localProbCutoff"), - action = "store", - type = "double", - help = "Localization Probability Cutoff" - ) - # default = "sum", - , - make_option( - c("-f", "--collapse_func"), - action = "store", - type = "character", - help = paste0("merge identical phosphopeptides", - " by ('sum' or 'average') the intensities") - ) - # default = "filtered_data.txt", - , - make_option( - c("-r", "--filtered_data"), - action = "store", - type = "character", - help = "filtered_data.txt" - ) - # default = "quantData.txt", - , - make_option( - c("-q", "--quant_data"), - action = "store", - type = "character", - help = "quantData.txt" - ) -) -args 
<- parse_args(OptionParser(option_list = option_list)) -# Check parameter values - -### EXTRACT ARGUMENTS end ---------------------------------------------------- - - -### EXTRACT PARAMETERS from arguments begin ---------------------------------- - -if (!file.exists(args$input)) { - stop((paste("File", args$input, "does not exist"))) -} - -phospho_col_pattern <- "^Number of Phospho [(][STY][STY]*[)]$" -start_col_pattern <- "^Intensity[^_]" -phospho_col_pattern <- read_first_line(args$phosphoCol) -start_col_pattern <- read_first_line(args$startCol) - -sink(getConnection(2)) - -input_file_name <- args$input -filtered_filename <- args$filtered_data -quant_file_name <- args$quant_data -interval_col <- as.integer(args$intervalCol) - -first_line <- read_first_line(input_file_name) -col_headers <- - unlist(strsplit( - x = first_line, - split = c("\t"), - fixed = TRUE - )) -sink(getConnection(2)) -sink() - - -intensity_header_cols <- - grep(pattern = start_col_pattern, x = col_headers, perl = TRUE) -if (length(intensity_header_cols) == 0) { - err_msg <- - paste("Found no intensity columns matching pattern:", - start_col_pattern) - # Divert output to stderr - sink(getConnection(2)) - print(err_msg) - sink() - stop(err_msg) -} - - -phospho_col <- - grep(pattern = phospho_col_pattern, x = col_headers, perl = TRUE)[1] -if (is.na(phospho_col)) { - err_msg <- - paste("Found no 'number of phospho sites' columns matching pattern:", - phospho_col_pattern) - # Divert output to stderr - sink(getConnection(2)) - print(err_msg) - sink() - stop(err_msg) -} - - -i_count <- 0 -this_column <- 1 -last_value <- intensity_header_cols[1] -intensity_cols <- c(last_value) - -while (length(intensity_header_cols) >= interval_col * i_count) { - i_count <- 1 + i_count - this_column <- interval_col + this_column - if (last_value + interval_col != intensity_header_cols[this_column]) - break - last_value <- intensity_header_cols[this_column] - if (length(intensity_header_cols) < interval_col * i_count) 
- break - intensity_cols <- - c(intensity_cols, intensity_header_cols[this_column]) -} - -start_col <- intensity_cols[1] -num_samples <- i_count - -output_filename <- args$output -enrich_graph_filename <- args$enrichGraph -loc_prob_cutoff_graph_filename <- args$locProbCutoffGraph -enrich_graph_filename_svg <- args$enrichGraph_svg -loc_prob_cutoff_graph_fn_svg <- args$locProbCutoffGraph_svg - -local_prob_cutoff <- args$localProbCutoff -enriched <- args$enriched -collapse_fn <- args$collapse_func - -### EXTRACT PARAMETERS from arguments end ------------------------------------ - - -# Proteomics Quality Control for MaxQuant Results -# (Bielow C et al. J Proteome Res. 2016 PMID: 26653327) -# is run by the Galaxy MaxQuant wrapper and need not be invoked here. - - -# Read & filter out contaminants, reverse sequences, & localization probability -# --- -full_data <- - read.table( - file = input_file_name, - sep = "\t", - header = T, - quote = "" - ) - -# Filter out contaminant rows and reverse rows -filtered_data <- subset(full_data, !grepl("CON__", Proteins)) -filtered_data <- - subset(filtered_data, !grepl("_MYCOPLASMA", Proteins)) -filtered_data <- - subset(filtered_data, !grepl("CONTAMINANT_", Proteins)) -filtered_data <- - subset(filtered_data, !grepl("REV__", Protein) - ) # since REV__ rows are blank in the first column (Proteins) -write.table( - filtered_data, - file = filtered_filename, - sep = "\t", - quote = FALSE, - col.names = TRUE, - row.names = FALSE -) -# ... - - -# Filter out data with localization probability below localProbCutoff -# --- -# Data filtered by localization probability -loc_prob_filtered_data <- - filtered_data[ - filtered_data$Localization.prob >= local_prob_cutoff, - ] -# ... 
- - -# Localization probability -- visualize locprob cutoff -# --- -loc_prob_graph_data <- - data.frame( - group = c(paste(">", toString(local_prob_cutoff), sep = ""), - paste("<", toString(local_prob_cutoff), sep = "")), - value = c( - nrow(loc_prob_filtered_data) / nrow(filtered_data) * 100, - (nrow(filtered_data) - nrow(loc_prob_filtered_data)) - / nrow(filtered_data) * 100 - ) - ) -gigi <- - ggplot(loc_prob_graph_data, aes(x = "", y = value, fill = group)) + - geom_bar(width = 0.5, - stat = "identity", - color = "black") + - labs(x = NULL, - y = "percent", - title = "Phosphopeptides partitioned by localization-probability cutoff" - ) + - scale_fill_discrete(name = "phosphopeptide\nlocalization-\nprobability") + - theme_minimal() + - theme( - legend.position = "right", - legend.title = element_text(), - plot.title = element_text(hjust = 0.5), - plot.subtitle = element_text(hjust = 0.5), - plot.title.position = "plot" - ) -pdf(loc_prob_cutoff_graph_filename) -print(gigi) -dev.off() -svg(loc_prob_cutoff_graph_fn_svg) -print(gigi) -dev.off() -# ... - - -# Extract quantitative values from filtered data -# --- -quant_data <- - loc_prob_filtered_data[, seq(from = start_col, - by = interval_col, - length.out = num_samples)] -# ... 
- - -# Generate Phosphopeptide Sequence -# for latest version of MaxQuant (Version 1.5.3.30) -# --- -metadata_df <- - data.frame( - loc_prob_filtered_data[, 1:8], - loc_prob_filtered_data[, phospho_col], - loc_prob_filtered_data[, phospho_col + 1], - loc_prob_filtered_data[, phospho_col + 2], - loc_prob_filtered_data[, phospho_col + 3], - loc_prob_filtered_data[, phospho_col + 4], - loc_prob_filtered_data[, phospho_col + 5], - loc_prob_filtered_data[, phospho_col + 6], - loc_prob_filtered_data[, phospho_col + 7], - quant_data - ) -colnames(metadata_df) <- - c( - "Proteins", - "Positions within proteins", - "Leading proteins", - "Protein", - "Protein names", - "Gene names", - "Fasta headers", - "Localization prob", - "Number of Phospho (STY)", - "Amino Acid", - "Sequence window", - "Modification window", - "Peptide window coverage", - "Phospho (STY) Probabilities", - "Phospho (STY) Score diffs", - "Position in peptide", - colnames(quant_data) - ) -# 'phosphopeptide_func' generates a phosphopeptide sequence -# for each row of data. -# for the 'apply' function: MARGIN 1 == rows, 2 == columns, c(1, 2) = both -metadata_df$phosphopeptide <- - apply(X = metadata_df, MARGIN = 1, FUN = phosphopeptide_func) -colnames(metadata_df)[1] <- "Phosphopeptide" -# Move the quant data columns to the right end of the data.frame -metadata_df <- movetolast(metadata_df, c(colnames(quant_data))) -# ... - - -# Write quantitative values for debugging purposes -# --- -quant_write <- cbind(metadata_df[, "Sequence window"], quant_data) -colnames(quant_write)[1] <- "Sequence.Window" -write.table( - quant_write, - file = quant_file_name, - sep = "\t", - quote = FALSE, - col.names = TRUE, - row.names = FALSE -) -# ... 
- - -# Make new data frame containing only Phosphopeptides -# that are to be mapped to quant data (merge_df) -# --- -metadata_df <- - setDT(metadata_df, keep.rownames = TRUE) # row name will be used to map -merge_df <- - data.frame( - as.integer(metadata_df$rn), - metadata_df$phosphopeptide # row index to merge data frames - ) -colnames(merge_df) <- c("rn", "Phosphopeptide") -# ... - - -# Add Phosphopeptide column to quant columns for quality control checking -# --- -quant_data_qc <- as.data.frame(quant_data) -setDT(quant_data_qc, keep.rownames = TRUE) # will use to match rowname to data -quant_data_qc$rn <- as.integer(quant_data_qc$rn) -quant_data_qc <- merge(merge_df, quant_data_qc, by = "rn") -quant_data_qc$rn <- NULL # remove rn column -# ... - - -# Collapse multiphosphorylated peptides -# --- -quant_data_qc_collapsed <- - data.table(quant_data_qc, key = "Phosphopeptide") -quant_data_qc_collapsed <- - aggregate(. ~ Phosphopeptide, quant_data_qc, FUN = collapse_fn) -# ... -print("quant_data_qc_collapsed") -head(quant_data_qc_collapsed) - -# Compute (as string) % of phosphopeptides that are multiphosphorylated -# (for use in next step) -# --- -pct_multiphos <- - ( - nrow(quant_data_qc) - nrow(quant_data_qc_collapsed) - ) / (2 * nrow(quant_data_qc)) -pct_multiphos <- sprintf("%0.1f%s", 100 * pct_multiphos, "%") -# ... 
- - -# Compute and visualize breakdown of pY, pS, and pT before enrichment filter -# --- -py_data <- - quant_data_qc_collapsed[ - str_detect(quant_data_qc_collapsed$Phosphopeptide, "pY"), - ] -ps_data <- - quant_data_qc_collapsed[ - str_detect(quant_data_qc_collapsed$Phosphopeptide, "pS"), - ] -pt_data <- - quant_data_qc_collapsed[ - str_detect(quant_data_qc_collapsed$Phosphopeptide, "pT"), - ] - -py_num <- nrow(py_data) -ps_num <- nrow(ps_data) -pt_num <- nrow(pt_data) - -# Visualize enrichment -enrich_graph_data <- data.frame(group = c("pY", "pS", "pT"), - value = c(py_num, ps_num, pt_num)) - -enrich_graph_data <- - enrich_graph_data[ - enrich_graph_data$value > 0, - ] - -# Plot pie chart with legend -# start: https://stackoverflow.com/a/62522478/15509512 -# refine: https://www.statology.org/ggplot-pie-chart/ -# colors: https://colorbrewer2.org/#type=diverging&scheme=BrBG&n=8 -slices <- enrich_graph_data$value -phosphoresidue <- enrich_graph_data$group -pct <- round(100 * slices / sum(slices)) -lbls <- - paste(enrich_graph_data$group, "\n", pct, "%\n(", slices, ")", sep = "") -slc_ctr <- c() -run_tot <- 0 -for (p in pct) { - slc_ctr <- c(slc_ctr, run_tot + p / 2.0) - run_tot <- run_tot + p -} -lbl_y <- 100 - slc_ctr -df <- - data.frame(slices, - pct, - lbls, - phosphoresidue = factor(phosphoresidue, levels = phosphoresidue)) -gigi <- ggplot(df - , aes(x = 1, y = pct, fill = phosphoresidue)) + - geom_col(position = "stack", orientation = "x") + - geom_text(aes(x = 1, y = lbl_y, label = lbls), col = "black") + - coord_polar(theta = "y", direction = -1) + - labs( - x = NULL - , - y = NULL - , - title = "Percentages (and counts) of phosphosites, by type of residue" - , - caption = sprintf( - "Roughly %s of peptides have multiple phosphosites.", - pct_multiphos - ) - ) + - labs(x = NULL, y = NULL, fill = NULL) + - theme_classic() + - theme( - legend.position = "right" - , - axis.line = element_blank() - , - axis.text = element_blank() - , - axis.ticks = 
element_blank() - , - plot.title = element_text(hjust = 0.5) - , - plot.subtitle = element_text(hjust = 0.5) - , - plot.caption = element_text(hjust = 0.5) - , - plot.title.position = "plot" - ) + - scale_fill_manual(breaks = phosphoresidue, - values = c("#c7eae5", "#f6e8c3", "#dfc27d")) - -pdf(enrich_graph_filename) -print(gigi) -dev.off() -svg(enrich_graph_filename_svg) -print(gigi) -dev.off() -# ... - - -# Filter phosphopeptides by enrichment -# -- -if (enriched == "Y") { - quant_data_qc_enrichment <- quant_data_qc_collapsed[ - str_detect(quant_data_qc_collapsed$Phosphopeptide, "pY"), - ] -} else if (enriched == "ST") { - quant_data_qc_enrichment <- quant_data_qc_collapsed[ - str_detect(quant_data_qc_collapsed$Phosphopeptide, "pS") | - str_detect(quant_data_qc_collapsed$Phosphopeptide, "pT"), - ] -} else { - print("Error in enriched variable. Set to either 'Y' or 'ST'") -} -# ... - -print("quant_data_qc_enrichment") -head(quant_data_qc_enrichment) - -# Write phosphopeptides filtered by enrichment -# -- -write.table( - quant_data_qc_enrichment, - file = output_filename, - sep = "\t", - quote = FALSE, - row.names = FALSE -) -# ...
--- a/PhosphoPeptide_Upstream_Kinase_Mapping.pl Tue Mar 15 12:44:40 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2095 +0,0 @@ -#!/usr/local/bin/perl -############################################################################################################################### -# perl Kinase_enrichment_analysis_complete_v0.pl -# -# Nick Graham, USC -# 2016-02-27 -# -# Built from scripts written by NG at UCLA in Tom Graeber's lab: -# CombinePhosphoSites.pl -# Retrieve_p_motifs.pl -# NetworKIN_Motif_Finder_v7.pl -# -# Given a list of phospho-peptides, find protein information and upstream kinases. -# Output file can be used for KS enrichment score calculations using Enrichment_Score4Directory.pl -# -# Updated 2022-01-13, Art Eschenlauer, UMN on behalf of Justin Drake's lab: -# Added warnings and used strict; -# fixed some code paths resulting in more NetworKIN matches; -# applied Aho-Corasick algorithm (via external Python script because Perl implementation was still too slow) -# to speed up "Match the non_p_peptides to the @sequences array"; -# added support for SQLite-formatted UniProtKB/Swiss-Prot data as an alternative to FASTA-formatted data; -# added support for SQLite output in addition to tabular files. 
-# -# -############################################################################################################################### - -use strict; -use warnings 'FATAL' => 'all'; - -use Getopt::Std; -use DBD::SQLite::Constants qw/:file_open/; -use DBI qw(:sql_types); -use File::Copy; -use File::Basename; -use POSIX qw(strftime); -use Time::HiRes qw(gettimeofday); -#use Data::Dump qw(dump); - -my $USE_SEARCH_PPEP_PY = 1; -#my $FAILED_MATCH_SEQ = "Failed match"; -my $FAILED_MATCH_SEQ = 'No Sequence'; -my $FAILED_MATCH_GENE_NAME = 'No_Gene_Name'; - -my $dirname = dirname(__FILE__); -my %opts; -my ($file_in, $average_or_sum, $db_out, $file_out, $file_melt, $phospho_type); -my $dbtype; -my ($fasta_in, $networkin_in, $motifs_in, $PSP_Kinase_Substrate_in, $PSP_Regulatory_Sites_in); -my (@samples, %sample_id_lut, %ppep_id_lut, %data, @tmp_data, %n); -my $line = 0; -my @failed_match = ($FAILED_MATCH_SEQ); -my @failed_matches; -my (%all_data); -my (@p_peptides, @non_p_peptides); -my @parsed_fasta; -my (@accessions, @names, @sequences, @databases, $database); -my ($dbfile, $dbh, $stmth); -my @col_names; -my (%matched_sequences, %accessions, %names, %sites, ); -my (@tmp_matches, @tmp_accessions, @tmp_names, @tmp_sites); -my (%p_residues, @tmp_p_residues, @p_sites, $left, $right, %p_motifs, @tmp_motifs_array, $tmp_motif, $tmp_site, %residues); -my (@kinases_observed, $kinases); -my (@kinases_observed_lbl, @phosphosites_observed_lbl); -my ($p_sequence_kinase, $p_sequence, $kinase); -my (@motif_sequence, %motif_type, %motif_count); -my (@kinases_PhosphoSite, $kinases_PhosphoSite); -my ($p_sequence_kinase_PhosphoSite, $p_sequence_PhosphoSite, $kinase_PhosphoSite); -my (%regulatory_sites_PhosphoSite_hash); -my (%domain, %ON_FUNCTION, %ON_PROCESS, %ON_PROT_INTERACT, %ON_OTHER_INTERACT, %notes, %organism); -my (%unique_motifs); -my ($kinase_substrate_NetworKIN_matches, $kinase_motif_matches, $kinase_substrate_PhosphoSite_matches); -my %psp_regsite_protein_2; -my (%domain_2, 
%ON_FUNCTION_2, %ON_PROCESS_2, %ON_PROT_INTERACT_2, %N_PROT_INTERACT, %ON_OTHER_INTERACT_2, %notes_2, %organism_2); -my @timeData; -my $PhosphoSitePlusCitation; -my %site_description; - -my %kinase_substrate_NetworKIN_matches; -my %kinase_motif_matches; -my $regulatory_sites_PhosphoSite; -my ($seq_plus5aa, $seq_plus7aa, %seq_plus7aa_2); -my %kinase_substrate_PhosphoSite_matches; -my @formatted_sequence; -my $pSTY_sequence; -my $i; -my @a; -my $use_sqlite; -my $verbose; - -########## -## opts ## -########## - ## input files - # i : path to input file, e.g., 'outputfile_STEP2.txt' - # f : path to UniProtKB/SwissProt FASTA - # s : optional species argument - # n : path to NetworKIN_201612_cutoffscore2.0.txt - # m : path to pSTY_Motifs.txt - # p : path to 2017-03_PSP_Kinase_Substrate_Dataset.txt - # r : path to 2017-03_PSP_Regulatory_sites.txt - ## options - # P : phospho_type - # F : function - # v : verbose output - ## output files - # o : path to output file - # O : path to "melted" output file - # D : path to output SQLite file - -sub usage() - { - print STDERR <<"EOH"; - This program given a list of phospho-peptides, finds protein information and upstream kinases. 
- usage: $0 [-hvd] -f FASTA_file - -h : this (help) message - -v : slightly verbose - -a : use SQLite less - ## input files - -i : path to input file, e.g., 'outputfile_STEP2.txt' - -f : path to UniProtDB/SwissProt FASTA - -s : optional species filter argument for PSP records; defaults to 'human' - -n : path to NetworKIN_201612_cutoffscore2.0.txt - -m : path to pSTY_Motifs.txt - -p : path to 2017-03_PSP_Kinase_Substrate_Dataset.txt - -r : path to 2017-03_PSP_Regulatory_sites.txt - ## options - -P : phospho_type - -F : function - ## output files - -o : path to output file - -O : path to "melted" output file - -D : path to output SQLite file - example: $0 -EOH - exit; - } - -sub format_localtime_iso8601 { - # ref: https://perldoc.perl.org/Time::HiRes - my ($seconds, $microseconds) = gettimeofday; - # ref: https://pubs.opengroup.org/onlinepubs/9699919799/functions/strftime.html - return strftime("%Y-%m-%dT%H:%M:%S",localtime(time)) . sprintf(".%03d", $microseconds/1000); -} - -sub replace_pSpTpY { - my ($formatted_sequence, $phospho_type) = @_; - if ($phospho_type eq 'y') { - $formatted_sequence =~ s/pS/S/g; - $formatted_sequence =~ s/pT/T/g; - $formatted_sequence =~ s/pY/y/g; - } - elsif ($phospho_type eq "sty") { - $formatted_sequence =~ s/pS/s/g; - $formatted_sequence =~ s/pT/t/g; - $formatted_sequence =~ s/pY/y/g; - } - $formatted_sequence; -} - -sub pseudo_sed() -{ - # Comments give the sed equivalent - my $s; - # / GN=/!{ s:\(OX=[^ \t]*\):\1 GN=N/A:; }; - unless (m / GN=/s) - { - $s = s :(OX=[^ \t]*):${1} GN=N/A:s; - } - # / PE=/!{ s:\(GN=[^ \t]*\):\1 PE=N/A:; }; - unless (m / PE=/s) - { - $s = s :(GN=[^ \t]*):${1} PE=N/A:s; - } - # / SV=/!{ s:\(PE=[^ \t]*\):\1 SV=N/A:; }; - unless (m / SV=/s) - { - $s = s :(PE=[^ \t]*):${1} SV=N/A:s; - } - # s/^sp.//; - $s = s /^sp.//s; - # s/[|]/\t/g; - $s = s /[|]/\t/sg; - # s/ OS=/\t/; - $s = s / OS=/\t/s; - # s/ OX=/\t/; - $s = s / OX=/\t/s; - # s/ GN=/\t/; - $s = s / GN=/\t/s; - # s/ PE=/\t/; - $s = s / PE=/\t/s; - # s/ 
SV=/\t/; - $s = s / SV=/\t/s; -} # sub pseudo_sed - -getopts('i:f:s:n:m:p:r:P:F:o:O:D:hva', \%opts) ; - - -if (exists($opts{'h'})) { - usage(); -} -if (exists($opts{'a'})) { - $USE_SEARCH_PPEP_PY = 0; -} -if (exists($opts{'v'})) { - $verbose = 1; -} else { - $verbose = 0; -} -if (!exists($opts{'i'}) || !-e $opts{'i'}) { - die('Input File not found'); -} else { - $file_in = $opts{'i'}; -} -if (!exists($opts{'f'}) || !-e $opts{'f'}) { - die('FASTA not found'); -} else { - $fasta_in = $opts{'f'}; - $use_sqlite = 0; -} -my $species; -if ((!exists($opts{'s'})) || ($opts{'s'} eq '')) { - $species = 'human'; -} else { - $species = $opts{'s'}; - print "'-s' option is '$species'\n"; -} -print "species filter is '$species'\n"; - -if (!exists($opts{'n'}) || !-e $opts{'n'}) { - die('Input NetworKIN File not found'); -} else { - $networkin_in = $opts{'n'}; -} -if (!exists($opts{'m'}) || !-e $opts{'m'}) { - die('Input pSTY_Motifs File not found'); -} else { - $motifs_in = $opts{'m'}; -} -if (!exists($opts{'p'}) || !-e $opts{'p'}) { - die('Input PSP_Kinase_Substrate_Dataset File not found'); -} else { - $PSP_Kinase_Substrate_in = $opts{'p'}; -} -if (!exists($opts{'r'}) || !-e $opts{'r'}) { - die('Input PSP_Regulatory_sites File not found'); -} else { - $PSP_Regulatory_Sites_in = $opts{'r'}; -} -if (exists($opts{'P'})) { - $phospho_type = $opts{'P'}; -} -else { - $phospho_type = "sty"; -} -if (exists($opts{'F'})) { - $average_or_sum = $opts{'F'}; -} -else { - $average_or_sum = "sum"; -} -if (exists($opts{'D'})) { - $db_out = $opts{'D'}; -} -else { - $db_out = "db_out.sqlite"; -} -if (exists($opts{'O'})) { - $file_melt = $opts{'O'}; -} -else { - $file_melt = "output_melt.tsv"; -} -if (exists($opts{'o'})) { - $file_out = $opts{'o'}; -} -else { - $file_out = "output.tsv"; -} - - -############################################################################################################################### -# Print the relevant file names to the screen 
-############################################################################################################################### -# print "\nData file: $data_in\nFASTA file: $fasta_in\nSpecies: $species\nOutput file: $motifs_out\n\n"; -print "\n--- parameters:\n"; -print "Data file: $file_in\nAverage or sum identical p-sites? $average_or_sum\nOutput file: $file_out\nMelted map: $file_melt\n"; -if ($use_sqlite == 0) { - print "Motifs file: $motifs_in\nNetworKIN file: networkin_in\nPhosphosite kinase substrate data: $PSP_Kinase_Substrate_in\nPhosphosite regulatory site data: $PSP_Regulatory_Sites_in\nUniProtKB/SwissProt FASTA file: $fasta_in\nOutput SQLite file: $db_out\n"; -} else { - print "Motifs file: $motifs_in\nNetworKIN file: networkin_in\nPhosphosite kinase substrate data: $PSP_Kinase_Substrate_in\nPhosphosite regulatory site data: $PSP_Regulatory_Sites_in\nUniProtKB/SwissProt SQLIte file: $dbfile\nOutput SQLite file: $db_out\n"; -} -print "...\n\n"; - -print "Phospho-residues(s) = $phospho_type\n\n"; -if ($phospho_type ne 'y') { - if ($phospho_type ne 'sty') { - die "\nUsage error:\nYou must choose a phospho-type, either y or sty\n\n"; - } -} - -############################################################################################################################### -# read the input data file -# average or sum identical phospho-sites, depending on the value of $average_or_sum -############################################################################################################################### - -open (IN, "$file_in") or die "I couldn't find the input file: $file_in\n"; - -die "\n\nScript died: You must choose either average or sum for \$average_or_sum\n\n" if (($average_or_sum ne "sum") && ($average_or_sum ne "average")) ; - - -$line = 0; - -while (<IN>) { - chomp; - my @x = split(/\t/); - for my $n (0 .. $#x) {$x[$n] =~ s/\r//g; $x[$n] =~ s/\n//g; $x[$n] =~ s/\"//g;} - - # Read in the samples - if ($line == 0) { - for my $n (1 .. 
$#x) { - push (@samples, $x[$n]); - $sample_id_lut{$x[$n]} = $n; - } - $line++; - } else { - # check whether we have already seen a phospho-peptide - if (exists($data{$x[0]})) { - if ($average_or_sum eq "sum") { # add the data - # unload the data - @tmp_data = (); foreach (@{$data{$x[0]}}) { push(@tmp_data, $_); } - # add the new data and repack - for my $k (0 .. $#tmp_data) { $tmp_data[$k] = $tmp_data[$k] + $x[$k+1]; } - $all_data{$x[0]} = (); for my $k (0 .. $#tmp_data) { push(@{$all_data{$x[0]}}, $tmp_data[$k]); } - - } elsif ($average_or_sum eq "average") { # average the data - # unload the data - @tmp_data = (); foreach (@{$all_data{$x[0]}}) { push(@tmp_data, $_); } - # average with the new data and repack - for my $k (0 .. $#tmp_data) { $tmp_data[$k] = ( $tmp_data[$k]*$n{$x[0]} + $x[0] ) / ($n{$x[0]} + 1); } - $n{$x[0]}++; - $data{$x[0]} = (); for my $k (0 .. $#tmp_data) { push(@{$data{$x[0]}}, $tmp_data[$k]); } - } - } - # if the phospho-sequence has not been seen, save the data - else { - for my $k (1 .. 
$#x) { push(@{$data{$x[0]}}, $x[$k]); }
            # first sighting of this phospho-sequence: start its row count at one
            $n{$x[0]} = 1;
        }
    }
}
close(IN);


###############################################################################
# Search the FASTA database for phospho-sites and motifs
#
# based on Retrieve_p_peptide_motifs_v2.pl
###############################################################################


###############################################################################
#
# Read in the Data file:
# 1) make @p_peptides array as in the original file
# 2) make @non_p_peptides array w/o residue modifications (p, #, other)
#
###############################################################################

# For every key of %data, build two parallel entries:
#   @p_peptides      - lower-case s/t/y rewritten as pS/pT/pY
#   @non_p_peptides  - the same string with every "p" removed again
# Only the loop variable is rewritten; the keys of %data are untouched.
foreach my $peptide (keys %data) {
    $peptide =~ s/s/pS/g; $peptide =~ s/t/pT/g; $peptide =~ s/y/pY/g;
    push (@p_peptides, $peptide);
    $peptide =~ s/p//g;
    push(@non_p_peptides, $peptide);
}

if ($use_sqlite == 0) {
    ###########################################################################
    #
    # Read in the UniProtKB/Swiss-Prot data from FASTA; save to @sequences
    # array and SQLite output database
    #
    ###########################################################################

    # e.g. 
    # >sp|Q9Y3B9|RRP15_HUMAN RRP15-like protein OS=Homo sapiens OX=9606 GN=RRP15 PE=1 SV=2
    # MAAAAPDSRVSEEENLKKTPKKKMKMVTGAVASVLEDEATDTSDSEGSCGSEKDHFYSDD
    # DAIEADSEGDAEPCDKENENDGESSVGTNMGWADAMAKVLNKKTPESKPTILVKNKKLEK
    # EKEKLKQERLEKIKQRDKRLEWEMMCRVKPDVVQDKETERNLQRIATRGVVQLFNAVQKH
    # QKNVDEKVKEAGSSMRKRAKLISTVSKKDFISVLRGMDGSTNETASSRKKPKAKQTEVKS
    # EEGPGWTILRDDFMMGASMKDWDKESDGPDDSRPESASDSDT
    # accession: Q9Y3B9
    # name: RRP15_HUMAN RRP15-like protein OS=Homo sapiens OX=9606 GN=RRP15 PE=1 SV=2
    # sequence: MAAAAPDSRVSEEENLKKTPKKKMKMVTGAVASVLEDEATDTSDSEGSCGSEKDHFYSDD DAIEADSEGDAEPCDKENENDGESSVGTNMGWADAMAKVLNKKTPESKPTILVKNKKLEK EKEKLKQERLEKIKQRDKRLEWEMMCRVKPDVVQDKETERNLQRIATRGVVQLFNAVQKH QKNVDEKVKEAGSSMRKRAKLISTVSKKDFISVLRGMDGSTNETASSRKKPKAKQTEVKS EEGPGWTILRDDFMMGASMKDWDKESDGPDDSRPESASDSDT

    open (IN1, "$fasta_in") or die "I couldn't find $fasta_in\n";
    print "Reading FASTA file $fasta_in\n";
    # ref: https://perldoc.perl.org/perlsyn#Compound-Statements
    #   "If the condition expression of a while statement is based on any of
    #   a group of iterative expression types then it gets some magic treatment.
    #   The affected iterative expression types are readline, the <FILEHANDLE>
    #   input operator, readdir, glob, the <PATTERN> globbing operator, and
    #   `each`. If the condition expression is one of these expression types,
    #   then the value yielded by the iterative operator will be implicitly
    #   assigned to `$_`."
    while (<IN1>) {
        chomp;
        # ref: https://perldoc.perl.org/functions/split#split-/PATTERN/,EXPR
        #   "If only PATTERN is given, EXPR defaults to $_."
        my (@x) = split(/\|/);
        # strip CR, LF and double quotes from every '|'-separated field
        for my $i (0 .. $#x) {
            $x[$i] =~ s/\r//g; $x[$i] =~ s/\n//g; $x[$i] =~ s/\"//g; }
        if ($x[0] =~ /^>/) {
            # header line: database tag, accession and name are fields 0..2
            $x[0] =~ s/\>//g;
            push (@databases, $x[0]);
            push (@accessions, $x[1]);
            push (@names, $x[2]);
            # NOTE(review): pseudo_sed() is defined outside this view; it
            # presumably rewrites $_ into the tab-separated row later split
            # for the INSERT - confirm against its definition.
            pseudo_sed();
            s/$/\t/;
            push (@parsed_fasta, $_);
        } elsif ($x[0] =~ /^\w/) {
            # sequence line: append to the sequence of the most recent accession
            if (defined $sequences[$#accessions]) {
                $sequences[$#accessions] = $sequences[$#accessions].$x[0];
            } else {
                $sequences[$#accessions] = $x[0];
            }
            # ... and to the pending row text for that accession
            $parsed_fasta[$#accessions] = $parsed_fasta[$#accessions].$x[0];
        }
    }
    close IN1;
    print "Done Reading FASTA file $fasta_in\n";
    # in FASTA mode the database being written IS the output database
    $dbfile = $db_out;
    print "Begin writing $dbfile at " . format_localtime_iso8601() . "\n";
    $dbh = DBI->connect("dbi:SQLite:$dbfile", undef, undef);
    # remember the AutoCommit setting; it is restored after the bulk insert
    my $auto_commit = $dbh->{AutoCommit};
    print "auto_commit was $auto_commit and is now 0\n" if ($verbose);
    $dbh->{AutoCommit} = 0;

    # begin DDL-to-SQLite
    # ---
    $stmth = $dbh->prepare("
        DROP TABLE IF EXISTS UniProtKB;
    ");
    $stmth->execute();

    $stmth = $dbh->prepare("
        CREATE TABLE UniProtKB (
          Uniprot_ID TEXT PRIMARY KEY ON CONFLICT IGNORE,
          Description TEXT,
          Organism_Name TEXT,
          Organism_ID INTEGER,
          Gene_Name TEXT,
          PE TEXT,
          SV TEXT,
          Sequence TEXT,
          Database TEXT
        )
    ");
    $stmth->execute();
    $stmth = $dbh->prepare("
        CREATE UNIQUE INDEX idx_uniq_UniProtKB_0 on UniProtKB(Uniprot_ID);
    ");
    $stmth->execute();
    $stmth = $dbh->prepare("
        CREATE INDEX idx_UniProtKB_0 on UniProtKB(Gene_Name);
    ");
    $stmth->execute();
    # ...
    # end DDL-to-SQLite

    # insert all rows
    # begin store-to-SQLite "UniProtKB" table
    # ---
    $stmth = $dbh->prepare("
        INSERT INTO UniProtKB (
          Uniprot_ID,
          Description,
          Organism_Name,
          Organism_ID,
          Gene_Name,
          PE,
          SV,
          Sequence,
          Database
        ) VALUES (?,?,?,?,?,?,?,?,?)
    ");
    my $row_count = 1;
    my $row_string;
    my (@row, @rows);
    my $wrd;
    # rows are consumed from the END of @parsed_fasta; the index of the last
    # remaining row also selects the matching entry of @databases
    while ( scalar @parsed_fasta > 0 ) {
        $database = $databases[$#parsed_fasta];
        #### print "parsed_fasta[-1]: " . 
####     $parsed_fasta[$#parsed_fasta] . "\n";   (tail of the commented-out debug print)
        $row_string = pop(@parsed_fasta);
        #### print "row_string: $row_string\n";
        @row = (split /\t/, $row_string);
        # columns 1-3 and 5-8 are bound as text; 4 (Organism_ID) as integer;
        # 9 (Database) comes from @databases rather than from the row text
        for $i (1..3,5..8) {
            $stmth->bind_param($i, $row[$i]);
        }
        $stmth->bind_param(9, $database);
        $stmth->bind_param(4, $row[4], { TYPE => SQL_INTEGER });
        if (not $stmth->execute()) {
            # method calls are not interpolated inside double quotes, so the
            # error text has to be concatenated explicitly
            print "Error in row $row_count: " . $stmth->errstr . "\n";
        }
        $row_count += 1;
    }
    # ...
    # end store-to-SQLite "UniProtKB" table

    print "begin commit at " . format_localtime_iso8601() . "\n";
    # restoring AutoCommit ends the bulk-insert transaction
    $dbh->{AutoCommit} = $auto_commit;
    print "auto_commit is now $auto_commit\n" if ($verbose);
    $dbh->disconnect if ( defined $dbh );
    print "Finished writing $dbfile at " . format_localtime_iso8601() . "\n\n";
    $dbtype = "FASTA";
}

if ($use_sqlite == 1) {
    ###########################################################################
    #
    # Read in the UniProtKB/Swiss-Prot data from SQLite; save to @sequences array
    #
    ###########################################################################

    # all later writes go to the copy, never to the supplied database
    copy($dbfile, $db_out) or die "Copy $dbfile to $db_out failed: $!";

    # https://metacpan.org/pod/DBD::SQLite#Read-Only-Database
    $dbh = DBI->connect("dbi:SQLite:$dbfile", undef, undef, {
      sqlite_open_flags => SQLITE_OPEN_READONLY,
    }) or die "Cannot open $dbfile read-only: $DBI::errstr\n";
    print "DB connection $dbh is to $dbfile\n";

    # Uniprot_ID, Description, Organism_Name, Organism_ID, Gene_Name, PE, SV, Sequence
    $stmth = $dbh->prepare("
        SELECT Uniprot_ID
             , Description || ' OS=' || Organism_Name || ' OX=' || Organism_ID
               || CASE WHEN Gene_Name = 'N/A' THEN '' ELSE ' GN='|| Gene_Name END
               || CASE WHEN PE = 'N/A' THEN '' ELSE ' PE='|| PE END
               || CASE WHEN SV = 'N/A' THEN '' ELSE ' SV='|| SV END
               AS Description
             , Sequence
             , Database
          FROM
               UniProtKB
    ");
    $stmth->execute();
    @col_names = @{$stmth->{NAME}};
    print "\nColumn names selected from UniProtKB SQLite table: " . join(", ", @col_names) . "\n\n" if ($verbose);
    while (my @row = $stmth->fetchrow_array) {
        push (@names, $row[1]);              # redacted Description
        push (@accessions, $row[0]);         # Uniprot_ID
        $sequences[$#accessions] = $row[2];  # Sequence
        push (@databases, $row[3]);          # Database (should be 'sp')
    }

    $dbh->disconnect if ( defined $dbh );

    print "Done Reading UniProtKB/Swiss-Prot file $dbfile\n\n";
    $dbtype = "SQLite";
}

# $#accessions is the LAST INDEX (count - 1); report the actual count
print scalar(@accessions) . " accessions were read from the UniProtKB/Swiss-Prot $dbtype file\n";

######################
  # Add the placeholder row used for failed matches.  It is written to
  # $db_out: in FASTA mode that is the same file as $dbfile, and in SQLite mode
  # it keeps the supplied input database unmodified (it was opened read-only
  # above and already copied to $db_out).  Values are bound as parameters
  # instead of being pasted into the SQL text.
  $dbh = DBI->connect("dbi:SQLite:$db_out", undef, undef)
    or die "Cannot open $db_out to insert dummy row: $DBI::errstr\n";
  $stmth = $dbh->prepare("
    INSERT INTO UniProtKB (
      Uniprot_ID,
      Description,
      Organism_Name,
      Organism_ID,
      Gene_Name,
      PE,
      SV,
      Sequence,
      Database
    ) VALUES (?,?,?,?,?,?,?,?,?)
  ");
  if (not $stmth->execute(
        'No Uniprot_ID',
        'NO_GENE_SYMBOL No Description',
        'No Organism_Name',
        0,
        $FAILED_MATCH_GENE_NAME,
        '0',
        '0',
        $FAILED_MATCH_SEQ,
        'No Database'
      )) {
      print "Error inserting dummy row into UniProtKB: " . $stmth->errstr . "\n";
  }
  $dbh->disconnect if ( defined $dbh );
######################

@timeData = localtime(time);
print "\n--- Start search at " . format_localtime_iso8601() ."\n";

print " --> Calling 'search_ppep' script\n\n";
# List-form system() bypasses the shell, so paths containing spaces or shell
# metacharacters are passed through intact.
my $conda_prefix = defined $ENV{CONDA_PREFIX} ? $ENV{CONDA_PREFIX} : "";
my @search_cmd = ("$conda_prefix/bin/python", "$dirname/search_ppep.py", "-u", $db_out, "-p", $file_in);
push(@search_cmd, "--verbose") if ($verbose);
$i = system(@search_cmd);
if ($i) {
    # system() returns the raw wait status; the exit code is its high byte
    my $exit_code = ($i == -1) ? "-1 (could not be started: $!)" : ($i >> 8);
    print "python $dirname/search_ppep.py -u $db_out -p $file_in\n exited with exit code $exit_code\n";
    die "Search failed for phosphopeptides in SwissProt/SQLite file.";
}
print " <-- Returned from 'search_ppep' script\n";

@timeData = localtime(time);
print "... Finished search at " . 
format_localtime_iso8601() ."\n\n";


###############################################################################
#
# Match the non_p_peptides to the @sequences array:
# 1) Format the motifs +/- 10 residues around the phospho-site
# 2) Print the original data plus the phospho-motif to the output file
#
###############################################################################


print "--- Match the non_p_peptides to the \@sequences array:\n";

if ($USE_SEARCH_PPEP_PY) {
    print "Find the matching protein sequence(s) for the peptide using SQLite\n";
} else {
    print "Find the matching protein sequence(s) for the peptide using slow search\n";
}

# https://metacpan.org/pod/DBD::SQLite#Read-Only-Database
$dbh = DBI->connect("dbi:SQLite:$db_out", undef, undef, {
  sqlite_open_flags => SQLITE_OPEN_READONLY,
}) or die "Cannot open $db_out read-only: $DBI::errstr\n";
print "DB connection $dbh is to $db_out\n";

# CREATE VIEW uniprotid_pep_ppep AS
#   SELECT deppep_UniProtKB.UniprotKB_ID AS accession
#        , deppep.seq AS peptide
#        , ppep.seq AS phosphopeptide
#        , UniProtKB.Sequence AS sequence
#        , UniProtKB.Description AS description
#     FROM ppep, deppep, deppep_UniProtKB, UniProtKB
#    WHERE deppep.id = ppep.deppep_id
#      AND deppep.id = deppep_UniProtKB.deppep_id
#      AND deppep_UniProtKB.UniprotKB_ID = UniProtKB.Uniprot_ID
#    ORDER BY UniprotKB_ID, deppep.seq, ppep.seq;

# Pass 1: how many view rows belong to each phosphopeptide.  Pass 2 below
# counts these down to know when a phosphopeptide's rows are exhausted.
my %ppep_to_count_lut;
print "start select peptide counts " . format_localtime_iso8601() . "\n";
my $uniprotkb_pep_ppep_view_stmth = $dbh->prepare("
    SELECT DISTINCT
           phosphopeptide
         , count(*) as i
      FROM
           uniprotkb_pep_ppep_view
     GROUP BY
           phosphopeptide
     ORDER BY
           phosphopeptide
");
if (not $uniprotkb_pep_ppep_view_stmth->execute()) {
    # "->errstr" inside double quotes is NOT a method call; concatenate it
    die "Error fetching peptide counts: " . $uniprotkb_pep_ppep_view_stmth->errstr . "\n";
}
while (my @row = $uniprotkb_pep_ppep_view_stmth->fetchrow_array) {
    $ppep_to_count_lut{$row[0]} = $row[1];
    #print "\$ppep_to_count_lut{$row[0]} = $ppep_to_count_lut{$row[0]}\n";
}

# accession, peptide, sequence, description, phosphopeptide, long_description, pos_start, pos_end, scrubbed, ppep_id
# 0          1        2         3            4               5                 6          7        8         9
my $COL_ACCESSION = 0;
my $COL_PEPTIDE = 1;
my $COL_SEQUENCE = 2;
my $COL_DESCRIPTION = 3;
my $COL_PHOSPHOPEPTIDE = 4;
my $COL_LONG_DESCRIPTION = 5;
my $COL_POS_START = 6;
my $COL_POS_END = 7;
my $COL_SCRUBBED = 8;
my $COL_PPEP_ID = 9;

# Pass 2: walk every view row ordered by phosphopeptide and collect, per
# phosphopeptide, the matched sequences / accessions / names / start sites.
my %ppep_to_row_lut;
print "start select all records without qualification " . format_localtime_iso8601() . "\n";
$uniprotkb_pep_ppep_view_stmth = $dbh->prepare("
    SELECT DISTINCT
           accession
         , peptide
         , sequence
         , description
         , phosphopeptide
         , long_description
         , pos_start
         , pos_end
         , scrubbed
         , ppep_id
      FROM
           uniprotkb_pep_ppep_view
     ORDER BY
           phosphopeptide
");
if (not $uniprotkb_pep_ppep_view_stmth->execute()) {
    die "Error fetching all records without qualification: " . $uniprotkb_pep_ppep_view_stmth->errstr . "\n";
}
my $current_ppep;
my $counter = 0;
my $former_ppep = "";
@tmp_matches = ();
@tmp_accessions = ();
@tmp_names = ();
@tmp_sites = ();
while (my @row = $uniprotkb_pep_ppep_view_stmth->fetchrow_array) {
    # Identify phosphopeptide for current row;
    # it is an error for it to change when the counter is not zero.
    # NOTE(review): pass 1 counts ALL view rows while this query is
    # SELECT DISTINCT; if the view can hold fully duplicated rows the counter
    # would not reach zero and the die below fires - confirm the view is
    # duplicate-free.
    $current_ppep = $row[$COL_PHOSPHOPEPTIDE];

    # when counter is zero, prepare for a new phosphopeptide
    if ($current_ppep ne $former_ppep) {
        die "counter is $counter instead of zero" if ($counter != 0);
        $ppep_id_lut{$current_ppep} = $row[$COL_PPEP_ID];
        print "next phosphpepetide: $current_ppep; id: $ppep_id_lut{$current_ppep}\n" if ($verbose);
        $counter = $ppep_to_count_lut{$current_ppep};
        @tmp_matches = ();
        @tmp_accessions = ();
        @tmp_names = ();
        @tmp_sites = ();
    }

    if ($USE_SEARCH_PPEP_PY) {
        push(@tmp_matches, $row[ $COL_SEQUENCE ]);
        push(@tmp_accessions, $row[ $COL_ACCESSION ]);
        push(@tmp_names, $row[ $COL_LONG_DESCRIPTION ]);
        push(@tmp_sites, $row[ $COL_POS_START ]);
    }

    # Prepare counter and phosphopeptide tracker for next row
    $former_ppep = $current_ppep;
    $counter -= 1;

    # Set trackers for later use after last instance of current phosphopeptide
    if ($counter == 0) {
        if ($USE_SEARCH_PPEP_PY) {
            # store copies, because the @tmp_* arrays are reused
            $matched_sequences{$current_ppep} = [ @tmp_matches ];
            $accessions{ $current_ppep} = [ @tmp_accessions ];
            $names{ $current_ppep} = [ @tmp_names ];
            $sites{ $current_ppep} = [ @tmp_sites ];
        }
    }
}


print "end select all records without qualification " . format_localtime_iso8601() . "\n";

for my $j (0 .. $#p_peptides) {

    #Find the matching protein sequence(s) for the peptide using SQLite
    my ($site, $sequence);
    my (@row, @rows);
    my $match = 0;
    my $p_peptide = $p_peptides[$j];
    @tmp_matches = ();
    @tmp_accessions = ();
    @tmp_names = ();
    @tmp_sites = ();

    #Find the matching protein sequence(s) for the peptide using slow search
    $site = -1;
    unless ($USE_SEARCH_PPEP_PY) {
        for my $k (0 .. 
$#sequences) {
            # substring search of the de-phosphorylated peptide in sequence $k
            $site = index($sequences[$k], $non_p_peptides[$j]);
            if ($site != -1) {
                push(@tmp_matches, $sequences[$k]);
                push(@tmp_accessions, $accessions[$k]);
                push(@tmp_names, $names[$k]);
                push(@tmp_sites, $site);
                # Count HITS only.  Counting every sequence examined (as
                # before) made the "$match == 0" test below unreachable
                # whenever @sequences was non-empty.
                $match++;
            }
            # print "Non-phosphpeptide $non_p_peptides[$j] matched accession $accessions[$k] ($names[$k]) at site $site\n";
            $site = -1;
            # print "tmp_accessions @tmp_accessions \n";
        }
        if ($match == 0) { # Check to see if no match was found. Skip to next if no match found.
            print "Warning: Failed match for $p_peptides[$j]\n";
            $matched_sequences{$p_peptides[$j]} = \@failed_match;
            push(@failed_matches,$p_peptides[$j]);
            next;
        } else {
            # store copies, because the @tmp_* arrays are reset per peptide
            $matched_sequences{$p_peptides[$j]} = [ @tmp_matches ];
            $accessions{$p_peptides[$j]} = [ @tmp_accessions ];
            $names{$p_peptides[$j]} = [ @tmp_names ];
            $sites{$p_peptides[$j]} = [ @tmp_sites ];
        }
    }

} # end for my $j (0 .. $#p_peptides)

print "... Finished match the non_p_peptides at " . format_localtime_iso8601() ."\n\n";

print "--- Match the p_peptides to the \@sequences array:\n";

# Membership lookup for failed peptides.  The former test
# grep($peptide_to_match, @failed_matches) evaluated the (always true) peptide
# string for each element, so it was true for EVERY peptide as soon as
# @failed_matches held anything.
my %is_failed_match = map { $_ => 1 } @failed_matches;

for my $peptide_to_match ( keys %matched_sequences ) {
    if ($is_failed_match{$peptide_to_match}) {
        print "Failed to match peptide $peptide_to_match\n";
        next;
    }
    my @matches = @{$matched_sequences{$peptide_to_match}};
    @tmp_motifs_array = ();
    for my $i (0 .. $#matches) {

        # Find the location of the phospo-site in the sequence(s)
        $tmp_site = 0; my $offset = 0;
        my $tmp_p_peptide = $peptide_to_match;
        # drop '#', digits, '_' and '.' so only residues and 'p' markers remain
        $tmp_p_peptide =~ s/#//g; $tmp_p_peptide =~ s/\d//g; $tmp_p_peptide =~ s/\_//g; $tmp_p_peptide =~ s/\.//g;

        # Find all phosphorylated residues in the p_peptide
        # Each 'p' found is removed before the next search, so every recorded
        # position is the index of the phospho-residue in the marker-free
        # peptide.
        @p_sites = ();
        while ($tmp_site != -1) {
            $tmp_site = index($tmp_p_peptide, 'p', $offset);
            if ($tmp_site != -1) {push (@p_sites, $tmp_site);}
            $offset = $tmp_site + 1;
            $tmp_p_peptide =~ s/p//;
        }
        @tmp_p_residues = ();
        for my $l (0 .. 
$#p_sites) {
            # no start position recorded for this match: nothing to anchor on
            next if not defined $sites{$peptide_to_match}[$i];

            # position of this phospho-residue within the protein sequence =
            # offset inside the peptide + start of the peptide in the protein
            push (@tmp_p_residues, $p_sites[$l] + $sites{$peptide_to_match}[$i]);

            # Match the sequences around the phospho residues to find the motifs
            my ($desired_residues_L, $desired_residues_R);
            if ($tmp_p_residues[0] - 10 < 0) {  #check to see if there are fewer than 10 residues left of the first p-site
                # eg, XXXpYXX want $desired_residues_L = 3, $p_residues[0] = 3
                $desired_residues_L = $tmp_p_residues[0];
            }
            else {
                $desired_residues_L = 10;
            }
            my $seq_length = length($matched_sequences{$peptide_to_match}[$i]);
            # NOTE(review): when last-site + 10 == $seq_length only 9 residues
            # remain to the right, yet this test still selects 10; the length
            # guard further down then rejects the motif - confirm whether ">="
            # was intended.
            if ($tmp_p_residues[$#tmp_p_residues] + 10 > $seq_length) {  #check to see if there are fewer than 10 residues right of the last p-site
                $desired_residues_R = $seq_length - ($tmp_p_residues[$#tmp_p_residues] + 1);
                # eg, XXXpYXX want $desired_residues_R = 2, $seq_length = 6, $p_residues[$#p_residues] = 3
                # print "Line 170: seq_length = $seq_length\tp_residue = $p_residues[$#p_residues]\n";
            }
            else {
                $desired_residues_R = 10;
            }

            # window = left flank + span from first to last p-site + right flank
            my $total_length = $desired_residues_L + $tmp_p_residues[$#tmp_p_residues] - $tmp_p_residues[0] + $desired_residues_R + 1;
            my $arg2 = $tmp_p_residues[0] - $desired_residues_L;   # window start in the protein
            my $arg1 = $matched_sequences{$peptide_to_match}[$i]; # protein sequence

            # only extract when the whole window lies inside the sequence
            if (($total_length > 0) && (length($arg1) > $arg2 + $total_length - 1)) {
                $tmp_motif = substr($arg1, $arg2, $total_length);

                # Put the "p" back in front of the appropriate phospho-residue(s).
                my (@tmp_residues, $tmp_position);
                for my $m (0 .. $#p_sites) {
                    # print "Line 183: $p_sites[$m]\n";
                    # position of p-site $m inside the extracted window
                    if ($m == 0) {
                        $tmp_position = $desired_residues_L;
                    } else {
                        $tmp_position = $desired_residues_L + $p_sites[$m] - $p_sites[0];
                    }
                    # NOTE(review): @tmp_residues is indexed by $m below, which
                    # lines up only while no earlier iteration skipped its
                    # push - verify for multiply phosphorylated peptides.
                    if ($tmp_position < length($tmp_motif) + 1) {
                        push (@tmp_residues, substr($tmp_motif, $tmp_position, 1));
                        # mark the phospho-residue by lower-casing it in place
                        if ($tmp_residues[$m] eq "S") {substr($tmp_motif, $tmp_position, 1, "s");}
                        if ($tmp_residues[$m] eq "T") {substr($tmp_motif, $tmp_position, 1, "t");}
                        if ($tmp_residues[$m] eq "Y") {substr($tmp_motif, $tmp_position, 1, "y");}
                    }
                }

                # expand the lower-case markers to the pS/pT/pY notation
                $tmp_motif =~ s/s/pS/g; $tmp_motif =~ s/t/pT/g; $tmp_motif =~ s/y/pY/g;

                # Comment out on 8.10.13 to remove the numbers from motifs
                # 1-based positions of the window ends in the protein
                my $left_residue = $tmp_p_residues[0] - $desired_residues_L+1;
                my $right_residue = $tmp_p_residues[$#tmp_p_residues] + $desired_residues_R+1;
                $tmp_motif = $left_residue."-[ ".$tmp_motif." ]-".$right_residue;
                # NOTE(review): this push sits inside the "$l" loop, so a
                # peptide with several p-sites contributes one motif per site
                # processed so far, not one per matched sequence - confirm
                # that consumers indexing $p_motifs by match number expect this.
                push(@tmp_motifs_array, $tmp_motif);
                $residues{$peptide_to_match}{$i} = [ @tmp_residues ];
                $p_residues{$peptide_to_match}{$i} = [ @tmp_p_residues ];
            }
        }
        # re-stored after every matched sequence; the last store holds all motifs
        $p_motifs{$peptide_to_match} = [ @tmp_motifs_array ];
    } # end for my $i (0 .. $#matches) ### this bracket could be in the wrong place
}

print "... Finished match the p_peptides to the \@sequences array at " . 
format_localtime_iso8601() ."\n\n";

###############################################################################
#
# Annotate the peptides with the NetworKIN predictions and HPRD / Phosida
# kinase motifs
#
###############################################################################


print "--- Reading various site data:\n";

###############################################################################
#
# NetworKIN_predictions file:
# 1) collect each kinase once, in order of first appearance, in
#    @kinases_observed
# 2) remember every "<site sequence>_<kinase>" pair in $p_sequence_kinase
#
###############################################################################
my $SITE_KINASE_SUBSTRATE = 1;
$site_description{$SITE_KINASE_SUBSTRATE} = "NetworKIN";

open (IN1, "$networkin_in") or die "I couldn't find $networkin_in\n";
print "Reading the NetworKIN data: $networkin_in\n";
while (<IN1>) {
    chomp;
    my @fields = split(/\t/);
    # scrub CR, LF and double quotes from every field
    s/[\r\n"]//g foreach (@fields);

    # header row
    next if ($fields[0] eq "#substrate");

    my $kinase = $fields[2];
    unless (exists($kinases->{$kinase})) {
        $kinases->{$kinase} = $kinase;
        push(@kinases_observed, $kinase);
    }

    my $site_kinase = $fields[10] . "_" . $kinase;   #eg, REEILsEMKKV_PKCalpha
    unless (exists($p_sequence_kinase->{$site_kinase})) {
        $p_sequence_kinase->{$site_kinase} = $site_kinase;
    }
}
close IN1;

###############################################################################
#
# Kinase motifs file: fill @motif_sequence, %motif_type and %motif_count
#
###############################################################################

# file format (tab separated):
#   x[0] = primary key (character), e.g., '17' or '23a'
#   x[1] = pattern (egrep pattern), e.g., '(M|I|L|V|F|Y).R..(pS|pT)'
#   x[2] = description, e.g., 'PKA_Phosida' or '14-3-3 domain binding motif (HPRD)' or 'Akt kinase substrate motif (HPRD & Phosida)'

my $SITE_MOTIF = 2;
$site_description{$SITE_MOTIF} = "motif";

open (IN2, "$motifs_in") or die "I couldn't find $motifs_in\n";
print "Reading the Motifs file: $motifs_in\n";

while (<IN2>) {
    chomp;
    my @fields = split(/\t/);
    # only the three documented columns are scrubbed
    foreach my $col (0 .. 2) {
        $fields[$col] =~ s/[\r\n"]//g;
    }
    my ($pattern, $description) = @fields[1, 2];
    if (exists($motif_type{$pattern})) {
        # same pattern listed again: chain the descriptions
        $motif_type{$pattern} = $motif_type{$pattern} . " & " . $description;
    } else {
        $motif_type{$pattern} = $description;
        $motif_count{$pattern} = 0;
        push(@motif_sequence, $pattern);
    }
}
close (IN2);


###############################################################################
# 6.28.2011
# Read PSP_Kinase_Substrate data:
# 1) make a "kinases_PhosphoSite" array
# 2) annotate the phospho-substrates with the appropriate kinase
#
# Columns:
# (0) GENE
# (1) KINASE
# (2) KIN_ACC_ID
# (3) KIN_ORGANISM
# (4) SUBSTRATE
# (5) SUB_GENE_ID
# (6) SUB_ACC_ID
# (7) SUB_GENE
# (8) SUB_ORGANISM
# (9) SUB_MOD_RSD
# (10) SITE_GRP_ID
# (11) SITE_+/-7_AA
# (12) DOMAIN
# (13) IN_VIVO_RXN
# (14) IN_VITRO_RXN
# (15) CST_CAT#
###############################################################################

my $SITE_PHOSPHOSITE = 3;
$site_description{$SITE_PHOSPHOSITE} = "PhosphoSite";


$line = 0;

open (IN3, "$PSP_Kinase_Substrate_in") or die "I couldn't find $PSP_Kinase_Substrate_in\n";
print "Reading the PhosphoSite Kinase-Substrate data: $PSP_Kinase_Substrate_in\n";

while (<IN3>) {
    chomp;
    my (@x) = split(/\t/);
    s/[\r\n"]//g foreach (@x);
    # line 0 is the header row
    if ($line != 0) {
        # keep rows where kinase AND substrate organism match $species
        if (($species eq $x[3]) && ($species eq $x[8])) {
            unless (exists($kinases_PhosphoSite->{$x[0]})) {
                $kinases_PhosphoSite->{$x[0]} = $x[0];
                push(@kinases_PhosphoSite, $x[0]);
            }
            my $offset = 0;
            # Replace the superfluous lower case s, t and y
            my @lowercase = ('s','t','y');
            my @uppercase = ('S','T','Y');
            for my $k (0 .. 
2) {
                # Upper-case every occurrence of this letter in the +/-7 AA
                # window EXCEPT at index 7 (the central, modified residue).
                # $offset ends each letter's scan at 0 again because the final
                # index() returns -1.
                my $site = 0;
                while ($site != -1) {
                    $site = index($x[11],$lowercase[$k], $offset);
                    if (($site != 7) && ($site != -1)) {substr($x[11], $site, 1, $uppercase[$k]);}
                    $offset = $site + 1;
                }
            }
            # key = "<normalised window>_<column 0>"
            my $tmp = $x[11]."_".$x[0];        #eg, RTPGRPLsSYGMDSR_PAK2
            if (exists($p_sequence_kinase_PhosphoSite -> {$tmp})) {
                #do nothing
            }
            else {
                $p_sequence_kinase_PhosphoSite -> {$tmp} = $tmp;
            }
        }
        else {
            # do nothing
            #print "PSP_kinase_substrate line rejected because KIN_ORGANISM is '$x[3]' and SUB_ORGANISM is '$x[8]': $line\n";
        }
    }
    $line++;
}
close IN3;


###############################################################################
# Read PhosphoSite regulatory site data:
# 1) make a "regulatory_sites_PhosphoSite" hash
#
# Columns:
# (0) GENE
# (1) PROTEIN --> #ACE %psp_regsite_protein
# (2) PROT_TYPE
# (3) ACC_ID
# (4) GENE_ID
# (5) HU_CHR_LOC
# (6) ORGANISM --> %organism
# (7) MOD_RSD
# (8) SITE_GRP_ID
# (9) SITE_+/-7_AA --> %regulatory_sites_PhosphoSite_hash
# (10) DOMAIN --> %domain
# (11) ON_FUNCTION --> %ON_FUNCTION
# (12) ON_PROCESS --> %ON_PROCESS
# (13) ON_PROT_INTERACT --> %ON_PROT_INTERACT
# (14) ON_OTHER_INTERACT --> %ON_OTHER_INTERACT
# (15) PMIDs
# (16) LT_LIT
# (17) MS_LIT
# (18) MS_CST
# (19) NOTES --> %notes
###############################################################################


# Re-open the output database read-write with AutoCommit off; the previous
# AutoCommit value is kept in $auto_commit.
$dbh = DBI->connect("dbi:SQLite:$db_out", undef, undef);
my $auto_commit = $dbh->{AutoCommit};
$dbh->{AutoCommit} = 0;
print "DB connection $dbh is to $db_out, opened for modification\n";

# add partial PSP_Regulatory_site table (if not exists) regardless of whether SwissProt input was FASTA or SQLite
$stmth = $dbh->prepare("
CREATE TABLE IF NOT EXISTS PSP_Regulatory_site (
  SITE_PLUSMINUS_7AA TEXT PRIMARY KEY ON CONFLICT IGNORE,
  DOMAIN TEXT,
  ON_FUNCTION TEXT,
  ON_PROCESS TEXT,
  ON_PROT_INTERACT TEXT,
  ON_OTHER_INTERACT TEXT,
  NOTES TEXT,
  ORGANISM TEXT,
  PROTEIN TEXT
)
");
$stmth->execute();

# add partial PSP_Regulatory_site LUT (if not exists) regardless of whether SwissProt input was FASTA or SQLite
$stmth = $dbh->prepare("
CREATE TABLE IF NOT EXISTS ppep_regsite_LUT
( ppep_id INTEGER REFERENCES ppep(id)
, site_plusminus_7AA TEXT REFERENCES PSP_Regulatory_site(site_plusminus_7AA)
, PRIMARY KEY (ppep_id, site_plusminus_7AA) ON CONFLICT IGNORE
);
");
$stmth->execute();

# $stmth = $dbh->prepare("
# CREATE UNIQUE INDEX idx_PSP_Regulatory_site_0
#   ON PSP_Regulatory_site(site_plusminus_7AA);
# ");
# $stmth->execute();


# add Citation table (if not exists) regardless of whether SwissProt input was FASTA or SQLite
my $citation_sql;
$citation_sql = "
CREATE TABLE IF NOT EXISTS Citation (
  ObjectName TEXT REFERENCES sqlite_schema(name) ON DELETE CASCADE,
  CitationData TEXT,
  PRIMARY KEY (ObjectName, CitationData) ON CONFLICT IGNORE
)
";
$stmth = $dbh->prepare($citation_sql);
$stmth->execute();


open (IN4, "$PSP_Regulatory_Sites_in") or die "I couldn't find $PSP_Regulatory_Sites_in\n";
print "Reading the PhosphoSite regulatory site data: $PSP_Regulatory_Sites_in\n";


# $line is incremented at the top of the loop, so the first line read is 0
$line = -1;
while (<IN4>) {
    $line++;
    chomp;
    # Any line mentioning "PhosphoSitePlus" is treated as the citation banner:
    # it is stripped of tabs/CR/LF, de-quoted, echoed, and not parsed as data.
    if ($_ =~ m/PhosphoSitePlus/) {
        #$PhosphoSitePlusCitation = ($_ =~ s/PhosphoSitePlus/FooBar/g);
        $PhosphoSitePlusCitation = $_;
        $PhosphoSitePlusCitation =~ s/\t//g;
        $PhosphoSitePlusCitation =~ s/\r//g;
        $PhosphoSitePlusCitation =~ s/\n//g;
        $PhosphoSitePlusCitation =~ s/""/"/g;
        $PhosphoSitePlusCitation =~ s/^"//g;
        $PhosphoSitePlusCitation =~ s/"$//g;
        print "$PhosphoSitePlusCitation\n";
        next;
    }
    my (@x) = split(/\t/);
    for my $i (0 .. 
$#x) {
        # scrub CR, LF and double quotes from every field
        $x[$i] =~ s/\r//g; $x[$i] =~ s/\n//g; $x[$i] =~ s/\"//g;
    }
    # NOTE(review): $found_GENE is re-declared (and so reset to 0) for every
    # input line, which means the "has no SITE_+/-7_AA" notice below can never
    # print.  Declaring it before the enclosing while loop would restore the
    # apparent intent; that declaration point lies outside this block.
    my $found_GENE=0;
    if ( (not exists($x[0])) ) {
        # empty line
        next;
    }
    elsif ( ($x[0] eq "GENE") ) {
        # column-header row
        $found_GENE=1;
        next;
    }
    if ( (not exists($x[9])) || ($x[9] eq "") ) {
        # no SITE_+/-7_AA: fatal when the row carries a SITE_GRP_ID,
        # otherwise the row is skipped
        if (exists($x[8]) && (not $x[8] eq "")) {
            die "$PSP_Regulatory_Sites_in line $line has no SITE_+/-7_AA: $_\n";
        } else {
            if ( (not exists($x[1])) || (not $x[1] eq "") ) {
                print "$PSP_Regulatory_Sites_in line $line (".length($_)." characters) has no SITE_+/-7_AA: $_\n"
                    if $found_GENE==1;
            }
            next;
        }
    }
    elsif ($line != 0) {
        if ($species ne $x[6]) {
            # Do nothing - this record was filtered out by the species filter
        }
        elsif (!exists($regulatory_sites_PhosphoSite_hash{$x[9]})) {
            # first record for this +/-7 AA site: take every column as-is
            if (!defined $domain{$x[9]} || $domain{$x[9]} eq "") {
                $regulatory_sites_PhosphoSite_hash{$x[9]} = $x[9];
                $domain{$x[9]} = $x[10];
                $ON_FUNCTION{$x[9]} = $x[11];
                $ON_PROCESS{$x[9]} = $x[12];
                $ON_PROT_INTERACT{$x[9]} = $x[13];
                $ON_OTHER_INTERACT{$x[9]} = $x[14];
                $notes{$x[9]} = $x[19];
                $organism{$x[9]} = $x[6];
            }
        }
        else {
            # The site was already recorded: merge each annotation from ITS
            # OWN column.  Previously every field was merged from column 10
            # (DOMAIN), and empty entries were "filled" from $hash{$x[10]},
            # i.e. a lookup keyed by the DOMAIN text rather than the value.
            _merge_regsite_field(\%domain,            $x[9], $x[10]);
            _merge_regsite_field(\%ON_FUNCTION,       $x[9], $x[11]);
            _merge_regsite_field(\%ON_PROCESS,        $x[9], $x[12]);
            _merge_regsite_field(\%ON_PROT_INTERACT,  $x[9], $x[13]);
            _merge_regsite_field(\%ON_OTHER_INTERACT, $x[9], $x[14]);
            _merge_regsite_field(\%notes,             $x[9], $x[19]);
            _merge_regsite_field(\%organism,          $x[9], $x[6]);
        }
    }
}
close IN4;

# Merge one annotation value into $lut->{$key}.
#   $lut   - reference to the per-site hash to update
#   $key   - the SITE_+/-7_AA string
#   $value - the column value from the current row (may be undef or "")
# An undefined or empty value changes nothing; an empty entry is replaced; a
# value not yet contained in the entry is appended after " / ".  The
# containment test is a literal substring search, so annotation text holding
# regex metacharacters (parentheses, '+', ...) cannot break or skew it.
sub _merge_regsite_field {
    my ($lut, $key, $value) = @_;
    return if (!defined $value || $value eq "");
    if (!defined $lut->{$key} || $lut->{$key} eq "") {
        $lut->{$key} = $value;
    }
    elsif (index($lut->{$key}, $value) == -1) {
        $lut->{$key} = $lut->{$key} . " / " . $value;
    }
    return;
}

print "... Finished reading various site data at " . format_localtime_iso8601() ."\n\n";

$stmth = $dbh->prepare("
INSERT INTO Citation (
  ObjectName,
  CitationData
) VALUES (?,?)
");

# Insert one (table, citation text) pair into the Citation table using the
# statement prepared just above.
#   $cit_table - value stored in ObjectName
#   $cit_text  - value stored in CitationData
#   $cit_label - used only in the error message
# A failed insert is reported on STDOUT and otherwise ignored.
sub add_citation {
    my ($cit_table, $cit_text, $cit_label) = @_;
    $stmth->bind_param(1, $cit_table);
    $stmth->bind_param(2, $cit_text);
    if (not $stmth->execute()) {
        # "->errstr" is not interpolated inside a string; concatenate it
        print "Error writing $cit_label cit for table $cit_table: " . $stmth->errstr . "\n";
    }
}
my ($citation_text, $citation_table);

# PSP regulatory or kinase/substrate site
$citation_text = 'PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. 
It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words "PhosphoSitePlus(R), www.phosphosite.org" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926."';
# (the line above completes the $citation_text assignment begun just before)
# The third argument of add_citation() appears only in its error message; the
# labels below now name the data source that matches each table
# (PSP_Regulatory_site for the regulatory-site table, PSP_Kinase_Substrate for
# psp_gene_site and its view), as the second PSP group already did.
$citation_table = "PSP_Regulatory_site";
add_citation($citation_table, $citation_text, "PSP_Regulatory_site");
$citation_table = "psp_gene_site";
add_citation($citation_table, $citation_text, "PSP_Kinase_Substrate");
$citation_table = "psp_gene_site_view";
add_citation($citation_table, $citation_text, "PSP_Kinase_Substrate");
# NOTE(review): the title/year quoted here is that of the 2015 NAR paper
# (PMID 25514926, cited in the licence text above), while the PubMed id and
# DOI look like those of the earlier 2012 PhosphoSitePlus paper - confirm
# which reference is meant before changing the stored text.
$citation_text = 'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298, https://doi.org/10.1093/nar/gkr1122';
$citation_table = "PSP_Regulatory_site";
add_citation($citation_table, $citation_text, "PSP_Regulatory_site");
$citation_table = "psp_gene_site";
add_citation($citation_table, $citation_text, "PSP_Kinase_Substrate");
$citation_table = "psp_gene_site_view";
add_citation($citation_table, $citation_text, "PSP_Kinase_Substrate");

# NetworKIN site
$citation_text = 'Linding, 2007, "Systematic discovery of in vivo phosphorylation networks.", https://pubmed.ncbi.nlm.nih.gov/17570479, https://doi.org/10.1016/j.cell.2007.05.052';
$citation_table = "psp_gene_site";
add_citation($citation_table, $citation_text, "NetworKIN");
$citation_table = "psp_gene_site_view";
add_citation($citation_table, $citation_text, "NetworKIN");
# DOI completed: the stored value ended in "nmeth.296", one digit short of
# the article's DOI.
$citation_text = 'Horn, 2014, "KinomeXplorer: an integrated platform for kinome biology studies.", https://pubmed.ncbi.nlm.nih.gov/24874572, https://doi.org/10.1038/nmeth.2968';
$citation_table = "psp_gene_site";
add_citation($citation_table, $citation_text, "NetworKIN");
$citation_table = "psp_gene_site_view";
add_citation($citation_table, $citation_text, "NetworKIN");
# NOTE(review): verify that this PubMed id belongs to the 2016 Database
# article named by the DOI.
$citation_text = 'Aken, 2016, "The Ensembl gene annotation system.", https://pubmed.ncbi.nlm.nih.gov/33137190, https://doi.org/10.1093/database/baw093';
$citation_table = "psp_gene_site";
add_citation($citation_table, $citation_text, "NetworKIN");
$citation_table = "psp_gene_site_view";
add_citation($citation_table, $citation_text, "NetworKIN");

# pSTY motifs
$citation_text = 'Amanchy, 2007, "A curated compendium of phosphorylation motifs.", https://pubmed.ncbi.nlm.nih.gov/17344875, https://doi.org/10.1038/nbt0307-285';
$citation_table = "psp_gene_site";
add_citation($citation_table, $citation_text, "Amanchy_pSTY_motifs");
$citation_table = "psp_gene_site_view";
add_citation($citation_table, $citation_text, "Amanchy_pSTY_motifs");
$citation_text = 'Gnad, 2011, "PHOSIDA 2011: the posttranslational modification database.", https://pubmed.ncbi.nlm.nih.gov/21081558, https://doi.org/10.1093/nar/gkq1159';
$citation_table = "psp_gene_site";
add_citation($citation_table, $citation_text, "Phosida_pSTY_motifs");
$citation_table = "psp_gene_site_view";
add_citation($citation_table, $citation_text, "Phosida_pSTY_motifs");


###############################################################################
#
# Read the data file:
# 1) find sequences that match the NetworKIN predictions
# 2) find motifs that match the observed sequences
#
###############################################################################

print "--- Find sequences that match the NetworKIN predictions and find motifs that match observed sequences\n";

# statement used to record (ppep_id, +/-7 AA site) pairs
my $ppep_regsite_LUT_stmth;
$ppep_regsite_LUT_stmth = $dbh->prepare("
  INSERT INTO 
ppep_regsite_LUT ( - ppep_id, - site_plusminus_7AA - ) VALUES (?,?) -"); - -my ($start_seconds, $start_microseconds) = gettimeofday; - -foreach my $peptide (keys %data) { - # find the unique phospho-motifs for this $peptide - my @all_motifs = (); - my $have_all_motifs = 0; - for my $i (0 .. $#{ $matched_sequences{$peptide} } ) { - my $tmp_motif = $p_motifs{$peptide}[$i]; - push(@all_motifs, $tmp_motif); - $have_all_motifs = 1; - } - if ($have_all_motifs == 1) { - for my $j (0 .. $#all_motifs) { - if (defined $all_motifs[$j]) { - $all_motifs[$j] =~ s/\d+-\[\s//; - $all_motifs[$j] =~ s/\s\]\-\d+//; - } - } - } - my %seen = (); - if ($have_all_motifs == 1) { - foreach my $a (@all_motifs) { - if (defined $a) { - if (exists($seen{$a})) { - next; - } else { - push(@{$unique_motifs{$peptide}}, $a); - $seen{$a} = 1; - } - } - print "push(\@{\$unique_motifs{$peptide}}, $a);\n" if ($verbose); - } - } - - # count the number of phospo-sites in the motif - my $number_pY = 0; - my $number_pSTY = 0; - if ($phospho_type eq 'y') { - if (defined(${$unique_motifs{$peptide}}[0])) { - while (${$unique_motifs{$peptide}}[0] =~ /pY/g) { - $number_pY++; - } - } - } - if ($phospho_type eq 'sty') { - print "looking for unique_motifs for $peptide\n" if ($verbose); - if (defined(${$unique_motifs{$peptide}}[0])) { - while (${$unique_motifs{$peptide}}[0] =~ /(pS|pT|pY)/g) { - $number_pSTY++; - print "We have found $number_pSTY unique_motifs for $peptide\n" if ($verbose); - } - } - } - - - # search each of the unique motifs for matches - print "searching $#{$unique_motifs{$peptide}} motifs for peptide $peptide\n" if ($verbose); - for my $i (0 .. $#{$unique_motifs{$peptide}}) { - print "\$i = $i; peptide = $peptide; unique_motif = ${$unique_motifs{$peptide}}[$i]\n" if ($verbose); - my $tmp_motif = ${$unique_motifs{$peptide}}[$i]; - print " --- matching unique motif $tmp_motif for peptide $peptide at " . 
format_localtime_iso8601() ."\n" if ($verbose); - my $formatted_sequence; - if (($number_pY == 1) || ($number_pSTY == 1)) { - my $seq_plus5aa = ""; - my $seq_plus7aa = ""; - $formatted_sequence = &replace_pSpTpY($tmp_motif, $phospho_type); - print " a #pY $number_pY; #pSTY $number_pSTY; matching formatted motif $formatted_sequence for peptide $peptide at " . format_localtime_iso8601() ."\n" if ($verbose); - if ($phospho_type eq 'y') { - $seq_plus5aa = (split(/(\w{0,5}y\w{0,5})/, $formatted_sequence))[1]; - $seq_plus7aa = (split(/(\w{0,7}y\w{0,7})/, $formatted_sequence))[1]; - } - elsif ($phospho_type eq "sty") { - $seq_plus5aa = (split(/(\w{0,5}(s|t|y)\w{0,5})/, $formatted_sequence))[1]; - $seq_plus7aa = (split(/(\w{0,7}(s|t|y)\w{0,7})/, $formatted_sequence))[1]; - } - - if (defined $seq_plus7aa) { - # commit the 7aa LUT records - $ppep_regsite_LUT_stmth->bind_param( 1, $ppep_id_lut{$peptide} ); - $ppep_regsite_LUT_stmth->bind_param( 2, $seq_plus7aa ); - if (not $ppep_regsite_LUT_stmth->execute()) { - print "Error writing tuple ($ppep_id_lut{$peptide},$seq_plus7aa) for peptide $peptide to ppep_regsite_LUT: $ppep_regsite_LUT_stmth->errstr\n"; - } - } - for my $i (0 .. $#kinases_observed) { - if (defined $seq_plus5aa) { - my $tmp = $seq_plus5aa."_".$kinases_observed[$i]; #eg, should be PGRPLsSYGMD_PKCalpha - if (exists($p_sequence_kinase -> {$tmp})) { - $kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$i]} = "X"; #ACE - } - } - } - for my $i (0 .. $#motif_sequence) { - if ($peptide =~ /$motif_sequence[$i]/) { - $kinase_motif_matches{$peptide}{$motif_sequence[$i]} = "X"; - } - } - for my $i (0 .. 
$#kinases_PhosphoSite) { - if (defined $seq_plus7aa) { - my $tmp = $seq_plus7aa."_".$kinases_PhosphoSite[$i]; #eg, should be RTPGRPLsSYGMDSR_PAK2 - if (exists($p_sequence_kinase_PhosphoSite -> {$tmp})) { - $kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]} = "X"; - } - } - } - if (exists($regulatory_sites_PhosphoSite_hash{$seq_plus7aa})) { - $seq_plus7aa_2{$peptide} = $seq_plus7aa; - $domain_2{$peptide} = $domain{$seq_plus7aa}; - $ON_FUNCTION_2{$peptide} = $ON_FUNCTION{$seq_plus7aa}; - $ON_PROCESS_2{$peptide} = $ON_PROCESS{$seq_plus7aa}; - $ON_PROT_INTERACT_2{$peptide} = $ON_PROT_INTERACT{$seq_plus7aa}; - $ON_OTHER_INTERACT_2{$peptide} = $ON_OTHER_INTERACT{$seq_plus7aa}; - $notes_2{$peptide} = $notes{$seq_plus7aa}; - $organism_2{$peptide} = $organism{$seq_plus7aa}; - } else { - } - } - elsif (($number_pY > 1) || ($number_pSTY > 1)) { #eg, if $x[4] is 1308-[ VIYFQAIEEVpYpYDHLRSAAKKR ]-1329 and $number_pY == 2 - $formatted_sequence = $tmp_motif; - $seq_plus5aa = ""; - $seq_plus7aa = ""; - #Create the sequences with only one phosphorylation site - #eg, 1308-[ VIYFQAIEEVpYpYDHLRSAAKKR ]-1329, which becomes 1308-[ VIYFQAIEEVpYYDHLRSAAKKR ]-1329 and 1308-[ VIYFQAIEEVYpYDHLRSAAKKR ]-1329 - - my (@sites, $offset, $next_p_site); - $sites[0] = index($tmp_motif, "p"); - $offset = $sites[0] + 1; - $next_p_site = 0; - while ($next_p_site != -1) { - $next_p_site = index($tmp_motif, "p", $offset); - if ($next_p_site != -1) { - push (@sites, $next_p_site); - } - $offset = $next_p_site+1; - } - - my @pSTY_sequences; - for my $n (0 .. $#sites) { - $pSTY_sequences[$n] = $tmp_motif; - for (my $m = $#sites; $m >= 0; $m--) { - if ($m != $n) {substr($pSTY_sequences[$n], $sites[$m], 1) = "";} - } - } - - my @formatted_sequences; - for my $k (0 .. $#sites) { - $formatted_sequences[$k] = &replace_pSpTpY($pSTY_sequences[$k], $phospho_type); - } - - for my $k (0 .. 
$#formatted_sequences) { - print " b #pY $number_pY; #pSTY $number_pSTY; matching formatted motif $formatted_sequences[$k] for peptide $peptide at " . format_localtime_iso8601() ."\n" if ($verbose); - if ($phospho_type eq 'y') { - $seq_plus5aa = (split(/(\w{0,5}y\w{0,5})/, $formatted_sequences[$k]))[1]; - $seq_plus7aa = (split(/(\w{0,7}y\w{0,7})/, $formatted_sequences[$k]))[1]; - } - elsif ($phospho_type eq "sty") { - $seq_plus5aa = (split(/(\w{0,5}(s|t|y)\w{0,5})/, $formatted_sequences[$k]))[1]; - $seq_plus7aa = (split(/(\w{0,7}(s|t|y)\w{0,7})/, $formatted_sequences[$k]))[1]; - } - for my $i (0 .. $#kinases_observed) { - my $tmp = $seq_plus5aa."_".$kinases_observed[$i]; #eg, should look like REEILsEMKKV_PKCalpha - if (exists($p_sequence_kinase -> {$tmp})) { - $kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$i]} = "X"; - } - } - $pSTY_sequence = $formatted_sequences[$k]; - for my $i (0 .. $#motif_sequence) { - if ($pSTY_sequence =~ /$motif_sequence[$i]/) { - $kinase_motif_matches{$peptide}{$motif_sequence[$i]} = "X"; - } - } - for my $i (0 .. $#kinases_PhosphoSite) { - my $tmp = $seq_plus7aa."_".$kinases_PhosphoSite[$i]; #eg, should be RTPGRPLsSYGMDSR_PAK2 - #print "seq_plus7aa._.kinases_PhosphoSite[i] is $tmp"; - if (exists($p_sequence_kinase_PhosphoSite -> {$tmp})) { - $kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]} = "X"; - } - } - if (exists($regulatory_sites_PhosphoSite -> {$seq_plus7aa})) { - $seq_plus7aa_2{$peptide} = $seq_plus7aa; - - # $domain - if ($domain_2{$peptide} eq "") { - $domain_2{$peptide} = $domain{$seq_plus7aa}; - } - elsif ($domain{$seq_plus7aa} eq "") { - # do nothing - } - else { - $domain_2{$peptide} = $domain_2{$peptide}." 
/ ".$domain{$seq_plus7aa}; - } - - - # $ON_FUNCTION_2 - if ($ON_FUNCTION_2{$peptide} eq "") { - $ON_FUNCTION_2{$peptide} = $ON_FUNCTION{$seq_plus7aa}; - } - elsif ($ON_FUNCTION{$seq_plus7aa} eq "") { - # do nothing - } - else { - $ON_FUNCTION_2{$peptide} = $ON_FUNCTION_2{$peptide}." / ".$ON_FUNCTION{$seq_plus7aa}; - } - - # $ON_PROCESS_2 - if ($ON_PROCESS_2{$peptide} eq "") { - $ON_PROCESS_2{$peptide} = $ON_PROCESS{$seq_plus7aa}; - } - elsif ($ON_PROCESS{$seq_plus7aa} eq "") { - # do nothing - } - else { - $ON_PROCESS_2{$peptide} = $ON_PROCESS_2{$peptide}." / ".$ON_PROCESS{$seq_plus7aa}; - } - - # $ON_PROT_INTERACT_2 - if ($ON_PROT_INTERACT_2{$peptide} eq "") { - $ON_PROT_INTERACT_2{$peptide} = $ON_PROT_INTERACT{$seq_plus7aa}; - } - elsif ($ON_PROT_INTERACT{$seq_plus7aa} eq "") { - # do nothing - } - else { - $ON_PROT_INTERACT_2{$peptide} = $ON_PROT_INTERACT_2{$peptide}." / ".$ON_PROT_INTERACT{$seq_plus7aa}; - } - - # $ON_OTHER_INTERACT_2 - if ($ON_OTHER_INTERACT_2{$peptide} eq "") { - $ON_OTHER_INTERACT_2{$peptide} = $ON_OTHER_INTERACT{$seq_plus7aa}; - } - elsif ($ON_OTHER_INTERACT{$seq_plus7aa} eq "") { - # do nothing - } - else { - $ON_OTHER_INTERACT_2{$peptide} = $ON_OTHER_INTERACT_2{$peptide}." / ".$ON_OTHER_INTERACT{$seq_plus7aa}; - } - - # $notes_2 - if ($notes_2{$peptide} eq "") { - $notes_2{$peptide} = $notes{$seq_plus7aa}; - } - elsif ($notes{$seq_plus7aa} eq "") { - # do nothing - } - else { - $notes_2{$peptide} = $notes_2{$peptide}." / ".$notes{$seq_plus7aa}; - } - $notes_2{$peptide} = $notes{$seq_plus7aa}; - - # $organism_2 - if ($organism_2{$peptide} eq "") { - $organism_2{$peptide} = $organism{$seq_plus7aa}; - } - elsif ($organism{$seq_plus7aa} eq "") { - # do nothing - } - else { - $organism_2{$peptide} = $organism_2{$peptide}." / ".$organism{$seq_plus7aa}; - } - $organism_2{$peptide} = $organism{$seq_plus7aa}; - } else { - } # if (exists($regulatory_sites_PhosphoSite -> {$seq_plus7aa})) - } # for my $k (0 .. 
$#formatted_sequences) - } # if/else number of phosphosites - } # for each motif i # for my $i (0 .. $#{$unique_motifs{$peptide}}) -} # for each $peptide - -my ($end_seconds, $end_microseconds) = gettimeofday; - -my $delta_seconds = $end_seconds - $start_seconds; -my $delta_microseconds = $end_microseconds - $start_microseconds; -$delta_microseconds += 1000000 * $delta_seconds; -my $key_count = keys(%data); -print sprintf("Average search time is %d microseconds per phopshopeptide\n", ($delta_microseconds / $key_count)); - -($start_seconds, $start_microseconds) = gettimeofday; - -print "Writing PSP_Regulatory_site records\n"; - -my $psp_regulatory_site_stmth = $dbh->prepare(" - INSERT INTO PSP_Regulatory_site ( - DOMAIN, - ON_FUNCTION, - ON_PROCESS, - ON_PROT_INTERACT, - ON_OTHER_INTERACT, - NOTES, - SITE_PLUSMINUS_7AA, - ORGANISM - ) VALUES (?,?,?,?,?,?,?,?) - "); - -foreach my $peptide (keys %data) { - if (exists($domain_2{$peptide}) and (defined $domain_2{$peptide}) and (not $domain_2{$peptide} eq "") ) { - $psp_regulatory_site_stmth->bind_param(1, $domain_2{$peptide}); - $psp_regulatory_site_stmth->bind_param(2, $ON_FUNCTION_2{$peptide}); - $psp_regulatory_site_stmth->bind_param(3, $ON_PROCESS_2{$peptide}); - $psp_regulatory_site_stmth->bind_param(4, $ON_PROT_INTERACT_2{$peptide}); - $psp_regulatory_site_stmth->bind_param(5, $ON_OTHER_INTERACT_2{$peptide}); - $psp_regulatory_site_stmth->bind_param(6, $notes_2{$peptide}); - $psp_regulatory_site_stmth->bind_param(7, $seq_plus7aa_2{$peptide}); - $psp_regulatory_site_stmth->bind_param(8, $organism_2{$peptide}); - if (not $psp_regulatory_site_stmth->execute()) { - print "Error writing PSP_Regulatory_site for one regulatory site with peptide '$domain_2{$peptide}': $psp_regulatory_site_stmth->errstr\n"; - } else { - } - } elsif (exists($domain_2{$peptide}) and (not defined $domain_2{$peptide})) { - print "\$domain_2{$peptide} is undefined\n"; #ACE - } -} - -$dbh->{AutoCommit} = $auto_commit; -# auto_commit implicitly 
finishes psp_regulatory_site_stmth, apparently # $psp_regulatory_site_stmth->finish; -$dbh->disconnect if ( defined $dbh ); - - -($end_seconds, $end_microseconds) = gettimeofday; - -$delta_seconds = $end_seconds - $start_seconds; -$delta_microseconds = $end_microseconds - $start_microseconds; -$delta_microseconds += 1000000 * $delta_seconds; -$key_count = keys(%data); -print sprintf("Write time is %d microseconds\n", ($delta_microseconds)); - -print "... Finished find sequences that match the NetworKIN predictions and find motifs that match observed sequences at " . format_localtime_iso8601() ."\n\n"; - -############################################################################################################################### -# -# Print to the output file -# -############################################################################################################################### -open (OUT, ">$file_out") || die "could not open the fileout: $file_out"; -open (MELT, ">$file_melt") || die "could not open the fileout: $file_melt"; - -# print the header info -print MELT "phospho_peptide\tgene_names\tsite_type\tkinase_map\n"; -print OUT "p-peptide\tProtein description\tGene name(s)\tFASTA name\tPhospho-sites\tUnique phospho-motifs, no residue numbers\tAccessions\tPhospho-motifs for all members of protein group with residue numbers\t"; - -# print the PhosphoSite regulatory data -print OUT "Domain\tON_FUNCTION\tON_PROCESS\tON_PROT_INTERACT\tON_OTHER_INTERACT\tPhosphoSite notes\t"; - -# print the sample names -for my $i (0 .. $#samples) { print OUT "$samples[$i]\t"; } - -# print the kinases and groups -for my $i (0 .. $#kinases_observed) { - my $temp = $kinases_observed[$i]."_NetworKIN"; - print OUT "$temp\t"; - push(@kinases_observed_lbl, $temp); -} -for my $i (0 .. $#motif_sequence) { - print OUT "$motif_type{$motif_sequence[$i]} ($motif_sequence[$i])\t"; -} -for my $i (0 .. 
$#kinases_PhosphoSite) { - my $temp = $kinases_PhosphoSite[$i]."_PhosphoSite"; - if ($i < $#kinases_PhosphoSite) { print OUT "$temp\t"; } - if ($i == $#kinases_PhosphoSite) { print OUT "$temp\n"; } - push(@phosphosites_observed_lbl, $temp); -} - -# begin DDL-to-SQLite -# --- -$dbh = DBI->connect("dbi:SQLite:$db_out", undef, undef); -$auto_commit = $dbh->{AutoCommit}; -$dbh->{AutoCommit} = 0; -print "DB connection $dbh is to $db_out, opened for modification\n"; - -my $sample_stmth; -$sample_stmth = $dbh->prepare(" - INSERT INTO sample ( - id, - name - ) VALUES (?,?) -"); - -my $ppep_intensity_stmth; -$ppep_intensity_stmth = $dbh->prepare(" - INSERT INTO ppep_intensity ( - ppep_id, - sample_id, - intensity - ) VALUES (?,?,?) -"); - -my $site_type_stmth; -$site_type_stmth = $dbh->prepare(" - insert into site_type ( - id, - type_name - ) values (?,?) -"); - -my $ppep_gene_site_stmth; -$ppep_gene_site_stmth = $dbh->prepare(" - insert into ppep_gene_site ( - ppep_id, - gene_names, - kinase_map, - site_type_id - ) values (?,?,?,?) -"); - -my $ppep_metadata_stmth; -$ppep_metadata_stmth = $dbh->prepare(" - INSERT INTO ppep_metadata - ( ppep_id - , protein_description - , gene_name - , FASTA_name - , phospho_sites - , motifs_unique - , accessions - , motifs_all_members - , domain - , ON_FUNCTION - , ON_PROCESS - , ON_PROT_INTERACT - , ON_OTHER_INTERACT - , notes - ) VALUES ( - ?,?,?,?,?,?,? - , ?,?,?,?,?,?,? - ) -"); -# end DDL-to-SQLite -# ... - -# begin store-to-SQLite "sample" table -# --- -# %sample_id_lut maps name -> ID -for my $sample_name (keys %sample_id_lut) { - $sample_stmth->bind_param( 2, $sample_name ); - $sample_stmth->bind_param( 1, $sample_id_lut{$sample_name} ); - if (not $sample_stmth->execute()) { - print "Error writing tuple ($sample_name,$sample_id_lut{$sample_name}): $sample_stmth->errstr\n"; - } -} -# end store-to-SQLite "sample" table -# ... 
- -# begin store-to-SQLite "site_type" table -# --- -sub add_site_type { - my ($site_type_id, $site_type_type_name) = @_; - $site_type_stmth->bind_param( 2, $site_type_type_name ); - $site_type_stmth->bind_param( 1, $site_type_id ); - if (not $site_type_stmth->execute()) { - die "Error writing tuple ($site_type_id,$site_type_type_name): $site_type_stmth->errstr\n"; - } -} -add_site_type($SITE_KINASE_SUBSTRATE, $site_description{$SITE_KINASE_SUBSTRATE}); -add_site_type($SITE_MOTIF, $site_description{$SITE_MOTIF}); -add_site_type($SITE_PHOSPHOSITE, $site_description{$SITE_PHOSPHOSITE}); -# end store-to-SQLite "site_type" table -# ... - -foreach my $peptide (sort(keys %data)) { - next if (grep($peptide, @failed_matches)); - my $ppep_id = $ppep_id_lut{$peptide}; - my @ppep_metadata = (); - my @ppep_intensity = (); - my @gene = (); - my $gene_names; - my $j; - # Print the peptide itself - # column 1: p-peptide - print OUT "$peptide\t"; - push (@ppep_metadata, $ppep_id); - push (@ppep_intensity, $peptide); - - my $verbose_cond = 0; # $peptide eq 'AAAAAAAGDpSDpSWDADAFSVEDPVR' || $peptide eq 'KKGGpSpSDEGPEPEAEEpSDLDSGSVHSASGRPDGPVR'; - # skip over failed matches - print "\nfirst match for '$peptide' is '$matched_sequences{$peptide}[0]' and FAILED_MATCH_SEQ is '$FAILED_MATCH_SEQ'\n" if $verbose_cond; - if ($matched_sequences{$peptide}[0] eq $FAILED_MATCH_SEQ) { - # column 2: Protein description - # column 3: Gene name(s) - # column 4: FASTA name - # column 5: phospho-residues - # Column 6: UNIQUE phospho-motifs - # Column 7: accessions - # Column 8: ALL motifs with residue numbers - # 2 3 4 5 6 7 8 - print OUT "Sequence not found in FASTA database\tNA\tNA\tNA\tNA\tNA\tNA\t"; - print "No match found for '$peptide' in sequence database\n"; - $gene_names = '$FAILED_MATCH_GENE_NAME'; - } else { - my @description = (); - my %seen = (); - # Print just the protein description - for $i (0 .. 
$#{$names{$peptide}}) { - my $long_name = $names{$peptide}[$i]; - my @naming_parts = split(/\sOS/, $long_name); - my @front_half = split(/\s/, $naming_parts[0]); - push(@description, join(" ", @front_half[1..($#front_half)])); - } - # column 2: Protein description - print OUT join(" /// ", @description), "\t"; - push (@ppep_metadata, join(" /// ", @description)); - - # Print just the gene name - for $i (0 .. $#{$names{$peptide}}) { - my $tmp_gene = $names{$peptide}[$i]; - $tmp_gene =~ s/^.*GN=//; - $tmp_gene =~ s/\s.*//; - if (!exists($seen{$tmp_gene})) { - push(@gene, $tmp_gene); - $seen{$tmp_gene} = $tmp_gene; - } - } - # column 3: Gene name(s) - $gene_names = join(" /// ", @gene); - print OUT $gene_names, "\t"; - push (@ppep_metadata, join(" /// ", @gene)); - - # column 4: FASTA name - print OUT join(" /// ", @{$names{$peptide}}), "\t"; - push (@ppep_metadata, join(" /// ", @{$names{$peptide}})); - - # column 5: phospho-residues - my $tmp_for_insert = ""; - my $foobar; - for my $i (0 .. $#{ $matched_sequences{$peptide} } ) { - print "match $i for '$peptide' is '$matched_sequences{$peptide}[$i]'\n" if $verbose_cond; - if ($i < $#{ $matched_sequences{$peptide} }) { - if (defined $p_residues{$peptide}{$i}) { - @tmp_p_residues = @{$p_residues{$peptide}{$i}}; - for $j (0 .. 
$#tmp_p_residues) { - if ($j < $#tmp_p_residues) { - my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1; # added 12.05.2012 for Justin's data - print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing, "; - $tmp_for_insert .= "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing, "; - } - elsif ($j == $#tmp_p_residues) { - my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1; # added 12.05.2012 for Justin's data - print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing /// "; - $tmp_for_insert .= "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing /// "; - } - } - } - } - elsif ($i == $#{ $matched_sequences{$peptide} }) { - my $there_were_sites = 0; - if (defined $p_residues{$peptide}{$i}) { - @tmp_p_residues = @{$p_residues{$peptide}{$i}}; - if ($#tmp_p_residues > 0) { - for my $j (0 .. $#tmp_p_residues) { - if ($j < $#tmp_p_residues) { - if (defined $p_residues{$peptide}{$i}[$j]) { - my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1; # added 12.05.2012 for Justin's data - $foobar = $residues{$peptide}{$i}[$j]; - if (defined $foobar) { - print OUT "$foobar"; - print OUT "$tmp_site_for_printing, "; - $tmp_for_insert .= "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing, "; - $there_were_sites = 1; - } - } - } - elsif ($j == $#tmp_p_residues) { - if (defined $p_residues{$peptide}{$i}[$j]) { - $foobar = $residues{$peptide}{$i}[$j]; - if (defined $foobar) { - my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1; # added 12.05.2012 for Justin's data - print OUT "$foobar"; - print OUT "$tmp_site_for_printing\t"; - $tmp_for_insert .= "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing"; - $there_were_sites = 1; - } - } - } - } - } - } - if (0 == $there_were_sites) { - print OUT "\t"; - } - } - } - print "tmp_for_insert '$tmp_for_insert' for '$peptide'\n" if $verbose_cond; - push (@ppep_metadata, $tmp_for_insert); - - # Column 6: UNIQUE phospho-motifs - print OUT join(" /// ", @{$unique_motifs{$peptide}}), "\t"; - push 
(@ppep_metadata, join(" /// ", @{$unique_motifs{$peptide}})); - - # Column 7: accessions - if (defined $accessions{$peptide}) { - print OUT join(" /// ", @{$accessions{$peptide}}), "\t"; - push (@ppep_metadata, join(" /// ", @{$accessions{$peptide}})); - } else { - print OUT "\t"; - push (@ppep_metadata, ""); - } - - # Column 8: ALL motifs with residue numbers - if (defined $p_motifs{$peptide}) { - print OUT join(" /// ", @{$p_motifs{$peptide}}), "\t"; - push (@ppep_metadata, join(" /// ", @{$p_motifs{$peptide}})); - } else { - print OUT "\t"; - push (@ppep_metadata, ""); - } - - } - - # Print the PhosphoSite regulatory data - - if (defined $domain_2{$peptide}) { print OUT "$domain_2{$peptide}\t"; } else { print OUT "\t"; } - if (defined $ON_FUNCTION_2{$peptide}) { print OUT "$ON_FUNCTION_2{$peptide}\t"; } else { print OUT "\t"; } - if (defined $ON_PROCESS_2{$peptide}) { print OUT "$ON_PROCESS_2{$peptide}\t"; } else { print OUT "\t"; } - if (defined $ON_PROT_INTERACT_2{$peptide}) { print OUT "$ON_PROT_INTERACT_2{$peptide}\t"; } else { print OUT "\t"; } - if (defined $ON_OTHER_INTERACT_2{$peptide}) { print OUT "$ON_OTHER_INTERACT_2{$peptide}\t"; } else { print OUT "\t"; } - if (defined $notes_2{$peptide}) { print OUT "$notes_2{$peptide}\t"; } else { print OUT "\t"; } - - if (defined $domain_2{$peptide}) { push (@ppep_metadata, $domain_2{$peptide}); } else { push(@ppep_metadata, ""); } - if (defined $ON_FUNCTION_2{$peptide}) { push (@ppep_metadata, $ON_FUNCTION_2{$peptide}); } else { push(@ppep_metadata, ""); } - if (defined $ON_PROCESS_2{$peptide}) { push (@ppep_metadata, $ON_PROCESS_2{$peptide}); } else { push(@ppep_metadata, ""); } - if (defined $ON_PROT_INTERACT_2{$peptide}) { push (@ppep_metadata, $ON_PROT_INTERACT_2{$peptide}); } else { push(@ppep_metadata, ""); } - if (defined $ON_OTHER_INTERACT_2{$peptide}) { push (@ppep_metadata, $ON_OTHER_INTERACT_2{$peptide}); } else { push(@ppep_metadata, ""); } - if (defined $notes_2{$peptide}) { push (@ppep_metadata, 
$notes_2{$peptide}); } else { push(@ppep_metadata, ""); } - - # begin store-to-SQLite "ppep_metadata" table - # --- - for $i (1..14) { - $ppep_metadata_stmth->bind_param($i, $ppep_metadata[$i-1]); - } - if (not $ppep_metadata_stmth->execute()) { - print "Error writing ppep_metadata row for phosphopeptide $ppep_metadata[$i]: $ppep_metadata_stmth->errstr\n"; - } - # ... - # end store-to-SQLite "ppep_metadata" table - - # Print the data - @tmp_data = (); - foreach (@{$data{$peptide}}) { - push(@tmp_data, $_); - } - print OUT join("\t", @tmp_data), "\t"; - - # begin store-to-SQLite "ppep_intensity" table - # --- - # commit the sample intensities - $i = 0; - foreach (@{$data{$peptide}}) { - my $intense = $_; - $ppep_intensity_stmth->bind_param( 1, $ppep_id ); - $ppep_intensity_stmth->bind_param( 2, $sample_id_lut{$samples[$i]} ); - $ppep_intensity_stmth->bind_param( 3, $intense ); - if (not $ppep_intensity_stmth->execute()) { - print "Error writing tuple ($peptide,$samples[$i],$intense): $ppep_intensity_stmth->errstr\n"; - } - $i += 1; - } - # ... - # end store-to-SQLite "ppep_intensity" table - - # print the kinase-substrate data - for my $i (0 .. 
$#kinases_observed) { - if (exists($kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$i]})) { - print OUT "X\t"; - my $NetworKIN_label = $kinases_observed[$i]."_NetworKIN"; - print MELT "$peptide\t$gene_names\t$site_description{$SITE_KINASE_SUBSTRATE}\t$NetworKIN_label\n"; - # begin store-to-SQLite "ppep_gene_site" table - # --- - $ppep_gene_site_stmth->bind_param(1, $ppep_id); # ppep_gene_site.ppep_id - $ppep_gene_site_stmth->bind_param(2, $gene_names); # ppep_gene_site.gene_names - $ppep_gene_site_stmth->bind_param(3, $NetworKIN_label); # ppep_gene_site.kinase_map - $ppep_gene_site_stmth->bind_param(4, $SITE_KINASE_SUBSTRATE); # ppep_gene_site.site_type_id - if (not $ppep_gene_site_stmth->execute()) { - print "Error writing tuple ($peptide,$gene_names,$kinases_observed[$i]): $ppep_gene_site_stmth->errstr\n"; - } - # ... - # end store-to-SQLite "ppep_gene_site" table - } - else { print OUT "\t";} - } - my %wrote_motif; - my $motif_parts_0; - for my $i (0 .. $#motif_sequence) { - if (exists($kinase_motif_matches{$peptide}{$motif_sequence[$i]})) { - print OUT "X\t"; - $motif_parts_0 = $motif_type{$motif_sequence[$i]}." 
".$motif_sequence[$i]; - my $key = "$peptide\t$gene_names\t$motif_parts_0"; - if (!exists($wrote_motif{$key})) { - $wrote_motif{$key} = $key; - print MELT "$peptide\t$gene_names\t$site_description{$SITE_MOTIF}\t$motif_parts_0\n"; - # print "Line 657: i is $i\t$kinase_motif_matches{$peptide}{$motif_sequence[$i]}\n"; #debug - # begin store-to-SQLite "ppep_gene_site" table - # --- - $ppep_gene_site_stmth->bind_param(1, $ppep_id); # ppep_gene_site.ppep_id - $ppep_gene_site_stmth->bind_param(2, $gene_names); # ppep_gene_site.gene_names - $ppep_gene_site_stmth->bind_param(3, $motif_parts_0); # ppep_gene_site.kinase_map - $ppep_gene_site_stmth->bind_param(4, $SITE_MOTIF); # ppep_gene_site.site_type_id - if (not $ppep_gene_site_stmth->execute()) { - print "Error writing tuple ($peptide,$gene_names,$motif_parts_0): $ppep_gene_site_stmth->errstr\n"; - } - # ... - # end store-to-SQLite "ppep_gene_site" table - } - } - else { print OUT "\t";} - } - for my $i (0 .. $#kinases_PhosphoSite) { - if (exists($kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]})) { - print MELT "$peptide\t$gene_names\t$site_description{$SITE_PHOSPHOSITE}\t$phosphosites_observed_lbl[$i]\n"; - if ($i < $#kinases_PhosphoSite) { - print OUT "X\t"; - } - else { - print OUT "X\n"; - } - # begin store-to-SQLite "ppep_gene_site" table - # --- - $ppep_gene_site_stmth->bind_param(1, $ppep_id); # ppep_gene_site.ppep_id - $ppep_gene_site_stmth->bind_param(2, $gene_names); # ppep_gene_site.gene_names - $ppep_gene_site_stmth->bind_param(3, $phosphosites_observed_lbl[$i]); # ppep_gene_site.kinase_map - $ppep_gene_site_stmth->bind_param(4, $SITE_PHOSPHOSITE); # ppep_gene_site.site_type_id - if (not $ppep_gene_site_stmth->execute()) { - print "Error writing tuple ($peptide,$gene_names,$phosphosites_observed_lbl[$i]): $ppep_gene_site_stmth->errstr\n"; - } - # ... 
- # end store-to-SQLite "ppep_gene_site" table - } - else { - if ($i < $#kinases_PhosphoSite) { - print OUT "\t"; - } - elsif ($i == $#kinases_PhosphoSite) { - print OUT "\n"; - } - } - } -} - -close OUT; -close MELT; -$ppep_gene_site_stmth->finish; -print "begin DB commit at " . format_localtime_iso8601() . "\n"; -$dbh->{AutoCommit} = $auto_commit; -$dbh->disconnect if ( defined $dbh ); - -print "\nFinished writing output at " . format_localtime_iso8601() ."\n\n"; - -###############################################################################################################################
--- a/macros.xml Tue Mar 15 12:44:40 2022 +0000 +++ b/macros.xml Tue Mar 15 18:17:55 2022 +0000 @@ -1,5 +1,5 @@ <macros> - <token name="@TOOL_VERSION@">0.1.2</token> + <token name="@TOOL_VERSION@">0.1.3</token> <token name="@VERSION_SUFFIX@">0</token> <xml name="requirements"> <requirements>
--- a/mqppep_anova.xml Tue Mar 15 12:44:40 2022 +0000 +++ b/mqppep_anova.xml Tue Mar 15 18:17:55 2022 +0000 @@ -78,7 +78,7 @@ /> </when> </conditional> - <param name="sample_names_regex" type="text" value="\.(\d+)[A-Z]$" + <param name="sample_names_regex" type="text" value="\.\d+[A-Z]$" help="[sample_names_regex] PERL-compatible regular expression extracting sample-names from the the name of a spectrum file (without extension)" label="Sample-extraction regex"> <sanitizer> @@ -87,7 +87,7 @@ </valid> </sanitizer> </param> - <param name="sample_grouping_regex" type="text" value="(\d+)" + <param name="sample_grouping_regex" type="text" value="\d+" help="[sample_grouping_regex] PERL-compatible regular expression extracting sample-group from each sample-name (i.e., extracted by previous regex pattern)" label="Group-extraction regex"> <sanitizer>
--- a/mqppep_anova_script.Rmd Tue Mar 15 12:44:40 2022 +0000 +++ b/mqppep_anova_script.Rmd Tue Mar 15 18:17:55 2022 +0000 @@ -11,8 +11,8 @@ imputationMethod: !r c("group-median", "median", "mean", "random")[1] meanPercentile: 1 sdPercentile: 0.2 - regexSampleNames: "\\.(\\d+)[A-Z]$" - regexSampleGrouping: "(\\d+)" + regexSampleNames: "\\.\\d+[A-Z]$" + regexSampleGrouping: "\\d+" imputedDataFilename: "Upstream_Map_pST_outputfile_STEP4_QN_LT.txt" --- ```{r setup, include = FALSE} @@ -570,23 +570,62 @@ sample_factor_levels <- as.factor(regmatches(temp_matches, m2)) - if (length(levels(sample_factor_levels)) < 2) { + nuke_control_sequences <- + function(s) { + s <- gsub("[\\]", "xyzzy_plugh", s) + s <- gsub("[$]", "\\\\$", s) + s <- gsub("xyzzy_plugh", "$\\\\backslash$", s) + return(s) + } cat( "ERROR!!!! Cannot perform ANOVA analysis", - "because it requires two or more factor levels\n" + "(see next page)\\newpage\n" + ) + cat( + "ERROR: ANOVA analysis", + "requires two or more factor levels!\\newline\n" ) - cat("Unparsed sample names are:\n") - print(names(quant_data_imp_qn_log)) - cat(sprintf("Parsing rule for SampleNames is '%s'\n", regex_sample_names)) - cat("Parsed names are:\n") - print(temp_matches) - cat(sprintf( - "Parsing rule for SampleGrouping is '%s'\n", - regex_sample_grouping - )) - cat("Sample group assignments are:\n") - print(regmatches(temp_matches, m2)) + + cat("\\newline\\newline\n") + cat("Unparsed sample names are:\\newline\n", + "\\begin{quote}\n", + paste(names(quant_data_imp_qn_log), collapse = "\\newline\n"), + "\n\\end{quote}\n\n") + + regex_sample_names <- nuke_control_sequences(regex_sample_names) + + cat("\\leavevmode\\newline\n") + cat("Parsing rule for SampleNames is", + "\\newline\n", + "\\text{'", + regex_sample_names, + "'}\\newline\n", + sep = "" + ) + + cat("\nParsed sample names are:\n", + "\\begin{quote}\n", + paste(temp_matches, collapse = "\\newline\n"), + "\n\\end{quote}\n\n") + + regex_sample_grouping <- 
nuke_control_sequences(regex_sample_grouping) + + cat("\\leavevmode\\newline\n") + cat("Parsing rule for SampleGrouping is", + "\\newline\n", + "\\text{'", + regex_sample_grouping, + "'}\\newline\n", + sep = "" + ) + + cat("\\newline\n") + cat("Sample group assignments are:\n", + "\\begin{quote}\n", + paste(regmatches(temp_matches, m2), collapse = "\\newline\n"), + "\n\\end{quote}\n\n") + } else { p_value_data_anova_ps <- apply( @@ -707,8 +746,6 @@ } ) - - anova_filtered <- data.table( anova_filtered_merge$Phosphopeptide , @@ -719,7 +756,7 @@ colnames(anova_filtered) <- c("Phosphopeptide", colnames(filtered_data_filtered)) - # merge qualitative columns into the ANOVA data + # Merge qualitative columns into the ANOVA data output_table <- data.frame(anova_filtered$Phosphopeptide) output_table <- merge( x = output_table @@ -731,9 +768,16 @@ by.y = "Phosphopeptide" ) - #Produce heatmap to visualize significance and the effect of imputation + # Produce heatmap to visualize significance and the effect of imputation m <- as.matrix(unimputed_quant_data_log[anova_filtered_merge_order, ]) + m_nan_rows <- rowSums( + matrix( + as.integer(is.na(m)), + nrow = nrow(m) + ) + ) + m <- m[!m_nan_rows, ] if (nrow(m) > 0) { rownames_m <- rownames(m) rownames(m) <- sapply( @@ -741,53 +785,53 @@ , FUN = function(i) { sprintf( - anova_filtered_merge_format[i] - , - filtered_p$fdr_adjusted_anova_p[i] - , + anova_filtered_merge_format[i], + filtered_p$fdr_adjusted_anova_p[i], rownames_m[i] ) } ) - margins <- c(max(nchar(colnames(m))) * 10 / 16 # col - , max(nchar(rownames(m))) * 5 / 16 # row - ) - how_many_peptides <- min(50, nrow(m)) + margins <- + c(max(nchar(colnames(m))) * 10 / 16 # col + , max(nchar(rownames(m))) * 5 / 16 # row + ) + how_many_peptides <- min(50, nrow(m)) - cat("\\newpage\n") - if (nrow(m) > 50) { - cat("Heatmap for the 50 most-significant peptides", - sprintf( - "whose adjusted p-value < %0.2f\n", - cutoff) - ) - } else { - cat("Heatmap for peptides whose", - 
sprintf("adjusted p-value < %0.2f\n", - cutoff) - ) - } - cat("\\newline\n") - cat("\\newline\n") - op <- par("cex.main") - try( - if (nrow(m) > 1) { - par(cex.main = 0.6) - heatmap( - m[how_many_peptides:1, ], - Rowv = NA, - Colv = NA, - cexRow = 0.7, - cexCol = 0.8, - scale = "row", - margins = margins, - main = - "Heatmap of unimputed, unnormalized intensities", - xlab = "" - ) - } - ) - par(op) + cat("\\newpage\n") + if (nrow(m) > 50) { + cat("Heatmap for the 50 most-significant peptides", + sprintf( + "whose adjusted p-value < %0.2f\n", + cutoff) + ) + } else { + cat("Heatmap for peptides whose", + sprintf("adjusted p-value < %0.2f\n", + cutoff) + ) + } + cat("\\newline\n") + cat("\\newline\n") + op <- par("cex.main") + try( + if (nrow(m) > 1) { + par(cex.main = 0.6) + heatmap( + m[how_many_peptides:1, ], + Rowv = NA, + Colv = NA, + cexRow = 0.7, + cexCol = 0.8, + scale = "row", + #ACE scale = "none", + margins = margins, + main = + "Heatmap of unimputed, unnormalized intensities", + xlab = "" + ) + } + ) + par(op) } } }
--- a/mqppep_mrgfltr.py Tue Mar 15 12:44:40 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1519 +0,0 @@ -#!/usr/bin/env python - -# Import the packages needed -import argparse -import operator # for operator.itemgetter -import os.path -import re -import shutil # for shutil.copyfile(src, dest) -import sqlite3 as sql -import sys # import the sys module for exc_info -import time -import traceback # for formatting stack-trace -from codecs import getreader as cx_getreader - -import numpy as np -import pandas - -# global constants -N_A = "N/A" - - -# ref: https://stackoverflow.com/a/8915613/15509512 -# answers: "How to handle exceptions in a list comprehensions" -# usage: -# from math import log -# eggs = [1,3,0,3,2] -# print([x for x in [catch(log, egg) for egg in eggs] if x is not None]) -# producing: -# for <built-in function log> -# with args (0,) -# exception: math domain error -# [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453] -def catch(func, *args, handle=lambda e: e, **kwargs): - - try: - return func(*args, **kwargs) - except Exception as e: - print("For %s" % str(func)) - print(" with args %s" % str(args)) - print(" caught exception: %s" % str(e)) - (ty, va, tb) = sys.exc_info() - print(" stack trace: " + str(traceback.format_exception(ty, va, tb))) - exit(-1) - return None - - -def whine(func, *args, handle=lambda e: e, **kwargs): - - try: - return func(*args, **kwargs) - except Exception as e: - print("Warning: For %s" % str(func)) - print(" with args %s" % str(args)) - print(" caught exception: %s" % str(e)) - (ty, va, tb) = sys.exc_info() - print(" stack trace: " + str(traceback.format_exception(ty, va, tb))) - return None - - -def ppep_join(x): - x = [i for i in x if N_A != i] - result = "%s" % " | ".join(x) - if result != "": - return result - else: - return N_A - - -def melt_join(x): - tmp = {key.lower(): key for key in x} - result = "%s" % " | ".join([tmp[key] for key in tmp]) - return result - - -def __main__(): - # 
Parse Command Line - parser = argparse.ArgumentParser( - description="Phopsphoproteomic Enrichment Pipeline Merge and Filter." - ) - - # inputs: - # Phosphopeptide data for experimental results, including the intensities - # and the mapping to kinase domains, in tabular format. - parser.add_argument( - "--phosphopeptides", - "-p", - nargs=1, - required=True, - dest="phosphopeptides", - help="Phosphopeptide data for experimental results, including the intensities and the mapping to kinase domains, in tabular format", - ) - # UniProtKB/SwissProt DB input, SQLite - parser.add_argument( - "--ppep_mapping_db", - "-d", - nargs=1, - required=True, - dest="ppep_mapping_db", - help="UniProtKB/SwissProt SQLite Database", - ) - # species to limit records chosed from PhosPhositesPlus - parser.add_argument( - "--species", - "-x", - nargs=1, - required=False, - default=[], - dest="species", - help="limit PhosphoSitePlus records to indicated species (field may be empty)", - ) - - # outputs: - # tabular output - parser.add_argument( - "--mrgfltr_tab", - "-o", - nargs=1, - required=True, - dest="mrgfltr_tab", - help="Tabular output file for results", - ) - # CSV output - parser.add_argument( - "--mrgfltr_csv", - "-c", - nargs=1, - required=True, - dest="mrgfltr_csv", - help="CSV output file for results", - ) - # SQLite output - parser.add_argument( - "--mrgfltr_sqlite", - "-S", - nargs=1, - required=True, - dest="mrgfltr_sqlite", - help="SQLite output file for results", - ) - - # "Make it so!" 
(parse the arguments) - options = parser.parse_args() - print("options: " + str(options)) - - # determine phosphopeptide ("upstream map") input tabular file access - if options.phosphopeptides is None: - exit('Argument "phosphopeptides" is required but not supplied') - try: - upstream_map_filename_tab = os.path.abspath(options.phosphopeptides[0]) - input_file = open(upstream_map_filename_tab, "r") - input_file.close() - except Exception as e: - exit("Error parsing phosphopeptides argument: %s" % str(e)) - - # determine input SQLite access - if options.ppep_mapping_db is None: - exit('Argument "ppep_mapping_db" is required but not supplied') - try: - uniprot_sqlite = os.path.abspath(options.ppep_mapping_db[0]) - input_file = open(uniprot_sqlite, "rb") - input_file.close() - except Exception as e: - exit("Error parsing ppep_mapping_db argument: %s" % str(e)) - - # copy input SQLite dataset to output SQLite dataset - if options.mrgfltr_sqlite is None: - exit('Argument "mrgfltr_sqlite" is required but not supplied') - try: - output_sqlite = os.path.abspath(options.mrgfltr_sqlite[0]) - shutil.copyfile(uniprot_sqlite, output_sqlite) - except Exception as e: - exit("Error copying ppep_mapping_db to mrgfltr_sqlite: %s" % str(e)) - - # determine species to limit records from PSP_Regulatory_Sites - if options.species is None: - exit( - 'Argument "species" is required (and may be empty) but not supplied' - ) - try: - if len(options.species) > 0: - species = options.species[0] - else: - species = "" - except Exception as e: - exit("Error parsing species argument: %s" % str(e)) - - # determine tabular output destination - if options.mrgfltr_tab is None: - exit('Argument "mrgfltr_tab" is required but not supplied') - try: - output_filename_tab = os.path.abspath(options.mrgfltr_tab[0]) - output_file = open(output_filename_tab, "w") - output_file.close() - except Exception as e: - exit("Error parsing mrgfltr_tab argument: %s" % str(e)) - - # determine CSV output destination - if 
options.mrgfltr_csv is None: - exit('Argument "mrgfltr_csv" is required but not supplied') - try: - output_filename_csv = os.path.abspath(options.mrgfltr_csv[0]) - output_file = open(output_filename_csv, "w") - output_file.close() - except Exception as e: - exit("Error parsing mrgfltr_csv argument: %s" % str(e)) - - def mqpep_getswissprot(): - - # - # copied from Excel Output Script.ipynb BEGIN # - # - - # String Constants ################# - DEPHOSPHOPEP = "DephosphoPep" - DESCRIPTION = "Description" - FUNCTION_PHOSPHORESIDUE = ( - "Function Phosphoresidue(PSP=PhosphoSitePlus.org)" - ) - GENE_NAME = "Gene_Name" # Gene Name from UniProtKB - ON_FUNCTION = ( - "ON_FUNCTION" # ON_FUNCTION column from PSP_Regulatory_Sites - ) - ON_NOTES = "NOTES" # NOTES column from PSP_Regulatory_Sites - ON_OTHER_INTERACT = "ON_OTHER_INTERACT" # ON_OTHER_INTERACT column from PSP_Regulatory_Sites - ON_PROCESS = ( - "ON_PROCESS" # ON_PROCESS column from PSP_Regulatory_Sites - ) - ON_PROT_INTERACT = "ON_PROT_INTERACT" # ON_PROT_INTERACT column from PSP_Regulatory_Sites - PHOSPHOPEPTIDE = "Phosphopeptide" - PHOSPHOPEPTIDE_MATCH = "Phosphopeptide_match" - PHOSPHORESIDUE = "Phosphoresidue" - PUTATIVE_UPSTREAM_DOMAINS = "Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/Phosphatases/Binding Domains" - SEQUENCE = "Sequence" - SEQUENCE10 = "Sequence10" - SEQUENCE7 = "Sequence7" - SITE_PLUSMINUS_7AA_SQL = "SITE_PLUSMINUS_7AA" - UNIPROT_ID = "UniProt_ID" - UNIPROT_SEQ_AND_META_SQL = """ - select Uniprot_ID, Description, Gene_Name, Sequence, - Organism_Name, Organism_ID, PE, SV - from UniProtKB - order by Sequence, UniProt_ID - """ - UNIPROT_UNIQUE_SEQ_SQL = """ - select distinct Sequence - from UniProtKB - group by Sequence - """ - PPEP_PEP_UNIPROTSEQ_SQL = """ - select distinct phosphopeptide, peptide, sequence - from uniprotkb_pep_ppep_view - order by sequence - """ - PPEP_MELT_SQL = """ - SELECT DISTINCT - phospho_peptide AS 'p_peptide', - kinase_map AS 'characterization', - 'X' AS 'X' - 
FROM ppep_gene_site_view - """ - # CREATE TABLE PSP_Regulatory_site ( - # site_plusminus_7AA TEXT PRIMARY KEY ON CONFLICT IGNORE, - # domain TEXT, - # ON_FUNCTION TEXT, - # ON_PROCESS TEXT, - # ON_PROT_INTERACT TEXT, - # ON_OTHER_INTERACT TEXT, - # notes TEXT, - # organism TEXT - # ); - PSP_REGSITE_SQL = """ - SELECT DISTINCT - SITE_PLUSMINUS_7AA , - DOMAIN , - ON_FUNCTION , - ON_PROCESS , - ON_PROT_INTERACT , - ON_OTHER_INTERACT , - NOTES , - ORGANISM - FROM PSP_Regulatory_site - """ - PPEP_ID_SQL = """ - SELECT - id AS 'ppep_id', - seq AS 'ppep_seq' - FROM ppep - """ - MRGFLTR_DDL = """ - DROP VIEW IF EXISTS mrgfltr_metadata_view; - DROP TABLE IF EXISTS mrgfltr_metadata; - CREATE TABLE mrgfltr_metadata - ( ppep_id INTEGER REFERENCES ppep(id) - , Sequence10 TEXT - , Sequence7 TEXT - , GeneName TEXT - , Phosphoresidue TEXT - , UniProtID TEXT - , Description TEXT - , FunctionPhosphoresidue TEXT - , PutativeUpstreamDomains TEXT - , PRIMARY KEY (ppep_id) ON CONFLICT IGNORE - ) - ; - CREATE VIEW mrgfltr_metadata_view AS - SELECT DISTINCT - ppep.seq AS phospho_peptide - , Sequence10 - , Sequence7 - , GeneName - , Phosphoresidue - , UniProtID - , Description - , FunctionPhosphoresidue - , PutativeUpstreamDomains - FROM - ppep, mrgfltr_metadata - WHERE - mrgfltr_metadata.ppep_id = ppep.id - ORDER BY - ppep.seq - ; - """ - - CITATION_INSERT_STMT = """ - INSERT INTO Citation ( - ObjectName, - CitationData - ) VALUES (?,?) - """ - CITATION_INSERT_PSP = 'PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. 
When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words "PhosphoSitePlus(R), www.phosphosite.org" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926."' - CITATION_INSERT_PSP_REF = 'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298, https://doi.org/10.1093/nar/gkr1122' - - MRGFLTR_METADATA_COLUMNS = [ - "ppep_id", - "Sequence10", - "Sequence7", - "GeneName", - "Phosphoresidue", - "UniProtID", - "Description", - "FunctionPhosphoresidue", - "PutativeUpstreamDomains", - ] - - # String Constants (end) ############ - - class Error(Exception): - """Base class for exceptions in this module.""" - - pass - - class PreconditionError(Error): - """Exception raised for errors in the input. 
- - Attributes: - expression -- input expression in which the error occurred - message -- explanation of the error - """ - - def __init__(self, expression, message): - self.expression = expression - self.message = message - - # start_time = time.clock() #timer - start_time = time.process_time() # timer - - # get keys from upstream tabular file using readline() - # ref: https://stackoverflow.com/a/16713581/15509512 - # answer to "Use codecs to read file with correct encoding" - file1_encoded = open(upstream_map_filename_tab, "rb") - file1 = cx_getreader("latin-1")(file1_encoded) - - count = 0 - upstream_map_p_peptide_list = [] - re_tab = re.compile("^[^\t]*") - while True: - count += 1 - # Get next line from file - line = file1.readline() - # if line is empty - # end of file is reached - if not line: - break - if count > 1: - m = re_tab.match(line) - upstream_map_p_peptide_list.append(m[0]) - file1.close() - file1_encoded.close() - - # Get the list of phosphopeptides with the p's that represent the phosphorylation sites removed - re_phos = re.compile("p") - - end_time = time.process_time() # timer - print( - "%0.6f pre-read-SwissProt [0.1]" % (end_time - start_time,), - file=sys.stderr, - ) - - # ----------- Get SwissProt data from SQLite database (start) ----------- - # build UniProt sequence LUT and list of unique SwissProt sequences - - # Open SwissProt SQLite database - conn = sql.connect(uniprot_sqlite) - cur = conn.cursor() - - # Set up structures to hold SwissProt data - - uniprot_Sequence_List = [] - UniProtSeqLUT = {} - - # Execute query for unique seqs without fetching the results yet - uniprot_unique_seq_cur = cur.execute(UNIPROT_UNIQUE_SEQ_SQL) - - while 1: - batch = uniprot_unique_seq_cur.fetchmany(size=50) - if not batch: - # handle case where no records are returned - break - for row in batch: - Sequence = row[0] - UniProtSeqLUT[(Sequence, DESCRIPTION)] = [] - UniProtSeqLUT[(Sequence, GENE_NAME)] = [] - UniProtSeqLUT[(Sequence, UNIPROT_ID)] = [] - 
UniProtSeqLUT[Sequence] = [] - - # Execute query for seqs and metadata without fetching the results yet - uniprot_seq_and_meta = cur.execute(UNIPROT_SEQ_AND_META_SQL) - - while 1: - batch = uniprot_seq_and_meta.fetchmany(size=50) - if not batch: - # handle case where no records are returned - break - for ( - UniProt_ID, - Description, - Gene_Name, - Sequence, - OS, - OX, - PE, - SV, - ) in batch: - uniprot_Sequence_List.append(Sequence) - UniProtSeqLUT[Sequence] = Sequence - UniProtSeqLUT[(Sequence, UNIPROT_ID)].append(UniProt_ID) - UniProtSeqLUT[(Sequence, GENE_NAME)].append(Gene_Name) - if OS != N_A: - Description += " OS=" + OS - if OX != N_A: - Description += " OX=" + str(int(OX)) - if Gene_Name != N_A: - Description += " GN=" + Gene_Name - if PE != N_A: - Description += " PE=" + PE - if SV != N_A: - Description += " SV=" + SV - UniProtSeqLUT[(Sequence, DESCRIPTION)].append(Description) - - # Close SwissProt SQLite database; clean up local variables - conn.close() - Sequence = "" - UniProt_ID = "" - Description = "" - Gene_Name = "" - - # ----------- Get SwissProt data from SQLite database (finish) ----------- - - end_time = time.process_time() # timer - print( - "%0.6f post-read-SwissProt [0.2]" % (end_time - start_time,), - file=sys.stderr, - ) - - # ----------- Get SwissProt data from SQLite database (start) ----------- - # Open SwissProt SQLite database - conn = sql.connect(uniprot_sqlite) - cur = conn.cursor() - - # Set up dictionary to aggregate results for phosphopeptides correspounding to dephosphoeptide - DephosphoPep_UniProtSeq_LUT = {} - - # Set up dictionary to accumulate results - PhosphoPep_UniProtSeq_LUT = {} - - # Execute query for tuples without fetching the results yet - ppep_pep_uniprotseq_cur = cur.execute(PPEP_PEP_UNIPROTSEQ_SQL) - - while 1: - batch = ppep_pep_uniprotseq_cur.fetchmany(size=50) - if not batch: - # handle case where no records are returned - break - for (phospho_pep, dephospho_pep, sequence) in batch: - # do interesting 
stuff here... - PhosphoPep_UniProtSeq_LUT[phospho_pep] = phospho_pep - PhosphoPep_UniProtSeq_LUT[ - (phospho_pep, DEPHOSPHOPEP) - ] = dephospho_pep - if dephospho_pep not in DephosphoPep_UniProtSeq_LUT: - DephosphoPep_UniProtSeq_LUT[dephospho_pep] = set() - DephosphoPep_UniProtSeq_LUT[ - (dephospho_pep, DESCRIPTION) - ] = [] - DephosphoPep_UniProtSeq_LUT[ - (dephospho_pep, GENE_NAME) - ] = [] - DephosphoPep_UniProtSeq_LUT[ - (dephospho_pep, UNIPROT_ID) - ] = [] - DephosphoPep_UniProtSeq_LUT[(dephospho_pep, SEQUENCE)] = [] - DephosphoPep_UniProtSeq_LUT[dephospho_pep].add(phospho_pep) - - if ( - sequence - not in DephosphoPep_UniProtSeq_LUT[ - (dephospho_pep, SEQUENCE) - ] - ): - DephosphoPep_UniProtSeq_LUT[ - (dephospho_pep, SEQUENCE) - ].append(sequence) - for phospho_pep in DephosphoPep_UniProtSeq_LUT[dephospho_pep]: - if phospho_pep != phospho_pep: - print( - "phospho_pep:'%s' phospho_pep:'%s'" - % (phospho_pep, phospho_pep) - ) - if phospho_pep not in PhosphoPep_UniProtSeq_LUT: - PhosphoPep_UniProtSeq_LUT[phospho_pep] = phospho_pep - PhosphoPep_UniProtSeq_LUT[ - (phospho_pep, DEPHOSPHOPEP) - ] = dephospho_pep - r = list( - zip( - [s for s in UniProtSeqLUT[(sequence, UNIPROT_ID)]], - [s for s in UniProtSeqLUT[(sequence, GENE_NAME)]], - [ - s - for s in UniProtSeqLUT[(sequence, DESCRIPTION)] - ], - ) - ) - # Sort by `UniProt_ID` - # ref: https://stackoverflow.com/a/4174955/15509512 - r = sorted(r, key=operator.itemgetter(0)) - # Get one tuple for each `phospho_pep` - # in DephosphoPep_UniProtSeq_LUT[dephospho_pep] - for (upid, gn, desc) in r: - # Append pseudo-tuple per UniProt_ID but only when it is not present - if ( - upid - not in DephosphoPep_UniProtSeq_LUT[ - (dephospho_pep, UNIPROT_ID) - ] - ): - DephosphoPep_UniProtSeq_LUT[ - (dephospho_pep, UNIPROT_ID) - ].append(upid) - DephosphoPep_UniProtSeq_LUT[ - (dephospho_pep, DESCRIPTION) - ].append(desc) - DephosphoPep_UniProtSeq_LUT[ - (dephospho_pep, GENE_NAME) - ].append(gn) - - # Close SwissProt SQLite 
database; clean up local variables - conn.close() - # wipe local variables - phospho_pep = dephospho_pep = sequence = 0 - upid = gn = desc = r = "" - - # ----------- Get SwissProt data from SQLite database (finish) ----------- - - end_time = time.process_time() # timer - print( - "%0.6f finished reading and decoding '%s' [0.4]" - % (end_time - start_time, upstream_map_filename_tab), - file=sys.stderr, - ) - - print( - "{:>10} unique upstream phosphopeptides tested".format( - str(len(upstream_map_p_peptide_list)) - ) - ) - - # Read in Upstream tabular file - # We are discarding the intensity data; so read it as text - upstream_data = pandas.read_table( - upstream_map_filename_tab, dtype="str", index_col=0 - ) - - end_time = time.process_time() # timer - print( - "%0.6f read Upstream Map from file [1g_1]" - % (end_time - start_time,), - file=sys.stderr, - ) # timer - - upstream_data.index = upstream_map_p_peptide_list - - end_time = time.process_time() # timer - print( - "%0.6f added index to Upstream Map [1g_2]" - % (end_time - start_time,), - file=sys.stderr, - ) # timer - - # trim upstream_data to include only the upstream map columns - old_cols = upstream_data.columns.tolist() - i = 0 - first_intensity = -1 - last_intensity = -1 - intensity_re = re.compile("Intensity.*") - for col_name in old_cols: - m = intensity_re.match(col_name) - if m: - last_intensity = i - if first_intensity == -1: - first_intensity = i - i += 1 - # print('last intensity = %d' % last_intensity) - col_PKCalpha = last_intensity + 2 - - data_in_cols = [old_cols[0]] + old_cols[ - first_intensity: last_intensity + 1 - ] - - if upstream_data.empty: - print("upstream_data is empty") - exit(0) - - data_in = upstream_data.copy(deep=True)[data_in_cols] - - # Convert floating-point integers to int64 integers - # ref: https://stackoverflow.com/a/68497603/15509512 - data_in[list(data_in.columns[1:])] = ( - data_in[list(data_in.columns[1:])] - .astype("float64") - .apply(np.int64) - ) - - # create 
another phosphopeptide column that will be used to join later; - # MAY need to change depending on Phosphopeptide column position - # data_in[PHOSPHOPEPTIDE_MATCH] = data_in[data_in.columns.tolist()[0]] - data_in[PHOSPHOPEPTIDE_MATCH] = data_in.index - - end_time = time.process_time() # timer - print( - "%0.6f set data_in[PHOSPHOPEPTIDE_MATCH] [A]" - % (end_time - start_time,), - file=sys.stderr, - ) # timer - - # Produce a dictionary of metadata for a single phosphopeptide. - # This is a replacement of `UniProtInfo_subdict` in the original code. - def pseq_to_subdict(phospho_pep): - # Strip "p" from phosphopeptide sequence - dephospho_pep = re_phos.sub("", phospho_pep) - - # Determine number of phosphoresidues in phosphopeptide - numps = len(phospho_pep) - len(dephospho_pep) - - # Determine location(s) of phosphoresidue(s) in phosphopeptide - # (used later for Phosphoresidue, Sequence7, and Sequence10) - ploc = [] # list of p locations - i = 0 - p = phospho_pep - while i < numps: - ploc.append(p.find("p")) - p = p[: p.find("p")] + p[p.find("p") + 1:] - i += 1 - - # Establish nested dictionary - result = {} - result[SEQUENCE] = [] - result[UNIPROT_ID] = [] - result[DESCRIPTION] = [] - result[GENE_NAME] = [] - result[PHOSPHORESIDUE] = [] - result[SEQUENCE7] = [] - result[SEQUENCE10] = [] - - # Add stripped sequence to dictionary - result[SEQUENCE].append(dephospho_pep) - - # Locate phospho_pep in PhosphoPep_UniProtSeq_LUT - # Caller may elect to: - # try: - # ... 
- # except PreconditionError as pe: - # print("'{expression}': {message}".format( - # expression = pe.expression, - # message = pe.message)) - # ) - # ) - if phospho_pep not in PhosphoPep_UniProtSeq_LUT: - raise PreconditionError( - phospho_pep, - "no matching phosphopeptide found in PhosphoPep_UniProtSeq_LUT", - ) - if dephospho_pep not in DephosphoPep_UniProtSeq_LUT: - raise PreconditionError( - dephospho_pep, - "dephosphorylated phosphopeptide not found in DephosphoPep_UniProtSeq_LUT", - ) - if ( - dephospho_pep - != PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)] - ): - raise PreconditionError( - dephospho_pep, - "dephosphorylated phosphopeptide does not match " - + "PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = " - + PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)], - ) - result[SEQUENCE] = [dephospho_pep] - result[UNIPROT_ID] = DephosphoPep_UniProtSeq_LUT[ - (dephospho_pep, UNIPROT_ID) - ] - result[DESCRIPTION] = DephosphoPep_UniProtSeq_LUT[ - (dephospho_pep, DESCRIPTION) - ] - result[GENE_NAME] = DephosphoPep_UniProtSeq_LUT[ - (dephospho_pep, GENE_NAME) - ] - if (dephospho_pep, SEQUENCE) not in DephosphoPep_UniProtSeq_LUT: - raise PreconditionError( - dephospho_pep, - "no matching phosphopeptide found in DephosphoPep_UniProtSeq_LUT", - ) - UniProtSeqList = DephosphoPep_UniProtSeq_LUT[ - (dephospho_pep, SEQUENCE) - ] - if len(UniProtSeqList) < 1: - print( - "Skipping DephosphoPep_UniProtSeq_LUT[('%s',SEQUENCE)] because value has zero length" - % dephospho_pep - ) - # raise PreconditionError( - # "DephosphoPep_UniProtSeq_LUT[('" + dephospho_pep + ",SEQUENCE)", - # 'value has zero length' - # ) - for UniProtSeq in UniProtSeqList: - i = 0 - phosphoresidues = [] - seq7s_set = set() - seq7s = [] - seq10s_set = set() - seq10s = [] - while i < len(ploc): - start = UniProtSeq.find(dephospho_pep) - # handle case where no sequence was found for dep-pep - if start < 0: - i += 1 - continue - psite = ( - start + ploc[i] - ) # location of 
phosphoresidue on protein sequence - - # add Phosphoresidue - phosphosite = "p" + str(UniProtSeq)[psite] + str(psite + 1) - phosphoresidues.append(phosphosite) - - # Add Sequence7 - if psite < 7: # phospho_pep at N terminus - seq7 = str(UniProtSeq)[: psite + 8] - if seq7[psite] == "S": # if phosphosresidue is serine - pres = "s" - elif ( - seq7[psite] == "T" - ): # if phosphosresidue is threonine - pres = "t" - elif ( - seq7[psite] == "Y" - ): # if phosphoresidue is tyrosine - pres = "y" - else: # if not pSTY - pres = "?" - seq7 = ( - seq7[:psite] + pres + seq7[psite + 1: psite + 8] - ) - while ( - len(seq7) < 15 - ): # add appropriate number of "_" to the front - seq7 = "_" + seq7 - elif ( - len(UniProtSeq) - psite < 8 - ): # phospho_pep at C terminus - seq7 = str(UniProtSeq)[psite - 7:] - if seq7[7] == "S": - pres = "s" - elif seq7[7] == "T": - pres = "t" - elif seq7[7] == "Y": - pres = "y" - else: - pres = "?" - seq7 = seq7[:7] + pres + seq7[8:] - while ( - len(seq7) < 15 - ): # add appropriate number of "_" to the back - seq7 = seq7 + "_" - else: - seq7 = str(UniProtSeq)[psite - 7: psite + 8] - pres = "" # phosphoresidue - if seq7[7] == "S": # if phosphosresidue is serine - pres = "s" - elif seq7[7] == "T": # if phosphosresidue is threonine - pres = "t" - elif seq7[7] == "Y": # if phosphoresidue is tyrosine - pres = "y" - else: # if not pSTY - pres = "?" 
- seq7 = seq7[:7] + pres + seq7[8:] - if seq7 not in seq7s_set: - seq7s.append(seq7) - seq7s_set.add(seq7) - - # add Sequence10 - if psite < 10: # phospho_pep at N terminus - seq10 = ( - str(UniProtSeq)[:psite] - + "p" - + str(UniProtSeq)[psite: psite + 11] - ) - elif ( - len(UniProtSeq) - psite < 11 - ): # phospho_pep at C terminus - seq10 = ( - str(UniProtSeq)[psite - 10: psite] - + "p" - + str(UniProtSeq)[psite:] - ) - else: - seq10 = str(UniProtSeq)[psite - 10: psite + 11] - seq10 = seq10[:10] + "p" + seq10[10:] - if seq10 not in seq10s_set: - seq10s.append(seq10) - seq10s_set.add(seq10) - - i += 1 - - result[PHOSPHORESIDUE].append(phosphoresidues) - result[SEQUENCE7].append(seq7s) - # result[SEQUENCE10] is a list of lists of strings - result[SEQUENCE10].append(seq10s) - - r = list( - zip( - result[UNIPROT_ID], - result[GENE_NAME], - result[DESCRIPTION], - result[PHOSPHORESIDUE], - ) - ) - # Sort by `UniProt_ID` - # ref: https://stackoverflow.com//4174955/15509512 - s = sorted(r, key=operator.itemgetter(0)) - - result[UNIPROT_ID] = [] - result[GENE_NAME] = [] - result[DESCRIPTION] = [] - result[PHOSPHORESIDUE] = [] - - for r in s: - result[UNIPROT_ID].append(r[0]) - result[GENE_NAME].append(r[1]) - result[DESCRIPTION].append(r[2]) - result[PHOSPHORESIDUE].append(r[3]) - - # convert lists to strings in the dictionary - for key, value in result.items(): - if key not in [PHOSPHORESIDUE, SEQUENCE7, SEQUENCE10]: - result[key] = "; ".join(map(str, value)) - elif key in [SEQUENCE10]: - # result[SEQUENCE10] is a list of lists of strings - joined_value = "" - joined_set = set() - sep = "" - for valL in value: - # valL is a list of strings - for val in valL: - # val is a string - if val not in joined_set: - joined_set.add(val) - joined_value += sep + val - sep = "; " - # joined_value is a string - result[key] = joined_value - - newstring = "; ".join( - [", ".join(prez) for prez in result[PHOSPHORESIDUE]] - ) - # #separate the isoforms in PHOSPHORESIDUE column with ";" - 
# oldstring = result[PHOSPHORESIDUE] - # oldlist = list(oldstring) - # newstring = "" - # i = 0 - # for e in oldlist: - # if e == ";": - # if numps > 1: - # if i%numps: - # newstring = newstring + ";" - # else: - # newstring = newstring + "," - # else: - # newstring = newstring + ";" - # i +=1 - # else: - # newstring = newstring + e - result[PHOSPHORESIDUE] = newstring - - # separate sequence7's by | - oldstring = result[SEQUENCE7] - oldlist = oldstring - newstring = "" - for ol in oldlist: - for e in ol: - if e == ";": - newstring = newstring + " |" - elif len(newstring) > 0 and 1 > newstring.count(e): - newstring = newstring + " | " + e - elif 1 > newstring.count(e): - newstring = newstring + e - result[SEQUENCE7] = newstring - - return [phospho_pep, result] - - # Construct list of [string, dictionary] lists - # where the dictionary provides the SwissProt metadata - # for a phosphopeptide - result_list = [ - whine(pseq_to_subdict, psequence) - for psequence in data_in[PHOSPHOPEPTIDE_MATCH] - ] - - end_time = time.process_time() # timer - print( - "%0.6f added SwissProt annotations to phosphopeptides [B]" - % (end_time - start_time,), - file=sys.stderr, - ) # timer - - # Construct dictionary from list of lists - # ref: https://www.8bitavenue.com/how-to-convert-list-of-lists-to-dictionary-in-python/ - UniProt_Info = { - result[0]: result[1] - for result in result_list - if result is not None - } - - end_time = time.process_time() # timer - print( - "%0.6f create dictionary mapping phosphopeptide to metadata dictionary [C]" - % (end_time - start_time,), - file=sys.stderr, - ) # timer - - # cosmetic: add N_A to phosphopeptide rows with no hits - p_peptide_list = [] - for key in UniProt_Info: - p_peptide_list.append(key) - for nestedKey in UniProt_Info[key]: - if UniProt_Info[key][nestedKey] == "": - UniProt_Info[key][nestedKey] = N_A - - end_time = time.process_time() # timer - print( - "%0.6f performed cosmetic clean-up [D]" % (end_time - start_time,), - 
file=sys.stderr, - ) # timer - - # convert UniProt_Info dictionary to dataframe - uniprot_df = pandas.DataFrame.transpose( - pandas.DataFrame.from_dict(UniProt_Info) - ) - - # reorder columns to match expected output file - uniprot_df[ - PHOSPHOPEPTIDE - ] = uniprot_df.index # make index a column too - - cols = uniprot_df.columns.tolist() - # cols = [cols[-1]]+cols[4:6]+[cols[1]]+[cols[2]]+[cols[6]]+[cols[0]] - # uniprot_df = uniprot_df[cols] - uniprot_df = uniprot_df[ - [ - PHOSPHOPEPTIDE, - SEQUENCE10, - SEQUENCE7, - GENE_NAME, - PHOSPHORESIDUE, - UNIPROT_ID, - DESCRIPTION, - ] - ] - - end_time = time.process_time() # timer - print( - "%0.6f reordered columns to match expected output file [1]" - % (end_time - start_time,), - file=sys.stderr, - ) # timer - - # concat to split then groupby to collapse - seq7_df = pandas.concat( - [ - pandas.Series(row[PHOSPHOPEPTIDE], row[SEQUENCE7].split(" | ")) - for _, row in uniprot_df.iterrows() - ] - ).reset_index() - seq7_df.columns = [SEQUENCE7, PHOSPHOPEPTIDE] - - # --- -------------- begin read PSP_Regulatory_sites --------------------------------- - # read in PhosphoSitePlus Regulatory Sites dataset - # ----------- Get PhosphoSitePlus Regulatory Sites data from SQLite database (start) ----------- - conn = sql.connect(uniprot_sqlite) - regsites_df = pandas.read_sql_query(PSP_REGSITE_SQL, conn) - # Close SwissProt SQLite database - conn.close() - # ... 
-------------- end read PSP_Regulatory_sites ------------------------------------ - - # keep only the human entries in dataframe - if len(species) > 0: - print( - 'Limit PhosphoSitesPlus records to species "' + species + '"' - ) - regsites_df = regsites_df[regsites_df.ORGANISM == species] - - # merge the seq7 df with the regsites df based off of the sequence7 - merge_df = seq7_df.merge( - regsites_df, - left_on=SEQUENCE7, - right_on=SITE_PLUSMINUS_7AA_SQL, - how="left", - ) - - # after merging df, select only the columns of interest; - # note that PROTEIN is absent here - merge_df = merge_df[ - [ - PHOSPHOPEPTIDE, - SEQUENCE7, - ON_FUNCTION, - ON_PROCESS, - ON_PROT_INTERACT, - ON_OTHER_INTERACT, - ON_NOTES, - ] - ] - # combine column values of interest - # into one FUNCTION_PHOSPHORESIDUE column" - merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[ON_FUNCTION].str.cat( - merge_df[ON_PROCESS], sep="; ", na_rep="" - ) - merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[ - FUNCTION_PHOSPHORESIDUE - ].str.cat(merge_df[ON_PROT_INTERACT], sep="; ", na_rep="") - merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[ - FUNCTION_PHOSPHORESIDUE - ].str.cat(merge_df[ON_OTHER_INTERACT], sep="; ", na_rep="") - merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[ - FUNCTION_PHOSPHORESIDUE - ].str.cat(merge_df[ON_NOTES], sep="; ", na_rep="") - - # remove the columns that were combined - merge_df = merge_df[ - [PHOSPHOPEPTIDE, SEQUENCE7, FUNCTION_PHOSPHORESIDUE] - ] - - end_time = time.process_time() # timer - print( - "%0.6f merge regsite metadata [1a]" % (end_time - start_time,), - file=sys.stderr, - ) # timer - - # cosmetic changes to Function Phosphoresidue column - fp_series = pandas.Series(merge_df[FUNCTION_PHOSPHORESIDUE]) - - end_time = time.process_time() # timer - print( - "%0.6f more cosmetic changes [1b]" % (end_time - start_time,), - file=sys.stderr, - ) # timer - - i = 0 - while i < len(fp_series): - # remove the extra ";" so that it looks more professional - if fp_series[i] == "; ; ; ; ": # 
remove ; from empty hits - fp_series[i] = "" - while fp_series[i].endswith("; "): # remove ; from the ends - fp_series[i] = fp_series[i][:-2] - while fp_series[i].startswith("; "): # remove ; from the beginning - fp_series[i] = fp_series[i][2:] - fp_series[i] = fp_series[i].replace("; ; ; ; ", "; ") - fp_series[i] = fp_series[i].replace("; ; ; ", "; ") - fp_series[i] = fp_series[i].replace("; ; ", "; ") - - # turn blanks into N_A to signify the info was searched for but cannot be found - if fp_series[i] == "": - fp_series[i] = N_A - - i += 1 - merge_df[FUNCTION_PHOSPHORESIDUE] = fp_series - - end_time = time.process_time() # timer - print( - "%0.6f cleaned up semicolons [1c]" % (end_time - start_time,), - file=sys.stderr, - ) # timer - - # merge uniprot df with merge df - uniprot_regsites_merged_df = uniprot_df.merge( - merge_df, - left_on=PHOSPHOPEPTIDE, - right_on=PHOSPHOPEPTIDE, - how="left", - ) - - # collapse the merged df - uniprot_regsites_collapsed_df = pandas.DataFrame( - uniprot_regsites_merged_df.groupby(PHOSPHOPEPTIDE)[ - FUNCTION_PHOSPHORESIDUE - ].apply(lambda x: ppep_join(x)) - ) - # .apply(lambda x: "%s" % ' | '.join(x))) - - end_time = time.process_time() # timer - print( - "%0.6f collapsed pandas dataframe [1d]" % (end_time - start_time,), - file=sys.stderr, - ) # timer - - uniprot_regsites_collapsed_df[ - PHOSPHOPEPTIDE - ] = ( - uniprot_regsites_collapsed_df.index - ) # add df index as its own column - - # rename columns - uniprot_regsites_collapsed_df.columns = [ - FUNCTION_PHOSPHORESIDUE, - "ppp", - ] - - end_time = time.process_time() # timer - print( - "%0.6f selected columns to be merged to uniprot_df [1e]" - % (end_time - start_time,), - file=sys.stderr, - ) # timer - - # add columns based on Sequence7 matching site_+/-7_AA - uniprot_regsite_df = pandas.merge( - left=uniprot_df, - right=uniprot_regsites_collapsed_df, - how="left", - left_on=PHOSPHOPEPTIDE, - right_on="ppp", - ) - - end_time = time.process_time() # timer - print( - "%0.6f 
added columns based on Sequence7 matching site_+/-7_AA [1f]" - % (end_time - start_time,), - file=sys.stderr, - ) # timer - - data_in.rename( - {"Protein description": PHOSPHOPEPTIDE}, - axis="columns", - inplace=True, - ) - - # data_in.sort_values(PHOSPHOPEPTIDE_MATCH, inplace=True, kind='mergesort') - res2 = sorted( - data_in[PHOSPHOPEPTIDE_MATCH].tolist(), key=lambda s: s.casefold() - ) - data_in = data_in.loc[res2] - - end_time = time.process_time() # timer - print( - "%0.6f sorting time [1f]" % (end_time - start_time,), - file=sys.stderr, - ) # timer - - cols = [old_cols[0]] + old_cols[col_PKCalpha - 1:] - upstream_data = upstream_data[cols] - - end_time = time.process_time() # timer - print( - "%0.6f refactored columns for Upstream Map [1g]" - % (end_time - start_time,), - file=sys.stderr, - ) # timer - - # #rename upstream columns in new list - # new_cols = [] - # for name in cols: - # if "_NetworKIN" in name: - # name = name.split("_")[0] - # if " motif" in name: - # name = name.split(" motif")[0] - # if " sequence " in name: - # name = name.split(" sequence")[0] - # if "_Phosida" in name: - # name = name.split("_")[0] - # if "_PhosphoSite" in name: - # name = name.split("_")[0] - # new_cols.append(name) - - # rename upstream columns in new list - def col_rename(name): - if "_NetworKIN" in name: - name = name.split("_")[0] - if " motif" in name: - name = name.split(" motif")[0] - if " sequence " in name: - name = name.split(" sequence")[0] - if "_Phosida" in name: - name = name.split("_")[0] - if "_PhosphoSite" in name: - name = name.split("_")[0] - return name - - new_cols = [col_rename(col) for col in cols] - upstream_data.columns = new_cols - - end_time = time.process_time() # timer - print( - "%0.6f renamed columns for Upstream Map [1h_1]" - % (end_time - start_time,), - file=sys.stderr, - ) # timer - - # Create upstream_data_cast as a copy of upstream_data - # but with first column substituted by the phosphopeptide sequence - upstream_data_cast = 
upstream_data.copy() - new_cols_cast = new_cols - new_cols_cast[0] = "p_peptide" - upstream_data_cast.columns = new_cols_cast - upstream_data_cast["p_peptide"] = upstream_data.index - - # --- -------------- begin read upstream_data_melt ------------------------------------ - # ----------- Get melted kinase mapping data from SQLite database (start) ----------- - conn = sql.connect(uniprot_sqlite) - upstream_data_melt_df = pandas.read_sql_query(PPEP_MELT_SQL, conn) - # Close SwissProt SQLite database - conn.close() - upstream_data_melt = upstream_data_melt_df.copy() - upstream_data_melt.columns = ["p_peptide", "characterization", "X"] - upstream_data_melt["characterization"] = [ - col_rename(s) for s in upstream_data_melt["characterization"] - ] - - print( - "%0.6f upstream_data_melt_df initially has %d rows" - % (end_time - start_time, len(upstream_data_melt.axes[0])), - file=sys.stderr, - ) - # ref: https://stackoverflow.com/a/27360130/15509512 - # e.g. df.drop(df[df.score < 50].index, inplace=True) - upstream_data_melt.drop( - upstream_data_melt[upstream_data_melt.X != "X"].index, inplace=True - ) - print( - "%0.6f upstream_data_melt_df pre-dedup has %d rows" - % (end_time - start_time, len(upstream_data_melt.axes[0])), - file=sys.stderr, - ) - # ----------- Get melted kinase mapping data from SQLite database (finish) ----------- - # ... -------------- end read upstream_data_melt -------------------------------------- - - end_time = time.process_time() # timer - print( - "%0.6f melted and minimized Upstream Map dataframe [1h_2]" - % (end_time - start_time,), - file=sys.stderr, - ) # timer - # ... 
end read upstream_data_melt - - end_time = time.process_time() # timer - print( - "%0.6f indexed melted Upstream Map [1h_2a]" - % (end_time - start_time,), - file=sys.stderr, - ) # timer - - upstream_delta_melt_LoL = upstream_data_melt.values.tolist() - - melt_dict = {} - for key in upstream_map_p_peptide_list: - melt_dict[key] = [] - - for el in upstream_delta_melt_LoL: - (p_peptide, characterization, X) = tuple(el) - if p_peptide in melt_dict: - melt_dict[p_peptide].append(characterization) - else: - exit( - 'Phosphopeptide %s not found in ppep_mapping_db: "phopsphopeptides" and "ppep_mapping_db" must both originate from the same run of mqppep_kinase_mapping' - % (p_peptide) - ) - - end_time = time.process_time() # timer - print( - "%0.6f appended peptide characterizations [1h_2b]" - % (end_time - start_time,), - file=sys.stderr, - ) # timer - - # for key in upstream_map_p_peptide_list: - # melt_dict[key] = ' | '.join(melt_dict[key]) - - for key in upstream_map_p_peptide_list: - melt_dict[key] = melt_join(melt_dict[key]) - - end_time = time.process_time() # timer - print( - "%0.6f concatenated multiple characterizations [1h_2c]" - % (end_time - start_time,), - file=sys.stderr, - ) # timer - - # map_dict is a dictionary of dictionaries - map_dict = {} - for key in upstream_map_p_peptide_list: - map_dict[key] = {} - map_dict[key][PUTATIVE_UPSTREAM_DOMAINS] = melt_dict[key] - - end_time = time.process_time() # timer - print( - "%0.6f instantiated map dictionary [2]" % (end_time - start_time,), - file=sys.stderr, - ) # timer - - # convert map_dict to dataframe - map_df = pandas.DataFrame.transpose( - pandas.DataFrame.from_dict(map_dict) - ) - map_df["p-peptide"] = map_df.index # make index a column too - cols_map_df = map_df.columns.tolist() - cols_map_df = [cols_map_df[1]] + [cols_map_df[0]] - map_df = map_df[cols_map_df] - - # join map_df to uniprot_regsite_df - output_df = uniprot_regsite_df.merge( - map_df, how="left", left_on=PHOSPHOPEPTIDE, right_on="p-peptide" 
- ) - - output_df = output_df[ - [ - PHOSPHOPEPTIDE, - SEQUENCE10, - SEQUENCE7, - GENE_NAME, - PHOSPHORESIDUE, - UNIPROT_ID, - DESCRIPTION, - FUNCTION_PHOSPHORESIDUE, - PUTATIVE_UPSTREAM_DOMAINS, - ] - ] - - # cols_output_prelim = output_df.columns.tolist() - # - # print("cols_output_prelim") - # print(cols_output_prelim) - # - # cols_output = cols_output_prelim[:8]+[cols_output_prelim[9]]+[cols_output_prelim[10]] - # - # print("cols_output with p-peptide") - # print(cols_output) - # - # cols_output = [col for col in cols_output if not col == "p-peptide"] - # - # print("cols_output") - # print(cols_output) - # - # output_df = output_df[cols_output] - - # join output_df back to quantitative columns in data_in df - quant_cols = data_in.columns.tolist() - quant_cols = quant_cols[1:] - quant_data = data_in[quant_cols] - - # ----------- Write merge/filter metadata to SQLite database (start) ----------- - # Open SwissProt SQLite database - conn = sql.connect(output_sqlite) - cur = conn.cursor() - - cur.executescript(MRGFLTR_DDL) - - cur.execute( - CITATION_INSERT_STMT, - ("mrgfltr_metadata_view", CITATION_INSERT_PSP), - ) - cur.execute( - CITATION_INSERT_STMT, ("mrgfltr_metadata", CITATION_INSERT_PSP) - ) - cur.execute( - CITATION_INSERT_STMT, - ("mrgfltr_metadata_view", CITATION_INSERT_PSP_REF), - ) - cur.execute( - CITATION_INSERT_STMT, ("mrgfltr_metadata", CITATION_INSERT_PSP_REF) - ) - - # Read ppep-to-sequence LUT - ppep_lut_df = pandas.read_sql_query(PPEP_ID_SQL, conn) - # write only metadata for merged/filtered records to SQLite - mrgfltr_metadata_df = output_df.copy() - # replace phosphopeptide seq with ppep.id - mrgfltr_metadata_df = ppep_lut_df.merge( - mrgfltr_metadata_df, - left_on="ppep_seq", - right_on=PHOSPHOPEPTIDE, - how="inner", - ) - mrgfltr_metadata_df.drop( - columns=[PHOSPHOPEPTIDE, "ppep_seq"], inplace=True - ) - # rename columns - mrgfltr_metadata_df.columns = MRGFLTR_METADATA_COLUMNS - mrgfltr_metadata_df.to_sql( - "mrgfltr_metadata", - con=conn, 
- if_exists="append", - index=False, - method="multi", - ) - - # Close SwissProt SQLite database - conn.close() - # ----------- Write merge/filter metadata to SQLite database (finish) ----------- - - output_df = output_df.merge( - quant_data, - how="right", - left_on=PHOSPHOPEPTIDE, - right_on=PHOSPHOPEPTIDE_MATCH, - ) - output_cols = output_df.columns.tolist() - output_cols = output_cols[:-1] - output_df = output_df[output_cols] - - # cosmetic changes to Upstream column - output_df[PUTATIVE_UPSTREAM_DOMAINS] = output_df[ - PUTATIVE_UPSTREAM_DOMAINS - ].fillna( - "" - ) # fill the NaN with "" for those Phosphopeptides that got a "WARNING: Failed match for " in the upstream mapping - us_series = pandas.Series(output_df[PUTATIVE_UPSTREAM_DOMAINS]) - i = 0 - while i < len(us_series): - # turn blanks into N_A to signify the info was searched for but cannot be found - if us_series[i] == "": - us_series[i] = N_A - i += 1 - output_df[PUTATIVE_UPSTREAM_DOMAINS] = us_series - - end_time = time.process_time() # timer - print( - "%0.6f establisheed output [3]" % (end_time - start_time,), - file=sys.stderr, - ) # timer - - (output_rows, output_cols) = output_df.shape - - output_df = output_df.convert_dtypes(convert_integer=True) - - # Output onto Final CSV file - output_df.to_csv(output_filename_csv, index=False) - output_df.to_csv( - output_filename_tab, quoting=None, sep="\t", index=False - ) - - end_time = time.process_time() # timer - print( - "%0.6f wrote output [4]" % (end_time - start_time,), - file=sys.stderr, - ) # timer - - print( - "{:>10} phosphopeptides written to output".format(str(output_rows)) - ) - - end_time = time.process_time() # timer - print( - "%0.6f seconds of non-system CPU time were consumed" - % (end_time - start_time,), - file=sys.stderr, - ) # timer - - # Rev. 7/1/2016 - # Rev. 7/3/2016 : fill NaN in Upstream column to replace to N/A's - # Rev. 7/3/2016: renamed Upstream column to PUTATIVE_UPSTREAM_DOMAINS - # Rev. 
12/2/2021: Converted to Python from ipynb; use fast Aho-Corasick searching; \ - # read from SwissProt SQLite database - # Rev. 12/9/2021: Transfer code to Galaxy tool wrapper - - # - # copied from Excel Output Script.ipynb END # - # - - try: - catch( - mqpep_getswissprot, - ) - exit(0) - except Exception as e: - exit("Internal error running mqpep_getswissprot(): %s" % (e)) - - -if __name__ == "__main__": - __main__()
--- a/repository_dependencies.xml Tue Mar 15 12:44:40 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,5 +0,0 @@ -<?xml version="1.0" ?> -<repositories description="Suite for preprocessing and ANOVA of MaxQuant results using LC-MS proteomics data from phosphoproteomic enrichment."> - <repository name="mqppep_preproc" owner="eschen42" toolshed="https://testtoolshed.g2.bx.psu.edu" changeset_revision="302918bd77e0"/> - <repository name="mqppep_anova" owner="eschen42" toolshed="https://testtoolshed.g2.bx.psu.edu" changeset_revision="6c22e8563a93"/> -</repositories> \ No newline at end of file
#!/usr/bin/env python
# Search and memoize phosphopeptides in Swiss-Prot SQLite table UniProtKB
"""Match phosphopeptides against UniProtKB sequences held in a SQLite database.

The script reads the first column of a tabular phosphopeptide file, derives
the dephosphorylated form of each peptide, searches every ``Sequence`` in the
``UniProtKB`` table for those peptides with an Aho-Corasick automaton, and
records the hits (peptide, accession, start and end position) in tables that
it (re)creates inside the same SQLite file.
"""

import argparse
import os.path
import re
import sqlite3
import sys  # import the sys module for exc_info
import time
import traceback  # import the traceback module for format_exception
from codecs import getreader as cx_getreader

# For Aho-Corasick search for fixed set of substrings
# - add_word
# - make_automaton
# - iter
import ahocorasick


# ref: https://stackoverflow.com/a/8915613/15509512
#      answers: "How to handle exceptions in a list comprehensions"
# usage:
#      from math import log
#      eggs = [1,3,0,3,2]
#      print([x for x in [catch(log, egg) for egg in eggs] if x is not None])
# producing:
#      for <built-in function log>
#        with args (0,)
#        exception: math domain error
#      [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453]
def catch(func, *args, handle=lambda e: e, **kwargs):
    """Call ``func(*args, **kwargs)``, reporting rather than raising errors.

    Returns the result of ``func`` on success.  When ``func`` raises an
    ``Exception``, the function, its positional arguments, the exception and
    a formatted stack trace are printed to stdout and ``None`` is returned.
    The ``handle`` parameter is accepted for compatibility but is not used.
    """
    try:
        return func(*args, **kwargs)
    except Exception as e:
        print("For %s" % str(func))
        print(" with args %s" % str(args))
        print(" caught exception: %s" % str(e))
        (ty, va, tb) = sys.exc_info()
        print(" stack trace: " + str(traceback.format_exception(ty, va, tb)))
        # exit(-1)
        return None  # was handle(e)


def __main__():
    """Parse the command line and run the phosphopeptide search.

    Side effects: drops and recreates the ppep/deppep tables and views in the
    SQLite file named by ``--uniprotkb``, fills them from the file named by
    ``--phosphopeptides``, prints summary counts to stdout, commits, and
    vacuums the database.

    Raises:
        ValueError: if a ``UniProtKB`` row has a NULL ``Sequence``.
    """

    DROP_TABLES_SQL = """
        DROP VIEW  IF EXISTS ppep_gene_site_view;
        DROP VIEW  IF EXISTS uniprot_view;
        DROP VIEW  IF EXISTS uniprotkb_pep_ppep_view;
        DROP VIEW  IF EXISTS ppep_intensity_view;
        DROP VIEW  IF EXISTS ppep_metadata_view;

        DROP TABLE IF EXISTS sample;
        DROP TABLE IF EXISTS ppep;
        DROP TABLE IF EXISTS site_type;
        DROP TABLE IF EXISTS deppep_UniProtKB;
        DROP TABLE IF EXISTS deppep;
        DROP TABLE IF EXISTS ppep_gene_site;
        DROP TABLE IF EXISTS ppep_metadata;
        DROP TABLE IF EXISTS ppep_intensity;
        """

    CREATE_TABLES_SQL = """
        CREATE TABLE deppep
          ( id INTEGER PRIMARY KEY
          , seq TEXT UNIQUE ON CONFLICT IGNORE
          )
          ;
        CREATE TABLE deppep_UniProtKB
          ( deppep_id INTEGER REFERENCES deppep(id) ON DELETE CASCADE
          , UniProtKB_id TEXT REFERENCES UniProtKB(id) ON DELETE CASCADE
          , pos_start INTEGER
          , pos_end INTEGER
          , PRIMARY KEY (deppep_id, UniProtKB_id, pos_start, pos_end)
              ON CONFLICT IGNORE
          )
          ;
        CREATE TABLE ppep
          ( id INTEGER PRIMARY KEY
          , deppep_id INTEGER REFERENCES deppep(id) ON DELETE CASCADE
          , seq TEXT UNIQUE ON CONFLICT IGNORE
          , scrubbed TEXT
          );
        CREATE TABLE site_type
          ( id INTEGER PRIMARY KEY
          , type_name TEXT UNIQUE ON CONFLICT IGNORE
          );
        CREATE INDEX idx_ppep_scrubbed on ppep(scrubbed)
          ;
        CREATE TABLE sample
          ( id INTEGER PRIMARY KEY
          , name TEXT UNIQUE ON CONFLICT IGNORE
          )
          ;
        CREATE VIEW uniprot_view AS
          SELECT DISTINCT
              Uniprot_ID
            , Description
            , Organism_Name
            , Organism_ID
            , Gene_Name
            , PE
            , SV
            , Sequence
            , Description || ' OS=' ||
              Organism_Name || ' OX=' || Organism_ID ||
              CASE WHEN Gene_Name = 'N/A' THEN '' ELSE ' GN='|| Gene_Name END ||
              CASE WHEN PE = 'N/A' THEN '' ELSE ' PE='|| PE END ||
              CASE WHEN SV = 'N/A' THEN '' ELSE ' SV='|| SV END
              AS long_description
            , Database
          FROM UniProtKB
          ;
        CREATE VIEW uniprotkb_pep_ppep_view AS
          SELECT deppep_UniProtKB.UniprotKB_ID AS accession
               , deppep_UniProtKB.pos_start AS pos_start
               , deppep_UniProtKB.pos_end AS pos_end
               , deppep.seq AS peptide
               , ppep.seq AS phosphopeptide
               , ppep.scrubbed AS scrubbed
               , uniprot_view.Sequence AS sequence
               , uniprot_view.Description AS description
               , uniprot_view.long_description AS long_description
               , ppep.id AS ppep_id
          FROM ppep, deppep, deppep_UniProtKB, uniprot_view
          WHERE deppep.id = ppep.deppep_id
            AND deppep.id = deppep_UniProtKB.deppep_id
            AND deppep_UniProtKB.UniprotKB_ID = uniprot_view.Uniprot_ID
          ORDER BY UniprotKB_ID, deppep.seq, ppep.seq
          ;
        CREATE TABLE ppep_gene_site
          ( ppep_id INTEGER REFERENCES ppep(id)
          , gene_names TEXT
          , site_type_id INTEGER REFERENCES site_type(id)
          , kinase_map TEXT
          , PRIMARY KEY (ppep_id, kinase_map) ON CONFLICT IGNORE
          )
          ;
        CREATE VIEW ppep_gene_site_view AS
          SELECT DISTINCT
            ppep.seq AS phospho_peptide
          , ppep_id
          , gene_names
          , type_name
          , kinase_map
          FROM
            ppep, ppep_gene_site, site_type
          WHERE
              ppep_gene_site.ppep_id = ppep.id
            AND
              ppep_gene_site.site_type_id = site_type.id
          ORDER BY
            ppep.seq
            ;
        CREATE TABLE ppep_metadata
          ( ppep_id INTEGER REFERENCES ppep(id)
          , protein_description TEXT
          , gene_name TEXT
          , FASTA_name TEXT
          , phospho_sites TEXT
          , motifs_unique TEXT
          , accessions TEXT
          , motifs_all_members TEXT
          , domain TEXT
          , ON_FUNCTION TEXT
          , ON_PROCESS TEXT
          , ON_PROT_INTERACT TEXT
          , ON_OTHER_INTERACT TEXT
          , notes TEXT
          , PRIMARY KEY (ppep_id) ON CONFLICT IGNORE
          )
          ;
        CREATE VIEW ppep_metadata_view AS
          SELECT DISTINCT
              ppep.seq AS phospho_peptide
            , protein_description
            , gene_name
            , FASTA_name
            , phospho_sites
            , motifs_unique
            , accessions
            , motifs_all_members
            , domain
            , ON_FUNCTION
            , ON_PROCESS
            , ON_PROT_INTERACT
            , ON_OTHER_INTERACT
            , notes
          FROM
            ppep, ppep_metadata
          WHERE
              ppep_metadata.ppep_id = ppep.id
          ORDER BY
            ppep.seq
            ;
        CREATE TABLE ppep_intensity
          ( ppep_id INTEGER REFERENCES ppep(id)
          , sample_id INTEGER
          , intensity INTEGER
          , PRIMARY KEY (ppep_id, sample_id) ON CONFLICT IGNORE
          )
          ;
        CREATE VIEW ppep_intensity_view AS
          SELECT DISTINCT
              ppep.seq AS phospho_peptide
            , sample.name AS sample
            , intensity
          FROM
            ppep, sample, ppep_intensity
          WHERE
              ppep_intensity.sample_id = sample.id
            AND
              ppep_intensity.ppep_id = ppep.id
          ;
        """

    UNIPROT_SEQ_AND_ID_SQL = """
        select Sequence, Uniprot_ID
          from UniProtKB
        """

    # Parse Command Line
    parser = argparse.ArgumentParser(
        description="Phopsphoproteomic Enrichment phosphopeptide SwissProt search (in place in SQLite DB)."
    )

    # inputs:
    #   Phosphopeptide data for experimental results, including the intensities
    #   and the mapping to kinase domains, in tabular format.
    parser.add_argument(
        "--phosphopeptides",
        "-p",
        nargs=1,
        required=True,
        dest="phosphopeptides",
        help="Phosphopeptide data for experimental results, generated by the Phopsphoproteomic Enrichment Localization Filter tool",
    )
    parser.add_argument(
        "--uniprotkb",
        "-u",
        nargs=1,
        required=True,
        dest="uniprotkb",
        help="UniProtKB/Swiss-Prot data, converted from FASTA format by the Phopsphoproteomic Enrichment Kinase Mapping tool",
    )
    parser.add_argument(
        "--schema",
        action="store_true",
        dest="db_schema",
        help="show updated database schema",
    )
    parser.add_argument(
        "--warn-duplicates",
        action="store_true",
        dest="warn_duplicates",
        help="show warnings for duplicated sequences",
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        dest="verbose",
        help="show somewhat verbose program tracing",
    )
    # "Make it so!" (parse the arguments)
    options = parser.parse_args()
    if options.verbose:
        print("options: " + str(options) + "\n")

    # path to phosphopeptide (e.g., "outputfile_STEP2.txt") input tabular file
    if options.phosphopeptides is None:
        exit('Argument "phosphopeptides" is required but not supplied')
    try:
        f_name = os.path.abspath(options.phosphopeptides[0])
    except Exception as e:
        exit("Error parsing phosphopeptides argument: %s" % (e))

    # path to SQLite input/output tabular file
    if options.uniprotkb is None:
        exit('Argument "uniprotkb" is required but not supplied')
    try:
        db_name = os.path.abspath(options.uniprotkb[0])
    except Exception as e:
        exit("Error parsing uniprotkb argument: %s" % (e))

    # print("options.schema is %d" % options.db_schema)

    # db_name = "demo/test.sqlite"
    # f_name  = "demo/test_input.txt"

    con = sqlite3.connect(db_name)
    # cur drives the schema work and the streaming SELECT over UniProtKB;
    # ker is a second cursor so inserts can run while cur is still iterating.
    cur = con.cursor()
    ker = con.cursor()

    cur.executescript(DROP_TABLES_SQL)

    # if options.db_schema:
    #     print("\nAfter dropping tables/views that are to be created, schema is:")
    #     cur.execute("SELECT * FROM sqlite_schema")
    #     for row in cur.fetchall():
    #         if row[4] is not None:
    #             print("%s;" % row[4])

    cur.executescript(CREATE_TABLES_SQL)

    if options.db_schema:
        print(
            "\nAfter creating tables/views that are to be created, schema is:"
        )
        cur.execute("SELECT * FROM sqlite_schema")
        for row in cur.fetchall():
            if row[4] is not None:
                print("%s;" % row[4])

    def generate_ppep(f):
        """Yield the first tab-delimited field of each data line of ``f``.

        The first line (the header) is skipped and double quotes are removed
        from each yielded value.  The file is decoded as latin-1.
        """
        # get keys from upstream tabular file using readline()
        # ref: https://stackoverflow.com/a/16713581/15509512
        #      answer to "Use codecs to read file with correct encoding"
        re_tab = re.compile("^[^\t]*")
        re_quote = re.compile('"')
        # the context manager closes the file even when the consumer
        # abandons the generator before it is exhausted
        with open(f, "rb") as file1_encoded:
            file1 = cx_getreader("latin-1")(file1_encoded)
            count = 0
            while True:
                count += 1
                # Get next line from file
                line = file1.readline()
                # if line is empty
                # end of file is reached
                if not line:
                    break
                if count > 1:
                    m = re_tab.match(line)
                    yield re_quote.sub("", m[0])

    # Build an Aho-Corasick automaton from a trie
    # - ref:
    #   - https://pypi.org/project/pyahocorasick/
    #   - https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm
    #   - https://en.wikipedia.org/wiki/Trie
    auto = ahocorasick.Automaton()
    re_phos = re.compile("p")
    # scrub out unsearchable characters per section
    #   "Match the p_peptides to the @sequences array:"
    # of the original
    #   PhosphoPeptide Upstream Kinase Mapping.pl
    # which originally read
    #   $tmp_p_peptide =~ s/#//g;
    #   $tmp_p_peptide =~ s/\d//g;
    #   $tmp_p_peptide =~ s/\_//g;
    #   $tmp_p_peptide =~ s/\.//g;
    #
    # The pattern must be a character class: without the brackets it only
    # matches the literal text "0-9_", one arbitrary character, then "#",
    # and so scrubs nothing from ordinary peptides.
    re_scrub = re.compile("[0-9_.#]")
    ppep_count = 0
    for ppep in generate_ppep(f_name):
        ppep_count += 1
        add_to_trie = False
        # print(ppep)
        scrubbed = re_scrub.sub("", ppep)
        deppep = re_phos.sub("", scrubbed)
        if options.verbose:
            print("deppep: %s; scrubbed: %s" % (deppep, scrubbed))
        # print(deppep)
        cur.execute("SELECT id FROM deppep WHERE seq = (?)", (deppep,))
        if cur.fetchone() is None:
            # first time this dephosphopeptide is seen: store it and
            # remember to add it to the search trie exactly once
            add_to_trie = True
            cur.execute("INSERT INTO deppep(seq) VALUES (?)", (deppep,))
            cur.execute("SELECT id FROM deppep WHERE seq = (?)", (deppep,))
        deppep_id = cur.fetchone()[0]
        if add_to_trie:
            # print((deppep_id, deppep))
            # Build the trie
            auto.add_word(deppep, (deppep_id, deppep))
        cur.execute(
            "INSERT INTO ppep(seq, scrubbed, deppep_id) VALUES (?,?,?)",
            (ppep, scrubbed, deppep_id),
        )
    # def generate_deppep():
    #     cur.execute("SELECT seq FROM deppep")
    #     for row in cur.fetchall():
    #         yield row[0]

    # count(*) always yields exactly one row, so fetchone() is sufficient
    cur.execute("SELECT count(*) FROM (SELECT seq FROM deppep GROUP BY seq)")
    deppep_count = cur.fetchone()[0]

    cur.execute(
        "SELECT count(*) FROM (SELECT Sequence FROM UniProtKB GROUP BY Sequence)"
    )
    sequence_count = cur.fetchone()[0]

    print("%d phosphopeptides were read from input" % ppep_count)
    print(
        "%d corresponding dephosphopeptides are represented in input"
        % deppep_count
    )
    # Look for cases where both Gene_Name and Sequence are identical
    cur.execute(
        """
        SELECT Uniprot_ID, Gene_Name, Sequence
        FROM   UniProtKB
        WHERE  Sequence IN (
          SELECT   Sequence
          FROM     UniProtKB
          GROUP BY Sequence, Gene_Name
          HAVING   count(*) > 1
        )
        ORDER BY Sequence
        """
    )
    duplicate_count = 0
    old_seq = ""
    for row in cur.fetchall():
        if duplicate_count == 0:
            print(
                "\nEach of the following sequences is associated with several accession IDs (which are listed in the first column) but the same gene ID (which is listed in the second column)."
            )
        if row[2] != old_seq:
            # rows are ordered by Sequence, so a change of sequence
            # starts a new group of duplicates
            old_seq = row[2]
            duplicate_count += 1
            if options.warn_duplicates:
                print("\n%s\t%s\t%s" % row)
        else:
            if options.warn_duplicates:
                print("%s\t%s" % (row[0], row[1]))
    if duplicate_count > 0:
        print(
            "\n%d sequences have duplicated accession IDs\n" % duplicate_count
        )

    print("%s accession sequences will be searched\n" % sequence_count)

    # print(auto.dump())

    # Convert the trie to an automaton (a finite-state machine)
    auto.make_automaton()

    # Execute query for seqs and metadata without fetching the results yet
    uniprot_seq_and_id = cur.execute(UNIPROT_SEQ_AND_ID_SQL)
    while True:
        batch = uniprot_seq_and_id.fetchmany(size=50)
        if not batch:
            break
        for Sequence, UniProtKB_id in batch:
            if Sequence is not None:
                for end_index, (insert_order, original_value) in auto.iter(
                    Sequence
                ):
                    # insert_order is the deppep id stored with the word;
                    # end_index is the index of the match's last character
                    ker.execute(
                        """
                        INSERT INTO deppep_UniProtKB
                          (deppep_id,UniProtKB_id,pos_start,pos_end)
                        VALUES (?,?,?,?)
                        """,
                        (
                            insert_order,
                            UniProtKB_id,
                            1 + end_index - len(original_value),
                            end_index,
                        ),
                    )
            else:
                raise ValueError(
                    "UniProtKB_id %s, but Sequence is None: Check whether SwissProt file is missing sequence for this ID"
                    % (UniProtKB_id,)
                )
    ker.execute(
        """
        SELECT count(*) || ' accession-peptide-phosphopeptide combinations were found'
        FROM uniprotkb_pep_ppep_view
        """
    )
    for row in ker.fetchall():
        print(row[0])

    ker.execute(
        """
        SELECT count(*) || ' accession matches were found', count(*) AS accession_count
        FROM (
          SELECT accession
          FROM uniprotkb_pep_ppep_view
          GROUP BY accession
          )
        """
    )
    for row in ker.fetchall():
        print(row[0])

    ker.execute(
        """
        SELECT count(*) || ' peptide matches were found'
        FROM (
          SELECT peptide
          FROM uniprotkb_pep_ppep_view
          GROUP BY peptide
          )
        """
    )
    for row in ker.fetchall():
        print(row[0])

    ker.execute(
        """
        SELECT count(*) || ' phosphopeptide matches were found', count(*) AS phosphopeptide_count
        FROM (
          SELECT phosphopeptide
          FROM uniprotkb_pep_ppep_view
          GROUP BY phosphopeptide
          )
        """
    )
    for row in ker.fetchall():
        print(row[0])

    # link peptides not found in sequence database to a dummy sequence-record
    ker.execute(
        """
        INSERT INTO deppep_UniProtKB(deppep_id,UniProtKB_id,pos_start,pos_end)
          SELECT id, 'No Uniprot_ID', 0, 0
          FROM deppep
          WHERE id NOT IN (SELECT deppep_id FROM deppep_UniProtKB)
        """
    )

    con.commit()
    ker.execute("vacuum")
    con.close()


if __name__ == "__main__":
    wrap_start_time = time.perf_counter()
    __main__()
    wrap_stop_time = time.perf_counter()
    # print(wrap_start_time)
    # print(wrap_stop_time)
    print(
        "\nThe matching process took %d milliseconds to run.\n"
        % ((wrap_stop_time - wrap_start_time) * 1000),
    )

# vim: sw=4 ts=4 et ai :
--- a/test-data/pSTY_motifs.tabular Tue Mar 15 12:44:40 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,196 +0,0 @@ -1 ((E|D|A)(D|E)(E|D)(E|D)pS(E|D|A)(D|E|A)(E|D)(E|D))|(pS.(E|pS|pT))|(pS..(E|pS|pT))|((pS|pT)..(E|D))|(pS(D|E).(D|E).(D|E))|((D|E)pS(D|E).(D|E))|(pS(D|E)(D|E)(D|E))|((pS|pT)..(D|E))|((pS|pT)..(E|D|pS|pY))|((S|E|P|G)(D|S|N|E|P)(E|D|G|Q|W)(Y|E|D|S|W|T)(W|E|D)pS(D|E)(D|E|W|N)(E|D)(E|D|N|Q)) Casein Kinase II substrate motif (HPRD) -2 ((L|F|I)...R(Q|S|T)L(pS|pT)(M|L|I|V))|(..B.R..pS..)|(pS...(pS|pT)) MAPKAPK2 kinase substrate motif (HPRD) -3 ((M|V|L|I|F)(R|K|H)...(pS|pT)...(M|V|L|I|F))|((M|V|L|I)..(R|K|H).(pS|pT)...(M|V|L|I))|((M|V|L|I|F)(R|K|H)..(pS|pT)...(M|V|L|I|F)) AMP-activated protein kinase substrate motif (HPRD) -4 ((P|L|I|M).(L|I|D|E)pSQ)|(LpSQE)|(pSQ) ATM kinase substrate motif (HPRD) -5 ((R|K).R..(pS|pT)(M|L|V|I))|(VFLGFpTYVAP) p70 Ribosomal S6 kinase substrate motif (HPRD) -6 ((R|K).R..pS)|(RRR.pS) MAPKAPK1 kinase substrate motif (HPRD) -7 ((R|K)pSP(R|P)(R|K|H))|((pS|pT)P.(R|K))|(HHH(R|K)pSPR(R|K)R) Cdc2 kinase substrate motif (HPRD) -8 ((R|N)(F|L|M)(R|K)(R|K)pS(R|I|V|M)(R|I|M|V)(M|I|F|V)(I|F|M))|(FR.(pS|pT))|(RF(R|K)(R|K)pS(R|I)(R|I)MI) NIMA kinase substrate motif (HPRD) -9 ((pS|pT)P.(K|R))|((K|R)(pS|pT)P)|((pS|pT)P(K|R)) Growth associated histone HI kinase substrate motif (HPRD) -10 (..(pS|pT)E)|(.(pS|pT)...(A|P|S|T)) G protein-coupled receptor kinase 1 substrate motif (HPRD) -11 (.R..(pS|pT).R.)|((pS|pT).(R|K))|((R|K)..(pS|pT))|((R|K)..(pS|pT).(R|K))|((K|R).(pS|pT))|((R|K).(pS|pT).(R|K)) PKC kinase substrate motif (HPRD) -12 (.pSQ)|(P(pS|pT).) 
DNA dependent Protein kinase substrate motif (HPRD) -13 (AKRRRLSpSLRA)|(VRKRpTLRRL) PAK1 kinase substrate motif (HPRD) -14 (ARKGpSLRQ)|(R(R|F)RR(R|K)GpSF(R|K)(R|K)) PKC alpha kinase substrate motif (HPRD) -15 (HpSTSDD)|(YRpSVDE) Branched chain alpha-ketoacid dehydrogenase kinase substrate motif (HPRD) -16 (KCSpTWP)|(R..pS)|(R.R..pS.P)|(YpTV)|(RS.(pS|pT).P)|(R.(Y|F).pS.P)|(RPVSSAApSVY) 14-3-3 domain binding motif (HPRD) -17 (KK.RRpT(L|V).)|(KKR.RpT(L|V).)|((R|K).RR.(pS|pT)(L|V).) DMPK1 kinase substrate motif (HPRD) -18 (KKKKKK(pS|pT)...)|((R|K|Q|N)(M|C|W)(R|T|S|N)(E|D|S|N)(R|K|E|D|N)pS(S|D|E)(S|GC|D)(SM|R|N)(N|H|S|R|C)) TGF beta receptor kinase substrate motif (HPRD) -19 (KRKQIpSVR)|((F|M|K)(R|K)(M|R|Q|F)(M|F|L|I)pS(F|I|M|L)(F|R|K)(L|I)(F|L|I))|((K|R)..pS(V|I)) Phosphorylase kinase substrate motif (HPRD) -20 (KRQGpSVRR)|(R(K|E|R).pS) PKC epsilon kinase substrate motif (HPRD) -21 (P.(pS|pT)P)|(pSP) ERK1, ERK2 Kinase substrate motif (HPRD) -22 (P.(pS|pT)PP)|(..P.(pS|pT)PPP.) ERK1,2 kinase substrate motif (HPRD) -23 (PL(pS|pT)PIP(K|R|H))|(PL(pS|pT)P.(K|R|H)) CDK4 kinase substrate motif (HPRD) -24 (PLpTLP)|(PLLpTP)|(PLpTP)|(PpTLP)|(PLpTLP)|(PpTLP)|(LpTP) RAF1 kinase substrate motif (HPRD) -25 (R..(pS|pT))|((K|F)(R|K)(Q|M)(Q|M|K|L|F)pS(F|I|M|L|V)(D|E|I)(L|M|K|I)(F|K))|((M|V|L|I|F).(R|K)..(pS|pT)..)|(R..pS) Calmodulin-dependent protein kinase II substrate motif (HPRD) -26 (R..pSPV)|(K(pS|pT)P.K)|(KpSP...K)|(KpSP..K)|(KpSP....K)|(KpTPAKEE)|(P.pSP)|(.(pS|pT)P)|(..pSP) GSK-3, ERK1, ERK2, CDK5 substrate motif (HPRD) -27 (R.R..(pS|pT)(F|L))|(R.R..(pS|pT))|(GRART(S|T)pSFAE)|((R|Q|K)(R|K|N|Q|P|H)(R|K)(R|S|T)(N|K|Q|H|D|P)pS(F|W|I|M|N|S)(S|T|H)(R|S|K)(S|T|P|Q))|((R|K).(R|K)(S|T).pS) Akt kinase substrate motif (HPRD) -28 (RR..pS)|(KR.RpS)|(KRR.pT) ZIP kinase substrate motif (HPRD) -29 
(RR.pS(M|I|L|V|F|Y))|(R.pS)|(KR..pS)|(R..pS)|((R|K).(pS|pT))|(K..(pS|pT))|((R|K)(R|K).(pS|pT))|(K...(pS|pT))|((pS|pT).(R|K))|(RRRRpSIIFI)|(RR.pS)|(R(R|K).(pS|pT)(I|L|V|F|Y)(D|C|.).D)|(RR.pS)|(RRR(R|N)pSII(F|D))|((R|C|P|K)(R|A|P)(R|K)(R|K|S)(N|L|S|M|P)Ps(I|L|V|C)(S|P|H|Q)(S|W|Q)(S|L|G)) PKA kinase substrate motif (HPRD) -30 (RRFGpSBRRF)|(RRFGpS(M|L|V|I|F)RR(M|L|V|I|F)) MEKK kinase substrate motif (HPRD) -31 (VPGKARKKpSSCQLL)|(PLARTLpSVAGLP)|((M|I|L|V|F|Y).R..(pS|pT)) Calmodulin-dependent protein kinase IV substrate motif (HPRD) -32 (pSD.E)|(pS..(E|D)) Casein kinase II substrate motif (HPRD) -33 (pSP..(pS|pT))|((D|E)..(pS|pT))|((pS|pT)..(S|T))|((pS|pT)...(S|T)(M|L|V|I|F)) Casein Kinase I substrate motif (HPRD) -34 (pTP.K)|((K|H|G)H(H|P)(K|G|H)pSP(R|K)(H|R|K)(R|H|K))|((pS|pT)PG(pS|pT)PGTP) CDK5 kinase substrate motif (HPRD) -35 (R|K).R..pS...(R|K) AMP-activated protein kinase 2 substrate motif (HPRD) -36 (R|K|N)R.(pS|pT)(M|L|V|I) Aurora-A kinase substrate motif (HPRD) -37 (D|E)(pS|pT)... b-Adrenergic Receptor kinase substrate motif (HPRD) -38 (M|V|L|I|F).R..(pS|pT)...(M|V|L|I|F) Calmodulin-dependent protein kinase I substrate motif (HPRD) -39 (M|I|L|V|F|Y).R..(pS|pT)(M|I|L|V|F|Y) Calmodulin-dependent protein kinase II alpha substrate motif (HPRD) -40 E(F|E)D(T|A|G)GpSI(I|F|Y|G)(I|G|F)(F|G)(F|P|L) Casein Kinase I delta substrate motif (HPRD) -41 Y(Y|E)(D|Y)(A|D)(A|G)pSI(I|Y|F|G)(I|G|F)(F|G)(F|P|L) Casein Kinase I gamma substrate motif (HPRD) -42 P.(pS|pT)PKK.KK Cdc2 like protein kinase substrate motif (HPRD) -43 (pS|pT)P.(R|K) CDK1,2, 4, 6 kinase substrate motif (HPRD) -44 pSP.(R|K). 
CDK kinase substrate motif (HPRD) -45 (M|I|L|V).(R|K)..(pS|pT) Chk1 kinase substrate motif (HPRD) -46 R..(pS|pT)..R CLK1 kinase substrate motif (HPRD) -47 (R|K).(R|K).(R|K).pS..R CLK1,2 kinase substrate motif (HPRD) -48 R(R|H)(R|H)(R|E)RE(R|H)pSR(R|D)L CLK2 kinase substrate motif (HPRD) -49 R..(pS|pT)(L|V)R DMPK1,2 kinase substrate motif (HPRD) -50 R(R|K)R(E|R)R(E|A)(H|R)pSRR(R|D)(L|E) DOA/CDC-like kinase 2 substrate motif (HPRD) -51 (I|L|V|F|M)RR..(pS|pT)(I|L|M|V|F) Doublecortin kinase-1 kinase substrate motif (HPRD) -52 E.pS.R..R elF2 alpha kinase substrate motif (HPRD) -53 (T|P|S)(G|P|E|Y)(P|L|I)(L|M|P)pSP(G|P|F)(P|F|G|Y)(F|Y|I) ERK1 kinase substrate motif (HPRD) -54 pTEpY ERK1 Kinase substrate motif (HPRD) -55 KpSPP ERK1, ERK2, SAPK, CDK5 and GSK3 kinase substrate motif (HPRD) -56 (D|Y|W|E)(C)(P|S|C|E)(P|C|S|L|T|V)(L|M|T)pS(P|A)(T|S|G|R|C|F)(W|P|S)(W|F) ERK2 kinase substrate motif (HPRD) -57 pS...pS GSK3 kinase substrate motif (HPRD) -58 P.pTP GSK3, Erk1, Erk2 and CDK5 kinase motif (HPRD) -59 (M|L|V|I|F)(R|K|H)..pS...(M|L|V|I|F) HMGCoA Reductase kinase substrate motif (HPRD) -60 GP(Q|M)pSPI JNK1 Kinase substrate motif (HPRD) -61 LRpT LKB1 Kinase substrate motif (HPRD) -62 pT(G|P|E)pY MAPK 11,13,14 Kinase substrate motif (HPRD) -63 KKR..pS.(R|K)(R|K) MLCK kinase substrate motif (HPRD) -64 FpTY mTOR kinase substrate motif (HPRD) -65 IRRLpSTRRR Nek 2 kinase substrate motif (HPRD) -66 (R|K)(R|.).(pS|pT) PAK2 kinase substrate motif (HPRD) -67 F..F(pS|pT)(F|Y) PDK1 kinase substrate motif (HPRD) -68 (R|K)(R|K)(R|K).(pS|pT). 
Pim1 kinase substrate sequence (HPRD) -69 (R|K)(R|K|A|Q|P)(R|K)(R|Q|H|N|Y)(P|H|K)pS(G|S|T)(P|S|G|Q|H|S|T)(S|P|Q|G|D)(T|S|P|G) Pim2 kinase substrate sequence (HPRD) -70 R(R|K).(pS|pT)B PKA, PKG kinase substrate motif (HPRD) -71 (L|R|F)(R|K)R(K|Q)GpS(F|M)KK.A PKC beta kinase substrate motif (HPRD) -72 R.RKGpSF PKC delta kinase substrate motif (HPRD) -73 AR..R(R|K)RpSFRR PKC eta kinase substrate motif (HPRD) -74 F..F(pS|pT)(F|Y) PKC family kinase substrate motif (HPRD) -75 RRRK(G|K)SF(R|K)(R|K)KA PKC gamma kinase substrate motif (HPRD) -76 (L|V)(V|L|A)R(Q|K|E)MpS PKC mu kinase substrate motif (HPRD) -77 (R|F|W|M)(W|A|K|S)(R|S|K|H)(R|H|S|Q)(R|K|N|P|G|Q)pS(I|F|R|V|K|S|L|M)(K|M|R|S|T)(R|S|K|W)(R|K|G) PKC theta kinase substrate motif (HPRD) -78 F.R..pS(F|M)(F|M) PKC zeta kinase substrate motif (HPRD) -79 (L|V|I)(R|K|Q)(R|K)(R|K|T|Q|M)(N|K|R|L|M|H)pS(F|W|I|M|L|V)(S|N)(R|S|P|Y|W)(S|R|N|L) PKD kinase substrate motif (HPRD) -80 R(R|K).(pS|pT)B PKG kinase substrate motif (HPRD) -81 R..(pS|pT).R..R PKR kinase substrate motif (HPRD) -82 (D|E).(pS|pT)(I|L|V|M).(D|E) Plk1 kinase substrate motif (HPRD) -83 .pS..D.. Pyruvate dehydrogenase kinase substrate motif (HPRD) -84 pTEY Dual specificity protein phosphatase 1 substrate motif (HPRD) -85 pT.pY Dual specificity protein phosphatase 6 substrate motif (HPRD) -86 RRA(pS|pT)VA PP2A, PP2C substrate motif (HPRD) -87 .R..pSVA PP2B substrate motif (HPRD) -88 .pT.pY. PP2C delta substrate motif (HPRD) -89 pS(D|E)(D|E)E BARD1 BRCT domain binding motif (HPRD) -90 DpSG..pS Beta-TrCP1 domain binding motif (HPRD) -91 pS(F|Y|H)(V|F|Y)(F|Y) BRCA1 BRCT domain binding motif (HPRD) -92 (I|L)(I|L|P)pTP(R|K) CDC4 WD40 domain binding motif (HPRD) -93 HFDpTYLI Chk2 FHA domain binding motif (HPRD) -94 (R|D|H)(L|Y)(L|M)(K|A)pT(Q|L|M|E|V)(K|L|I|R) FHA domain binding motif (HPRD) -95 S(pS|pT). MDC1 BRCT domain binding motif (HPRD) -96 S(pS|pT). 
Plk1 PBD domain binding motif (HPRD) -97 pSYII RAD9 BRCT domain binding motif (HPRD) -98 (pS|pT)P WW domain binding motif (HPRD) -99 ((pS|pT)P.(K|R))|((pS|pT)P(K|R)) CDK1_Phosida -100 (P.(pS|pT)P)|(V.(pS|pT)P)|(PE(pS|pT)P) ERK/MAPK_Phosida -101 (R(R|S|T).(pS|pT).(S|T))|(R.R..(pS|pT)) PKB/AKT_Phosida -102 (R.(pS|pT))|(R(R|K).(pS|pT))|(KR..(pS|pT)) PKA_Phosida -103 (R..(pS|pT))|(R..(pS|pT)V) CAMK2_Phosida -104 (S..(pS|pT))|((S|T)...pS) CK1_Phosida -105 (pS|pT)..E CK2_Phosida -106 pS...S GSK3_Phosida -107 (pS|pT)P.(K|R) CDK2_Phosida -108 R..(pS|pT).R PKC_Phosida -109 (L|V|I).(R|K)..(pS|pT) PKD_Phosida -110 (I|E|V)pY(E|G)(E|D|P|N)(I|V|L) LCK_Phosida -111 (I|V|L)pY..(P|F) ABL_Phosida -112 (E|D)..pY..(D|E|A|G|S|T) SRC_Phosida -113 pY..(I|L|V|M) ALK_Phosida -114 (D|P|S|A|E|N).pY(V|L|D|E|I|N|P) EGFR_Phosida -115 (R|K).(pS|pT)(I|L|V) AURORA_Phosida -116 (R|K|N)R.(pS|pT)(M|L|V|I) AURORA-A_Phosida -117 (D|E).(pS|pT)(V|I|L|M).(D|E) PLK_Phosida -118 (E|D).(pS|pT)(F|L|I|Y|W|V|M) PLK1_Phosida -119 L..(pS|pT) NEK6_Phosida -120 L.R..(pS|pT) CHK1/2_Phosida -121 (M|I|L|V).(R|K)..(pS|pT) CHK1_Phosida -122 F..F(pS|pT)(F|Y) PDK1_Phosida -123 (F|L|M)(R|K)(R|K)(pS|pT) NIMA_Phosida -124 ((D|E)(D|E)...pYVA)|((E|D|Y)pY) TC-PTP phosphatase substrate motif (HPRD) -125 ((D|E).(L|I|V).pY..(L|I|V))|((D|E).(L|I|V)..pY..(L|I|V))|((D|E)(D|E)(D|E|L).pY..(F|M|L|V|I)(D|E))|((D|E).pY)|((E|P)(F|I|L)pYA.(F|I|L|V)) SHP1 phosphatase substrate motif (HPRD) -126 ((D|E).......(D|E)..pY..L.......Y..(L|I))|((I|V|L|S).pY..(L|I)) Src family kinase substrate motif (HPRD) -127 ((D|E)pYpY(R|K))|(EFpY(G|A)TY(G|A))|(E(Y|F|D)pYM)|((E|P)(M|L|I|V|F)pY(G|A).(M|L|I|V|F|Y)A)|(RD.Y.TDYpYR)|(E(F|D|Y)pY) PTP1B phosphatase substrate motif (HPRD) -128 ((H|F).V.(T|S|A)pY)|((I|V|L).pY(F|M).P)|(pY(I|V).(I|V))|((I|L|V|M).pY(T|V|A).(I|V|L|F))|((I|V).pY(L|M|T)Y(A|P|T)SG)|(W(M|T|V)pY(Y|R)(I|L).) 
SHP2 N-terminal SH2 domain binding motif (HPRD) -129 ((V|I|L).pYA.(L|V))|(..pYYM(K|R)) SHP1 C-terminal SH2 domain binding motif (HPRD) -130 (.E.IpYGVLF)|(E.(I|V|L|F)pY(G|A)V(L|V|F|I)(F|L|V|I)) Lck kinase substrate motif (HPRD) -131 (DEEIpY(E|G)EL.)|((D|E).......(D|E)..pY..L.......Y..(L|I)) Lyn kinase substrate motif (HPRD) -132 (EE(D|E)IpYFFFF)|(...IpY(M|I|F)FFF) CSK kinase substrate motif (HPRD) -133 (EEEEpYFELV)|((E|D|R|A)(D|E)(D|E)(E|D|I)pY(F|V|I|E)(E|F|D)(L|I|F|V)V)|(.(D|E)pY.)|(pYIPP)|(.(D|E)pY(I|L|V)) EGFR kinase substrate motif (HPRD) -134 (EEEEpYVFI.)|((L|N)(R|I)TpY)|((D|E)(D|E)(D|E)(D|E)pY(V|E|I)F(I|V|F)) PDGFR kinase substrate motif (HPRD) -135 (EEEIpYEEIE)|((E|A|D)(E|A)(E|A)(I|E|V)pY(D|E)(D|E)(I|V|E)(E|I|V)) Fes kinase substrate motif (HPRD) -136 (EEEpYFFLF)|(A(E|A)EEpY(F|V)F(L|F|M|I|V)F) FGFR kinase substrate motif (HPRD) -137 (L(Y|H)pY(M|F).(F|M))|(L.pYA.L) SHP1 N-terminal SH2 domain binding motif (HPRD) -138 (pY(M|L|E)EP)|(pYESP) Vav SH2 domain binding motif (HPRD) -139 (pY(Y|I|V)N(F|L|I|V))|(pY(Q|Y|V)N(Y|Q|F))|(pY.N) Grb2 SH2 domain binding motif (HPRD) -140 (pY..P)|(pYDHP) Crk SH2 domain binding motif (HPRD) -141 (pY..Q)|(pY(M|L|V|I|F)(P|R|K|H)Q) STAT3 SH2 domain binding motif (HPRD) -142 (pY..YY)|(pY(D|E).(I|L|V|M))|((D|E)..pY)|(pY....(F|Y)) ALK kinase substrate motif (HPRD) -143 (pYIDL)|(pYASI)|(EFpYA.(V|I)G(R|K|H)S) SHP2 phosphatase substrate motif (HPRD) -144 (pYM.M)|(EDAIpY)|(.VIpYAAPF)|(EAIpYAAPF)|(EEIpYEEpY)|(E.IpY..P.)|(EEIpYYYVH)|(ERIpYARTK)|(AEV(I|V|L|F)pYAA(P|F)F) Abl kinase substrate motif (HPRD) -145 (pYM.M)|(EE(E|N|D)pY(M|F)(M|F)(M|F|I|E)(M|F))|(.EEEpYMMMM)|(KKSRGDpYMTMQIG)|(KKKLPATGDpYMNMSPVGD) Insulin receptor kinase substrate motif (HPRD) -146 (pYM.M)|(YIpYGSFK)|(EEEIpY(G|E)EFD)|(D(D|E)(E|D|G)(I|V|L)pY(G|E)E(F|I)F)|((D|E).......(D|E)..pY..L.......Y..(L|I))|((D|E)(D|E)(E|D|G)(I|V|L)pY(G|E|D)E(F|I|L|V)(D|E))|(pY(A|G|S|T|D|E)) Src kinase substrate motif (HPRD) -147 (pYM.M)|(pY..M)|(pYMPMS) PI3 Kinase p85 SH2 domain binding motif (HPRD) 
-148 ME(E|N)(I|V)pY(G|E)IFF Fgr kinase substrate motif (HPRD) -149 KKKSPGEpYVNIEFG IGF1 receptor kinase substrate motif (HPRD) -150 pY..(L|I|V) JAK2 kinase substrate motif (HPRD) -151 pTPpY JNK kinase substrate motif (HPRD) -152 (E|D|pT|pY).pYEE Syk kinase substrate motif (HPRD) -153 DpYpYR PTP1B, TC-PTP phosphatase substrate motif (HPRD) -154 (D|E)FpY(G|A)(F|Y)(A|G) PTPRH phosphatase substrate motif (HPRD) -155 F(M|L|V|I)pY PTPRJ phosphatase substrate motif (HPRD) -156 pY(E|M|V)(N|V|I) 3BP2 SH2 domain binding motif (HPRD) -157 pYENP Abl SH2 domain binding motif (HPRD) -158 pY(T|A|S)(K|R|Q|N)(M|I|V|R) Csk SH2 domain binding motif (HPRD) -159 pYE.(V|I) Fes SH2 domain binding motif (HPRD) -160 pYEE(I|V) Fgr SH2 domain binding motif (HPRD) -161 pYEDP Fyn SH2 domain binding motif (HPRD) -162 pY(M|I|L|V).(M|I|L|V) GRB2, 3BP2, Csk, Fes, Syk C-terminal SH2 domain binding motif (HPRD) -163 (F|Y)pY(E|T|Y|S)N(I|L|V|P|T|Y|S) GRB7, GRB10 SH2 domain binding motif (HPRD) -164 pYF.(F|P|L|Y) HCP SH2 domain binding motif (HPRD) -165 pY(A|E|V)(Y|F|E|S|N|V)(P|F|I|H) Itk SH2 domain binding motif (HPRD) -166 pYDYV Lck and Src SH2 domain binding motif (HPRD) -167 pYDEP Nck SH2 domain binding motif (HPRD) -168 pY(L|I|V)E(L|I|V) PLCgamma C and N-terminal SH2 domain binding motif (HPRD) -169 pY..P RasGAP C-terminal SH2 domain binding motif (HPRD) -170 pYILV.(M|L|I|V|P) RasGAP N-terminal SH2 domain binding motif (HPRD) -171 TIpY..(V|I) SAP and EAT2 SH2 domain binding motif (HPRD) -172 pY(L|V)N(V|P) Sem5 SH2 domain binding motif (HPRD) -173 pY(T|V|I).L Shb SH2 domain binding motif (HPRD) -174 pY(I|E|Y|L).(I|L|M) SHC SH2 domain binding motif (HPRD) -175 (I|V|L|S).pY..(L|I) SHIP2 SH2 domain binding motif (HPRD) -176 (I|V).pY..(L|V) SHP1 SH2 domain binding motif (HPRD) -177 (V|I|L).pY(M|L|F).P SHP1, SHP2 SH2 domain binding motif (HPRD) -178 (T|V|I|Y).pY(A|S|T|V).(I|V|L) SHP2 CSH2 domain binding motif (HPRD) -179 (I|L|V)(I|L|V)(I|L|V|F|T|Y)pY(T|I|L|V)(I|L)(I|L|V|P) SHP2 C-terminal SH2 domain 
binding motif (HPRD) -180 pYIPP SHP2, PLCgamma SH2 domain binding motif (HPRD) -181 pYM.M Src and Abl SH2 domain binding motif (HPRD) -182 pY(R|K|H|Q|E|D)(R|K|H|Q|E|D)(I|P) Src, Fyn, Lck, Fgr, Abl, Crk, Nck SH2 domain binding motif (HPRD) -183 PP.pY Src, Fyn,Csk, Nck and SHC SH2 domain binding motif (HPRD) -184 pYEEI Src,Lck and Fyn SH2 domains binding motif (HPRD) -185 pY(D|E)(P|R)(R|P|Q) STAT1 SH2 domain binding motif (HPRD) -186 pY(Q|T|E)(E|Q)(L|I) Syk C-terminal SH2 domain binding motif (HPRD) -187 pYTT(I|L|M) Syk N-terminal SH2 domain binding motif (HPRD) -188 (D|E).......(D|E)..pY..L.......Y..(L|I) Syk, ZAP-70, Shc, Lyn SH2 domain binding motif (HPRD) -189 pYEN(F|I|V) Tensin SH2 domain binding motif (HPRD) -190 D(N|D).pY Cbl PTB domain binding motif (HPRD) -191 N.LpY Dok1 PTB domain binding motif (HPRD) -192 N..pY FRIP PTB domain binding motif (HPRD) -193 NP.pY Shc PTB domain binding motif (HPRD) -194 DD.pY Shb PTB domain binding motif (HPRD) -195 NP.pYF.R ShcA PTB domain binding motif (HPRD) -196 HN(M|L|V|I)(M|L|V|I|N)NP(S|T)pY ShcC PTB domain binding motif (HPRD)
--- a/test-data/test_input_for_preproc.tabular Tue Mar 15 12:44:40 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,38 +0,0 @@ -Proteins Positions within proteins Leading proteins Protein Fasta headers Localization prob Score diff PEP Score Delta score Score for localization Localization prob shL.1A Score diff shL.1A PEP shL.1A Score shL.1A Localization prob shL.1B Score diff shL.1B PEP shL.1B Score shL.1B Localization prob shL.1C Score diff shL.1C PEP shL.1C Score shL.1C Localization prob shR.2A Score diff shR.2A PEP shR.2A Score shR.2A Localization prob shR.2B Score diff shR.2B PEP shR.2B Score shR.2B Localization prob shR.2C Score diff shR.2C PEP shR.2C Score shR.2C Diagnostic peak Number of Phospho (STY) Amino acid Sequence window Modification window Peptide window coverage Phospho (STY) Probabilities Phospho (STY) Score diffs Position in peptide Charge Mass error [ppm] Identification type shL.1A Identification type shL.1B Identification type shL.1C Identification type shR.2A Identification type shR.2B Identification type shR.2C Intensity Intensity___1 Intensity___2 Intensity___3 Ratio mod/base Intensity shL.1A Intensity shL.1B Intensity shL.1C Intensity shR.2A Intensity shR.2B Intensity shR.2C Ratio mod/base shL.1A Ratio mod/base shL.1B Ratio mod/base shL.1C Ratio mod/base shR.2A Ratio mod/base shR.2B Ratio mod/base shR.2C Intensity shL.1A___1 Intensity shL.1A___2 Intensity shL.1A___3 Intensity shL.1B___1 Intensity shL.1B___2 Intensity shL.1B___3 Intensity shL.1C___1 Intensity shL.1C___2 Intensity shL.1C___3 Intensity shR.2A___1 Intensity shR.2A___2 Intensity shR.2A___3 Intensity shR.2B___1 Intensity shR.2B___2 Intensity shR.2B___3 Intensity shR.2C___1 Intensity shR.2C___2 Intensity shR.2C___3 Occupancy shL.1A Occupancy ratioshL.1A Occupancy error scale shL.1A Occupancy shL.1B Occupancy ratioshL.1B Occupancy error scale shL.1B Occupancy shL.1C Occupancy ratioshL.1C Occupancy error scale shL.1C Occupancy shR.2A Occupancy ratioshR.2A Occupancy error 
scale shR.2A Occupancy shR.2B Occupancy ratioshR.2B Occupancy error scale shR.2B Occupancy shR.2C Occupancy ratioshR.2C Occupancy error scale shR.2C Reverse Potential contaminant id Protein group IDs Positions Position Peptide IDs Mod. peptide IDs Evidence IDs MS/MS IDs Best localization evidence ID Best localization MS/MS ID Best localization raw file Best localization scan number Best score evidence ID Best score MS/MS ID Best score raw file Best score scan number Best PEP evidence ID Best PEP MS/MS ID Best PEP raw file Best PEP scan number -sp|O43768-2|ENSA_HUMAN;sp|O43768|ENSA_HUMAN;sp|O43768-9|ENSA_HUMAN;sp|O43768-3|ENSA_HUMAN;sp|O43768-4|ENSA_HUMAN;sp|O43768-6|ENSA_HUMAN;sp|O43768-5|ENSA_HUMAN;sp|O43768-7|ENSA_HUMAN 108;108;124;124;131;104;104;120 sp|O43768-2|ENSA_HUMAN sp|O43768-2|ENSA_HUMAN 0.877317 8.54376 0.001041 110.11 55.028 110.11 1 S TGDHIPTPQDLPQRKSSLVTSKLAG______ X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXXXPPPPPPPPXXXXXXXXX KS(0.877)S(0.123)LVTSK KS(8.54)S(-8.54)LVT(-58.58)S(-72.01)K 2 2 0.022801 By MS/MS 18629000 18629000 0 0 0 0 18629000 0 0 0 0 0 0 0 0 0 18629000 0 0 0 0 0 0 0 0 0 0 0 700 529 108 108 12310;20039 13742;22688 99166 91729 99166 91729 QE05099 5593 99166 91729 QE05099 5593 99166 91729 QE05099 5593 -sp|O43768-2|ENSA_HUMAN;sp|O43768|ENSA_HUMAN;sp|O43768-9|ENSA_HUMAN;sp|O43768-3|ENSA_HUMAN;sp|O43768-4|ENSA_HUMAN;sp|O43768-6|ENSA_HUMAN;sp|O43768-5|ENSA_HUMAN;sp|O43768-7|ENSA_HUMAN 109;109;125;125;132;105;105;121 sp|O43768-2|ENSA_HUMAN sp|O43768-2|ENSA_HUMAN 0.877764 9.23011 0.00135208 98.182 25.939 55.754 1 S GDHIPTPQDLPQRKSSLVTSKLAG_______ X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXXPPPPPPPPXXXXXXXXXX KS(0.105)S(0.878)LVT(0.015)S(0.002)K KS(-9.23)S(9.23)LVT(-17.65)S(-25.69)K 3 2 -0.061619 By MS/MS By MS/MS By matching By matching By matching By MS/MS 81973000 81973000 0 0 7090300 8341200 9691500 10030000 1675200 9952100 7090300 0 0 8341200 0 0 9691500 
0 0 10030000 0 0 1675200 0 0 9952100 0 0 701 529 109 109 12310;20039 13742;22688 99164;99165;99168;99169;160369;160370;160371;160372;160373;160374 91727;91728;91731;142479 99164 91727 QE05097 5219 99167 91730 QE05100 5516 99167 91730 QE05100 5516 -CON__P02662 46 CON__P02662 CON__P02662 0.99978 36.4544 1.10E-08 122.19 116.48 122.19 2 S VFGKEKVNELSKDIGSESTEDQAMEDIKQME X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;Phospho (STY);X;X;X;X;X;Oxidation (M);X;X;X;X;X;X;X XXXXXXXXXXXXPPPPPPPPPPPPPPPPXXX DIGS(1)ES(0.972)T(0.029)EDQAMEDIK DIGS(36.45)ES(15.33)T(-15.33)EDQAMEDIK 4 2 0.56139 By MS/MS By MS/MS By MS/MS 49187000 0 49187000 0 NaN 16494000 0 20139000 0 0 12553000 NaN NaN NaN NaN NaN NaN 0 16494000 0 0 0 0 0 20139000 0 0 0 0 0 0 0 0 12553000 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN + 2 14 46 46 3452 3862;3863 27864;27865;27866;27867 25820;25821;25822;25823 27865 25821 QE05099 36641 27865 25821 QE05099 36641 27865 25821 QE05099 36641 -CON__P02662 48 CON__P02662 CON__P02662 0.971522 15.3284 1.10E-08 122.19 116.48 122.19 2 S GKEKVNELSKDIGSESTEDQAMEDIKQMEAE X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;Phospho (STY);X;X;X;X;X;Oxidation (M);X;X;X;X;X;X;X;X;X XXXXXXXXXXPPPPPPPPPPPPPPPPXXXXX DIGS(1)ES(0.972)T(0.029)EDQAMEDIK DIGS(36.45)ES(15.33)T(-15.33)EDQAMEDIK 6 2 0.56139 By MS/MS By MS/MS By MS/MS 49187000 0 49187000 0 NaN 16494000 0 20139000 0 0 12553000 NaN NaN NaN NaN NaN NaN 0 16494000 0 0 0 0 0 20139000 0 0 0 0 0 0 0 0 12553000 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN + 3 14 48 48 3452 3862;3863 27864;27865;27866;27867 25820;25821;25822;25823 27865 25821 QE05099 36641 27865 25821 QE05099 36641 27865 25821 QE05099 36641 -CON__P02662 115 CON__P02662 CON__P02662 1 50.1781 4.91E-07 124.08 88.205 50.178 1 S RLKKYKVPQLEIVPNSAEERLHSMKEGIHAQ X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXPPPPPPPPPPPPPPXXXXXXXXXXX VPQLEIVPNS(1)AEER VPQLEIVPNS(50.18)AEER 10 3 -0.26085 By MS/MS By 
matching By MS/MS By matching By matching By MS/MS 228160000 228160000 0 0 NaN 36938000 3667100 7945800 0 2359500 8418700 NaN NaN NaN NaN NaN NaN 36938000 0 0 3667100 0 0 7945800 0 0 0 0 0 2359500 0 0 8418700 0 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN + 4 14 115 115 23142 26196 185609;185610;185611;185612;185613;185614;185615 165233;165234;165235;165236 185612 165236 QE05102 41518 185610 165234 QE05097 41110 185610 165234 QE05097 41110 -sp|O43768-2|ENSA_HUMAN;sp|O43768|ENSA_HUMAN;sp|O43768-9|ENSA_HUMAN;sp|O43768-3|ENSA_HUMAN;sp|O43768-4|ENSA_HUMAN;sp|O43768-8|ENSA_HUMAN 2;2;2;2;2;2 sp|O43768-2|ENSA_HUMAN sp|O43768-2|ENSA_HUMAN 1.0 73.249 3.69e-06 83.395 74.925 83.395 1 S ______________MSQKQEEENPAEETGEE X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXXXXPPPPPPPPPPPPPPPP S(1)QKQEEENPAEETGEEK S(73.25)QKQEEENPAEET(-73.25)GEEK 1 2 -0.84902 By matching By matching By MS/MS 25828000 25828000 0 0 0 0 8765300 0 2355900 14706000 0 0 0 0 0 0 8765300 0 0 0 0 0 2355900 0 0 14706000 0 0 702 529 2 2 19781 22398 158249;158250;158251 140920 158249 140920 QE05102 12907 158249 140920 QE05102 12907 158249 140920 QE05102 12907 -sp|O43768-2|ENSA_HUMAN;sp|O43768|ENSA_HUMAN;sp|O43768-9|ENSA_HUMAN;sp|O43768-3|ENSA_HUMAN;sp|O43768-4|ENSA_HUMAN;sp|O43768-6|ENSA_HUMAN;sp|O43768-5|ENSA_HUMAN;sp|O43768-7|ENSA_HUMAN;sp|P56211-2|ARP19_HUMAN;sp|P56211|ARP19_HUMAN 67;67;83;83;90;63;63;79;46;62 sp|O43768-2|ENSA_HUMAN;sp|P56211-2|ARP19_HUMAN sp|O43768-2|ENSA_HUMAN 0.999907 42.1841 4.04e-05 77.894 72.756 77.894 1 S DFLMKRLQKGQKYFDSGDYNMAKAKMKNKQL;DFLRKRLQKGQKYFDSGDYNMAKAKMKNKQL X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXPPPPPPPPPPPXXXXXXXX YFDS(1)GDYNMAK Y(-44.9)FDS(42.18)GDY(-42.18)NMAK 4 2 0.090313 By MS/MS By MS/MS By matching By MS/MS By MS/MS 602510000 602510000 0 0 323250000 127970000 0 67123000 12790000 71378000 323250000 0 0 127970000 0 0 0 0 0 67123000 0 0 12790000 0 0 71378000 0 
0 703 529;2007 67;46 67 23817 26932 190543;190544;190545;190546;190547 169398;169399;169400;169401 190543 169398 QE05097 28697 190543 169398 QE05097 28697 190543 169398 QE05097 28697 -sp|O95714|HERC2_HUMAN;sp|Q9BVR0|HRC23_HUMAN 1577;304 sp|O95714|HERC2_HUMAN sp|O95714|HERC2_HUMAN 1.0 100.152 1.12e-15 100.15 94.415 100.15 2 S KPESTDDEEKIGNEESDLEEACILPHSPINV X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X XXXXXXXXXXPPPPPPPPPPPPPPPPPPPPP IGNEES(1)DLEEACILPHS(1)PINVDK IGNEES(100.15)DLEEACILPHS(100.15)PINVDK 6 3 -0.31776 By matching By matching By matching By matching By MS/MS By MS/MS 398730000 0 398730000 0 83882000 60609000 77868000 70320000 41821000 64234000 0 83882000 0 0 60609000 0 0 77868000 0 0 70320000 0 0 41821000 0 0 64234000 0 1295 867 1577 1577 11517 12858 93270;93271;93272;93273;93274;93275 86700;86701 93271 86701 QE05102 51298 93271 86701 QE05102 51298 93271 86701 QE05102 51298 -sp|O95714|HERC2_HUMAN;sp|Q9BVR0|HRC23_HUMAN 1588;315 sp|O95714|HERC2_HUMAN sp|O95714|HERC2_HUMAN 1.0 100.152 1.12e-15 100.15 94.415 100.15 2 S GNEESDLEEACILPHSPINVDKRPIAIKSPK X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X PPPPPPPPPPPPPPPPPPPPPPXXXXXXXXX IGNEES(1)DLEEACILPHS(1)PINVDK IGNEES(100.15)DLEEACILPHS(100.15)PINVDK 17 3 -0.31776 By matching By matching By matching By matching By MS/MS By MS/MS 398730000 0 398730000 0 83882000 60609000 77868000 70320000 41821000 64234000 0 83882000 0 0 60609000 0 0 77868000 0 0 70320000 0 0 41821000 0 0 64234000 0 1296 867 1588 1588 11517 12858 93270;93271;93272;93273;93274;93275 86700;86701 93271 86701 QE05102 51298 93271 86701 QE05102 51298 93271 86701 QE05102 51298 -sp|O95714|HERC2_HUMAN 2928 sp|O95714|HERC2_HUMAN sp|O95714|HERC2_HUMAN 1.0 44.9549 6.81e-12 84.285 78.578 44.955 1 S IRAEEEDLAAVPFLASDNEEEEDEKGNSGSL X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X PPPPPPPPPPPPPPPPPPPPPPPPPXXXXXX IRAEEEDLAAVPFLAS(1)DNEEEEDEK 
IRAEEEDLAAVPFLAS(44.95)DNEEEEDEK 16 3 -0.24823 By MS/MS By MS/MS By matching By matching 61597000 61597000 0 0 22562000 18225000 9119700 11689000 0 0 22562000 0 0 18225000 0 0 9119700 0 0 11689000 0 0 0 0 0 0 0 0 1297 867 2928 2928 11904 13281 96043;96044;96045;96046 89048;89049 96044 89049 QE05098 52942 96043 89048 QE05097 52381 96043 89048 QE05097 52381 -sp|O95714|HERC2_HUMAN 1938 sp|O95714|HERC2_HUMAN sp|O95714|HERC2_HUMAN 0.427104 0.0 4.17e-06 44.164 42.292 44.164 S KYDLKLAELPAAAQPSAEDSDTEDDSEAEQT X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXPPPPPPPPPPPPPPPPPPPPPPPPPP LAELPAAAQPS(0.427)AEDS(0.427)DT(0.142)EDDS(0.003)EAEQTER LAELPAAAQPS(0)AEDS(0)DT(-4.78)EDDS(-20.87)EAEQT(-37.92)ER 11 3 -1.2171 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1298 867 1938 1938 12395 13829 99721 92163 QE05099 31358 99721 92163 QE05099 31358 99721 92163 QE05099 31358 -sp|O95714|HERC2_HUMAN 1942 sp|O95714|HERC2_HUMAN sp|O95714|HERC2_HUMAN 0.427104 0.0 4.17e-06 44.164 42.292 44.164 S KLAELPAAAQPSAEDSDTEDDSEAEQTERNI X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX LAELPAAAQPS(0.427)AEDS(0.427)DT(0.142)EDDS(0.003)EAEQTER LAELPAAAQPS(0)AEDS(0)DT(-4.78)EDDS(-20.87)EAEQT(-37.92)ER 15 3 -1.2171 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1299 867 1942 1942 12395 13829 99721 92163 QE05099 31358 99721 92163 QE05099 31358 99721 92163 QE05099 31358 -sp|O95714|HERC2_HUMAN 3462 sp|O95714|HERC2_HUMAN sp|O95714|HERC2_HUMAN 1.0 41.1171 0.0267288 41.117 33.02 41.117 1 S NGEECMLAVDIEDRLSPNPWQEKREIVSSED X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXXXPPPPPPPPPXXXXXXXX LS(1)PNPWQEK LS(41.12)PNPWQEK 2 2 0.64603 By matching By MS/MS By matching By matching 40352000 40352000 0 0 0 11706000 12495000 0 7273000 8877800 0 0 0 11706000 0 0 12495000 0 0 0 0 0 7273000 0 0 8877800 0 0 1300 867 3462 3462 14140 15756 112737;112738;112739;112740 102778 112737 102778 QE05099 
28079 112737 102778 QE05099 28079 112737 102778 QE05099 28079 -sp|Q08945|SSRP1_HUMAN 667 sp|Q08945|SSRP1_HUMAN sp|Q08945|SSRP1_HUMAN 0.824557 6.72928 2.29e-05 88.385 80.253 88.385 1 S SSRQLSESFKSKEFVSSDESSSGENKSKKKR X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXPPPPPPPPPPPPPPXXXXX EFVS(0.825)S(0.175)DESSSGENK EFVS(6.73)S(-6.73)DES(-34.1)S(-47.3)S(-52.91)GENK 4 2 -0.31453 By MS/MS By MS/MS By MS/MS By MS/MS By MS/MS 78553000 78553000 0 0 12562000 16302000 23000000 7857800 0 18830000 12562000 0 0 16302000 0 0 23000000 0 0 7857800 0 0 0 0 0 18830000 0 0 3469 2387 667 667 6499 7276 53820;53821;53822;53823;53824 51145;51146;51147;51148;51149 53820 51145 QE05097 12983 53820 51145 QE05097 12983 53820 51145 QE05097 12983 -sp|Q08945|SSRP1_HUMAN 444 sp|Q08945|SSRP1_HUMAN sp|Q08945|SSRP1_HUMAN 0.999939 44.165 7.94e-20 97.469 93.771 97.469 1 S GLKEGMNPSYDEYADSDEDQHDAYLERMKEE X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXPPPPPPPPPPPPPPPPPPPPPPPPXXXX EGMNPSYDEYADS(1)DEDQHDAYLER EGMNPS(-49.21)Y(-49.82)DEY(-44.17)ADS(44.17)DEDQHDAY(-90.19)LER 13 3 0.19918 By MS/MS By MS/MS 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3470 2387 444 444 6658 7448 55048;55049 52320;52321 55048 52320 QE05099 31926 55048 52320 QE05099 31926 55048 52320 QE05099 31926 -sp|Q08945|SSRP1_HUMAN 659 sp|Q08945|SSRP1_HUMAN sp|Q08945|SSRP1_HUMAN 0.999878 39.1416 0.00235198 117.7 65.216 117.7 1 S SRGSSSKSSSRQLSESFKSKEFVSSDESSSG X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X XXXXXXXXXXXPPPPPPPXXXXXXXXXXXXX QLSES(1)FK QLS(-39.14)ES(39.14)FK 5 2 0.14738 By MS/MS By MS/MS By MS/MS By MS/MS By matching By MS/MS 558700000 558700000 0 0 68201000 87774000 138300000 95357000 19966000 149110000 68201000 0 0 87774000 0 0 138300000 0 0 95357000 0 0 19966000 0 0 149110000 0 0 3471 2387 659 659 16873 19002 134380;134381;134382;134383;134384;134385 120469;120470;120471;120472;120473 134381 120470 
QE05098 17736 134381 120470 QE05098 17736 134381 120470 QE05098 17736 -sp|Q15751|HERC1_HUMAN 3446 sp|Q15751|HERC1_HUMAN sp|Q15751|HERC1_HUMAN 0.999981 47.2167 0.0187791 47.548 7.8172 47.548 2 S VMTCVWCNKKGLLATSGNDGTIRVWNVTKKQ X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXPPPPPPPPPPPPPPXXXXXXXX KGLLAT(1)S(1)GNDGTIR KGLLAT(47.2)S(47.22)GNDGT(-47.2)IR 7 2 -0.95722 By matching By MS/MS By matching 129800000 0 129800000 0 3921800 0 120850000 0 0 5021300 0 3921800 0 0 0 0 0 120850000 0 0 0 0 0 0 0 0 5021300 0 4421 2824 3446 3446 12194 13609 98227;98228;98229 90789 98227 90789 QE05099 12004 98227 90789 QE05099 12004 98227 90789 QE05099 12004 -sp|Q15751|HERC1_HUMAN 1491 sp|Q15751|HERC1_HUMAN sp|Q15751|HERC1_HUMAN 0.9956 24.4686 0.000725254 80.245 41.065 80.245 1 S STSASEGGGLMTRSESLTAESRLVHTSPNYR X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXXPPPPPPPPPXXXXXXXXX S(0.004)ES(0.996)LT(0.001)AESR S(-24.47)ES(24.47)LT(-30.8)AES(-48.77)R 3 2 -0.02332 By matching By MS/MS By MS/MS By MS/MS By matching By MS/MS 88117000 88117000 0 0 11766000 13176000 20540000 16963000 4364700 21308000 11766000 0 0 13176000 0 0 20540000 0 0 16963000 0 0 4364700 0 0 21308000 0 0 4422 2824 1491 1491 18146 20455 144586;144587;144588;144589;144590;144591 129449;129450;129451;129452 144587 129450 QE05099 10286 144587 129450 QE05099 10286 144587 129450 QE05099 10286 -sp|Q15751|HERC1_HUMAN 1510 sp|Q15751|HERC1_HUMAN sp|Q15751|HERC1_HUMAN 0.330689 0.0 7.97e-05 45.193 39.23 45.193 S ESRLVHTSPNYRLIKSRSESDLSQPESDEEG X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXXXXPPPPPPPPPPPPPPPP S(0.331)RS(0.331)ES(0.331)DLS(0.008)QPESDEEGYALSGR S(0)RS(0)ES(0)DLS(-16.27)QPES(-35.13)DEEGY(-44.24)ALS(-45.11)GR 1 3 0.88872 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4423 2824 1510 1510 19884 22510 159108 141525 QE05102 26609 159108 141525 QE05102 26609 159108 141525 QE05102 26609 
-sp|Q15751|HERC1_HUMAN 1512 sp|Q15751|HERC1_HUMAN sp|Q15751|HERC1_HUMAN 0.473289 2.22394 8.37e-06 56.783 53.982 56.783 S RLVHTSPNYRLIKSRSESDLSQPESDEEGYA X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXXPPPPPPPPPPPPPPPPPP S(0.284)RS(0.473)ES(0.219)DLS(0.024)QPESDEEGYALSGR S(-2.22)RS(2.22)ES(-3.34)DLS(-13.02)QPES(-39.32)DEEGY(-52.92)ALS(-56.34)GR 3 3 -0.16378 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4424 2824 1512 1512 19884 22510 159107 141524 QE05101 26243 159107 141524 QE05101 26243 159107 141524 QE05101 26243 -sp|Q15751|HERC1_HUMAN 1514 sp|Q15751|HERC1_HUMAN sp|Q15751|HERC1_HUMAN 0.330689 0.0 7.97e-05 45.193 39.23 45.193 S VHTSPNYRLIKSRSESDLSQPESDEEGYALS X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX S(0.331)RS(0.331)ES(0.331)DLS(0.008)QPESDEEGYALSGR S(0)RS(0)ES(0)DLS(-16.27)QPES(-35.13)DEEGY(-44.24)ALS(-45.11)GR 5 3 0.88872 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4425 2824 1514 1514 19884 22510 159108 141525 QE05102 26609 159108 141525 QE05102 26609 159108 141525 QE05102 26609 -sp|Q6ZN18-2|AEBP2_HUMAN;sp|Q6ZN18|AEBP2_HUMAN 18;18 sp|Q6ZN18-2|AEBP2_HUMAN sp|Q6ZN18-2|AEBP2_HUMAN 0.998316 27.7896 1.21e-62 181.56 176.76 181.56 2 S AAITDMADLEELSRLSPLPPGSPGSAARGRA X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X PPPPPPPPPPPPPPPPPPPPPPPPPPPPXXX AAAITDMADLEELS(0.002)RLS(0.998)PLPPGS(0.809)PGS(0.191)AAR AAAIT(-99.88)DMADLEELS(-27.79)RLS(27.79)PLPPGS(6.28)PGS(-6.28)AAR 17 3 0.97551 By matching By matching By matching By MS/MS By MS/MS By MS/MS 499850000 0 499850000 0 2708200 3550900 192640000 104030000 20713000 176200000 0 2708200 0 0 3550900 0 0 192640000 0 0 104030000 0 0 20713000 0 0 176200000 0 5468 3335 18 18 28 35 264;265;266;267;268;269 236;237;238;239 264 236 QE05100 65231 264 236 QE05100 65231 264 236 QE05100 65231 -sp|Q6ZN18-2|AEBP2_HUMAN;sp|Q6ZN18|AEBP2_HUMAN 24;24 sp|Q6ZN18-2|AEBP2_HUMAN sp|Q6ZN18-2|AEBP2_HUMAN 
0.809237 6.27624 1.21e-62 181.56 176.76 181.56 2 S ADLEELSRLSPLPPGSPGSAARGRAEPPEEE X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X PPPPPPPPPPPPPPPPPPPPPPXXXXXXXXX AAAITDMADLEELS(0.002)RLS(0.998)PLPPGS(0.809)PGS(0.191)AAR AAAIT(-99.88)DMADLEELS(-27.79)RLS(27.79)PLPPGS(6.28)PGS(-6.28)AAR 23 3 0.97551 By matching By matching By matching By MS/MS By MS/MS By MS/MS 499850000 0 499850000 0 2708200 3550900 192640000 104030000 20713000 176200000 0 2708200 0 0 3550900 0 0 192640000 0 0 104030000 0 0 20713000 0 0 176200000 0 5469 3335 24 24 28 35 264;265;266;267;268;269 236;237;238;239 264 236 QE05100 65231 264 236 QE05100 65231 264 236 QE05100 65231 -sp|Q6ZN18-2|AEBP2_HUMAN;sp|Q6ZN18|AEBP2_HUMAN 206;206 sp|Q6ZN18-2|AEBP2_HUMAN sp|Q6ZN18-2|AEBP2_HUMAN 0.999982 48.3708 1.18e-09 128.05 118.25 128.05 1 S TGGGGSSATSGGRRGSLEMSSDGEPLSRMDS X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXXPPPPPPPPPPPPPPPXXX RGS(1)LEMSSDGEPLSR RGS(48.37)LEMS(-48.37)S(-54.13)DGEPLS(-99.69)R 3 2 -0.10602 By MS/MS By MS/MS By MS/MS By matching By MS/MS 73663000 73663000 0 0 19262000 11103000 19454000 0 1816900 22028000 19262000 0 0 11103000 0 0 19454000 0 0 0 0 0 1816900 0 0 22028000 0 0 5470 3335 206 206 17255 19413 137099;137100;137101;137102;137103 122913;122914;122915;122916 137099 122913 QE05097 23240 137099 122913 QE05097 23240 137099 122913 QE05097 23240 - REV__sp|P35908|K22E_HUMAN REV__sp|P35908|K22E_HUMAN 1 71.692 0.00457965 71.692 14.102 71.692 1 S X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXPPPPPPPPPXXXXXXXXXXXX IIKELS(1)DGR IIKELS(71.69)DGR 6 2 2.0005 By matching By MS/MS By matching By matching By matching 431850000 431850000 0 0 NaN 103010000 67359000 64124000 74201000 0 55805000 NaN NaN NaN NaN NaN NaN 103010000 0 0 67359000 0 0 64124000 0 0 74201000 0 0 0 0 0 55805000 0 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN + + 61 57 252 252 11589 12932 
93729;93730;93731;93732;93733;93734 87100 93729 87100 QE05098 47490 93729 87100 QE05098 47490 93729 87100 QE05098 47490 - REV__sp|Q9NSB4|KRT82_HUMAN REV__sp|Q9NSB4|KRT82_HUMAN 1 45.368 0.0161156 45.368 28.697 45.368 1 S X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXPPPPPPPPPPXXXXXXXXX VDGS(1)VCDLRR VDGS(45.37)VCDLRR 4 2 0.77096 By matching By matching By matching By matching By matching By MS/MS 1670400000 1670400000 0 0 NaN 218420000 241200000 328130000 240860000 52984000 294390000 NaN NaN NaN NaN NaN NaN 218420000 0 0 241200000 0 0 328130000 0 0 240860000 0 0 52984000 0 0 294390000 0 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN + + 62 58 330 330 22307 25289 178961;178962;178963;178964;178965;178966;178967 159240 178961 159240 QE05102 16922 178961 159240 QE05102 16922 178961 159240 QE05102 16922 - REV__sp|Q6S5H4-2|POTEB_HUMAN REV__sp|Q6S5H4-2|POTEB_HUMAN 1 51.2862 0.045235 51.286 32.662 51.286 S X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXXPPPPPPPPPXXXXXXXXX EVS(1)EIEELK EVS(51.29)EIEELK 3 2 0.81181 By matching By matching By matching By matching By matching 50767000 50767000 0 0 0.044169 0 8469100 14247000 11062000 1262600 15726000 0 0.056281 0.030122 0.051456 0.037786 0.081346 0 0 0 8469100 0 0 14247000 0 0 11062000 0 0 1262600 0 0 15726000 0 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN + 63 59 22 22 8166 9110 66515;66516;66517;66518;66519 61714;61715 66516 61715 QE05100 38402 66516 61715 QE05100 38402 66516 61715 QE05100 38402 -sp|Q8IUD2-4|RB6I2_HUMAN;sp|Q8IUD2-2|RB6I2_HUMAN;sp|Q8IUD2-3|RB6I2_HUMAN;sp|Q8IUD2|RB6I2_HUMAN;sp|Q8IUD2-5|RB6I2_HUMAN;sp|O15083|ERC2_HUMAN 191;191;191;191;191;187 sp|Q8IUD2-4|RB6I2_HUMAN sp|Q8IUD2-4|RB6I2_HUMAN 0.999998 58.0663 0.00181554 89.827 67.799 89.827 1 S ESKLSSSMNSIKTFWSPELKKERALRKDEAS X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXPPPPPPPPXXXXXXXXXXX 
TFWS(1)PELK T(-58.07)FWS(58.07)PELK 4 2 0.075831 By MS/MS By MS/MS By MS/MS By MS/MS By MS/MS By MS/MS 138400000 138400000 0 0 29764000 20957000 24855000 30752000 8304800 23771000 29764000 0 0 20957000 0 0 24855000 0 0 30752000 0 0 8304800 0 0 23771000 0 0 6037 3584 191 191 21148 23984 169817;169818;169819;169820;169821;169822 151176;151177;151178;151179;151180;151181 169822 151181 QE05102 49176 169822 151181 QE05102 49176 169822 151181 QE05102 49176 -sp|Q9NRX5|SERC1_HUMAN 364 sp|Q9NRX5|SERC1_HUMAN sp|Q9NRX5|SERC1_HUMAN 0.999996 54.0798 2.24e-16 159.22 148.1 159.22 1 S DESTLIEDGGARSDGSLEDGDDVHRAVDNER X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXPPPPPPPPPPPPPXXXXXX SDGS(1)LEDGDDVHR S(-54.08)DGS(54.08)LEDGDDVHR 4 2 0.64808 By MS/MS By MS/MS By matching By MS/MS By MS/MS By MS/MS 222110000 222110000 0 0 31407000 17665000 20892000 23194000 5132400 54893000 31407000 0 0 17665000 0 0 20892000 0 0 23194000 0 0 5132400 0 0 54893000 0 0 8729 5187 364 364 17793 20026 141355;141356;141357;141358;141359;141360;141361;141362;141363;141364;141365 126543;126544;126545;126546;126547;126548;126549 141361 126549 QE05102 10564 141361 126549 QE05102 10564 141361 126549 QE05102 10564 -sp|Q9Y3B9|RRP15_HUMAN 11 sp|Q9Y3B9|RRP15_HUMAN sp|Q9Y3B9|RRP15_HUMAN 0.997432 25.8922 9.39e-31 175.33 139.7 175.33 1 S _____MAAAAPDSRVSEEENLKKTPKKKMKM X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXPPPPPPPPPPPPPPPPXXXXXXXXX AAAAPDS(0.003)RVS(0.997)EEENLK AAAAPDS(-25.89)RVS(25.89)EEENLK 10 2 -0.029697 By matching By matching By MS/MS By MS/MS By MS/MS By MS/MS 266450000 266450000 0 0 38150000 39445000 56305000 55338000 7010600 70203000 38150000 0 0 39445000 0 0 56305000 0 0 55338000 0 0 7010600 0 0 70203000 0 0 9895 5791 11 11 12 17 158;159;160;161;162;163 166;167;168;169 159 167 QE05100 23225 159 167 QE05100 23225 159 167 QE05100 23225 -sp|Q15751|HERC1_HUMAN 3445 sp|Q15751|HERC1_HUMAN sp|Q15751|HERC1_HUMAN 0.999981 47.2024 0.0187791 
47.548 7.8172 47.548 2 T RVMTCVWCNKKGLLATSGNDGTIRVWNVTKK X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXPPPPPPPPPPPPPPXXXXXXX KGLLAT(1)S(1)GNDGTIR KGLLAT(47.2)S(47.22)GNDGT(-47.2)IR 6 2 -0.95722 By matching By MS/MS By matching 129800000 0 129800000 0 3921800 0 120850000 0 0 5021300 0 3921800 0 0 0 0 0 120850000 0 0 0 0 0 0 0 0 5021300 0 10983 2824 3445 3445 12194 13609 98227;98228;98229 90789 98227 90789 QE05099 12004 98227 90789 QE05099 12004 98227 90789 QE05099 12004 -sp|O75379|VAMP4_HUMAN 30 sp|O75379|VAMP4_HUMAN sp|O75379|VAMP4_HUMAN 1 67.6437 1.44E-52 203.56 187.24 67.644 1 S TGSVKSERRNLLEDDSDEEEDFFLRGPSGPR X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXPPPPPPPPPPPPPPPPPPPPPP NLLEDDS(1)DEEEDFFLR NLLEDDS(67.64)DEEEDFFLR 7 3 -0.051914 By MS/MS By MS/MS By MS/MS By MS/MS By MS/MS By MS/MS 7929000000 7929000000 0 0 NaN 1592100000 973800000 1011600000 1450300000 631970000 878760000 NaN NaN NaN NaN NaN NaN 1592100000 0 0 973800000 0 0 1011600000 0 0 1450300000 0 0 631970000 0 0 878760000 0 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 963 669 30 30 15558;15559 17538;17539 124829;124830;124831;124832;124833;124834;124835;124836;124837;124838;124839;124840;124841;124842;124843;124844;124845;124846 112951;112952;112953;112954;112955;112956;112957;112958;112959;112960;112961;112962;112963;112964;112965;112966;112967;112968;112969;112970;112971;112972 124840 112969 QE05102 57877 124833 112957 QE05099 57820 124833 112957 QE05099 57820 -sp|O95183|VAMP5_HUMAN 48 sp|O95183|VAMP5_HUMAN sp|O95183|VAMP5_HUMAN 0.72657 5.36697 5.72E-05 79.514 55.133 79.514 1 S KLAELQQRSDQLLDMSSTFNKTTQNLAQKKC X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXPPPPPPPPPPPPPXXXXXXXXXX SDQLLDMS(0.727)S(0.211)T(0.062)FNK S(-64.13)DQLLDMS(5.37)S(-5.37)T(-10.67)FNK 8 2 -0.18713 By matching By matching By MS/MS By matching By matching By matching 86590000 
86590000 0 0 0.032027 17447000 15753000 20219000 14001000 6284700 12885000 0.028348 0.025719 0.032895 0.033925 0.083789 0.034516 17447000 0 0 15753000 0 0 20219000 0 0 14001000 0 0 6284700 0 0 12885000 0 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1189 809 48 48 17891 20149 142427;142428;142429;142430;142431;142432 127454 142427 127454 QE05099 48504 142427 127454 QE05099 48504 142427 127454 QE05099 48504 -sp|Q15836|VAMP3_HUMAN;sp|P63027|VAMP2_HUMAN 63;80 sp|Q15836|VAMP3_HUMAN sp|Q15836|VAMP3_HUMAN 0.920811 10.6555 1.81E-09 124.1 98.278 107.25 1 S DRADALQAGASQFETSAAKLKRKYWWKNCKM X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXPPPPPPPPPPPPPPPPPXXXXXXXXXXXX ADALQAGASQFET(0.079)S(0.921)AAK ADALQAGAS(-49.99)QFET(-10.66)S(10.66)AAK 14 2 0.23449 By MS/MS By MS/MS By MS/MS By MS/MS By matching By MS/MS 265240000 265240000 0 0 0.036151 44627000 41445000 69094000 42521000 5738000 61819000 0.03226 0.028442 0.039791 0.036967 0.030963 0.043392 44627000 0 0 41445000 0 0 69094000 0 0 42521000 0 0 5738000 0 0 61819000 0 0 0.47624 0.90925 12.188 0.51677 1.0694 7.2217 NaN NaN NaN 0.81588 4.4311 19.209 NaN NaN NaN 0.4388 0.78189 5.9861 4442 2836 63 63 279 319 2297;2298;2299;2300;2301;2302 1992;1993;1994;1995;1996 2300 1995 QE05100 30086 2301 1996 QE05102 30007 2301 1996 QE05102 30007 -sp|Q15836|VAMP3_HUMAN;sp|P63027|VAMP2_HUMAN;sp|P23763-2|VAMP1_HUMAN;sp|P23763-3|VAMP1_HUMAN;sp|P23763|VAMP1_HUMAN 44;61;63;63;63 sp|Q15836|VAMP3_HUMAN sp|Q15836|VAMP3_HUMAN 1 65.4951 2.36E-06 126.19 98.602 65.495 1 S MRVNVDKVLERDQKLSELDDRADALQAGASQ X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXPPPPPPPPPPXXXXXXXXXX DQKLS(1)ELDDR DQKLS(65.5)ELDDR 5 3 -0.72518 By MS/MS By MS/MS By MS/MS By MS/MS By matching By MS/MS 412950000 412950000 0 0 NaN 75542000 44814000 32924000 35016000 11023000 4669900 NaN NaN NaN NaN NaN NaN 75542000 0 0 44814000 0 0 32924000 0 0 35016000 0 0 11023000 0 0 4669900 0 0 NaN NaN NaN NaN 
NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 4443 2836 44 44 4530 5083 37093;37094;37095;37096;37097;37098;37099;37100;37101;37102;37103;37104 34712;34713;34714;34715;34716;34717;34718;34719 37100 34719 QE05102 18436 37093 34712 QE05097 18245 37093 34712 QE05097 18245 -sp|Q15836|VAMP3_HUMAN 11 sp|Q15836|VAMP3_HUMAN sp|Q15836|VAMP3_HUMAN 0.97018 15.1316 0.000117365 79.652 72.041 79.652 1 S _____MSTGPTAATGSNRRLQQTQNQVDEVV X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXPPPPPPPPPPPPPXXXXXXXXXXXX STGPTAAT(0.03)GS(0.97)NRR S(-66.94)T(-63.48)GPT(-42.47)AAT(-15.13)GS(15.13)NRR 10 2 -0.15791 By matching By matching By MS/MS By matching By matching By MS/MS 34280000 34280000 0 0 NaN 3057100 4718800 12052000 5047700 1070900 8333500 NaN NaN NaN NaN NaN NaN 3057100 0 0 4718800 0 0 12052000 0 0 5047700 0 0 1070900 0 0 8333500 0 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 4444 2836 11 11 20280 22978 162490;162491;162492;162493;162494;162495 144222;144223 162490 144222 QE05099 7582 162490 144222 QE05099 7582 162490 144222 QE05099 7582 -sp|Q9BV40|VAMP8_HUMAN 55 sp|Q9BV40|VAMP8_HUMAN sp|Q9BV40|VAMP8_HUMAN 0.959784 13.7778 3.78E-05 91.969 27.98 91.969 1 S NLEHLRNKTEDLEATSEHFKTTSQKVARKFW X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXPPPPPPPPPPPPXXXXXXXXXXX TEDLEAT(0.04)S(0.96)EHFK T(-83.18)EDLEAT(-13.78)S(13.78)EHFK 8 2 0.40785 By matching By matching By matching By MS/MS 114520000 114520000 0 0 NaN 20400000 9738500 7862300 0 0 76518000 NaN NaN NaN NaN NaN NaN 20400000 0 0 9738500 0 0 7862300 0 0 0 0 0 0 0 0 76518000 0 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 7902 4687 55 55 21013 23827 168874;168875;168876;168877 150433 168874 150433 QE05102 19524 168874 150433 QE05102 19524 168874 150433 QE05102 19524
--- a/test-data/test_kinase_substrate.tabular Tue Mar 15 12:44:40 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2 +0,0 @@ -GENE KINASE KIN_ACC_ID KIN_ORGANISM SUBSTRATE SUB_GENE_ID SUB_ACC_ID SUB_GENE SUB_ORGANISM SUB_MOD_RSD SITE_GRP_ID SITE_+/-7_AA DOMAIN IN_VIVO_RXN IN_VITRO_RXN CST_CAT# -Csnk2a1 CK2A1 Q60737 human VAMP4 53330 O70480 Vamp4 human S30 454285 RNLLEDDsDEEEDFF X
--- a/test-data/test_networkin.tabular Tue Mar 15 12:44:40 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,33 +0,0 @@ -#substrate position id networkin_score tree netphorest_group netphorest_score string_identifier string_score substrate_name sequence string_path -VAMP4 (ENSP00000236192) 30 CK2alpha 35.6396 KIN CK2_group 0.5228 ENSP00000236192 0.85 VAMP4 LLEDDsDEEED "ENSP00000217244, 0.68 ENSP00000236192" -SSRP1 (ENSP00000278412) 444 CK2alpha 28.6345 KIN CK2_group 0.3768 ENSP00000278412 0.874 SSRP1 DEYADsDEDQH "ENSP00000217244, 0.6992 ENSP00000278412" -SSRP1 (ENSP00000278412) 667 CK2alpha 22.2088 KIN CK2_group 0.3168 ENSP00000278412 0.874 SSRP1 SKEFVsSDESS "ENSP00000217244, 0.6992 ENSP00000278412" -HERC2 (ENSP00000261609) 1577 CK2alpha 10.7686 KIN CK2_group 0.5253 ENSP00000261609 0.4514 HERC2 IGNEEsDLEEA "ENSP00000217244, 0.764 ENSP00000346659, 0.76 ENSP00000261609" -HERC2 (ENSP00000261609) 2928 CK2alpha 10.7686 KIN CK2_group 0.4698 ENSP00000261609 0.4514 HERC2 VPFLAsDNEEE "ENSP00000217244, 0.764 ENSP00000346659, 0.76 ENSP00000261609" -RRP15 (ENSP00000355899) 11 CK2alpha 8.5484 KIN CK2_group 0.3566 ENSP00000355899 0.461 RRP15 PDSRVsEEENL "ENSP00000217244, 0.3688 ENSP00000355899" -SSRP1 (ENSP00000278412) 444 CK2a2 7.8435 KIN CK2_group 0.3768 ENSP00000278412 0.615 SSRP1 DEYADsDEDQH "ENSP00000262506, 0.492 ENSP00000278412" -SSRP1 (ENSP00000278412) 667 CK2a2 7.7757 KIN CK2_group 0.3168 ENSP00000278412 0.615 SSRP1 SKEFVsSDESS "ENSP00000262506, 0.492 ENSP00000278412" -VAMP2 (ENSP00000314214) 80 PKD3 6.9217 KIN PKD_group 0.0744 ENSP00000314214 0.949 VAMP2 SQFETsAAKLK "ENSP00000234179, 0.7592 ENSP00000314214" -VAMP2 (ENSP00000314214) 61 CK2alpha 6.3122 KIN CK2_group 0.3338 ENSP00000314214 0.4391 VAMP2 RDQKLsELDDR "ENSP00000217244, 0.7992 ENSP00000222812, 0.7544 ENSP00000314214" -VAMP1 (ENSP00000380148) 63 CK2alpha 6.1363 KIN CK2_group 0.3338 ENSP00000380148 0.4364 VAMP1 RDQKLsELDDR "ENSP00000217244, 0.7944 ENSP00000222812, 0.7544 ENSP00000380148" -ERC1 
(ENSP00000354158) 191 IKKalpha 5.3194 KIN IKKalpha_IKKbeta_group 0.031 ENSP00000354158 0.96 ERC1 IKTFWsPELKK "ENSP00000359424, 0.768 ENSP00000354158" -ERC1 (ENSP00000354158) 191 IKKalpha 5.3194 KIN IKKalpha_IKKbeta_group 0.031 ENSP00000354158 0.96 ERC1 IKTFWsPELKK "ENSP00000359424, 0.768 ENSP00000354158" -VAMP2 (ENSP00000314214) 61 PKAbeta 4.9293 KIN PKA_group 0.1153 ENSP00000314214 0.8 VAMP2 RDQKLsELDDR "ENSP00000359719, 0.64 ENSP00000314214" -VAMP2 (ENSP00000314214) 61 PKAgamma 4.9293 KIN PKA_group 0.1153 ENSP00000314214 0.8 VAMP2 RDQKLsELDDR "ENSP00000366488, 0.64 ENSP00000314214" -VAMP3 (ENSP00000054666) 44 CK2alpha 4.2842 KIN CK2_group 0.3338 ENSP00000054666 0.4201 VAMP3 RDQKLsELDDR "ENSP00000217244, 0.7992 ENSP00000317714, 0.6792 ENSP00000054666" -VAMP2 (ENSP00000314214) 80 PKCiota 3.8971 KIN PKC_group 0.0928 ENSP00000314214 0.899 VAMP2 SQFETsAAKLK "ENSP00000295797, 0.7192 ENSP00000314214" -SSRP1 (ENSP00000278412) 444 CDK7 3.6159 KIN CDK7 0.0186 ENSP00000278412 0.903 SSRP1 DEYADsDEDQH "ENSP00000256443, 0.7224 ENSP00000278412" -SSRP1 (ENSP00000278412) 444 CK1alpha 3.3573 KIN CK1_group 0.1264 ENSP00000278412 0.404 SSRP1 DEYADsDEDQH "ENSP00000261798, 0.3232 ENSP00000278412" -VAMP3 (ENSP00000054666) 11 PKCalpha 3.0633 KIN PKC_group 0.4633 ENSP00000054666 0.3277 VAMP3 TAATGsNRRLQ "ENSP00000284384, 0.6232 ENSP00000359025, 0.6352 ENSP00000054666" -SSRP1 (ENSP00000278412) 659 PKCalpha 3.0524 KIN PKC_group 0.4345 ENSP00000278412 0.237 SSRP1 RQLSEsFKSKE "ENSP00000284384, 0.4552 ENSP00000351885, 0.76 ENSP00000278412" -VAMP2 (ENSP00000314214) 61 PKCiota 2.7785 KIN PKC_group 0.0463 ENSP00000314214 0.899 VAMP2 RDQKLsELDDR "ENSP00000295797, 0.7192 ENSP00000314214" -SSRP1 (ENSP00000278412) 659 CDK7 2.5961 KIN CDK7 0.0104 ENSP00000278412 0.903 SSRP1 RQLSEsFKSKE "ENSP00000256443, 0.7224 ENSP00000278412" -SSRP1 (ENSP00000278412) 667 CDK7 2.5961 KIN CDK7 0.0124 ENSP00000278412 0.903 SSRP1 SKEFVsSDESS "ENSP00000256443, 0.7224 ENSP00000278412" -ERC1 (ENSP00000354158) 191 IKKbeta 
2.571 KIN IKKalpha_IKKbeta_group 0.031 ENSP00000354158 0.946 ERC1 IKTFWsPELKK "ENSP00000339151, 0.7568 ENSP00000354158" -ERC1 (ENSP00000354158) 191 IKKbeta 2.571 KIN IKKalpha_IKKbeta_group 0.031 ENSP00000354158 0.946 ERC1 IKTFWsPELKK "ENSP00000339151, 0.7568 ENSP00000354158" -SSRP1 (ENSP00000278412) 659 PKCbeta 2.4948 KIN PKC_group 0.4345 ENSP00000278412 0.1743 SSRP1 RQLSEsFKSKE "ENSP00000305355, 0.7976 ENSP00000366013, 0.7192 ENSP00000284811, 0.7448 ENSP00000278412" -VAMP3 (ENSP00000054666) 11 PKCbeta 2.4948 KIN PKC_group 0.4633 ENSP00000054666 0.2393 VAMP3 TAATGsNRRLQ "ENSP00000305355, 0.512 ENSP00000348986, 0.7616 ENSP00000054666" -SSRP1 (ENSP00000278412) 659 CK2a2 2.4345 KIN CK2_group 0.0356 ENSP00000278412 0.615 SSRP1 RQLSEsFKSKE "ENSP00000262506, 0.492 ENSP00000278412" -ERC1 (ENSP00000354158) 191 HIPK2 2.2748 KIN HIPK1_HIPK2_group 0.0463 ENSP00000354158 0.4159 ERC1 IKTFWsPELKK "ENSP00000263551, 0.7696 ENSP00000286332, 0.7192 ENSP00000354158" -VAMP3 (ENSP00000054666) 11 PKCzeta 2.0773 KIN PKC_group 0.4633 ENSP00000054666 0.4263 VAMP3 TAATGsNRRLQ "ENSP00000367830, 0.7688 ENSP00000320935, 0.796 ENSP00000054666" -SSRP1 (ENSP00000278412) 659 DNAPK 2.0042 KIN DNAPK 0.0584 ENSP00000278412 0.56 SSRP1 RQLSEsFKSKE "ENSP00000313420, 0.448 ENSP00000278412"
--- a/test-data/test_regulatory_sites.tabular Tue Mar 15 12:44:40 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,8 +0,0 @@ -32017 -"PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words ""PhosphoSitePlus(R), www.phosphosite.org"" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: ""Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926.""" - -GENE PROTEIN PROT_TYPE ACC_ID GENE_ID HU_CHR_LOC ORGANISM MOD_RSD SITE_GRP_ID SITE_+/-7_AA DOMAIN ON_FUNCTION ON_PROCESS ON_PROT_INTERACT ON_OTHER_INTERACT PMIDs LT_LIT MS_LIT MS_CST NOTES -ENSA ENSA "Inhibitor; Protein phosphatase, regulatory subunit" O43768 2029 1q21.3 human S109-p 477819 DLPQRKSsLVTSKLA Endosulfine "molecular association, regulation; protein conformation" SNCA(DISRUPTS) 18973346 1 34 50 -VAMP8 VAMP8 "Membrane protein, integral; Vesicle" Q9BV40 8673 2p11.2 human S55-p 12738929 TEDLEATsEHFKTTS Synaptobrevin "activity, inhibited" 27402227 1 8 0 "abolish function in SNARE complex during mast cell secretion, reduces in vitro ensemble vesicle fusion" -ENSA ENSA "Inhibitor; Protein phosphatase, regulatory subunit" O43768 2029 1q21.3 human S67-p 455934 KGQKYFDsGDYNMAK Endosulfine "molecular association, regulation" cell cycle regulation PPP2CA(INDUCES) 27889260 3 56 47 -Vamp4 VAMP4 "Membrane protein, integral; Vesicle" O70480 53330 1 H2.1|1 70.29 cM mouse S30-p 454285 RNLLEDDsDEEEDFF "molecular association, regulation; intracellular localization" PACS-1(INDUCES) 14608369 1 64 10
--- a/test-data/test_swissprot.fasta Tue Mar 15 12:44:40 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,68 +0,0 @@ ->sp|Q9Y3B9|RRP15_HUMAN RRP15-like protein OS=Homo sapiens OX=9606 GN=RRP15 PE=1 SV=2 -MAAAAPDSRVSEEENLKKTPKKKMKMVTGAVASVLEDEATDTSDSEGSCGSEKDHFYSDDDAIEADSEGDAEPCDKENENDGESSVGTNMGWADAMAKVLNKKTPESKPTILVKNKKLEKEKEKLKQERLEKIKQRDKRLEWEMMCRVKPDVVQDKETERNLQRIATRGVVQLFNAVQKHQKNVDEKVKEAGSSMRKRAKLISTVSKKDFISVLRGMDGSTNETASSRKKPKAKQTEVKSEEGPGWTILRDDFMMGASMKDWDKESDGPDDSRPESASDSDT ->sp|Q08945|SSRP1_HUMAN FACT complex subunit SSRP1 OS=Homo sapiens OX=9606 GN=SSRP1 PE=1 SV=1 -MAETLEFNDVYQEVKGSMNDGRLRLSRQGIIFKNSKTGKVDNIQAGELTEGIWRRVALGHGLKLLTKNGHVYKYDGFRESEFEKLSDFFKTHYRLELMEKDLCVKGWNWGTVKFGGQLLSFDIGDQPVFEIPLSNVSQCTTGKNEVTLEFHQNDDAEVSLMEVRFYVPPTQEDGVDPVEAFAQNVLSKADVIQATGDAICIFRELQCLTPRGRYDIRIYPTFLHLHGKTFDYKIPYTTVLRLFLLPHKDQRQMFFVISLDPPIKQGQTRYHFLILLFSKDEDISLTLNMNEEEVEKRFEGRLTKNMSGSLYEMVSRVMKALVNRKITVPGNFQGHSGAQCITCSYKASSGLLYPLERGFIYVHKPPVHIRFDEISFVNFARGTTTTRSFDFEIETKQGTQYTFSSIEREEYGKLFDFVNAKKLNIKNRGLKEGMNPSYDEYADSDEDQHDAYLERMKEEGKIREENANDSSDDSGEETDESFNPGEEEEDVAEEFDSNASASSSSNEGDSDRDEKKRKQLKKAKMAKDRKSRKKPVEVKKGKDPNAPKRPMSAYMLWLNASREKIKSDHPGISITDLSKKAGEIWKGMSKEKKEEWDRKAEDARRDYEKAMKEYEGGRGESSKRDKSKKKKKVKVKMEKKSTPSRGSSSKSSSRQLSESFKSKEFVSSDESSSGENKSKKKRRRSEDSEEEELASTPPSSEDSASGSDE ->sp|Q96SA4|SERC2_HUMAN Serine incorporator 2 OS=Homo sapiens OX=9606 GN=SERINC2 PE=2 SV=3 -MGACLGACSLLSCASCLCGSAPCILCSCCPASRNSTVSRLIFTFFLFLGVLVSIIMLSPGVESQLYKLPWVCEEGAGIPTVLQGHIDCGSLLGYRAVYRMCFATAAFFFFFTLLMLCVSSSRDPRAAIQNGFWFFKFLILVGLTVGAFYIPDGSFTNIWFYFGVVGSFLFILIQLVLLIDFAHSWNQRWLGKAEECDSRAWYAGLFFFTLLFYLLSIAAVALMFMYYTEPSGCHEGKVFISLNLTFCVCVSIAAVLPKVQDAQPNSGLLQASVITLYTMFVTWSALSSIPEQKCNPHLPTQLGNETVVAGPEGYETQWWDAPSIVGLIIFLLCTLFISLRSSDHRQVNSLMQTEECPPMLDATQQQQQVAACEGRAFDNEQDGVTYSYSFFHFCLVLASLHVMMTLTNWYKPGETRKMISTWTAVWVKICASWAGLLLYLWTLVAPLLLRNRDFS ->sp|Q96SA4-2|SERC2_HUMAN Isoform 2 of Serine incorporator 2 OS=Homo sapiens OX=9606 GN=SERINC2 
-MGAEGAPDFLSCPRVRRASCLCGSAPCILCSCCPASRNSTVSRLIFTFFLFLGVLVSIIMLSPGVESQLYKLPWVCEEGAGIPTVLQGHIDCGSLLGYRAVYRMCFATAAFFFFFTLLMLCVSSSRDPRAAIQNGFWFFKFLILVGLTVGAFYIPDGSFTNIWFYFGVVGSFLFILIQLVLLIDFAHSWNQRWLGKAEECDSRAWYAGLFFFTLLFYLLSIAAVALMFMYYTEPSGCHEGKVFISLNLTFCVCVSIAAVLPKVQDAQPNSGLLQASVITLYTMFVTWSALSSIPEQKCNPHLPTQLGNETVVAGPEGYETQWWDAPSIVGLIIFLLCTLFISLRSSDHRQVNSLMQTEECPPMLDATQQQQQVAACEGRAFDNEQDGVTYSYSFFHFCLVLASLHVMMTLTNWYKPGETRKMISTWTAVWVKICASWAGLLLYLWTLVAPLLLRNRDFS ->sp|Q96SA4-3|SERC2_HUMAN Isoform 3 of Serine incorporator 2 OS=Homo sapiens OX=9606 GN=SERINC2 -MRSMRLREEESPGPSHTASCLCGSAPCILCSCCPASRNSTVSRLIFTFFLFLGVLVSIIMLSPGVESQLYKLPWVCEEGAGIPTVLQGHIDCGSLLGYRAVYRMCFATAAFFFFFTLLMLCVSSSRDPRAAIQNGFWFFKFLILVGLTVGAFYIPDGSFTNIWFYFGVVGSFLFILIQLVLLIDFAHSWNQRWLGKAEECDSRAWYAGLFFFTLLFYLLSIAAVALMFMYYTEPSGCHEGKVFISLNLTFCVCVSIAAVLPKVQDAQPNSGLLQASVITLYTMFVTWSALSSIPEQKCNPHLPTQLGNETVVAGPEGYETQWWDAPSIVGLIIFLLCTLFISLRSSDHRQVNSLMQTEECPPMLDATQQQQQVAACEGRAFDNEQDGVTYSYSFFHFCLVLASLHVMMTLTNWYKPGETRKMISTWTAVWVKICASWAGLLLYLWTLVAPLLLRNRDFS ->sp|Q96SA4-4|SERC2_HUMAN Isoform 4 of Serine incorporator 2 OS=Homo sapiens OX=9606 GN=SERINC2 -MDGRMMRSMRLREEESPGPSHTASCLCGSAPCILCSCCPASRNSTVSRLIFTFFLFLGVLVSIIMLSPGVESQLYKLPWVCEEGAGIPTVLQGHIDCGSLLGYRAVYRMCFATAAFFFFFTLLMLCVSSSRDPRAAIQNGFWFFKFLILVGLTVGAFYIPDGSFTNIWFYFGVVGSFLFILIQLVLLIDFAHSWNQRWLGKAEECDSRAWYAGLFFFTLLFYLLSIAAVALMFMYYTEPSGCHEGKVFISLNLTFCVCVSIAAVLPKVQDAQPNSGLLQASVITLYTMFVTWSALSSIPEQKCNPHLPTQLGNETVVAGPEGYETQWWDAPSIVGLIIFLLCTLFISLRSSDHRQVNSLMQTEECPPMLDATQQQQQVAACEGRAFDNEQDGVTYSYSFFHFCLVLASLHVMMTLTNWYKPGETRKMISTWTAVWVKICASWAGLLLYLWTLVAPLLLRNRDFS ->sp|Q9NRX5|SERC1_HUMAN Serine incorporator 1 OS=Homo sapiens OX=9606 GN=SERINC1 PE=1 SV=1 
-MGSVLGLCSMASWIPCLCGSAPCLLCRCCPSGNNSTVTRLIYALFLLVGVCVACVMLIPGMEEQLNKIPGFCENEKGVVPCNILVGYKAVYRLCFGLAMFYLLLSLLMIKVKSSSDPRAAVHNGFWFFKFAAAIAIIIGAFFIPEGTFTTVWFYVGMAGAFCFILIQLVLLIDFAHSWNESWVEKMEEGNSRCWYAALLSATALNYLLSLVAIVLFFVYYTHPASCSENKAFISVNMLLCVGASVMSILPKIQESQPRSGLLQSSVITVYTMYLTWSAMTNEPETNCNPSLLSIIGYNTTSTVPKEGQSVQWWHAQGIIGLILFLLCVFYSSIRTSNNSQVNKLTLTSDESTLIEDGGARSDGSLEDGDDVHRAVDNERDGVTYSYSFFHFMLFLASLYIMMTLTNWYRYEPSREMKSQWTAVWVKISSSWIGIVLYVWTLVAPLVLTNRDFD ->sp|O43768|ENSA_HUMAN Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA PE=1 SV=1 -MSQKQEEENPAEETGEEKQDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGQKYFDSGDYNMAKAKMKNKQLPSAGPDKNLVTGDHIPTPQDLPQRKSSLVTSKLAGGQVE ->sp|O43768-2|ENSA_HUMAN Isoform 2 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA -MSQKQEEENPAEETGEEKQDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGQKYFDSGDYNMAKAKMKNKQLPSAGPDKNLVTGDHIPTPQDLPQRKSSLVTSKLAG ->sp|O43768-3|ENSA_HUMAN Isoform 3 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA -MSQKQEEENPAEETGEEKQDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGDYKSLHWSVLLCADEMQKYFDSGDYNMAKAKMKNKQLPSAGPDKNLVTGDHIPTPQDLPQRKSSLVTSKLAGGQVE ->sp|O43768-4|ENSA_HUMAN Isoform 4 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA -MSQKQEEENPAEETGEEKQDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGVWGIASYPLSLGLKEVLRMKSVEQKYFDSGDYNMAKAKMKNKQLPSAGPDKNLVTGDHIPTPQDLPQRKSSLVTSKLAG ->sp|O43768-5|ENSA_HUMAN Isoform 5 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA -MAGGLGCDVCYWFVEDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGQKYFDSGDYNMAKAKMKNKQLPSAGPDKNLVTGDHIPTPQDLPQRKSSLVTSKLAGGQVE ->sp|O43768-6|ENSA_HUMAN Isoform 6 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA -MAGGLGCDVCYWFVEDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGQKYFDSGDYNMAKAKMKNKQLPSAGPDKNLVTGDHIPTPQDLPQRKSSLVTSKLAG ->sp|O43768-7|ENSA_HUMAN Isoform 7 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA -MAGGLGCDVCYWFVEDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGDYKSLHWSVLLCADEMQKYFDSGDYNMAKAKMKNKQLPSAGPDKNLVTGDHIPTPQDLPQRKSSLVTSKLAGGQVE ->sp|O43768-8|ENSA_HUMAN Isoform 8 of 
Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA -MSQKQEEENPAEETGEEKQDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGVWGIVSYPLSLELKEVLRMKSVEVLLDPFLEVLLLNRSRGEFEI ->sp|O43768-9|ENSA_HUMAN Isoform 9 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA -MSQKQEEENPAEETGEEKQDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGDYKSLHWSVLLCADEMQKYFDSGDYNMAKAKMKNKQLPSAGPDKNLVTGDHIPTPQDLPQRKSSLVTSKLAG ->sp|Q15751|HERC1_HUMAN Probable E3 ubiquitin-protein ligase HERC1 OS=Homo sapiens OX=9606 GN=HERC1 PE=1 SV=2 -MATMIPPVKLKWLEHLNSSWITEDSESIATREGVAVLYSKLVSNKEVVPLPQQVLCLKGPQLPDFERESLSSDEQDHYLDALLSSQLALAKMVCSDSPFAGALRKRLLVLQRVFYALSNKYHDKGKVKQQQHSPESSSGSADVHSVSERPRSSTDALIEMGVRTGLSLLFALLRQSWMMPVSGPGLSLCNDVIHTAIEVVSSLPPLSLANESKIPPMGLDCLSQVTTFLKGVTIPNSGADTLGRRLASELLLGLAAQRGSLRYLLEWIEMALGASAVVHTMEKGKLLSSQEGMISFDCFMTILMQMRRSLGSSADRSQWREPTRTSDGLCSLYEAALCLFEEVCRMASDYSRTCASPDSIQTGDAPIVSETCEVYVWGSNSSHQLVEGTQEKILQPKLAPSFSDAQTIEAGQYCTFVISTDGSVRACGKGSYGRLGLGDSNNQSTLKKLTFEPHRSIKKVSSSKGSDGHTLAFTTEGEVFSWGDGDYGKLGHGNSSTQKYPKLIQGPLQGKVVVCVSAGYRHSAAVTEDGELYTWGEGDFGRLGHGDSNSRNIPTLVKDISNVGEVSCGSSHTIALSKDGRTVWSFGGGDNGKLGHGDTNRVYKPKVIEALQGMFIRKVCAGSQSSLALTSTGQVYAWGCGACLGCGSSEATALRPKLIEELAATRIVDVSIGDSHCLALSHDNEVYAWGNNSMGQCGQGNSTGPITKPKKVSGLDGIAIQQISAGTSHSLAWTALPRDRQVVAWHRPYCVDLEESTFSHLRSFLERYCDKINSEIPPLPFPSSREHHSFLKLCLKLLSNHLALALAGGVATSILGRQAGPLRNLLFRLMDSTVPDEIQEVVIETLSVGATMLLPPLRERMELLHSLLPQGPDRWESLSKGQRMQLDIILTSLQDHTHVASLLGYSSPSDAADLSSVCTGYGNLSDQPYGTQSCHPDTHLAEILMKTLLRNLGFYTDQAFGELEKNSDKFLLGTSSSENSQPAHLHELLCSLQKQLLAFCHINNISENSSSVALLHKHLQLLLPHATDIYSRSANLLKESPWNGSVGEKLRDVIYVSAAGSMLCQIVNSLLLLPVSVARPLLSYLLDLLPPLDCLNRLLPAADLLEDQELQWPLHGGPELIDPAGLPLPQPAQSWVWLVDLERTIALLIGRCLGGMLQGSPVSPEEQDTAYWMKTPLFSDGVEMDTPQLDKCMSCLLEVALSGNEEQKPFDYKLRPEIAVYVDLALGCSKEPARSLWISMQDYAVSKDWDSATLSNESLLDTVSRFVLAALLKHTNLLSQACGESRYQPGKHLSEVYRCVYKVRSRLLACKNLELIQTRSSSRDRWISENQDSADVDPQEHSFTRTIDEEAEMEEQAERDREEGHPEPEDEEEEREHEVMTAGKIFQCFLSAREVARSRDRDRMNSGAGSGARADDPPPQSQQERRVSTDLPEGQDVYTAACNSVIHRCALLILGVSPVIDELQKRREEGQLQQPSTSASEGGGLMTRSESLTAESRLVHTSPNYRLIK
SRSESDLSQPESDEEGYALSGRRNVDLDLAASHRKRGPMHSQLESLSDSWARLKHSRDWLCNSSYSFESDFDLTKSLGVHTLIENVVSFVSGDVGNAPGFKEPEESMSTSPQASIIAMEQQQLRAELRLEALHQILVLLSGMEEKGSISLAGSRLSSGFQSSTLLTSVRLQFLAGCFGLGTVGHTGGKGESGRLHHYQDGIRAAKRNIQIEIQVAVHKIYQQLSATLERALQANKHHIEAQQRLLLVTVFALSVHYQPVDVSLAISTGLLNVLSQLCGTDTMLGQPLQLLPKTGVSQLSTALKVASTRLLQILAITTGTYADKLSPKVVQSLLDLLCSQLKNLLSQTGVLHMASFGEGEQEDGEEEEKKVDSSGETEKKDFRAALRKQHAAELHLGDFLVFLRRVVSSKAIQSKMASPKWTEVLLNIASQKCSSGIPLVGNLRTRLLALHVLEAVLPACESGVEDDQMAQIVERLFSLLSDCMWETPIAQAKHAIQIKEKEQEIKLQKQGELEEEDENLPIQEVSFDPEKAQCCLVENGQILTHGSGGKGYGLASTGVTSGCYQWKFYIVKENRGNEGTCVGVSRWPVHDFNHRTTSDMWLYRAYSGNLYHNGEQTLTLSSFTQGDFITCVLDMEARTISFGKNGEEPKLAFEDVDAAELYPCVMFYSSNPGEKVKICDMQMRGTPRDLLPGDPICSPVAAVLAEATIQLIRILHRTDRWTYCINKKMMERLHKIKICIKESGQKLKKSRSVQSREENEMREEKESKEEEKGKHTRHGLADLSELQLRTLCIEVWPVLAVIGGVDAGLRVGGRCVHKQTGRHATLLGVVKEGSTSAKVQWDEAEITISFPTFWSPSDTPLYNLEPCEPLPFDVARFRGLTASVLLDLTYLTGVHEDMGKQSTKRHEKKHRHESEEKGDVEQKPESESALDMRTGLTSDDVKSQSTTSSKSENEIASFSLDPTLPSVESQHQITEGKRKNHEHMSKNHDVAQSEIRAVQLSYLYLGAMKSLSALLGCSKYAELLLIPKVLAENGHNSDCASSPVVHEDVEMRAALQFLMRHMVKRAVMRSPIKRALGLADLERAQAMIYKLVVHGLLEDQFGGKIKQEIDQQAEESDPAQQAQTPVTTSPSASSTTSFMSSSLEDTTTATTPVTDTETVPASESPGVMPLSLLRQMFSSYPTTTVLPTRRAQTPPISSLPTSPSDEVGRRQSLTSPDSQSARPANRTALSDPSSRLSTSPPPPAIAVPLLEMGFSLRQIAKAMEATGARGEADAQNITVLAMWMIEHPGHEDEEEPQSGSTADSRPGAAVLGSGGKSNDPCYLQSPGDIPSADAAEMEEGFSESPDNLDHTENAASGSGPSARGRSAVTRRHKFDLAARTLLARAAGLYRSVQAHRNQSRREGISLQQDPGALYDFNLDEELEIDLDDEAMEAMFGQDLTSDNDILGMWIPEVLDWPTWHVCESEDREEVVVCELCECSVVSFNQHMKRNHPGCGRSANRQGYRSNGSYVDGWFGGECGSGNPYYLLCGTCREKYLAMKTKSKSTSSERYKGQAPDLIGKQDSVYEEDWDMLDVDEDEKLTGEEEFELLAGPLGLNDRRIVPEPVQFPDSDPLGASVAMVTATNSMEETLMQIGCHGSVEKSSSGRITLGEQAAALANPHDRVVALRRVTAAAQVLLARTMVMRALSLLSVSGSSCSLAAGLESLGLTDIRTLVRLMCLAAAGRAGLSTSPSAMASTSERSRGGHSKANKPISCLAYLSTAVGCLASNAPSAAKLLVQLCTQNLISAATGVNLTTVDDSIQRKFLPSFLRGIAEENKLVTSPNFVVTQALVALLADKGAKLRPNYDKSEVEKKGPLELANALAACCLSSRLSSQHRQWAAQQLVRTLAAHDRDNQTTLQTLADMGGDLRKCSFIKLEAHQNRVMTCVWCNKKGLLATSGNDGTIRVWNVTKKQYSLQQTCVFNRLEGDAEESLGSPSDPSFSPVSWSISGKYLAGALEKMV
NIWQVNGGKGLVDIQPHWVSALAWPEEGPATAWSGESPELLLVGRMDGSLGLIEVVDVSTMHRRELEHCYRKDVSVTCIAWFSEDRPFAVGYFDGKLLLGTKEPLEKGGIVLIDAHKDTLISMKWDPTGHILMTCAKEDSVKLWGSISGCWCCLHSLCHPSIVNGIAWCRLPGKGSKLQLLMATGCQSGLVCVWRIPQDTTQTNVTSAEGWWEQESNCQDGYRKSSGAKCVYQLRGHITPVRTVAFSSDGLALVSGGLGGLMNIWSLRDGSVLQTVVIGSGAIQTTVWIPEVGVAACSNRSKDVLVVNCTAEWAAANHVLATCRTALKQQGVLGLNMAPCMRAFLERLPMMLQEQYAYEKPHVVCGDQLVHSPYMQCLASLAVGLHLDQLLCNPPVPPHHQNCLPDPASWNPNEWAWLECFSTTIKAAEALTNGAQFPESFTVPDLEPVPEDELVFLMDNSKWINGMDEQIMSWATSRPEDWHLGGKCDVYLWGAGRHGQLAEAGRNVMVPAAAPSFSQAQQVICGQNCTFVIQANGTVLACGEGSYGRLGQGNSDDLHVLTVISALQGFVVTQLVTSCGSDGHSMALTESGEVFSWGDGDYGKLGHGNSDRQRRPRQIEALQGEEVVQMSCGFKHSAVVTSDGKLFTFGNGDYGRLGLGNTSNKKLPERVTALEGYQIGQVACGLNHTLAVSADGSMVWAFGDGDYGKLGLGNSTAKSSPQKIDVLCGIGIKKVACGTQFSVALTKDGHVYTFGQDRLIGLPEGRARNHNRPQQIPVLAGVIIEDVAVGAEHTLALASNGDVYAWGSNSEGQLGLGHTNHVREPTLVTGLQGKNVRQISAGRCHSAAWTAPPVPPRAPGVSVPLQLGLPDTVPPQYGALREVSIHTVRARLRLLYHFSDLMYSSWRLLNLSPNNQNSTSHYNAGTWGIVQGQLRPLLAPRVYTLPMVRSIGKTMVQGKNYGPQITVKRISTRGRKCKPIFVQIARQVVKLNASDLRLPSRAWKVKLVGEGADDAGGVFDDTITEMCQELETGIVDLLIPSPNATAEVGYNRDRFLFNPSACLDEHLMQFKFLGILMGVAIRTKKPLDLHLAPLVWKQLCCVPLTLEDLEEVDLLYVQTLNSILHIEDSGITEESFHEMIPLDSFVGQSADGKMVPIIPGGNSIPLTFSNRKEYVERAIEYRLHEMDRQVAAVREGMSWIVPVPLLSLLTAKQLEQMVCGMPEISVEVLKKVVRYREVDEQHQLVQWFWHTLEEFSNEERVLFMRFVSGRSRLPANTADISQRFQIMKVDRPYDSLPTSQTCFFQLRLPPYSSQLVMAERLRYAINNCRSIDMDNYMLSRNVDNAEGSDTDY ->sp|O95714|HERC2_HUMAN E3 ubiquitin-protein ligase HERC2 OS=Homo sapiens OX=9606 GN=HERC2 PE=1 SV=2 
-MPSESFCLAAQARLDSKWLKTDIQLAFTRDGLCGLWNEMVKDGEIVYTGTESTQNGELPPRKDDSVEPSGTKKEDLNDKEKKDEEETPAPIYRAKSILDSWVWGKQPDVNELKECLSVLVKEQQALAVQSATTTLSALRLKQRLVILERYFIALNRTVFQENVKVKWKSSGISLPPVDKKSSRPAGKGVEGLARVGSRAALSFAFAFLRRAWRSGEDADLCSELLQESLDALRALPEASLFDESTVSSVWLEVVERATRFLRSVVTGDVHGTPATKGPGSIPLQDQHLALAILLELAVQRGTLSQMLSAILLLLQLWDSGAQETDNERSAQGTSAPLLPLLQRFQSIICRKDAPHSEGDMHLLSGPLSPNESFLRYLTLPQDNELAIDLRQTAVVVMAHLDRLATPCMPPLCSSPTSHKGSLQEVIGWGLIGWKYYANVIGPIQCEGLANLGVTQIACAEKRFLILSRNGRVYTQAYNSDTLAPQLVQGLASRNIVKIAAHSDGHHYLALAATGEVYSWGCGDGGRLGHGDTVPLEEPKVISAFSGKQAGKHVVHIACGSTYSAAITAEGELYTWGRGNYGRLGHGSSEDEAIPMLVAGLKGLKVIDVACGSGDAQTLAVTENGQVWSWGDGDYGKLGRGGSDGCKTPKLIEKLQDLDVVKVRCGSQFSIALTKDGQVYSWGKGDNQRLGHGTEEHVRYPKLLEGLQGKKVIDVAAGSTHCLALTEDSEVHSWGSNDQCQHFDTLRVTKPEPAALPGLDTKHIVGIACGPAQSFAWSSCSEWSIGLRVPFVVDICSMTFEQLDLLLRQVSEGMDGSADWPPPQEKECVAVATLNLLRLQLHAAISHQVDPEFLGLGLGSILLNSLKQTVVTLASSAGVLSTVQSAAQAVLQSGWSVLLPTAEERARALSALLPCAVSGNEVNISPGRRFMIDLLVGSLMADGGLESALHAAITAEIQDIEAKKEAQKEKEIDEQEANASTFHRSRTPLDKDLINTGICESSGKQCLPLVQLIQQLLRNIASQTVARLKDVARRISSCLDFEQHSRERSASLDLLLRFQRLLISKLYPGESIGQTSDISSPELMGVGSLLKKYTALLCTHIGDILPVAASIASTSWRHFAEVAYIVEGDFTGVLLPELVVSIVLLLSKNAGLMQEAGAVPLLGGLLEHLDRFNHLAPGKERDDHEELAWPGIMESFFTGQNCRNNEEVTLIRKADLENHNKDGGFWTVIDGKVYDIKDFQTQSLTGNSILAQFAGEDPVVALEAALQFEDTRESMHAFCVGQYLEPDQEIVTIPDLGSLSSPLIDTERNLGLLLGLHASYLAMSTPLSPVEIECAKWLQSSIFSGGLQTSQIHYSYNEEKDEDHCSSPGGTPASKSRLCSHRRALGDHSQAFLQAIADNNIQDHNVKDFLCQIERYCRQCHLTTPIMFPPEHPVEEVGRLLLCCLLKHEDLGHVALSLVHAGALGIEQVKHRTLPKSVVDVCRVVYQAKCSLIKTHQEQGRSYKEVCAPVIERLRFLFNELRPAVCNDLSIMSKFKLLSSLPRWRRIAQKIIRERRKKRVPKKPESTDDEEKIGNEESDLEEACILPHSPINVDKRPIAIKSPKDKWQPLLSTVTGVHKYKWLKQNVQGLYPQSPLLSTIAEFALKEEPVDVEKMRKCLLKQLERAEVRLEGIDTILKLASKNFLLPSVQYAMFCGWQRLIPEGIDIGEPLTDCLKDVDLIPPFNRMLLEVTFGKLYAWAVQNIRNVLMDASAKFKELGIQPVPLQTITNENPSGPSLGTIPQARFLLVMLSMLTLQHGANNLDLLLNSGMLALTQTALRLIGPSCDNVEEDMNASAQGASATVLEETRKETAPVQLPVSGPELAAMMKIGTRVMRGVDWKWGDQDGPPPGLGRVIGELGEDGWIRVQWDTGSTNSYRMGKEGKYDLKLAELPAAAQPSAEDSDTEDDSEAEQTERNIHPTAMMFTSTINLLQTLCLSAGVHAEIMQSEATKTLCGLLRM
LVESGTTDKTSSPNRLVYREQHRSWCTLGFVRSIALTPQVCGALSSPQWITLLMKVVEGHAPFTATSLQRQILAVHLLQAVLPSWDKTERARDMKCLVEKLFDFLGSLLTTCSSDVPLLRESTLRRRRVRPQASLTATHSSTLAEEVVALLRTLHSLTQWNGLINKYINSQLRSITHSFVGRPSEGAQLEDYFPDSENPEVGGLMAVLAVIGGIDGRLRLGGQVMHDEFGEGTVTRITPKGKITVQFSDMRTCRVCPLNQLKPLPAVAFNVNNLPFTEPMLSVWAQLVNLAGSKLEKHKIKKSTKQAFAGQVDLDLLRCQQLKLYILKAGRALLSHQDKLRQILSQPAVQETGTVHTDDGAVVSPDLGDMSPEGPQPPMILLQQLLASATQPSPVKAIFDKQELEAAALAVCQCLAVESTHPSSPGFEDCSSSEATTPVAVQHIRPARVKRRKQSPVPALPIVVQLMEMGFSRRNIEFALKSLTGASGNASSLPGVEALVGWLLDHSDIQVTELSDADTVSDEYSDEEVVEDVDDAAYSMSTGAVVTESQTYKKRADFLSNDDYAVYVRENIQVGMMVRCCRAYEEVCEGDVGKVIKLDRDGLHDLNVQCDWQQKGGTYWVRYIHVELIGYPPPSSSSHIKIGDKVRVKASVTTPKYKWGSVTHQSVGVVKAFSANGKDIIVDFPQQSHWTGLLSEMELVPSIHPGVTCDGCQMFPINGSRFKCRNCDDFDFCETCFKTKKHNTRHTFGRINEPGQSAVFCGRSGKQLKRCHSSQPGMLLDSWSRMVKSLNVSSSVNQASRLIDGSEPCWQSSGSQGKHWIRLEIFPDVLVHRLKMIVDPADSSYMPSLVVVSGGNSLNNLIELKTININPSDTTVPLLNDCTEYHRYIEIAIKQCRSSGIDCKIHGLILLGRIRAEEEDLAAVPFLASDNEEEEDEKGNSGSLIRKKAAGLESAATIRTKVFVWGLNDKDQLGGLKGSKIKVPSFSETLSALNVVQVAGGSKSLFAVTVEGKVYACGEATNGRLGLGISSGTVPIPRQITALSSYVVKKVAVHSGGRHATALTVDGKVFSWGEGDDGKLGHFSRMNCDKPRLIEALKTKRIRDIACGSSHSAALTSSGELYTWGLGEYGRLGHGDNTTQLKPKMVKVLLGHRVIQVACGSRDAQTLALTDEGLVFSWGDGDFGKLGRGGSEGCNIPQNIERLNGQGVCQIECGAQFSLALTKSGVVWTWGKGDYFRLGHGSDVHVRKPQVVEGLRGKKIVHVAVGALHCLAVTDSGQVYAWGDNDHGQQGNGTTTVNRKPTLVQGLEGQKITRVACGSSHSVAWTTVDVATPSVHEPVLFQTARDPLGASYLGVPSDADSSAASNKISGASNSKPNRPSLAKILLSLDGNLAKQQALSHILTALQIMYARDAVVGALMPAAMIAPVECPSFSSAAPSDASAMASPMNGEECMLAVDIEDRLSPNPWQEKREIVSSEDAVTPSAVTPSAPSASARPFIPVTDDLGAASIIAETMTKTKEDVESQNKAAGPEPQALDEFTSLLIADDTRVVVDLLKLSVCSRAGDRGRDVLSAVLSGMGTAYPQVADMLLELCVTELEDVATDSQSGRLSSQPVVVESSHPYTDDTSTSGTVKIPGAEGLRVEFDRQCSTERRHDPLTVMDGVNRIVSVRSGREWSDWSSELRIPGDELKWKFISDGSVNGWGWRFTVYPIMPAAGPKELLSDRCVLSCPSMDLVTCLLDFRLNLASNRSIVPRLAASLAACAQLSALAASHRMWALQRLRKLLTTEFGQSININRLLGENDGETRALSFTGSALAALVKGLPEALQRQFEYEDPIVRGGKQLLHSPFFKVLVALACDLELDTLPCCAETHKWAWFRRYCMASRVAVALDKRTPLPRLFLDEVAKKIRELMADSENMDVLHESHDIFKREQDEQLVQWMNRRPDDWTLSAGGSGTIYGWGHNHRGQLGGIEGAKVKVPTPCEALATLRPVQLIGGEQTLFA
VTADGKLYATGYGAGGRLGIGGTESVSTPTLLESIQHVFIKKVAVNSGGKHCLALSSEGEVYSWGEAEDGKLGHGNRSPCDRPRVIESLRGIEVVDVAAGGAHSACVTAAGDLYTWGKGRYGRLGHSDSEDQLKPKLVEALQGHRVVDIACGSGDAQTLCLTDDDTVWSWGDGDYGKLGRGGSDGCKVPMKIDSLTGLGVVKVECGSQFSVALTKSGAVYTWGKGDYHRLGHGSDDHVRRPRQVQGLQGKKVIAIATGSLHCVCCTEDGEVYTWGDNDEGQLGDGTTNAIQRPRLVAALQGKKVNRVACGSAHTLAWSTSKPASAGKLPAQVPMEYNHLQEIPIIALRNRLLLLHHLSELFCPCIPMFDLEGSLDETGLGPSVGFDTLRGILISQGKEAAFRKVVQATMVRDRQHGPVVELNRIQVKRSRSKGGLAGPDGTKSVFGQMCAKMSSFGPDSLLLPHRVWKVKFVGESVDDCGGGYSESIAEICEELQNGLTPLLIVTPNGRDESGANRDCYLLSPAARAPVHSSMFRFLGVLLGIAIRTGSPLSLNLAEPVWKQLAGMSLTIADLSEVDKDFIPGLMYIRDNEATSEEFEAMSLPFTVPSASGQDIQLSSKHTHITLDNRAEYVRLAINYRLHEFDEQVAAVREGMARVVPVPLLSLFTGYELETMVCGSPDIPLHLLKSVATYKGIEPSASLIQWFWEVMESFSNTERSLFLRFVWGRTRLPRTIADFRGRDFVIQVLDKYNPPDHFLPESYTCFFLLKLPRYSCKQVLEEKLKYAIHFCKSIDTDDYARIALTGEPAADDSSDDSDNEDVDSFASDSTQDYLTGH ->sp|Q6ZN18|AEBP2_HUMAN Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2 PE=1 SV=2 -MAAAITDMADLEELSRLSPLPPGSPGSAARGRAEPPEEEEEEEEEEEEAEAEAVAALLLNGGSGGGGGGGGGGVGGGEAETMSEPSPESASQAGEDEDEEEDDEEEEDESSSSGGGEEESSAESLVGSSGGSSSDETRSLSPGAASSSSGDGDGKEGLEEPKGPRGSQGGGGGGSSSSSVVSSGGDEGYGTGGGGSSATSGGRRGSLEMSSDGEPLSRMDSEDSISSTIMDVDSTISSGRSTPAMMNGQGSTTSSSKNIAYNCCWDQCQACFNSSPDLADHIRSIHVDGQRGGVFVCLWKGCKVYNTPSTSQSWLQRHMLTHSGDKPFKCVVGGCNASFASQGGLARHVPTHFSQQNSSKVSSQPKAKEESPSKAGMNKRRKLKNKRRRSLPRPHDFFDAQTLDAIRHRAICFNLSAHIESLGKGHSVVFHSTVIAKRKEDSGKIKLLLHWMPEDILPDVWVNESERHQLKTKVVHLSKLPKDTALLLDPNIYRTMPQKRLKRTLIRKVFNLYLSKQ ->sp|Q6ZN18-2|AEBP2_HUMAN Isoform 2 of Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2 
-MAAAITDMADLEELSRLSPLPPGSPGSAARGRAEPPEEEEEEEEEEEEAEAEAVAALLLNGGSGGGGGGGGGGVGGGEAETMSEPSPESASQAGEDEDEEEDDEEEEDESSSSGGGEEESSAESLVGSSGGSSSDETRSLSPGAASSSSGDGDGKEGLEEPKGPRGSQGGGGGGSSSSSVVSSGGDEGYGTGGGGSSATSGGRRGSLEMSSDGEPLSRMDSEDSISSTIMDVDSTISSGRSTPAMMNGQGSTTSSSKNIAYNCCWDQCQACFNSSPDLADHIRSIHVDGQRGGVFVCLWKGCKVYNTPSTSQSWLQRHMLTHSGDKPFKCVVGGCNASFASQGGLARHVPTHFSQQNSSKVSSQPKAKEESPSKAGMNKRRKLKNKRRRSLPRPHDFFDAQTLDAIRHRAICFNLSAHIESLGKGHSVVFHSTVIAKRKEDSGKIKLLLHWMPEDILPDVWVNESERHQLKTKVVHLSKLPKDTALLLDPNIYRTMPQKRLKR ->sp|Q6ZN18-3|AEBP2_HUMAN Isoform 3 of Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2 -MYTRRYSSISSTIMDVDSTISSGRSTPAMMNGQGSTTSSSKNIAYNCCWDQCQACFNSSPDLADHIRSIHVDGQRGGVFVCLWKGCKVYNTPSTSQSWLQRHMLTHSGDKPFKCVVGGCNASFASQGGLARHVPTHFSQQNSSKVSSQPKAKEESPSKAGMNKRRKLKNKRRRSLPRPHDFFDAQTLDAIRHRAICFNLSAHIESLGKGHSVVFHSTVIAKRKEDSGKIKLLLHWMPEDILPDVWVNESERHQLKTKVVHLSKLPKDTALLLDPNIYRTMPQKRLKRTLIRKVFNLYLSKQ ->sp|O15083|ERC2_HUMAN ERC protein 2 OS=Homo sapiens OX=9606 GN=ERC2 PE=1 SV=3 -MYGSARTITNLEGSPSRSPRLPRSPRLGHRRTSSGGGGGTGKTLSMENIQSLNAAYATSGPMYLSDHEGVASTTYPKGTMTLGRATNRAVYGGRVTAMGSSPNIASAGLSHTDVLSYTDQHGGLTGSSHHHHHQVPSMLRQVRDSTMLDLQAQLKELQRENDLLRKELDIKDSKLGSSMNSIKTFWSPELKKERVLRKEEAARMSVLKEQMRVSHEENQHLQLTIQALQDELRTQRDLNHLLQQESGNRGAEHFTIELTEENFRRLQAEHDRQAKELFLLRKTLEEMELRIETQKQTLNARDESIKKLLEMLQSKGLPSKSLEDDNERTRRMAEAESQVSHLEVILDQKEKENIHLREELHRRSQLQPEPAKTKALQTVIEMKDTKIASLERNIRDLEDEIQMLKANGVLNTEDREEEIKQIEVYKSHSKFMKTKIDQLKQELSKKESELLALQTKLETLSNQNSDCKQHIEVLKESLTAKEQRAAILQTEVDALRLRLEEKESFLNKKTKQLQDLTEEKGTLAGEIRDMKDMLEVKERKINVLQKKIENLQEQLRDKDKQLTNLKDRVKSLQTDSSNTDTALATLEEALSEKERIIERLKEQRERDDRERLEEIESFRKENKDLKEKVNALQAELTEKESSLIDLKEHASSLASAGLKRDSKLKSLEIAIEQKKEECSKLEAQLKKAHNIEDDSRMNPEFADQIKQLDKEASYYRDECGKAQAEVDRLLEILKEVENEKNDKDKKIAELESLTLRHMKDQNKKVANLKHNQQLEKKKNAQLLEEVRRREDSMADNSQHLQIEELMNALEKTRQELDATKARLASTQQSLAEKEAHLANLRIERRKQLEEILEMKQEALLAAISEKDANIALLELSASKKKKTQEEVMALKREKDRLVHQLKQQTQNRMKLMADNYDDDHHHYHHHHHHHHHRSPGRSQHSNHRPSPDQDDEEGIWA ->sp|P23763|VAMP1_HUMAN_Vesicle-associated membrane 
protein 1 OS=Homo sapiens OX=9606 GN=VAMP1 PE=1 SV=1 -MSAPAQPPAEGTEGTAPGGGPPGPPPNMTSNRRLQQTQAQVEEVVDIIRVNVDKVLERDQKLSELDDRADALQAGASQFESSAAKLKRKYWWKNCKMMIMLGAICAIIVVVIVIYFFT ->sp|P23763-3|VAMP1_HUMAN_Isoform 2 of Vesicle-associated membrane protein 1 OS=Homo sapiens OX=9606 GN=VAMP1 -MSAPAQPPAEGTEGTAPGGGPPGPPPNMTSNRRLQQTQAQVEEVVDIIRVNVDKVLERDQKLSELDDRADALQAGASQFESSAAKLKRKYWWKNCKMMIMLGAICAIIVVVIVSKYR ->sp|P23763-2|VAMP1_HUMAN_Isoform 3 of Vesicle-associated membrane protein 1 OS=Homo sapiens OX=9606 GN=VAMP1 -MSAPAQPPAEGTEGTAPGGGPPGPPPNMTSNRRLQQTQAQVEEVVDIIRVNVDKVLERDQKLSELDDRADALQAGASQFESSAAKLKRKYWWKNCKMMIMLGAICAIIVVVIVRRD ->sp|Q15836|VAMP3_HUMAN_Vesicle-associated membrane protein 3 OS=Homo sapiens OX=9606 GN=VAMP3 PE=1 SV=3 -MSTGPTAATGSNRRLQQTQNQVDEVVDIMRVNVDKVLERDQKLSELDDRADALQAGASQFETSAAKLKRKYWWKNCKMWAIGITVLVIFIIIIIVWVVSS ->sp|P63027|VAMP2_HUMAN_Vesicle-associated membrane protein 2 OS=Homo sapiens OX=9606 GN=VAMP2 PE=1 SV=3 -MSATAATAPPAAPAGEGGPPAPPPNLTSNRRLQQTQAQVDEVVDIMRVNVDKVLERDQKLSELDDRADALQAGASQFETSAAKLKRKYWWKNLKMMIILGVICAIILIIIIVYFST ->sp|O75379|VAMP4_HUMAN_Vesicle-associated membrane protein 4 OS=Homo sapiens OX=9606 GN=VAMP4 PE=1 SV=2 -MPPKFKRHLNDDDVTGSVKSERRNLLEDDSDEEEDFFLRGPSGPRFGPRNDKIKHVQNQVDEVIDVMQENITKVIERGERLDELQDKSESLSDNATAFSNRSKQLRRQMWWRGCKIKAIMALVAAILLLVIIILIVMKYRT ->sp|O75379-2|VAMP4_HUMAN_Isoform 2 of Vesicle-associated membrane protein 4 OS=Homo sapiens OX=9606 GN=VAMP4 -MPPKFKRHLNDDDVTGSVKSERRNLLEDDSDEEEDFFLGPSGPRFGPRNDKIKHVQNQVDEVIDVMQENITKVIERGERLDELQDKSESLSDNATAFSNRSKQLRRQMWWRGCKIKAIMALVAAILLLVIIILIVMKYRT ->sp|O95183|VAMP5_HUMAN_Vesicle-associated membrane protein 5 OS=Homo sapiens OX=9606 GN=VAMP5 PE=1 SV=1 -MAGIELERCQQQANEVTEIMRNNFGKVLERGVKLAELQQRSDQLLDMSSTFNKTTQNLAQKKCWENIRYRICVGLVVVGVLLIILIVLLVVFLPQSSDSSSAPRTQDAGIASGPGN ->sp|P51809|VAMP7_HUMAN_Vesicle-associated membrane protein 7 OS=Homo sapiens OX=9606 GN=VAMP7 PE=1 SV=3 
-MAILFAVVARGTTILAKHAWCGGNFLEVTEQILAKIPSENNKLTYSHGNYLFHYICQDRIVYLCITDDDFERSRAFNFLNEIKKRFQTTYGSRAQTALPYAMNSEFSSVLAAQLKHHSENKGLDKVMETQAQVDELKGIMVRNIDLVAQRGERLELLIDKTENLVDSSVTFKTTSRNLARAMCMKNLKLTIIIIIVSIVFIYIIVSPLCGGFTWPSCVKK ->sp|P51809-2|VAMP7_HUMAN_Isoform 2 of Vesicle-associated membrane protein 7 OS=Homo sapiens OX=9606 GN=VAMP7 -MAILFAVVARGTTILAKHAWCGGNFLEVTEQILAKIPSENNKLTYSHGNYLFHYICQDRIVYLCITDDDFERSRAFNFLNEIKKRFQTTYGSRAQTALPYAMNSEFSSVLAAQLKHHSENKGLDKVMETQAQVDELKGIMVRNIVCHLQNYQQKSCSSHVYEEPQAHYYHHHRINCVHLYHCFTSLWWIYMAKLCEEIGKKKLPLTKDMREQGVKSNPCDSSLSHTDRWYLPVSSTLFSLFKILFHASRFIFVLSTSLFL ->sp|P51809-3|VAMP7_HUMAN_Isoform 3 of Vesicle-associated membrane protein 7 OS=Homo sapiens OX=9606 GN=VAMP7 -MAILFAVVARGTTILAKHAWCGGNFLEDFERSRAFNFLNEIKKRFQTTYGSRAQTALPYAMNSEFSSVLAAQLKHHSENKGLDKVMETQAQVDELKGIMVRNIDLVAQRGERLELLIDKTENLVDSSVTFKTTSRNLARAMCMKNLKLTIIIIIVSIVFIYIIVSPLCGGFTWPSCVKK ->sp|Q9BV40|VAMP8_HUMAN_Vesicle-associated membrane protein 8 OS=Homo sapiens OX=9606 GN=VAMP8 PE=1 SV=1 -MEEASEGGGNDRVRNLQSEVEGVKNIMTQNVERILARGENLEHLRNKTEDLEATSEHFKTTSQKVARKFWWKNVKMIVLICVIVFIIILFIVLFATGAFS
--- a/workflow/ppenrich_suite_wf.ga Tue Mar 15 12:44:40 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,678 +0,0 @@ -{ - "a_galaxy_workflow": "true", - "annotation": "phoshpoproteomic enrichment data pre-processing and ANOVA", - "creator": [ - { - "class": "Person", - "identifier": "0000-0002-2882-0508", - "name": "Art Eschenlauer" - } - ], - "format-version": "0.1", - "license": "MIT", - "name": "ppenrich_suite_wf", - "steps": { - "0": { - "annotation": "The Phospho (STY)Sites.txt file produced by MaxQuant (found in the txt folder).", - "content_id": null, - "errors": null, - "id": 0, - "input_connections": {}, - "inputs": [ - { - "description": "The Phospho (STY)Sites.txt file produced by MaxQuant (found in the txt folder).", - "name": "Phospho (STY)Sites.txt" - } - ], - "label": "Phospho (STY)Sites.txt", - "name": "Input dataset", - "outputs": [], - "position": { - "bottom": -36.30000305175781, - "height": 82.19999694824219, - "left": 150, - "right": 350, - "top": -118.5, - "width": 200, - "x": 150, - "y": -118.5 - }, - "tool_id": null, - "tool_state": "{\"optional\": false, \"format\": [\"tabular\"]}", - "tool_version": null, - "type": "data_input", - "uuid": "f4273d40-f2b8-4ad0-8bcc-91e72bd25fe1", - "workflow_outputs": [] - }, - "1": { - "annotation": "FASTA file of all human canonical isoforms, derived from Swiss-Prot (e.g., merge of https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot_varsplic.fasta.gz and https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz)", - "content_id": null, - "errors": null, - "id": 1, - "input_connections": {}, - "inputs": [ - { - "description": "FASTA file of all human canonical isoforms, derived from Swiss-Prot (e.g., merge of https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot_varsplic.fasta.gz and 
https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz)", - "name": "SwissProt_Human_Canonical_Isoform.fasta" - } - ], - "label": "SwissProt_Human_Canonical_Isoform.fasta", - "name": "Input dataset", - "outputs": [], - "position": { - "bottom": 278.1000061035156, - "height": 102.60000610351562, - "left": 376, - "right": 576, - "top": 175.5, - "width": 200, - "x": 376, - "y": 175.5 - }, - "tool_id": null, - "tool_state": "{\"optional\": false, \"format\": [\"fasta\"]}", - "tool_version": null, - "type": "data_input", - "uuid": "cb31b0ac-cacc-42ee-bd42-f42d0bdae128", - "workflow_outputs": [] - }, - "2": { - "annotation": "Derived from https://networkin.info/download/networkin_human_predictions_3.1.tsv.xz (which is free for non-commercial use - for required citation, see https://networkin.info/)", - "content_id": null, - "errors": null, - "id": 2, - "input_connections": {}, - "inputs": [ - { - "description": "Derived from https://networkin.info/download/networkin_human_predictions_3.1.tsv.xz (which is free for non-commercial use - for required citation, see https://networkin.info/)", - "name": "NetworKIN_cutoffscore2.0.tabular" - } - ], - "label": "NetworKIN_cutoffscore2.0.tabular", - "name": "Input dataset", - "outputs": [], - "position": { - "bottom": 423.1000061035156, - "height": 102.60000610351562, - "left": 387, - "right": 587, - "top": 320.5, - "width": 200, - "x": 387, - "y": 320.5 - }, - "tool_id": null, - "tool_state": "{\"optional\": false, \"format\": [\"tabular\"]}", - "tool_version": null, - "type": "data_input", - "uuid": "e6ec01b8-ff1a-4c90-a064-b40c5cad75bb", - "workflow_outputs": [] - }, - "3": { - "annotation": "Derived from http://hprd.org/serine_motifs, http://hprd.org/tyrosine_motifs, and http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx", - "content_id": null, - "errors": null, - "id": 3, - "input_connections": {}, - "inputs": [ - { - "description": "Derived from 
http://hprd.org/serine_motifs, http://hprd.org/tyrosine_motifs, and http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx", - "name": "pSTY_Motifs.tabular" - } - ], - "label": "pSTY_Motifs.tabular", - "name": "Input dataset", - "outputs": [], - "position": { - "bottom": 546.6999969482422, - "height": 82.19999694824219, - "left": 399, - "right": 599, - "top": 464.5, - "width": 200, - "x": 399, - "y": 464.5 - }, - "tool_id": null, - "tool_state": "{\"optional\": false, \"format\": [\"tabular\"]}", - "tool_version": null, - "type": "data_input", - "uuid": "2c59056a-c1b4-4a20-a194-991d56c8b6c2", - "workflow_outputs": [] - }, - "4": { - "annotation": "Derived from Kinase_Substrate_Dataset.gz found at https://www.phosphosite.org/staticDownloads (free for non-commercial use - see that link for citation.)", - "content_id": null, - "errors": null, - "id": 4, - "input_connections": {}, - "inputs": [ - { - "description": "Derived from Kinase_Substrate_Dataset.gz found at https://www.phosphosite.org/staticDownloads (free for non-commercial use - see that link for citation.)", - "name": "PSP_Kinase_Substrate_Dataset.tabular" - } - ], - "label": "PSP_Kinase_Substrate_Dataset.tabular", - "name": "Input dataset", - "outputs": [], - "position": { - "bottom": 696.1000061035156, - "height": 102.60000610351562, - "left": 420, - "right": 620, - "top": 593.5, - "width": 200, - "x": 420, - "y": 593.5 - }, - "tool_id": null, - "tool_state": "{\"optional\": false, \"format\": [\"tabular\"]}", - "tool_version": null, - "type": "data_input", - "uuid": "987a5891-15f1-4f70-89a8-386447f0bf24", - "workflow_outputs": [] - }, - "5": { - "annotation": "Derived from Regulatory_sites.gz found at https://www.phosphosite.org/staticDownloads (free for non-commercial use - see that link for citation.)", - "content_id": null, - "errors": null, - "id": 5, - "input_connections": {}, - "inputs": [ - { - "description": "Derived from Regulatory_sites.gz found at https://www.phosphosite.org/staticDownloads 
(free for non-commercial use - see that link for citation.)", - "name": "PSP_Regulatory_sites.tabular" - } - ], - "label": "PSP_Regulatory_sites.tabular", - "name": "Input dataset", - "outputs": [], - "position": { - "bottom": 820.6999969482422, - "height": 82.19999694824219, - "left": 436, - "right": 636, - "top": 738.5, - "width": 200, - "x": 436, - "y": 738.5 - }, - "tool_id": null, - "tool_state": "{\"optional\": false, \"format\": [\"tabular\"]}", - "tool_version": null, - "type": "data_input", - "uuid": "964d8d21-b063-411a-aee8-372a0d0dfba3", - "workflow_outputs": [] - }, - "6": { - "annotation": "List of alpha cutoff values for significance testing; text file having no header and a single line for each cutoff value.", - "content_id": null, - "errors": null, - "id": 6, - "input_connections": {}, - "inputs": [ - { - "description": "List of alpha cutoff values for significance testing; text file having no header and a single line for each cutoff value.", - "name": "alpha_levels.tabular" - } - ], - "label": "alpha_levels.tabular", - "name": "Input dataset", - "outputs": [], - "position": { - "bottom": 1071.1999969482422, - "height": 82.19999694824219, - "left": 418, - "right": 618, - "top": 989, - "width": 200, - "x": 418, - "y": 989 - }, - "tool_id": null, - "tool_state": "{\"optional\": false, \"format\": [\"tabular\"]}", - "tool_version": null, - "type": "data_input", - "uuid": "42577db7-d5e5-4f39-b3ad-d0648abb9df3", - "workflow_outputs": [] - }, - "7": { - "annotation": "", - "content_id": "mqppep_preproc", - "errors": null, - "id": 7, - "input_connections": { - "networkin": { - "id": 2, - "output_name": "output" - }, - "p_sty_motifs": { - "id": 3, - "output_name": "output" - }, - "phosphoSites": { - "id": 0, - "output_name": "output" - }, - "protein_fasta": { - "id": 1, - "output_name": "output" - }, - "psp_kinase_substrate": { - "id": 4, - "output_name": "output" - }, - "psp_regulatory_sites": { - "id": 5, - "output_name": "output" - } - }, - "inputs": [ - 
{ - "description": "runtime parameter for tool MaxQuant Phosphopeptide Preprocessing", - "name": "networkin" - }, - { - "description": "runtime parameter for tool MaxQuant Phosphopeptide Preprocessing", - "name": "p_sty_motifs" - }, - { - "description": "runtime parameter for tool MaxQuant Phosphopeptide Preprocessing", - "name": "phosphoSites" - }, - { - "description": "runtime parameter for tool MaxQuant Phosphopeptide Preprocessing", - "name": "protein_fasta" - }, - { - "description": "runtime parameter for tool MaxQuant Phosphopeptide Preprocessing", - "name": "psp_kinase_substrate" - }, - { - "description": "runtime parameter for tool MaxQuant Phosphopeptide Preprocessing", - "name": "psp_regulatory_sites" - } - ], - "label": null, - "name": "MaxQuant Phosphopeptide Preprocessing", - "outputs": [ - { - "name": "phosphoPepIntensities", - "type": "tabular" - }, - { - "name": "enrichGraph", - "type": "pdf" - }, - { - "name": "locProbCutoffGraph", - "type": "pdf" - }, - { - "name": "enrichGraph_svg", - "type": "svg" - }, - { - "name": "locProbCutoffGraph_svg", - "type": "svg" - }, - { - "name": "filteredData_tabular", - "type": "tabular" - }, - { - "name": "quantData_tabular", - "type": "tabular" - }, - { - "name": "mapped_phophopeptides", - "type": "tabular" - }, - { - "name": "melted_phophopeptide_map", - "type": "tabular" - }, - { - "name": "mqppep_output_sqlite", - "type": "sqlite" - }, - { - "name": "preproc_tab", - "type": "tabular" - }, - { - "name": "preproc_csv", - "type": "csv" - }, - { - "name": "preproc_sqlite", - "type": "sqlite" - } - ], - "position": { - "bottom": 964.0999755859375, - "height": 793.5999755859375, - "left": 826.5, - "right": 1026.5, - "top": 170.5, - "width": 200, - "x": 826.5, - "y": 170.5 - }, - "post_job_actions": { - "RenameDatasetActionenrichGraph": { - "action_arguments": { - "newname": "#{phosphoSites}.enrichGraph_pdf" - }, - "action_type": "RenameDatasetAction", - "output_name": "enrichGraph" - }, - 
"RenameDatasetActionenrichGraph_svg": { - "action_arguments": { - "newname": "#{phosphoSites}.enrichGraph_svg" - }, - "action_type": "RenameDatasetAction", - "output_name": "enrichGraph_svg" - }, - "RenameDatasetActionfilteredData_tabular": { - "action_arguments": { - "newname": "#{phosphoSites}.filteredData" - }, - "action_type": "RenameDatasetAction", - "output_name": "filteredData_tabular" - }, - "RenameDatasetActionlocProbCutoffGraph": { - "action_arguments": { - "newname": "#{phosphoSites}.locProbCutoffGraph_pdf" - }, - "action_type": "RenameDatasetAction", - "output_name": "locProbCutoffGraph" - }, - "RenameDatasetActionlocProbCutoffGraph_svg": { - "action_arguments": { - "newname": "#{phosphoSites}.locProbCutoffGraph_svg" - }, - "action_type": "RenameDatasetAction", - "output_name": "locProbCutoffGraph_svg" - }, - "RenameDatasetActionmapped_phophopeptides": { - "action_arguments": { - "newname": "#{phosphoSites}.ppep_map" - }, - "action_type": "RenameDatasetAction", - "output_name": "mapped_phophopeptides" - }, - "RenameDatasetActionmelted_phophopeptide_map": { - "action_arguments": { - "newname": "#{phosphoSites}.melted" - }, - "action_type": "RenameDatasetAction", - "output_name": "melted_phophopeptide_map" - }, - "RenameDatasetActionmqppep_output_sqlite": { - "action_arguments": { - "newname": "#{phosphoSites}.ppep_mapping_sqlite" - }, - "action_type": "RenameDatasetAction", - "output_name": "mqppep_output_sqlite" - }, - "RenameDatasetActionphosphoPepIntensities": { - "action_arguments": { - "newname": "#{phosphoSites}.ppep_intensities" - }, - "action_type": "RenameDatasetAction", - "output_name": "phosphoPepIntensities" - }, - "RenameDatasetActionpreproc_csv": { - "action_arguments": { - "newname": "#{phosphoSites}.preproc_csv" - }, - "action_type": "RenameDatasetAction", - "output_name": "preproc_csv" - }, - "RenameDatasetActionpreproc_sqlite": { - "action_arguments": { - "newname": "#{phosphoSites}.preproc_sqlite" - }, - "action_type": 
"RenameDatasetAction", - "output_name": "preproc_sqlite" - }, - "RenameDatasetActionpreproc_tab": { - "action_arguments": { - "newname": "#{phosphoSites}.preproc_tab" - }, - "action_type": "RenameDatasetAction", - "output_name": "preproc_tab" - }, - "RenameDatasetActionquantData_tabular": { - "action_arguments": { - "newname": "#{phosphoSites}.quantData" - }, - "action_type": "RenameDatasetAction", - "output_name": "quantData_tabular" - } - }, - "tool_id": "mqppep_preproc", - "tool_state": "{\"collapseFunc\": \"sum\", \"intervalCol\": \"1\", \"localProbCutoff\": \"0.75\", \"merge_function\": \"sum\", \"networkin\": {\"__class__\": \"RuntimeValue\"}, \"p_sty_motifs\": {\"__class__\": \"RuntimeValue\"}, \"phosphoCol\": \"^Number of Phospho [(]STY[)]$\", \"phosphoSites\": {\"__class__\": \"RuntimeValue\"}, \"protein_fasta\": {\"__class__\": \"RuntimeValue\"}, \"psp_kinase_substrate\": {\"__class__\": \"RuntimeValue\"}, \"psp_regulatory_sites\": {\"__class__\": \"RuntimeValue\"}, \"pst_not_py\": \"true\", \"species\": \"human\", \"startCol\": \"^Intensity[^_]\", \"__page__\": null, \"__rerun_remap_job_id__\": null}", - "tool_version": null, - "type": "tool", - "uuid": "886043ce-8d9b-474e-b970-4fe9ee6a74fa", - "workflow_outputs": [ - { - "label": "ppep_intensities", - "output_name": "phosphoPepIntensities", - "uuid": "e19a64d1-edee-4119-a72e-456af7a6c056" - }, - { - "label": "enrichGraph_pdf", - "output_name": "enrichGraph", - "uuid": "7e9936d9-9617-4df4-9133-7a04f8d05d26" - }, - { - "label": "locProbCutoffGraph_pdf", - "output_name": "locProbCutoffGraph", - "uuid": "5656cba7-25e2-4362-ae92-1ddac67dee07" - }, - { - "label": "enrichGraph_svg", - "output_name": "enrichGraph_svg", - "uuid": "ca13a22e-a41b-481c-ab87-1f97bbf768e9" - }, - { - "label": "locProbCutoffGraph_svg", - "output_name": "locProbCutoffGraph_svg", - "uuid": "fc7a11f5-30d8-4409-878a-d3b70366711c" - }, - { - "label": "filteredData", - "output_name": "filteredData_tabular", - "uuid": 
"aab49fc5-a3cf-4479-ac23-8e9272dadf28" - }, - { - "label": "quantData", - "output_name": "quantData_tabular", - "uuid": "23940202-403e-4256-916b-92539db07cdb" - }, - { - "label": "ppep_map", - "output_name": "mapped_phophopeptides", - "uuid": "08ad13d4-c103-4f18-92cc-2c3b58565981" - }, - { - "label": "melted_phosphopeptide_map", - "output_name": "melted_phophopeptide_map", - "uuid": "77cecaeb-8f7c-482e-b78a-e4809b194eb7" - }, - { - "label": "ppep_mapping_sqlite", - "output_name": "mqppep_output_sqlite", - "uuid": "8e53e05a-a47c-4b97-87e4-ebab133ccaea" - }, - { - "label": "preproc_tab", - "output_name": "preproc_tab", - "uuid": "530a8140-9eba-4c87-a76b-4922febc12e7" - }, - { - "label": "preproc_csv", - "output_name": "preproc_csv", - "uuid": "c5f22f05-0bf7-48cf-adc0-c2beffe33169" - }, - { - "label": "preproc_sqlite", - "output_name": "preproc_sqlite", - "uuid": "53424150-7673-40af-ad60-0b4035e0c302" - } - ] - }, - "8": { - "annotation": "Perform ANOVA. For imputing missing values, use median of non-missing values from the same treatment group.", - "content_id": "mqppep_anova", - "errors": null, - "id": 8, - "input_connections": { - "alpha_file": { - "id": 6, - "output_name": "output" - }, - "input_file": { - "id": 7, - "output_name": "preproc_tab" - } - }, - "inputs": [], - "label": "MaxQuant Phosphopeptide ANOVA group-median imputed", - "name": "MaxQuant Phosphopeptide ANOVA", - "outputs": [ - { - "name": "imputed_data_file", - "type": "tabular" - }, - { - "name": "report_file", - "type": "html" - } - ], - "position": { - "bottom": 1349, - "height": 256, - "left": 1058, - "right": 1258, - "top": 1093, - "width": 200, - "x": 1058, - "y": 1093 - }, - "post_job_actions": { - "RenameDatasetActionimputed_data_file": { - "action_arguments": { - "newname": "#{input_file}.intensities_group-mean-imputed_QN_LT" - }, - "action_type": "RenameDatasetAction", - "output_name": "imputed_data_file" - }, - "RenameDatasetActionreport_file": { - "action_arguments": { - "newname": 
"#{input_file}.intensities_group-mean-imputed_report (download/unzip to view)" - }, - "action_type": "RenameDatasetAction", - "output_name": "report_file" - } - }, - "tool_id": "mqppep_anova", - "tool_state": "{\"alpha_file\": {\"__class__\": \"ConnectedValue\"}, \"first_data_column\": \"Intensity\", \"imputation\": {\"imputation_method\": \"group-median\", \"__current_case__\": 0}, \"input_file\": {\"__class__\": \"ConnectedValue\"}, \"sample_grouping_regex\": \"(\\\\d+)\", \"sample_names_regex\": \"\\\\.(\\\\d+)[A-Z]$\", \"__page__\": null, \"__rerun_remap_job_id__\": null}", - "tool_version": null, - "type": "tool", - "uuid": "a3cb902d-8ef6-4f84-bed3-80b2b20d1916", - "workflow_outputs": [ - { - "label": "intensities_group-mean-imputed_QN_LT", - "output_name": "imputed_data_file", - "uuid": "ef19dcd3-8f3e-4fc4-829e-dae6719ff1cc" - }, - { - "label": "intensities_group-mean-imputed_report", - "output_name": "report_file", - "uuid": "26bb93b0-bc11-4455-a280-241253b21981" - } - ] - }, - "9": { - "annotation": "Perform ANOVA. 
For imputing missing values, create random values.", - "content_id": "mqppep_anova", - "errors": null, - "id": 9, - "input_connections": { - "alpha_file": { - "id": 6, - "output_name": "output" - }, - "input_file": { - "id": 7, - "output_name": "preproc_tab" - } - }, - "inputs": [], - "label": "MaxQuant Phosphopeptide ANOVA randomly imputed", - "name": "MaxQuant Phosphopeptide ANOVA", - "outputs": [ - { - "name": "imputed_data_file", - "type": "tabular" - }, - { - "name": "report_file", - "type": "html" - } - ], - "position": { - "bottom": 1186, - "height": 256, - "left": 1308, - "right": 1508, - "top": 930, - "width": 200, - "x": 1308, - "y": 930 - }, - "post_job_actions": { - "RenameDatasetActionimputed_data_file": { - "action_arguments": { - "newname": "#{input_file}.intensities_randomly-imputed_QN_LT" - }, - "action_type": "RenameDatasetAction", - "output_name": "imputed_data_file" - }, - "RenameDatasetActionreport_file": { - "action_arguments": { - "newname": "#{input_file}.intensities_randomly-imputed_report (download/unzip to view)" - }, - "action_type": "RenameDatasetAction", - "output_name": "report_file" - } - }, - "tool_id": "mqppep_anova", - "tool_state": "{\"alpha_file\": {\"__class__\": \"ConnectedValue\"}, \"first_data_column\": \"Intensity\", \"imputation\": {\"imputation_method\": \"random\", \"__current_case__\": 3, \"meanPercentile\": \"1\", \"sdPercentile\": \"0.2\"}, \"input_file\": {\"__class__\": \"ConnectedValue\"}, \"sample_grouping_regex\": \"(\\\\d+)\", \"sample_names_regex\": \"\\\\.(\\\\d+)[A-Z]$\", \"__page__\": null, \"__rerun_remap_job_id__\": null}", - "tool_version": null, - "type": "tool", - "uuid": "217d92af-f6d6-4fd3-a78a-090d8afd3ae0", - "workflow_outputs": [ - { - "label": "intensities_randomly-imputed_QN_LT", - "output_name": "imputed_data_file", - "uuid": "925d734f-f9d8-49e8-aebb-c8d7598d45b2" - }, - { - "label": "intensities_randomly-imputed_report", - "output_name": "report_file", - "uuid": 
"4ab5f1b1-d04e-4634-8765-265122bc1064" - } - ] - } - }, - "tags": [ - "ppenrich" - ], - "uuid": "c54c2b2e-8080-445c-bc3e-43950c89d4e4", - "version": 3 -} \ No newline at end of file