Mercurial > repos > eschen42 > mqppep_anova
changeset 0:c1403d18c189 draft
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
author | eschen42 |
---|---|
date | Mon, 07 Mar 2022 19:05:01 +0000 |
parents | |
children | 5ccf4e985c6a |
files | MaxQuantProcessingScript.R PhosphoPeptide_Upstream_Kinase_Mapping.pl macros.xml mqppep_anova.R mqppep_anova.xml mqppep_anova_script.Rmd mqppep_mrgfltr.py search_ppep.py test-data/alpha_levels.tabular test-data/pSTY_motifs.tabular test-data/test_input_for_anova.tabular test-data/test_input_for_preproc.tabular test-data/test_kinase_substrate.tabular test-data/test_networkin.tabular test-data/test_regulatory_sites.tabular test-data/test_swissprot.fasta workflow/ppenrich_suite_wf.ga |
diffstat | 17 files changed, 6568 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env Rscript

# File: MaxQuantProcessingScript.R
#
# This is the implementation for the
#   "MaxQuant Phosphopeptide Localization Probability Cutoff"
#   Galaxy tool (mqppep_lclztn_filter)
# It is adapted from the MaxQuant Processing Script written by Larry Cheng.

# libraries
library(optparse)
library(data.table)
library(stringr)
library(ggplot2)
#library(PTXQC)
#require(PTXQC)
#require(methods)

# title: "MaxQuant Processing Script"
# author: "Larry Cheng"
# date: "February 19, 2018"
#
# # MaxQuant Processing Script
# Takes MaxQuant Phospho (STY)sites.txt file as input and performs the
# following (in order):
# 1) Runs the Proteomics Quality Control software
# 2) Remove contaminant and reverse sequence rows
# 3) Filters rows based on localization probability
# 4) Extract the quantitative data
# 5) Sequences phosphopeptides
# 6) Merges multiply phosphorylated peptides
# 7) Filters out phosphopeptides based on enrichment
# The output file contains the phosphopeptide (first column) and the
# quantitative values for each sample
#
# ## Revision History
# Rev. 2022-02-10 :wrap for inclusion in Galaxy
# Rev. 2018-02-19 :break up analysis script into "MaxQuant Processing Script"
#                  and "Phosphopeptide Processing Script"
# Rev. 2017-12-12 :added PTXQC
#                  added additional plots and table outputs for quality control
#                  allowed for more than 2 samples to be grouped together
#                  (up to 26 (eg, 1A, 1B, 1C, etc)):
#                  regexSampleNames <- "\\.(\\d+)[A-Z]$"
#                  converted from .r to .rmd file to knit report for quality
#                  control
# Rev. 2016-09-11 :automated the FDR cutoffs; removed the option to data impute
#                  multiple times
# Rev. 2016-09-09 :added filter to eliminate contaminant and reverse sequence
#                  rows
# Rev. 2016-09-01 :moved the collapse step from after ANOVA filter to prior to
#                  preANOVA file output
# Rev. 2016-08-22 :changed regexpression to
#                  regexSampleNames <- "\\.(\\d+)[AB]$"
#                  so that it looks at the end of string
# Rev. 2016-08-05 :Removed vestigial line (ppeptides <- ....)
# Rev. 2016-07-03 :Removed row names from the write.table() output for ANOVA
#                  and PreANOVA
# Rev. 2016-06-25 :Set default Localization Probability cutoff to 0.75
# Rev. 2016-06-23 :fixed a bug in filtering for pY enrichment by resetting the
#                  row numbers afterwards
# Rev. 2016-06-21 :test18 + standardized the regexpression in protocol


### FUNCTION DECLARATIONS begin ----------------------------------------------

# Read the first line of the file at `filepath`.
# adapted from: https://stackoverflow.com/a/35761217/15509512
# - filepath: path to a readable text file
# Returns a character vector holding the first line (empty if the file is
# empty).  The connection is closed even when reading fails.
readFirstLine <- function(filepath) {
  con <- file(filepath, "r")
  on.exit(close(con), add = TRUE)
  readLines(con, n = 1)
}

# Move columns to the end of a dataframe.
# - data: the dataframe
# - move: a vector of column names, each of which is an element of names(data)
# Returns `data` with the `move` columns placed last, in the order given.
movetolast <- function(data, move) {
  data[c(setdiff(names(data), move), move)]
}

# Generate the phosphopeptide sequence for one row of data.
# - df: one row (as produced by `apply(..., MARGIN = 1)`), indexable by the
#       name "Phospho (STY) Score diffs"; that value is a peptide sequence in
#       which candidate sites are followed by a parenthesized score
#       difference, e.g. "AS(12.5)T(-3.1)K"
# Returns the peptide sequence with "p" inserted before every residue whose
# score difference is greater than zero, e.g. "ApSTK".
phosphopeptide_func <- function(df) {
  score_chars <- c("-", ".", as.character(0:9))
  non_residue_chars <- c(" ", "(", ")", score_chars)

  symbols <-
    strsplit(as.character(df["Phospho (STY) Score diffs"]), "")[[1]]
  phosphopeptide <- ""
  residue_count <- 0 # position of the most recent residue in the peptide
  phosphopositions <- c() # residue positions that are phosphorylated
  score_diff <- ""
  for (chara in symbols) {
    # build peptide sequence
    if (!(chara %in% non_residue_chars)) {
      phosphopeptide <- paste0(phosphopeptide, chara)
      residue_count <- residue_count + 1
    }
    # accumulate the text of the score difference
    if (chara %in% score_chars) {
      score_diff <- paste0(score_diff, chara)
    }
    # evaluate the score difference once its parenthesis closes
    if (chara == ")") {
      score_value <- as.numeric(score_diff)
      # only consider a phosphoresidue if score_diff > 0; an unparseable
      # score (NA) is treated as "not phosphorylated" rather than aborting
      if (!is.na(score_value) && score_value > 0) {
        phosphopositions <- c(phosphopositions, residue_count)
      }
      score_diff <- ""
    }
  }

  # Insert a "p" before each phosphorylated residue.  Every insertion shifts
  # the remaining residues right by one, hence the `k - 1` correction.
  for (k in seq_along(phosphopositions)) {
    at <- phosphopositions[k] + k - 1
    phosphopeptide <- paste0(
      substr(phosphopeptide, 1, at - 1),
      "p",
      substr(phosphopeptide, at, nchar(phosphopeptide))
    )
  }

  phosphopeptide
}

# Fetch a required command-line argument or stop with a clear message.
# `[[` is used because `$` on a list partially matches names
# (e.g. `args$enrichGraph` could silently pick up `enrichGraph_svg`).
required_arg <- function(args, name) {
  value <- args[[name]]
  if (is.null(value)) {
    stop("Missing required argument: --", name, call. = FALSE)
  }
  value
}

# Return the first line of the file at `path`, or `default` when no path was
# supplied on the command line.
pattern_from_file <- function(path, default) {
  if (is.null(path)) {
    return(default)
  }
  readFirstLine(path)
}

# Render `plot` to both a PDF file and an SVG file.
save_plot <- function(plot, pdf_path, svg_path) {
  pdf(pdf_path)
  print(plot)
  dev.off()
  svg(svg_path)
  print(plot)
  dev.off()
  invisible(plot)
}

### FUNCTION DECLARATIONS end ------------------------------------------------


### EXTRACT ARGUMENTS begin --------------------------------------------------

# parse options
option_list <- list(
  make_option(
    c("-i", "--input"),
    action = "store",
    type = "character",
    help = "A MaxQuant Phospho (STY)Sites.txt"
  ),
  make_option(
    c("-o", "--output"),
    action = "store",
    type = "character",
    help = "path to output file"
  ),
  make_option(
    c("-E", "--enrichGraph"),
    action = "store",
    type = "character",
    help = "path to enrichment graph PDF"
  ),
  make_option(
    c("-F", "--enrichGraph_svg"),
    action = "store",
    type = "character",
    help = "path to enrichment graph SVG"
  ),
  make_option(
    c("-L", "--locProbCutoffGraph"),
    action = "store",
    type = "character",
    help = "path to location-probability cutoff graph PDF"
  ),
  make_option(
    c("-M", "--locProbCutoffGraph_svg"),
    action = "store",
    type = "character",
    help = "path to location-probability cutoff graph SVG"
  ),
  make_option(
    c("-e", "--enriched"),
    action = "store",
    type = "character",
    help = "pY or pST enriched samples (ie, 'Y' or 'ST')"
  ),
  # The pattern itself is read from the first line of the named file;
  # when omitted, "^Number of Phospho [(][STY][STY]*[)]$" is used.
  make_option(
    c("-p", "--phosphoCol"),
    action = "store",
    type = "character",
    help = paste(
      "path to file whose first line is a PERL-compatible regular",
      "expression matching header of column having number of",
      "'Phospho (STY)'"
    )
  ),
  # The pattern itself is read from the first line of the named file;
  # when omitted, "^Intensity[^_]" is used.
  make_option(
    c("-s", "--startCol"),
    action = "store",
    type = "character",
    help = paste(
      "path to file whose first line is a PERL-compatible regular",
      "expression matching column header having first sample intensity"
    )
  ),
  make_option(
    c("-I", "--intervalCol"),
    action = "store",
    type = "integer",
    default = 1L,
    help = paste(
      "Column interval between the Intensities of samples",
      "(eg, 1 if subsequent column; 2 if every other column)"
    )
  ),
  make_option(
    c("-l", "--localProbCutoff"),
    action = "store",
    type = "double",
    default = 0.75,
    help = "Localization Probability Cutoff"
  ),
  make_option(
    c("-f", "--collapse_func"),
    action = "store",
    type = "character",
    default = "sum",
    help = paste(
      "merge identical phosphopeptides by ('sum' or 'average')",
      "the intensities"
    )
  ),
  make_option(
    c("-r", "--filtered_data"),
    action = "store",
    type = "character",
    default = "filteredData.txt",
    help = "filteredData.txt"
  ),
  make_option(
    c("-q", "--quant_data"),
    action = "store",
    type = "character",
    default = "quantData.txt",
    help = "quantData.txt"
  )
)
args <- parse_args(OptionParser(option_list = option_list))

### EXTRACT ARGUMENTS end ----------------------------------------------------


### EXTRACT PARAMETERS from arguments begin ----------------------------------

inputFilename <- required_arg(args, "input")
if (!file.exists(inputFilename)) {
  stop(paste("File", inputFilename, "does not exist"), call. = FALSE)
}

outputfilename <- required_arg(args, "output")
enrichGraphFilename <- required_arg(args, "enrichGraph")
enrichGraphFilename_svg <- required_arg(args, "enrichGraph_svg")
locProbCutoffGraphFilename <- required_arg(args, "locProbCutoffGraph")
locProbCutoffGraphFilename_svg <- required_arg(args, "locProbCutoffGraph_svg")
filteredFilename <- required_arg(args, "filtered_data")
quantFilename <- required_arg(args, "quant_data")
localProbCutoff <- required_arg(args, "localProbCutoff")

# Fail before doing any work rather than after all the intermediate files
# have been written.
enriched <- required_arg(args, "enriched")
if (!(enriched %in% c("Y", "ST"))) {
  stop("Error in enriched variable. Set to either 'Y' or 'ST'", call. = FALSE)
}

# 'average' is the documented name, but R's function is `mean`; any other
# value is looked up as a function name, as `aggregate()` itself would do.
collapse_FUN <- required_arg(args, "collapse_func")
collapse_fn <- switch(
  collapse_FUN,
  sum = sum,
  average = mean,
  match.fun(collapse_FUN)
)

intervalCol <- as.integer(required_arg(args, "intervalCol"))
if (is.na(intervalCol) || intervalCol < 1L) {
  stop("intervalCol must be a positive integer", call. = FALSE)
}

phosphoColPattern <- pattern_from_file(
  args[["phosphoCol"]],
  default = "^Number of Phospho [(][STY][STY]*[)]$"
)
startColPattern <- pattern_from_file(
  args[["startCol"]],
  default = "^Intensity[^_]"
)

firstLine <- readFirstLine(inputFilename)
columnHeaders <- unlist(strsplit(x = firstLine, split = "\t", fixed = TRUE))

intensityHeaderCols <-
  grep(pattern = startColPattern, x = columnHeaders, perl = TRUE)
if (length(intensityHeaderCols) == 0) {
  stop(
    paste("Found no intensity columns matching pattern:", startColPattern),
    call. = FALSE
  )
}

phosphoCol <-
  grep(pattern = phosphoColPattern, x = columnHeaders, perl = TRUE)[1]
if (is.na(phosphoCol)) {
  stop(
    paste(
      "Found no 'number of phospho sites' columns matching pattern:",
      phosphoColPattern
    ),
    call. = FALSE
  )
}

# Take every `intervalCol`-th matching header, starting from the first, for
# as long as those headers sit exactly `intervalCol` columns apart in the
# input; stop at the first gap or at the end of the matches.
candidate_cols <- intensityHeaderCols[
  seq(from = 1L, to = length(intensityHeaderCols), by = intervalCol)
]
expected_cols <-
  candidate_cols[1] + intervalCol * (seq_along(candidate_cols) - 1L)
evenly_spaced <- candidate_cols == expected_cols
numSamples <- if (all(evenly_spaced)) {
  length(candidate_cols)
} else {
  which(!evenly_spaced)[1] - 1L
}
intensityCols <- candidate_cols[seq_len(numSamples)]
startCol <- intensityCols[1]

### EXTRACT PARAMETERS from arguments end ------------------------------------


# Proteomics Quality Control for MaxQuant Results
#  (Bielow C et al. J Proteome Res. 2016 PMID: 26653327)
# is run by the Galaxy MaxQuant wrapper and need not be invoked here.


# Read data, filtering out contaminants, reverse sequences, and localization
# probability
# ---
fullData <-
  read.table(file = inputFilename, sep = "\t", header = TRUE, quote = "")

# Filter out contaminant rows and reverse rows.
# REV__ is looked for in `Protein` since REV__ rows are blank in the first
# column (Proteins).
is_contaminant <-
  grepl("CON__|_MYCOPLASMA|CONTAMINANT_", fullData[["Proteins"]])
is_reverse <- grepl("REV__", fullData[["Protein"]])
filteredData <- fullData[!(is_contaminant | is_reverse), ]
write.table(
  filteredData,
  file = filteredFilename,
  sep = "\t",
  quote = FALSE,
  col.names = TRUE,
  row.names = FALSE
)
# ...


# Filter out data with localization probability below localProbCutoff
# ---
locProbFilteredData <-
  filteredData[filteredData$Localization.prob >= localProbCutoff, ]
# ...


# Localization probability -- visualize locprob cutoff
# ---
n_filtered <- nrow(filteredData)
n_localized <- nrow(locProbFilteredData)
locProbGraphData <- data.frame(
  group = c(
    paste0(">", toString(localProbCutoff)),
    paste0("<", toString(localProbCutoff))
  ),
  value = c(
    n_localized / n_filtered * 100,
    (n_filtered - n_localized) / n_filtered * 100
  )
)
locProbPlot <-
  ggplot(locProbGraphData, aes(x = "", y = value, fill = group)) +
  geom_bar(width = 0.5, stat = "identity", color = "black") +
  labs(
    x = NULL,
    y = "percent",
    title = "Phosphopeptides partitioned by localization-probability cutoff"
  ) +
  scale_fill_discrete(name = "phosphopeptide\nlocalization-\nprobability") +
  theme_minimal() +
  theme(
    legend.position = "right",
    legend.title = element_text(),
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    plot.title.position = "plot"
  )
save_plot(
  locProbPlot,
  locProbCutoffGraphFilename,
  locProbCutoffGraphFilename_svg
)
# ...


# Extract quantitative values from filtered data
# ---
# drop = FALSE keeps a data.frame (and its column name) for a single sample
quantData <- locProbFilteredData[, intensityCols, drop = FALSE]
# ...


# Generate Phosphopeptide Sequence
#   for latest version of MaxQuant (Version 1.5.3.30)
# ---
# The first eight columns plus the eight columns starting at phosphoCol
dataTable <- data.frame(
  locProbFilteredData[, 1:8],
  locProbFilteredData[, phosphoCol + 0:7],
  quantData
)
colnames(dataTable) <- c(
  "Proteins",
  "Positions within proteins",
  "Leading proteins",
  "Protein",
  "Protein names",
  "Gene names",
  "Fasta headers",
  "Localization prob",
  "Number of Phospho (STY)",
  "Amino Acid",
  "Sequence window",
  "Modification window",
  "Peptide window coverage",
  "Phospho (STY) Probabilities",
  "Phospho (STY) Score diffs",
  "Position in peptide",
  colnames(quantData)
)
# 'phosphopeptide_func' generates a phosphopeptide sequence for each row of
# data.  For the 'apply' function: MARGIN 1 == rows, 2 == columns,
# c(1,2) = both
dataTable$Phosphopeptide <-
  apply(X = dataTable, MARGIN = 1, FUN = phosphopeptide_func)
# Move the quant data columns to the right end of the data.frame
dataTable <- movetolast(dataTable, c(colnames(quantData)))
# ...


# Write quantitative values for debugging purposes
# ---
quantWrite <- cbind(dataTable[, "Sequence window"], quantData)
colnames(quantWrite)[1] <- "Sequence.Window"
write.table(
  quantWrite,
  file = quantFilename,
  sep = "\t",
  quote = FALSE,
  col.names = TRUE,
  row.names = FALSE
)
# ...


# Add Phosphopeptide column to quant columns for quality control checking
# ---
# dataTable and quantData both derive from locProbFilteredData, row for row
# and in the same order, so the columns can be bound directly.
quantData_qc <- data.frame(
  Phosphopeptide = dataTable$Phosphopeptide,
  quantData
)
# ...


# Collapse multiphosphorylated peptides
# ---
quantData_qc_collapsed <-
  aggregate(. ~ Phosphopeptide, quantData_qc, FUN = collapse_fn)
# ...


# Compute (as string) % of phosphopeptides that are multiphosphorylated
# (for use in next step)
# ---
pct_multiphos <-
  (nrow(quantData_qc) - nrow(quantData_qc_collapsed)) /
  (2 * nrow(quantData_qc))
pct_multiphos <- sprintf("%0.1f%s", 100 * pct_multiphos, "%")
# ...


# Compute and visualize breakdown of pY, pS, and pT before enrichment filter
# ---
collapsed_peptides <- quantData_qc_collapsed$Phosphopeptide
has_pY <- str_detect(collapsed_peptides, "pY")
has_pS <- str_detect(collapsed_peptides, "pS")
has_pT <- str_detect(collapsed_peptides, "pT")

# Visualize enrichment
enrichGraphData <- data.frame(
  group = c("pY", "pS", "pT"),
  value = c(sum(has_pY), sum(has_pS), sum(has_pT))
)
enrichGraphData <- enrichGraphData[enrichGraphData$value > 0, ]

# Plot pie chart with legend
# start: https://stackoverflow.com/a/62522478/15509512
# refine: https://www.statology.org/ggplot-pie-chart/
# colors: https://colorbrewer2.org/#type=diverging&scheme=BrBG&n=8
slices <- enrichGraphData$value
phosphoresidue <- enrichGraphData$group
pct <- round(100 * slices / sum(slices))
lbls <- paste0(enrichGraphData$group, "\n", pct, "%\n(", slices, ")")
# Each label sits at the middle of its slice
slc_ctr <- cumsum(pct) - pct / 2
pie_data <- data.frame(
  slices,
  pct,
  lbls,
  lbl_y = 100 - slc_ctr,
  phosphoresidue = factor(phosphoresidue, levels = phosphoresidue)
)
enrichPlot <-
  ggplot(pie_data, aes(x = 1, y = pct, fill = phosphoresidue)) +
  geom_col(position = "stack", orientation = "x") +
  geom_text(aes(x = 1, y = lbl_y, label = lbls), col = "black") +
  coord_polar(theta = "y", direction = -1) +
  labs(
    x = NULL,
    y = NULL,
    fill = NULL,
    title = "Percentages (and counts) of phosphosites, by type of residue",
    caption = sprintf(
      "Roughly %s of peptides have multiple phosphosites.",
      pct_multiphos
    )
  ) +
  theme_classic() +
  theme(
    legend.position = "right",
    axis.line = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    plot.caption = element_text(hjust = 0.5),
    plot.title.position = "plot"
  ) +
  scale_fill_manual(
    breaks = phosphoresidue,
    values = c("#c7eae5", "#f6e8c3", "#dfc27d")
  )
save_plot(enrichPlot, enrichGraphFilename, enrichGraphFilename_svg)
# ...


# Filter phosphopeptides by enrichment
# --
# `enriched` was validated above, so one of the two branches always applies
keep_enriched <- switch(
  enriched,
  Y = has_pY,
  ST = has_pS | has_pT
)
quantData_qc_enrichment <- quantData_qc_collapsed[keep_enriched, ]
# ...


# Write phosphopeptides filtered by enrichment
# --
write.table(
  quantData_qc_enrichment,
  file = outputfilename,
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)
# ...
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PhosphoPeptide_Upstream_Kinase_Mapping.pl Mon Mar 07 19:05:01 2022 +0000 @@ -0,0 +1,2124 @@ +#!/usr/local/bin/perl +############################################################################################################################### +# perl Kinase_enrichment_analysis_complete_v0.pl +# +# Nick Graham, USC +# 2016-02-27 +# +# Built from scripts written by NG at UCLA in Tom Graeber's lab: +# CombinePhosphoSites.pl +# Retrieve_p_motifs.pl +# NetworKIN_Motif_Finder_v7.pl +# +# Given a list of phospho-peptides, find protein information and upstream kinases. +# Output file can be used for KS enrichment score calculations using Enrichment_Score4Directory.pl +# +# Updated 2022-01-13, Art Eschenlauer, UMN on behalf of Justin Drake's lab: +# Added warnings and used strict; +# fixed some code paths resulting in more NetworKIN matches; +# applied Aho-Corasick algorithm (via external Python script because Perl implementation was still too slow) +# to speed up "Match the non_p_peptides to the @sequences array"; +# added support for SQLite-formatted UniProtKB/Swiss-Prot data as an alternative to FASTA-formatted data; +# added support for SQLite output in addition to tabular files. 
+# +# +############################################################################################################################### + +use strict; +use warnings; + +use Getopt::Std; +use DBD::SQLite::Constants qw/:file_open/; +use DBI qw(:sql_types); +use File::Copy; +use File::Basename; +use POSIX qw(strftime); +use Time::HiRes qw(gettimeofday); +#use Data::Dump qw(dump); + +my $USE_SEARCH_PPEP_PY = 1; + +my $dirname = dirname(__FILE__); +my %opts; +my ($file_in, $average_or_sum, $db_out, $file_out, $file_melt, $phospho_type); +my $dbtype; +my ($fasta_in, $networkin_in, $motifs_in, $PSP_Kinase_Substrate_in, $PSP_Regulatory_Sites_in); +my (@samples, %sample_id_lut, %ppep_id_lut, %data, @tmp_data, %n); +my $line = 0; +my @failed_match = ("Failed match"); +my @failed_matches; +my (%all_data); +my (@p_peptides, @non_p_peptides); +my @parsed_fasta; +my (@accessions, @names, @sequences, @databases, $database); +my ($dbfile, $dbh, $stmth); +my @col_names; +my (%matched_sequences, %accessions, %names, %sites, ); +my (@tmp_matches, @tmp_accessions, @tmp_names, @tmp_sites); +my (%p_residues, @tmp_p_residues, @p_sites, $left, $right, %p_motifs, @tmp_motifs_array, $tmp_motif, $tmp_site, %residues); +my (@kinases_observed, $kinases); +my (@kinases_observed_lbl, @phosphosites_observed_lbl); +my ($p_sequence_kinase, $p_sequence, $kinase); +my (@motif_sequence, %motif_type, %motif_count); +my (@kinases_PhosphoSite, $kinases_PhosphoSite); +my ($p_sequence_kinase_PhosphoSite, $p_sequence_PhosphoSite, $kinase_PhosphoSite); +my (%regulatory_sites_PhosphoSite_hash); +#ACE my %psp_regsite_protein; +my (%domain, %ON_FUNCTION, %ON_PROCESS, %ON_PROT_INTERACT, %ON_OTHER_INTERACT, %notes, %organism); +my (%unique_motifs); +my ($kinase_substrate_NetworKIN_matches, $kinase_motif_matches, $kinase_substrate_PhosphoSite_matches); +my %psp_regsite_protein_2; +my (%domain_2, %ON_FUNCTION_2, %ON_PROCESS_2, %ON_PROT_INTERACT_2, %N_PROT_INTERACT, %ON_OTHER_INTERACT_2, %notes_2, %organism_2); +my 
@timeData; +my $PhosphoSitePlusCitation; +my %site_description; + +my %kinase_substrate_NetworKIN_matches; +my %kinase_motif_matches; +my $regulatory_sites_PhosphoSite; +my ($seq_plus5aa, $seq_plus7aa, %seq_plus7aa_2); +my %kinase_substrate_PhosphoSite_matches; +my @formatted_sequence; +my $pSTY_sequence; +my $i; +my @a; +my $use_sqlite; +my $verbose; + +########## +## opts ## +########## + ## input files + # i : path to input file, e.g., 'outputfile_STEP2.txt' + # f : path to UniProtKB/SwissProt FASTA + # s : optional species argument + # n : path to NetworKIN_201612_cutoffscore2.0.txt + # m : path to pSTY_Motifs.txt + # p : path to 2017-03_PSP_Kinase_Substrate_Dataset.txt + # r : path to 2017-03_PSP_Regulatory_sites.txt + ## options + # P : phospho_type + # F : function + # v : verbose output + ## output files + # o : path to output file + # O : path to "melted" output file + # D : path to output SQLite file + +sub usage() + { + print STDERR <<"EOH"; + This program given a list of phospho-peptides, finds protein information and upstream kinases. 
+ usage: $0 [-hvd] -f FASTA_file + -h : this (help) message + -v : slightly verbose + -a : use SQLite less + ## input files + -i : path to input file, e.g., 'outputfile_STEP2.txt' + -f : path to UniProtDB/SwissProt FASTA + -s : optional species filter argument for PSP records; defaults to 'human' + -n : path to NetworKIN_201612_cutoffscore2.0.txt + -m : path to pSTY_Motifs.txt + -p : path to 2017-03_PSP_Kinase_Substrate_Dataset.txt + -r : path to 2017-03_PSP_Regulatory_sites.txt + ## options + -P : phospho_type + -F : function + ## output files + -o : path to output file + -O : path to "melted" output file + -D : path to output SQLite file + example: $0 +EOH + exit; + } + +sub format_localtime_iso8601 { + # ref: https://perldoc.perl.org/Time::HiRes + my ($seconds, $microseconds) = gettimeofday; + # ref: https://pubs.opengroup.org/onlinepubs/9699919799/functions/strftime.html + return strftime("%Y-%m-%dT%H:%M:%S",localtime(time)) . sprintf(".%03d", $microseconds/1000); +} + +sub replace_pSpTpY { + my ($formatted_sequence, $phospho_type) = @_; + if ($phospho_type eq 'y') { + $formatted_sequence =~ s/pS/S/g; + $formatted_sequence =~ s/pT/T/g; + $formatted_sequence =~ s/pY/y/g; + } + elsif ($phospho_type eq "sty") { + $formatted_sequence =~ s/pS/s/g; + $formatted_sequence =~ s/pT/t/g; + $formatted_sequence =~ s/pY/y/g; + } + $formatted_sequence; +} + +sub pseudo_sed() +{ + # Comments give the sed equivalent + my $s; + # / GN=/!{ s:\(OX=[^ \t]*\):\1 GN=N/A:; }; + unless (m / GN=/s) + { + $s = s :(OX=[^ \t]*):${1} GN=N/A:s; + } + # / PE=/!{ s:\(GN=[^ \t]*\):\1 PE=N/A:; }; + unless (m / PE=/s) + { + $s = s :(GN=[^ \t]*):${1} PE=N/A:s; + } + # / SV=/!{ s:\(PE=[^ \t]*\):\1 SV=N/A:; }; + unless (m / SV=/s) + { + $s = s :(PE=[^ \t]*):${1} SV=N/A:s; + } + # s/^sp.//; + $s = s /^sp.//s; + # s/[|]/\t/g; + $s = s /[|]/\t/sg; + # s/ OS=/\t/; + $s = s / OS=/\t/s; + # s/ OX=/\t/; + $s = s / OX=/\t/s; + # s/ GN=/\t/; + $s = s / GN=/\t/s; + # s/ PE=/\t/; + $s = s / PE=/\t/s; + # s/ 
SV=/\t/; + $s = s / SV=/\t/s; +} # sub pseudo_sed + +getopts('i:f:s:n:m:p:r:P:F:o:O:D:hva', \%opts) ; + +#ACE print %opts; #ACE +#ACE print "\n"; #ACE + +if (exists($opts{'h'})) { + usage(); +} +if (exists($opts{'a'})) { + $USE_SEARCH_PPEP_PY = 0; +} +if (exists($opts{'v'})) { + $verbose = 1; +} else { + $verbose = 0; +} +if (!exists($opts{'i'}) || !-e $opts{'i'}) { + die('Input File not found'); +} else { + $file_in = $opts{'i'}; +} +if (!exists($opts{'f'}) || !-e $opts{'f'}) { + die('FASTA not found'); +} else { + $fasta_in = $opts{'f'}; + $use_sqlite = 0; +} +#ACE if (exists($opts{'s'}) && -e $opts{'s'}) { +#ACE $use_sqlite = 1; +#ACE $dbfile = $opts{'s'}; +#ACE } elsif (!exists($opts{'f'}) || !-e $opts{'f'}) { +#ACE die('Neither input FASTA file nor input SQLite file was found'); +#ACE } else { +#ACE $use_sqlite = 0; +#ACE $fasta_in = $opts{'f'}; +#ACE } +my $species; +if ((!exists($opts{'s'})) || ($opts{'s'} eq '')) { + $species = 'human'; +} else { + $species = $opts{'s'}; + print "'-s' option is '$species'\n"; +} +print "species filter is '$species'\n"; + +if (!exists($opts{'n'}) || !-e $opts{'n'}) { + die('Input NetworKIN File not found'); +} else { + $networkin_in = $opts{'n'}; +} +if (!exists($opts{'m'}) || !-e $opts{'m'}) { + die('Input pSTY_Motifs File not found'); +} else { + $motifs_in = $opts{'m'}; +} +if (!exists($opts{'p'}) || !-e $opts{'p'}) { + die('Input PSP_Kinase_Substrate_Dataset File not found'); +} else { + $PSP_Kinase_Substrate_in = $opts{'p'}; +} +if (!exists($opts{'r'}) || !-e $opts{'r'}) { + die('Input PSP_Regulatory_sites File not found'); +} else { + $PSP_Regulatory_Sites_in = $opts{'r'}; +} +if (exists($opts{'P'})) { + $phospho_type = $opts{'P'}; +} +else { + $phospho_type = "sty"; +} +if (exists($opts{'F'})) { + $average_or_sum = $opts{'F'}; +} +else { + $average_or_sum = "sum"; +} +if (exists($opts{'D'})) { + $db_out = $opts{'D'}; +} +else { + $db_out = "db_out.sqlite"; +} +if (exists($opts{'O'})) { + $file_melt = $opts{'O'}; +} 
+else { + $file_melt = "output_melt.tsv"; +} +if (exists($opts{'o'})) { + $file_out = $opts{'o'}; +} +else { + $file_out = "output.tsv"; +} + + +############################################################################################################################### +# Print the relevant file names to the screen +############################################################################################################################### +# print "\nData file: $data_in\nFASTA file: $fasta_in\nSpecies: $species\nOutput file: $motifs_out\n\n"; +print "\n--- parameters:\n"; +print "Data file: $file_in\nAverage or sum identical p-sites? $average_or_sum\nOutput file: $file_out\nMelted map: $file_melt\n"; +if ($use_sqlite == 0) { + print "Motifs file: $motifs_in\nNetworKIN file: networkin_in\nPhosphosite kinase substrate data: $PSP_Kinase_Substrate_in\nPhosphosite regulatory site data: $PSP_Regulatory_Sites_in\nUniProtKB/SwissProt FASTA file: $fasta_in\nOutput SQLite file: $db_out\n"; +} else { + print "Motifs file: $motifs_in\nNetworKIN file: networkin_in\nPhosphosite kinase substrate data: $PSP_Kinase_Substrate_in\nPhosphosite regulatory site data: $PSP_Regulatory_Sites_in\nUniProtKB/SwissProt SQLIte file: $dbfile\nOutput SQLite file: $db_out\n"; +} +print "...\n\n"; + +print "Phospho-residues(s) = $phospho_type\n\n"; +if ($phospho_type ne 'y') { + if ($phospho_type ne 'sty') { + die "\nUsage error:\nYou must choose a phospho-type, either y or sty\n\n"; + } +} + +############################################################################################################################### +# read the input data file +# average or sum identical phospho-sites, depending on the value of $average_or_sum +############################################################################################################################### + +open (IN, "$file_in") or die "I couldn't find the input file: $file_in\n"; + +die "\n\nScript died: You must choose either average or 
sum for \$average_or_sum\n\n" if (($average_or_sum ne "sum") && ($average_or_sum ne "average")) ; + + +$line = 0; + +while (<IN>) { + chomp; + my @x = split(/\t/); + for my $n (0 .. $#x) {$x[$n] =~ s/\r//g; $x[$n] =~ s/\n//g; $x[$n] =~ s/\"//g;} + + # Read in the samples + if ($line == 0) { + for my $n (1 .. $#x) { + push (@samples, $x[$n]); + $sample_id_lut{$x[$n]} = $n; + } + $line++; + } else { + # check whether we have already seen a phospho-peptide + if (exists($data{$x[0]})) { + if ($average_or_sum eq "sum") { # add the data + # unload the data + @tmp_data = (); foreach (@{$data{$x[0]}}) { push(@tmp_data, $_); } + # add the new data and repack + for my $k (0 .. $#tmp_data) { $tmp_data[$k] = $tmp_data[$k] + $x[$k+1]; } + $all_data{$x[0]} = (); for my $k (0 .. $#tmp_data) { push(@{$all_data{$x[0]}}, $tmp_data[$k]); } + + } elsif ($average_or_sum eq "average") { # average the data + # unload the data + @tmp_data = (); foreach (@{$all_data{$x[0]}}) { push(@tmp_data, $_); } + # average with the new data and repack + for my $k (0 .. $#tmp_data) { $tmp_data[$k] = ( $tmp_data[$k]*$n{$x[0]} + $x[0] ) / ($n{$x[0]} + 1); } + $n{$x[0]}++; + $data{$x[0]} = (); for my $k (0 .. $#tmp_data) { push(@{$data{$x[0]}}, $tmp_data[$k]); } + } + } + # if the phospho-sequence has not been seen, save the data + else { + for my $k (1 .. 
$#x) { push(@{$data{$x[0]}}, $x[$k]); } + $n{$x[0]} = 1; + } + } +} +close(IN); + + +############################################################################################################################### +# Search the FASTA database for phospho-sites and motifs +# +# based on Retrieve_p_peptide_motifs_v2.pl +############################################################################################################################### + + +############################################################################################################################### +# +# Read in the Data file: +# 1) make @p_peptides array as in the original file +# 2) make @non_p_peptides array w/o residue modifications (p, #, other) +# +############################################################################################################################### + +foreach my $peptide (keys %data) { + $peptide =~ s/s/pS/g; $peptide =~ s/t/pT/g; $peptide =~ s/y/pY/g; + push (@p_peptides, $peptide); + $peptide =~ s/p//g; + push(@non_p_peptides, $peptide); +} + +if ($use_sqlite == 0) { + ############################################################################################################################### + # + # Read in the UniProtKB/Swiss-Prot data from FASTA; save to @sequences array and SQLite output database + # + ############################################################################################################################### + + # e.g. 
+ # >sp|Q9Y3B9|RRP15_HUMAN RRP15-like protein OS=Homo sapiens OX=9606 GN=RRP15 PE=1 SV=2 + # MAAAAPDSRVSEEENLKKTPKKKMKMVTGAVASVLEDEATDTSDSEGSCGSEKDHFYSDD + # DAIEADSEGDAEPCDKENENDGESSVGTNMGWADAMAKVLNKKTPESKPTILVKNKKLEK + # EKEKLKQERLEKIKQRDKRLEWEMMCRVKPDVVQDKETERNLQRIATRGVVQLFNAVQKH + # QKNVDEKVKEAGSSMRKRAKLISTVSKKDFISVLRGMDGSTNETASSRKKPKAKQTEVKS + # EEGPGWTILRDDFMMGASMKDWDKESDGPDDSRPESASDSDT + # accession: Q9Y3B9 + # name: RRP15_HUMAN RRP15-like protein OS=Homo sapiens OX=9606 GN=RRP15 PE=1 SV=2 + # sequence: MAAAAPDSRVSEEENLKKTPKKKMKMVTGAVASVLEDEATDTSDSEGSCGSEKDHFYSDD DAIEADSEGDAEPCDKENENDGESSVGTNMGWADAMAKVLNKKTPESKPTILVKNKKLEK EKEKLKQERLEKIKQRDKRLEWEMMCRVKPDVVQDKETERNLQRIATRGVVQLFNAVQKH QKNVDEKVKEAGSSMRKRAKLISTVSKKDFISVLRGMDGSTNETASSRKKPKAKQTEVKS EEGPGWTILRDDFMMGASMKDWDKESDGPDDSRPESASDSDT + + open (IN1, "$fasta_in") or die "I couldn't find $fasta_in\n"; + print "Reading FASTA file $fasta_in\n"; + # ref: https://perldoc.perl.org/perlsyn#Compound-Statements + # "If the condition expression of a while statement is based on any of + # a group of iterative expression types then it gets some magic treatment. + # The affected iterative expression types are readline, the <FILEHANDLE> + # input operator, readdir, glob, the <PATTERN> globbing operator, and + # `each`. If the condition expression is one of these expression types, + # then the value yielded by the iterative operator will be implicitly + # assigned to `$_`." + while (<IN1>) { + chomp; + # ref: https://perldoc.perl.org/functions/split#split-/PATTERN/,EXPR + # "If only PATTERN is given, EXPR defaults to $_." + my (@x) = split(/\|/); + for my $i (0 .. 
$#x) { + $x[$i] =~ s/\r//g; $x[$i] =~ s/\n//g; $x[$i] =~ s/\"//g; } + if ($x[0] =~ /^>/) { + $x[0] =~ s/\>//g; + push (@databases, $x[0]); + push (@accessions, $x[1]); + push (@names, $x[2]); + #ACE print "names $x[2]\n"; + #ACE print "--- $_\n"; + pseudo_sed(); + s/$/\t/; + push (@parsed_fasta, $_); + } elsif ($x[0] =~ /^\w/) { + if (defined $sequences[$#accessions]) { + $sequences[$#accessions] = $sequences[$#accessions].$x[0]; + } else { + $sequences[$#accessions] = $x[0]; + } + $parsed_fasta[$#accessions] = $parsed_fasta[$#accessions].$x[0]; + } + #ACE print "... '$parsed_fasta[$#accessions]'\n"; + } + close IN1; + print "Done Reading FASTA file $fasta_in\n"; + $dbfile = $db_out; + print "Begin writing $dbfile at " . format_localtime_iso8601() . "\n"; + $dbh = DBI->connect("dbi:SQLite:$dbfile", undef, undef); + my $auto_commit = $dbh->{AutoCommit}; + print "auto_commit was $auto_commit and is now 0\n" if ($verbose); + $dbh->{AutoCommit} = 0; + + # begin DDL-to-SQLite + # --- + $stmth = $dbh->prepare(" + DROP TABLE IF EXISTS UniProtKB; + "); + $stmth->execute(); + + $stmth = $dbh->prepare(" + CREATE TABLE UniProtKB ( + Uniprot_ID TEXT PRIMARY KEY ON CONFLICT IGNORE, + Description TEXT, + Organism_Name TEXT, + Organism_ID INTEGER, + Gene_Name TEXT, + PE TEXT, + SV TEXT, + Sequence TEXT, + Database TEXT + ) + "); + $stmth->execute(); + $stmth = $dbh->prepare(" + CREATE UNIQUE INDEX idx_uniq_UniProtKB_0 on UniProtKB(Uniprot_ID); + "); + $stmth->execute(); + $stmth = $dbh->prepare(" + CREATE INDEX idx_UniProtKB_0 on UniProtKB(Gene_Name); + "); + $stmth->execute(); + # ... + # end DDL-to-SQLite + + # insert all rows + # begin store-to-SQLite "UniProtKB" table + # --- + $stmth = $dbh->prepare(" + INSERT INTO UniProtKB ( + Uniprot_ID, + Description, + Organism_Name, + Organism_ID, + Gene_Name, + PE, + SV, + Sequence, + Database + ) VALUES (?,?,?,?,?,?,?,?,?) 
+ "); + my $row_count = 1; + my $row_string; + my (@row, @rows); + my $wrd; + while ( scalar @parsed_fasta > 0 ) { + $database = $databases[$#parsed_fasta]; + #### print "parsed_fasta[-1]: " . $parsed_fasta[$#parsed_fasta] . "\n"; + $row_string = pop(@parsed_fasta); + #### print "row_string: $row_string\n"; + @row = (split /\t/, $row_string); + for $i (1..3,5..8) { + $stmth->bind_param($i, $row[$i]); + } + $stmth->bind_param(9, $database); + $stmth->bind_param(4, $row[4], { TYPE => SQL_INTEGER }); + if (not $stmth->execute()) { + print "Error in row $row_count: $stmth->errstr\n"; + } + $row_count += 1; + } + # ... + # end store-to-SQLite "UniProtKB" table + + print "begin commit at " . format_localtime_iso8601() . "\n"; + $dbh->{AutoCommit} = $auto_commit; + print "auto_commit is now $auto_commit\n" if ($verbose); + $dbh->disconnect if ( defined $dbh ); + print "Finished writing $dbfile at " . format_localtime_iso8601() . "\n\n"; + $dbtype = "FASTA"; +} + +if ($use_sqlite == 1) { + ############################################################################################################################### + # + # Read in the UniProtKB/Swiss-Prot data from SQLite; save to @sequences array + # + ############################################################################################################################### + + copy($dbfile, $db_out) or die "Copy $dbfile to $db_out failed: $!"; + + # https://metacpan.org/pod/DBD::SQLite#Read-Only-Database + $dbh = DBI->connect("dbi:SQLite:$dbfile", undef, undef, { + sqlite_open_flags => SQLITE_OPEN_READONLY, + }); + print "DB connection $dbh is to $dbfile\n"; + + # Uniprot_ID, Description, Organism_Name, Organism_ID, Gene_Name, PE, SV, Sequence + $stmth = $dbh->prepare(" + SELECT Uniprot_ID + , Description || ' OS=' || Organism_Name || ' OX=' || Organism_ID + || CASE WHEN Gene_Name = 'N/A' THEN '' ELSE ' GN='|| Gene_Name END + || CASE WHEN PE = 'N/A' THEN '' ELSE ' PE='|| PE END + || CASE WHEN SV = 'N/A' THEN '' ELSE 
' SV='|| SV END
               AS Description
             , Sequence
             , Database
        FROM
          UniProtKB
    ");
    $stmth->execute();
    @col_names = @{$stmth->{NAME}};
    print "\nColumn names selected from UniProtKB SQLite table: " . join(", ", @col_names) . "\n\n" if ($verbose);
    # Each fetched row is (Uniprot_ID, reassembled description, Sequence, Database);
    # the four parallel arrays are kept index-aligned.
    while (my @row = $stmth->fetchrow_array) {
        push (@names,      $row[1]); # redacted Description
        push (@accessions, $row[0]); # Uniprot_ID
        $sequences[$#accessions] = $row[2]; # Sequence
        push (@databases,  $row[3]); # Database (should be 'sp')
    }

    $dbh->disconnect if ( defined $dbh );

    print "Done Reading UniProtKB/Swiss-Prot file $dbfile\n\n";
    $dbtype = "SQLite";
}

# scalar(@accessions) is the number of accessions; $#accessions is only the last
# index (one less, and -1 when nothing was read).
print scalar(@accessions) . " accessions were read from the UniProtKB/Swiss-Prot $dbtype file\n";

@timeData = localtime(time);
print "\n--- Start search at " . format_localtime_iso8601() ."\n";

print "    --> Calling 'search_ppep' script\n\n";
if ($verbose) {
    $i = system("\$CONDA_PREFIX/bin/python $dirname/search_ppep.py -u $db_out -p $file_in --verbose");
} else {
    $i = system("\$CONDA_PREFIX/bin/python $dirname/search_ppep.py -u $db_out -p $file_in");
}
if ($i) {
    # system() returns the raw wait status, not the exit code: -1 means the command
    # could not be started, the low 7 bits hold a terminating signal, and the exit
    # code is in the high byte.
    my $outcome;
    if ($i == -1) {
        $outcome = "could not be started: $!";
    } elsif ($i & 127) {
        $outcome = "died from signal " . ($i & 127);
    } else {
        $outcome = "exited with exit code " . ($i >> 8);
    }
    print "python $dirname/search_ppep.py -u $db_out -p $file_in\n    $outcome\n";
    die "Search failed for phosphopeptides in SwissProt/SQLite file.";
}
print "    <-- Returned from 'search_ppep' script\n";

@timeData = localtime(time);
print "... Finished search at " .
format_localtime_iso8601() ."\n\n";


###############################################################################################################################
#
# Match the non_p_peptides to the @sequences array:
#    1) Format the motifs +/- 10 residues around the phospho-site
#    2) Print the original data plus the phospho-motif to the output file
#
###############################################################################################################################

#ACE print OUT "$headers\tFormatted Motifs\tUnique Motifs\tPhospho-site(s)\tAccessions(s)\tName(s)\n";

print "--- Match the non_p_peptides to the \@sequences array:\n";

if ($USE_SEARCH_PPEP_PY) {
    print "Find the matching protein sequence(s) for the peptide using SQLite\n";
} else {
    print "Find the matching protein sequence(s) for the peptide using slow search\n";
}

# https://metacpan.org/pod/DBD::SQLite#Read-Only-Database
# connect() returns undef on failure; stop here with the driver's message rather
# than failing later on a method call against an undefined handle.
$dbh = DBI->connect("dbi:SQLite:$db_out", undef, undef, {
  sqlite_open_flags => SQLITE_OPEN_READONLY,
}) or die "Could not open $db_out read-only: $DBI::errstr\n";
print "DB connection $dbh is to $db_out\n";

# CREATE VIEW uniprotid_pep_ppep AS
#   SELECT   deppep_UniProtKB.UniprotKB_ID       AS accession
#          , deppep.seq                          AS peptide
#          , ppep.seq                            AS phosphopeptide
#          , UniProtKB.Sequence                  AS sequence
#          , UniProtKB.Description               AS description
#   FROM     ppep, deppep, deppep_UniProtKB, UniProtKB
#   WHERE    deppep.id = ppep.deppep_id
#   AND      deppep.id = deppep_UniProtKB.deppep_id
#   AND      deppep_UniProtKB.UniprotKB_ID = UniProtKB.Uniprot_ID
#   ORDER BY UniprotKB_ID, deppep.seq, ppep.seq;

# phosphopeptide -> number of rows the view holds for it
my %ppep_to_count_lut;
print "start select peptide counts " . format_localtime_iso8601() . "\n";
$stmth = $dbh->prepare("
  SELECT DISTINCT
    phosphopeptide
  , count(*) as i
  FROM
    uniprotkb_pep_ppep_view
  GROUP BY
    phosphopeptide
  ORDER BY
    phosphopeptide
");
if (not $stmth->execute()) {
    # method calls are not interpolated inside double quotes, so concatenate errstr
    die "Error fetching peptide counts: " . $stmth->errstr . "\n";
}
while (my @row = $stmth->fetchrow_array) {
    $ppep_to_count_lut{$row[0]} = $row[1];
    #print "\$ppep_to_count_lut{$row[0]} = $ppep_to_count_lut{$row[0]}\n";
}

# Column positions in the rows returned by the "all records" query below:
# accession, peptide, sequence, description, phosphopeptide, long_description, pos_start, pos_end, scrubbed, ppep_id
# 0          1        2         3            4               5                 6          7        8         9
my $COL_ACCESSION        = 0;
my $COL_PEPTIDE          = 1;
my $COL_SEQUENCE         = 2;
my $COL_DESCRIPTION      = 3;
my $COL_PHOSPHOPEPTIDE   = 4;
my $COL_LONG_DESCRIPTION = 5;
my $COL_POS_START        = 6;
my $COL_POS_END          = 7;
my $COL_SCRUBBED         = 8;
my $COL_PPEP_ID          = 9;

my %ppep_to_row_lut;
print "start select all records without qualification " . format_localtime_iso8601() . "\n";
$stmth = $dbh->prepare("
  SELECT DISTINCT
    accession
  , peptide
  , sequence
  , description
  , phosphopeptide
  , long_description
  , pos_start
  , pos_end
  , scrubbed
  , ppep_id
  FROM
    uniprotkb_pep_ppep_view
  ORDER BY
    phosphopeptide
");
if (not $stmth->execute()) {
    die "Error fetching all records without qualification: " . $stmth->errstr . "\n";
}
# Rows arrive ordered by phosphopeptide; $counter counts down the rows still
# expected for the current phosphopeptide (taken from %ppep_to_count_lut).
my $current_ppep;
my $counter = 0;
my $former_ppep = "";
@tmp_matches    = ();
@tmp_accessions = ();
@tmp_names      = ();
@tmp_sites      = ();
while (my @row = $stmth->fetchrow_array) {
    # Identify phosphopeptide for current row;
    #   it is an error for it to change when the counter is not zero.
+ $current_ppep = $row[$COL_PHOSPHOPEPTIDE]; + + # when counter is zero, prepare for a new phosphopeptide + if (not $current_ppep eq $former_ppep) { + die "counter is $counter instead of zero" if ($counter != 0); + $ppep_id_lut{$current_ppep} = $row[$COL_PPEP_ID]; + print "next phosphpepetide: $current_ppep; id: $ppep_id_lut{$current_ppep}\n" if ($verbose); + $counter = $ppep_to_count_lut{$current_ppep}; + @tmp_matches = (); + @tmp_accessions = (); + @tmp_names = (); + @tmp_sites = (); + } + + if ($USE_SEARCH_PPEP_PY) { + push(@tmp_matches, $row[ $COL_SEQUENCE ]); + push(@tmp_accessions, $row[ $COL_ACCESSION ]); + push(@tmp_names, $row[ $COL_LONG_DESCRIPTION ]); + push(@tmp_sites, $row[ $COL_POS_START ]); + } + + # Prepare counter and phosphopeptide tracker for next row + #ACE print "counter: $counter; phosphpepetide: $current_ppep\n"; + $former_ppep = $current_ppep; + $counter -= 1; + + # Set trackers for later use after last instance of current phosphopeptide + if ($counter == 0) { + if ($USE_SEARCH_PPEP_PY) { + $matched_sequences{$current_ppep} = [ @tmp_matches ]; + $accessions{ $current_ppep} = [ @tmp_accessions ]; + $names{ $current_ppep} = [ @tmp_names ]; + $sites{ $current_ppep} = [ @tmp_sites ]; + } + } +} + + +print "end select all records without qualification " . format_localtime_iso8601() . "\n"; + +for my $j (0 .. $#p_peptides) { + + #Find the matching protein sequence(s) for the peptide using SQLite + my ($site, $sequence); + my (@row, @rows); + my $match = 0; + my $p_peptide = $p_peptides[$j]; + @tmp_matches = (); + @tmp_accessions = (); + @tmp_names = (); + @tmp_sites = (); + + #Find the matching protein sequence(s) for the peptide using slow search + $site = -1; + unless ($USE_SEARCH_PPEP_PY) { + for my $k (0 .. 
$#sequences) {
            # index() yields the 0-based offset of the unmodified peptide within
            # protein sequence $k, or -1 when the peptide does not occur in it
            $site = index($sequences[$k], $non_p_peptides[$j]);
            if ($site != -1) {
                push(@tmp_matches, $sequences[$k]);
                push(@tmp_accessions, $accessions[$k]);
                push(@tmp_names, $names[$k]);
                push(@tmp_sites, $site);
                # count actual hits only; counting every sequence scanned meant the
                # "no match" branch below could fire only for an empty database
                $match++;
            }
            # print "Non-phosphpeptide $non_p_peptides[$j] matched accession $accessions[$k] ($names[$k]) at site $site\n";
            $site = -1;
            # print "tmp_accessions @tmp_accessions \n";
        }
        if ($match == 0) {    # Check to see if no match was found.  Skip to next if no match found.
            print "Warning:  Failed match for $p_peptides[$j]\n";
            $matched_sequences{$p_peptides[$j]} = \@failed_match;
            push(@failed_matches,$p_peptides[$j]);
            next;
        } else {
            $matched_sequences{$p_peptides[$j]} = [ @tmp_matches ];
            $accessions{$p_peptides[$j]} = [ @tmp_accessions ];
            $names{$p_peptides[$j]} = [ @tmp_names ];
            $sites{$p_peptides[$j]} = [ @tmp_sites ];
        }
    }

} # end for my $j (0 .. $#p_peptides)

print "... Finished match the non_p_peptides at " . format_localtime_iso8601() ."\n\n";

print "--- Match the p_peptides to the \@sequences array:\n";

# Look-up of the peptides recorded as failed matches.  grep(EXPR, LIST) with a plain
# string as EXPR tests the truthiness of that string for every element, not membership,
# so a single failed match would have caused every peptide to be skipped.
my %is_failed_match = map { $_ => 1 } @failed_matches;

for my $peptide_to_match ( keys %matched_sequences ) {
    if (exists($is_failed_match{$peptide_to_match})) {
        print "Failed to match peptide $peptide_to_match\n";
        next;
    }
    my @matches = @{$matched_sequences{$peptide_to_match}};
    @tmp_motifs_array = ();
    for my $i (0 ..
$#matches) { + #ACE print "Matching $peptide_to_match to match $i\n"; + #ACE print "\$sites{\$peptide_to_match}[\$i] $sites{$peptide_to_match}[$i]\n"; + + # Find the location of the phospo-site in the sequence(s) + $tmp_site = 0; my $offset = 0; + my $tmp_p_peptide = $peptide_to_match; + #ACE print "peptide_to_match: $peptide_to_match at position $sites{$peptide_to_match}[$i] in sequence $matched_sequences{$peptide_to_match}[$i]\n"; + $tmp_p_peptide =~ s/#//g; $tmp_p_peptide =~ s/\d//g; $tmp_p_peptide =~ s/\_//g; $tmp_p_peptide =~ s/\.//g; + #ACE print "tmp_p_peptide: $tmp_p_peptide\n"; + + # Find all phosphorylated residues in the p_peptide + @p_sites = (); + while ($tmp_site != -1) { + $tmp_site = index($tmp_p_peptide, 'p', $offset); + if ($tmp_site != -1) {push (@p_sites, $tmp_site);} + $offset = $tmp_site + 1; + $tmp_p_peptide =~ s/p//; + } + @tmp_p_residues = (); + for my $l (0 .. $#p_sites) { + next if not defined $sites{$peptide_to_match}[$i]; + + push (@tmp_p_residues, $p_sites[$l] + $sites{$peptide_to_match}[$i]); + + # Match the sequences around the phospho residues to find the motifs + my ($desired_residues_L, $desired_residues_R); + if ($tmp_p_residues[0] - 10 < 0) { #check to see if there are fewer than 10 residues left of the first p-site + # eg, XXXpYXX want $desired_residues_L = 3, $p_residues[0] = 3 + $desired_residues_L = $tmp_p_residues[0]; + } + else { + $desired_residues_L = 10; + } + my $seq_length = length($matched_sequences{$peptide_to_match}[$i]); + if ($tmp_p_residues[$#tmp_p_residues] + 10 > $seq_length) { #check to see if there are fewer than 10 residues right of the last p-site + $desired_residues_R = $seq_length - ($tmp_p_residues[$#tmp_p_residues] + 1); + # eg, XXXpYXX want $desired_residues_R = 2, $seq_length = 6, $p_residues[$#p_residues] = 3 + # print "Line 170: seq_length = $seq_length\tp_residue = $p_residues[$#p_residues]\n"; + } + else { + $desired_residues_R = 10; + } + + my $total_length = $desired_residues_L + 
$tmp_p_residues[$#tmp_p_residues] - $tmp_p_residues[0] + $desired_residues_R + 1; + my $arg2 = $tmp_p_residues[0] - $desired_residues_L; + my $arg1 = $matched_sequences{$peptide_to_match}[$i]; + + if (length($arg1) > $arg2 + $total_length - 1) { + $tmp_motif = substr($arg1, $arg2, $total_length); + #ACE print "tmp_motif = $tmp_motif\ti = $i\tpeptide_to_match = $peptide_to_match\tmatched_sequences{peptide_to_match}[i] = $matched_sequences{$peptide_to_match}[$i]\targ2 = $arg2\targ3 = $total_length\n"; + + # Put the "p" back in front of the appropriate phospho-residue(s). + my (@tmp_residues, $tmp_position); + for my $m (0 .. $#p_sites) { + # print "Line 183: $p_sites[$m]\n"; + if ($m == 0) { + $tmp_position = $desired_residues_L; + } else { + $tmp_position = $desired_residues_L + $p_sites[$m] - $p_sites[0]; + } + #ACE print "Line 431: p_sites = $p_sites[$m]\ttmp_position = $tmp_position\ttmp_motif = $tmp_motif\n"; + if ($tmp_position < length($tmp_motif) + 1) { + push (@tmp_residues, substr($tmp_motif, $tmp_position, 1)); + if ($tmp_residues[$m] eq "S") {substr($tmp_motif, $tmp_position, 1, "s");} + if ($tmp_residues[$m] eq "T") {substr($tmp_motif, $tmp_position, 1, "t");} + if ($tmp_residues[$m] eq "Y") {substr($tmp_motif, $tmp_position, 1, "y");} + } + } + + $tmp_motif =~ s/s/pS/g; $tmp_motif =~ s/t/pT/g; $tmp_motif =~ s/y/pY/g; + + # Comment out on 8.10.13 to remove the numbers from motifs + my $left_residue = $tmp_p_residues[0] - $desired_residues_L+1; + my $right_residue = $tmp_p_residues[$#tmp_p_residues] + $desired_residues_R+1; + $tmp_motif = $left_residue."-[ ".$tmp_motif." ]-".$right_residue; + push(@tmp_motifs_array, $tmp_motif); + $residues{$peptide_to_match}{$i} = [ @tmp_residues ]; + $p_residues{$peptide_to_match}{$i} = [ @tmp_p_residues ]; + } + } + $p_motifs{$peptide_to_match} = [ @tmp_motifs_array ]; + } # end for my $i (0 .. $#matches) ### this bracket could be in the wrong place +} + +print "... 
Finished match the p_peptides to the \@sequences array at " . format_localtime_iso8601() ."\n\n"; + +############################################################################################################################### +# +# Annotate the peptides with the NetworKIN predictions and HPRD / Phosida kinase motifs +# +############################################################################################################################### + + +print "--- Reading various site data:\n"; + +############################################################################################################################### +# +# Read the NetworKIN_predictions file: +# 1) make a "kinases_observed" array +# 2) annotate the phospho-substrates with the appropriate kinase +# +############################################################################################################################### +my $SITE_KINASE_SUBSTRATE = 1; +$site_description{$SITE_KINASE_SUBSTRATE} = "NetworKIN"; + +open (IN1, "$networkin_in") or die "I couldn't find $networkin_in\n"; +print "Reading the NetworKIN data: $networkin_in\n"; +while (<IN1>) { + chomp; + my (@x) = split(/\t/); + for my $i (0 .. 
$#x) { + $x[$i] =~ s/\r//g; $x[$i] =~ s/\n//g; $x[$i] =~ s/\"//g; + } + next if ($x[0] eq "#substrate"); + if (exists ($kinases -> {$x[2]})) { + #do nothing + } + else { + $kinases -> {$x[2]} = $x[2]; + push (@kinases_observed, $x[2]); + } + my $tmp = $x[10]."_".$x[2]; #eg, REEILsEMKKV_PKCalpha + if (exists($p_sequence_kinase -> {$tmp})) { + #do nothing + } + else { + $p_sequence_kinase -> {$tmp} = $tmp; + } +} +close IN1; + +############################################################################################################################### +# +# Read the Kinase motifs file: +# 1) make a "motif_sequence" array +# +############################################################################################################################### + +# file format (tab separated): +# x[0] = primary key (character), e.g., '17' or '23a' +# x[1] = pattern (egrep pattern), e.g., '(M|I|L|V|F|Y).R..(pS|pT)' +# x[2] = description, e.g., 'PKA_Phosida' or '14-3-3 domain binding motif (HPRD)' or 'Akt kinase substrate motif (HPRD & Phosida)' + +my $SITE_MOTIF = 2; +$site_description{$SITE_MOTIF} = "motif"; + +open (IN2, "$motifs_in") or die "I couldn't find $motifs_in\n"; +print "Reading the Motifs file: $motifs_in\n"; + +while (<IN2>) { + chomp; + my (@x) = split(/\t/); + for my $i (0 .. 2) { + $x[$i] =~ s/\r//g; + $x[$i] =~ s/\n//g; + $x[$i] =~ s/\"//g; + } + if (exists ($motif_type{$x[1]})) { + $motif_type{$x[1]} = $motif_type{$x[1]}." 
& ".$x[2]; + } else { + $motif_type{$x[1]} = $x[2]; + $motif_count{$x[1]} = 0; + push (@motif_sequence, $x[1]); + } +} +close (IN2); + + +############################################################################################################################### +# 6.28.2011 +# Read PSP_Kinase_Substrate data: +# 1) make a "kinases_PhosphoSite" array +# 2) annotate the phospho-substrates with the appropriate kinase +# +# Columns: +# (0) GENE +# (1) KINASE +# (2) KIN_ACC_ID +# (3) KIN_ORGANISM +# (4) SUBSTRATE +# (5) SUB_GENE_ID +# (6) SUB_ACC_ID +# (7) SUB_GENE +# (8) SUB_ORGANISM +# (9) SUB_MOD_RSD +# (10) SITE_GRP_ID +# (11) SITE_+/-7_AA +# (12) DOMAIN +# (13) IN_VIVO_RXN +# (14) IN_VITRO_RXN +# (15) CST_CAT# +############################################################################################################################### + +my $SITE_PHOSPHOSITE = 3; +$site_description{$SITE_PHOSPHOSITE} = "PhosphoSite"; + + +$line = 0; + +open (IN3, "$PSP_Kinase_Substrate_in") or die "I couldn't find $PSP_Kinase_Substrate_in\n"; +print "Reading the PhosphoSite Kinase-Substrate data: $PSP_Kinase_Substrate_in\n"; + +while (<IN3>) { + chomp; + my (@x) = split(/\t/); + for my $i (0 .. $#x) { + $x[$i] =~ s/\r//g; $x[$i] =~ s/\n//g; $x[$i] =~ s/\"//g; + } + if ($line != 0) { + #ACE FUE if (($species eq $species) && ($species eq $species)) { + if (($species eq $x[3]) && ($species eq $x[8])) { + #ACE print "KIN_ORGANISM is '$x[3]' and SUB_ORGANISM is '$x[8]', line: $line\n"; + if (exists ($kinases_PhosphoSite -> {$x[0]})) { + #do nothing + } + else { + $kinases_PhosphoSite -> {$x[0]} = $x[0]; + push (@kinases_PhosphoSite, $x[0]); + } + my $offset = 0; + # Replace the superfluous lower case s, t and y + my @lowercase = ('s','t','y'); + my @uppercase = ('S','T','Y'); + for my $k (0 .. 
2) { + my $site = 0; + while ($site != -1) { + $site = index($x[11],$lowercase[$k], $offset); + if (($site != 7) && ($site != -1)) {substr($x[11], $site, 1, $uppercase[$k]);} + $offset = $site + 1; + } + } + my $tmp = $x[11]."_".$x[0]; #eg, RTPGRPLsSYGMDSR_PAK2 + if (exists($p_sequence_kinase_PhosphoSite -> {$tmp})) { + #do nothing + } + else { + $p_sequence_kinase_PhosphoSite -> {$tmp} = $tmp; + } + } + else { + # do nothing + #print "PSP_kinase_substrate line rejected because KIN_ORGANISM is '$x[3]' and SUB_ORGANISM is '$x[8]': $line\n"; + } + } + $line++; +} +close IN3; + + +############################################################################################################################### +# Read PhosphoSite regulatory site data: +# 1) make a "regulatory_sites_PhosphoSite" hash +# +# Columns: +# (0) GENE +# (1) PROTEIN --> #ACE %psp_regsite_protein +# (2) PROT_TYPE +# (3) ACC_ID +# (4) GENE_ID +# (5) HU_CHR_LOC +# (6) ORGANISM --> %organism +# (7) MOD_RSD +# (8) SITE_GRP_ID +# (9) SITE_+/-7_AA --> %regulatory_sites_PhosphoSite_hash +# (10) DOMAIN --> %domain +# (11) ON_FUNCTION --> %ON_FUNCTION +# (12) ON_PROCESS --> %ON_PROCESS +# (13) ON_PROT_INTERACT --> %ON_PROT_INTERACT +# (14) ON_OTHER_INTERACT --> %ON_OTHER_INTERACT +# (15) PMIDs +# (16) LT_LIT +# (17) MS_LIT +# (18) MS_CST +# (19) NOTES --> %notes +############################################################################################################################### + + +$dbh = DBI->connect("dbi:SQLite:$db_out", undef, undef); +my $auto_commit = $dbh->{AutoCommit}; +$dbh->{AutoCommit} = 0; +print "DB connection $dbh is to $db_out, opened for modification\n"; + +# add partial PSP_Regulatory_site table (if not exists) regardless of whether SwissProt input was FASTA or SQLite +$stmth = $dbh->prepare(" +CREATE TABLE IF NOT EXISTS PSP_Regulatory_site ( + SITE_PLUSMINUS_7AA TEXT PRIMARY KEY ON CONFLICT IGNORE, + DOMAIN TEXT, + ON_FUNCTION TEXT, + ON_PROCESS TEXT, + ON_PROT_INTERACT TEXT, + 
ON_OTHER_INTERACT TEXT, + NOTES TEXT, + ORGANISM TEXT, + PROTEIN TEXT +) +"); +$stmth->execute(); + +# add partial PSP_Regulatory_site LUT (if not exists) regardless of whether SwissProt input was FASTA or SQLite +$stmth = $dbh->prepare(" +CREATE TABLE IF NOT EXISTS ppep_regsite_LUT +( ppep_id INTEGER REFERENCES ppep(id) +, site_plusminus_7AA TEXT REFERENCES PSP_Regulatory_site(site_plusminus_7AA) +, PRIMARY KEY (ppep_id, site_plusminus_7AA) ON CONFLICT IGNORE +); +"); +$stmth->execute(); + +# $stmth = $dbh->prepare(" +# CREATE UNIQUE INDEX idx_PSP_Regulatory_site_0 +# ON PSP_Regulatory_site(site_plusminus_7AA); +# "); +# $stmth->execute(); + + +# add Citation table (if not exists) regardless of whether SwissProt input was FASTA or SQLite +my $citation_sql; +$citation_sql = " +CREATE TABLE IF NOT EXISTS Citation ( + ObjectName TEXT REFERENCES sqlite_schema(name) ON DELETE CASCADE, + CitationData TEXT, + PRIMARY KEY (ObjectName, CitationData) ON CONFLICT IGNORE +) +"; +$stmth = $dbh->prepare($citation_sql); +$stmth->execute(); + + +open (IN4, "$PSP_Regulatory_Sites_in") or die "I couldn't find $PSP_Regulatory_Sites_in\n"; +print "Reading the PhosphoSite regulatory site data: $PSP_Regulatory_Sites_in\n"; + +#ACE $i = system("head -n 4 $PSP_Regulatory_Sites_in"); + +$line = -1; +while (<IN4>) { + $line++; + chomp; + if ($_ =~ m/PhosphoSitePlus/) { + #$PhosphoSitePlusCitation = ($_ =~ s/PhosphoSitePlus/FooBar/g); + $PhosphoSitePlusCitation = $_; + $PhosphoSitePlusCitation =~ s/\t//g; + $PhosphoSitePlusCitation =~ s/\r//g; + $PhosphoSitePlusCitation =~ s/\n//g; + $PhosphoSitePlusCitation =~ s/""/"/g; + $PhosphoSitePlusCitation =~ s/^"//g; + $PhosphoSitePlusCitation =~ s/"$//g; + print "$PhosphoSitePlusCitation\n"; + next; + } + my (@x) = split(/\t/); + for my $i (0 .. 
$#x) {
        # strip carriage returns, stray newlines and double quotes from every field
        $x[$i] =~ s/\r//g; $x[$i] =~ s/\n//g; $x[$i] =~ s/\"//g;
    }
    # NOTE(review): $found_GENE is re-created on every line and is set to 1 only
    # immediately before "next", so the "$found_GENE==1" report below can never
    # fire; it would have to be declared outside the while loop to have an effect.
    my $found_GENE=0;
    #ACE print STDERR "line $line: $_\n";
    if ( (not exists($x[0])) ) {
        # blank line
        next;
    }
    elsif ( ($x[0] eq "GENE") ) {
        # column-header line
        $found_GENE=1;
        next;
    }
    if ( (not exists($x[9])) || ($x[9] eq "") ) {
        # no SITE_+/-7_AA: fatal when the row nevertheless has a SITE_GRP_ID
        if (exists($x[8]) && (not $x[8] eq "")) {
            die "$PSP_Regulatory_Sites_in line $line has no SITE_+/-7_AA: $_\n";
        } else {
            if ( (not exists($x[1])) || (not $x[1] eq "") ) {
                print "$PSP_Regulatory_Sites_in line $line (".length($_)." characters) has no SITE_+/-7_AA: $_\n"
                  if $found_GENE==1;
            }
            next;
        }
    }
    elsif ($line != 0) {
        #ACE print "PSPReg $line: $_\n" if ($x[9] eq 'KGQKYFDsGDYNMAK');
        #ACE FUE if ($species ne $species) {
        if ($species ne $x[6]) {
            # Do nothing - this record was filtered out by the species filter
            #ACE print "PSP_regsite line rejected: $line\n";
        }
        elsif (!exists($regulatory_sites_PhosphoSite_hash{$x[9]})) {
            # first time this SITE_+/-7_AA is seen: record all of its annotations
            if (!defined $domain{$x[9]} || $domain{$x[9]} eq "") {
                $regulatory_sites_PhosphoSite_hash{$x[9]} = $x[9];
                $domain{$x[9]} = $x[10];
                #ACE $psp_regsite_protein{$x[9]} = $x[1];
                $ON_FUNCTION{$x[9]} = $x[11];
                $ON_PROCESS{$x[9]} = $x[12];
                $ON_PROT_INTERACT{$x[9]} = $x[13];
                $ON_OTHER_INTERACT{$x[9]} = $x[14];
                $notes{$x[9]} = $x[19];
                $organism{$x[9]} = $x[6];
            }
        }
        else {
            # This SITE_+/-7_AA was seen on an earlier line: merge this line's
            # annotations into the stored ones.  Every look-up table is paired with
            # the input column it was filled from in the first-seen branch above.
            # Previously every table was merged with $x[10] (DOMAIN), and empty
            # entries were set to $table{$x[10]} rather than to the column value.
            my @annotation_columns = (
                [ \%domain,            10 ],
                [ \%ON_FUNCTION,       11 ],
                [ \%ON_PROCESS,        12 ],
                [ \%ON_PROT_INTERACT,  13 ],
                [ \%ON_OTHER_INTERACT, 14 ],
                [ \%notes,             19 ],
                [ \%organism,           6 ],
            );
            for my $annotation (@annotation_columns) {
                my ($lut, $column) = @{$annotation};
                my $new_value = $x[$column];
                # nothing to merge when this line has no value in the column
                next if (!defined $new_value || $new_value eq "");
                if (!defined $lut->{$x[9]} || $lut->{$x[9]} eq "") {
                    $lut->{$x[9]} = $new_value;
                }
                # Literal containment test, as was done for the domain.  index() is
                # used instead of a regex because the text is data, not a pattern:
                # metacharacters would be misread and an empty pattern would reuse
                # the last successful match.
                elsif (index($lut->{$x[9]}, $new_value) == -1) {
                    $lut->{$x[9]} = $lut->{$x[9]}." / ".$new_value;
                }
            }
        }
    }
}
close IN4;

print "... Finished reading various site data at " .
format_localtime_iso8601() ."\n\n";

# statement used by add_citation() to insert one row into the Citation table
$stmth = $dbh->prepare("
INSERT INTO Citation (
  ObjectName,
  CitationData
) VALUES (?,?)
");

# add_citation($cit_table, $cit_text, $cit_label)
#   Insert one (ObjectName, CitationData) row into the Citation table through the
#   statement prepared in $stmth just above.
#     $cit_table - name of the table or view that the citation applies to
#     $cit_text  - text of the citation
#     $cit_label - label used only in the message printed when the insert fails
#   A failed insert is reported on STDOUT and is not fatal.
sub add_citation {
    my ($cit_table, $cit_text, $cit_label) = @_;
    $stmth->bind_param(1, $cit_table);
    $stmth->bind_param(2, $cit_text);
    if (not $stmth->execute()) {
        # method calls are not interpolated inside double quotes, so concatenate errstr
        print "Error writing $cit_label cit for table $cit_table: " . $stmth->errstr . "\n";
    }
}
my ($citation_text, $citation_table);

# PSP regulatory or kinase/substrate site
$citation_text = 'PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words "PhosphoSitePlus(R), www.phosphosite.org" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. 
PMID: 25514926."'; +$citation_table = "PSP_Regulatory_site"; +add_citation($citation_table, $citation_text, "PSP_Kinase_Substrate"); +$citation_table = "psp_gene_site"; +add_citation($citation_table, $citation_text, "PSP_Kinase_Substrate"); +$citation_table = "psp_gene_site_view"; +add_citation($citation_table, $citation_text, "PSP_Regulatory_site"); +$citation_text = 'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298, https://doi.org/10.1093/nar/gkr1122'; +$citation_table = "PSP_Regulatory_site"; +add_citation($citation_table, $citation_text, "PSP_Regulatory_site"); +$citation_table = "psp_gene_site"; +add_citation($citation_table, $citation_text, "PSP_Kinase_Substrate"); +$citation_table = "psp_gene_site_view"; +add_citation($citation_table, $citation_text, "PSP_Kinase_Substrate"); + +# NetworKIN site +$citation_text = 'Linding, 2007, "Systematic discovery of in vivo phosphorylation networks.", https://pubmed.ncbi.nlm.nih.gov/17570479, https://doi.org/10.1016/j.cell.2007.05.052'; +$citation_table = "psp_gene_site"; +add_citation($citation_table, $citation_text, "NetworkKIN"); +$citation_table = "psp_gene_site_view"; +add_citation($citation_table, $citation_text, "NetworkKIN"); +$citation_text = 'Horn, 2014, "KinomeXplorer: an integrated platform for kinome biology studies.", https://pubmed.ncbi.nlm.nih.gov/24874572, https://doi.org/10.1038/nmeth.296'; +$citation_table = "psp_gene_site"; +add_citation($citation_table, $citation_text, "NetworkKIN"); +$citation_table = "psp_gene_site_view"; +add_citation($citation_table, $citation_text, "NetworkKIN"); +$citation_text = 'Aken, 2016, "The Ensembl gene annotation system.", https://pubmed.ncbi.nlm.nih.gov/33137190, https://doi.org/10.1093/database/baw093'; +$citation_table = "psp_gene_site"; +add_citation($citation_table, $citation_text, "NetworkKIN"); +$citation_table = "psp_gene_site_view"; +add_citation($citation_table, $citation_text, 
"NetworkKIN"); + +# pSTY motifs +$citation_text = 'Amanchy, 2007, "A curated compendium of phosphorylation motifs.", https://pubmed.ncbi.nlm.nih.gov/17344875, https://doi.org/10.1038/nbt0307-285'; +$citation_table = "psp_gene_site"; +add_citation($citation_table, $citation_text, "Amanchy_pSTY_motifs"); +$citation_table = "psp_gene_site_view"; +add_citation($citation_table, $citation_text, "Amanchy_pSTY_motifs"); +$citation_text = 'Gnad, 2011, "PHOSIDA 2011: the posttranslational modification database.", https://pubmed.ncbi.nlm.nih.gov/21081558, https://doi.org/10.1093/nar/gkq1159'; +$citation_table = "psp_gene_site"; +add_citation($citation_table, $citation_text, "Phosida_pSTY_motifs"); +$citation_table = "psp_gene_site_view"; +add_citation($citation_table, $citation_text, "Phosida_pSTY_motifs"); + + +############################################################################################################################### +# +# Read the data file: +# 1) find sequences that match the NetworKIN predictions +# 2) find motifs that match the observed sequences +# +############################################################################################################################### + +print "--- Find sequences that match the NetworKIN predictions and find motifs that match observed sequences\n"; + +my $ppep_regsite_LUT_stmth; +$ppep_regsite_LUT_stmth = $dbh->prepare(" + INSERT INTO ppep_regsite_LUT ( + ppep_id, + site_plusminus_7AA + ) VALUES (?,?) +"); + +my ($start_seconds, $start_microseconds) = gettimeofday; + +foreach my $peptide (keys %data) { + # find the unique phospho-motifs for this $peptide + my @all_motifs = (); + my $have_all_motifs = 0; + for my $i (0 .. $#{ $matched_sequences{$peptide} } ) { + my $tmp_motif = $p_motifs{$peptide}[$i]; + push(@all_motifs, $tmp_motif); + $have_all_motifs = 1; + } + if ($have_all_motifs == 1) { + for my $j (0 .. 
$#all_motifs) { + if (defined $all_motifs[$j]) { + $all_motifs[$j] =~ s/\d+-\[\s//; + $all_motifs[$j] =~ s/\s\]\-\d+//; + } + } + } + my %seen = (); + if ($have_all_motifs == 1) { + foreach my $a (@all_motifs) { + if (defined $a) { + if (exists($seen{$a})) { + next; + } else { + push(@{$unique_motifs{$peptide}}, $a); + $seen{$a} = 1; + } + } + print "push(\@{\$unique_motifs{$peptide}}, $a);\n" if ($verbose); + } + } + + # count the number of phospo-sites in the motif + my $number_pY = 0; + my $number_pSTY = 0; + if ($phospho_type eq 'y') { + if (defined(${$unique_motifs{$peptide}}[0])) { + while (${$unique_motifs{$peptide}}[0] =~ /pY/g) { + $number_pY++; + } + } + } + if ($phospho_type eq 'sty') { + print "looking for unique_motifs for $peptide\n" if ($verbose); + if (defined(${$unique_motifs{$peptide}}[0])) { + while (${$unique_motifs{$peptide}}[0] =~ /(pS|pT|pY)/g) { + $number_pSTY++; + print "We have found $number_pSTY unique_motifs for $peptide\n" if ($verbose); + } + } + } + + + # search each of the unique motifs for matches + print "searching $#{$unique_motifs{$peptide}} motifs for peptide $peptide\n" if ($verbose); + for my $i (0 .. $#{$unique_motifs{$peptide}}) { + print "\$i = $i; peptide = $peptide; unique_motif = ${$unique_motifs{$peptide}}[$i]\n" if ($verbose); + my $tmp_motif = ${$unique_motifs{$peptide}}[$i]; + print " --- matching unique motif $tmp_motif for peptide $peptide at " . format_localtime_iso8601() ."\n" if ($verbose); + my $formatted_sequence; + if (($number_pY == 1) || ($number_pSTY == 1)) { + my $seq_plus5aa = ""; + my $seq_plus7aa = ""; + #ACE print "tmp_motif is $tmp_motif before replacement\n"; + $formatted_sequence = &replace_pSpTpY($tmp_motif, $phospho_type); + print " a #pY $number_pY; #pSTY $number_pSTY; matching formatted motif $formatted_sequence for peptide $peptide at " . 
format_localtime_iso8601() ."\n" if ($verbose); + #ACE print "formatted_sequence is $formatted_sequence after replacement\n"; + if ($phospho_type eq 'y') { + $seq_plus5aa = (split(/(\w{0,5}y\w{0,5})/, $formatted_sequence))[1]; + $seq_plus7aa = (split(/(\w{0,7}y\w{0,7})/, $formatted_sequence))[1]; + } + elsif ($phospho_type eq "sty") { + $seq_plus5aa = (split(/(\w{0,5}(s|t|y)\w{0,5})/, $formatted_sequence))[1]; + $seq_plus7aa = (split(/(\w{0,7}(s|t|y)\w{0,7})/, $formatted_sequence))[1]; + } + + if (defined $seq_plus7aa) { + # commit the 7aa LUT records + $ppep_regsite_LUT_stmth->bind_param( 1, $ppep_id_lut{$peptide} ); + $ppep_regsite_LUT_stmth->bind_param( 2, $seq_plus7aa ); + if (not $ppep_regsite_LUT_stmth->execute()) { + print "Error writing tuple ($ppep_id_lut{$peptide},$seq_plus7aa) for peptide $peptide to ppep_regsite_LUT: $ppep_regsite_LUT_stmth->errstr\n"; + } + } + #ACE print "seq_plus5aa is $seq_plus5aa \n"; + #ACE print "seq_plus7aa is $seq_plus7aa \n"; + for my $i (0 .. $#kinases_observed) { + if (defined $seq_plus5aa) { + my $tmp = $seq_plus5aa."_".$kinases_observed[$i]; #eg, should be PGRPLsSYGMD_PKCalpha + if (exists($p_sequence_kinase -> {$tmp})) { + #ACE print($tmp."\t"); + #ACE print(($p_sequence_kinase -> {$tmp})."\n"); #ACE + $kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$i]} = "X"; #ACE + } + } + } + for my $i (0 .. $#motif_sequence) { + if ($peptide =~ /$motif_sequence[$i]/) { + $kinase_motif_matches{$peptide}{$motif_sequence[$i]} = "X"; + #ACE print "\$kinase_motif_matches{$peptide}{$motif_sequence[$i]} = 'X'; $motif_type{$motif_sequence[$i]}\n"; #ACE + } + } + for my $i (0 .. 
$#kinases_PhosphoSite) { + if (defined $seq_plus7aa) { + my $tmp = $seq_plus7aa."_".$kinases_PhosphoSite[$i]; #eg, should be RTPGRPLsSYGMDSR_PAK2 + if (exists($p_sequence_kinase_PhosphoSite -> {$tmp})) { + $kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]} = "X"; + } + } + } + #ACE print "checking for existence of \$regulatory_sites_PhosphoSite_hash{$seq_plus7aa}\n"; #ACE + if (exists($regulatory_sites_PhosphoSite_hash{$seq_plus7aa})) { + #ACE print "found regulatory_sites_PhosphoSite_hash{$seq_plus7aa}\n"; #ACE + $seq_plus7aa_2{$peptide} = $seq_plus7aa; + $domain_2{$peptide} = $domain{$seq_plus7aa}; + #ACE $psp_regsite_protein_2{$peptide} = $psp_regsite_protein{$seq_plus7aa}; + $ON_FUNCTION_2{$peptide} = $ON_FUNCTION{$seq_plus7aa}; + $ON_PROCESS_2{$peptide} = $ON_PROCESS{$seq_plus7aa}; + $ON_PROT_INTERACT_2{$peptide} = $ON_PROT_INTERACT{$seq_plus7aa}; + $ON_OTHER_INTERACT_2{$peptide} = $ON_OTHER_INTERACT{$seq_plus7aa}; + $notes_2{$peptide} = $notes{$seq_plus7aa}; + $organism_2{$peptide} = $organism{$seq_plus7aa}; + } else { + #ACE print "c not found \$regulatory_sites_PhosphoSite_hash{{$seq_plus7aa}\n"; #ACE + } + } + elsif (($number_pY > 1) || ($number_pSTY > 1)) { #eg, if $x[4] is 1308-[ VIYFQAIEEVpYpYDHLRSAAKKR ]-1329 and $number_pY == 2 + $formatted_sequence = $tmp_motif; + #ACE print "formatted_sequence is $formatted_sequence \n"; + $seq_plus5aa = ""; + $seq_plus7aa = ""; + #Create the sequences with only one phosphorylation site + #eg, 1308-[ VIYFQAIEEVpYpYDHLRSAAKKR ]-1329, which becomes 1308-[ VIYFQAIEEVpYYDHLRSAAKKR ]-1329 and 1308-[ VIYFQAIEEVYpYDHLRSAAKKR ]-1329 + + my (@sites, $offset, $next_p_site); + $sites[0] = index($tmp_motif, "p"); + $offset = $sites[0] + 1; + $next_p_site = 0; + while ($next_p_site != -1) { + $next_p_site = index($tmp_motif, "p", $offset); + if ($next_p_site != -1) { + push (@sites, $next_p_site); + } + $offset = $next_p_site+1; + } + + my @pSTY_sequences; + for my $n (0 .. 
$#sites) { + $pSTY_sequences[$n] = $tmp_motif; + for (my $m = $#sites; $m >= 0; $m--) { + if ($m != $n) {substr($pSTY_sequences[$n], $sites[$m], 1) = "";} + } + } + + my @formatted_sequences; + for my $k (0 .. $#sites) { + #ACE print "pSTY_sequences[k] is $pSTY_sequences[$k] before replacement\n"; + $formatted_sequences[$k] = &replace_pSpTpY($pSTY_sequences[$k], $phospho_type); + #ACE print "formatted_sequences[k] is $formatted_sequences[$k] \n"; + } + + for my $k (0 .. $#formatted_sequences) { + print " b #pY $number_pY; #pSTY $number_pSTY; matching formatted motif $formatted_sequences[$k] for peptide $peptide at " . format_localtime_iso8601() ."\n" if ($verbose); + #ACE print "formatted_sequences[k] for phosphotype $phospho_type is $formatted_sequences[$k] \n"; + if ($phospho_type eq 'y') { + $seq_plus5aa = (split(/(\w{0,5}y\w{0,5})/, $formatted_sequences[$k]))[1]; + $seq_plus7aa = (split(/(\w{0,7}y\w{0,7})/, $formatted_sequences[$k]))[1]; + } + elsif ($phospho_type eq "sty") { + $seq_plus5aa = (split(/(\w{0,5}(s|t|y)\w{0,5})/, $formatted_sequences[$k]))[1]; + $seq_plus7aa = (split(/(\w{0,7}(s|t|y)\w{0,7})/, $formatted_sequences[$k]))[1]; + } + #ACE print "seq_plus5aa is $seq_plus5aa \n"; + #ACE print "seq_plus7aa is $seq_plus7aa \n"; + for my $i (0 .. $#kinases_observed) { + my $tmp = $seq_plus5aa."_".$kinases_observed[$i]; #eg, should look like REEILsEMKKV_PKCalpha + #ACE print "seq_plus5aa._.kinases_observed[i] is $tmp\n"; #ACE + if (exists($p_sequence_kinase -> {$tmp})) { + $kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$i]} = "X"; + #ACE print "$tmp matched\n"; + } + } + $pSTY_sequence = $formatted_sequences[$k]; + #ACE print "trying pSTY_sequence $pSTY_sequence \n"; + for my $i (0 .. $#motif_sequence) { + if ($pSTY_sequence =~ /$motif_sequence[$i]/) { + #ACE print "match for pSTY_sequence $pSTY_sequence was $motif_sequence[$i]\n"; + $kinase_motif_matches{$peptide}{$motif_sequence[$i]} = "X"; + } + } + for my $i (0 .. 
$#kinases_PhosphoSite) { + my $tmp = $seq_plus7aa."_".$kinases_PhosphoSite[$i]; #eg, should be RTPGRPLsSYGMDSR_PAK2 + #print "seq_plus7aa._.kinases_PhosphoSite[i] is $tmp"; + if (exists($p_sequence_kinase_PhosphoSite -> {$tmp})) { + $kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]} = "X"; + #ACE print "$tmp matched \n"; + } + } + if (exists($regulatory_sites_PhosphoSite -> {$seq_plus7aa})) { + #ACE print "ACE processing seq_plus7aa '$domain{$seq_plus7aa}'\n"; #ACE + $seq_plus7aa_2{$peptide} = $seq_plus7aa; + + # $domain + if ($domain_2{$peptide} eq "") { + $domain_2{$peptide} = $domain{$seq_plus7aa}; + } + elsif ($domain{$seq_plus7aa} eq "") { + # do nothing + } + else { + $domain_2{$peptide} = $domain_2{$peptide}." / ".$domain{$seq_plus7aa}; + } + + #ACE # $psp_regsite_protein + #ACE if ($psp_regsite_protein_2{$peptide} eq "") { + #ACE $psp_regsite_protein_2{$peptide} = $psp_regsite_protein{$seq_plus7aa}; + #ACE } + #ACE elsif ($psp_regsite_protein{$seq_plus7aa} eq "") { + #ACE # do nothing + #ACE } + #ACE else { + #ACE $psp_regsite_protein_2{$peptide} = $psp_regsite_protein_2{$peptide}." / ".$psp_regsite_protein{$seq_plus7aa}; + #ACE } + + # $ON_FUNCTION_2 + if ($ON_FUNCTION_2{$peptide} eq "") { + $ON_FUNCTION_2{$peptide} = $ON_FUNCTION{$seq_plus7aa}; + } + elsif ($ON_FUNCTION{$seq_plus7aa} eq "") { + # do nothing + } + else { + $ON_FUNCTION_2{$peptide} = $ON_FUNCTION_2{$peptide}." / ".$ON_FUNCTION{$seq_plus7aa}; + } + + # $ON_PROCESS_2 + if ($ON_PROCESS_2{$peptide} eq "") { + $ON_PROCESS_2{$peptide} = $ON_PROCESS{$seq_plus7aa}; + } + elsif ($ON_PROCESS{$seq_plus7aa} eq "") { + # do nothing + } + else { + $ON_PROCESS_2{$peptide} = $ON_PROCESS_2{$peptide}." 
/ ".$ON_PROCESS{$seq_plus7aa}; + } + + # $ON_PROT_INTERACT_2 + if ($ON_PROT_INTERACT_2{$peptide} eq "") { + $ON_PROT_INTERACT_2{$peptide} = $ON_PROT_INTERACT{$seq_plus7aa}; + } + elsif ($ON_PROT_INTERACT{$seq_plus7aa} eq "") { + # do nothing + } + else { + $ON_PROT_INTERACT_2{$peptide} = $ON_PROT_INTERACT_2{$peptide}." / ".$ON_PROT_INTERACT{$seq_plus7aa}; + } + + # $ON_OTHER_INTERACT_2 + if ($ON_OTHER_INTERACT_2{$peptide} eq "") { + $ON_OTHER_INTERACT_2{$peptide} = $ON_OTHER_INTERACT{$seq_plus7aa}; + } + elsif ($ON_OTHER_INTERACT{$seq_plus7aa} eq "") { + # do nothing + } + else { + $ON_OTHER_INTERACT_2{$peptide} = $ON_OTHER_INTERACT_2{$peptide}." / ".$ON_OTHER_INTERACT{$seq_plus7aa}; + } + + # $notes_2 + if ($notes_2{$peptide} eq "") { + $notes_2{$peptide} = $notes{$seq_plus7aa}; + } + elsif ($notes{$seq_plus7aa} eq "") { + # do nothing + } + else { + $notes_2{$peptide} = $notes_2{$peptide}." / ".$notes{$seq_plus7aa}; + } + $notes_2{$peptide} = $notes{$seq_plus7aa}; + + # $organism_2 + if ($organism_2{$peptide} eq "") { + $organism_2{$peptide} = $organism{$seq_plus7aa}; + } + elsif ($organism{$seq_plus7aa} eq "") { + # do nothing + } + else { + $organism_2{$peptide} = $organism_2{$peptide}." / ".$organism{$seq_plus7aa}; + } + $organism_2{$peptide} = $organism{$seq_plus7aa}; + } else { + #ACE print "d not found \$regulatory_sites_PhosphoSite_hash{{$seq_plus7aa}}\n"; + } # if (exists($regulatory_sites_PhosphoSite -> {$seq_plus7aa})) + } # for my $k (0 .. $#formatted_sequences) + } # if/else number of phosphosites + } # for each motif i # for my $i (0 .. 
$#{$unique_motifs{$peptide}}) +} # for each $peptide + +my ($end_seconds, $end_microseconds) = gettimeofday; + +my $delta_seconds = $end_seconds - $start_seconds; +my $delta_microseconds = $end_microseconds - $start_microseconds; +$delta_microseconds += 1000000 * $delta_seconds; +my $key_count = keys(%data); +print sprintf("Average search time is %d microseconds per phopshopeptide\n", ($delta_microseconds / $key_count)); + +($start_seconds, $start_microseconds) = gettimeofday; + +print "Writing PSP_Regulatory_site records\n"; + +#ACE $stmth = $dbh->prepare(" +#ACE INSERT INTO PSP_Regulatory_site ( +#ACE DOMAIN, +#ACE ON_FUNCTION, +#ACE ON_PROCESS, +#ACE ON_PROT_INTERACT, +#ACE ON_OTHER_INTERACT, +#ACE NOTES, +#ACE SITE_PLUSMINUS_7AA, +#ACE ORGANISM, +#ACE PROTEIN +#ACE ) VALUES (?,?,?,?,?,?,?,?,?) +#ACE "); + +$stmth = $dbh->prepare(" + INSERT INTO PSP_Regulatory_site ( + DOMAIN, + ON_FUNCTION, + ON_PROCESS, + ON_PROT_INTERACT, + ON_OTHER_INTERACT, + NOTES, + SITE_PLUSMINUS_7AA, + ORGANISM + ) VALUES (?,?,?,?,?,?,?,?) 
+ "); + +foreach my $peptide (keys %data) { + if (exists($domain_2{$peptide}) and (defined $domain_2{$peptide}) and (not $domain_2{$peptide} eq "") ) { + #ACE print "writing domain $domain_2{$peptide} for regulatory site(s) $seq_plus7aa_2{$peptide}\n"; #ACE + $stmth->bind_param(1, $domain_2{$peptide}); + $stmth->bind_param(2, $ON_FUNCTION_2{$peptide}); + $stmth->bind_param(3, $ON_PROCESS_2{$peptide}); + $stmth->bind_param(4, $ON_PROT_INTERACT_2{$peptide}); + $stmth->bind_param(5, $ON_OTHER_INTERACT_2{$peptide}); + $stmth->bind_param(6, $notes_2{$peptide}); + $stmth->bind_param(7, $seq_plus7aa_2{$peptide}); + $stmth->bind_param(8, $organism_2{$peptide}); + #ACE $stmth->bind_param(9, $psp_regsite_protein_2{$peptide}); + if (not $stmth->execute()) { + print "Error writing PSP_Regulatory_site for one regulatory site with peptide '$domain_2{$peptide}': $stmth->errstr\n"; + } else { + #ACE print "added domain for $domain_2{$peptide}\n"; + } + } elsif (exists($domain_2{$peptide}) and (not defined $domain_2{$peptide})) { + print "\$domain_2{$peptide} is undefined\n"; #ACE + } +} + +$dbh->{AutoCommit} = $auto_commit; +# auto_commit implicitly finishes stmth, apparently # $stmth->finish; +$dbh->disconnect if ( defined $dbh ); + + +($end_seconds, $end_microseconds) = gettimeofday; + +$delta_seconds = $end_seconds - $start_seconds; +$delta_microseconds = $end_microseconds - $start_microseconds; +$delta_microseconds += 1000000 * $delta_seconds; +$key_count = keys(%data); +print sprintf("Write time is %d microseconds\n", ($delta_microseconds)); + +print "... Finished find sequences that match the NetworKIN predictions and find motifs that match observed sequences at " . 
format_localtime_iso8601() ."\n\n"; + +############################################################################################################################### +# +# Print to the output file +# +############################################################################################################################### +open (OUT, ">$file_out") || die "could not open the fileout: $file_out"; +open (MELT, ">$file_melt") || die "could not open the fileout: $file_melt"; + +# print the header info +print MELT "phospho_peptide\tgene_names\tsite_type\tkinase_map\n"; +print OUT "p-peptide\tProtein description\tGene name(s)\tFASTA name\tPhospho-sites\tUnique phospho-motifs, no residue numbers\tAccessions\tPhospho-motifs for all members of protein group with residue numbers\t"; + +# print the PhosphoSite regulatory data +print OUT "Domain\tON_FUNCTION\tON_PROCESS\tON_PROT_INTERACT\tON_OTHER_INTERACT\tPhosphoSite notes\t"; + +# print the sample names +for my $i (0 .. $#samples) { print OUT "$samples[$i]\t"; } + +# print the kinases and groups +for my $i (0 .. $#kinases_observed) { + my $temp = $kinases_observed[$i]."_NetworKIN"; + print OUT "$temp\t"; + push(@kinases_observed_lbl, $temp); +} +for my $i (0 .. $#motif_sequence) { + print OUT "$motif_type{$motif_sequence[$i]} ($motif_sequence[$i])\t"; +} +for my $i (0 .. $#kinases_PhosphoSite) { + my $temp = $kinases_PhosphoSite[$i]."_PhosphoSite"; + if ($i < $#kinases_PhosphoSite) { print OUT "$temp\t"; } + if ($i == $#kinases_PhosphoSite) { print OUT "$temp\n"; } + push(@phosphosites_observed_lbl, $temp); +} + +# begin DDL-to-SQLite +# --- +$dbh = DBI->connect("dbi:SQLite:$db_out", undef, undef); +$auto_commit = $dbh->{AutoCommit}; +$dbh->{AutoCommit} = 0; +print "DB connection $dbh is to $db_out, opened for modification\n"; + +my $sample_stmth; +$sample_stmth = $dbh->prepare(" + INSERT INTO sample ( + id, + name + ) VALUES (?,?) 
+"); + +my $ppep_intensity_stmth; +$ppep_intensity_stmth = $dbh->prepare(" + INSERT INTO ppep_intensity ( + ppep_id, + sample_id, + intensity + ) VALUES (?,?,?) +"); + +my $site_type_stmth; +$site_type_stmth = $dbh->prepare(" + insert into site_type ( + id, + type_name + ) values (?,?) +"); + +my $ppep_gene_site_stmth; +$ppep_gene_site_stmth = $dbh->prepare(" + insert into ppep_gene_site ( + ppep_id, + gene_names, + kinase_map, + site_type_id + ) values (?,?,?,?) +"); + +my $ppep_metadata_stmth; +$ppep_metadata_stmth = $dbh->prepare(" + INSERT INTO ppep_metadata + ( ppep_id + , protein_description + , gene_name + , FASTA_name + , phospho_sites + , motifs_unique + , accessions + , motifs_all_members + , domain + , ON_FUNCTION + , ON_PROCESS + , ON_PROT_INTERACT + , ON_OTHER_INTERACT + , notes + ) VALUES ( + ?,?,?,?,?,?,? + , ?,?,?,?,?,?,? + ) +"); +# end DDL-to-SQLite +# ... + +# begin store-to-SQLite "sample" table +# --- +# %sample_id_lut maps name -> ID +for my $sample_name (keys %sample_id_lut) { + $sample_stmth->bind_param( 2, $sample_name ); + $sample_stmth->bind_param( 1, $sample_id_lut{$sample_name} ); + if (not $sample_stmth->execute()) { + print "Error writing tuple ($sample_name,$sample_id_lut{$sample_name}): $sample_stmth->errstr\n"; + } +} +# end store-to-SQLite "sample" table +# ... + +# begin store-to-SQLite "site_type" table +# --- +sub add_site_type { + my ($site_type_id, $site_type_type_name) = @_; + $site_type_stmth->bind_param( 2, $site_type_type_name ); + $site_type_stmth->bind_param( 1, $site_type_id ); + if (not $site_type_stmth->execute()) { + die "Error writing tuple ($site_type_id,$site_type_type_name): $site_type_stmth->errstr\n"; + } +} +add_site_type($SITE_KINASE_SUBSTRATE, $site_description{$SITE_KINASE_SUBSTRATE}); +add_site_type($SITE_MOTIF, $site_description{$SITE_MOTIF}); +add_site_type($SITE_PHOSPHOSITE, $site_description{$SITE_PHOSPHOSITE}); +# end store-to-SQLite "site_type" table +# ... 

###############################################################################
# Write one row per phosphopeptide to:
#   - OUT  : the wide tab-separated table (metadata, regulatory-site data,
#            sample intensities, then one "X"-or-empty cell per kinase/motif)
#   - MELT : the melted (long) table, one line per peptide/kinase-or-motif hit
#   - SQLite tables ppep_metadata, ppep_intensity and ppep_gene_site
# and then close the output files and the database connection.
###############################################################################

# Insert one row into ppep_gene_site; on failure, report the DBI error.
#   $tuple_text is only used to identify the row in the error message.
my $insert_ppep_gene_site = sub {
    my ($ppep_id, $gene_names, $kinase_map, $site_type_id, $tuple_text) = @_;
    $ppep_gene_site_stmth->bind_param(1, $ppep_id);       # ppep_gene_site.ppep_id
    $ppep_gene_site_stmth->bind_param(2, $gene_names);    # ppep_gene_site.gene_names
    $ppep_gene_site_stmth->bind_param(3, $kinase_map);    # ppep_gene_site.kinase_map
    $ppep_gene_site_stmth->bind_param(4, $site_type_id);  # ppep_gene_site.site_type_id
    if (not $ppep_gene_site_stmth->execute()) {
        # Method calls are not interpolated inside double quotes, so errstr
        # must be concatenated to actually appear in the message.
        print "Error writing tuple ($tuple_text): " . $ppep_gene_site_stmth->errstr . "\n";
    }
};

foreach my $peptide (sort(keys %data)) {
    # Skip only the peptides listed in @failed_matches.  The previous form,
    # grep($peptide, @failed_matches), evaluated the truth of $peptide once
    # per element, so ANY non-empty @failed_matches skipped EVERY peptide.
    # NOTE(review): assumes @failed_matches holds peptide strings - confirm
    # where it is populated.
    next if (grep { $_ eq $peptide } @failed_matches);

    my $ppep_id = $ppep_id_lut{$peptide};
    my @ppep_metadata = ();   # values for the 14 ppep_metadata columns, in order
    my @gene = ();
    my $gene_names;

    # Print the peptide itself
    # column 1: p-peptide
    print OUT "$peptide\t";
    push (@ppep_metadata, $ppep_id);

    # skip over failed matches
    if ($matched_sequences{$peptide} eq "Failed match") {
        print OUT "Sequence not found in FASTA database\tNA\tNA\tNA\tNA\tNA\tNA\t";
        # Mirror the seven printed placeholders so that the regulatory-site
        # values pushed below still land in columns 9..14 of ppep_metadata.
        push (@ppep_metadata, "Sequence not found in FASTA database", ("NA") x 6);
    } else {
        my @description = ();
        my %seen = ();

        # Print just the protein description: the text before " OS",
        # without its first whitespace-delimited token.
        for my $i (0 .. $#{$names{$peptide}}) {
            my $long_name = $names{$peptide}[$i];
            my @naming_parts = split(/\sOS/, $long_name);
            my @front_half = split(/\s/, $naming_parts[0]);
            push(@description, join(" ", @front_half[1..($#front_half)]));
        }
        # column 2: Protein description
        my $descriptions = join(" /// ", @description);
        print OUT $descriptions, "\t";
        push (@ppep_metadata, $descriptions);

        # Print just the gene name: the token following "GN=", de-duplicated
        # while preserving first-seen order.
        for my $i (0 .. $#{$names{$peptide}}) {
            my $tmp_gene = $names{$peptide}[$i];
            $tmp_gene =~ s/^.*GN=//;
            $tmp_gene =~ s/\s.*//;
            if (!exists($seen{$tmp_gene})) {
                push(@gene, $tmp_gene);
                $seen{$tmp_gene} = $tmp_gene;
            }
        }
        # column 3: Gene name(s)
        $gene_names = join(" /// ", @gene);
        print OUT $gene_names, "\t";
        push (@ppep_metadata, $gene_names);

        # print the entire names
        # column 4: FASTA name
        my $fasta_names = join(" /// ", @{$names{$peptide}});
        print OUT $fasta_names, "\t";
        push (@ppep_metadata, $fasta_names);

        # Print the phospho-residues
        # column 5:
        # Sites of one matched sequence are separated by ", "; matched
        # sequences are separated by " /// "; the last one ends the column.
        my $tmp_for_insert = "";
        my $last_match = $#{ $matched_sequences{$peptide} };
        for my $i (0 .. $last_match) {
            my $is_last_match = ($i == $last_match);
            if (defined $p_residues{$peptide}{$i}) {
                my @tmp_p_residues = @{$p_residues{$peptide}{$i}};
                # an empty site list contributes nothing (not even a separator)
                next unless (@tmp_p_residues);
                my @sites = ();
                for my $j (0 .. $#tmp_p_residues) {
                    my $tmp_site_for_printing = $tmp_p_residues[$j] + 1; # added 12.05.2012 for Justin's data
                    push(@sites, "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing");
                }
                my $site_list = join(", ", @sites);
                print OUT $site_list, ($is_last_match ? "\t" : " /// ");
                $tmp_for_insert .= $site_list . ($is_last_match ? "" : " /// ");
            } elsif ($is_last_match) {
                # no residues for the final match: still terminate the column
                print OUT "\t";
            }
        }
        push (@ppep_metadata, $tmp_for_insert);

        # Print the UNIQUE phospho-motifs
        # Column 6:
        my $unique_motif_list = join(" /// ", @{$unique_motifs{$peptide}});
        print OUT $unique_motif_list, "\t";
        push (@ppep_metadata, $unique_motif_list);

        # Print the accessions
        # Column 7:
        my $accession_list = (defined $accessions{$peptide})
            ? join(" /// ", @{$accessions{$peptide}})
            : "";
        print OUT $accession_list, "\t";
        push (@ppep_metadata, $accession_list);

        # print ALL motifs with residue numbers
        # Column 8:
        my $all_motif_list = (defined $p_motifs{$peptide})
            ? join(" /// ", @{$p_motifs{$peptide}})
            : "";
        print OUT $all_motif_list, "\t";
        push (@ppep_metadata, $all_motif_list);
    }

    # Print the PhosphoSite regulatory data (empty cell when undefined);
    # the same six values fill ppep_metadata columns 9..14.
    for my $regsite_field (\%domain_2, \%ON_FUNCTION_2, \%ON_PROCESS_2,
                           \%ON_PROT_INTERACT_2, \%ON_OTHER_INTERACT_2, \%notes_2) {
        my $value = (defined $regsite_field->{$peptide}) ? $regsite_field->{$peptide} : "";
        print OUT "$value\t";
        push (@ppep_metadata, $value);
    }

    # begin store-to-SQLite "ppep_metadata" table
    # ---
    for my $column (1..14) {
        $ppep_metadata_stmth->bind_param($column, $ppep_metadata[$column-1]);
    }
    if (not $ppep_metadata_stmth->execute()) {
        # Identify the row by the peptide; the loop index is not meaningful
        # here, and errstr must be concatenated rather than interpolated.
        print "Error writing ppep_metadata row for phosphopeptide $peptide: " . $ppep_metadata_stmth->errstr . "\n";
    }
    # ...
    # end store-to-SQLite "ppep_metadata" table

    # Print the data
    my @tmp_data = @{$data{$peptide}};
    print OUT join("\t", @tmp_data), "\t";

    # begin store-to-SQLite "ppep_intensity" table
    # ---
    # commit the sample intensities; the n-th intensity belongs to $samples[n]
    my $sample_index = 0;
    foreach my $intense (@tmp_data) {
        $ppep_intensity_stmth->bind_param( 1, $ppep_id );
        $ppep_intensity_stmth->bind_param( 2, $sample_id_lut{$samples[$sample_index]} );
        $ppep_intensity_stmth->bind_param( 3, $intense );
        if (not $ppep_intensity_stmth->execute()) {
            print "Error writing tuple ($peptide,$samples[$sample_index],$intense): " . $ppep_intensity_stmth->errstr . "\n";
        }
        $sample_index += 1;
    }
    # ...
    # end store-to-SQLite "ppep_intensity" table

    # print the kinase-substrate data (NetworKIN)
    for my $i (0 .. $#kinases_observed) {
        if (exists($kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$i]})) {
            print OUT "X\t";
            my $NetworKIN_label = $kinases_observed[$i]."_NetworKIN";
            print MELT "$peptide\t$gene_names\t$site_description{$SITE_KINASE_SUBSTRATE}\t$NetworKIN_label\n";
            $insert_ppep_gene_site->(
                $ppep_id, $gene_names, $NetworKIN_label, $SITE_KINASE_SUBSTRATE,
                "$peptide,$gene_names,$kinases_observed[$i]"
            );
        }
        else { print OUT "\t";}
    }

    # print the motif matches; each distinct peptide/gene/motif combination
    # is written to MELT and to ppep_gene_site only once
    my %wrote_motif;
    for my $i (0 .. $#motif_sequence) {
        if (exists($kinase_motif_matches{$peptide}{$motif_sequence[$i]})) {
            print OUT "X\t";
            my $motif_parts_0 = $motif_type{$motif_sequence[$i]}." ".$motif_sequence[$i];
            my $key = "$peptide\t$gene_names\t$motif_parts_0";
            if (!exists($wrote_motif{$key})) {
                $wrote_motif{$key} = $key;
                print MELT "$peptide\t$gene_names\t$site_description{$SITE_MOTIF}\t$motif_parts_0\n";
                $insert_ppep_gene_site->(
                    $ppep_id, $gene_names, $motif_parts_0, $SITE_MOTIF,
                    "$peptide,$gene_names,$motif_parts_0"
                );
            }
        }
        else { print OUT "\t";}
    }

    # print the PhosphoSite kinase-substrate data; the last cell ends the row
    for my $i (0 .. $#kinases_PhosphoSite) {
        my $cell_end = ($i < $#kinases_PhosphoSite) ? "\t" : "\n";
        if (exists($kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]})) {
            print MELT "$peptide\t$gene_names\t$site_description{$SITE_PHOSPHOSITE}\t$phosphosites_observed_lbl[$i]\n";
            print OUT "X", $cell_end;
            $insert_ppep_gene_site->(
                $ppep_id, $gene_names, $phosphosites_observed_lbl[$i], $SITE_PHOSPHOSITE,
                "$peptide,$gene_names,$phosphosites_observed_lbl[$i]"
            );
        }
        else {
            print OUT $cell_end;
        }
    }
}

close OUT;
close MELT;
$ppep_gene_site_stmth->finish;
print "begin DB commit at " . format_localtime_iso8601() . "\n";
# Restore the AutoCommit setting saved when the connection was opened.
# NOTE(review): presumably this is what commits the pending inserts (DBI
# commits when AutoCommit is switched back on) - confirm $auto_commit is true.
$dbh->{AutoCommit} = $auto_commit;
$dbh->disconnect if ( defined $dbh );

print "\nFinished writing output at " . format_localtime_iso8601() ."\n\n";

###############################################################################################################################
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Mon Mar 07 19:05:01 2022 +0000 @@ -0,0 +1,4 @@ +<macros> + <token name="@TOOL_VERSION@">0.1.0</token> + <token name="@VERSION_SUFFIX@">0</token> +</macros>
#!/usr/bin/env Rscript
# mqppep_anova.R
#
# Command-line wrapper for the "MaxQuant Phosphopeptide ANOVA" Galaxy tool.
# It parses the command-line options, echoes them for the job log, and then
# renders mqppep_anova_script.Rmd (expected in the same directory as this
# script) as a self-contained HTML report, passing the options as parameters.

# libraries
library(optparse)
library(data.table)
library(stringr)
#library(ggplot2)
#library(PTXQC)
#require(PTXQC)
#require(methods)
# bioconductor-preprocesscore
#  - libopenblas
#  - r-data.table
#  - r-rmarkdown
#  - r-ggplot2
#  - texlive-core

# ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285

# parse options
option_list <- list(
  # <param name="inputFilename" type="data" format="tabular" label="Phosphopeptide Intensities" help="First column label 'Phosphopeptide'; sample-intensities must begin in column 10 and must have column labels to match argument regexSampleNames"/>
  make_option(
    c("-i", "--inputFile"),
    action = "store",
    default = NA,
    type = "character",
    help = "Phosphopeptide Intensities sparse input file path"
  ),
  make_option(
    c("-a", "--alphaFile"),
    action = "store",
    default = NA,
    type = "character",
    help = "List of alpha cutoff values for significance testing; path to text file having one column and no header"
  ),
  make_option(
    c("-f", "--firstDataColumn"),
    action = "store",
    default = "10",
    type = "character",
    help = "First column of intensity values"
  ),
  make_option( # imputationMethod <- c("group-median","median","mean","random")[1]
    c("-m", "--imputationMethod"),
    action = "store",
    default = "group-median",
    type = "character",
    help = "Method for missing-value imputation, one of c('group-median','median','mean','random')"
  ),
  make_option(
    c("-p", "--meanPercentile"),
    action = "store",
    default = 3,
    type = "integer",
    help = "Mean percentile for randomly generated imputed values; range [1,99]"
  ),
  make_option(
    c("-d", "--sdPercentile"),
    action = "store",
    default = 3,
    type = "double",
    help = "Adjustment value for standard deviation of randomly generated imputed values; real"
  ),
  make_option(
    c("-s", "--regexSampleNames"),
    action = "store",
    default = "\\.(\\d+)[A-Z]$",
    type = "character",
    help = "Regular expression extracting sample-names"
  ),
  make_option(
    c("-g", "--regexSampleGrouping"),
    action = "store",
    default = "(\\d+)",
    type = "character",
    help = "Regular expression extracting sample-group from an extracted sample-name"
  ),
  # <data name="imputed_data_file" format="tabular" label="${input_file.name}.intensities_${imputation.imputation_method}-imputed_QN_LT" ></data>
  make_option(
    c("-o", "--imputedDataFile"),
    action = "store",
    default = "output_imputed.tsv",
    type = "character",
    help = "Imputed Phosphopeptide Intensities output file path"
  ),
  # <data name="report_file" format="html" label="report (download/unzip to view)" ></data>
  make_option(
    c("-r", "--reportFile"),
    action = "store",
    default = "QuantDataProcessingScript.html",
    type = "character",
    help = "HTML report file path"
  )
)
args <- parse_args(OptionParser(option_list = option_list))

# Check parameter values

# Stop with a readable message unless `path` names an existing file.
# The is.na() guard matters because the option default is NA and
# file.exists(NA) raises an unrelated "invalid 'file' argument" error.
require_existing_file <- function(path, label) {
  if (is.na(path) || !file.exists(path)) {
    stop(paste(label, path, "does not exist"), call. = FALSE)
  }
  invisible(path)
}

# Resolve a regular-expression option.
# The Galaxy wrapper passes the path of a config file that holds the pattern
# (which keeps backslashes away from the shell); the option's help text and
# default, however, describe a literal pattern.  Both forms are accepted: when
# the value names an existing regular file, the pattern is read from it (at
# most 1000 characters); otherwise the value itself is the pattern.
# Leading and trailing blanks, tabs, and newlines are trimmed.
read_regex_arg <- function(arg, option_name) {
  if (is.na(arg)) {
    stop(sprintf("No value supplied for --%s", option_name), call. = FALSE)
  }
  pattern <- arg
  if (file.exists(arg) && !dir.exists(arg)) {
    pattern <- readChar(arg, 1000)
  }
  if (length(pattern) != 1L) {
    # readChar found nothing to read
    stop(
      sprintf("No regular expression could be read for --%s", option_name),
      call. = FALSE
    )
  }
  pattern <- gsub("^[ \t\n]*", "", pattern)
  gsub("[ \t\n]*$", "", pattern)
}

require_existing_file(args$inputFile, "Input file")
require_existing_file(args$alphaFile, "Alpha file")

inputFile <- args$inputFile
alphaFile <- args$alphaFile
firstDataColumn <- args$firstDataColumn
imputationMethod <- args$imputationMethod
meanPercentile <- args$meanPercentile
sdPercentile <- args$sdPercentile

regexSampleNames <- read_regex_arg(args$regexSampleNames, "regexSampleNames")
cat(regexSampleNames)
cat("\n")

regexSampleGrouping <- read_regex_arg(
  args$regexSampleGrouping,
  "regexSampleGrouping"
)
cat(regexSampleGrouping)
cat("\n")

imputedDataFilename <- args$imputedDataFile
reportFileName <- args$reportFile

# Echo the effective settings to the job log.
# (str() prints its argument itself and returns NULL, so no cat() is needed.)
print("args is:")
str(args)

print("regexSampleNames is:")
str(regexSampleNames)

print("regexSampleGrouping is:")
str(regexSampleGrouping)

# from: https://github.com/molgenis/molgenis-pipelines/wiki/How-to-source-another_file.R-from-within-your-R-script
# Return the directory containing this .R script, or NULL when it cannot be
# determined (needed to locate the Rmd file that lives beside this script).
LocationOfThisScript <- function() {
  this_file <- NULL
  # This file may be 'sourced'
  for (i in -seq_len(sys.nframe())) {
    if (identical(sys.function(i), base::source)) {
      this_file <- normalizePath(sys.frame(i)$ofile)
    }
  }

  if (!is.null(this_file)) {
    return(dirname(this_file))
  }

  # But it may also be called from the command line
  cmd_args <- commandArgs(trailingOnly = FALSE)
  cmd_args_trailing <- commandArgs(trailingOnly = TRUE)
  # keep only the interpreter's own arguments, dropping the script's arguments
  cmd_args <- cmd_args[seq_len(length(cmd_args) - length(cmd_args_trailing))]
  res <- gsub("^(?:--file=(.*)|.*)$", "\\1", cmd_args)

  # If multiple --file arguments are given, R uses the last one
  res <- tail(res[res != ""], 1)
  if (length(res) > 0) {
    return(dirname(res))
  }

  # Both are not the case. Maybe we are in an R GUI?
  NULL
}

script.dir <- LocationOfThisScript()
if (is.null(script.dir)) {
  # Location unknown (e.g., interactive session): look in the working directory
  script.dir <- "."
}

rmarkdown_params <- list(
  inputFile = inputFile,
  alphaFile = alphaFile,
  firstDataColumn = firstDataColumn,
  imputationMethod = imputationMethod,
  meanPercentile = meanPercentile,
  sdPercentile = sdPercentile,
  regexSampleNames = regexSampleNames,
  regexSampleGrouping = regexSampleGrouping,
  imputedDataFilename = imputedDataFilename
)

str(rmarkdown_params)

# BUG
# Must render as HTML for the time being until this issue is resolved:
#   https://github.com/conda-forge/texlive-core-feedstock/issues/19
# for reason:
#   "The following dependencies are not available in conda"
# reported here:
#   https://github.com/ami-iit/bipedal-locomotion-framework/pull/457/commits/e98ccef8c8cb63e207df36628192af6ce22feb13

# freeze the random number generator so the same results will be produced from run to run
set.seed(28571)

rmarkdown::render(
  input = file.path(script.dir, "mqppep_anova_script.Rmd"),
  output_format = rmarkdown::html_document(pandoc_args = "--self-contained"),
  output_file = reportFileName,
  params = rmarkdown_params
)
<tool id="mqppep_anova" name="MaxQuant Phosphopeptide ANOVA" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" python_template_version="3.5">
    <description>Perform ANOVA on merged and filtered data from phospho-peptide enrichment/MaxQuant pipeline</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <requirements>
        <requirement type="package" version="1.7.1">r-optparse</requirement>
        <requirement type="package" version="1.4.0">r-stringr</requirement>
        <requirement type="package" version="1.14.2">r-data.table</requirement>
        <requirement type="package" version="3.3.5">r-ggplot2</requirement>
        <requirement type="package" version="1.56.0">bioconductor-preprocesscore</requirement>
        <requirement type="package" version="0.3.3" >openblas</requirement>
        <requirement type="package" version="2.11" >r-rmarkdown</requirement>
        <requirement type="package" version="0.4.0" >r-sass</requirement>
        <requirement type="package" >texlive-core</requirement>

    </requirements>
    <!-- Rscript -e 'rmarkdown::render("QuantDataProcessingScript.Rmd")' -->
    <!--
      The two regular expressions are handed to the R script through
      configfiles rather than on the command line, so that backslashes and
      other shell-significant characters reach R untouched.
      meanPercentile and sdPercentile are declared inside the "imputation"
      conditional, so they are referenced through that conditional here.
    -->
    <command detect_errors="exit_code"><![CDATA[
cat '$sample_names_regex_f'; cat '$sample_grouping_regex_f';
Rscript '$__tool_directory__/mqppep_anova.R'
--inputFile '$input_file'
--alphaFile '$alpha_file'
--firstDataColumn '$first_data_column'
--imputationMethod '$imputation.imputation_method'
#if str($imputation.imputation_method) == 'random':
  --meanPercentile '$imputation.meanPercentile'
  --sdPercentile '$imputation.sdPercentile'
#end if
--regexSampleNames '$sample_names_regex_f'
--regexSampleGrouping '$sample_grouping_regex_f'
--imputedDataFile '$imputed_data_file'
--reportFile '$report_file'
    ]]></command>
    <configfiles>
        <configfile name="sample_names_regex_f">
            $sample_names_regex
        </configfile>
        <configfile name="sample_grouping_regex_f">
            $sample_grouping_regex
        </configfile>
    </configfiles>
    <inputs>
        <param name="input_file" type="data" format="tabular" label="Filtered Phosphopeptide Intensities"
               help="[input_file] Phosphopeptide intensities filtered for minimal quality. First column label 'Phosphopeptide'; sample-intensities must begin in column 10 and must have column labels to match argument [sample_names_regex]"
        />
        <param name="alpha_file" type="data" format="tabular" label="alpha cutoff level"
               help="[alpha_file] List of alpha cutoff values for significance testing; text file having one column and no header"
        />
        <param name="first_data_column" type="text" value="Intensity"
               label="First data column"
               help="[first_data_column] First column having intensity values (integer or PERL-compatible regular expression matching column label)"
        />
        <!-- imputation_method <- c("group-median","median","mean","random")[1] -->
        <conditional name="imputation">
            <param name="imputation_method" type="select" label="Imputation Method"
                   help="[imputation_method] Impute missing values by (1) using median for each sample-group; (2) using median across all samples; (3) using mean across all samples; or (4) using randomly generated values having same std. dev. as across all samples (with mean specified by [meanPercentile])"
            >
                <option value="random" selected="true">random</option>
                <option value="group-median">group-median</option>
                <option value="median">median</option>
                <option value="mean">mean</option>
            </param>
            <when value="group-median" />
            <when value="median" />
            <when value="mean" />
            <when value="random">
                <param name="meanPercentile" type="integer" value="1" min="1" max="99"
                       label="Mean percentile for random values"
                       help="[meanPercentile] Percentile center of random values; range [1,99]"
                />
                <param name="sdPercentile" type="float" value="0.2"
                       label="Percentile std. dev. for random values"
                       help="[sdPercentile] Standard deviation adjustment-factor for random values; real number. (1.0 means SD equal to the SD for the entire data set.)"
                />
            </when>
        </conditional>
        <param name="sample_names_regex" type="text" value="\.(\d+)[A-Z]$"
               help="[sample_names_regex] PERL-compatible regular expression extracting sample-names from the name of a spectrum file (without extension)"
               label="Sample-extraction regex">
            <sanitizer>
                <valid initial="string.printable">
                    <remove value="&apos;"/>
                </valid>
            </sanitizer>
        </param>
        <param name="sample_grouping_regex" type="text" value="(\d+)"
               help="[sample_grouping_regex] PERL-compatible regular expression extracting sample-group from each sample-name (i.e., extracted by previous regex pattern)"
               label="Group-extraction regex">
            <sanitizer>
                <valid initial="string.printable">
                    <remove value="&apos;"/>
                </valid>
            </sanitizer>
        </param>
    </inputs>
    <outputs>
        <data name="imputed_data_file" format="tabular" label="${input_file.name}.intensities_${imputation.imputation_method}-imputed_QN_LT" ></data>
        <data name="report_file" format="html" label="${input_file.name}.report (download/unzip to view)" ></data>
    </outputs>
    <tests>
        <test>
            <param name="input_file" ftype="tabular" value="test_input_for_anova.tabular"/>
            <param name="alpha_file" ftype="tabular" value="alpha_levels.tabular"/>
            <param name="first_data_column" value="10"/>
            <param name="imputation_method" value="group-median"/>
            <param name="sample_names_regex" value="\.\d+[A-Z]$"/>
            <param name="sample_grouping_regex" value="\d+"/>
            <output name="imputed_data_file">
                <assert_contents>
                    <has_text text="Phosphopeptide" />
                    <has_text text="AAAAAAAGDpSDpSWDADAFSVEDPVRK" />
                    <has_text text="23574000" />
                    <has_text text="pSESELIDELSEDFDR" />
                </assert_contents>
            </output>
        </test>
        <test>
            <param name="input_file" ftype="tabular" value="test_input_for_anova.tabular"/>
            <param name="alpha_file" ftype="tabular" value="alpha_levels.tabular"/>
            <param name="first_data_column" value="10"/>
            <param name="imputation_method" value="random"/>
            <param name="sample_names_regex" value="\.\d+[A-Z]$"/>
            <param name="sample_grouping_regex" value="\d+"/>
            <output name="imputed_data_file">
                <assert_contents>
                    <has_text text="Phosphopeptide" />
                    <has_text text="AAAAAAAGDpSDpSWDADAFSVEDPVRK" />
                    <has_text text="997800000" />
                    <has_text text="pSESELIDELSEDFDR" />
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
===========================================
Phosphoproteomic Enrichment Pipeline ANOVA
===========================================

**Input files**

``input_file``
    Phosphopeptides annotated with SwissProt and phosphosite metadata (in tabular format).
    This is the output from the "Phopsphoproteomic Enrichment Pipeline Merge and Filter"
    (``mqppep_mrgflt``) tool.

``alpha_file``
    List of alpha cutoff values for significance testing; text file having one column and no header. For example:

::

    0.2
    0.1
    0.05

**Input parameters**

``first_data_column``
    First column of ``input_file`` having intensity values (integer or PERL-compatible regular expression matching column label). Default: **Intensity**

``imputation_method``
    Impute missing values by:

    1. using median for each sample-group;
    2. using median across all samples;
    3. using mean across all samples; or
    4. using randomly generated values where:

      - ``meanPercentile`` specifies the percentile among non-missing values to be used as mean of random values, and
      - ``sdPercentile`` specifies the factor to be multiplied by the standard deviation among the non-missing values (across all samples) to determine the standard deviation of random values.

``sample_names_regex``
    PERL-compatible regular expression extracting the sample-name from the name of a column of intensities (from ``input_file``) for one sample.

      - For example, ``"\.\d+[A-Z]$"`` applied to ``Intensity.splunge.10A`` would produce ``.10A``
      - Note that *this is case sensitive* by default.

``sample_grouping_regex``
    PERL-compatible regular expression extracting the sample-grouping from the sample-name that was extracted with ``sample_names_regex`` from a column of intensities (from ``input_file``).

      - For example, ``"\d+$"`` applied to ``.10A`` would produce ``10``
      - Note that *this is case sensitive* by default.


**Outputs**

``intensities_*-imputed_QN_LT``
    Phosphopeptide MS intensities where missing values have been **imputed** by the chosen method, quantile-normalized (**QN**), and log10-transformed (**LT**), in tabular format.

``report_file``
    (download/unzip to view) Summary report for normalization, imputation, and ANOVA.
    This dataset is displayed in Galaxy as having a datatype of ``html``,
    but it is in fact a zipfile; the zip file contains
    an HTML file. Please download and unzip it locally to view the report.
    Ideally this report would be a PDF, but there is an issue
    `(linked here)
    <https://github.com/conda-forge/texlive-core-feedstock/issues/19>`_
    that needs to be resolved first.

**Authors**

``Larry C. Cheng``
    (`ORCiD 0000-0002-6922-6433 <https://orcid.org/0000-0002-6922-6433>`_) wrote the original script.

``Arthur C. Eschenlauer``
    (`ORCiD 0000-0002-2882-0508 <https://orcid.org/0000-0002-2882-0508>`_) adapted the script to run in Galaxy.

===================================
PERL-compatible regular expressions
===================================

Note that the PERL-compatible regular expressions accepted by this tool are documented at https://rdrr.io/r/base/regex.html

    ]]></help>
    <citations>
        <!-- Cheng_2018 "Phosphopeptide Enrichment ..." PMID: 30124664 -->
        <citation type="doi">10.3791/57996</citation>
    </citations>
</tool>
---
title: "Quant Data Processing Script"
author: "Larry Cheng; Art Eschenlauer"
date: "May 28, 2018; Nov 16, 2021"
output:
  html_document: default
  pdf_document: default
params:
  inputFile: "Upstream_Map_pST_outputfile_STEP4.txt"
  alphaFile: "alpha_levels.txt"
  firstDataColumn: "Intensity"
  imputationMethod: !r c("group-median","median","mean","random")[4]
  meanPercentile: 1
  sdPercentile: 0.2
  regexSampleNames: "\\.(\\d+)[A-Z]$"
  regexSampleGrouping: "(\\d+)"
  imputedDataFilename: "Upstream_Map_pST_outputfile_STEP4_QN_LT.txt"
---
```{r setup, include=FALSE}
# ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285
knitr::opts_chunk$set(echo = FALSE, fig.dim=c(9,10))
```

## Purpose:
Perform imputation of missing values, quantile normalization, and ANOVA.

<!--
## Variables to change for each input file
-->
```{r include = FALSE}
# Input filename
inputFile <- params$inputFile

# First data column - ideally, this could be detected via regexSampleNames,
# but for now leave it as is.
# The parameter is either an integer column index or a PERL-compatible regular
# expression matching a column label; the regular expression is resolved to an
# index after the input file has been read.
firstDataColumnAsInteger <- suppressWarnings(
  as.integer(params$firstDataColumn)
)
FDC_is_integer <- !is.na(firstDataColumnAsInteger)
firstDataColumn <- if (FDC_is_integer) {
  firstDataColumnAsInteger
} else {
  params$firstDataColumn
}

# False discovery rate adjustment for ANOVA
# (Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05)
valFDR <- read.table(
  file = params$alphaFile,
  sep = "\t",
  header = FALSE,
  quote = ""
)[, 1]

# Imputed Data filename
imputedDataFilename <- params$imputedDataFilename

#ANOVA data filename
```

```{r include = FALSE}
# Imputation method, should be one of c("random","group-median","median","mean")
imputationMethod <- params$imputationMethod

# Selection of percentile of logvalue data to set the mean for random number
# generation when using random imputation
meanPercentile <- params$meanPercentile / 100.0

# deviation adjustment-factor for random values; real number.
sdPercentile <- params$sdPercentile

# Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$"
regexSampleNames <- params$regexSampleNames

# Regular expression to extract Sample Grouping from Sample Name (if error
# occurs, compare sampleNumbers and tempMatches to see if groupings/pairs
# line up), e.g., "(\\d+)"
regexSampleGrouping <- params$regexSampleGrouping

```


```{r include = FALSE}
### FUNCTIONS

# ANOVA filter function: one-way ANOVA of the values in `x` against
# `groupingFactor`; returns the p-value of the grouping term.
anovaFunc <- function(x, groupingFactor) {
  x.aov <- aov(as.numeric(x) ~ groupingFactor)
  summary(x.aov)[[1]][["Pr(>F)"]][1]
}
```



### Checking that log-transformed sample distributions are similar:
```{r echo=FALSE}

library(data.table)

# read.table reads a file in table format and creates a data frame from it.
#   - note that `quote=""` means that quotation marks are treated literally.
fullData <- read.table(
  file = inputFile,
  sep = "\t",
  header = TRUE,
  quote = "",
  check.names = FALSE
)
print(colnames(fullData))
#head(fullData)

# Resolve a regular-expression firstDataColumn to the index of the first
# matching column label
if (!FDC_is_integer) {
  dataColumnIndices <- grep(firstDataColumn, names(fullData), perl = TRUE)
  str(dataColumnIndices)
  if (length(dataColumnIndices) > 0) {
    firstDataColumn <- dataColumnIndices[1]
  } else {
    stop(paste("failed to convert firstDataColumn:", firstDataColumn))
  }
}

# Columns to the left of the first intensity column carry the metadata that
# is copied through to the output tables
metadataColumns <- seq_len(firstDataColumn - 1)

quantData <- fullData[firstDataColumn:length(fullData)]
quantData[quantData == 0] <- NA # replace 0 with NA
quantDataLog <- log10(quantData)

rownames(quantDataLog) <- fullData$Phosphopeptide

summary(quantDataLog)

# data visualization
old_par <- par(
  mai = par("mai") + c(0.5, 0, 0, 0)
)
boxplot(
  quantDataLog,
  las = 2
)
par(old_par)

quantDataLog_stack <- stack(quantDataLog)
```

```{r echo = FALSE, fig.align="left", fig.dim=c(9,5)}
library(ggplot2)
ggplot(quantDataLog_stack, aes(x=values)) + geom_density(aes(group=ind, colour=ind))
```

### Globally, are phosphopeptide intensities approximately unimodal?
```{r echo = FALSE,fig.align="left", fig.dim=c(9,5)}

# ref for bquote particularly and plotting math expressions generally:
#   https://www.r-bloggers.com/2018/03/math-notation-for-r-plot-titles-expression-and-bquote/

# identify the location of missing values
fin <- is.finite(as.numeric(as.matrix(quantDataLog)))

logvalues <- as.numeric(as.matrix(quantDataLog))[fin]
plot(
  density(logvalues),
  main = bquote("Smoothed estimated probability density vs." ~ log[10](intensity)),
  xlab = bquote(log[10](intensity))
)
hist(
  x = as.numeric(as.matrix(quantDataLog)),
  breaks = 100,
  main = bquote("Frequency vs." ~ log[10](intensity)),
  xlab = bquote(log[10](intensity))
)
```

<!--
## Impute missing values
-->

### Distribution of standard deviations of phosphopeptides, ignoring missing values:

```{r echo = FALSE, fig.align="left", fig.dim=c(9,5)}
# determine quantile
q1 <- quantile(logvalues, probs = meanPercentile)[1]

# determine standard deviation of quantile to impute
sd_finite <- function(x) {
  ok <- is.finite(x)
  sd(x[ok]) * sdPercentile
}
sds <- apply(quantDataLog, 1, sd_finite) # 1 = row of matrix (ie, phosphopeptide)
plot(
  density(sds, na.rm = TRUE),
  main = "Smoothed estimated probability density vs. std. deviation",
  sub = "(probability estimation made with Gaussian smoothing)"
)

m1 <- median(sds, na.rm = TRUE) # sd to be used is the median sd

```



<!--
The number of missing values are:
-->
```{r echo=FALSE}
# Determine number of cells to impute
temp <- quantData[is.na(quantData)]

# Determine number of values to impute
NoToImpute <- length(temp)
```

<!--
% of values that are missing:
-->
```{r echo=FALSE}
pct_missing_values <- length(temp)/(length(logvalues)+length(temp)) * 100
```

<!--
First few rows of data before imputation:
-->
## Impute missing values
```{r echo = FALSE}

#ACE start segment: trt-median based imputation
# prep for trt-median based imputation

# Assuming that regexSampleNames <- "\\.(\\d+)[A-Z]$"
# get factors -> group runs (samples) by ignoring terminal [A-Z] in sample names
# regexpr(pattern, text, ignore.case = FALSE, perl = FALSE, fixed = FALSE, useBytes = FALSE)
m <- regexpr(regexSampleNames, names(quantData), perl = TRUE)
tempMatches <- regmatches(names(quantData), m)
print("Extracted sample names")
print(tempMatches)
m2 <- regexpr(regexSampleGrouping, tempMatches, perl = TRUE)
sampleNumbers <- as.factor(regmatches(tempMatches, m2))
print("Factor levels")
print(sampleNumbers)

```
```{r echo = FALSE}

#ACE hack begin
# Determine number of cells to impute
cat(
  sprintf(
    "Before imputation, there are:\n %d peptides\n %d missing values (%2.0f%s)",
    nrow(quantData),
    sum(is.na(quantData)),
    pct_missing_values,
    "%"
  )
)
#ACE hack end

```
```{r echo = FALSE}

# Impute data
quantDataImputed <- quantData

# Identify which values are missing and need to be imputed
ind <- which(is.na(quantDataImputed), arr.ind = TRUE)

```
```{r echo = FALSE}

# Apply imputation
switch(
  imputationMethod,
  "group-median" = {
    cat("Imputation method: substitute missing value with median peptide-intensity for sample-group\n")
    # regmatches() drops columns that do not match, which would misalign the
    # group factor with the intensity columns
    if (length(sampleNumbers) != ncol(quantDataImputed)) {
      stop(
        "group-median imputation requires that every intensity column match ",
        "regexSampleNames and regexSampleGrouping"
      )
    }
    sampleLevelIntegers <- as.integer(sampleNumbers)
    for (i in seq_along(levels(sampleNumbers))) {
      levelCols <- i == sampleLevelIntegers
      # Work on this group's columns alone so that the (row, col) pairs
      # reported by which() refer to the object being modified;
      # drop = FALSE keeps a one-sample group as a data frame.
      groupData <- quantDataImputed[, levelCols, drop = FALSE]
      groupMissing <- which(is.na(groupData), arr.ind = TRUE)
      if (nrow(groupMissing) > 0) {
        groupMedians <- apply(groupData, 1, median, na.rm = TRUE)
        # only the missing cells are replaced; a row having no observed value
        # in the group keeps NA and is discarded below via goodRows
        groupData[groupMissing] <- groupMedians[groupMissing[, 1]]
        quantDataImputed[, levelCols] <- groupData
      }
    }
    goodRows <- !is.na(rowMeans(quantDataImputed))
  },
  "median" = {
    cat("Imputation method: substitute missing value with median peptide-intensity across all sample classes\n")
    quantDataImputed[ind] <-
      apply(quantDataImputed, 1, median, na.rm = TRUE)[ind[, 1]]
    goodRows <- !is.na(rowMeans(quantDataImputed))
  },
  "mean" = {
    cat("Imputation method: substitute missing value with mean peptide-intensity across all sample classes\n")
    quantDataImputed[ind] <-
      apply(quantDataImputed, 1, mean, na.rm = TRUE)[ind[, 1]]
    goodRows <- !is.na(rowMeans(quantDataImputed))
  },
  "random" = {
    cat(
      sprintf(
        "Imputation method: substitute missing value with random intensity N ~ (%0.2f, %0.2f)\n",
        q1,
        m1
      )
    )
    quantDataImputed[is.na(quantDataImputed)] <-
      10^rnorm(NoToImpute, mean = q1, sd = m1)
    goodRows <- !is.na(rowMeans(quantDataImputed))
  },
  # without this branch an unrecognized method would leave goodRows undefined
  stop(paste("unrecognized imputationMethod:", imputationMethod))
)

```
```{r echo = FALSE}

# Report what imputation achieved
cat(
  sprintf(
    "After imputation, there are:\n %d missing values\n %d usable peptides\n %d peptides with too many missing values for further analysis",
    sum(is.na(quantDataImputed[goodRows, ])),
    sum(goodRows),
    sum(!goodRows)
  )
)
```
```{r echo = FALSE}


# Zap rows where imputation was ineffective
fullData         <- fullData        [goodRows, ]
quantData        <- quantData       [goodRows, ]
quantDataImputed <- quantDataImputed[goodRows, ]

```
```{r echo = FALSE}

d_combined <- density(as.numeric(as.matrix(log10(quantDataImputed))))
d_original <- density(
  as.numeric(as.matrix(log10(quantDataImputed[!is.na(quantData)])))
)

```
```{r echo = FALSE}

if (sum(is.na(quantData)) > 0) {
  # There ARE missing values
  d_imputed <- density(
    as.numeric(as.matrix(log10(quantDataImputed[is.na(quantData)])))
  )
} else {
  # There are NO missing values
  d_imputed <- d_combined
}

```

<!-- ```{r echo = FALSE, fig.cap = "Blue = Data before imputation; Red = Imputed data"} -->
```{r echo = FALSE, fig.dim=c(9,5)}
ylim <- c(0, max(d_combined$y, d_original$y, d_imputed$y))
plot(
  d_combined,
  ylim = ylim,
  sub = "Blue = data before imputation; Red = imputed data",
  main = "Density vs. log10(intensity) before and after imputation"
)
lines(d_original, col = "blue")
lines(d_imputed, col = "red")
```

## Perform Quantile Normalization
```{r echo=FALSE}
library(preprocessCore)
# Apply quantile normalization using preprocessCore::normalize.quantiles
# ---
# tool repository: http://bioconductor.org/packages/release/bioc/html/preprocessCore.html
#   except this: https://support.bioconductor.org/p/122925/#9135989
#   says to install it like this:
#     ```
#     BiocManager::install("preprocessCore", configure.args="--disable-threading", force = TRUE,lib=.libPaths()[1])
#     ```
# conda installation (necessary because of a bug in recent openblas):
#   conda install bioconductor-preprocesscore openblas=0.3.3
# ...
# ---
# normalize.quantiles {preprocessCore} -- Quantile Normalization
#
# Description:
#   Using a normalization based upon quantiles, this function normalizes a matrix of probe level intensities.
#
# Usage:
#   normalize.quantiles(x,copy=TRUE, keep.names=FALSE)
#
# Arguments:
#
#   - x: A matrix of intensities where each column corresponds to a chip and each row is a probe.
#
#   - copy: Make a copy of matrix before normalizing. Usually safer to work with a copy,
#           but in certain situations not making a copy of the matrix, but instead normalizing
#           it in place will be more memory friendly.
#
#   - keep.names: Boolean option to preserve matrix row and column names in output.
#
# Details:
#   This method is based upon the concept of a quantile-quantile plot extended to n dimensions.
#   No special allowances are made for outliers. If you make use of quantile normalization
#   please cite Bolstad et al, Bioinformatics (2003).
#
#   This functions will handle missing data (ie NA values), based on
#   the assumption that the data is missing at random.
#
#   Note that the current implementation optimizes for better memory usage
#   at the cost of some additional run-time.
#
# Value: A normalized matrix.
#
# Author: Ben Bolstad, bmbolstad.com
#
# References
#
#   - Bolstad, B (2001) Probe Level Quantile Normalization of High Density Oligonucleotide
#       Array Data. Unpublished manuscript http://bmbolstad.com/stuff/qnorm.pdf
#
#   - Bolstad, B. M., Irizarry R. A., Astrand, M, and Speed, T. P. (2003) A Comparison of
#       Normalization Methods for High Density Oligonucleotide Array Data Based on Bias
#       and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185
#       http://bmbolstad.com/misc/normalize/normalize.html
# ...

quantDataImputed.qn <- normalize.quantiles(as.matrix(quantDataImputed))

# normalize.quantiles returns a bare matrix; restore the column labels
quantDataImputed.qn <- as.data.frame(quantDataImputed.qn)
names(quantDataImputed.qn) <- names(quantDataImputed)
quantDataImputed_QN_log <- log10(quantDataImputed.qn)

rownames(quantDataImputed_QN_log) <- fullData[, 1]

# output quantile normalized data
dataTableImputed_QN_LT <- cbind(
  fullData[metadataColumns],
  quantDataImputed_QN_log
)
# NOTE(review): this writes an additional file whose name is derived from
# imputedDataFilename, beside that file; nothing in this document reads it
# back - confirm whether any downstream consumer still needs it.
write.table(dataTableImputed_QN_LT, file = paste(paste(strsplit(imputedDataFilename, ".txt"),"QN_LT",sep="_"),".txt",sep=""), sep = "\t", col.names=TRUE, row.names=FALSE)

```

<!-- ACE insertion begin -->
### Checking that normalized, imputed, log-transformed sample distributions are similar:

```{r echo=FALSE}
#library(data.table)

# Save unimputed quantDataLog for plotting below
unimputedQuantDataLog <- quantDataLog

# Log10 transform (after preparing for zero values, which should never happen...)
quantDataImputed.qn[quantDataImputed.qn == 0] <- .000000001
quantDataLog <- log10(quantDataImputed.qn)

summary(quantDataLog)

# Output quantile-normalized log-transformed dataset with imputed, normalized data

dataTableImputed <- cbind(fullData[metadataColumns], quantDataLog)
write.table(
  dataTableImputed,
  file = imputedDataFilename,
  sep = "\t",
  col.names = TRUE,
  row.names = FALSE,
  quote = FALSE
)



# data visualization
old_par <- par(
  mai = par("mai") + c(0.5, 0, 0, 0),
  oma = par("oma") + c(0.5, 0, 0, 0)
)
boxplot(
  quantDataLog,
  las = 2
)
par(old_par)
```

```{r echo=FALSE, fig.dim=c(9,5)}
quantDataLog_stack <- stack(quantDataLog)
ggplot(quantDataLog_stack, aes(x=values)) + geom_density(aes(group=ind, colour=ind))
```

## Perform ANOVA filters

```{r,echo=FALSE}
# Make new data frame containing only Phosphopeptides to connect preANOVA to ANOVA (connect_df)
# (column firstDataColumn of the combined table is the first intensity column)
connect_df <- data.frame(
  dataTableImputed_QN_LT$Phosphopeptide,
  dataTableImputed_QN_LT[, firstDataColumn]
)
colnames(connect_df) <- c("Phosphopeptide", "Intensity")
```

```{r echo=FALSE, fig.dim=c(9,10)}
# Get factors -> group replicates (as indicated by terminal letter) by the preceding digits
#   For example, group .1A .1B .1C into group 1; .2A .2B .2C, into group 2; etc..
m <- regexpr(regexSampleNames, names(quantDataImputed_QN_log), perl = TRUE)
tempMatches <- regmatches(names(quantDataImputed_QN_log), m)
numSamples <- length(tempMatches)
m2 <- regexpr(regexSampleGrouping, tempMatches, perl = TRUE)
sampleNumbers <- as.factor(regmatches(tempMatches, m2))

if (length(levels(sampleNumbers)) < 2) {
  cat("ERROR!!!! Cannot perform ANOVA analysis because it requires two or more factor levels\n")
  cat("Unparsed sample names are:\n")
  print(names(quantDataImputed_QN_log))
  cat(sprintf("Parsing rule for SampleNames is '%s'\n", regexSampleNames))
  cat("Parsed names are:\n")
  print(tempMatches)
  cat(sprintf("Parsing rule for SampleGrouping is '%s'\n", regexSampleGrouping))
  cat("Sample group assignments are:\n")
  print(regmatches(tempMatches, m2))
} else {
  pValueData.anovaPs <- apply(
    quantDataImputed_QN_log,
    1,
    anovaFunc,
    groupingFactor = sampleNumbers
  )

  pValueData.anovaPs.FDR <- p.adjust(pValueData.anovaPs, method = "fdr")
  pValueData <- data.frame(
    phosphopeptide = fullData[, 1],
    rawANOVAp = pValueData.anovaPs,
    FDRadjustedANOVAp = pValueData.anovaPs.FDR
  )
  #ACE rownames(pValueData) <- fullData[,1]
  # output ANOVA file to constructed filename,
  #   e.g.    "Outputfile_pST_ANOVA_STEP5.txt"
  #   becomes "Outpufile_pST_ANOVA_STEP5_FDR0.05.txt"

  # Re-output quantile-normalized log-transformed dataset with imputed,
  # normalized data to include p-values

  dataTableImputed <- cbind(
    fullData[metadataColumns],
    pValueData[, 2:3],
    quantDataLog
  )
  write.table(
    dataTableImputed,
    file = imputedDataFilename,
    sep = "\t",
    col.names = TRUE,
    row.names = FALSE,
    quote = FALSE
  )


  pValueData <- pValueData[order(pValueData$FDRadjustedANOVAp), ]

  for (cutoff in valFDR) { # loop through FDR cutoffs

    filtered_p <- pValueData[
      which(pValueData$FDRadjustedANOVAp < cutoff), , drop = FALSE
    ]
    filteredData.filtered <- quantDataImputed_QN_log[
      rownames(filtered_p), , drop = FALSE
    ]
    filteredData.filtered <- filteredData.filtered[
      order(filtered_p$FDRadjustedANOVAp), , drop = FALSE
    ]

    # <!-- ACE insertion start -->
    old_oma <- par("oma")
    old_par <- par(
      mai = (par("mai") + c(0.7, 0, 0, 0)) * c(1, 1, 0.3, 1),
      oma = old_oma * c(1, 1, 0.3, 1),
      cex.main = 0.9,
      cex.axis = 0.7
    )

    if (nrow(filteredData.filtered) > 0) {
      boxplot(
        filteredData.filtered,
        main = sprintf("Imputed, normalized intensities where adjusted p-value < %0.2f", cutoff),
        # no line plot , main = ""
        las = 2,
        # ylim = c(5.5,10),
        ylab = expression(log[10](intensity))
      )
    } else {
      cat(sprintf("No peptides were found to have cutoff adjusted p-value < %0.2f\n", cutoff))
    }
    par(old_par)

    # Add Phosphopeptide column to ANOVA filtered table
    ANOVA.filtered_merge <- merge(
      x = connect_df,
      y = filteredData.filtered,
      by.x = "Intensity",
      by.y = 1
    )
    ANOVA.filtered_merge.order <- rownames(filtered_p)

    # One sprintf format per peptide: fixed notation with enough decimals to
    # show the adjusted p-value, or scientific notation for very small values
    ANOVA.filtered_merge.format <- vapply(
      X = filtered_p$FDRadjustedANOVAp,
      FUN = function(x) {
        if (x > 0.0001) {
          paste0("(%0.", 1 + ceiling(-log10(x)), "f) %s")
        } else {
          paste0("(%0.4e) %s")
        }
      },
      FUN.VALUE = character(1)
    )

    # The merged table holds the key column (Intensity), Phosphopeptide, and
    # then the remaining numSamples - 1 intensity columns, i.e., columns
    # 3 through numSamples + 1.
    ANOVA.filtered <- data.table(
      ANOVA.filtered_merge$Phosphopeptide,
      ANOVA.filtered_merge$Intensity,
      ANOVA.filtered_merge[, 3:(numSamples + 1)]
    )
    colnames(ANOVA.filtered) <- c("Phosphopeptide", colnames(filteredData.filtered))

    # merge qualitative columns into the ANOVA data
    output_table <- data.frame(ANOVA.filtered$Phosphopeptide)
    output_table <- merge(
      x = output_table,
      y = dataTableImputed_QN_LT,
      by.x = "ANOVA.filtered.Phosphopeptide",
      by.y = "Phosphopeptide"
    )

    # Produce heatmap to visualize significance and the effect of imputation
    m <- as.matrix(unimputedQuantDataLog[ANOVA.filtered_merge.order, ])
    if (nrow(m) > 0) {
      rownames_m <- rownames(m)
      # label each row as "(adjusted p-value) phosphopeptide"
      rownames(m) <- vapply(
        X = seq_len(nrow(m)),
        FUN = function(i) {
          sprintf(
            ANOVA.filtered_merge.format[i],
            filtered_p$FDRadjustedANOVAp[i],
            rownames_m[i]
          )
        },
        FUN.VALUE = character(1)
      )
      margins <- c(
        max(nchar(colnames(m))) * 10 / 16, # col
        max(nchar(rownames(m))) * 5 / 16   # row
      )
      how_many_peptides <- min(50, nrow(m))

      # par() returns the previous setting as a list that par() can restore
      op <- par(cex.main = 0.6)
      try(
        if (nrow(m) > 1) {
          heatmap(
            m[how_many_peptides:1, ],
            Rowv = NA,
            Colv = NA,
            cexRow = 0.7,
            cexCol = 0.8,
            scale = "row",
            margins = margins,
            main = "Heatmap of unimputed, unnormalized intensities",
            xlab = ""
            # , main = bquote(
            #     .( how_many_peptides )
            #     ~ " peptides with adjusted p-value <"
            #     ~ .(sprintf("%0.2f", cutoff))
            #   )
          )
        }
      )
      #ACE fig_dim knitr::opts_chunk$set(fig.dim = fig_dim)
      par(op)
    }

  }
}
```

## Peptide IDs, etc.

See output files.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mqppep_mrgfltr.py Mon Mar 07 19:05:01 2022 +0000 @@ -0,0 +1,1337 @@ +#!/usr/bin/env python + +# Import the packages needed +import argparse +import os.path +import sys + +import pandas +import re +import time +import sqlite3 as sql +from codecs import getreader as cx_getreader +import sys +import numpy as np + +# for sorting list of lists using operator.itemgetter +import operator + +# for formatting stack-trace +import traceback + +# for Aho-Corasick search for fixed set of substrings +import ahocorasick +import operator +import hashlib + +# for shutil.copyfile(src, dest) +import shutil + +# global constants +N_A = 'N/A' + +# ref: https://stackoverflow.com/a/8915613/15509512 +# answers: "How to handle exceptions in a list comprehensions" +# usage: +# from math import log +# eggs = [1,3,0,3,2] +# print([x for x in [catch(log, egg) for egg in eggs] if x is not None]) +# producing: +# for <built-in function log> +# with args (0,) +# exception: math domain error +# [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453] +def catch(func, *args, handle=lambda e : e, **kwargs): + try: + return func(*args, **kwargs) + except Exception as e: + print("For %s" % str(func)) + print(" with args %s" % str(args)) + print(" caught exception: %s" % str(e)) + (ty, va, tb) = sys.exc_info() + print(" stack trace: " + str(traceback.format_exception(ty, va, tb))) + exit(-1) + return None # was handle(e) + +def ppep_join(x): + x = [i for i in x if N_A != i] + result = "%s" % ' | '.join(x) + if result != "": + return result + else: + return N_A + +def melt_join(x): + tmp = {key.lower(): key for key in x} + result = "%s" % ' | '.join([tmp[key] for key in tmp]) + return result + +def __main__(): + # Parse Command Line + parser = argparse.ArgumentParser( + description='Phopsphoproteomic Enrichment Pipeline Merge and Filter.' 
+ ) + + # inputs: + # Phosphopeptide data for experimental results, including the intensities + # and the mapping to kinase domains, in tabular format. + parser.add_argument( + '--phosphopeptides', '-p', + nargs=1, + required=True, + dest='phosphopeptides', + help='Phosphopeptide data for experimental results, including the intensities and the mapping to kinase domains, in tabular format' + ) + # UniProtKB/SwissProt DB input, SQLite + parser.add_argument( + '--ppep_mapping_db', '-d', + nargs=1, + required=True, + dest='ppep_mapping_db', + help='UniProtKB/SwissProt SQLite Database' + ) + #ACE # PhosPhositesPlus DB input, csv + #ACE parser.add_argument( + #ACE '--psp_regulatory_sites', '-s', + #ACE nargs=1, + #ACE required=True, + #ACE dest='psp_regulatory_sites_csv', + #ACE help='PhosphoSitesPlus Regulatory Sites, in CSV format including three-line header' + #ACE ) + # species to limit records chosed from PhosPhositesPlus + parser.add_argument( + '--species', '-x', + nargs=1, + required=False, + default=[], + dest='species', + help='limit PhosphoSitePlus records to indicated species (field may be empty)' + ) + + # outputs: + # tabular output + parser.add_argument( + '--mrgfltr_tab', '-o', + nargs=1, + required=True, + dest='mrgfltr_tab', + help='Tabular output file for results' + ) + # CSV output + parser.add_argument( + '--mrgfltr_csv', '-c', + nargs=1, + required=True, + dest='mrgfltr_csv', + help='CSV output file for results' + ) + # SQLite output + parser.add_argument( + '--mrgfltr_sqlite', '-S', + nargs=1, + required=True, + dest='mrgfltr_sqlite', + help='SQLite output file for results' + ) + + # "Make it so!" 
(parse the arguments) + options = parser.parse_args() + print("options: " + str(options)) + + # determine phosphopeptide ("upstream map") input tabular file access + if options.phosphopeptides is None: + exit('Argument "phosphopeptides" is required but not supplied') + try: + upstream_map_filename_tab = os.path.abspath(options.phosphopeptides[0]) + input_file = open(upstream_map_filename_tab, 'r') + input_file.close() + except Exception as e: + exit('Error parsing phosphopeptides argument: %s' % str(e)) + + # determine input SQLite access + if options.ppep_mapping_db is None: + exit('Argument "ppep_mapping_db" is required but not supplied') + try: + uniprot_sqlite = os.path.abspath(options.ppep_mapping_db[0]) + input_file = open(uniprot_sqlite, 'rb') + input_file.close() + except Exception as e: + exit('Error parsing ppep_mapping_db argument: %s' % str(e)) + + # copy input SQLite dataset to output SQLite dataset + if options.mrgfltr_sqlite is None: + exit('Argument "mrgfltr_sqlite" is required but not supplied') + try: + output_sqlite = os.path.abspath(options.mrgfltr_sqlite[0]) + shutil.copyfile(uniprot_sqlite, output_sqlite) + except Exception as e: + exit('Error copying ppep_mapping_db to mrgfltr_sqlite: %s' % str(e)) + + #ACE # determine psp_regulatory_sites CSV access + #ACE if options.psp_regulatory_sites_csv is None: + #ACE exit('Argument "psp_regulatory_sites_csv" is required but not supplied') + #ACE #ACE print('options.psp_regulatory_sites_csv: ' + options.psp_regulatory_sites_csv) + #ACE try: + #ACE phosphosite_filename_csv = os.path.abspath(options.psp_regulatory_sites_csv[0]) + #ACE input_file = open(phosphosite_filename_csv, 'r') + #ACE input_file.close() + #ACE except Exception as e: + #ACE exit('Error parsing psp_regulatory_sites_csv argument: %s' % str(e)) + #ACE print('phosphosite_filename_csv: ' + phosphosite_filename_csv) + + # determine species to limit records from PSP_Regulatory_Sites + if options.species is None: + exit('Argument "species" 
is required (and may be empty) but not supplied') + try: + if len(options.species) > 0: + species = options.species[0] + else: + species = '' + except Exception as e: + exit('Error parsing species argument: %s' % str(e)) + + # determine tabular output destination + if options.mrgfltr_tab is None: + exit('Argument "mrgfltr_tab" is required but not supplied') + try: + output_filename_tab = os.path.abspath(options.mrgfltr_tab[0]) + output_file = open(output_filename_tab, 'w') + output_file.close() + except Exception as e: + exit('Error parsing mrgfltr_tab argument: %s' % str(e)) + + # determine CSV output destination + if options.mrgfltr_csv is None: + exit('Argument "mrgfltr_csv" is required but not supplied') + try: + output_filename_csv = os.path.abspath(options.mrgfltr_csv[0]) + output_file = open(output_filename_csv, 'w') + output_file.close() + except Exception as e: + exit('Error parsing mrgfltr_csv argument: %s' % str(e)) + + + def mqpep_getswissprot(): + + ############################################### + # copied from Excel Output Script.ipynb BEGIN # + ############################################### + + ########### String Constants ################# + DEPHOSPHOPEP = 'DephosphoPep' + DESCRIPTION = 'Description' + FUNCTION_PHOSPHORESIDUE = 'Function Phosphoresidue(PSP=PhosphoSitePlus.org)' + GENE_NAME = 'Gene_Name' # Gene Name from UniProtKB + ON_FUNCTION = 'ON_FUNCTION' # ON_FUNCTION column from PSP_Regulatory_Sites + ON_NOTES = 'NOTES' # NOTES column from PSP_Regulatory_Sites + ON_OTHER_INTERACT = 'ON_OTHER_INTERACT' # ON_OTHER_INTERACT column from PSP_Regulatory_Sites + ON_PROCESS = 'ON_PROCESS' # ON_PROCESS column from PSP_Regulatory_Sites + ON_PROT_INTERACT = 'ON_PROT_INTERACT' # ON_PROT_INTERACT column from PSP_Regulatory_Sites + PHOSPHOPEPTIDE = 'Phosphopeptide' + PHOSPHOPEPTIDE_MATCH = 'Phosphopeptide_match' + PHOSPHORESIDUE = 'Phosphoresidue' + PUTATIVE_UPSTREAM_DOMAINS = 'Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/Phosphatases/Binding 
Domains' + SEQUENCE = 'Sequence' + SEQUENCE10 = 'Sequence10' + SEQUENCE7 = 'Sequence7' + SITE_PLUSMINUS_7AA = 'SITE_+/-7_AA' + SITE_PLUSMINUS_7AA_SQL = 'SITE_PLUSMINUS_7AA' + UNIPROT_ID = 'UniProt_ID' + UNIPROT_SEQ_AND_META_SQL = ''' + select Uniprot_ID, Description, Gene_Name, Sequence, + Organism_Name, Organism_ID, PE, SV + from UniProtKB + order by Sequence, UniProt_ID + ''' + UNIPROT_UNIQUE_SEQ_SQL = ''' + select distinct Sequence + from UniProtKB + group by Sequence + ''' + PPEP_PEP_UNIPROTSEQ_SQL = ''' + select distinct phosphopeptide, peptide, sequence + from uniprotkb_pep_ppep_view + order by sequence + ''' + PPEP_MELT_SQL = ''' + SELECT DISTINCT + phospho_peptide AS 'p_peptide', + kinase_map AS 'characterization', + 'X' AS 'X' + FROM ppep_gene_site_view + ''' + # CREATE TABLE PSP_Regulatory_site ( + # site_plusminus_7AA TEXT PRIMARY KEY ON CONFLICT IGNORE, + # domain TEXT, + # ON_FUNCTION TEXT, + # ON_PROCESS TEXT, + # ON_PROT_INTERACT TEXT, + # ON_OTHER_INTERACT TEXT, + # notes TEXT, + # organism TEXT + # ); + PSP_REGSITE_SQL = ''' + SELECT DISTINCT + SITE_PLUSMINUS_7AA , + DOMAIN , + ON_FUNCTION , + ON_PROCESS , + ON_PROT_INTERACT , + ON_OTHER_INTERACT , + NOTES , + ORGANISM + FROM PSP_Regulatory_site + ''' + PPEP_ID_SQL =''' + SELECT + id AS 'ppep_id', + seq AS 'ppep_seq' + FROM ppep + ''' + MRGFLTR_DDL =''' + DROP VIEW IF EXISTS mrgfltr_metadata_view; + DROP TABLE IF EXISTS mrgfltr_metadata; + CREATE TABLE mrgfltr_metadata + ( ppep_id INTEGER REFERENCES ppep(id) + , Sequence10 TEXT + , Sequence7 TEXT + , GeneName TEXT + , Phosphoresidue TEXT + , UniProtID TEXT + , Description TEXT + , FunctionPhosphoresidue TEXT + , PutativeUpstreamDomains TEXT + , PRIMARY KEY (ppep_id) ON CONFLICT IGNORE + ) + ; + CREATE VIEW mrgfltr_metadata_view AS + SELECT DISTINCT + ppep.seq AS phospho_peptide + , Sequence10 + , Sequence7 + , GeneName + , Phosphoresidue + , UniProtID + , Description + , FunctionPhosphoresidue + , PutativeUpstreamDomains + FROM + ppep, 
mrgfltr_metadata + WHERE + mrgfltr_metadata.ppep_id = ppep.id + ORDER BY + ppep.seq + ; + ''' + + CITATION_INSERT_STMT = ''' + INSERT INTO Citation ( + ObjectName, + CitationData + ) VALUES (?,?) + ''' + CITATION_INSERT_PSP = 'PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words "PhosphoSitePlus(R), www.phosphosite.org" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926."' + CITATION_INSERT_PSP_REF = 'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298, https://doi.org/10.1093/nar/gkr1122' + + MRGFLTR_METADATA_COLUMNS = [ + 'ppep_id', + 'Sequence10', + 'Sequence7', + 'GeneName', + 'Phosphoresidue', + 'UniProtID', + 'Description', + 'FunctionPhosphoresidue', + 'PutativeUpstreamDomains' + ] + + ########### String Constants (end) ############ + + class Error(Exception): + """Base class for exceptions in this module.""" + pass + + class PreconditionError(Error): + """Exception raised for errors in the input. 
+ + Attributes: + expression -- input expression in which the error occurred + message -- explanation of the error + """ + + def __init__(self, expression, message): + self.expression = expression + self.message = message + + #start_time = time.clock() #timer + start_time = time.process_time() #timer + + #get keys from upstream tabular file using readline() + # ref: https://stackoverflow.com/a/16713581/15509512 + # answer to "Use codecs to read file with correct encoding" + file1_encoded = open(upstream_map_filename_tab, 'rb') + file1 = cx_getreader("latin-1")(file1_encoded) + + count = 0 + upstream_map_p_peptide_list = [] + re_tab = re.compile('^[^\t]*') + while True: + count += 1 + # Get next line from file + line = file1.readline() + # if line is empty + # end of file is reached + if not line: + break + if count > 1: + m = re_tab.match(line) + upstream_map_p_peptide_list.append(m[0]) + file1.close() + file1_encoded.close() + + # Get the list of phosphopeptides with the p's that represent the phosphorylation sites removed + re_phos = re.compile('p') + dephospho_peptide_list = [ re_phos.sub('',foo) for foo in upstream_map_p_peptide_list ] + + end_time = time.process_time() #timer + print("%0.6f pre-read-SwissProt [0.1]" % (end_time - start_time,), file=sys.stderr) + + ## ----------- Get SwissProt data from SQLite database (start) ----------- + # build UniProt sequence LUT and list of unique SwissProt sequences + + # Open SwissProt SQLite database + conn = sql.connect(uniprot_sqlite) + cur = conn.cursor() + + # Set up structures to hold SwissProt data + + uniprot_Sequence_List = [] + UniProtSeqLUT = {} + + # Execute query for unique seqs without fetching the results yet + uniprot_unique_seq_cur = cur.execute(UNIPROT_UNIQUE_SEQ_SQL) + + while batch := uniprot_unique_seq_cur.fetchmany(size=50): + if None == batch: + # handle case where no records are returned + break + for row in batch: + Sequence = row[0] + UniProtSeqLUT[(Sequence,DESCRIPTION)] = [] + 
UniProtSeqLUT[(Sequence,GENE_NAME) ] = [] + UniProtSeqLUT[(Sequence,UNIPROT_ID) ] = [] + UniProtSeqLUT[ Sequence ] = [] + + # Execute query for seqs and metadata without fetching the results yet + uniprot_seq_and_meta = cur.execute(UNIPROT_SEQ_AND_META_SQL) + + while batch := uniprot_seq_and_meta.fetchmany(size=50): + if None == batch: + # handle case where no records are returned + break + for UniProt_ID, Description, Gene_Name, Sequence, OS, OX, PE, SV in batch: + uniprot_Sequence_List.append(Sequence) + UniProtSeqLUT[Sequence] = Sequence + UniProtSeqLUT[(Sequence,UNIPROT_ID) ].append(UniProt_ID) + UniProtSeqLUT[(Sequence,GENE_NAME) ].append(Gene_Name) + if OS != N_A: + Description += ' OS=' + OS + if OX != N_A: + Description += ' OX=' + str(int(OX)) + if Gene_Name != N_A: + Description += ' GN=' + Gene_Name + if PE != N_A: + Description += ' PE=' + PE + if SV != N_A: + Description += ' SV=' + SV + UniProtSeqLUT[(Sequence,DESCRIPTION)].append(Description) + + # Close SwissProt SQLite database; clean up local variables + conn.close() + Sequence = '' + UniProt_ID = '' + Description = '' + Gene_Name = '' + + ## ----------- Get SwissProt data from SQLite database (finish) ----------- + + end_time = time.process_time() #timer + print("%0.6f post-read-SwissProt [0.2]" % (end_time - start_time,), file=sys.stderr) + + ## ----------- Get SwissProt data from SQLite database (start) ----------- + # build PhosphoPep_UniProtSeq_LUT and PhosphoPep_UniProtSeq_LUT + #ACE_temp pepSeqList = list( zip(pepList, dephosphPepList, [seq]*len(pepList)) ) + + # Open SwissProt SQLite database + conn = sql.connect(uniprot_sqlite) + cur = conn.cursor() + + # Set up dictionary to aggregate results for phosphopeptides correspounding to dephosphoeptide + DephosphoPep_UniProtSeq_LUT = {} + + # Set up dictionary to accumulate results + PhosphoPep_UniProtSeq_LUT = {} + + # Execute query for tuples without fetching the results yet + ppep_pep_uniprotseq_cur = cur.execute(PPEP_PEP_UNIPROTSEQ_SQL) + + 
while batch := ppep_pep_uniprotseq_cur.fetchmany(size=50): + if None == batch: + # handle case where no records are returned + break + for (phospho_pep, dephospho_pep, sequence) in batch: + #do interesting stuff here... + PhosphoPep_UniProtSeq_LUT[phospho_pep] = phospho_pep + PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = dephospho_pep + if dephospho_pep not in DephosphoPep_UniProtSeq_LUT: + DephosphoPep_UniProtSeq_LUT[dephospho_pep] = set() + DephosphoPep_UniProtSeq_LUT[(dephospho_pep,DESCRIPTION)] = [] + DephosphoPep_UniProtSeq_LUT[(dephospho_pep,GENE_NAME)] = [] + DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)] = [] + DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)] = [] + DephosphoPep_UniProtSeq_LUT[dephospho_pep].add(phospho_pep) + + #ACE print("ppep:'%s' dephospho_pep:'%s' sequence:'%s'" % (phospho_pep, dephospho_pep, sequence)) + if sequence not in DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)]: + DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)].append(sequence) + for phospho_pep in DephosphoPep_UniProtSeq_LUT[dephospho_pep]: + if phospho_pep != phospho_pep: + print("phospho_pep:'%s' phospho_pep:'%s'" % (phospho_pep, phospho_pep)) + if phospho_pep not in PhosphoPep_UniProtSeq_LUT: + PhosphoPep_UniProtSeq_LUT[phospho_pep] = phospho_pep + PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = dephospho_pep + r = list(zip( + [s for s in UniProtSeqLUT[(sequence,UNIPROT_ID)]], + [s for s in UniProtSeqLUT[(sequence,GENE_NAME)]], + [s for s in UniProtSeqLUT[(sequence,DESCRIPTION)]] + )) + # Sort by `UniProt_ID` + # ref: https://stackoverflow.com/a/4174955/15509512 + r = sorted(r, key=operator.itemgetter(0)) + # Get one tuple for each `phospho_pep` + # in DephosphoPep_UniProtSeq_LUT[dephospho_pep] + for (upid, gn, desc) in r: + # Append pseudo-tuple per UniProt_ID but only when it is not present + if upid not in DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)]: + 
DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)].append(upid) + DephosphoPep_UniProtSeq_LUT[(dephospho_pep,DESCRIPTION)].append(desc) + DephosphoPep_UniProtSeq_LUT[(dephospho_pep,GENE_NAME)].append(gn) + + # Close SwissProt SQLite database; clean up local variables + conn.close() + # wipe local variables + phospho_pep = dephospho_pep = sequence = 0 + upid = gn = desc = r = '' + + ## ----------- Get SwissProt data from SQLite database (finish) ----------- + + end_time = time.process_time() #timer + print("%0.6f finished reading and decoding '%s' [0.4]" % (end_time - start_time,upstream_map_filename_tab), file=sys.stderr) + + print('{:>10} unique upstream phosphopeptides tested'.format(str(len(upstream_map_p_peptide_list)))) + + #Read in Upstream tabular file + # We are discarding the intensity data; so read it as text + upstream_data = pandas.read_table( + upstream_map_filename_tab, + dtype='str', + index_col = 0 + ) + + end_time = time.process_time() #timer + print("%0.6f read Upstream Map from file [1g_1]" % (end_time - start_time,), file=sys.stderr) #timer + + upstream_data.index = upstream_map_p_peptide_list + + + end_time = time.process_time() #timer + print("%0.6f added index to Upstream Map [1g_2]" % (end_time - start_time,), file=sys.stderr) #timer + + + #trim upstream_data to include only the upstream map columns + old_cols = upstream_data.columns.tolist() + i = 0 + first_intensity = -1 + last_intensity = -1 + intensity_re = re.compile('Intensity.*') + for col_name in old_cols: + m = intensity_re.match(col_name) + if m: + last_intensity = i + if first_intensity == -1: + first_intensity = i + i += 1 + #print('last intensity = %d' % last_intensity) + col_PKCalpha = last_intensity + 2 + col_firstIntensity = first_intensity + + data_in_cols = [old_cols[0]] + old_cols[first_intensity:last_intensity+1] + + if upstream_data.empty: + print("upstream_data is empty") + exit(0) + + data_in = upstream_data.copy(deep=True)[data_in_cols] + + # Convert 
# floating-point integers to int64 integers
# ref: https://stackoverflow.com/a/68497603/15509512
data_in[list(data_in.columns[1:])] = data_in[
    list(data_in.columns[1:])].astype('float64').apply(np.int64)

# NOTE(review): the listing this code was recovered from has lost its
#   indentation.  These statements appear to belong to the body of
#   mqpep_getswissprot(); they are shown at column 0 here -- confirm the
#   nesting level against the real file before pasting.

#create another phosphopeptide column that will be used to join later;
#  MAY need to change depending on Phosphopeptide column position
#data_in[PHOSPHOPEPTIDE_MATCH] = data_in[data_in.columns.tolist()[0]]
data_in[PHOSPHOPEPTIDE_MATCH] = data_in.index

end_time = time.process_time() #timer
print("%0.6f set data_in[PHOSPHOPEPTIDE_MATCH] [A]" % (end_time - start_time,), file=sys.stderr) #timer


# Produce a dictionary of metadata for a single phosphopeptide.
# This is a replacement of `UniProtInfo_subdict` in the original code.
def pseq_to_subdict(phospho_pep):
    """Collect the SwissProt-derived metadata for one phosphopeptide.

    Args:
        phospho_pep: phosphopeptide sequence in which each phosphorylated
            residue is preceded by a lower-case "p".

    Returns:
        A two-element list ``[phospho_pep, result]`` where ``result`` is a
        dictionary keyed by SEQUENCE, UNIPROT_ID, DESCRIPTION, GENE_NAME,
        PHOSPHORESIDUE, SEQUENCE7 and SEQUENCE10.  On return every value is
        a string ('; '-joined, except SEQUENCE7 which is ' | '-joined).

    Raises:
        PreconditionError: when the phosphopeptide, or its dephosphorylated
            form, is missing from (or inconsistent with)
            PhosphoPep_UniProtSeq_LUT / DephosphoPep_UniProtSeq_LUT.
    """
    # Strip "p" from phosphopeptide sequence
    dephospho_pep = re_phos.sub('', phospho_pep)

    # Determine number of phosphoresidues in phosphopeptide
    numps = len(phospho_pep) - len(dephospho_pep)

    # Determine location(s) of phosphoresidue(s) in phosphopeptide
    # (used later for Phosphoresidue, Sequence7, and Sequence10).
    # Each "p" is removed once its index has been recorded, so every
    # recorded index is an offset into the dephosphorylated peptide.
    ploc = []  # list of p locations
    remaining = phospho_pep
    for _ in range(numps):
        p_index = remaining.find("p")
        ploc.append(p_index)
        remaining = remaining[:p_index] + remaining[p_index + 1:]

    # Validate the look-up tables BEFORE indexing them.  (Previously an
    # unused `DephosphoPep_UniProtSeq_LUT[dephospho_pep]` look-up preceded
    # these checks, so a missing peptide surfaced as a bare KeyError and
    # the PreconditionError below could never be raised.)
    ### Caller may elect to:
    ##    try:
    ##        ...
    ##    except PreconditionError as pe:
    ##        print("'{expression}': {message}".format(
    ##            expression = pe.expression,
    ##            message = pe.message))
    if dephospho_pep not in DephosphoPep_UniProtSeq_LUT:
        raise PreconditionError( dephospho_pep,
            'dephosphorylated phosphopeptide not found in DephosphoPep_UniProtSeq_LUT'
            )
    if phospho_pep not in PhosphoPep_UniProtSeq_LUT:
        raise PreconditionError( dephospho_pep,
            'no matching phosphopeptide found in PhosphoPep_UniProtSeq_LUT'
            )
    if dephospho_pep != PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)]:
        raise PreconditionError( dephospho_pep,
            "dephosphorylated phosphopeptide does not match " +
            "PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = " +
            PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)]
            )

    # Establish the result dictionary.  Key order is kept as before
    # because it fixes the iteration order of result.items() below.
    result = {}
    result[SEQUENCE] = [dephospho_pep]
    result[UNIPROT_ID] = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)]
    result[DESCRIPTION] = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,DESCRIPTION)]
    result[GENE_NAME] = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,GENE_NAME)]
    result[PHOSPHORESIDUE] = []
    result[SEQUENCE7] = []
    result[SEQUENCE10] = []

    if (dephospho_pep,SEQUENCE) not in DephosphoPep_UniProtSeq_LUT:
        raise PreconditionError( dephospho_pep,
            'no matching phosphopeptide found in DephosphoPep_UniProtSeq_LUT'
            )
    UniProtSeqList = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)]
    if len(UniProtSeqList) < 1:
        print("Skipping DephosphoPep_UniProtSeq_LUT[('%s',SEQUENCE)] because value has zero length" % dephospho_pep)

    # Lower-case code for a phosphorylated S/T/Y; anything else becomes "?"
    lowercase_psty = {"S": "s", "T": "t", "Y": "y"}

    for UniProtSeq in UniProtSeqList:
        protein = str(UniProtSeq)
        phosphoresidues = []
        seq7s_set = set()
        seq7s = []
        seq10s_set = set()
        seq10s = []

        # Offset of the peptide within the protein (loop-invariant).
        # NOTE(review): str.find returns -1 when the peptide is absent;
        #   that case is not handled here (nor was it before) -- confirm
        #   that the database view guarantees containment.
        start = UniProtSeq.find(dephospho_pep)

        for peptide_offset in ploc:
            psite = start + peptide_offset  # location of phosphoresidue on protein sequence

            # add Phosphoresidue, e.g. "pS" + one-based position
            phosphoresidues.append("p" + protein[psite] + str(psite + 1))

            # Add Sequence7: +/-7 residue window, phosphoresidue in lower
            # case, padded with "_" to 15 characters at either terminus
            if psite < 7:  # phospho_pep at N terminus
                seq7 = protein[:psite + 8]
                pres = lowercase_psty.get(seq7[psite], "?")
                seq7 = seq7[:psite] + pres + seq7[psite + 1:psite + 8]
                seq7 = seq7.rjust(15, "_")  # pad the front
            elif len(UniProtSeq) - psite < 8:  # phospho_pep at C terminus
                seq7 = protein[psite - 7:]
                pres = lowercase_psty.get(seq7[7], "?")
                seq7 = seq7[:7] + pres + seq7[8:]
                seq7 = seq7.ljust(15, "_")  # pad the back
            else:
                seq7 = protein[psite - 7:psite + 8]
                pres = lowercase_psty.get(seq7[7], "?")
                seq7 = seq7[:7] + pres + seq7[8:]
            if seq7 not in seq7s_set:
                seq7s.append(seq7)
                seq7s_set.add(seq7)

            # add Sequence10: +/-10 residue window with "p" inserted
            # immediately before the phosphoresidue (no padding)
            if psite < 10:  # phospho_pep at N terminus
                seq10 = protein[:psite] + "p" + protein[psite:psite + 11]
            elif len(UniProtSeq) - psite < 11:  # phospho_pep at C terminus
                seq10 = protein[psite - 10:psite] + "p" + protein[psite:]
            else:
                seq10 = protein[psite - 10:psite + 11]
                seq10 = seq10[:10] + "p" + seq10[10:]
            if seq10 not in seq10s_set:
                seq10s.append(seq10)
                seq10s_set.add(seq10)

        # one list per protein sequence, so each of these is a list of lists
        result[PHOSPHORESIDUE].append(phosphoresidues)
        result[SEQUENCE7].append(seq7s)
        result[SEQUENCE10].append(seq10s)

    # Sort the per-protein annotations by `UniProt_ID`
    # ref: https://stackoverflow.com/a/4174955/15509512
    # NOTE(review): zip() stops at the shortest list; PHOSPHORESIDUE has one
    #   entry per protein *sequence* while the other three have one entry
    #   per UniProt ID -- confirm these always have equal length.
    annotations = sorted(
        zip(
            result[UNIPROT_ID],
            result[GENE_NAME],
            result[DESCRIPTION],
            result[PHOSPHORESIDUE]
        ),
        key=operator.itemgetter(0)
    )

    result[UNIPROT_ID] = []
    result[GENE_NAME] = []
    result[DESCRIPTION] = []
    result[PHOSPHORESIDUE] = []

    for annotation in annotations:
        result[UNIPROT_ID].append(annotation[0])
        result[GENE_NAME].append(annotation[1])
        result[DESCRIPTION].append(annotation[2])
        result[PHOSPHORESIDUE].append(annotation[3])

    #convert lists to strings in the dictionary
    # (only existing keys are re-assigned, so iterating items() is safe)
    for key, value in result.items():
        if key not in [PHOSPHORESIDUE, SEQUENCE7, SEQUENCE10]:
            result[key] = '; '.join(map(str, value))
        elif key in [SEQUENCE10]:
            # result[SEQUENCE10] is a list of lists of strings;
            # flatten it, dropping duplicates but keeping first-seen order
            joined_value = ''
            joined_set = set()
            sep = ''
            for valL in value:
                # valL is a list of strings
                for val in valL:
                    # val is a string
                    if val not in joined_set:
                        joined_set.add(val)
                        joined_value += sep + val
                        sep = '; '
            # joined_value is a string
            result[key] = joined_value

    # sites within one protein are joined by ", "; proteins by "; "
    result[PHOSPHORESIDUE] = '; '.join(
        [', '.join(sites) for sites in result[PHOSPHORESIDUE]]
        )

    #separate sequence7's by |
    # (a window is skipped when it already occurs in the string built so far)
    seq7_joined = ""
    for seq7_list in result[SEQUENCE7]:
        for e in seq7_list:
            if e == ";":
                seq7_joined = seq7_joined + " |"
            elif len(seq7_joined) > 0 and 1 > seq7_joined.count(e):
                seq7_joined = seq7_joined + " | " + e
            elif 1 > seq7_joined.count(e):
                seq7_joined = seq7_joined + e
    result[SEQUENCE7] = seq7_joined

    return [phospho_pep, result]


# Construct list of [string, dictionary] lists
#   where the dictionary provides the SwissProt metadata for a phosphopeptide
result_list = [
    catch(pseq_to_subdict,psequence)
    for psequence
    in data_in[PHOSPHOPEPTIDE_MATCH]
    ]


end_time = time.process_time() #timer
print("%0.6f added SwissProt annotations to phosphopeptides [B]" % (end_time - start_time,), file=sys.stderr) #timer

# Construct dictionary from list of lists
# ref: https://www.8bitavenue.com/how-to-convert-list-of-lists-to-dictionary-in-python/
UniProt_Info = {
    result[0]:result[1]
    for result
    in result_list
    if result is not None
    }


end_time = time.process_time() #timer
print("%0.6f create dictionary mapping phosphopeptide to metadata dictionary [C]" % (end_time - start_time,), file=sys.stderr) #timer

#cosmetic: add N_A to phosphopeptide rows with no hits
p_peptide_list = []
for key in UniProt_Info:
    p_peptide_list.append(key)
    for nestedKey in UniProt_Info[key]:
        if UniProt_Info[key][nestedKey] == "":
            UniProt_Info[key][nestedKey] = N_A

end_time = time.process_time() #timer
print("%0.6f performed cosmetic clean-up [D]" % (end_time - start_time,), file=sys.stderr) #timer

#convert UniProt_Info dictionary to dataframe
uniprot_df = pandas.DataFrame.transpose(pandas.DataFrame.from_dict(UniProt_Info))

#reorder columns to match expected output file
uniprot_df[PHOSPHOPEPTIDE] = uniprot_df.index #make index a column too


cols = uniprot_df.columns.tolist()
#cols = [cols[-1]]+cols[4:6]+[cols[1]]+[cols[2]]+[cols[6]]+[cols[0]]
#uniprot_df = uniprot_df[cols]
uniprot_df = uniprot_df[[
    PHOSPHOPEPTIDE,
    SEQUENCE10,
    SEQUENCE7,
    GENE_NAME,
    PHOSPHORESIDUE,
    UNIPROT_ID,
    DESCRIPTION
    ]]


end_time = time.process_time() #timer
print("%0.6f reordered columns to match expected output file [1]" % (end_time - start_time,), file=sys.stderr) #timer

#concat to split then groupby to collapse
seq7_df = pandas.concat([pandas.Series(row[PHOSPHOPEPTIDE], row[SEQUENCE7].split(' | '))
                         for _, row in uniprot_df.iterrows()]).reset_index()
seq7_df.columns = [SEQUENCE7,PHOSPHOPEPTIDE]

# --- -------------- begin read PSP_Regulatory_sites ---------------------------------
#read in PhosphoSitePlus Regulatory Sites dataset
#ACE if (True):
## ----------- Get PhosphoSitePlus Regulatory Sites data from SQLite database (start) -----------
conn = sql.connect(uniprot_sqlite)
regsites_df = pandas.read_sql_query(PSP_REGSITE_SQL, conn)
# Close SwissProt SQLite database
conn.close()
#ACE # Array indexes are zero-based
#ACE # ref: https://en.wikipedia.org/wiki/Python_(programming_language)
#ACE RENAME_COLS = [ 'SITE_PLUSMINUS_7AA', 'DOMAIN', 'ON_FUNCTION', 'ON_PROCESS', 'ON_PROT_INTERACT'
#ACE               , 'ON_OTHER_INTERACT' , 'NOTES' , 'ORGANISM']
#ACE with pandas.option_context('display.max_rows', None, 'display.max_columns', None): # more options can be specified also
#ACE     print(regsites_df)
## ----------- Get PhosphoSitePlus Regulatory Sites data from SQLite database (finish) -----------
#ACE else:
#ACE     regsites_df = pandas.read_csv(phosphosite_filename_csv, header=3,skiprows=1-3)
#ACE     SITE_PLUSMINUS_7AA_SQL = SITE_PLUSMINUS_7AA
#ACE     #ACE # Array indexes are zero-based
#ACE     #ACE # ref: https://en.wikipedia.org/wiki/Python_(programming_language)
#ACE     #ACE RENAME_COLS = [ 'GENE'       , 'PROTEIN'    , 'PROT_TYPE' , 'ACC_ID'          , 'GENE_ID'
#ACE     #ACE               , 'HU_CHR_LOC' , 'ORGANISM'   , 'MOD_RSD'   , 'SITE_GRP_ID'     , 'SITE_+/-7_AA'
#ACE     #ACE               , 'DOMAIN'     , 'ON_FUNCTION', 'ON_PROCESS', 'ON_PROT_INTERACT', 'ON_OTHER_INTERACT'
#ACE     #ACE               , 'PMIDs'      , 'LT_LIT'     , 'MS_LIT'    , 'MS_CST'          , 'NOTES'
#ACE     #ACE               ]
#ACE     #ACE REGSITE_COL_SITE7AA = 9
#ACE     #ACE REGSITE_COL_PROTEIN = 1
#ACE     #ACE REGSITE_COL_DOMAIN = 10
#ACE     #ACE REGSITE_COL_PMIDs = 15

# ... -------------- end read PSP_Regulatory_sites ------------------------------------


#keep only the entries for the requested species (all are kept when none is given)
if len(species) > 0:
    print('Limit PhosphoSitesPlus records to species "' + species + '"')
    regsites_df = regsites_df[regsites_df.ORGANISM == species]

#merge the seq7 df with the regsites df based off of the sequence7
merge_df = seq7_df.merge(regsites_df, left_on=SEQUENCE7, right_on=SITE_PLUSMINUS_7AA_SQL, how='left')
#ACE print(merge_df.columns.tolist()) #ACE

#after merging df, select only the columns of interest - note that PROTEIN is absent here
merge_df = merge_df[[PHOSPHOPEPTIDE,SEQUENCE7,ON_FUNCTION,ON_PROCESS, ON_PROT_INTERACT,ON_OTHER_INTERACT,ON_NOTES]]
#ACE print(merge_df.columns.tolist()) #ACE
#combine column values of interest into one FUNCTION_PHOSPHORESIDUE column
merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[ON_FUNCTION].str.cat(merge_df[ON_PROCESS], sep="; ", na_rep="")
merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[FUNCTION_PHOSPHORESIDUE].str.cat(merge_df[ON_PROT_INTERACT], sep="; ", na_rep="")
merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[FUNCTION_PHOSPHORESIDUE].str.cat(merge_df[ON_OTHER_INTERACT], sep="; ", na_rep="")
merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[FUNCTION_PHOSPHORESIDUE].str.cat(merge_df[ON_NOTES], sep="; ", na_rep="")
+ #remove the columns that were combined + merge_df = merge_df[[PHOSPHOPEPTIDE,SEQUENCE7,FUNCTION_PHOSPHORESIDUE]] + + #ACE print(merge_df) #ACE + #ACE print(merge_df.columns.tolist()) #ACE + + end_time = time.process_time() #timer + print("%0.6f merge regsite metadata [1a]" % (end_time - start_time,), file=sys.stderr) #timer + + #cosmetic changes to Function Phosphoresidue column + fp_series = pandas.Series(merge_df[FUNCTION_PHOSPHORESIDUE]) + + end_time = time.process_time() #timer + print("%0.6f more cosmetic changes [1b]" % (end_time - start_time,), file=sys.stderr) #timer + + i = 0 + while i < len(fp_series): + #remove the extra ";" so that it looks more professional + if fp_series[i] == "; ; ; ; ": #remove ; from empty hits + fp_series[i] = "" + while fp_series[i].endswith("; "): #remove ; from the ends + fp_series[i] = fp_series[i][:-2] + while fp_series[i].startswith("; "): #remove ; from the beginning + fp_series[i] = fp_series[i][2:] + fp_series[i] = fp_series[i].replace("; ; ; ; ", "; ") + fp_series[i] = fp_series[i].replace("; ; ; ", "; ") + fp_series[i] = fp_series[i].replace("; ; ", "; ") + + #turn blanks into N_A to signify the info was searched for but cannot be found + if fp_series[i] == "": + fp_series[i] = N_A + + i += 1 + merge_df[FUNCTION_PHOSPHORESIDUE] = fp_series + + end_time = time.process_time() #timer + print("%0.6f cleaned up semicolons [1c]" % (end_time - start_time,), file=sys.stderr) #timer + + #merge uniprot df with merge df + uniprot_regsites_merged_df = uniprot_df.merge(merge_df, left_on=PHOSPHOPEPTIDE, right_on=PHOSPHOPEPTIDE,how="left") + + #collapse the merged df + uniprot_regsites_collapsed_df = pandas.DataFrame( + uniprot_regsites_merged_df + .groupby(PHOSPHOPEPTIDE)[FUNCTION_PHOSPHORESIDUE] + .apply(lambda x: ppep_join(x))) + #.apply(lambda x: "%s" % ' | '.join(x))) + + + end_time = time.process_time() #timer + print("%0.6f collapsed pandas dataframe [1d]" % (end_time - start_time,), file=sys.stderr) #timer + + 
uniprot_regsites_collapsed_df[PHOSPHOPEPTIDE] = uniprot_regsites_collapsed_df.index #add df index as its own column + + + #rename columns + uniprot_regsites_collapsed_df.columns = [FUNCTION_PHOSPHORESIDUE, 'ppp'] + + + + #select columns to be merged to uniprot_df + #ACE cols = regsites_df.columns.tolist() + #ACE print(cols) #ACE + #ACE if len(cols) > 8: + #ACE cols = [cols[9]]+[cols[1]]+cols[10:15] + #ACE #ACE cols = [cols[9]]+[cols[1]]+cols[10:15] + #ACE print(cols) #ACE + #ACE regsite_merge_df = regsites_df[cols] + + end_time = time.process_time() #timer + print("%0.6f selected columns to be merged to uniprot_df [1e]" % (end_time - start_time,), file=sys.stderr) #timer + + #add columns based on Sequence7 matching site_+/-7_AA + uniprot_regsite_df = pandas.merge( + left=uniprot_df, + right=uniprot_regsites_collapsed_df, + how='left', + left_on=PHOSPHOPEPTIDE, + right_on='ppp') + + end_time = time.process_time() #timer + print("%0.6f added columns based on Sequence7 matching site_+/-7_AA [1f]" % (end_time - start_time,), file=sys.stderr) #timer + + data_in.rename( + {'Protein description': PHOSPHOPEPTIDE}, + axis='columns', + inplace=True + ) + + + + sort_start_time = time.process_time() #timer + + #data_in.sort_values(PHOSPHOPEPTIDE_MATCH, inplace=True, kind='mergesort') + res2 = sorted(data_in[PHOSPHOPEPTIDE_MATCH].tolist(), key = lambda s: s.casefold()) + data_in = data_in.loc[res2] + + end_time = time.process_time() #timer + print("%0.6f sorting time [1f]" % (end_time - start_time,), file=sys.stderr) #timer + + + + cols = [old_cols[0]] + old_cols[col_PKCalpha-1:] + upstream_data = upstream_data[cols] + + end_time = time.process_time() #timer + print("%0.6f refactored columns for Upstream Map [1g]" % (end_time - start_time,), file=sys.stderr) #timer + + + #### #rename upstream columns in new list + #### new_cols = [] + #### for name in cols: + #### if "_NetworKIN" in name: + #### name = name.split("_")[0] + #### if " motif" in name: + #### name = name.split(" 
motif")[0] + #### if " sequence " in name: + #### name = name.split(" sequence")[0] + #### if "_Phosida" in name: + #### name = name.split("_")[0] + #### if "_PhosphoSite" in name: + #### name = name.split("_")[0] + #### new_cols.append(name) + + #rename upstream columns in new list + def col_rename(name): + if "_NetworKIN" in name: + name = name.split("_")[0] + if " motif" in name: + name = name.split(" motif")[0] + if " sequence " in name: + name = name.split(" sequence")[0] + if "_Phosida" in name: + name = name.split("_")[0] + if "_PhosphoSite" in name: + name = name.split("_")[0] + return name + + new_cols = [col_rename(col) for col in cols] + upstream_data.columns = new_cols + + + + end_time = time.process_time() #timer + print("%0.6f renamed columns for Upstream Map [1h_1]" % (end_time - start_time,), file=sys.stderr) #timer + + + # Create upstream_data_cast as a copy of upstream_data + # but with first column substituted by the phosphopeptide sequence + upstream_data_cast = upstream_data.copy() + new_cols_cast = new_cols + new_cols_cast[0] = 'p_peptide' + upstream_data_cast.columns = new_cols_cast + upstream_data_cast['p_peptide'] = upstream_data.index + new_cols_cast0 = new_cols_cast[0] + + # --- -------------- begin read upstream_data_melt ------------------------------------ + ## ----------- Get melted kinase mapping data from SQLite database (start) ----------- + conn = sql.connect(uniprot_sqlite) + upstream_data_melt_df = pandas.read_sql_query(PPEP_MELT_SQL, conn) + # Close SwissProt SQLite database + conn.close() + upstream_data_melt = upstream_data_melt_df.copy() + upstream_data_melt.columns = ['p_peptide', 'characterization', 'X'] + upstream_data_melt['characterization'] = [ + col_rename(s) + for s in upstream_data_melt['characterization'] + ] + + print('%0.6f upstream_data_melt_df initially has %d rows' % + (end_time - start_time, len(upstream_data_melt.axes[0])) + , file=sys.stderr) + # ref: https://stackoverflow.com/a/27360130/15509512 + # e.g. 
df.drop(df[df.score < 50].index, inplace=True) + upstream_data_melt.drop( + upstream_data_melt[upstream_data_melt.X != 'X'].index, + inplace = True + ) + print('%0.6f upstream_data_melt_df pre-dedup has %d rows' % + (end_time - start_time, len(upstream_data_melt.axes[0])) + , file=sys.stderr) + #ACE with pandas.option_context('display.max_rows', None, 'display.max_columns', None): # more options can be specified also + #ACE print(upstream_data_melt) + ## ----------- Get melted kinase mapping data from SQLite database (finish) ----------- + # ... -------------- end read upstream_data_melt -------------------------------------- + + end_time = time.process_time() #timer + print("%0.6f melted and minimized Upstream Map dataframe [1h_2]" % (end_time - start_time,), file=sys.stderr) #timer + # ... end read upstream_data_melt + + upstream_data_melt_index = upstream_data_melt.index + upstream_data_melt_p_peptide = upstream_data_melt['p_peptide'] + + end_time = time.process_time() #timer + print("%0.6f indexed melted Upstream Map [1h_2a]" % (end_time - start_time,), file=sys.stderr) #timer + + upstream_delta_melt_LoL = upstream_data_melt.values.tolist() + + melt_dict = {} + for key in upstream_map_p_peptide_list: + melt_dict[key] = [] + + for el in upstream_delta_melt_LoL: + (p_peptide, characterization, X) = tuple(el) + if p_peptide in melt_dict: + melt_dict[p_peptide].append(characterization) + else: + exit('Phosphopeptide %s not found in ppep_mapping_db: "phopsphopeptides" and "ppep_mapping_db" must both originate from the same run of mqppep_kinase_mapping' % (p_peptide)) + + + end_time = time.process_time() #timer + print("%0.6f appended peptide characterizations [1h_2b]" % (end_time - start_time,), file=sys.stderr) #timer + + + # for key in upstream_map_p_peptide_list: + # melt_dict[key] = ' | '.join(melt_dict[key]) + + for key in upstream_map_p_peptide_list: + melt_dict[key] = melt_join(melt_dict[key]) + + end_time = time.process_time() #timer + print("%0.6f 
concatenated multiple characterizations [1h_2c]" % (end_time - start_time,), file=sys.stderr) #timer + + # map_dict is a dictionary of dictionaries + map_dict = {} + for key in upstream_map_p_peptide_list: + map_dict[key] = {} + map_dict[key][PUTATIVE_UPSTREAM_DOMAINS] = melt_dict[key] + + + end_time = time.process_time() #timer + print("%0.6f instantiated map dictionary [2]" % (end_time - start_time,), file=sys.stderr) #timer + + #convert map_dict to dataframe + map_df = pandas.DataFrame.transpose(pandas.DataFrame.from_dict(map_dict)) + map_df["p-peptide"] = map_df.index #make index a column too + cols_map_df = map_df.columns.tolist() + cols_map_df = [cols_map_df[1]] + [cols_map_df[0]] + map_df = map_df[cols_map_df] + + #join map_df to uniprot_regsite_df + output_df = uniprot_regsite_df.merge( + map_df, + how="left", + left_on=PHOSPHOPEPTIDE, + right_on="p-peptide") + + output_df = output_df[ + [ PHOSPHOPEPTIDE, SEQUENCE10, SEQUENCE7, GENE_NAME, PHOSPHORESIDUE, + UNIPROT_ID, DESCRIPTION, FUNCTION_PHOSPHORESIDUE, + PUTATIVE_UPSTREAM_DOMAINS + ] + ] + + + # cols_output_prelim = output_df.columns.tolist() + # + # print("cols_output_prelim") + # print(cols_output_prelim) + # + # cols_output = cols_output_prelim[:8]+[cols_output_prelim[9]]+[cols_output_prelim[10]] + # + # print("cols_output with p-peptide") + # print(cols_output) + # + # cols_output = [col for col in cols_output if not col == "p-peptide"] + # + # print("cols_output") + # print(cols_output) + # + # output_df = output_df[cols_output] + + #join output_df back to quantitative columns in data_in df + quant_cols = data_in.columns.tolist() + quant_cols = quant_cols[1:] + quant_data = data_in[quant_cols] + + ## ----------- Write merge/filter metadata to SQLite database (start) ----------- + # Open SwissProt SQLite database + conn = sql.connect(output_sqlite) + cur = conn.cursor() + + cur.executescript(MRGFLTR_DDL) + + cur.execute( + CITATION_INSERT_STMT, + ('mrgfltr_metadata_view', CITATION_INSERT_PSP) + ) + 
cur.execute( + CITATION_INSERT_STMT, + ('mrgfltr_metadata', CITATION_INSERT_PSP) + ) + cur.execute( + CITATION_INSERT_STMT, + ('mrgfltr_metadata_view', CITATION_INSERT_PSP_REF) + ) + cur.execute( + CITATION_INSERT_STMT, + ('mrgfltr_metadata', CITATION_INSERT_PSP_REF) + ) + + # Read ppep-to-sequence LUT + ppep_lut_df = pandas.read_sql_query(PPEP_ID_SQL, conn) + #ACE ppep_lut_df.info(verbose=True) + # write only metadata for merged/filtered records to SQLite + mrgfltr_metadata_df = output_df.copy() + # replace phosphopeptide seq with ppep.id + mrgfltr_metadata_df = ppep_lut_df.merge( + mrgfltr_metadata_df, + left_on='ppep_seq', + right_on=PHOSPHOPEPTIDE, + how='inner' + ) + mrgfltr_metadata_df.drop( + columns=[PHOSPHOPEPTIDE, 'ppep_seq'], + inplace=True + ) + #rename columns + mrgfltr_metadata_df.columns = MRGFLTR_METADATA_COLUMNS + #ACE mrgfltr_metadata_df.info(verbose=True) + mrgfltr_metadata_df.to_sql( + 'mrgfltr_metadata', + con=conn, + if_exists='append', + index=False, + method='multi' + ) + + # Close SwissProt SQLite database + conn.close() + ## ----------- Write merge/filter metadata to SQLite database (finish) ----------- + + output_df = output_df.merge(quant_data, how="right", left_on=PHOSPHOPEPTIDE, right_on=PHOSPHOPEPTIDE_MATCH) + output_cols = output_df.columns.tolist() + output_cols = output_cols[:-1] + output_df = output_df[output_cols] + + #cosmetic changes to Upstream column + output_df[PUTATIVE_UPSTREAM_DOMAINS] = output_df[PUTATIVE_UPSTREAM_DOMAINS].fillna("") #fill the NaN with "" for those Phosphopeptides that got a "WARNING: Failed match for " in the upstream mapping + us_series = pandas.Series(output_df[PUTATIVE_UPSTREAM_DOMAINS]) + i = 0 + while i < len(us_series): + #turn blanks into N_A to signify the info was searched for but cannot be found + if us_series[i] == "": + us_series[i] = N_A + i += 1 + output_df[PUTATIVE_UPSTREAM_DOMAINS] = us_series + + end_time = time.process_time() #timer + print("%0.6f establisheed output [3]" % (end_time - 
start_time,), file=sys.stderr) #timer + + (output_rows, output_cols) = output_df.shape + + #output_df = output_df[cols].convert_dtypes(infer_objects=True, convert_string=True, convert_integer=True, convert_boolean=True, convert_floating=True) + output_df = output_df.convert_dtypes(convert_integer=True) + + + #Output onto Final CSV file + output_df.to_csv(output_filename_csv, index=False) + output_df.to_csv(output_filename_tab, quoting=None, sep='\t', index=False) + + end_time = time.process_time() #timer + print("%0.6f wrote output [4]" % (end_time - start_time,), file=sys.stderr) #timer + + print('{:>10} phosphopeptides written to output'.format(str(output_rows))) + + end_time = time.process_time() #timer + print("%0.6f seconds of non-system CPU time were consumed" % (end_time - start_time,) , file=sys.stderr) #timer + + + #Rev. 7/1/2016 + #Rev. 7/3/2016 : fill NaN in Upstream column to replace to N/A's + #Rev. 7/3/2016: renamed Upstream column to PUTATIVE_UPSTREAM_DOMAINS + #Rev. 12/2/2021: Converted to Python from ipynb; use fast Aho-Corasick searching; \ + # read from SwissProt SQLite database + #Rev. 12/9/2021: Transfer code to Galaxy tool wrapper + + ############################################# + # copied from Excel Output Script.ipynb END # + ############################################# + + try: + catch(mqpep_getswissprot,) + exit(0) + except Exception as e: + exit('Internal error running mqpep_getswissprot(): %s' % (e)) + +if __name__ == "__main__": + __main__() +
#!/usr/bin/env python
# search_ppep.py
# Search and memoize phosphopeptides in Swiss-Prot SQLite table UniProtKB

import argparse
import os.path
import sqlite3
import re
import sys
import traceback
from codecs import getreader as cx_getreader
import time

# For Aho-Corasick search for fixed set of substrings
# - add_word
# - make_automaton
# - iter
import ahocorasick
# Support map over auto.iter(...)
# - itemgetter
import operator
#import hashlib


# ref: https://stackoverflow.com/a/8915613/15509512
#   answers: "How to handle exceptions in a list comprehensions"
#   usage:
#       from math import log
#       eggs = [1,3,0,3,2]
#       print([x for x in [catch(log, egg) for egg in eggs] if x is not None])
#   producing:
#       for <built-in function log>
#         with args (0,)
#         exception: math domain error
#       [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453]
def catch(func, *args, handle=lambda e: e, **kwargs):
    """Call ``func(*args, **kwargs)``, reporting rather than raising errors.

    Parameters:
        func: the callable to invoke.
        *args: positional arguments forwarded to ``func``.
        handle: accepted for backward compatibility; it is not invoked
            (the handler returns ``None`` instead of ``handle(e)``).
        **kwargs: keyword arguments forwarded to ``func``.

    Returns:
        The result of ``func`` on success; ``None`` when ``func`` raised an
        ``Exception``, after printing the exception and its stack trace to
        standard output.
    """
    try:
        return func(*args, **kwargs)
    except Exception as e:
        print("For %s" % str(func))
        print(" with args %s" % str(args))
        print(" caught exception: %s" % str(e))
        # sys and traceback must be imported at file level for this handler
        # to work; without them the handler itself raised NameError.
        (ty, va, tb) = sys.exc_info()
        print(" stack trace: " + str(traceback.format_exception(ty, va, tb)))
        #exit(-1)
        return None  # was handle(e)


def __main__():
    """Match input phosphopeptides against UniProtKB sequences in SQLite.

    Parses the command line, (re)creates the ppep/deppep tables and views in
    the SQLite database named by ``--uniprotkb``, loads the first column of
    the ``--phosphopeptides`` tabular file, and records every occurrence of
    each dephosphorylated peptide within the ``UniProtKB.Sequence`` values in
    table ``deppep_UniProtKB``.  Summary counts are printed to standard
    output.

    Raises:
        SystemExit: when a required argument is missing or cannot be parsed.
        ValueError: when a ``UniProtKB`` row has a NULL ``Sequence``.
    """
    DROP_TABLES_SQL = '''
        DROP VIEW IF EXISTS ppep_gene_site_view;
        DROP VIEW IF EXISTS uniprot_view;
        DROP VIEW IF EXISTS uniprotkb_pep_ppep_view;
        DROP VIEW IF EXISTS ppep_intensity_view;
        DROP VIEW IF EXISTS ppep_metadata_view;

        DROP TABLE IF EXISTS sample;
        DROP TABLE IF EXISTS ppep;
        DROP TABLE IF EXISTS site_type;
        DROP TABLE IF EXISTS deppep_UniProtKB;
        DROP TABLE IF EXISTS deppep;
        DROP TABLE IF EXISTS ppep_gene_site;
        DROP TABLE IF EXISTS ppep_metadata;
        DROP TABLE IF EXISTS ppep_intensity;
        '''

    CREATE_TABLES_SQL = '''
        CREATE TABLE deppep
          ( id INTEGER PRIMARY KEY
          , seq TEXT UNIQUE ON CONFLICT IGNORE
          )
          ;
        CREATE TABLE deppep_UniProtKB
          ( deppep_id INTEGER REFERENCES deppep(id) ON DELETE CASCADE
          , UniProtKB_id TEXT REFERENCES UniProtKB(id) ON DELETE CASCADE
          , pos_start INTEGER
          , pos_end INTEGER
          , PRIMARY KEY (deppep_id, UniProtKB_id, pos_start, pos_end)
              ON CONFLICT IGNORE
          )
          ;
        CREATE TABLE ppep
          ( id INTEGER PRIMARY KEY
          , deppep_id INTEGER REFERENCES deppep(id) ON DELETE CASCADE
          , seq TEXT UNIQUE ON CONFLICT IGNORE
          , scrubbed TEXT
          );
        CREATE TABLE site_type
          ( id INTEGER PRIMARY KEY
          , type_name TEXT UNIQUE ON CONFLICT IGNORE
          );
        CREATE INDEX idx_ppep_scrubbed on ppep(scrubbed)
          ;
        CREATE TABLE sample
          ( id INTEGER PRIMARY KEY
          , name TEXT UNIQUE ON CONFLICT IGNORE
          )
          ;
        CREATE VIEW uniprot_view AS
          SELECT DISTINCT
              Uniprot_ID
            , Description
            , Organism_Name
            , Organism_ID
            , Gene_Name
            , PE
            , SV
            , Sequence
            , Description || ' OS=' ||
              Organism_Name || ' OX=' || Organism_ID ||
              CASE WHEN Gene_Name = 'N/A' THEN '' ELSE ' GN='|| Gene_Name END ||
              CASE WHEN PE = 'N/A' THEN '' ELSE ' PE='|| PE END ||
              CASE WHEN SV = 'N/A' THEN '' ELSE ' SV='|| SV END
              AS long_description
            , Database
          FROM UniProtKB
          ;
        CREATE VIEW uniprotkb_pep_ppep_view AS
          SELECT deppep_UniProtKB.UniprotKB_ID AS accession
               , deppep_UniProtKB.pos_start AS pos_start
               , deppep_UniProtKB.pos_end AS pos_end
               , deppep.seq AS peptide
               , ppep.seq AS phosphopeptide
               , ppep.scrubbed AS scrubbed
               , uniprot_view.Sequence AS sequence
               , uniprot_view.Description AS description
               , uniprot_view.long_description AS long_description
               , ppep.id AS ppep_id
          FROM ppep, deppep, deppep_UniProtKB, uniprot_view
          WHERE deppep.id = ppep.deppep_id
            AND deppep.id = deppep_UniProtKB.deppep_id
            AND deppep_UniProtKB.UniprotKB_ID = uniprot_view.Uniprot_ID
          ORDER BY UniprotKB_ID, deppep.seq, ppep.seq
          ;
        CREATE TABLE ppep_gene_site
          ( ppep_id INTEGER REFERENCES ppep(id)
          , gene_names TEXT
          , site_type_id INTEGER REFERENCES site_type(id)
          , kinase_map TEXT
          , PRIMARY KEY (ppep_id, kinase_map) ON CONFLICT IGNORE
          )
          ;
        CREATE VIEW ppep_gene_site_view AS
          SELECT DISTINCT
            ppep.seq AS phospho_peptide
          , ppep_id
          , gene_names
          , type_name
          , kinase_map
          FROM
            ppep, ppep_gene_site, site_type
          WHERE
              ppep_gene_site.ppep_id = ppep.id
            AND
              ppep_gene_site.site_type_id = site_type.id
          ORDER BY
            ppep.seq
            ;
        CREATE TABLE ppep_metadata
          ( ppep_id INTEGER REFERENCES ppep(id)
          , protein_description TEXT
          , gene_name TEXT
          , FASTA_name TEXT
          , phospho_sites TEXT
          , motifs_unique TEXT
          , accessions TEXT
          , motifs_all_members TEXT
          , domain TEXT
          , ON_FUNCTION TEXT
          , ON_PROCESS TEXT
          , ON_PROT_INTERACT TEXT
          , ON_OTHER_INTERACT TEXT
          , notes TEXT
          , PRIMARY KEY (ppep_id) ON CONFLICT IGNORE
          )
          ;
        CREATE VIEW ppep_metadata_view AS
          SELECT DISTINCT
              ppep.seq AS phospho_peptide
            , protein_description
            , gene_name
            , FASTA_name
            , phospho_sites
            , motifs_unique
            , accessions
            , motifs_all_members
            , domain
            , ON_FUNCTION
            , ON_PROCESS
            , ON_PROT_INTERACT
            , ON_OTHER_INTERACT
            , notes
          FROM
            ppep, ppep_metadata
          WHERE
              ppep_metadata.ppep_id = ppep.id
          ORDER BY
            ppep.seq
            ;
        CREATE TABLE ppep_intensity
          ( ppep_id INTEGER REFERENCES ppep(id)
          , sample_id INTEGER
          , intensity INTEGER
          , PRIMARY KEY (ppep_id, sample_id) ON CONFLICT IGNORE
          )
          ;
        CREATE VIEW ppep_intensity_view AS
          SELECT DISTINCT
              ppep.seq AS phospho_peptide
            , sample.name AS sample
            , intensity
          FROM
            ppep, sample, ppep_intensity
          WHERE
              ppep_intensity.sample_id = sample.id
            AND
              ppep_intensity.ppep_id = ppep.id
          ;
        '''

    UNIPROT_SEQ_AND_ID_SQL = '''
        select Sequence, Uniprot_ID
          from UniProtKB
        '''

    # Parse Command Line
    parser = argparse.ArgumentParser(
        description='Phopsphoproteomic Enrichment phosphopeptide SwissProt search (in place in SQLite DB).'
        )

    # inputs:
    #   Phosphopeptide data for experimental results, including the intensities
    #   and the mapping to kinase domains, in tabular format.
    parser.add_argument(
        '--phosphopeptides', '-p',
        nargs=1,
        required=True,
        dest='phosphopeptides',
        help='Phosphopeptide data for experimental results, generated by the Phopsphoproteomic Enrichment Localization Filter tool'
        )
    parser.add_argument(
        '--uniprotkb', '-u',
        nargs=1,
        required=True,
        dest='uniprotkb',
        help='UniProtKB/Swiss-Prot data, converted from FASTA format by the Phopsphoproteomic Enrichment Kinase Mapping tool'
        )
    parser.add_argument(
        '--schema',
        action='store_true',
        dest='db_schema',
        help='show updated database schema'
        )
    parser.add_argument(
        '--warn-duplicates',
        action='store_true',
        dest='warn_duplicates',
        help='show warnings for duplicated sequences'
        )
    parser.add_argument(
        '--verbose',
        action='store_true',
        dest='verbose',
        help='show somewhat verbose program tracing'
        )
    # "Make it so!" (parse the arguments)
    options = parser.parse_args()
    if options.verbose:
        print("options: " + str(options) + "\n")

    # path to phosphopeptide (e.g., "outputfile_STEP2.txt") input tabular file
    if options.phosphopeptides is None:
        sys.exit('Argument "phosphopeptides" is required but not supplied')
    try:
        f_name = os.path.abspath(options.phosphopeptides[0])
    except Exception as e:
        sys.exit('Error parsing phosphopeptides argument: %s' % (e))

    # path to SQLite input/output tabular file
    if options.uniprotkb is None:
        sys.exit('Argument "uniprotkb" is required but not supplied')
    try:
        db_name = os.path.abspath(options.uniprotkb[0])
    except Exception as e:
        sys.exit('Error parsing uniprotkb argument: %s' % (e))

    def generate_ppep(f):
        """Yield the first tab-delimited field of each data line of file ``f``.

        The first line (the header) is skipped, double quotes are removed,
        and lines whose first field is empty are ignored.  The file is
        decoded as latin-1.
        """
        # get keys from upstream tabular file using readline()
        # ref: https://stackoverflow.com/a/16713581/15509512
        #      answer to "Use codecs to read file with correct encoding"
        re_tab = re.compile('^[^\t]*')
        re_quote = re.compile('"')
        # "with" guarantees the file is closed even when the consumer stops
        # iterating early or an exception propagates through the generator.
        with open(f, 'rb') as file1_encoded:
            file1 = cx_getreader("latin-1")(file1_encoded)
            count = 0
            while True:
                count += 1
                # Get next line from file
                line = file1.readline()
                # if line is empty, end of file is reached
                if not line:
                    break
                if count > 1:
                    first_field = re_tab.match(line)[0]
                    # A line without any tab would otherwise carry its
                    # line terminator into the key.
                    first_field = re_quote.sub('', first_field).rstrip('\r\n')
                    if first_field:
                        yield first_field

    con = sqlite3.connect(db_name)
    try:
        cur = con.cursor()
        ker = con.cursor()

        cur.executescript(DROP_TABLES_SQL)
        cur.executescript(CREATE_TABLES_SQL)

        if options.db_schema:
            print("\nAfter creating tables/views that are to be created, schema is:")
            # sqlite_master is available in every SQLite release, whereas
            # the sqlite_schema alias exists only in SQLite 3.33.0 and later.
            cur.execute("SELECT sql FROM sqlite_master")
            for row in cur.fetchall():
                if row[0] is not None:
                    print("%s;" % row[0])

        # Build an Aho-Corasick automaton from a trie
        # - ref:
        #   - https://pypi.org/project/pyahocorasick/
        #   - https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm
        #   - https://en.wikipedia.org/wiki/Trie
        auto = ahocorasick.Automaton()
        re_phos = re.compile('p')
        # scrub out unsearchable characters per section
        #   "Match the p_peptides to the @sequences array:"
        # of the original
        #   PhosphoPeptide Upstream Kinase Mapping.pl
        # which originally read
        #   $tmp_p_peptide =~ s/#//g;
        #   $tmp_p_peptide =~ s/\d//g;
        #   $tmp_p_peptide =~ s/\_//g;
        #   $tmp_p_peptide =~ s/\.//g;
        #
        # A character class is required here: without the brackets the
        # pattern only matched the literal text "0-9_", one arbitrary
        # character, then "#", so nothing was ever scrubbed.
        re_scrub = re.compile('[0-9_.#]')
        ppep_count = 0
        words_in_trie = 0
        for ppep in generate_ppep(f_name):
            ppep_count += 1
            add_to_trie = False
            scrubbed = re_scrub.sub('', ppep)
            deppep = re_phos.sub('', scrubbed)
            if options.verbose:
                print("deppep: %s; scrubbed: %s" % (deppep, scrubbed))
            cur.execute("SELECT id FROM deppep WHERE seq = (?)", (deppep,))
            if cur.fetchone() is None:
                add_to_trie = True
                cur.execute("INSERT INTO deppep(seq) VALUES (?)", (deppep,))
                cur.execute("SELECT id FROM deppep WHERE seq = (?)", (deppep,))
            deppep_id = cur.fetchone()[0]
            if add_to_trie:
                # Build the trie
                auto.add_word(deppep, (deppep_id, deppep))
                words_in_trie += 1
            cur.execute(
                "INSERT INTO ppep(seq, scrubbed, deppep_id) VALUES (?,?,?)",
                (ppep, scrubbed, deppep_id)
                )

        cur.execute("SELECT count(*) FROM (SELECT seq FROM deppep GROUP BY seq)")
        deppep_count = cur.fetchone()[0]

        cur.execute("SELECT count(*) FROM (SELECT Sequence FROM UniProtKB GROUP BY Sequence)")
        sequence_count = cur.fetchone()[0]

        print(
            "%d phosphopeptides were read from input" % ppep_count
            )
        print(
            "%d corresponding dephosphopeptides are represented in input" % deppep_count
            )
        # Look for cases where both Gene_Name and Sequence are identical
        cur.execute('''
            SELECT Uniprot_ID, Gene_Name, Sequence
            FROM   UniProtKB
            WHERE  Sequence IN (
              SELECT   Sequence
              FROM     UniProtKB
              GROUP BY Sequence, Gene_Name
              HAVING   count(*) > 1
              )
            ORDER BY Sequence
            ''')
        duplicate_count = 0
        old_seq = ''
        for row in cur.fetchall():
            if duplicate_count == 0:
                print("\nEach of the following sequences is associated with several accession IDs (which are listed in the first column) but the same gene ID (which is listed in the second column).")
            if row[2] != old_seq:
                old_seq = row[2]
                duplicate_count += 1
                if options.warn_duplicates:
                    print("\n%s\t%s\t%s" % row)
            else:
                if options.warn_duplicates:
                    print("%s\t%s" % (row[0], row[1]))
        if duplicate_count > 0:
            print("\n%d sequences have duplicated accession IDs\n" % duplicate_count)

        print(
            "%s accession sequences will be searched\n" % sequence_count
            )

        # Searching is skipped when no peptide was loaded: an empty trie
        # cannot be converted into a searchable automaton.
        if words_in_trie > 0:
            # Convert the trie to an automaton (a finite-state machine)
            auto.make_automaton()

            # Execute query for seqs and metadata without fetching the results yet
            uniprot_seq_and_id = cur.execute(UNIPROT_SEQ_AND_ID_SQL)
            # fetchmany() returns an empty list when exhausted, ending the loop
            while batch := uniprot_seq_and_id.fetchmany(size=50):
                for Sequence, UniProtKB_id in batch:
                    if Sequence is None:
                        raise ValueError("UniProtKB_id %s, but Sequence is None: Check whether SwissProt file is missing sequence for this ID" % (UniProtKB_id,))
                    for end_index, (insert_order, original_value) in auto.iter(Sequence):
                        # end_index is the offset reported by auto.iter for
                        # the match; pos_start is derived from the peptide
                        # length so that both positions use the same origin.
                        ker.execute('''
                            INSERT INTO deppep_UniProtKB
                              (deppep_id,UniProtKB_id,pos_start,pos_end)
                            VALUES (?,?,?,?)
                            ''', (
                                insert_order,
                                UniProtKB_id,
                                1 + end_index - len(original_value),
                                end_index
                                )
                            )

        ker.execute("""
            SELECT count(*) || ' accession-peptide-phosphopeptide combinations were found'
            FROM uniprotkb_pep_ppep_view
            """
            )
        for row in ker.fetchall():
            print(row[0])

        ker.execute("""
            SELECT count(*) || ' accession matches were found', count(*) AS accession_count
            FROM (
              SELECT accession
              FROM uniprotkb_pep_ppep_view
              GROUP BY accession
              )
            """
            )
        for row in ker.fetchall():
            print(row[0])

        ker.execute("""
            SELECT count(*) || ' peptide matches were found'
            FROM (
              SELECT peptide
              FROM uniprotkb_pep_ppep_view
              GROUP BY peptide
              )
            """
            )
        for row in ker.fetchall():
            print(row[0])

        ker.execute("""
            SELECT count(*) || ' phosphopeptide matches were found', count(*) AS phosphopeptide_count
            FROM (
              SELECT phosphopeptide
              FROM uniprotkb_pep_ppep_view
              GROUP BY phosphopeptide
              )
            """
            )
        for row in ker.fetchall():
            print(row[0])

        con.commit()
        ker.execute('vacuum')
    finally:
        # Release the database handle on every exit path; closing without a
        # commit discards any partially written results.
        con.close()


if __name__ == "__main__":
    wrap_start_time = time.perf_counter()
    __main__()
    wrap_stop_time = time.perf_counter()
    # print(wrap_start_time)
    # print(wrap_stop_time)
    print("\nThe matching process took %d milliseconds to run.\n" % ((wrap_stop_time - wrap_start_time)*1000),)

# vim: sw=4 ts=4 et ai :
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/alpha_levels.tabular Mon Mar 07 19:05:01 2022 +0000 @@ -0,0 +1,3 @@ +0.05 +0.1 +0.2
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/pSTY_motifs.tabular Mon Mar 07 19:05:01 2022 +0000 @@ -0,0 +1,196 @@ +1 ((E|D|A)(D|E)(E|D)(E|D)pS(E|D|A)(D|E|A)(E|D)(E|D))|(pS.(E|pS|pT))|(pS..(E|pS|pT))|((pS|pT)..(E|D))|(pS(D|E).(D|E).(D|E))|((D|E)pS(D|E).(D|E))|(pS(D|E)(D|E)(D|E))|((pS|pT)..(D|E))|((pS|pT)..(E|D|pS|pY))|((S|E|P|G)(D|S|N|E|P)(E|D|G|Q|W)(Y|E|D|S|W|T)(W|E|D)pS(D|E)(D|E|W|N)(E|D)(E|D|N|Q)) Casein Kinase II substrate motif (HPRD) +2 ((L|F|I)...R(Q|S|T)L(pS|pT)(M|L|I|V))|(..B.R..pS..)|(pS...(pS|pT)) MAPKAPK2 kinase substrate motif (HPRD) +3 ((M|V|L|I|F)(R|K|H)...(pS|pT)...(M|V|L|I|F))|((M|V|L|I)..(R|K|H).(pS|pT)...(M|V|L|I))|((M|V|L|I|F)(R|K|H)..(pS|pT)...(M|V|L|I|F)) AMP-activated protein kinase substrate motif (HPRD) +4 ((P|L|I|M).(L|I|D|E)pSQ)|(LpSQE)|(pSQ) ATM kinase substrate motif (HPRD) +5 ((R|K).R..(pS|pT)(M|L|V|I))|(VFLGFpTYVAP) p70 Ribosomal S6 kinase substrate motif (HPRD) +6 ((R|K).R..pS)|(RRR.pS) MAPKAPK1 kinase substrate motif (HPRD) +7 ((R|K)pSP(R|P)(R|K|H))|((pS|pT)P.(R|K))|(HHH(R|K)pSPR(R|K)R) Cdc2 kinase substrate motif (HPRD) +8 ((R|N)(F|L|M)(R|K)(R|K)pS(R|I|V|M)(R|I|M|V)(M|I|F|V)(I|F|M))|(FR.(pS|pT))|(RF(R|K)(R|K)pS(R|I)(R|I)MI) NIMA kinase substrate motif (HPRD) +9 ((pS|pT)P.(K|R))|((K|R)(pS|pT)P)|((pS|pT)P(K|R)) Growth associated histone HI kinase substrate motif (HPRD) +10 (..(pS|pT)E)|(.(pS|pT)...(A|P|S|T)) G protein-coupled receptor kinase 1 substrate motif (HPRD) +11 (.R..(pS|pT).R.)|((pS|pT).(R|K))|((R|K)..(pS|pT))|((R|K)..(pS|pT).(R|K))|((K|R).(pS|pT))|((R|K).(pS|pT).(R|K)) PKC kinase substrate motif (HPRD) +12 (.pSQ)|(P(pS|pT).) 
DNA dependent Protein kinase substrate motif (HPRD) +13 (AKRRRLSpSLRA)|(VRKRpTLRRL) PAK1 kinase substrate motif (HPRD) +14 (ARKGpSLRQ)|(R(R|F)RR(R|K)GpSF(R|K)(R|K)) PKC alpha kinase substrate motif (HPRD) +15 (HpSTSDD)|(YRpSVDE) Branched chain alpha-ketoacid dehydrogenase kinase substrate motif (HPRD) +16 (KCSpTWP)|(R..pS)|(R.R..pS.P)|(YpTV)|(RS.(pS|pT).P)|(R.(Y|F).pS.P)|(RPVSSAApSVY) 14-3-3 domain binding motif (HPRD) +17 (KK.RRpT(L|V).)|(KKR.RpT(L|V).)|((R|K).RR.(pS|pT)(L|V).) DMPK1 kinase substrate motif (HPRD) +18 (KKKKKK(pS|pT)...)|((R|K|Q|N)(M|C|W)(R|T|S|N)(E|D|S|N)(R|K|E|D|N)pS(S|D|E)(S|GC|D)(SM|R|N)(N|H|S|R|C)) TGF beta receptor kinase substrate motif (HPRD) +19 (KRKQIpSVR)|((F|M|K)(R|K)(M|R|Q|F)(M|F|L|I)pS(F|I|M|L)(F|R|K)(L|I)(F|L|I))|((K|R)..pS(V|I)) Phosphorylase kinase substrate motif (HPRD) +20 (KRQGpSVRR)|(R(K|E|R).pS) PKC epsilon kinase substrate motif (HPRD) +21 (P.(pS|pT)P)|(pSP) ERK1, ERK2 Kinase substrate motif (HPRD) +22 (P.(pS|pT)PP)|(..P.(pS|pT)PPP.) ERK1,2 kinase substrate motif (HPRD) +23 (PL(pS|pT)PIP(K|R|H))|(PL(pS|pT)P.(K|R|H)) CDK4 kinase substrate motif (HPRD) +24 (PLpTLP)|(PLLpTP)|(PLpTP)|(PpTLP)|(PLpTLP)|(PpTLP)|(LpTP) RAF1 kinase substrate motif (HPRD) +25 (R..(pS|pT))|((K|F)(R|K)(Q|M)(Q|M|K|L|F)pS(F|I|M|L|V)(D|E|I)(L|M|K|I)(F|K))|((M|V|L|I|F).(R|K)..(pS|pT)..)|(R..pS) Calmodulin-dependent protein kinase II substrate motif (HPRD) +26 (R..pSPV)|(K(pS|pT)P.K)|(KpSP...K)|(KpSP..K)|(KpSP....K)|(KpTPAKEE)|(P.pSP)|(.(pS|pT)P)|(..pSP) GSK-3, ERK1, ERK2, CDK5 substrate motif (HPRD) +27 (R.R..(pS|pT)(F|L))|(R.R..(pS|pT))|(GRART(S|T)pSFAE)|((R|Q|K)(R|K|N|Q|P|H)(R|K)(R|S|T)(N|K|Q|H|D|P)pS(F|W|I|M|N|S)(S|T|H)(R|S|K)(S|T|P|Q))|((R|K).(R|K)(S|T).pS) Akt kinase substrate motif (HPRD) +28 (RR..pS)|(KR.RpS)|(KRR.pT) ZIP kinase substrate motif (HPRD) +29 
(RR.pS(M|I|L|V|F|Y))|(R.pS)|(KR..pS)|(R..pS)|((R|K).(pS|pT))|(K..(pS|pT))|((R|K)(R|K).(pS|pT))|(K...(pS|pT))|((pS|pT).(R|K))|(RRRRpSIIFI)|(RR.pS)|(R(R|K).(pS|pT)(I|L|V|F|Y)(D|C|.).D)|(RR.pS)|(RRR(R|N)pSII(F|D))|((R|C|P|K)(R|A|P)(R|K)(R|K|S)(N|L|S|M|P)Ps(I|L|V|C)(S|P|H|Q)(S|W|Q)(S|L|G)) PKA kinase substrate motif (HPRD) +30 (RRFGpSBRRF)|(RRFGpS(M|L|V|I|F)RR(M|L|V|I|F)) MEKK kinase substrate motif (HPRD) +31 (VPGKARKKpSSCQLL)|(PLARTLpSVAGLP)|((M|I|L|V|F|Y).R..(pS|pT)) Calmodulin-dependent protein kinase IV substrate motif (HPRD) +32 (pSD.E)|(pS..(E|D)) Casein kinase II substrate motif (HPRD) +33 (pSP..(pS|pT))|((D|E)..(pS|pT))|((pS|pT)..(S|T))|((pS|pT)...(S|T)(M|L|V|I|F)) Casein Kinase I substrate motif (HPRD) +34 (pTP.K)|((K|H|G)H(H|P)(K|G|H)pSP(R|K)(H|R|K)(R|H|K))|((pS|pT)PG(pS|pT)PGTP) CDK5 kinase substrate motif (HPRD) +35 (R|K).R..pS...(R|K) AMP-activated protein kinase 2 substrate motif (HPRD) +36 (R|K|N)R.(pS|pT)(M|L|V|I) Aurora-A kinase substrate motif (HPRD) +37 (D|E)(pS|pT)... b-Adrenergic Receptor kinase substrate motif (HPRD) +38 (M|V|L|I|F).R..(pS|pT)...(M|V|L|I|F) Calmodulin-dependent protein kinase I substrate motif (HPRD) +39 (M|I|L|V|F|Y).R..(pS|pT)(M|I|L|V|F|Y) Calmodulin-dependent protein kinase II alpha substrate motif (HPRD) +40 E(F|E)D(T|A|G)GpSI(I|F|Y|G)(I|G|F)(F|G)(F|P|L) Casein Kinase I delta substrate motif (HPRD) +41 Y(Y|E)(D|Y)(A|D)(A|G)pSI(I|Y|F|G)(I|G|F)(F|G)(F|P|L) Casein Kinase I gamma substrate motif (HPRD) +42 P.(pS|pT)PKK.KK Cdc2 like protein kinase substrate motif (HPRD) +43 (pS|pT)P.(R|K) CDK1,2, 4, 6 kinase substrate motif (HPRD) +44 pSP.(R|K). 
CDK kinase substrate motif (HPRD) +45 (M|I|L|V).(R|K)..(pS|pT) Chk1 kinase substrate motif (HPRD) +46 R..(pS|pT)..R CLK1 kinase substrate motif (HPRD) +47 (R|K).(R|K).(R|K).pS..R CLK1,2 kinase substrate motif (HPRD) +48 R(R|H)(R|H)(R|E)RE(R|H)pSR(R|D)L CLK2 kinase substrate motif (HPRD) +49 R..(pS|pT)(L|V)R DMPK1,2 kinase substrate motif (HPRD) +50 R(R|K)R(E|R)R(E|A)(H|R)pSRR(R|D)(L|E) DOA/CDC-like kinase 2 substrate motif (HPRD) +51 (I|L|V|F|M)RR..(pS|pT)(I|L|M|V|F) Doublecortin kinase-1 kinase substrate motif (HPRD) +52 E.pS.R..R elF2 alpha kinase substrate motif (HPRD) +53 (T|P|S)(G|P|E|Y)(P|L|I)(L|M|P)pSP(G|P|F)(P|F|G|Y)(F|Y|I) ERK1 kinase substrate motif (HPRD) +54 pTEpY ERK1 Kinase substrate motif (HPRD) +55 KpSPP ERK1, ERK2, SAPK, CDK5 and GSK3 kinase substrate motif (HPRD) +56 (D|Y|W|E)(C)(P|S|C|E)(P|C|S|L|T|V)(L|M|T)pS(P|A)(T|S|G|R|C|F)(W|P|S)(W|F) ERK2 kinase substrate motif (HPRD) +57 pS...pS GSK3 kinase substrate motif (HPRD) +58 P.pTP GSK3, Erk1, Erk2 and CDK5 kinase motif (HPRD) +59 (M|L|V|I|F)(R|K|H)..pS...(M|L|V|I|F) HMGCoA Reductase kinase substrate motif (HPRD) +60 GP(Q|M)pSPI JNK1 Kinase substrate motif (HPRD) +61 LRpT LKB1 Kinase substrate motif (HPRD) +62 pT(G|P|E)pY MAPK 11,13,14 Kinase substrate motif (HPRD) +63 KKR..pS.(R|K)(R|K) MLCK kinase substrate motif (HPRD) +64 FpTY mTOR kinase substrate motif (HPRD) +65 IRRLpSTRRR Nek 2 kinase substrate motif (HPRD) +66 (R|K)(R|.).(pS|pT) PAK2 kinase substrate motif (HPRD) +67 F..F(pS|pT)(F|Y) PDK1 kinase substrate motif (HPRD) +68 (R|K)(R|K)(R|K).(pS|pT). 
Pim1 kinase substrate sequence (HPRD) +69 (R|K)(R|K|A|Q|P)(R|K)(R|Q|H|N|Y)(P|H|K)pS(G|S|T)(P|S|G|Q|H|S|T)(S|P|Q|G|D)(T|S|P|G) Pim2 kinase substrate sequence (HPRD) +70 R(R|K).(pS|pT)B PKA, PKG kinase substrate motif (HPRD) +71 (L|R|F)(R|K)R(K|Q)GpS(F|M)KK.A PKC beta kinase substrate motif (HPRD) +72 R.RKGpSF PKC delta kinase substrate motif (HPRD) +73 AR..R(R|K)RpSFRR PKC eta kinase substrate motif (HPRD) +74 F..F(pS|pT)(F|Y) PKC family kinase substrate motif (HPRD) +75 RRRK(G|K)SF(R|K)(R|K)KA PKC gamma kinase substrate motif (HPRD) +76 (L|V)(V|L|A)R(Q|K|E)MpS PKC mu kinase substrate motif (HPRD) +77 (R|F|W|M)(W|A|K|S)(R|S|K|H)(R|H|S|Q)(R|K|N|P|G|Q)pS(I|F|R|V|K|S|L|M)(K|M|R|S|T)(R|S|K|W)(R|K|G) PKC theta kinase substrate motif (HPRD) +78 F.R..pS(F|M)(F|M) PKC zeta kinase substrate motif (HPRD) +79 (L|V|I)(R|K|Q)(R|K)(R|K|T|Q|M)(N|K|R|L|M|H)pS(F|W|I|M|L|V)(S|N)(R|S|P|Y|W)(S|R|N|L) PKD kinase substrate motif (HPRD) +80 R(R|K).(pS|pT)B PKG kinase substrate motif (HPRD) +81 R..(pS|pT).R..R PKR kinase substrate motif (HPRD) +82 (D|E).(pS|pT)(I|L|V|M).(D|E) Plk1 kinase substrate motif (HPRD) +83 .pS..D.. Pyruvate dehydrogenase kinase substrate motif (HPRD) +84 pTEY Dual specificity protein phosphatase 1 substrate motif (HPRD) +85 pT.pY Dual specificity protein phosphatase 6 substrate motif (HPRD) +86 RRA(pS|pT)VA PP2A, PP2C substrate motif (HPRD) +87 .R..pSVA PP2B substrate motif (HPRD) +88 .pT.pY. PP2C delta substrate motif (HPRD) +89 pS(D|E)(D|E)E BARD1 BRCT domain binding motif (HPRD) +90 DpSG..pS Beta-TrCP1 domain binding motif (HPRD) +91 pS(F|Y|H)(V|F|Y)(F|Y) BRCA1 BRCT domain binding motif (HPRD) +92 (I|L)(I|L|P)pTP(R|K) CDC4 WD40 domain binding motif (HPRD) +93 HFDpTYLI Chk2 FHA domain binding motif (HPRD) +94 (R|D|H)(L|Y)(L|M)(K|A)pT(Q|L|M|E|V)(K|L|I|R) FHA domain binding motif (HPRD) +95 S(pS|pT). MDC1 BRCT domain binding motif (HPRD) +96 S(pS|pT). 
Plk1 PBD domain binding motif (HPRD) +97 pSYII RAD9 BRCT domain binding motif (HPRD) +98 (pS|pT)P WW domain binding motif (HPRD) +99 ((pS|pT)P.(K|R))|((pS|pT)P(K|R)) CDK1_Phosida +100 (P.(pS|pT)P)|(V.(pS|pT)P)|(PE(pS|pT)P) ERK/MAPK_Phosida +101 (R(R|S|T).(pS|pT).(S|T))|(R.R..(pS|pT)) PKB/AKT_Phosida +102 (R.(pS|pT))|(R(R|K).(pS|pT))|(KR..(pS|pT)) PKA_Phosida +103 (R..(pS|pT))|(R..(pS|pT)V) CAMK2_Phosida +104 (S..(pS|pT))|((S|T)...pS) CK1_Phosida +105 (pS|pT)..E CK2_Phosida +106 pS...S GSK3_Phosida +107 (pS|pT)P.(K|R) CDK2_Phosida +108 R..(pS|pT).R PKC_Phosida +109 (L|V|I).(R|K)..(pS|pT) PKD_Phosida +110 (I|E|V)pY(E|G)(E|D|P|N)(I|V|L) LCK_Phosida +111 (I|V|L)pY..(P|F) ABL_Phosida +112 (E|D)..pY..(D|E|A|G|S|T) SRC_Phosida +113 pY..(I|L|V|M) ALK_Phosida +114 (D|P|S|A|E|N).pY(V|L|D|E|I|N|P) EGFR_Phosida +115 (R|K).(pS|pT)(I|L|V) AURORA_Phosida +116 (R|K|N)R.(pS|pT)(M|L|V|I) AURORA-A_Phosida +117 (D|E).(pS|pT)(V|I|L|M).(D|E) PLK_Phosida +118 (E|D).(pS|pT)(F|L|I|Y|W|V|M) PLK1_Phosida +119 L..(pS|pT) NEK6_Phosida +120 L.R..(pS|pT) CHK1/2_Phosida +121 (M|I|L|V).(R|K)..(pS|pT) CHK1_Phosida +122 F..F(pS|pT)(F|Y) PDK1_Phosida +123 (F|L|M)(R|K)(R|K)(pS|pT) NIMA_Phosida +124 ((D|E)(D|E)...pYVA)|((E|D|Y)pY) TC-PTP phosphatase substrate motif (HPRD) +125 ((D|E).(L|I|V).pY..(L|I|V))|((D|E).(L|I|V)..pY..(L|I|V))|((D|E)(D|E)(D|E|L).pY..(F|M|L|V|I)(D|E))|((D|E).pY)|((E|P)(F|I|L)pYA.(F|I|L|V)) SHP1 phosphatase substrate motif (HPRD) +126 ((D|E).......(D|E)..pY..L.......Y..(L|I))|((I|V|L|S).pY..(L|I)) Src family kinase substrate motif (HPRD) +127 ((D|E)pYpY(R|K))|(EFpY(G|A)TY(G|A))|(E(Y|F|D)pYM)|((E|P)(M|L|I|V|F)pY(G|A).(M|L|I|V|F|Y)A)|(RD.Y.TDYpYR)|(E(F|D|Y)pY) PTP1B phosphatase substrate motif (HPRD) +128 ((H|F).V.(T|S|A)pY)|((I|V|L).pY(F|M).P)|(pY(I|V).(I|V))|((I|L|V|M).pY(T|V|A).(I|V|L|F))|((I|V).pY(L|M|T)Y(A|P|T)SG)|(W(M|T|V)pY(Y|R)(I|L).) 
SHP2 N-terminal SH2 domain binding motif (HPRD) +129 ((V|I|L).pYA.(L|V))|(..pYYM(K|R)) SHP1 C-terminal SH2 domain binding motif (HPRD) +130 (.E.IpYGVLF)|(E.(I|V|L|F)pY(G|A)V(L|V|F|I)(F|L|V|I)) Lck kinase substrate motif (HPRD) +131 (DEEIpY(E|G)EL.)|((D|E).......(D|E)..pY..L.......Y..(L|I)) Lyn kinase substrate motif (HPRD) +132 (EE(D|E)IpYFFFF)|(...IpY(M|I|F)FFF) CSK kinase substrate motif (HPRD) +133 (EEEEpYFELV)|((E|D|R|A)(D|E)(D|E)(E|D|I)pY(F|V|I|E)(E|F|D)(L|I|F|V)V)|(.(D|E)pY.)|(pYIPP)|(.(D|E)pY(I|L|V)) EGFR kinase substrate motif (HPRD) +134 (EEEEpYVFI.)|((L|N)(R|I)TpY)|((D|E)(D|E)(D|E)(D|E)pY(V|E|I)F(I|V|F)) PDGFR kinase substrate motif (HPRD) +135 (EEEIpYEEIE)|((E|A|D)(E|A)(E|A)(I|E|V)pY(D|E)(D|E)(I|V|E)(E|I|V)) Fes kinase substrate motif (HPRD) +136 (EEEpYFFLF)|(A(E|A)EEpY(F|V)F(L|F|M|I|V)F) FGFR kinase substrate motif (HPRD) +137 (L(Y|H)pY(M|F).(F|M))|(L.pYA.L) SHP1 N-terminal SH2 domain binding motif (HPRD) +138 (pY(M|L|E)EP)|(pYESP) Vav SH2 domain binding motif (HPRD) +139 (pY(Y|I|V)N(F|L|I|V))|(pY(Q|Y|V)N(Y|Q|F))|(pY.N) Grb2 SH2 domain binding motif (HPRD) +140 (pY..P)|(pYDHP) Crk SH2 domain binding motif (HPRD) +141 (pY..Q)|(pY(M|L|V|I|F)(P|R|K|H)Q) STAT3 SH2 domain binding motif (HPRD) +142 (pY..YY)|(pY(D|E).(I|L|V|M))|((D|E)..pY)|(pY....(F|Y)) ALK kinase substrate motif (HPRD) +143 (pYIDL)|(pYASI)|(EFpYA.(V|I)G(R|K|H)S) SHP2 phosphatase substrate motif (HPRD) +144 (pYM.M)|(EDAIpY)|(.VIpYAAPF)|(EAIpYAAPF)|(EEIpYEEpY)|(E.IpY..P.)|(EEIpYYYVH)|(ERIpYARTK)|(AEV(I|V|L|F)pYAA(P|F)F) Abl kinase substrate motif (HPRD) +145 (pYM.M)|(EE(E|N|D)pY(M|F)(M|F)(M|F|I|E)(M|F))|(.EEEpYMMMM)|(KKSRGDpYMTMQIG)|(KKKLPATGDpYMNMSPVGD) Insulin receptor kinase substrate motif (HPRD) +146 (pYM.M)|(YIpYGSFK)|(EEEIpY(G|E)EFD)|(D(D|E)(E|D|G)(I|V|L)pY(G|E)E(F|I)F)|((D|E).......(D|E)..pY..L.......Y..(L|I))|((D|E)(D|E)(E|D|G)(I|V|L)pY(G|E|D)E(F|I|L|V)(D|E))|(pY(A|G|S|T|D|E)) Src kinase substrate motif (HPRD) +147 (pYM.M)|(pY..M)|(pYMPMS) PI3 Kinase p85 SH2 domain binding motif (HPRD) 
+148 ME(E|N)(I|V)pY(G|E)IFF Fgr kinase substrate motif (HPRD) +149 KKKSPGEpYVNIEFG IGF1 receptor kinase substrate motif (HPRD) +150 pY..(L|I|V) JAK2 kinase substrate motif (HPRD) +151 pTPpY JNK kinase substrate motif (HPRD) +152 (E|D|pT|pY).pYEE Syk kinase substrate motif (HPRD) +153 DpYpYR PTP1B, TC-PTP phosphatase substrate motif (HPRD) +154 (D|E)FpY(G|A)(F|Y)(A|G) PTPRH phosphatase substrate motif (HPRD) +155 F(M|L|V|I)pY PTPRJ phosphatase substrate motif (HPRD) +156 pY(E|M|V)(N|V|I) 3BP2 SH2 domain binding motif (HPRD) +157 pYENP Abl SH2 domain binding motif (HPRD) +158 pY(T|A|S)(K|R|Q|N)(M|I|V|R) Csk SH2 domain binding motif (HPRD) +159 pYE.(V|I) Fes SH2 domain binding motif (HPRD) +160 pYEE(I|V) Fgr SH2 domain binding motif (HPRD) +161 pYEDP Fyn SH2 domain binding motif (HPRD) +162 pY(M|I|L|V).(M|I|L|V) GRB2, 3BP2, Csk, Fes, Syk C-terminal SH2 domain binding motif (HPRD) +163 (F|Y)pY(E|T|Y|S)N(I|L|V|P|T|Y|S) GRB7, GRB10 SH2 domain binding motif (HPRD) +164 pYF.(F|P|L|Y) HCP SH2 domain binding motif (HPRD) +165 pY(A|E|V)(Y|F|E|S|N|V)(P|F|I|H) Itk SH2 domain binding motif (HPRD) +166 pYDYV Lck and Src SH2 domain binding motif (HPRD) +167 pYDEP Nck SH2 domain binding motif (HPRD) +168 pY(L|I|V)E(L|I|V) PLCgamma C and N-terminal SH2 domain binding motif (HPRD) +169 pY..P RasGAP C-terminal SH2 domain binding motif (HPRD) +170 pYILV.(M|L|I|V|P) RasGAP N-terminal SH2 domain binding motif (HPRD) +171 TIpY..(V|I) SAP and EAT2 SH2 domain binding motif (HPRD) +172 pY(L|V)N(V|P) Sem5 SH2 domain binding motif (HPRD) +173 pY(T|V|I).L Shb SH2 domain binding motif (HPRD) +174 pY(I|E|Y|L).(I|L|M) SHC SH2 domain binding motif (HPRD) +175 (I|V|L|S).pY..(L|I) SHIP2 SH2 domain binding motif (HPRD) +176 (I|V).pY..(L|V) SHP1 SH2 domain binding motif (HPRD) +177 (V|I|L).pY(M|L|F).P SHP1, SHP2 SH2 domain binding motif (HPRD) +178 (T|V|I|Y).pY(A|S|T|V).(I|V|L) SHP2 CSH2 domain binding motif (HPRD) +179 (I|L|V)(I|L|V)(I|L|V|F|T|Y)pY(T|I|L|V)(I|L)(I|L|V|P) SHP2 C-terminal SH2 domain 
binding motif (HPRD) +180 pYIPP SHP2, PLCgamma SH2 domain binding motif (HPRD) +181 pYM.M Src and Abl SH2 domain binding motif (HPRD) +182 pY(R|K|H|Q|E|D)(R|K|H|Q|E|D)(I|P) Src, Fyn, Lck, Fgr, Abl, Crk, Nck SH2 domain binding motif (HPRD) +183 PP.pY Src, Fyn,Csk, Nck and SHC SH2 domain binding motif (HPRD) +184 pYEEI Src,Lck and Fyn SH2 domains binding motif (HPRD) +185 pY(D|E)(P|R)(R|P|Q) STAT1 SH2 domain binding motif (HPRD) +186 pY(Q|T|E)(E|Q)(L|I) Syk C-terminal SH2 domain binding motif (HPRD) +187 pYTT(I|L|M) Syk N-terminal SH2 domain binding motif (HPRD) +188 (D|E).......(D|E)..pY..L.......Y..(L|I) Syk, ZAP-70, Shc, Lyn SH2 domain binding motif (HPRD) +189 pYEN(F|I|V) Tensin SH2 domain binding motif (HPRD) +190 D(N|D).pY Cbl PTB domain binding motif (HPRD) +191 N.LpY Dok1 PTB domain binding motif (HPRD) +192 N..pY FRIP PTB domain binding motif (HPRD) +193 NP.pY Shc PTB domain binding motif (HPRD) +194 DD.pY Shb PTB domain binding motif (HPRD) +195 NP.pYF.R ShcA PTB domain binding motif (HPRD) +196 HN(M|L|V|I)(M|L|V|I|N)NP(S|T)pY ShcC PTB domain binding motif (HPRD)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_input_for_anova.tabular Mon Mar 07 19:05:01 2022 +0000 @@ -0,0 +1,23 @@ +Phosphopeptide Sequence10 Sequence7 Gene_Name Phosphoresidue UniProt_ID Description Function Phosphoresidue(PSP=PhosphoSitePlus.org) Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/Phosphatases/Binding Domains Intensity.shL.1A Intensity.shL.1B Intensity.shL.1C Intensity.shR.2A Intensity.shR.2B Intensity.shR.2C +AAAAPDSRVpSEEENLK MAAAAPDSRVpSEEENLKKTPK AAPDSRVsEEENLKK RRP15 pS11 Q9Y3B9 RRP15_HUMAN RRP15-like protein OS=Homo sapiens OX=9606 GN=RRP15 PE=1 SV=2 N/A CK2alpha | Casein kinase II substrate | G protein-coupled receptor kinase 1 substrate | PKC kinase substrate | PKA kinase substrate | BARD1 BRCT domain binding | PKA | CK1 | CK2 38150000 39445000 56305000 55338000 7010600 70203000 +AAAITDMADLEELSRLpSPLPPGpSPGSAAR MADLEELSRLpSPLPPGSPGSA; LSRLSPLPPGpSPGSAARGRAE LEELSRLsPLPPGSP | LSPLPPGsPGSAARG AEBP2; AEBP2 pS18, pS24; pS18, pS24 Q6ZN18; Q6ZN18-2 AEBP2_HUMAN Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2 PE=1 SV=2; AEBP2_HUMAN Isoform 2 of Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2 N/A N/A 5416400 7101800 385280000 208060000 41426000 352400000 +ADALQAGASQFETpSAAK LQAGASQFETpSAAKLKRKYWW GASQFETsAAKLKRK VAMP2; VAMP3 pS80; pS63 P63027; Q15836 VAMP2_HUMAN_Vesicle-associated membrane protein 2 OS=Homo sapiens OX=9606 GN=VAMP2 PE=1 SV=3; VAMP3_HUMAN_Vesicle-associated membrane protein 3 OS=Homo sapiens OX=9606 GN=VAMP3 PE=1 SV=3 N/A PKD3 | PKCiota 44627000 41445000 69094000 42521000 5738000 61819000 +DQKLpSELDDR DKVLERDQKLpSELDDRADALQ LERDQKLsELDDRAD VAMP1; VAMP1; VAMP1; VAMP2; VAMP3 pS63; pS63; pS63; pS61; pS44 P23763; P23763-2; P23763-3; P63027; Q15836 VAMP1_HUMAN_Vesicle-associated membrane protein 1 OS=Homo sapiens OX=9606 GN=VAMP1 PE=1 SV=1; VAMP1_HUMAN_Isoform 3 of Vesicle-associated membrane protein 1 OS=Homo sapiens OX=9606 GN=VAMP1; VAMP1_HUMAN_Isoform 2 of Vesicle-associated membrane 
protein 1 OS=Homo sapiens OX=9606 GN=VAMP1; VAMP2_HUMAN_Vesicle-associated membrane protein 2 OS=Homo sapiens OX=9606 GN=VAMP2 PE=1 SV=3; VAMP3_HUMAN_Vesicle-associated membrane protein 3 OS=Homo sapiens OX=9606 GN=VAMP3 PE=1 SV=3 N/A CK2alpha | PKAbeta | PKAgamma | PKCiota | Casein kinase II substrate | G protein-coupled receptor kinase 1 substrate | PKC kinase substrate | PKA kinase substrate | Pyruvate dehydrogenase kinase substrate 75542000 44814000 32924000 35016000 11023000 4669900 +EFVpSSDESSSGENK SESFKSKEFVpSSDESSSGENK FKSKEFVsSDESSSG SSRP1 pS667 Q08945 SSRP1_HUMAN FACT complex subunit SSRP1 OS=Homo sapiens OX=9606 GN=SSRP1 PE=1 SV=1 N/A CK2alpha | CK2a2 | CDK7 | Casein kinase II substrate | G protein-coupled receptor kinase 1 substrate | Casein Kinase I substrate | CK2 | GSK3 12562000 16302000 23000000 7857800 0 18830000 +EGMNPSYDEYADpSDEDQHDAYLER MNPSYDEYADpSDEDQHDAYLE SYDEYADsDEDQHDA SSRP1 pS444 Q08945 SSRP1_HUMAN FACT complex subunit SSRP1 OS=Homo sapiens OX=9606 GN=SSRP1 PE=1 SV=1 N/A CK2alpha | CK2a2 | CDK7 | CK1alpha | Casein kinase II substrate | b-Adrenergic Receptor kinase substrate | Pyruvate dehydrogenase kinase substrate 0 0 0 0 0 0 +IGNEEpSDLEEACILPHpSPINVDK DDEEKIGNEEpSDLEEACILPH; DLEEACILPHpSPINVDKRPIA EKIGNEEsDLEEACI | EACILPHsPINVDKR HERC2 pS1577, pS1588 O95714 HERC2_HUMAN E3 ubiquitin-protein ligase HERC2 OS=Homo sapiens OX=9606 GN=HERC2 PE=1 SV=2 N/A CK2alpha | Casein kinase II substrate | ERK1, ERK2 Kinase substrate | GSK-3, ERK1, ERK2, CDK5 substrate | b-Adrenergic Receptor kinase substrate | WW domain binding | ERK/MAPK | CK2 | NEK6 167764000 121218000 155736000 140640000 83642000 128468000 +IRAEEEDLAAVPFLApSDNEEEEDEK EDLAAVPFLApSDNEEEEDEKG AAVPFLAsDNEEEED HERC2 pS2928 O95714 HERC2_HUMAN E3 ubiquitin-protein ligase HERC2 OS=Homo sapiens OX=9606 GN=HERC2 PE=1 SV=2 N/A CK2alpha | Casein kinase II substrate | CK2 22562000 18225000 9119700 11689000 0 0 +KGLLApTpSGNDGTIR VWCNKKGLLApTSGNDGTIRVW; WCNKKGLLATpSGNDGTIRVWN NKKGLLAtSGNDGTI | 
KKGLLATsGNDGTIR HERC1 pT3445, pS3446 Q15751 HERC1_HUMAN Probable E3 ubiquitin-protein ligase HERC1 OS=Homo sapiens OX=9606 GN=HERC1 PE=1 SV=2 N/A N/A 7843600 0 241700000 0 0 10042600 +KpSSLVTSK PTPQDLPQRKpSSLVTSKLAGG; PTPQDLPQRKpSSLVTSKLAG QDLPQRKsSLVTSKL ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA pS108; pS108; pS124; pS131; pS104; pS104; pS120; pS124 O43768; O43768-2; O43768-3; O43768-4; O43768-5; O43768-6; O43768-7; O43768-9 ENSA_HUMAN Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA PE=1 SV=1; ENSA_HUMAN Isoform 2 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 3 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 4 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 5 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 6 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 7 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 9 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA N/A G protein-coupled receptor kinase 1 substrate 0 0 18629000 0 0 0 +KSpSLVTSK TPQDLPQRKSpSLVTSKLAGGQ; TPQDLPQRKSpSLVTSKLAG DLPQRKSsLVTSKLA ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA pS109; pS109; pS125; pS132; pS105; pS105; pS121; pS125 O43768; O43768-2; O43768-3; O43768-4; O43768-5; O43768-6; O43768-7; O43768-9 ENSA_HUMAN Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA PE=1 SV=1; ENSA_HUMAN Isoform 2 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 3 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 4 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 5 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 6 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 7 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 9 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA molecular association, regulation; protein 
conformation; SNCA(DISRUPTS) G protein-coupled receptor kinase 1 substrate | PKC kinase substrate | PKA kinase substrate | Casein Kinase I substrate | MDC1 BRCT domain binding | GSK3 | AURORA 7090300 8341200 9691500 10030000 1675200 9952100 +LpSPNPWQEK MLAVDIEDRLpSPNPWQEKREI VDIEDRLsPNPWQEK HERC2 pS3462 O95714 HERC2_HUMAN E3 ubiquitin-protein ligase HERC2 OS=Homo sapiens OX=9606 GN=HERC2 PE=1 SV=2 N/A ERK1, ERK2 Kinase substrate | GSK-3, ERK1, ERK2, CDK5 substrate | WW domain binding 0 11706000 12495000 0 7273000 8877800 +NLLEDDpSDEEEDFFLR SERRNLLEDDpSDEEEDFFLRG RNLLEDDsDEEEDFF VAMP4 pS30 O75379 VAMP4_HUMAN_Vesicle-associated membrane protein 4 OS=Homo sapiens OX=9606 GN=VAMP4 PE=1 SV=2 N/A CK2alpha | Casein kinase II substrate | Casein Kinase I substrate | b-Adrenergic Receptor kinase substrate | BARD1 BRCT domain binding | CK2 | Csnk2a1 1592100000 973800000 1011600000 1450300000 631970000 878760000 +pSQKQEEENPAEETGEEK MpSQKQEEENPAE ______MsQKQEEEN ENSA; ENSA; ENSA; ENSA; ENSA; ENSA pS2; pS2; pS2; pS2; pS2; pS2 O43768; O43768-2; O43768-3; O43768-4; O43768-8; O43768-9 ENSA_HUMAN Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA PE=1 SV=1; ENSA_HUMAN Isoform 2 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 3 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 4 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 8 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 9 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA N/A ATM kinase substrate | PKC kinase substrate | PKA kinase substrate 0 0 8765300 0 2355900 14706000 +QLSEpSFK SKSSSRQLSEpSFKSKEFVSSD SSRQLSEsFKSKEFV SSRP1 pS659 Q08945 SSRP1_HUMAN FACT complex subunit SSRP1 OS=Homo sapiens OX=9606 GN=SSRP1 PE=1 SV=1 N/A CK2a2 | CDK7 | PKCalpha | PKCbeta | DNAPK | PKC kinase substrate | PKA kinase substrate | NEK6 68201000 87774000 138300000 95357000 19966000 149110000 +RGpSLEMSSDGEPLSR SSATSGGRRGpSLEMSSDGEPL TSGGRRGsLEMSSDG 
AEBP2; AEBP2 pS206; pS206 Q6ZN18; Q6ZN18-2 AEBP2_HUMAN Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2 PE=1 SV=2; AEBP2_HUMAN Isoform 2 of Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2 N/A Casein Kinase II substrate | G protein-coupled receptor kinase 1 substrate | PKC kinase substrate | PKA kinase substrate | PKA | GSK3 | AURORA 19262000 11103000 19454000 0 1816900 22028000 +SDGpSLEDGDDVHR IEDGGARSDGpSLEDGDDVHRA GGARSDGsLEDGDDV SERINC1 pS364 Q9NRX5 SERC1_HUMAN Serine incorporator 1 OS=Homo sapiens OX=9606 GN=SERINC1 PE=1 SV=1 N/A Casein kinase II substrate | Plk1 kinase substrate | Pyruvate dehydrogenase kinase substrate | CK1 | PLK | PLK1 31407000 17665000 20892000 23194000 5132400 54893000 +SEpSLTAESR EGGGLMTRSEpSLTAESRLVHT GLMTRSEsLTAESRL HERC1 pS1491 Q15751 HERC1_HUMAN Probable E3 ubiquitin-protein ligase HERC1 OS=Homo sapiens OX=9606 GN=HERC1 PE=1 SV=2 N/A b-Adrenergic Receptor kinase substrate 11766000 13176000 20540000 16963000 4364700 21308000 +STGPTAATGpSNRR MSTGPTAATGpSNRRLQQTQNQ GPTAATGsNRRLQQT VAMP3 pS11 Q15836 VAMP3_HUMAN_Vesicle-associated membrane protein 3 OS=Homo sapiens OX=9606 GN=VAMP3 PE=1 SV=3 N/A PKCalpha | PKCbeta | PKCzeta | PKC kinase substrate | PKA kinase substrate 3057100 4718800 12052000 5047700 1070900 8333500 +TEDLEATpSEHFK RNKTEDLEATpSEHFKTTSQKV TEDLEATsEHFKTTS VAMP8 pS55 Q9BV40 VAMP8_HUMAN_Vesicle-associated membrane protein 8 OS=Homo sapiens OX=9606 GN=VAMP8 PE=1 SV=1 activity, inhibited; abolish function in SNARE complex during mast cell secretion, reduces in vitro ensemble vesicle fusion G protein-coupled receptor kinase 1 substrate | Casein Kinase I substrate 20400000 9738500 7862300 0 0 76518000 +TFWpSPELK SSMNSIKTFWpSPELKKERVLR NSIKTFWsPELKKER ERC2 pS187 O15083 ERC2_HUMAN ERC protein 2 OS=Homo sapiens OX=9606 GN=ERC2 PE=1 SV=3 N/A IKKalpha | IKKbeta | HIPK2 | Casein Kinase II substrate | ERK1, ERK2 Kinase substrate | GSK-3, ERK1, ERK2, CDK5 substrate | WW domain binding 29764000 20957000 24855000 
30752000 8304800 23771000 +YFDpSGDYNMAK CADEMQKYFDpSGDYNMAKAKM; RLQKGQKYFDpSGDYNMAKAKM; MKSVEQKYFDpSGDYNMAKAKM EMQKYFDsGDYNMAK | KGQKYFDsGDYNMAK | VEQKYFDsGDYNMAK ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA pS67; pS67; pS83; pS90; pS63; pS63; pS79; pS83 O43768; O43768-2; O43768-3; O43768-4; O43768-5; O43768-6; O43768-7; O43768-9 ENSA_HUMAN Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA PE=1 SV=1; ENSA_HUMAN Isoform 2 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 3 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 4 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 5 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 6 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 7 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 9 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA molecular association, regulation; cell cycle regulation; PPP2CA(INDUCES) b-Adrenergic Receptor kinase substrate 323250000 127970000 0 67123000 12790000 71378000
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_input_for_preproc.tabular Mon Mar 07 19:05:01 2022 +0000 @@ -0,0 +1,38 @@ +Proteins Positions within proteins Leading proteins Protein Fasta headers Localization prob Score diff PEP Score Delta score Score for localization Localization prob shL.1A Score diff shL.1A PEP shL.1A Score shL.1A Localization prob shL.1B Score diff shL.1B PEP shL.1B Score shL.1B Localization prob shL.1C Score diff shL.1C PEP shL.1C Score shL.1C Localization prob shR.2A Score diff shR.2A PEP shR.2A Score shR.2A Localization prob shR.2B Score diff shR.2B PEP shR.2B Score shR.2B Localization prob shR.2C Score diff shR.2C PEP shR.2C Score shR.2C Diagnostic peak Number of Phospho (STY) Amino acid Sequence window Modification window Peptide window coverage Phospho (STY) Probabilities Phospho (STY) Score diffs Position in peptide Charge Mass error [ppm] Identification type shL.1A Identification type shL.1B Identification type shL.1C Identification type shR.2A Identification type shR.2B Identification type shR.2C Intensity Intensity___1 Intensity___2 Intensity___3 Ratio mod/base Intensity shL.1A Intensity shL.1B Intensity shL.1C Intensity shR.2A Intensity shR.2B Intensity shR.2C Ratio mod/base shL.1A Ratio mod/base shL.1B Ratio mod/base shL.1C Ratio mod/base shR.2A Ratio mod/base shR.2B Ratio mod/base shR.2C Intensity shL.1A___1 Intensity shL.1A___2 Intensity shL.1A___3 Intensity shL.1B___1 Intensity shL.1B___2 Intensity shL.1B___3 Intensity shL.1C___1 Intensity shL.1C___2 Intensity shL.1C___3 Intensity shR.2A___1 Intensity shR.2A___2 Intensity shR.2A___3 Intensity shR.2B___1 Intensity shR.2B___2 Intensity shR.2B___3 Intensity shR.2C___1 Intensity shR.2C___2 Intensity shR.2C___3 Occupancy shL.1A Occupancy ratioshL.1A Occupancy error scale shL.1A Occupancy shL.1B Occupancy ratioshL.1B Occupancy error scale shL.1B Occupancy shL.1C Occupancy ratioshL.1C Occupancy error scale shL.1C Occupancy shR.2A Occupancy ratioshR.2A Occupancy error 
scale shR.2A Occupancy shR.2B Occupancy ratioshR.2B Occupancy error scale shR.2B Occupancy shR.2C Occupancy ratioshR.2C Occupancy error scale shR.2C Reverse Potential contaminant id Protein group IDs Positions Position Peptide IDs Mod. peptide IDs Evidence IDs MS/MS IDs Best localization evidence ID Best localization MS/MS ID Best localization raw file Best localization scan number Best score evidence ID Best score MS/MS ID Best score raw file Best score scan number Best PEP evidence ID Best PEP MS/MS ID Best PEP raw file Best PEP scan number +sp|O43768-2|ENSA_HUMAN;sp|O43768|ENSA_HUMAN;sp|O43768-9|ENSA_HUMAN;sp|O43768-3|ENSA_HUMAN;sp|O43768-4|ENSA_HUMAN;sp|O43768-6|ENSA_HUMAN;sp|O43768-5|ENSA_HUMAN;sp|O43768-7|ENSA_HUMAN 108;108;124;124;131;104;104;120 sp|O43768-2|ENSA_HUMAN sp|O43768-2|ENSA_HUMAN 0.877317 8.54376 0.001041 110.11 55.028 110.11 1 S TGDHIPTPQDLPQRKSSLVTSKLAG______ X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXXXPPPPPPPPXXXXXXXXX KS(0.877)S(0.123)LVTSK KS(8.54)S(-8.54)LVT(-58.58)S(-72.01)K 2 2 0.022801 By MS/MS 18629000 18629000 0 0 0 0 18629000 0 0 0 0 0 0 0 0 0 18629000 0 0 0 0 0 0 0 0 0 0 0 700 529 108 108 12310;20039 13742;22688 99166 91729 99166 91729 QE05099 5593 99166 91729 QE05099 5593 99166 91729 QE05099 5593 +sp|O43768-2|ENSA_HUMAN;sp|O43768|ENSA_HUMAN;sp|O43768-9|ENSA_HUMAN;sp|O43768-3|ENSA_HUMAN;sp|O43768-4|ENSA_HUMAN;sp|O43768-6|ENSA_HUMAN;sp|O43768-5|ENSA_HUMAN;sp|O43768-7|ENSA_HUMAN 109;109;125;125;132;105;105;121 sp|O43768-2|ENSA_HUMAN sp|O43768-2|ENSA_HUMAN 0.877764 9.23011 0.00135208 98.182 25.939 55.754 1 S GDHIPTPQDLPQRKSSLVTSKLAG_______ X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXXPPPPPPPPXXXXXXXXXX KS(0.105)S(0.878)LVT(0.015)S(0.002)K KS(-9.23)S(9.23)LVT(-17.65)S(-25.69)K 3 2 -0.061619 By MS/MS By MS/MS By matching By matching By matching By MS/MS 81973000 81973000 0 0 7090300 8341200 9691500 10030000 1675200 9952100 7090300 0 0 8341200 0 0 9691500 
0 0 10030000 0 0 1675200 0 0 9952100 0 0 701 529 109 109 12310;20039 13742;22688 99164;99165;99168;99169;160369;160370;160371;160372;160373;160374 91727;91728;91731;142479 99164 91727 QE05097 5219 99167 91730 QE05100 5516 99167 91730 QE05100 5516 +CON__P02662 46 CON__P02662 CON__P02662 0.99978 36.4544 1.10E-08 122.19 116.48 122.19 2 S VFGKEKVNELSKDIGSESTEDQAMEDIKQME X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;Phospho (STY);X;X;X;X;X;Oxidation (M);X;X;X;X;X;X;X XXXXXXXXXXXXPPPPPPPPPPPPPPPPXXX DIGS(1)ES(0.972)T(0.029)EDQAMEDIK DIGS(36.45)ES(15.33)T(-15.33)EDQAMEDIK 4 2 0.56139 By MS/MS By MS/MS By MS/MS 49187000 0 49187000 0 NaN 16494000 0 20139000 0 0 12553000 NaN NaN NaN NaN NaN NaN 0 16494000 0 0 0 0 0 20139000 0 0 0 0 0 0 0 0 12553000 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN + 2 14 46 46 3452 3862;3863 27864;27865;27866;27867 25820;25821;25822;25823 27865 25821 QE05099 36641 27865 25821 QE05099 36641 27865 25821 QE05099 36641 +CON__P02662 48 CON__P02662 CON__P02662 0.971522 15.3284 1.10E-08 122.19 116.48 122.19 2 S GKEKVNELSKDIGSESTEDQAMEDIKQMEAE X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;Phospho (STY);X;X;X;X;X;Oxidation (M);X;X;X;X;X;X;X;X;X XXXXXXXXXXPPPPPPPPPPPPPPPPXXXXX DIGS(1)ES(0.972)T(0.029)EDQAMEDIK DIGS(36.45)ES(15.33)T(-15.33)EDQAMEDIK 6 2 0.56139 By MS/MS By MS/MS By MS/MS 49187000 0 49187000 0 NaN 16494000 0 20139000 0 0 12553000 NaN NaN NaN NaN NaN NaN 0 16494000 0 0 0 0 0 20139000 0 0 0 0 0 0 0 0 12553000 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN + 3 14 48 48 3452 3862;3863 27864;27865;27866;27867 25820;25821;25822;25823 27865 25821 QE05099 36641 27865 25821 QE05099 36641 27865 25821 QE05099 36641 +CON__P02662 115 CON__P02662 CON__P02662 1 50.1781 4.91E-07 124.08 88.205 50.178 1 S RLKKYKVPQLEIVPNSAEERLHSMKEGIHAQ X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXPPPPPPPPPPPPPPXXXXXXXXXXX VPQLEIVPNS(1)AEER VPQLEIVPNS(50.18)AEER 10 3 -0.26085 By MS/MS By 
matching By MS/MS By matching By matching By MS/MS 228160000 228160000 0 0 NaN 36938000 3667100 7945800 0 2359500 8418700 NaN NaN NaN NaN NaN NaN 36938000 0 0 3667100 0 0 7945800 0 0 0 0 0 2359500 0 0 8418700 0 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN + 4 14 115 115 23142 26196 185609;185610;185611;185612;185613;185614;185615 165233;165234;165235;165236 185612 165236 QE05102 41518 185610 165234 QE05097 41110 185610 165234 QE05097 41110 +sp|O43768-2|ENSA_HUMAN;sp|O43768|ENSA_HUMAN;sp|O43768-9|ENSA_HUMAN;sp|O43768-3|ENSA_HUMAN;sp|O43768-4|ENSA_HUMAN;sp|O43768-8|ENSA_HUMAN 2;2;2;2;2;2 sp|O43768-2|ENSA_HUMAN sp|O43768-2|ENSA_HUMAN 1.0 73.249 3.69e-06 83.395 74.925 83.395 1 S ______________MSQKQEEENPAEETGEE X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXXXXPPPPPPPPPPPPPPPP S(1)QKQEEENPAEETGEEK S(73.25)QKQEEENPAEET(-73.25)GEEK 1 2 -0.84902 By matching By matching By MS/MS 25828000 25828000 0 0 0 0 8765300 0 2355900 14706000 0 0 0 0 0 0 8765300 0 0 0 0 0 2355900 0 0 14706000 0 0 702 529 2 2 19781 22398 158249;158250;158251 140920 158249 140920 QE05102 12907 158249 140920 QE05102 12907 158249 140920 QE05102 12907 +sp|O43768-2|ENSA_HUMAN;sp|O43768|ENSA_HUMAN;sp|O43768-9|ENSA_HUMAN;sp|O43768-3|ENSA_HUMAN;sp|O43768-4|ENSA_HUMAN;sp|O43768-6|ENSA_HUMAN;sp|O43768-5|ENSA_HUMAN;sp|O43768-7|ENSA_HUMAN;sp|P56211-2|ARP19_HUMAN;sp|P56211|ARP19_HUMAN 67;67;83;83;90;63;63;79;46;62 sp|O43768-2|ENSA_HUMAN;sp|P56211-2|ARP19_HUMAN sp|O43768-2|ENSA_HUMAN 0.999907 42.1841 4.04e-05 77.894 72.756 77.894 1 S DFLMKRLQKGQKYFDSGDYNMAKAKMKNKQL;DFLRKRLQKGQKYFDSGDYNMAKAKMKNKQL X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXPPPPPPPPPPPXXXXXXXX YFDS(1)GDYNMAK Y(-44.9)FDS(42.18)GDY(-42.18)NMAK 4 2 0.090313 By MS/MS By MS/MS By matching By MS/MS By MS/MS 602510000 602510000 0 0 323250000 127970000 0 67123000 12790000 71378000 323250000 0 0 127970000 0 0 0 0 0 67123000 0 0 12790000 0 0 71378000 0 
0 703 529;2007 67;46 67 23817 26932 190543;190544;190545;190546;190547 169398;169399;169400;169401 190543 169398 QE05097 28697 190543 169398 QE05097 28697 190543 169398 QE05097 28697 +sp|O95714|HERC2_HUMAN;sp|Q9BVR0|HRC23_HUMAN 1577;304 sp|O95714|HERC2_HUMAN sp|O95714|HERC2_HUMAN 1.0 100.152 1.12e-15 100.15 94.415 100.15 2 S KPESTDDEEKIGNEESDLEEACILPHSPINV X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X XXXXXXXXXXPPPPPPPPPPPPPPPPPPPPP IGNEES(1)DLEEACILPHS(1)PINVDK IGNEES(100.15)DLEEACILPHS(100.15)PINVDK 6 3 -0.31776 By matching By matching By matching By matching By MS/MS By MS/MS 398730000 0 398730000 0 83882000 60609000 77868000 70320000 41821000 64234000 0 83882000 0 0 60609000 0 0 77868000 0 0 70320000 0 0 41821000 0 0 64234000 0 1295 867 1577 1577 11517 12858 93270;93271;93272;93273;93274;93275 86700;86701 93271 86701 QE05102 51298 93271 86701 QE05102 51298 93271 86701 QE05102 51298 +sp|O95714|HERC2_HUMAN;sp|Q9BVR0|HRC23_HUMAN 1588;315 sp|O95714|HERC2_HUMAN sp|O95714|HERC2_HUMAN 1.0 100.152 1.12e-15 100.15 94.415 100.15 2 S GNEESDLEEACILPHSPINVDKRPIAIKSPK X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X PPPPPPPPPPPPPPPPPPPPPPXXXXXXXXX IGNEES(1)DLEEACILPHS(1)PINVDK IGNEES(100.15)DLEEACILPHS(100.15)PINVDK 17 3 -0.31776 By matching By matching By matching By matching By MS/MS By MS/MS 398730000 0 398730000 0 83882000 60609000 77868000 70320000 41821000 64234000 0 83882000 0 0 60609000 0 0 77868000 0 0 70320000 0 0 41821000 0 0 64234000 0 1296 867 1588 1588 11517 12858 93270;93271;93272;93273;93274;93275 86700;86701 93271 86701 QE05102 51298 93271 86701 QE05102 51298 93271 86701 QE05102 51298 +sp|O95714|HERC2_HUMAN 2928 sp|O95714|HERC2_HUMAN sp|O95714|HERC2_HUMAN 1.0 44.9549 6.81e-12 84.285 78.578 44.955 1 S IRAEEEDLAAVPFLASDNEEEEDEKGNSGSL X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X PPPPPPPPPPPPPPPPPPPPPPPPPXXXXXX IRAEEEDLAAVPFLAS(1)DNEEEEDEK 
IRAEEEDLAAVPFLAS(44.95)DNEEEEDEK 16 3 -0.24823 By MS/MS By MS/MS By matching By matching 61597000 61597000 0 0 22562000 18225000 9119700 11689000 0 0 22562000 0 0 18225000 0 0 9119700 0 0 11689000 0 0 0 0 0 0 0 0 1297 867 2928 2928 11904 13281 96043;96044;96045;96046 89048;89049 96044 89049 QE05098 52942 96043 89048 QE05097 52381 96043 89048 QE05097 52381 +sp|O95714|HERC2_HUMAN 1938 sp|O95714|HERC2_HUMAN sp|O95714|HERC2_HUMAN 0.427104 0.0 4.17e-06 44.164 42.292 44.164 S KYDLKLAELPAAAQPSAEDSDTEDDSEAEQT X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXPPPPPPPPPPPPPPPPPPPPPPPPPP LAELPAAAQPS(0.427)AEDS(0.427)DT(0.142)EDDS(0.003)EAEQTER LAELPAAAQPS(0)AEDS(0)DT(-4.78)EDDS(-20.87)EAEQT(-37.92)ER 11 3 -1.2171 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1298 867 1938 1938 12395 13829 99721 92163 QE05099 31358 99721 92163 QE05099 31358 99721 92163 QE05099 31358 +sp|O95714|HERC2_HUMAN 1942 sp|O95714|HERC2_HUMAN sp|O95714|HERC2_HUMAN 0.427104 0.0 4.17e-06 44.164 42.292 44.164 S KLAELPAAAQPSAEDSDTEDDSEAEQTERNI X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX LAELPAAAQPS(0.427)AEDS(0.427)DT(0.142)EDDS(0.003)EAEQTER LAELPAAAQPS(0)AEDS(0)DT(-4.78)EDDS(-20.87)EAEQT(-37.92)ER 15 3 -1.2171 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1299 867 1942 1942 12395 13829 99721 92163 QE05099 31358 99721 92163 QE05099 31358 99721 92163 QE05099 31358 +sp|O95714|HERC2_HUMAN 3462 sp|O95714|HERC2_HUMAN sp|O95714|HERC2_HUMAN 1.0 41.1171 0.0267288 41.117 33.02 41.117 1 S NGEECMLAVDIEDRLSPNPWQEKREIVSSED X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXXXPPPPPPPPPXXXXXXXX LS(1)PNPWQEK LS(41.12)PNPWQEK 2 2 0.64603 By matching By MS/MS By matching By matching 40352000 40352000 0 0 0 11706000 12495000 0 7273000 8877800 0 0 0 11706000 0 0 12495000 0 0 0 0 0 7273000 0 0 8877800 0 0 1300 867 3462 3462 14140 15756 112737;112738;112739;112740 102778 112737 102778 QE05099 
28079 112737 102778 QE05099 28079 112737 102778 QE05099 28079 +sp|Q08945|SSRP1_HUMAN 667 sp|Q08945|SSRP1_HUMAN sp|Q08945|SSRP1_HUMAN 0.824557 6.72928 2.29e-05 88.385 80.253 88.385 1 S SSRQLSESFKSKEFVSSDESSSGENKSKKKR X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXPPPPPPPPPPPPPPXXXXX EFVS(0.825)S(0.175)DESSSGENK EFVS(6.73)S(-6.73)DES(-34.1)S(-47.3)S(-52.91)GENK 4 2 -0.31453 By MS/MS By MS/MS By MS/MS By MS/MS By MS/MS 78553000 78553000 0 0 12562000 16302000 23000000 7857800 0 18830000 12562000 0 0 16302000 0 0 23000000 0 0 7857800 0 0 0 0 0 18830000 0 0 3469 2387 667 667 6499 7276 53820;53821;53822;53823;53824 51145;51146;51147;51148;51149 53820 51145 QE05097 12983 53820 51145 QE05097 12983 53820 51145 QE05097 12983 +sp|Q08945|SSRP1_HUMAN 444 sp|Q08945|SSRP1_HUMAN sp|Q08945|SSRP1_HUMAN 0.999939 44.165 7.94e-20 97.469 93.771 97.469 1 S GLKEGMNPSYDEYADSDEDQHDAYLERMKEE X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXPPPPPPPPPPPPPPPPPPPPPPPPXXXX EGMNPSYDEYADS(1)DEDQHDAYLER EGMNPS(-49.21)Y(-49.82)DEY(-44.17)ADS(44.17)DEDQHDAY(-90.19)LER 13 3 0.19918 By MS/MS By MS/MS 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3470 2387 444 444 6658 7448 55048;55049 52320;52321 55048 52320 QE05099 31926 55048 52320 QE05099 31926 55048 52320 QE05099 31926 +sp|Q08945|SSRP1_HUMAN 659 sp|Q08945|SSRP1_HUMAN sp|Q08945|SSRP1_HUMAN 0.999878 39.1416 0.00235198 117.7 65.216 117.7 1 S SRGSSSKSSSRQLSESFKSKEFVSSDESSSG X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X XXXXXXXXXXXPPPPPPPXXXXXXXXXXXXX QLSES(1)FK QLS(-39.14)ES(39.14)FK 5 2 0.14738 By MS/MS By MS/MS By MS/MS By MS/MS By matching By MS/MS 558700000 558700000 0 0 68201000 87774000 138300000 95357000 19966000 149110000 68201000 0 0 87774000 0 0 138300000 0 0 95357000 0 0 19966000 0 0 149110000 0 0 3471 2387 659 659 16873 19002 134380;134381;134382;134383;134384;134385 120469;120470;120471;120472;120473 134381 120470 
QE05098 17736 134381 120470 QE05098 17736 134381 120470 QE05098 17736 +sp|Q15751|HERC1_HUMAN 3446 sp|Q15751|HERC1_HUMAN sp|Q15751|HERC1_HUMAN 0.999981 47.2167 0.0187791 47.548 7.8172 47.548 2 S VMTCVWCNKKGLLATSGNDGTIRVWNVTKKQ X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXPPPPPPPPPPPPPPXXXXXXXX KGLLAT(1)S(1)GNDGTIR KGLLAT(47.2)S(47.22)GNDGT(-47.2)IR 7 2 -0.95722 By matching By MS/MS By matching 129800000 0 129800000 0 3921800 0 120850000 0 0 5021300 0 3921800 0 0 0 0 0 120850000 0 0 0 0 0 0 0 0 5021300 0 4421 2824 3446 3446 12194 13609 98227;98228;98229 90789 98227 90789 QE05099 12004 98227 90789 QE05099 12004 98227 90789 QE05099 12004 +sp|Q15751|HERC1_HUMAN 1491 sp|Q15751|HERC1_HUMAN sp|Q15751|HERC1_HUMAN 0.9956 24.4686 0.000725254 80.245 41.065 80.245 1 S STSASEGGGLMTRSESLTAESRLVHTSPNYR X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXXPPPPPPPPPXXXXXXXXX S(0.004)ES(0.996)LT(0.001)AESR S(-24.47)ES(24.47)LT(-30.8)AES(-48.77)R 3 2 -0.02332 By matching By MS/MS By MS/MS By MS/MS By matching By MS/MS 88117000 88117000 0 0 11766000 13176000 20540000 16963000 4364700 21308000 11766000 0 0 13176000 0 0 20540000 0 0 16963000 0 0 4364700 0 0 21308000 0 0 4422 2824 1491 1491 18146 20455 144586;144587;144588;144589;144590;144591 129449;129450;129451;129452 144587 129450 QE05099 10286 144587 129450 QE05099 10286 144587 129450 QE05099 10286 +sp|Q15751|HERC1_HUMAN 1510 sp|Q15751|HERC1_HUMAN sp|Q15751|HERC1_HUMAN 0.330689 0.0 7.97e-05 45.193 39.23 45.193 S ESRLVHTSPNYRLIKSRSESDLSQPESDEEG X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXXXXPPPPPPPPPPPPPPPP S(0.331)RS(0.331)ES(0.331)DLS(0.008)QPESDEEGYALSGR S(0)RS(0)ES(0)DLS(-16.27)QPES(-35.13)DEEGY(-44.24)ALS(-45.11)GR 1 3 0.88872 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4423 2824 1510 1510 19884 22510 159108 141525 QE05102 26609 159108 141525 QE05102 26609 159108 141525 QE05102 26609 
+sp|Q15751|HERC1_HUMAN 1512 sp|Q15751|HERC1_HUMAN sp|Q15751|HERC1_HUMAN 0.473289 2.22394 8.37e-06 56.783 53.982 56.783 S RLVHTSPNYRLIKSRSESDLSQPESDEEGYA X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXXPPPPPPPPPPPPPPPPPP S(0.284)RS(0.473)ES(0.219)DLS(0.024)QPESDEEGYALSGR S(-2.22)RS(2.22)ES(-3.34)DLS(-13.02)QPES(-39.32)DEEGY(-52.92)ALS(-56.34)GR 3 3 -0.16378 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4424 2824 1512 1512 19884 22510 159107 141524 QE05101 26243 159107 141524 QE05101 26243 159107 141524 QE05101 26243 +sp|Q15751|HERC1_HUMAN 1514 sp|Q15751|HERC1_HUMAN sp|Q15751|HERC1_HUMAN 0.330689 0.0 7.97e-05 45.193 39.23 45.193 S VHTSPNYRLIKSRSESDLSQPESDEEGYALS X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX S(0.331)RS(0.331)ES(0.331)DLS(0.008)QPESDEEGYALSGR S(0)RS(0)ES(0)DLS(-16.27)QPES(-35.13)DEEGY(-44.24)ALS(-45.11)GR 5 3 0.88872 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4425 2824 1514 1514 19884 22510 159108 141525 QE05102 26609 159108 141525 QE05102 26609 159108 141525 QE05102 26609 +sp|Q6ZN18-2|AEBP2_HUMAN;sp|Q6ZN18|AEBP2_HUMAN 18;18 sp|Q6ZN18-2|AEBP2_HUMAN sp|Q6ZN18-2|AEBP2_HUMAN 0.998316 27.7896 1.21e-62 181.56 176.76 181.56 2 S AAITDMADLEELSRLSPLPPGSPGSAARGRA X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X PPPPPPPPPPPPPPPPPPPPPPPPPPPPXXX AAAITDMADLEELS(0.002)RLS(0.998)PLPPGS(0.809)PGS(0.191)AAR AAAIT(-99.88)DMADLEELS(-27.79)RLS(27.79)PLPPGS(6.28)PGS(-6.28)AAR 17 3 0.97551 By matching By matching By matching By MS/MS By MS/MS By MS/MS 499850000 0 499850000 0 2708200 3550900 192640000 104030000 20713000 176200000 0 2708200 0 0 3550900 0 0 192640000 0 0 104030000 0 0 20713000 0 0 176200000 0 5468 3335 18 18 28 35 264;265;266;267;268;269 236;237;238;239 264 236 QE05100 65231 264 236 QE05100 65231 264 236 QE05100 65231 +sp|Q6ZN18-2|AEBP2_HUMAN;sp|Q6ZN18|AEBP2_HUMAN 24;24 sp|Q6ZN18-2|AEBP2_HUMAN sp|Q6ZN18-2|AEBP2_HUMAN 
0.809237 6.27624 1.21e-62 181.56 176.76 181.56 2 S ADLEELSRLSPLPPGSPGSAARGRAEPPEEE X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X PPPPPPPPPPPPPPPPPPPPPPXXXXXXXXX AAAITDMADLEELS(0.002)RLS(0.998)PLPPGS(0.809)PGS(0.191)AAR AAAIT(-99.88)DMADLEELS(-27.79)RLS(27.79)PLPPGS(6.28)PGS(-6.28)AAR 23 3 0.97551 By matching By matching By matching By MS/MS By MS/MS By MS/MS 499850000 0 499850000 0 2708200 3550900 192640000 104030000 20713000 176200000 0 2708200 0 0 3550900 0 0 192640000 0 0 104030000 0 0 20713000 0 0 176200000 0 5469 3335 24 24 28 35 264;265;266;267;268;269 236;237;238;239 264 236 QE05100 65231 264 236 QE05100 65231 264 236 QE05100 65231 +sp|Q6ZN18-2|AEBP2_HUMAN;sp|Q6ZN18|AEBP2_HUMAN 206;206 sp|Q6ZN18-2|AEBP2_HUMAN sp|Q6ZN18-2|AEBP2_HUMAN 0.999982 48.3708 1.18e-09 128.05 118.25 128.05 1 S TGGGGSSATSGGRRGSLEMSSDGEPLSRMDS X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXXPPPPPPPPPPPPPPPXXX RGS(1)LEMSSDGEPLSR RGS(48.37)LEMS(-48.37)S(-54.13)DGEPLS(-99.69)R 3 2 -0.10602 By MS/MS By MS/MS By MS/MS By matching By MS/MS 73663000 73663000 0 0 19262000 11103000 19454000 0 1816900 22028000 19262000 0 0 11103000 0 0 19454000 0 0 0 0 0 1816900 0 0 22028000 0 0 5470 3335 206 206 17255 19413 137099;137100;137101;137102;137103 122913;122914;122915;122916 137099 122913 QE05097 23240 137099 122913 QE05097 23240 137099 122913 QE05097 23240 + REV__sp|P35908|K22E_HUMAN REV__sp|P35908|K22E_HUMAN 1 71.692 0.00457965 71.692 14.102 71.692 1 S X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXPPPPPPPPPXXXXXXXXXXXX IIKELS(1)DGR IIKELS(71.69)DGR 6 2 2.0005 By matching By MS/MS By matching By matching By matching 431850000 431850000 0 0 NaN 103010000 67359000 64124000 74201000 0 55805000 NaN NaN NaN NaN NaN NaN 103010000 0 0 67359000 0 0 64124000 0 0 74201000 0 0 0 0 0 55805000 0 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN + + 61 57 252 252 11589 12932 
93729;93730;93731;93732;93733;93734 87100 93729 87100 QE05098 47490 93729 87100 QE05098 47490 93729 87100 QE05098 47490 + REV__sp|Q9NSB4|KRT82_HUMAN REV__sp|Q9NSB4|KRT82_HUMAN 1 45.368 0.0161156 45.368 28.697 45.368 1 S X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXPPPPPPPPPPXXXXXXXXX VDGS(1)VCDLRR VDGS(45.37)VCDLRR 4 2 0.77096 By matching By matching By matching By matching By matching By MS/MS 1670400000 1670400000 0 0 NaN 218420000 241200000 328130000 240860000 52984000 294390000 NaN NaN NaN NaN NaN NaN 218420000 0 0 241200000 0 0 328130000 0 0 240860000 0 0 52984000 0 0 294390000 0 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN + + 62 58 330 330 22307 25289 178961;178962;178963;178964;178965;178966;178967 159240 178961 159240 QE05102 16922 178961 159240 QE05102 16922 178961 159240 QE05102 16922 + REV__sp|Q6S5H4-2|POTEB_HUMAN REV__sp|Q6S5H4-2|POTEB_HUMAN 1 51.2862 0.045235 51.286 32.662 51.286 S X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXXPPPPPPPPPXXXXXXXXX EVS(1)EIEELK EVS(51.29)EIEELK 3 2 0.81181 By matching By matching By matching By matching By matching 50767000 50767000 0 0 0.044169 0 8469100 14247000 11062000 1262600 15726000 0 0.056281 0.030122 0.051456 0.037786 0.081346 0 0 0 8469100 0 0 14247000 0 0 11062000 0 0 1262600 0 0 15726000 0 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN + 63 59 22 22 8166 9110 66515;66516;66517;66518;66519 61714;61715 66516 61715 QE05100 38402 66516 61715 QE05100 38402 66516 61715 QE05100 38402 +sp|Q8IUD2-4|RB6I2_HUMAN;sp|Q8IUD2-2|RB6I2_HUMAN;sp|Q8IUD2-3|RB6I2_HUMAN;sp|Q8IUD2|RB6I2_HUMAN;sp|Q8IUD2-5|RB6I2_HUMAN;sp|O15083|ERC2_HUMAN 191;191;191;191;191;187 sp|Q8IUD2-4|RB6I2_HUMAN sp|Q8IUD2-4|RB6I2_HUMAN 0.999998 58.0663 0.00181554 89.827 67.799 89.827 1 S ESKLSSSMNSIKTFWSPELKKERALRKDEAS X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXPPPPPPPPXXXXXXXXXXX 
TFWS(1)PELK T(-58.07)FWS(58.07)PELK 4 2 0.075831 By MS/MS By MS/MS By MS/MS By MS/MS By MS/MS By MS/MS 138400000 138400000 0 0 29764000 20957000 24855000 30752000 8304800 23771000 29764000 0 0 20957000 0 0 24855000 0 0 30752000 0 0 8304800 0 0 23771000 0 0 6037 3584 191 191 21148 23984 169817;169818;169819;169820;169821;169822 151176;151177;151178;151179;151180;151181 169822 151181 QE05102 49176 169822 151181 QE05102 49176 169822 151181 QE05102 49176 +sp|Q9NRX5|SERC1_HUMAN 364 sp|Q9NRX5|SERC1_HUMAN sp|Q9NRX5|SERC1_HUMAN 0.999996 54.0798 2.24e-16 159.22 148.1 159.22 1 S DESTLIEDGGARSDGSLEDGDDVHRAVDNER X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXPPPPPPPPPPPPPXXXXXX SDGS(1)LEDGDDVHR S(-54.08)DGS(54.08)LEDGDDVHR 4 2 0.64808 By MS/MS By MS/MS By matching By MS/MS By MS/MS By MS/MS 222110000 222110000 0 0 31407000 17665000 20892000 23194000 5132400 54893000 31407000 0 0 17665000 0 0 20892000 0 0 23194000 0 0 5132400 0 0 54893000 0 0 8729 5187 364 364 17793 20026 141355;141356;141357;141358;141359;141360;141361;141362;141363;141364;141365 126543;126544;126545;126546;126547;126548;126549 141361 126549 QE05102 10564 141361 126549 QE05102 10564 141361 126549 QE05102 10564 +sp|Q9Y3B9|RRP15_HUMAN 11 sp|Q9Y3B9|RRP15_HUMAN sp|Q9Y3B9|RRP15_HUMAN 0.997432 25.8922 9.39e-31 175.33 139.7 175.33 1 S _____MAAAAPDSRVSEEENLKKTPKKKMKM X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXPPPPPPPPPPPPPPPPXXXXXXXXX AAAAPDS(0.003)RVS(0.997)EEENLK AAAAPDS(-25.89)RVS(25.89)EEENLK 10 2 -0.029697 By matching By matching By MS/MS By MS/MS By MS/MS By MS/MS 266450000 266450000 0 0 38150000 39445000 56305000 55338000 7010600 70203000 38150000 0 0 39445000 0 0 56305000 0 0 55338000 0 0 7010600 0 0 70203000 0 0 9895 5791 11 11 12 17 158;159;160;161;162;163 166;167;168;169 159 167 QE05100 23225 159 167 QE05100 23225 159 167 QE05100 23225 +sp|Q15751|HERC1_HUMAN 3445 sp|Q15751|HERC1_HUMAN sp|Q15751|HERC1_HUMAN 0.999981 47.2024 0.0187791 
47.548 7.8172 47.548 2 T RVMTCVWCNKKGLLATSGNDGTIRVWNVTKK X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXPPPPPPPPPPPPPPXXXXXXX KGLLAT(1)S(1)GNDGTIR KGLLAT(47.2)S(47.22)GNDGT(-47.2)IR 6 2 -0.95722 By matching By MS/MS By matching 129800000 0 129800000 0 3921800 0 120850000 0 0 5021300 0 3921800 0 0 0 0 0 120850000 0 0 0 0 0 0 0 0 5021300 0 10983 2824 3445 3445 12194 13609 98227;98228;98229 90789 98227 90789 QE05099 12004 98227 90789 QE05099 12004 98227 90789 QE05099 12004 +sp|O75379|VAMP4_HUMAN 30 sp|O75379|VAMP4_HUMAN sp|O75379|VAMP4_HUMAN 1 67.6437 1.44E-52 203.56 187.24 67.644 1 S TGSVKSERRNLLEDDSDEEEDFFLRGPSGPR X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXPPPPPPPPPPPPPPPPPPPPPP NLLEDDS(1)DEEEDFFLR NLLEDDS(67.64)DEEEDFFLR 7 3 -0.051914 By MS/MS By MS/MS By MS/MS By MS/MS By MS/MS By MS/MS 7929000000 7929000000 0 0 NaN 1592100000 973800000 1011600000 1450300000 631970000 878760000 NaN NaN NaN NaN NaN NaN 1592100000 0 0 973800000 0 0 1011600000 0 0 1450300000 0 0 631970000 0 0 878760000 0 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 963 669 30 30 15558;15559 17538;17539 124829;124830;124831;124832;124833;124834;124835;124836;124837;124838;124839;124840;124841;124842;124843;124844;124845;124846 112951;112952;112953;112954;112955;112956;112957;112958;112959;112960;112961;112962;112963;112964;112965;112966;112967;112968;112969;112970;112971;112972 124840 112969 QE05102 57877 124833 112957 QE05099 57820 124833 112957 QE05099 57820 +sp|O95183|VAMP5_HUMAN 48 sp|O95183|VAMP5_HUMAN sp|O95183|VAMP5_HUMAN 0.72657 5.36697 5.72E-05 79.514 55.133 79.514 1 S KLAELQQRSDQLLDMSSTFNKTTQNLAQKKC X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXPPPPPPPPPPPPPXXXXXXXXXX SDQLLDMS(0.727)S(0.211)T(0.062)FNK S(-64.13)DQLLDMS(5.37)S(-5.37)T(-10.67)FNK 8 2 -0.18713 By matching By matching By MS/MS By matching By matching By matching 86590000 
86590000 0 0 0.032027 17447000 15753000 20219000 14001000 6284700 12885000 0.028348 0.025719 0.032895 0.033925 0.083789 0.034516 17447000 0 0 15753000 0 0 20219000 0 0 14001000 0 0 6284700 0 0 12885000 0 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1189 809 48 48 17891 20149 142427;142428;142429;142430;142431;142432 127454 142427 127454 QE05099 48504 142427 127454 QE05099 48504 142427 127454 QE05099 48504 +sp|Q15836|VAMP3_HUMAN;sp|P63027|VAMP2_HUMAN 63;80 sp|Q15836|VAMP3_HUMAN sp|Q15836|VAMP3_HUMAN 0.920811 10.6555 1.81E-09 124.1 98.278 107.25 1 S DRADALQAGASQFETSAAKLKRKYWWKNCKM X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXPPPPPPPPPPPPPPPPPXXXXXXXXXXXX ADALQAGASQFET(0.079)S(0.921)AAK ADALQAGAS(-49.99)QFET(-10.66)S(10.66)AAK 14 2 0.23449 By MS/MS By MS/MS By MS/MS By MS/MS By matching By MS/MS 265240000 265240000 0 0 0.036151 44627000 41445000 69094000 42521000 5738000 61819000 0.03226 0.028442 0.039791 0.036967 0.030963 0.043392 44627000 0 0 41445000 0 0 69094000 0 0 42521000 0 0 5738000 0 0 61819000 0 0 0.47624 0.90925 12.188 0.51677 1.0694 7.2217 NaN NaN NaN 0.81588 4.4311 19.209 NaN NaN NaN 0.4388 0.78189 5.9861 4442 2836 63 63 279 319 2297;2298;2299;2300;2301;2302 1992;1993;1994;1995;1996 2300 1995 QE05100 30086 2301 1996 QE05102 30007 2301 1996 QE05102 30007 +sp|Q15836|VAMP3_HUMAN;sp|P63027|VAMP2_HUMAN;sp|P23763-2|VAMP1_HUMAN;sp|P23763-3|VAMP1_HUMAN;sp|P23763|VAMP1_HUMAN 44;61;63;63;63 sp|Q15836|VAMP3_HUMAN sp|Q15836|VAMP3_HUMAN 1 65.4951 2.36E-06 126.19 98.602 65.495 1 S MRVNVDKVLERDQKLSELDDRADALQAGASQ X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXPPPPPPPPPPXXXXXXXXXX DQKLS(1)ELDDR DQKLS(65.5)ELDDR 5 3 -0.72518 By MS/MS By MS/MS By MS/MS By MS/MS By matching By MS/MS 412950000 412950000 0 0 NaN 75542000 44814000 32924000 35016000 11023000 4669900 NaN NaN NaN NaN NaN NaN 75542000 0 0 44814000 0 0 32924000 0 0 35016000 0 0 11023000 0 0 4669900 0 0 NaN NaN NaN NaN 
NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 4443 2836 44 44 4530 5083 37093;37094;37095;37096;37097;37098;37099;37100;37101;37102;37103;37104 34712;34713;34714;34715;34716;34717;34718;34719 37100 34719 QE05102 18436 37093 34712 QE05097 18245 37093 34712 QE05097 18245 +sp|Q15836|VAMP3_HUMAN 11 sp|Q15836|VAMP3_HUMAN sp|Q15836|VAMP3_HUMAN 0.97018 15.1316 0.000117365 79.652 72.041 79.652 1 S _____MSTGPTAATGSNRRLQQTQNQVDEVV X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXPPPPPPPPPPPPPXXXXXXXXXXXX STGPTAAT(0.03)GS(0.97)NRR S(-66.94)T(-63.48)GPT(-42.47)AAT(-15.13)GS(15.13)NRR 10 2 -0.15791 By matching By matching By MS/MS By matching By matching By MS/MS 34280000 34280000 0 0 NaN 3057100 4718800 12052000 5047700 1070900 8333500 NaN NaN NaN NaN NaN NaN 3057100 0 0 4718800 0 0 12052000 0 0 5047700 0 0 1070900 0 0 8333500 0 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 4444 2836 11 11 20280 22978 162490;162491;162492;162493;162494;162495 144222;144223 162490 144222 QE05099 7582 162490 144222 QE05099 7582 162490 144222 QE05099 7582 +sp|Q9BV40|VAMP8_HUMAN 55 sp|Q9BV40|VAMP8_HUMAN sp|Q9BV40|VAMP8_HUMAN 0.959784 13.7778 3.78E-05 91.969 27.98 91.969 1 S NLEHLRNKTEDLEATSEHFKTTSQKVARKFW X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXPPPPPPPPPPPPXXXXXXXXXXX TEDLEAT(0.04)S(0.96)EHFK T(-83.18)EDLEAT(-13.78)S(13.78)EHFK 8 2 0.40785 By matching By matching By matching By MS/MS 114520000 114520000 0 0 NaN 20400000 9738500 7862300 0 0 76518000 NaN NaN NaN NaN NaN NaN 20400000 0 0 9738500 0 0 7862300 0 0 0 0 0 0 0 0 76518000 0 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 7902 4687 55 55 21013 23827 168874;168875;168876;168877 150433 168874 150433 QE05102 19524 168874 150433 QE05102 19524 168874 150433 QE05102 19524
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_kinase_substrate.tabular Mon Mar 07 19:05:01 2022 +0000 @@ -0,0 +1,2 @@ +GENE KINASE KIN_ACC_ID KIN_ORGANISM SUBSTRATE SUB_GENE_ID SUB_ACC_ID SUB_GENE SUB_ORGANISM SUB_MOD_RSD SITE_GRP_ID SITE_+/-7_AA DOMAIN IN_VIVO_RXN IN_VITRO_RXN CST_CAT# +Csnk2a1 CK2A1 Q60737 human VAMP4 53330 O70480 Vamp4 human S30 454285 RNLLEDDsDEEEDFF X
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_networkin.tabular Mon Mar 07 19:05:01 2022 +0000 @@ -0,0 +1,33 @@ +#substrate position id networkin_score tree netphorest_group netphorest_score string_identifier string_score substrate_name sequence string_path +VAMP4 (ENSP00000236192) 30 CK2alpha 35.6396 KIN CK2_group 0.5228 ENSP00000236192 0.85 VAMP4 LLEDDsDEEED "ENSP00000217244, 0.68 ENSP00000236192" +SSRP1 (ENSP00000278412) 444 CK2alpha 28.6345 KIN CK2_group 0.3768 ENSP00000278412 0.874 SSRP1 DEYADsDEDQH "ENSP00000217244, 0.6992 ENSP00000278412" +SSRP1 (ENSP00000278412) 667 CK2alpha 22.2088 KIN CK2_group 0.3168 ENSP00000278412 0.874 SSRP1 SKEFVsSDESS "ENSP00000217244, 0.6992 ENSP00000278412" +HERC2 (ENSP00000261609) 1577 CK2alpha 10.7686 KIN CK2_group 0.5253 ENSP00000261609 0.4514 HERC2 IGNEEsDLEEA "ENSP00000217244, 0.764 ENSP00000346659, 0.76 ENSP00000261609" +HERC2 (ENSP00000261609) 2928 CK2alpha 10.7686 KIN CK2_group 0.4698 ENSP00000261609 0.4514 HERC2 VPFLAsDNEEE "ENSP00000217244, 0.764 ENSP00000346659, 0.76 ENSP00000261609" +RRP15 (ENSP00000355899) 11 CK2alpha 8.5484 KIN CK2_group 0.3566 ENSP00000355899 0.461 RRP15 PDSRVsEEENL "ENSP00000217244, 0.3688 ENSP00000355899" +SSRP1 (ENSP00000278412) 444 CK2a2 7.8435 KIN CK2_group 0.3768 ENSP00000278412 0.615 SSRP1 DEYADsDEDQH "ENSP00000262506, 0.492 ENSP00000278412" +SSRP1 (ENSP00000278412) 667 CK2a2 7.7757 KIN CK2_group 0.3168 ENSP00000278412 0.615 SSRP1 SKEFVsSDESS "ENSP00000262506, 0.492 ENSP00000278412" +VAMP2 (ENSP00000314214) 80 PKD3 6.9217 KIN PKD_group 0.0744 ENSP00000314214 0.949 VAMP2 SQFETsAAKLK "ENSP00000234179, 0.7592 ENSP00000314214" +VAMP2 (ENSP00000314214) 61 CK2alpha 6.3122 KIN CK2_group 0.3338 ENSP00000314214 0.4391 VAMP2 RDQKLsELDDR "ENSP00000217244, 0.7992 ENSP00000222812, 0.7544 ENSP00000314214" +VAMP1 (ENSP00000380148) 63 CK2alpha 6.1363 KIN CK2_group 0.3338 ENSP00000380148 0.4364 VAMP1 RDQKLsELDDR "ENSP00000217244, 0.7944 ENSP00000222812, 0.7544 ENSP00000380148" +ERC1 
(ENSP00000354158) 191 IKKalpha 5.3194 KIN IKKalpha_IKKbeta_group 0.031 ENSP00000354158 0.96 ERC1 IKTFWsPELKK "ENSP00000359424, 0.768 ENSP00000354158" +ERC1 (ENSP00000354158) 191 IKKalpha 5.3194 KIN IKKalpha_IKKbeta_group 0.031 ENSP00000354158 0.96 ERC1 IKTFWsPELKK "ENSP00000359424, 0.768 ENSP00000354158" +VAMP2 (ENSP00000314214) 61 PKAbeta 4.9293 KIN PKA_group 0.1153 ENSP00000314214 0.8 VAMP2 RDQKLsELDDR "ENSP00000359719, 0.64 ENSP00000314214" +VAMP2 (ENSP00000314214) 61 PKAgamma 4.9293 KIN PKA_group 0.1153 ENSP00000314214 0.8 VAMP2 RDQKLsELDDR "ENSP00000366488, 0.64 ENSP00000314214" +VAMP3 (ENSP00000054666) 44 CK2alpha 4.2842 KIN CK2_group 0.3338 ENSP00000054666 0.4201 VAMP3 RDQKLsELDDR "ENSP00000217244, 0.7992 ENSP00000317714, 0.6792 ENSP00000054666" +VAMP2 (ENSP00000314214) 80 PKCiota 3.8971 KIN PKC_group 0.0928 ENSP00000314214 0.899 VAMP2 SQFETsAAKLK "ENSP00000295797, 0.7192 ENSP00000314214" +SSRP1 (ENSP00000278412) 444 CDK7 3.6159 KIN CDK7 0.0186 ENSP00000278412 0.903 SSRP1 DEYADsDEDQH "ENSP00000256443, 0.7224 ENSP00000278412" +SSRP1 (ENSP00000278412) 444 CK1alpha 3.3573 KIN CK1_group 0.1264 ENSP00000278412 0.404 SSRP1 DEYADsDEDQH "ENSP00000261798, 0.3232 ENSP00000278412" +VAMP3 (ENSP00000054666) 11 PKCalpha 3.0633 KIN PKC_group 0.4633 ENSP00000054666 0.3277 VAMP3 TAATGsNRRLQ "ENSP00000284384, 0.6232 ENSP00000359025, 0.6352 ENSP00000054666" +SSRP1 (ENSP00000278412) 659 PKCalpha 3.0524 KIN PKC_group 0.4345 ENSP00000278412 0.237 SSRP1 RQLSEsFKSKE "ENSP00000284384, 0.4552 ENSP00000351885, 0.76 ENSP00000278412" +VAMP2 (ENSP00000314214) 61 PKCiota 2.7785 KIN PKC_group 0.0463 ENSP00000314214 0.899 VAMP2 RDQKLsELDDR "ENSP00000295797, 0.7192 ENSP00000314214" +SSRP1 (ENSP00000278412) 659 CDK7 2.5961 KIN CDK7 0.0104 ENSP00000278412 0.903 SSRP1 RQLSEsFKSKE "ENSP00000256443, 0.7224 ENSP00000278412" +SSRP1 (ENSP00000278412) 667 CDK7 2.5961 KIN CDK7 0.0124 ENSP00000278412 0.903 SSRP1 SKEFVsSDESS "ENSP00000256443, 0.7224 ENSP00000278412" +ERC1 (ENSP00000354158) 191 IKKbeta 
2.571 KIN IKKalpha_IKKbeta_group 0.031 ENSP00000354158 0.946 ERC1 IKTFWsPELKK "ENSP00000339151, 0.7568 ENSP00000354158" +ERC1 (ENSP00000354158) 191 IKKbeta 2.571 KIN IKKalpha_IKKbeta_group 0.031 ENSP00000354158 0.946 ERC1 IKTFWsPELKK "ENSP00000339151, 0.7568 ENSP00000354158" +SSRP1 (ENSP00000278412) 659 PKCbeta 2.4948 KIN PKC_group 0.4345 ENSP00000278412 0.1743 SSRP1 RQLSEsFKSKE "ENSP00000305355, 0.7976 ENSP00000366013, 0.7192 ENSP00000284811, 0.7448 ENSP00000278412" +VAMP3 (ENSP00000054666) 11 PKCbeta 2.4948 KIN PKC_group 0.4633 ENSP00000054666 0.2393 VAMP3 TAATGsNRRLQ "ENSP00000305355, 0.512 ENSP00000348986, 0.7616 ENSP00000054666" +SSRP1 (ENSP00000278412) 659 CK2a2 2.4345 KIN CK2_group 0.0356 ENSP00000278412 0.615 SSRP1 RQLSEsFKSKE "ENSP00000262506, 0.492 ENSP00000278412" +ERC1 (ENSP00000354158) 191 HIPK2 2.2748 KIN HIPK1_HIPK2_group 0.0463 ENSP00000354158 0.4159 ERC1 IKTFWsPELKK "ENSP00000263551, 0.7696 ENSP00000286332, 0.7192 ENSP00000354158" +VAMP3 (ENSP00000054666) 11 PKCzeta 2.0773 KIN PKC_group 0.4633 ENSP00000054666 0.4263 VAMP3 TAATGsNRRLQ "ENSP00000367830, 0.7688 ENSP00000320935, 0.796 ENSP00000054666" +SSRP1 (ENSP00000278412) 659 DNAPK 2.0042 KIN DNAPK 0.0584 ENSP00000278412 0.56 SSRP1 RQLSEsFKSKE "ENSP00000313420, 0.448 ENSP00000278412"
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_regulatory_sites.tabular Mon Mar 07 19:05:01 2022 +0000 @@ -0,0 +1,8 @@ +32017 +"PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words ""PhosphoSitePlus(R), www.phosphosite.org"" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: ""Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926.""" + +GENE PROTEIN PROT_TYPE ACC_ID GENE_ID HU_CHR_LOC ORGANISM MOD_RSD SITE_GRP_ID SITE_+/-7_AA DOMAIN ON_FUNCTION ON_PROCESS ON_PROT_INTERACT ON_OTHER_INTERACT PMIDs LT_LIT MS_LIT MS_CST NOTES +ENSA ENSA "Inhibitor; Protein phosphatase, regulatory subunit" O43768 2029 1q21.3 human S109-p 477819 DLPQRKSsLVTSKLA Endosulfine "molecular association, regulation; protein conformation" SNCA(DISRUPTS) 18973346 1 34 50 +VAMP8 VAMP8 "Membrane protein, integral; Vesicle" Q9BV40 8673 2p11.2 human S55-p 12738929 TEDLEATsEHFKTTS Synaptobrevin "activity, inhibited" 27402227 1 8 0 "abolish function in SNARE complex during mast cell secretion, reduces in vitro ensemble vesicle fusion" +ENSA ENSA "Inhibitor; Protein phosphatase, regulatory subunit" O43768 2029 1q21.3 human S67-p 455934 KGQKYFDsGDYNMAK Endosulfine "molecular association, regulation" cell cycle regulation PPP2CA(INDUCES) 27889260 3 56 47 +Vamp4 VAMP4 "Membrane protein, integral; Vesicle" O70480 53330 1 H2.1|1 70.29 cM mouse S30-p 454285 RNLLEDDsDEEEDFF "molecular association, regulation; intracellular localization" PACS-1(INDUCES) 14608369 1 64 10
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_swissprot.fasta Mon Mar 07 19:05:01 2022 +0000 @@ -0,0 +1,68 @@ +>sp|Q9Y3B9|RRP15_HUMAN RRP15-like protein OS=Homo sapiens OX=9606 GN=RRP15 PE=1 SV=2 +MAAAAPDSRVSEEENLKKTPKKKMKMVTGAVASVLEDEATDTSDSEGSCGSEKDHFYSDDDAIEADSEGDAEPCDKENENDGESSVGTNMGWADAMAKVLNKKTPESKPTILVKNKKLEKEKEKLKQERLEKIKQRDKRLEWEMMCRVKPDVVQDKETERNLQRIATRGVVQLFNAVQKHQKNVDEKVKEAGSSMRKRAKLISTVSKKDFISVLRGMDGSTNETASSRKKPKAKQTEVKSEEGPGWTILRDDFMMGASMKDWDKESDGPDDSRPESASDSDT +>sp|Q08945|SSRP1_HUMAN FACT complex subunit SSRP1 OS=Homo sapiens OX=9606 GN=SSRP1 PE=1 SV=1 +MAETLEFNDVYQEVKGSMNDGRLRLSRQGIIFKNSKTGKVDNIQAGELTEGIWRRVALGHGLKLLTKNGHVYKYDGFRESEFEKLSDFFKTHYRLELMEKDLCVKGWNWGTVKFGGQLLSFDIGDQPVFEIPLSNVSQCTTGKNEVTLEFHQNDDAEVSLMEVRFYVPPTQEDGVDPVEAFAQNVLSKADVIQATGDAICIFRELQCLTPRGRYDIRIYPTFLHLHGKTFDYKIPYTTVLRLFLLPHKDQRQMFFVISLDPPIKQGQTRYHFLILLFSKDEDISLTLNMNEEEVEKRFEGRLTKNMSGSLYEMVSRVMKALVNRKITVPGNFQGHSGAQCITCSYKASSGLLYPLERGFIYVHKPPVHIRFDEISFVNFARGTTTTRSFDFEIETKQGTQYTFSSIEREEYGKLFDFVNAKKLNIKNRGLKEGMNPSYDEYADSDEDQHDAYLERMKEEGKIREENANDSSDDSGEETDESFNPGEEEEDVAEEFDSNASASSSSNEGDSDRDEKKRKQLKKAKMAKDRKSRKKPVEVKKGKDPNAPKRPMSAYMLWLNASREKIKSDHPGISITDLSKKAGEIWKGMSKEKKEEWDRKAEDARRDYEKAMKEYEGGRGESSKRDKSKKKKKVKVKMEKKSTPSRGSSSKSSSRQLSESFKSKEFVSSDESSSGENKSKKKRRRSEDSEEEELASTPPSSEDSASGSDE +>sp|Q96SA4|SERC2_HUMAN Serine incorporator 2 OS=Homo sapiens OX=9606 GN=SERINC2 PE=2 SV=3 +MGACLGACSLLSCASCLCGSAPCILCSCCPASRNSTVSRLIFTFFLFLGVLVSIIMLSPGVESQLYKLPWVCEEGAGIPTVLQGHIDCGSLLGYRAVYRMCFATAAFFFFFTLLMLCVSSSRDPRAAIQNGFWFFKFLILVGLTVGAFYIPDGSFTNIWFYFGVVGSFLFILIQLVLLIDFAHSWNQRWLGKAEECDSRAWYAGLFFFTLLFYLLSIAAVALMFMYYTEPSGCHEGKVFISLNLTFCVCVSIAAVLPKVQDAQPNSGLLQASVITLYTMFVTWSALSSIPEQKCNPHLPTQLGNETVVAGPEGYETQWWDAPSIVGLIIFLLCTLFISLRSSDHRQVNSLMQTEECPPMLDATQQQQQVAACEGRAFDNEQDGVTYSYSFFHFCLVLASLHVMMTLTNWYKPGETRKMISTWTAVWVKICASWAGLLLYLWTLVAPLLLRNRDFS +>sp|Q96SA4-2|SERC2_HUMAN Isoform 2 of Serine incorporator 2 OS=Homo sapiens OX=9606 GN=SERINC2 
+MGAEGAPDFLSCPRVRRASCLCGSAPCILCSCCPASRNSTVSRLIFTFFLFLGVLVSIIMLSPGVESQLYKLPWVCEEGAGIPTVLQGHIDCGSLLGYRAVYRMCFATAAFFFFFTLLMLCVSSSRDPRAAIQNGFWFFKFLILVGLTVGAFYIPDGSFTNIWFYFGVVGSFLFILIQLVLLIDFAHSWNQRWLGKAEECDSRAWYAGLFFFTLLFYLLSIAAVALMFMYYTEPSGCHEGKVFISLNLTFCVCVSIAAVLPKVQDAQPNSGLLQASVITLYTMFVTWSALSSIPEQKCNPHLPTQLGNETVVAGPEGYETQWWDAPSIVGLIIFLLCTLFISLRSSDHRQVNSLMQTEECPPMLDATQQQQQVAACEGRAFDNEQDGVTYSYSFFHFCLVLASLHVMMTLTNWYKPGETRKMISTWTAVWVKICASWAGLLLYLWTLVAPLLLRNRDFS +>sp|Q96SA4-3|SERC2_HUMAN Isoform 3 of Serine incorporator 2 OS=Homo sapiens OX=9606 GN=SERINC2 +MRSMRLREEESPGPSHTASCLCGSAPCILCSCCPASRNSTVSRLIFTFFLFLGVLVSIIMLSPGVESQLYKLPWVCEEGAGIPTVLQGHIDCGSLLGYRAVYRMCFATAAFFFFFTLLMLCVSSSRDPRAAIQNGFWFFKFLILVGLTVGAFYIPDGSFTNIWFYFGVVGSFLFILIQLVLLIDFAHSWNQRWLGKAEECDSRAWYAGLFFFTLLFYLLSIAAVALMFMYYTEPSGCHEGKVFISLNLTFCVCVSIAAVLPKVQDAQPNSGLLQASVITLYTMFVTWSALSSIPEQKCNPHLPTQLGNETVVAGPEGYETQWWDAPSIVGLIIFLLCTLFISLRSSDHRQVNSLMQTEECPPMLDATQQQQQVAACEGRAFDNEQDGVTYSYSFFHFCLVLASLHVMMTLTNWYKPGETRKMISTWTAVWVKICASWAGLLLYLWTLVAPLLLRNRDFS +>sp|Q96SA4-4|SERC2_HUMAN Isoform 4 of Serine incorporator 2 OS=Homo sapiens OX=9606 GN=SERINC2 +MDGRMMRSMRLREEESPGPSHTASCLCGSAPCILCSCCPASRNSTVSRLIFTFFLFLGVLVSIIMLSPGVESQLYKLPWVCEEGAGIPTVLQGHIDCGSLLGYRAVYRMCFATAAFFFFFTLLMLCVSSSRDPRAAIQNGFWFFKFLILVGLTVGAFYIPDGSFTNIWFYFGVVGSFLFILIQLVLLIDFAHSWNQRWLGKAEECDSRAWYAGLFFFTLLFYLLSIAAVALMFMYYTEPSGCHEGKVFISLNLTFCVCVSIAAVLPKVQDAQPNSGLLQASVITLYTMFVTWSALSSIPEQKCNPHLPTQLGNETVVAGPEGYETQWWDAPSIVGLIIFLLCTLFISLRSSDHRQVNSLMQTEECPPMLDATQQQQQVAACEGRAFDNEQDGVTYSYSFFHFCLVLASLHVMMTLTNWYKPGETRKMISTWTAVWVKICASWAGLLLYLWTLVAPLLLRNRDFS +>sp|Q9NRX5|SERC1_HUMAN Serine incorporator 1 OS=Homo sapiens OX=9606 GN=SERINC1 PE=1 SV=1 
+MGSVLGLCSMASWIPCLCGSAPCLLCRCCPSGNNSTVTRLIYALFLLVGVCVACVMLIPGMEEQLNKIPGFCENEKGVVPCNILVGYKAVYRLCFGLAMFYLLLSLLMIKVKSSSDPRAAVHNGFWFFKFAAAIAIIIGAFFIPEGTFTTVWFYVGMAGAFCFILIQLVLLIDFAHSWNESWVEKMEEGNSRCWYAALLSATALNYLLSLVAIVLFFVYYTHPASCSENKAFISVNMLLCVGASVMSILPKIQESQPRSGLLQSSVITVYTMYLTWSAMTNEPETNCNPSLLSIIGYNTTSTVPKEGQSVQWWHAQGIIGLILFLLCVFYSSIRTSNNSQVNKLTLTSDESTLIEDGGARSDGSLEDGDDVHRAVDNERDGVTYSYSFFHFMLFLASLYIMMTLTNWYRYEPSREMKSQWTAVWVKISSSWIGIVLYVWTLVAPLVLTNRDFD +>sp|O43768|ENSA_HUMAN Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA PE=1 SV=1 +MSQKQEEENPAEETGEEKQDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGQKYFDSGDYNMAKAKMKNKQLPSAGPDKNLVTGDHIPTPQDLPQRKSSLVTSKLAGGQVE +>sp|O43768-2|ENSA_HUMAN Isoform 2 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA +MSQKQEEENPAEETGEEKQDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGQKYFDSGDYNMAKAKMKNKQLPSAGPDKNLVTGDHIPTPQDLPQRKSSLVTSKLAG +>sp|O43768-3|ENSA_HUMAN Isoform 3 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA +MSQKQEEENPAEETGEEKQDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGDYKSLHWSVLLCADEMQKYFDSGDYNMAKAKMKNKQLPSAGPDKNLVTGDHIPTPQDLPQRKSSLVTSKLAGGQVE +>sp|O43768-4|ENSA_HUMAN Isoform 4 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA +MSQKQEEENPAEETGEEKQDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGVWGIASYPLSLGLKEVLRMKSVEQKYFDSGDYNMAKAKMKNKQLPSAGPDKNLVTGDHIPTPQDLPQRKSSLVTSKLAG +>sp|O43768-5|ENSA_HUMAN Isoform 5 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA +MAGGLGCDVCYWFVEDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGQKYFDSGDYNMAKAKMKNKQLPSAGPDKNLVTGDHIPTPQDLPQRKSSLVTSKLAGGQVE +>sp|O43768-6|ENSA_HUMAN Isoform 6 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA +MAGGLGCDVCYWFVEDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGQKYFDSGDYNMAKAKMKNKQLPSAGPDKNLVTGDHIPTPQDLPQRKSSLVTSKLAG +>sp|O43768-7|ENSA_HUMAN Isoform 7 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA +MAGGLGCDVCYWFVEDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGDYKSLHWSVLLCADEMQKYFDSGDYNMAKAKMKNKQLPSAGPDKNLVTGDHIPTPQDLPQRKSSLVTSKLAGGQVE +>sp|O43768-8|ENSA_HUMAN Isoform 8 of 
Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA +MSQKQEEENPAEETGEEKQDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGVWGIVSYPLSLELKEVLRMKSVEVLLDPFLEVLLLNRSRGEFEI +>sp|O43768-9|ENSA_HUMAN Isoform 9 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA +MSQKQEEENPAEETGEEKQDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGDYKSLHWSVLLCADEMQKYFDSGDYNMAKAKMKNKQLPSAGPDKNLVTGDHIPTPQDLPQRKSSLVTSKLAG +>sp|Q15751|HERC1_HUMAN Probable E3 ubiquitin-protein ligase HERC1 OS=Homo sapiens OX=9606 GN=HERC1 PE=1 SV=2 +MATMIPPVKLKWLEHLNSSWITEDSESIATREGVAVLYSKLVSNKEVVPLPQQVLCLKGPQLPDFERESLSSDEQDHYLDALLSSQLALAKMVCSDSPFAGALRKRLLVLQRVFYALSNKYHDKGKVKQQQHSPESSSGSADVHSVSERPRSSTDALIEMGVRTGLSLLFALLRQSWMMPVSGPGLSLCNDVIHTAIEVVSSLPPLSLANESKIPPMGLDCLSQVTTFLKGVTIPNSGADTLGRRLASELLLGLAAQRGSLRYLLEWIEMALGASAVVHTMEKGKLLSSQEGMISFDCFMTILMQMRRSLGSSADRSQWREPTRTSDGLCSLYEAALCLFEEVCRMASDYSRTCASPDSIQTGDAPIVSETCEVYVWGSNSSHQLVEGTQEKILQPKLAPSFSDAQTIEAGQYCTFVISTDGSVRACGKGSYGRLGLGDSNNQSTLKKLTFEPHRSIKKVSSSKGSDGHTLAFTTEGEVFSWGDGDYGKLGHGNSSTQKYPKLIQGPLQGKVVVCVSAGYRHSAAVTEDGELYTWGEGDFGRLGHGDSNSRNIPTLVKDISNVGEVSCGSSHTIALSKDGRTVWSFGGGDNGKLGHGDTNRVYKPKVIEALQGMFIRKVCAGSQSSLALTSTGQVYAWGCGACLGCGSSEATALRPKLIEELAATRIVDVSIGDSHCLALSHDNEVYAWGNNSMGQCGQGNSTGPITKPKKVSGLDGIAIQQISAGTSHSLAWTALPRDRQVVAWHRPYCVDLEESTFSHLRSFLERYCDKINSEIPPLPFPSSREHHSFLKLCLKLLSNHLALALAGGVATSILGRQAGPLRNLLFRLMDSTVPDEIQEVVIETLSVGATMLLPPLRERMELLHSLLPQGPDRWESLSKGQRMQLDIILTSLQDHTHVASLLGYSSPSDAADLSSVCTGYGNLSDQPYGTQSCHPDTHLAEILMKTLLRNLGFYTDQAFGELEKNSDKFLLGTSSSENSQPAHLHELLCSLQKQLLAFCHINNISENSSSVALLHKHLQLLLPHATDIYSRSANLLKESPWNGSVGEKLRDVIYVSAAGSMLCQIVNSLLLLPVSVARPLLSYLLDLLPPLDCLNRLLPAADLLEDQELQWPLHGGPELIDPAGLPLPQPAQSWVWLVDLERTIALLIGRCLGGMLQGSPVSPEEQDTAYWMKTPLFSDGVEMDTPQLDKCMSCLLEVALSGNEEQKPFDYKLRPEIAVYVDLALGCSKEPARSLWISMQDYAVSKDWDSATLSNESLLDTVSRFVLAALLKHTNLLSQACGESRYQPGKHLSEVYRCVYKVRSRLLACKNLELIQTRSSSRDRWISENQDSADVDPQEHSFTRTIDEEAEMEEQAERDREEGHPEPEDEEEEREHEVMTAGKIFQCFLSAREVARSRDRDRMNSGAGSGARADDPPPQSQQERRVSTDLPEGQDVYTAACNSVIHRCALLILGVSPVIDELQKRREEGQLQQPSTSASEGGGLMTRSESLTAESRLVHTSPNYRLIK
SRSESDLSQPESDEEGYALSGRRNVDLDLAASHRKRGPMHSQLESLSDSWARLKHSRDWLCNSSYSFESDFDLTKSLGVHTLIENVVSFVSGDVGNAPGFKEPEESMSTSPQASIIAMEQQQLRAELRLEALHQILVLLSGMEEKGSISLAGSRLSSGFQSSTLLTSVRLQFLAGCFGLGTVGHTGGKGESGRLHHYQDGIRAAKRNIQIEIQVAVHKIYQQLSATLERALQANKHHIEAQQRLLLVTVFALSVHYQPVDVSLAISTGLLNVLSQLCGTDTMLGQPLQLLPKTGVSQLSTALKVASTRLLQILAITTGTYADKLSPKVVQSLLDLLCSQLKNLLSQTGVLHMASFGEGEQEDGEEEEKKVDSSGETEKKDFRAALRKQHAAELHLGDFLVFLRRVVSSKAIQSKMASPKWTEVLLNIASQKCSSGIPLVGNLRTRLLALHVLEAVLPACESGVEDDQMAQIVERLFSLLSDCMWETPIAQAKHAIQIKEKEQEIKLQKQGELEEEDENLPIQEVSFDPEKAQCCLVENGQILTHGSGGKGYGLASTGVTSGCYQWKFYIVKENRGNEGTCVGVSRWPVHDFNHRTTSDMWLYRAYSGNLYHNGEQTLTLSSFTQGDFITCVLDMEARTISFGKNGEEPKLAFEDVDAAELYPCVMFYSSNPGEKVKICDMQMRGTPRDLLPGDPICSPVAAVLAEATIQLIRILHRTDRWTYCINKKMMERLHKIKICIKESGQKLKKSRSVQSREENEMREEKESKEEEKGKHTRHGLADLSELQLRTLCIEVWPVLAVIGGVDAGLRVGGRCVHKQTGRHATLLGVVKEGSTSAKVQWDEAEITISFPTFWSPSDTPLYNLEPCEPLPFDVARFRGLTASVLLDLTYLTGVHEDMGKQSTKRHEKKHRHESEEKGDVEQKPESESALDMRTGLTSDDVKSQSTTSSKSENEIASFSLDPTLPSVESQHQITEGKRKNHEHMSKNHDVAQSEIRAVQLSYLYLGAMKSLSALLGCSKYAELLLIPKVLAENGHNSDCASSPVVHEDVEMRAALQFLMRHMVKRAVMRSPIKRALGLADLERAQAMIYKLVVHGLLEDQFGGKIKQEIDQQAEESDPAQQAQTPVTTSPSASSTTSFMSSSLEDTTTATTPVTDTETVPASESPGVMPLSLLRQMFSSYPTTTVLPTRRAQTPPISSLPTSPSDEVGRRQSLTSPDSQSARPANRTALSDPSSRLSTSPPPPAIAVPLLEMGFSLRQIAKAMEATGARGEADAQNITVLAMWMIEHPGHEDEEEPQSGSTADSRPGAAVLGSGGKSNDPCYLQSPGDIPSADAAEMEEGFSESPDNLDHTENAASGSGPSARGRSAVTRRHKFDLAARTLLARAAGLYRSVQAHRNQSRREGISLQQDPGALYDFNLDEELEIDLDDEAMEAMFGQDLTSDNDILGMWIPEVLDWPTWHVCESEDREEVVVCELCECSVVSFNQHMKRNHPGCGRSANRQGYRSNGSYVDGWFGGECGSGNPYYLLCGTCREKYLAMKTKSKSTSSERYKGQAPDLIGKQDSVYEEDWDMLDVDEDEKLTGEEEFELLAGPLGLNDRRIVPEPVQFPDSDPLGASVAMVTATNSMEETLMQIGCHGSVEKSSSGRITLGEQAAALANPHDRVVALRRVTAAAQVLLARTMVMRALSLLSVSGSSCSLAAGLESLGLTDIRTLVRLMCLAAAGRAGLSTSPSAMASTSERSRGGHSKANKPISCLAYLSTAVGCLASNAPSAAKLLVQLCTQNLISAATGVNLTTVDDSIQRKFLPSFLRGIAEENKLVTSPNFVVTQALVALLADKGAKLRPNYDKSEVEKKGPLELANALAACCLSSRLSSQHRQWAAQQLVRTLAAHDRDNQTTLQTLADMGGDLRKCSFIKLEAHQNRVMTCVWCNKKGLLATSGNDGTIRVWNVTKKQYSLQQTCVFNRLEGDAEESLGSPSDPSFSPVSWSISGKYLAGALEKMV
NIWQVNGGKGLVDIQPHWVSALAWPEEGPATAWSGESPELLLVGRMDGSLGLIEVVDVSTMHRRELEHCYRKDVSVTCIAWFSEDRPFAVGYFDGKLLLGTKEPLEKGGIVLIDAHKDTLISMKWDPTGHILMTCAKEDSVKLWGSISGCWCCLHSLCHPSIVNGIAWCRLPGKGSKLQLLMATGCQSGLVCVWRIPQDTTQTNVTSAEGWWEQESNCQDGYRKSSGAKCVYQLRGHITPVRTVAFSSDGLALVSGGLGGLMNIWSLRDGSVLQTVVIGSGAIQTTVWIPEVGVAACSNRSKDVLVVNCTAEWAAANHVLATCRTALKQQGVLGLNMAPCMRAFLERLPMMLQEQYAYEKPHVVCGDQLVHSPYMQCLASLAVGLHLDQLLCNPPVPPHHQNCLPDPASWNPNEWAWLECFSTTIKAAEALTNGAQFPESFTVPDLEPVPEDELVFLMDNSKWINGMDEQIMSWATSRPEDWHLGGKCDVYLWGAGRHGQLAEAGRNVMVPAAAPSFSQAQQVICGQNCTFVIQANGTVLACGEGSYGRLGQGNSDDLHVLTVISALQGFVVTQLVTSCGSDGHSMALTESGEVFSWGDGDYGKLGHGNSDRQRRPRQIEALQGEEVVQMSCGFKHSAVVTSDGKLFTFGNGDYGRLGLGNTSNKKLPERVTALEGYQIGQVACGLNHTLAVSADGSMVWAFGDGDYGKLGLGNSTAKSSPQKIDVLCGIGIKKVACGTQFSVALTKDGHVYTFGQDRLIGLPEGRARNHNRPQQIPVLAGVIIEDVAVGAEHTLALASNGDVYAWGSNSEGQLGLGHTNHVREPTLVTGLQGKNVRQISAGRCHSAAWTAPPVPPRAPGVSVPLQLGLPDTVPPQYGALREVSIHTVRARLRLLYHFSDLMYSSWRLLNLSPNNQNSTSHYNAGTWGIVQGQLRPLLAPRVYTLPMVRSIGKTMVQGKNYGPQITVKRISTRGRKCKPIFVQIARQVVKLNASDLRLPSRAWKVKLVGEGADDAGGVFDDTITEMCQELETGIVDLLIPSPNATAEVGYNRDRFLFNPSACLDEHLMQFKFLGILMGVAIRTKKPLDLHLAPLVWKQLCCVPLTLEDLEEVDLLYVQTLNSILHIEDSGITEESFHEMIPLDSFVGQSADGKMVPIIPGGNSIPLTFSNRKEYVERAIEYRLHEMDRQVAAVREGMSWIVPVPLLSLLTAKQLEQMVCGMPEISVEVLKKVVRYREVDEQHQLVQWFWHTLEEFSNEERVLFMRFVSGRSRLPANTADISQRFQIMKVDRPYDSLPTSQTCFFQLRLPPYSSQLVMAERLRYAINNCRSIDMDNYMLSRNVDNAEGSDTDY +>sp|O95714|HERC2_HUMAN E3 ubiquitin-protein ligase HERC2 OS=Homo sapiens OX=9606 GN=HERC2 PE=1 SV=2 
+MPSESFCLAAQARLDSKWLKTDIQLAFTRDGLCGLWNEMVKDGEIVYTGTESTQNGELPPRKDDSVEPSGTKKEDLNDKEKKDEEETPAPIYRAKSILDSWVWGKQPDVNELKECLSVLVKEQQALAVQSATTTLSALRLKQRLVILERYFIALNRTVFQENVKVKWKSSGISLPPVDKKSSRPAGKGVEGLARVGSRAALSFAFAFLRRAWRSGEDADLCSELLQESLDALRALPEASLFDESTVSSVWLEVVERATRFLRSVVTGDVHGTPATKGPGSIPLQDQHLALAILLELAVQRGTLSQMLSAILLLLQLWDSGAQETDNERSAQGTSAPLLPLLQRFQSIICRKDAPHSEGDMHLLSGPLSPNESFLRYLTLPQDNELAIDLRQTAVVVMAHLDRLATPCMPPLCSSPTSHKGSLQEVIGWGLIGWKYYANVIGPIQCEGLANLGVTQIACAEKRFLILSRNGRVYTQAYNSDTLAPQLVQGLASRNIVKIAAHSDGHHYLALAATGEVYSWGCGDGGRLGHGDTVPLEEPKVISAFSGKQAGKHVVHIACGSTYSAAITAEGELYTWGRGNYGRLGHGSSEDEAIPMLVAGLKGLKVIDVACGSGDAQTLAVTENGQVWSWGDGDYGKLGRGGSDGCKTPKLIEKLQDLDVVKVRCGSQFSIALTKDGQVYSWGKGDNQRLGHGTEEHVRYPKLLEGLQGKKVIDVAAGSTHCLALTEDSEVHSWGSNDQCQHFDTLRVTKPEPAALPGLDTKHIVGIACGPAQSFAWSSCSEWSIGLRVPFVVDICSMTFEQLDLLLRQVSEGMDGSADWPPPQEKECVAVATLNLLRLQLHAAISHQVDPEFLGLGLGSILLNSLKQTVVTLASSAGVLSTVQSAAQAVLQSGWSVLLPTAEERARALSALLPCAVSGNEVNISPGRRFMIDLLVGSLMADGGLESALHAAITAEIQDIEAKKEAQKEKEIDEQEANASTFHRSRTPLDKDLINTGICESSGKQCLPLVQLIQQLLRNIASQTVARLKDVARRISSCLDFEQHSRERSASLDLLLRFQRLLISKLYPGESIGQTSDISSPELMGVGSLLKKYTALLCTHIGDILPVAASIASTSWRHFAEVAYIVEGDFTGVLLPELVVSIVLLLSKNAGLMQEAGAVPLLGGLLEHLDRFNHLAPGKERDDHEELAWPGIMESFFTGQNCRNNEEVTLIRKADLENHNKDGGFWTVIDGKVYDIKDFQTQSLTGNSILAQFAGEDPVVALEAALQFEDTRESMHAFCVGQYLEPDQEIVTIPDLGSLSSPLIDTERNLGLLLGLHASYLAMSTPLSPVEIECAKWLQSSIFSGGLQTSQIHYSYNEEKDEDHCSSPGGTPASKSRLCSHRRALGDHSQAFLQAIADNNIQDHNVKDFLCQIERYCRQCHLTTPIMFPPEHPVEEVGRLLLCCLLKHEDLGHVALSLVHAGALGIEQVKHRTLPKSVVDVCRVVYQAKCSLIKTHQEQGRSYKEVCAPVIERLRFLFNELRPAVCNDLSIMSKFKLLSSLPRWRRIAQKIIRERRKKRVPKKPESTDDEEKIGNEESDLEEACILPHSPINVDKRPIAIKSPKDKWQPLLSTVTGVHKYKWLKQNVQGLYPQSPLLSTIAEFALKEEPVDVEKMRKCLLKQLERAEVRLEGIDTILKLASKNFLLPSVQYAMFCGWQRLIPEGIDIGEPLTDCLKDVDLIPPFNRMLLEVTFGKLYAWAVQNIRNVLMDASAKFKELGIQPVPLQTITNENPSGPSLGTIPQARFLLVMLSMLTLQHGANNLDLLLNSGMLALTQTALRLIGPSCDNVEEDMNASAQGASATVLEETRKETAPVQLPVSGPELAAMMKIGTRVMRGVDWKWGDQDGPPPGLGRVIGELGEDGWIRVQWDTGSTNSYRMGKEGKYDLKLAELPAAAQPSAEDSDTEDDSEAEQTERNIHPTAMMFTSTINLLQTLCLSAGVHAEIMQSEATKTLCGLLRM
LVESGTTDKTSSPNRLVYREQHRSWCTLGFVRSIALTPQVCGALSSPQWITLLMKVVEGHAPFTATSLQRQILAVHLLQAVLPSWDKTERARDMKCLVEKLFDFLGSLLTTCSSDVPLLRESTLRRRRVRPQASLTATHSSTLAEEVVALLRTLHSLTQWNGLINKYINSQLRSITHSFVGRPSEGAQLEDYFPDSENPEVGGLMAVLAVIGGIDGRLRLGGQVMHDEFGEGTVTRITPKGKITVQFSDMRTCRVCPLNQLKPLPAVAFNVNNLPFTEPMLSVWAQLVNLAGSKLEKHKIKKSTKQAFAGQVDLDLLRCQQLKLYILKAGRALLSHQDKLRQILSQPAVQETGTVHTDDGAVVSPDLGDMSPEGPQPPMILLQQLLASATQPSPVKAIFDKQELEAAALAVCQCLAVESTHPSSPGFEDCSSSEATTPVAVQHIRPARVKRRKQSPVPALPIVVQLMEMGFSRRNIEFALKSLTGASGNASSLPGVEALVGWLLDHSDIQVTELSDADTVSDEYSDEEVVEDVDDAAYSMSTGAVVTESQTYKKRADFLSNDDYAVYVRENIQVGMMVRCCRAYEEVCEGDVGKVIKLDRDGLHDLNVQCDWQQKGGTYWVRYIHVELIGYPPPSSSSHIKIGDKVRVKASVTTPKYKWGSVTHQSVGVVKAFSANGKDIIVDFPQQSHWTGLLSEMELVPSIHPGVTCDGCQMFPINGSRFKCRNCDDFDFCETCFKTKKHNTRHTFGRINEPGQSAVFCGRSGKQLKRCHSSQPGMLLDSWSRMVKSLNVSSSVNQASRLIDGSEPCWQSSGSQGKHWIRLEIFPDVLVHRLKMIVDPADSSYMPSLVVVSGGNSLNNLIELKTININPSDTTVPLLNDCTEYHRYIEIAIKQCRSSGIDCKIHGLILLGRIRAEEEDLAAVPFLASDNEEEEDEKGNSGSLIRKKAAGLESAATIRTKVFVWGLNDKDQLGGLKGSKIKVPSFSETLSALNVVQVAGGSKSLFAVTVEGKVYACGEATNGRLGLGISSGTVPIPRQITALSSYVVKKVAVHSGGRHATALTVDGKVFSWGEGDDGKLGHFSRMNCDKPRLIEALKTKRIRDIACGSSHSAALTSSGELYTWGLGEYGRLGHGDNTTQLKPKMVKVLLGHRVIQVACGSRDAQTLALTDEGLVFSWGDGDFGKLGRGGSEGCNIPQNIERLNGQGVCQIECGAQFSLALTKSGVVWTWGKGDYFRLGHGSDVHVRKPQVVEGLRGKKIVHVAVGALHCLAVTDSGQVYAWGDNDHGQQGNGTTTVNRKPTLVQGLEGQKITRVACGSSHSVAWTTVDVATPSVHEPVLFQTARDPLGASYLGVPSDADSSAASNKISGASNSKPNRPSLAKILLSLDGNLAKQQALSHILTALQIMYARDAVVGALMPAAMIAPVECPSFSSAAPSDASAMASPMNGEECMLAVDIEDRLSPNPWQEKREIVSSEDAVTPSAVTPSAPSASARPFIPVTDDLGAASIIAETMTKTKEDVESQNKAAGPEPQALDEFTSLLIADDTRVVVDLLKLSVCSRAGDRGRDVLSAVLSGMGTAYPQVADMLLELCVTELEDVATDSQSGRLSSQPVVVESSHPYTDDTSTSGTVKIPGAEGLRVEFDRQCSTERRHDPLTVMDGVNRIVSVRSGREWSDWSSELRIPGDELKWKFISDGSVNGWGWRFTVYPIMPAAGPKELLSDRCVLSCPSMDLVTCLLDFRLNLASNRSIVPRLAASLAACAQLSALAASHRMWALQRLRKLLTTEFGQSININRLLGENDGETRALSFTGSALAALVKGLPEALQRQFEYEDPIVRGGKQLLHSPFFKVLVALACDLELDTLPCCAETHKWAWFRRYCMASRVAVALDKRTPLPRLFLDEVAKKIRELMADSENMDVLHESHDIFKREQDEQLVQWMNRRPDDWTLSAGGSGTIYGWGHNHRGQLGGIEGAKVKVPTPCEALATLRPVQLIGGEQTLFA
VTADGKLYATGYGAGGRLGIGGTESVSTPTLLESIQHVFIKKVAVNSGGKHCLALSSEGEVYSWGEAEDGKLGHGNRSPCDRPRVIESLRGIEVVDVAAGGAHSACVTAAGDLYTWGKGRYGRLGHSDSEDQLKPKLVEALQGHRVVDIACGSGDAQTLCLTDDDTVWSWGDGDYGKLGRGGSDGCKVPMKIDSLTGLGVVKVECGSQFSVALTKSGAVYTWGKGDYHRLGHGSDDHVRRPRQVQGLQGKKVIAIATGSLHCVCCTEDGEVYTWGDNDEGQLGDGTTNAIQRPRLVAALQGKKVNRVACGSAHTLAWSTSKPASAGKLPAQVPMEYNHLQEIPIIALRNRLLLLHHLSELFCPCIPMFDLEGSLDETGLGPSVGFDTLRGILISQGKEAAFRKVVQATMVRDRQHGPVVELNRIQVKRSRSKGGLAGPDGTKSVFGQMCAKMSSFGPDSLLLPHRVWKVKFVGESVDDCGGGYSESIAEICEELQNGLTPLLIVTPNGRDESGANRDCYLLSPAARAPVHSSMFRFLGVLLGIAIRTGSPLSLNLAEPVWKQLAGMSLTIADLSEVDKDFIPGLMYIRDNEATSEEFEAMSLPFTVPSASGQDIQLSSKHTHITLDNRAEYVRLAINYRLHEFDEQVAAVREGMARVVPVPLLSLFTGYELETMVCGSPDIPLHLLKSVATYKGIEPSASLIQWFWEVMESFSNTERSLFLRFVWGRTRLPRTIADFRGRDFVIQVLDKYNPPDHFLPESYTCFFLLKLPRYSCKQVLEEKLKYAIHFCKSIDTDDYARIALTGEPAADDSSDDSDNEDVDSFASDSTQDYLTGH +>sp|Q6ZN18|AEBP2_HUMAN Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2 PE=1 SV=2 +MAAAITDMADLEELSRLSPLPPGSPGSAARGRAEPPEEEEEEEEEEEEAEAEAVAALLLNGGSGGGGGGGGGGVGGGEAETMSEPSPESASQAGEDEDEEEDDEEEEDESSSSGGGEEESSAESLVGSSGGSSSDETRSLSPGAASSSSGDGDGKEGLEEPKGPRGSQGGGGGGSSSSSVVSSGGDEGYGTGGGGSSATSGGRRGSLEMSSDGEPLSRMDSEDSISSTIMDVDSTISSGRSTPAMMNGQGSTTSSSKNIAYNCCWDQCQACFNSSPDLADHIRSIHVDGQRGGVFVCLWKGCKVYNTPSTSQSWLQRHMLTHSGDKPFKCVVGGCNASFASQGGLARHVPTHFSQQNSSKVSSQPKAKEESPSKAGMNKRRKLKNKRRRSLPRPHDFFDAQTLDAIRHRAICFNLSAHIESLGKGHSVVFHSTVIAKRKEDSGKIKLLLHWMPEDILPDVWVNESERHQLKTKVVHLSKLPKDTALLLDPNIYRTMPQKRLKRTLIRKVFNLYLSKQ +>sp|Q6ZN18-2|AEBP2_HUMAN Isoform 2 of Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2 
+MAAAITDMADLEELSRLSPLPPGSPGSAARGRAEPPEEEEEEEEEEEEAEAEAVAALLLNGGSGGGGGGGGGGVGGGEAETMSEPSPESASQAGEDEDEEEDDEEEEDESSSSGGGEEESSAESLVGSSGGSSSDETRSLSPGAASSSSGDGDGKEGLEEPKGPRGSQGGGGGGSSSSSVVSSGGDEGYGTGGGGSSATSGGRRGSLEMSSDGEPLSRMDSEDSISSTIMDVDSTISSGRSTPAMMNGQGSTTSSSKNIAYNCCWDQCQACFNSSPDLADHIRSIHVDGQRGGVFVCLWKGCKVYNTPSTSQSWLQRHMLTHSGDKPFKCVVGGCNASFASQGGLARHVPTHFSQQNSSKVSSQPKAKEESPSKAGMNKRRKLKNKRRRSLPRPHDFFDAQTLDAIRHRAICFNLSAHIESLGKGHSVVFHSTVIAKRKEDSGKIKLLLHWMPEDILPDVWVNESERHQLKTKVVHLSKLPKDTALLLDPNIYRTMPQKRLKR +>sp|Q6ZN18-3|AEBP2_HUMAN Isoform 3 of Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2 +MYTRRYSSISSTIMDVDSTISSGRSTPAMMNGQGSTTSSSKNIAYNCCWDQCQACFNSSPDLADHIRSIHVDGQRGGVFVCLWKGCKVYNTPSTSQSWLQRHMLTHSGDKPFKCVVGGCNASFASQGGLARHVPTHFSQQNSSKVSSQPKAKEESPSKAGMNKRRKLKNKRRRSLPRPHDFFDAQTLDAIRHRAICFNLSAHIESLGKGHSVVFHSTVIAKRKEDSGKIKLLLHWMPEDILPDVWVNESERHQLKTKVVHLSKLPKDTALLLDPNIYRTMPQKRLKRTLIRKVFNLYLSKQ +>sp|O15083|ERC2_HUMAN ERC protein 2 OS=Homo sapiens OX=9606 GN=ERC2 PE=1 SV=3 +MYGSARTITNLEGSPSRSPRLPRSPRLGHRRTSSGGGGGTGKTLSMENIQSLNAAYATSGPMYLSDHEGVASTTYPKGTMTLGRATNRAVYGGRVTAMGSSPNIASAGLSHTDVLSYTDQHGGLTGSSHHHHHQVPSMLRQVRDSTMLDLQAQLKELQRENDLLRKELDIKDSKLGSSMNSIKTFWSPELKKERVLRKEEAARMSVLKEQMRVSHEENQHLQLTIQALQDELRTQRDLNHLLQQESGNRGAEHFTIELTEENFRRLQAEHDRQAKELFLLRKTLEEMELRIETQKQTLNARDESIKKLLEMLQSKGLPSKSLEDDNERTRRMAEAESQVSHLEVILDQKEKENIHLREELHRRSQLQPEPAKTKALQTVIEMKDTKIASLERNIRDLEDEIQMLKANGVLNTEDREEEIKQIEVYKSHSKFMKTKIDQLKQELSKKESELLALQTKLETLSNQNSDCKQHIEVLKESLTAKEQRAAILQTEVDALRLRLEEKESFLNKKTKQLQDLTEEKGTLAGEIRDMKDMLEVKERKINVLQKKIENLQEQLRDKDKQLTNLKDRVKSLQTDSSNTDTALATLEEALSEKERIIERLKEQRERDDRERLEEIESFRKENKDLKEKVNALQAELTEKESSLIDLKEHASSLASAGLKRDSKLKSLEIAIEQKKEECSKLEAQLKKAHNIEDDSRMNPEFADQIKQLDKEASYYRDECGKAQAEVDRLLEILKEVENEKNDKDKKIAELESLTLRHMKDQNKKVANLKHNQQLEKKKNAQLLEEVRRREDSMADNSQHLQIEELMNALEKTRQELDATKARLASTQQSLAEKEAHLANLRIERRKQLEEILEMKQEALLAAISEKDANIALLELSASKKKKTQEEVMALKREKDRLVHQLKQQTQNRMKLMADNYDDDHHHYHHHHHHHHHRSPGRSQHSNHRPSPDQDDEEGIWA +>sp|P23763|VAMP1_HUMAN_Vesicle-associated membrane 
protein 1 OS=Homo sapiens OX=9606 GN=VAMP1 PE=1 SV=1 +MSAPAQPPAEGTEGTAPGGGPPGPPPNMTSNRRLQQTQAQVEEVVDIIRVNVDKVLERDQKLSELDDRADALQAGASQFESSAAKLKRKYWWKNCKMMIMLGAICAIIVVVIVIYFFT +>sp|P23763-3|VAMP1_HUMAN_Isoform 2 of Vesicle-associated membrane protein 1 OS=Homo sapiens OX=9606 GN=VAMP1 +MSAPAQPPAEGTEGTAPGGGPPGPPPNMTSNRRLQQTQAQVEEVVDIIRVNVDKVLERDQKLSELDDRADALQAGASQFESSAAKLKRKYWWKNCKMMIMLGAICAIIVVVIVSKYR +>sp|P23763-2|VAMP1_HUMAN_Isoform 3 of Vesicle-associated membrane protein 1 OS=Homo sapiens OX=9606 GN=VAMP1 +MSAPAQPPAEGTEGTAPGGGPPGPPPNMTSNRRLQQTQAQVEEVVDIIRVNVDKVLERDQKLSELDDRADALQAGASQFESSAAKLKRKYWWKNCKMMIMLGAICAIIVVVIVRRD +>sp|Q15836|VAMP3_HUMAN_Vesicle-associated membrane protein 3 OS=Homo sapiens OX=9606 GN=VAMP3 PE=1 SV=3 +MSTGPTAATGSNRRLQQTQNQVDEVVDIMRVNVDKVLERDQKLSELDDRADALQAGASQFETSAAKLKRKYWWKNCKMWAIGITVLVIFIIIIIVWVVSS +>sp|P63027|VAMP2_HUMAN_Vesicle-associated membrane protein 2 OS=Homo sapiens OX=9606 GN=VAMP2 PE=1 SV=3 +MSATAATAPPAAPAGEGGPPAPPPNLTSNRRLQQTQAQVDEVVDIMRVNVDKVLERDQKLSELDDRADALQAGASQFETSAAKLKRKYWWKNLKMMIILGVICAIILIIIIVYFST +>sp|O75379|VAMP4_HUMAN_Vesicle-associated membrane protein 4 OS=Homo sapiens OX=9606 GN=VAMP4 PE=1 SV=2 +MPPKFKRHLNDDDVTGSVKSERRNLLEDDSDEEEDFFLRGPSGPRFGPRNDKIKHVQNQVDEVIDVMQENITKVIERGERLDELQDKSESLSDNATAFSNRSKQLRRQMWWRGCKIKAIMALVAAILLLVIIILIVMKYRT +>sp|O75379-2|VAMP4_HUMAN_Isoform 2 of Vesicle-associated membrane protein 4 OS=Homo sapiens OX=9606 GN=VAMP4 +MPPKFKRHLNDDDVTGSVKSERRNLLEDDSDEEEDFFLGPSGPRFGPRNDKIKHVQNQVDEVIDVMQENITKVIERGERLDELQDKSESLSDNATAFSNRSKQLRRQMWWRGCKIKAIMALVAAILLLVIIILIVMKYRT +>sp|O95183|VAMP5_HUMAN_Vesicle-associated membrane protein 5 OS=Homo sapiens OX=9606 GN=VAMP5 PE=1 SV=1 +MAGIELERCQQQANEVTEIMRNNFGKVLERGVKLAELQQRSDQLLDMSSTFNKTTQNLAQKKCWENIRYRICVGLVVVGVLLIILIVLLVVFLPQSSDSSSAPRTQDAGIASGPGN +>sp|P51809|VAMP7_HUMAN_Vesicle-associated membrane protein 7 OS=Homo sapiens OX=9606 GN=VAMP7 PE=1 SV=3 
+MAILFAVVARGTTILAKHAWCGGNFLEVTEQILAKIPSENNKLTYSHGNYLFHYICQDRIVYLCITDDDFERSRAFNFLNEIKKRFQTTYGSRAQTALPYAMNSEFSSVLAAQLKHHSENKGLDKVMETQAQVDELKGIMVRNIDLVAQRGERLELLIDKTENLVDSSVTFKTTSRNLARAMCMKNLKLTIIIIIVSIVFIYIIVSPLCGGFTWPSCVKK +>sp|P51809-2|VAMP7_HUMAN_Isoform 2 of Vesicle-associated membrane protein 7 OS=Homo sapiens OX=9606 GN=VAMP7 +MAILFAVVARGTTILAKHAWCGGNFLEVTEQILAKIPSENNKLTYSHGNYLFHYICQDRIVYLCITDDDFERSRAFNFLNEIKKRFQTTYGSRAQTALPYAMNSEFSSVLAAQLKHHSENKGLDKVMETQAQVDELKGIMVRNIVCHLQNYQQKSCSSHVYEEPQAHYYHHHRINCVHLYHCFTSLWWIYMAKLCEEIGKKKLPLTKDMREQGVKSNPCDSSLSHTDRWYLPVSSTLFSLFKILFHASRFIFVLSTSLFL +>sp|P51809-3|VAMP7_HUMAN_Isoform 3 of Vesicle-associated membrane protein 7 OS=Homo sapiens OX=9606 GN=VAMP7 +MAILFAVVARGTTILAKHAWCGGNFLEDFERSRAFNFLNEIKKRFQTTYGSRAQTALPYAMNSEFSSVLAAQLKHHSENKGLDKVMETQAQVDELKGIMVRNIDLVAQRGERLELLIDKTENLVDSSVTFKTTSRNLARAMCMKNLKLTIIIIIVSIVFIYIIVSPLCGGFTWPSCVKK +>sp|Q9BV40|VAMP8_HUMAN_Vesicle-associated membrane protein 8 OS=Homo sapiens OX=9606 GN=VAMP8 PE=1 SV=1 +MEEASEGGGNDRVRNLQSEVEGVKNIMTQNVERILARGENLEHLRNKTEDLEATSEHFKTTSQKVARKFWWKNVKMIVLICVIVFIIILFIVLFATGAFS
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/workflow/ppenrich_suite_wf.ga Mon Mar 07 19:05:01 2022 +0000 @@ -0,0 +1,653 @@ +{ + "a_galaxy_workflow": "true", + "annotation": "phosphoproteomic enrichment data pre-processing and ANOVA", + "creator": [ + { + "class": "Person", + "identifier": "0000-0002-2882-0508", + "name": "Art Eschenlauer" + } + ], + "format-version": "0.1", + "license": "MIT", + "name": "ppenrich_suite_wf", + "steps": { + "0": { + "annotation": "The Phospho (STY)Sites.txt file produced by MaxQuant (found in the txt folder).", + "content_id": null, + "errors": null, + "id": 0, + "input_connections": {}, + "inputs": [ + { + "description": "The Phospho (STY)Sites.txt file produced by MaxQuant (found in the txt folder).", + "name": "Phospho (STY)Sites.txt" + } + ], + "label": "Phospho (STY)Sites.txt", + "name": "Input dataset", + "outputs": [], + "position": { + "bottom": 257.06666564941406, + "height": 81.39999389648438, + "left": 339.95001220703125, + "right": 539.9500122070312, + "top": 175.6666717529297, + "width": 200, + "x": 339.95001220703125, + "y": 175.6666717529297 + }, + "tool_id": null, + "tool_state": "{\"optional\": false, \"format\": [\"tabular\"], \"tag\": null}", + "tool_version": null, + "type": "data_input", + "uuid": "002d55e6-29a5-426d-9248-70ec33424b15", + "workflow_outputs": [] + }, + "1": { + "annotation": "FASTA file of all human canonical isoforms, derived from Swiss-Prot (e.g., merge of https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot_varsplic.fasta.gz and https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz)", + "content_id": null, + "errors": null, + "id": 1, + "input_connections": {}, + "inputs": [ + { + "description": "FASTA file of all human canonical isoforms, derived from Swiss-Prot (e.g., merge of 
https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot_varsplic.fasta.gz and https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz)", + "name": "SwissProt_Human_Canonical_Isoform.fasta" + } + ], + "label": "SwissProt_Human_Canonical_Isoform.fasta", + "name": "Input dataset", + "outputs": [], + "position": { + "bottom": 411.4666748046875, + "height": 101.79998779296875, + "left": 379.95001220703125, + "right": 579.9500122070312, + "top": 309.66668701171875, + "width": 200, + "x": 379.95001220703125, + "y": 309.66668701171875 + }, + "tool_id": null, + "tool_state": "{\"optional\": false, \"format\": [\"fasta\"], \"tag\": null}", + "tool_version": null, + "type": "data_input", + "uuid": "8f079dcc-1843-47cd-b4dc-1830e4466430", + "workflow_outputs": [] + }, + "2": { + "annotation": "Derived from https://networkin.info/download/networkin_human_predictions_3.1.tsv.xz (which is free for non-commercial use - for required citation, see https://networkin.info/)", + "content_id": null, + "errors": null, + "id": 2, + "input_connections": {}, + "inputs": [ + { + "description": "Derived from https://networkin.info/download/networkin_human_predictions_3.1.tsv.xz (which is free for non-commercial use - for required citation, see https://networkin.info/)", + "name": "NetworKIN_cutoffscore2.0.tabular" + } + ], + "label": "NetworKIN_cutoffscore2.0.tabular", + "name": "Input dataset", + "outputs": [], + "position": { + "bottom": 573.4666748046875, + "height": 101.79998779296875, + "left": 418.95001220703125, + "right": 618.9500122070312, + "top": 471.66668701171875, + "width": 200, + "x": 418.95001220703125, + "y": 471.66668701171875 + }, + "tool_id": null, + "tool_state": "{\"optional\": false, \"format\": [\"tabular\"], \"tag\": null}", + "tool_version": null, + "type": "data_input", + "uuid": "dc894a94-97a3-40ff-811e-01b30d498478", + "workflow_outputs": [] + }, + "3": { + "annotation": 
"Derived from http://hprd.org/serine_motifs, http://hprd.org/tyrosine_motifs, and http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx", + "content_id": null, + "errors": null, + "id": 3, + "input_connections": {}, + "inputs": [ + { + "description": "Derived from http://hprd.org/serine_motifs, http://hprd.org/tyrosine_motifs, and http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx", + "name": "pSTY_Motifs.tabular" + } + ], + "label": "pSTY_Motifs.tabular", + "name": "Input dataset", + "outputs": [], + "position": { + "bottom": 726.0666809082031, + "height": 81.39999389648438, + "left": 459.95001220703125, + "right": 659.9500122070312, + "top": 644.6666870117188, + "width": 200, + "x": 459.95001220703125, + "y": 644.6666870117188 + }, + "tool_id": null, + "tool_state": "{\"optional\": false, \"format\": [\"tabular\"], \"tag\": null}", + "tool_version": null, + "type": "data_input", + "uuid": "6fc936ad-0b52-484f-a051-73c1776fdeb0", + "workflow_outputs": [] + }, + "4": { + "annotation": "Derived from Kinase_Substrate_Dataset.gz found at https://www.phosphosite.org/staticDownloads (free for non-commercial use - see that link for citation.)", + "content_id": null, + "errors": null, + "id": 4, + "input_connections": {}, + "inputs": [ + { + "description": "Derived from Kinase_Substrate_Dataset.gz found at https://www.phosphosite.org/staticDownloads (free for non-commercial use - see that link for citation.)", + "name": "PSP_Kinase_Substrate_Dataset.tabular" + } + ], + "label": "PSP_Kinase_Substrate_Dataset.tabular", + "name": "Input dataset", + "outputs": [], + "position": { + "bottom": 894.4666748046875, + "height": 101.79998779296875, + "left": 503.95001220703125, + "right": 703.9500122070312, + "top": 792.6666870117188, + "width": 200, + "x": 503.95001220703125, + "y": 792.6666870117188 + }, + "tool_id": null, + "tool_state": "{\"optional\": false, \"format\": [\"tabular\"], \"tag\": null}", + "tool_version": null, + "type": "data_input", + "uuid": 
"22b77482-2339-4b45-8fc6-d39f7175131b", + "workflow_outputs": [] + }, + "5": { + "annotation": "Derived from Regulatory_sites.gz found at https://www.phosphosite.org/staticDownloads (free for non-commercial use - see that link for citation.)", + "content_id": null, + "errors": null, + "id": 5, + "input_connections": {}, + "inputs": [ + { + "description": "Derived from Regulatory_sites.gz found at https://www.phosphosite.org/staticDownloads (free for non-commercial use - see that link for citation.)", + "name": "PSP_Regulatory_sites.tabular" + } + ], + "label": "PSP_Regulatory_sites.tabular", + "name": "Input dataset", + "outputs": [], + "position": { + "bottom": 1041.0666809082031, + "height": 81.39999389648438, + "left": 535.9500122070312, + "right": 735.9500122070312, + "top": 959.6666870117188, + "width": 200, + "x": 535.9500122070312, + "y": 959.6666870117188 + }, + "tool_id": null, + "tool_state": "{\"optional\": false, \"format\": [\"tabular\"], \"tag\": null}", + "tool_version": null, + "type": "data_input", + "uuid": "3d97a902-1408-403c-b82e-ddb6ca6a7d47", + "workflow_outputs": [] + }, + "6": { + "annotation": "List of alpha cutoff values for significance testing; text file having no header and a single line for each cutoff value.", + "content_id": null, + "errors": null, + "id": 6, + "input_connections": {}, + "inputs": [ + { + "description": "List of alpha cutoff values for significance testing; text file having no header and a single line for each cutoff value.", + "name": "alpha_levels.tabular" + } + ], + "label": "alpha_levels.tabular", + "name": "Input dataset", + "outputs": [], + "position": { + "bottom": 1210.5666198730469, + "height": 81.39999389648438, + "left": 562.9500122070312, + "right": 762.9500122070312, + "top": 1129.1666259765625, + "width": 200, + "x": 562.9500122070312, + "y": 1129.1666259765625 + }, + "tool_id": null, + "tool_state": "{\"optional\": false, \"format\": [\"tabular\"], \"tag\": null}", + "tool_version": null, + "type": 
"data_input", + "uuid": "7b5eab97-7dad-4b0e-81eb-22aac39dd5b6", + "workflow_outputs": [] + }, + "7": { + "annotation": "", + "content_id": "mqppep_preproc", + "errors": null, + "id": 7, + "input_connections": { + "networkin": { + "id": 2, + "output_name": "output" + }, + "p_sty_motifs": { + "id": 3, + "output_name": "output" + }, + "phosphoSites": { + "id": 0, + "output_name": "output" + }, + "protein_fasta": { + "id": 1, + "output_name": "output" + }, + "psp_kinase_substrate": { + "id": 4, + "output_name": "output" + }, + "psp_regulatory_sites": { + "id": 5, + "output_name": "output" + } + }, + "inputs": [], + "label": null, + "name": "MaxQuant Phosphopeptide Preprocessing", + "outputs": [ + { + "name": "phosphoPepIntensities", + "type": "tabular" + }, + { + "name": "enrichGraph", + "type": "pdf" + }, + { + "name": "locProbCutoffGraph", + "type": "pdf" + }, + { + "name": "enrichGraph_svg", + "type": "svg" + }, + { + "name": "locProbCutoffGraph_svg", + "type": "svg" + }, + { + "name": "filteredData_tabular", + "type": "tabular" + }, + { + "name": "quantData_tabular", + "type": "tabular" + }, + { + "name": "mapped_phophopeptides", + "type": "tabular" + }, + { + "name": "melted_phophopeptide_map", + "type": "tabular" + }, + { + "name": "mqppep_output_sqlite", + "type": "sqlite" + }, + { + "name": "preproc_tab", + "type": "tabular" + }, + { + "name": "preproc_csv", + "type": "csv" + }, + { + "name": "preproc_sqlite", + "type": "sqlite" + } + ], + "position": { + "bottom": 1186.6000366210938, + "height": 812.933349609375, + "left": 945.4500122070312, + "right": 1145.4500122070312, + "top": 373.66668701171875, + "width": 200, + "x": 945.4500122070312, + "y": 373.66668701171875 + }, + "post_job_actions": { + "RenameDatasetActionenrichGraph": { + "action_arguments": { + "newname": "#{phosphoSites}.enrichGraph_pdf" + }, + "action_type": "RenameDatasetAction", + "output_name": "enrichGraph" + }, + "RenameDatasetActionenrichGraph_svg": { + "action_arguments": { + "newname": 
"#{phosphoSites}.enrichGraph_svg" + }, + "action_type": "RenameDatasetAction", + "output_name": "enrichGraph_svg" + }, + "RenameDatasetActionfilteredData_tabular": { + "action_arguments": { + "newname": "#{phosphoSites}.filteredData" + }, + "action_type": "RenameDatasetAction", + "output_name": "filteredData_tabular" + }, + "RenameDatasetActionlocProbCutoffGraph": { + "action_arguments": { + "newname": "#{phosphoSites}.locProbCutoffGraph_pdf" + }, + "action_type": "RenameDatasetAction", + "output_name": "locProbCutoffGraph" + }, + "RenameDatasetActionlocProbCutoffGraph_svg": { + "action_arguments": { + "newname": "#{phosphoSites}.locProbCutoffGraph_svg" + }, + "action_type": "RenameDatasetAction", + "output_name": "locProbCutoffGraph_svg" + }, + "RenameDatasetActionmapped_phophopeptides": { + "action_arguments": { + "newname": "#{phosphoSites}.ppep_map" + }, + "action_type": "RenameDatasetAction", + "output_name": "mapped_phophopeptides" + }, + "RenameDatasetActionmelted_phophopeptide_map": { + "action_arguments": { + "newname": "#{phosphoSites}.melted" + }, + "action_type": "RenameDatasetAction", + "output_name": "melted_phophopeptide_map" + }, + "RenameDatasetActionmqppep_output_sqlite": { + "action_arguments": { + "newname": "#{phosphoSites}.ppep_mapping_sqlite" + }, + "action_type": "RenameDatasetAction", + "output_name": "mqppep_output_sqlite" + }, + "RenameDatasetActionphosphoPepIntensities": { + "action_arguments": { + "newname": "#{phosphoSites}.ppep_intensities" + }, + "action_type": "RenameDatasetAction", + "output_name": "phosphoPepIntensities" + }, + "RenameDatasetActionpreproc_csv": { + "action_arguments": { + "newname": "#{phosphoSites}.preproc_csv" + }, + "action_type": "RenameDatasetAction", + "output_name": "preproc_csv" + }, + "RenameDatasetActionpreproc_sqlite": { + "action_arguments": { + "newname": "#{phosphoSites}.preproc_sqlite" + }, + "action_type": "RenameDatasetAction", + "output_name": "preproc_sqlite" + }, + 
"RenameDatasetActionpreproc_tab": { + "action_arguments": { + "newname": "#{phosphoSites}.preproc_tab" + }, + "action_type": "RenameDatasetAction", + "output_name": "preproc_tab" + }, + "RenameDatasetActionquantData_tabular": { + "action_arguments": { + "newname": "#{phosphoSites}.quantData" + }, + "action_type": "RenameDatasetAction", + "output_name": "quantData_tabular" + } + }, + "tool_id": "mqppep_preproc", + "tool_state": "{\"collapseFunc\": \"sum\", \"enriched\": \"ST\", \"intervalCol\": \"1\", \"localProbCutoff\": \"0.75\", \"merge_function\": \"sum\", \"networkin\": {\"__class__\": \"ConnectedValue\"}, \"p_sty_motifs\": {\"__class__\": \"ConnectedValue\"}, \"phosphoCol\": \"^Number of Phospho [(]STY[)]$\", \"phosphoSites\": {\"__class__\": \"ConnectedValue\"}, \"phospho_type\": \"sty\", \"protein_fasta\": {\"__class__\": \"ConnectedValue\"}, \"psp_kinase_substrate\": {\"__class__\": \"ConnectedValue\"}, \"psp_regulatory_sites\": {\"__class__\": \"ConnectedValue\"}, \"species\": \"human\", \"startCol\": \"^Intensity[^_]\", \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "0.1.0+galaxy0", + "type": "tool", + "uuid": "235b1a2e-ccc0-4c91-bb91-bbf4d272c870", + "workflow_outputs": [ + { + "label": "ppep_intensities", + "output_name": "phosphoPepIntensities", + "uuid": "92fd4e27-5d4b-4e9f-b3ad-6bdad53bb93d" + }, + { + "label": "enrichGraph_pdf", + "output_name": "enrichGraph", + "uuid": "4c1d5590-f8ba-421c-858c-4c026691b52e" + }, + { + "label": "locProbCutoffGraph_pdf", + "output_name": "locProbCutoffGraph", + "uuid": "66a79534-6372-4937-bcf2-8644be985eea" + }, + { + "label": "enrichGraph_svg", + "output_name": "enrichGraph_svg", + "uuid": "5e713d9c-1868-423b-be9a-25c0486e1472" + }, + { + "label": "locProbCutoffGraph_svg", + "output_name": "locProbCutoffGraph_svg", + "uuid": "4621ea21-ae90-4547-a68f-30dfc7857368" + }, + { + "label": "filteredData", + "output_name": "filteredData_tabular", + "uuid": "bb26d0fb-6f19-43c7-80ef-1cf81aa09ee8" + 
}, + { + "label": "quantData", + "output_name": "quantData_tabular", + "uuid": "20efe04f-2700-4af0-92c6-0830a42d8e75" + }, + { + "label": "ppep_map", + "output_name": "mapped_phophopeptides", + "uuid": "037e2b97-8fc8-436d-bcc3-af5ee685b752" + }, + { + "label": "melted_phosphopeptide_map", + "output_name": "melted_phophopeptide_map", + "uuid": "c3e5de84-2659-45eb-81a6-edef6037d8aa" + }, + { + "label": "ppep_mapping_sqlite", + "output_name": "mqppep_output_sqlite", + "uuid": "a1a4f827-1f1f-4175-ae51-c238f9e1f248" + }, + { + "label": "preproc_tab", + "output_name": "preproc_tab", + "uuid": "b22b4b56-9395-4f6d-945e-0089e8897069" + }, + { + "label": "preproc_csv", + "output_name": "preproc_csv", + "uuid": "54be90f9-1158-4686-af42-43d021088300" + }, + { + "label": "preproc_sqlite", + "output_name": "preproc_sqlite", + "uuid": "33663f9c-b718-4bdd-acc9-087c76bea678" + } + ] + }, + "8": { + "annotation": "Perform ANOVA. For imputing missing values, use median of non-missing values from the same treatment group.", + "content_id": "mqppep_anova", + "errors": null, + "id": 8, + "input_connections": { + "alpha_file": { + "id": 6, + "output_name": "output" + }, + "input_file": { + "id": 7, + "output_name": "preproc_tab" + } + }, + "inputs": [], + "label": "MaxQuant Phosphopeptide ANOVA group-median imputed", + "name": "MaxQuant Phosphopeptide ANOVA", + "outputs": [ + { + "name": "imputed_data_file", + "type": "tabular" + }, + { + "name": "report_file", + "type": "html" + } + ], + "position": { + "bottom": 1488.0999603271484, + "height": 254.93333435058594, + "left": 1202.949951171875, + "right": 1402.949951171875, + "top": 1233.1666259765625, + "width": 200, + "x": 1202.949951171875, + "y": 1233.1666259765625 + }, + "post_job_actions": { + "RenameDatasetActionimputed_data_file": { + "action_arguments": { + "newname": "#{input_file}.intensities_group-mean-imputed_QN_LT" + }, + "action_type": "RenameDatasetAction", + "output_name": "imputed_data_file" + }, + 
"RenameDatasetActionreport_file": { + "action_arguments": { + "newname": "#{input_file}.intensities_group-mean-imputed_report (download/unzip to view)" + }, + "action_type": "RenameDatasetAction", + "output_name": "report_file" + } + }, + "tool_id": "mqppep_anova", + "tool_state": "{\"alpha_file\": {\"__class__\": \"ConnectedValue\"}, \"first_data_column\": \"Intensity\", \"imputation\": {\"imputation_method\": \"group-median\", \"__current_case__\": 0}, \"input_file\": {\"__class__\": \"ConnectedValue\"}, \"sample_grouping_regex\": \"(\\\\d+)\", \"sample_names_regex\": \"\\\\.(\\\\d+)[A-Z]$\", \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "0.1.0+galaxy0", + "type": "tool", + "uuid": "2257286b-6f9a-45c1-90a3-bf5b972959d5", + "workflow_outputs": [ + { + "label": "intensities_group-mean-imputed_QN_LT", + "output_name": "imputed_data_file", + "uuid": "8e7317c6-95e9-4454-b4d7-31b4de6167a8" + }, + { + "label": "intensities_group-mean-imputed_report", + "output_name": "report_file", + "uuid": "dfe9b34e-1f3e-4971-8382-41178104e253" + } + ] + }, + "9": { + "annotation": "Perform ANOVA. 
For imputing missing values, create random values.", + "content_id": "mqppep_anova", + "errors": null, + "id": 9, + "input_connections": { + "alpha_file": { + "id": 6, + "output_name": "output" + }, + "input_file": { + "id": 7, + "output_name": "preproc_tab" + } + }, + "inputs": [], + "label": "MaxQuant Phosphopeptide ANOVA randomly imputed", + "name": "MaxQuant Phosphopeptide ANOVA", + "outputs": [ + { + "name": "imputed_data_file", + "type": "tabular" + }, + { + "name": "report_file", + "type": "html" + } + ], + "position": { + "bottom": 1325.0999603271484, + "height": 254.93333435058594, + "left": 1452.949951171875, + "right": 1652.949951171875, + "top": 1070.1666259765625, + "width": 200, + "x": 1452.949951171875, + "y": 1070.1666259765625 + }, + "post_job_actions": { + "RenameDatasetActionimputed_data_file": { + "action_arguments": { + "newname": "#{input_file}.intensities_randomly-imputed_QN_LT" + }, + "action_type": "RenameDatasetAction", + "output_name": "imputed_data_file" + }, + "RenameDatasetActionreport_file": { + "action_arguments": { + "newname": "#{input_file}.intensities_randomly-imputed_report (download/unzip to view)" + }, + "action_type": "RenameDatasetAction", + "output_name": "report_file" + } + }, + "tool_id": "mqppep_anova", + "tool_state": "{\"alpha_file\": {\"__class__\": \"ConnectedValue\"}, \"first_data_column\": \"Intensity\", \"imputation\": {\"imputation_method\": \"random\", \"__current_case__\": 3, \"meanPercentile\": \"1\", \"sdPercentile\": \"0.2\"}, \"input_file\": {\"__class__\": \"ConnectedValue\"}, \"sample_grouping_regex\": \"(\\\\d+)\", \"sample_names_regex\": \"\\\\.(\\\\d+)[A-Z]$\", \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": "0.1.0+galaxy0", + "type": "tool", + "uuid": "9516971c-8532-4797-8bf9-4655ff104dbd", + "workflow_outputs": [ + { + "label": "intensities_randomly-imputed_QN_LT", + "output_name": "imputed_data_file", + "uuid": "8ceda029-d5fd-4d75-a2b3-ac582bb137c3" + }, + { + "label": 
"intensities_randomly-imputed_report", + "output_name": "report_file", + "uuid": "84bedf25-c15b-4cc7-97e0-92f746e89f9c" + } + ] + } + }, + "tags": [ + "ppenrich" + ], + "uuid": "ac7bf2d1-89fe-4bf6-920a-d5508842d3f9", + "version": 7 +} \ No newline at end of file