Mercurial > repos > eschen42 > mqppep_anova
changeset 22:61adb8801b73 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 0c7ca054e77e042c8a584c9903073da064df7d8b
author | eschen42 |
---|---|
date | Thu, 30 Jun 2022 16:16:32 +0000 |
parents | f3deca1a3c84 |
children | 3911581e639a |
files | macros.xml mqppep_anova.R mqppep_anova.xml mqppep_anova_script.Rmd test-data/test_input_for_anova.tabular workflow/ppenrich_suite_wf.ga |
diffstat | 6 files changed, 2808 insertions(+), 357 deletions(-) [+] |
line wrap: on
line diff
--- a/macros.xml Wed Apr 13 19:48:32 2022 +0000 +++ b/macros.xml Thu Jun 30 16:16:32 2022 +0000 @@ -1,17 +1,30 @@ <macros> - <token name="@TOOL_VERSION@">0.1.12</token> + <token name="@TOOL_VERSION@">0.1.13</token> <token name="@VERSION_SUFFIX@">0</token> <xml name="requirements"> <requirements> - <requirement type="package" version="0.4.0" >r-sass</requirement> - <requirement type="package" version="1.14.2" >r-data.table</requirement> <requirement type="package" version="1.56.0" >bioconductor-preprocesscore</requirement> - <requirement type="package" version="5.26.2" >perl</requirement> + <requirement type="package" version="1.22.2" >numpy</requirement> <requirement type="package" version="0.3.3" >openblas</requirement> - <requirement type="package" version="1.22.2" >numpy</requirement> + <requirement type="package" version="1.4.1" >pandas</requirement> + <requirement type="package" version="1.64" >perl-dbd-sqlite</requirement> + <requirement type="package" version="5.26.2" >perl</requirement> + <requirement type="package" version="1.4.0" >pyahocorasick</requirement> + <requirement type="package" version="3.9.10" >python</requirement> + <requirement type="package" version="1.14.2" >r-data.table</requirement> + <requirement type="package" version="1.1.2" >r-dbi</requirement> <requirement type="package" version="3.3.5" >r-ggplot2</requirement> + <requirement type="package" version="3.1.3" >r-gplots</requirement> + <requirement type="package" version="0.9.4" >r-latex2exp</requirement> + <requirement type="package" version="1.7.1" >r-optparse</requirement> + <requirement type="package" version="1.4.4" >r-reshape2</requirement> + <requirement type="package" version="2.11" >r-rmarkdown</requirement> + <requirement type="package" version="2.2.8" >r-rsqlite</requirement> + <requirement type="package" version="0.4.0" >r-sass</requirement> + <requirement type="package" version="0.4_11" >r-sqldf</requirement> + <requirement type="package" version="1.4.0" >r-stringr</requirement> + <requirement type="package" version="0.37" >r-tinytex</requirement> <requirement type="package" version="0.3.7" >r-vioplot</requirement> - <requirement type="package" version="0.37" >r-tinytex</requirement> <!-- It would be nice to use conda-forge/texlive-core rather than r-tinytex because the former installs texlive when the package is built, but issue 23 blocked PDF-creation. @@ -19,14 +32,58 @@ with boxes) unless I specified the build as well as the version when building a conda environment, e.g.: texlive-core=20210325=h97429d4_0 --> - <requirement type="package" version="3.9.10" >python</requirement> - <requirement type="package" version="0.9.4" >r-latex2exp</requirement> - <requirement type="package" version="1.4.0" >r-stringr</requirement> - <requirement type="package" version="1.64" >perl-dbd-sqlite</requirement> - <requirement type="package" version="1.4.0" >pyahocorasick</requirement> - <requirement type="package" version="2.11" >r-rmarkdown</requirement> - <requirement type="package" version="1.4.1" >pandas</requirement> - <requirement type="package" version="1.7.1" >r-optparse</requirement> </requirements> + <!-- I specified the versions above because it takes a VERY long time to search for package versions when they are not omitted; also, version numbers should lead to reproducible behavior. Contrast execution times of this (about 18 seconds): + echo n | time conda create -n mqppep_ver -c conda-forge -c bioconda \ + bioconductor-preprocesscore=1.56.0 \ + numpy=1.22.2 \ + openblas=0.3.3 \ + pandas=1.4.1 \ + perl-dbd-sqlite=1.64 \ + perl-dbd-sqlite=1.64 \ + perl=5.26.2 \ + pyahocorasick=1.4.0 \ + python=3.9.10 \ + r-data.table=1.14.2 \ + r-dbi=1.1.2 \ + r-ggplot2=3.3.5 \ + r-gplots=3.1.3 \ + r-latex2exp=0.9.4 \ + r-optparse=1.7.1 \ + r-reshape2=1.4.4 \ + r-rmarkdown=2.11 \ + r-rsqlite=2.2.8 \ + r-sass=0.4.0 \ + r-sqldf=0.4_11 \ + r-stringr=1.4.0 \ + r-tinytex=0.37 \ + r-vioplot=0.3.7 + with this (42 or more seconds): + echo n | time conda create -n mqppep_nover -c conda-forge -c bioconda \ + bioconductor-preprocesscore= \ + numpy \ + openblas=0.3.3 \ + pandas \ + perl \ + perl-dbd-sqlite \ + perl-dbd-sqlite \ + pyahocorasick \ + python \ + r-data.table \ + r-dbi \ + r-ggplot2 \ + r-gplots \ + r-latex2exp \ + r-optparse \ + r-reshape2 \ + r-rmarkdown \ + r-rsqlite \ + r-sass \ + r-sqldf \ + r-stringr \ + r-tinytex \ + r-vioplot + + --> </xml> </macros>
--- a/mqppep_anova.R Wed Apr 13 19:48:32 2022 +0000 +++ b/mqppep_anova.R Thu Jun 30 16:16:32 2022 +0000 @@ -3,12 +3,6 @@ library(optparse) library(data.table) library(stringr) -# bioconductor-preprocesscore -# - libopenblas -# - r-data.table -# - r-rmarkdown -# - r-ggplot2 -# - texlive-core # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285 @@ -30,6 +24,20 @@ " path to text file having one column and no header") ), make_option( + c("-S", "--preproc_sqlite"), + action = "store", + default = NA, + type = "character", + help = "Path to 'preproc_sqlite' produced by `mqppep_mrgfltr.py`" + ), + make_option( + c("-K", "--ksea_sqlite"), + action = "store", + default = NA, + type = "character", + help = "Path to 'ksea_sqlite' output produced by this tool" + ), + make_option( c("-f", "--firstDataColumn"), action = "store", default = "^Intensity[^_]", @@ -99,6 +107,28 @@ default = "QuantDataProcessingScript.html", type = "character", help = "HTML report file path" + ), + make_option( + c("-k", "--ksea_cutoff_statistic"), + action = "store", + default = "FDR", + type = "character", + help = paste0("Method for missing-value imputation,", + " one of c('FDR','p.value'), but don't expect 'p.value' to work well.") + ), + make_option( + c("-t", "--ksea_cutoff_threshold"), + action = "store", + default = 0.05, + type = "double", + help = paste0("Maximum score to be used to score a kinase enrichment as significant") + ), + make_option( + c("-M", "--anova_ksea_metadata"), + action = "store", + default = "anova_ksea_metadata.tsv", + type = "character", + help = "Phosphopeptide metadata, ANOVA FDR, and KSEA enribhments" ) ) args <- parse_args(OptionParser(option_list = option_list)) @@ -112,18 +142,27 @@ } input_file <- args$inputFile alpha_file <- args$alphaFile +preproc_sqlite <- args$preproc_sqlite imputed_data_file_name <- args$imputedDataFile imp_qn_lt_data_filenm <- args$imputedQNLTDataFile +anova_ksea_metadata <- args$anova_ksea_metadata report_file_name <- args$reportFile +ksea_sqlite <- args$ksea_sqlite +ksea_cutoff_statistic <- args$ksea_cutoff_statistic +ksea_cutoff_threshold <- args$ksea_cutoff_threshold +if ( + sum( + grepl( + pattern = ksea_cutoff_statistic, + x = c("FDR", "p.value") + ) + ) < 1 + ) { + print(sprintf("bad ksea_cutoff_statistic argument: %s", ksea_cutoff_statistic)) + return(-1) + } imputation_method <- args$imputationMethod -print( - grepl( - pattern = imputation_method, - x = c("group-median", "median", "mean", "random") - ) - ) - if ( sum( grepl( @@ -219,6 +258,7 @@ rmarkdown_params <- list( inputFile = input_file , alphaFile = alpha_file + , preprocDb = preproc_sqlite , firstDataColumn = first_data_column , imputationMethod = imputation_method , meanPercentile = mean_percentile @@ -227,6 +267,10 @@ , regexSampleGrouping = regex_sample_grouping , imputedDataFilename = imputed_data_file_name , imputedQNLTDataFile = imp_qn_lt_data_filenm + , anovaKseaMetadata = anova_ksea_metadata + , kseaAppPrepDb = ksea_sqlite + , kseaCutoffThreshold = ksea_cutoff_threshold + , kseaCutoffStatistic = ksea_cutoff_statistic ) print("rmarkdown_params")
--- a/mqppep_anova.xml Wed Apr 13 19:48:32 2022 +0000 +++ b/mqppep_anova.xml Thu Jun 30 16:16:32 2022 +0000 @@ -2,13 +2,25 @@ id="mqppep_anova" name="MaxQuant Phosphopeptide ANOVA" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" - python_template_version="3.5" profile="21.05" > - <description>Perform ANOVA on merged and filtered data from phospho-peptide enrichment/MaxQuant pipeline</description> + <description>Runs ANOVA and KSEA for phosphopeptides.</description> <macros> <import>macros.xml</import> </macros> + <edam_topics> + <edam_topic>topic_0121</edam_topic><!-- proteomics --> + <edam_topic>topic_3520</edam_topic><!-- proteomics experiment--> + </edam_topics> + <edam_operations> + <edam_operation>operation_0276</edam_operation><!-- Analyse a network of protein interactions. --> + <edam_operation>operation_0531</edam_operation><!-- Heat map generation --> + <edam_operation>operation_2938</edam_operation><!-- Dendrogram generation --> + <edam_operation>operation_2938</edam_operation><!-- Imputation --> + <edam_operation>operation_3435</edam_operation><!-- Standardisation and normalisation --> + <edam_operation>operation_3501</edam_operation><!-- Enrichment analysis --> + <edam_operation>operation_3658</edam_operation><!-- Statistical inference --> + </edam_operations> <expand macro="requirements"/> <!-- The weird invocation used here is because knitr and install_tinytex @@ -16,11 +28,12 @@ biocontainer are read-only, so this builds a pseudo-home under /tmp --> <command detect_errors="exit_code"><![CDATA[ - cp '$__tool_directory__/mqppep_anova_script.Rmd' . || exit 0; - cp '$__tool_directory__/mqppep_anova.R' . || exit 0; + cp '$__tool_directory__/mqppep_anova_script.Rmd' . && + cp '$__tool_directory__/mqppep_anova.R' . && Rscript mqppep_anova.R --inputFile '$input_file' --alphaFile '$alpha_file' + --preproc_sqlite '$preproc_sqlite' --firstDataColumn $intensity_column_regex_f --imputationMethod $imputation.imputation_method #if $imputation.imputation_method == "random" @@ -31,9 +44,11 @@ --regexSampleGrouping $sample_grouping_regex_f --imputedDataFile $imputed_data_file --imputedQNLTDataFile '$imp_qn_lt_file' - --reportFile '$report_file'; - export RESULT=\$?; - exit \${RESULT} + --ksea_sqlite '$ksea_sqlite' + --ksea_cutoff_threshold '$ksea_cutoff_threshold' + --ksea_cutoff_statistic 'FDR' + --reportFile '$report_file' + --anova_ksea_metadata '$anova_ksea_metadata' ]]></command> <configfiles> <configfile name="sample_names_regex_f"> @@ -48,19 +63,22 @@ </configfiles> <inputs> <param name="input_file" type="data" format="tabular" label="Filtered Phosphopeptide Intensities" - help="[input_file] Phosphopeptide intensities filtered for minimal quality. First column label 'Phosphopeptide'; sample-intensities must begin in column 10 and must have column labels to match argument [sample_names_regex]" + help="Phosphopeptide intensities filtered for minimal quality. First column label 'Phosphopeptide'; sample-intensities must begin in column 10 and must have column labels to match argument [sample_names_regex]" /> - <param name="alpha_file" type="data" format="tabular" label="alpha cutoff level" - help="[alpha_file] List of alpha cutoff values for significance testing; text file having one column and no header" + <param name="alpha_file" type="data" format="tabular" label="ANOVA alpha cutoff level" + help="ANOVA alpha cutoff values for significance testing: tabular data having one column and no header" /> + <param name="preproc_sqlite" type="data" format="sqlite" label="preproc_sqlite dataset from mqppep_preproc" + help="'preproc_sqlite' dataset produced by 'MaxQuant Phosphopeptide Preprocessing' tool" + /> <param name="intensity_column_regex" type="text" value="^Intensity[^_]" label="Intensity-column pattern" - help="[intensity_column_regex] Pattern matching columns that have peptide intensity data (PERL-compatible regular expression matching column label)" + help="Pattern matching columns that have peptide intensity data (PERL-compatible regular expression matching column label)" /> <!-- imputation_method <- c("group-median","median","mean","random")[1] --> <conditional name="imputation"> <param name="imputation_method" type="select" label="Imputation Method" - help="[imputation_method] Impute missing values by (1) using median for each sample-group; (2) using median across all samples; (3) using mean across all samples; or (4) using randomly generated values having same std. dev. as across all samples (with mean specified by [meanPercentile])" + help="Impute missing values by (1) using median for each sample-group; (2) using median across all samples; (3) using mean across all samples; or (4) using randomly generated values having same std. dev. as across all samples (with mean specified by [meanPercentile])" > <option value="random" selected="true">random</option> <option value="group-median">group-median</option> @@ -73,16 +91,16 @@ <when value="random"> <param name="meanPercentile" type="integer" value="1" min="1" max="99" label="Mean percentile for random values" - help="[meanPercentile] Percentile center of random values; range [1,99]" + help="Percentile center of random values; range [1,99]" /> <param name="sdPercentile" type="float" value="1.0" label="Percentile std. dev. for random values" - help="[sdPercentile] Standard deviation adjustment-factor for random values; real number. (1.0 means SD equal to the SD for the entire data set.)" + help="Standard deviation adjustment-factor for random values; real number. (1.0 means SD equal to the SD for the entire data set.)" /> </when> </conditional> <param name="sample_names_regex" type="text" value="\.\d+[A-Z]$" - help="[sample_names_regex] Pattern extracting sample-names from names of columns that have peptide intensity data (PERL-compatible regular expression)" + help="Pattern extracting sample-names from names of columns that have peptide intensity data (PERL-compatible regular expression)" label="Sample-extraction pattern"> <sanitizer> <valid initial="string.printable"> @@ -91,7 +109,7 @@ </sanitizer> </param> <param name="sample_grouping_regex" type="text" value="\d+" - help="[sample_grouping_regex] Pattern extracting sample-group from the sample-names that are extracted by 'Sample-extraction pattern' (PERL-compatible regular expression)" + help="Pattern extracting sample-group from the sample-names that are extracted by 'Sample-extraction pattern' (PERL-compatible regular expression)" label="Group-extraction pattern"> <sanitizer> <valid initial="string.printable"> @@ -99,34 +117,106 @@ </valid> </sanitizer> </param> + <param name="ksea_cutoff_threshold" type="float" value="0.05" + label="KSEA threshold level" + help="Maximum FDR to be used to score a kinase enrichment as significant" + /> </inputs> <outputs> <data name="imputed_data_file" format="tabular" label="${input_file.name}.${imputation.imputation_method}-imputed_intensities" ></data> <data name="imp_qn_lt_file" format="tabular" label="${input_file.name}.${imputation.imputation_method}-imputed_QN_LT_intensities" ></data> + <data name="anova_ksea_metadata" format="tabular" label="${input_file.name}.${imputation.imputation_method}-anova_ksea_metadata" ></data> <!-- <data name="report_file" format="html" label="${input_file.name}.${imputation.imputation_method}-imputed_report (download/unzip to view)" ></data> --> <data name="report_file" format="pdf" label="${input_file.name}.${imputation.imputation_method}-imputed_report" ></data> + <data name="ksea_sqlite" format="sqlite" label="${input_file.name}..${imputation.imputation_method}-imputed_ksea_sqlite"> + </data> </outputs> <tests> <test> <param name="input_file" ftype="tabular" value="test_input_for_anova.tabular"/> + <param name="preproc_sqlite" ftype="sqlite" value="test_input_for_anova.sqlite"/> <param name="alpha_file" ftype="tabular" value="alpha_levels.tabular"/> <param name="intensity_column_regex" value="^Intensity[^_]"/> - <param name="imputation_method" value="group-median"/> + <param name="imputation_method" value="median"/> <param name="sample_names_regex" value="\.\d+[A-Z]$"/> <param name="sample_grouping_regex" value="\d+"/> + <output name="imputed_data_file"> + <assert_contents> + <has_text text="Phosphopeptide" /> + <has_text text="AAAITDMADLEELSRLpSPLPPGpSPGSAAR" /> + <!-- missing missing observd missing observd observd --> + <has_text_matching expression="pSQKQEEENPAEETGEEK.*8765300.8765300.8765300.8765300.2355900.14706000" /> + + </assert_contents> + </output> <output name="imp_qn_lt_file"> <assert_contents> <has_text text="Phosphopeptide" /> <has_text text="AAAITDMADLEELSRLpSPLPPGpSPGSAAR" /> - <has_text text="7.935878" /> + <!-- missing missing observed missing observed observed --> + <has_text_matching expression="pSQKQEEENPAEETGEEK.*6.962256.*6.908828.*6.814580.*6.865411.*6.908828.*7.088909" /> + <has_text text="pSQKQEEENPAEETGEEK" /> </assert_contents> </output> </test> <test> <param name="input_file" ftype="tabular" value="test_input_for_anova.tabular"/> + <param name="preproc_sqlite" ftype="sqlite" value="test_input_for_anova.sqlite"/> + <param name="alpha_file" ftype="tabular" value="alpha_levels.tabular"/> + <param name="intensity_column_regex" value="^Intensity[^_]"/> + <param name="imputation_method" value="mean"/> + <param name="sample_names_regex" value="\.\d+[A-Z]$"/> + <param name="sample_grouping_regex" value="\d+"/> + <output name="imputed_data_file"> + <assert_contents> + <has_text text="Phosphopeptide" /> + <has_text text="AAAITDMADLEELSRLpSPLPPGpSPGSAAR" /> + <!-- missing missing observd missing observd observd --> + <has_text_matching expression="pSQKQEEENPAEETGEEK.*6721601.6721601.8765300.6721601.2355900.14706000" /> + + </assert_contents> + </output> + <output name="imp_qn_lt_file"> + <assert_contents> + <has_text text="Phosphopeptide" /> + <has_text text="AAAITDMADLEELSRLpSPLPPGpSPGSAAR" /> + <!-- missing missing observed missing observed observed --> + <has_text_matching expression="pSQKQEEENPAEETGEEK.*6.839850.*6.797424.*6.797424.*6.797424.*6.896609.*7.092451" /> + </assert_contents> + </output> + </test> + <test> + <param name="input_file" ftype="tabular" value="test_input_for_anova.tabular"/> + <param name="preproc_sqlite" ftype="sqlite" value="test_input_for_anova.sqlite"/> + <param name="alpha_file" ftype="tabular" value="alpha_levels.tabular"/> + <param name="intensity_column_regex" value="^Intensity[^_]"/> + <param name="imputation_method" value="group-median"/> + <param name="sample_names_regex" value="\.\d+[A-Z]$"/> + <param name="sample_grouping_regex" value="\d+"/> + <output name="imputed_data_file"> + <assert_contents> + <has_text text="Phosphopeptide" /> + <has_text text="AAAITDMADLEELSRLpSPLPPGpSPGSAAR" /> + <!-- missing missing observd missing observd observd --> + <has_text_matching expression="pSQKQEEENPAEETGEEK.*8765300.8765300.8765300.5886074.2355900.14706000" /> + + </assert_contents> + </output> + <output name="imp_qn_lt_file"> + <assert_contents> + <has_text text="Phosphopeptide" /> + <has_text text="AAAITDMADLEELSRLpSPLPPGpSPGSAAR" /> + <!-- missing missing observed missing observed observed --> + <has_text_matching expression="pSQKQEEENPAEETGEEK.*6.946112.*6.888985.*6.792137.*6.792137.*6.888985.*7.089555" /> + </assert_contents> + </output> + </test> + <test> + <param name="input_file" ftype="tabular" value="test_input_for_anova.tabular"/> + <param name="preproc_sqlite" ftype="sqlite" value="test_input_for_anova.sqlite"/> <param name="alpha_file" ftype="tabular" value="alpha_levels.tabular"/> <param name="intensity_column_regex" value="^Intensity[^_]"/> <param name="imputation_method" value="random"/> @@ -134,20 +224,29 @@ <param name="sdPercentile" value="1.0" /> <param name="sample_names_regex" value="\.\d+[A-Z]$"/> <param name="sample_grouping_regex" value="\d+"/> + <output name="imputed_data_file"> + <assert_contents> + <has_text text="Phosphopeptide" /> + <has_text text="AAAITDMADLEELSRLpSPLPPGpSPGSAAR" /> + <!-- observd observd observd --> + <has_text_matching expression="pSQKQEEENPAEETGEEK.*8765300.*2355900.*4706000" /> + + </assert_contents> + </output> <output name="imp_qn_lt_file"> <assert_contents> <has_text text="Phosphopeptide" /> <has_text text="AAAITDMADLEELSRLpSPLPPGpSPGSAAR" /> - <has_text text="8.392287" /> - <has_text text="pSQKQEEENPAEETGEEK" /> + <has_text text="5.409549" /> <!-- log-transformed value for pTYVDPFTpYEDPNQAVR .1B --> + <has_text text="6.464714" /> <!-- log-transformed value for pSQKQEEENPAEETGEEK .2A --> </assert_contents> </output> </test> </tests> <help><![CDATA[ -=========================================== -Phopsphoproteomic Enrichment Pipeline ANOVA -=========================================== +==================================================== +Phopsphoproteomic Enrichment Pipeline ANOVA and KSEA +==================================================== **Input files** @@ -179,7 +278,7 @@ 4. using randomly generated values where: - ``meanPercentile`` specifies the percentile among non-missing values to be used as mean of random values, and - - ``sdPercentile`` specifies the factor to be mulitplied by the standard deviation among the non-missing values (across all samples) to determine the standard deviation of random values. + - ``sdPercentile`` specifies the factor to be multiplied by the standard deviation among the non-missing values (across all samples) to determine the standard deviation of random values. ``sample_names_regex`` PERL-compatible regular expression extracting the sample-name from the the name of a column of instensities (from ``input_file``) for one sample. @@ -205,6 +304,11 @@ ``report_file`` Summary report for normalization, imputation, and **ANOVA**, in PDF format. +**Algorithm** + +The KSEA algorithm used here is as in the KSEAapp package as reported in [Wiredja 2017]. +The code is adapted from "Danica D. Wiredja (2017). KSEAapp: Kinase-Substrate Enrichment Analysis. R package version 0.99.0." to work with output from the "MaxQuant Phosphopeptide Preprocessing" Galaxy tool. + **Authors** ``Larry C. Cheng`` @@ -223,5 +327,7 @@ <citations> <!-- Cheng_2018 "Phosphopeptide Enrichment ..." PMID: 30124664 --> <citation type="doi">10.3791/57996</citation> + <!-- Wiredja_2017 "The KSEA App ..." PMID: 28655153 --> + <citation type="doi">10.1093/bioinformatics/btx415</citation> </citations> </tool>
--- a/mqppep_anova_script.Rmd Wed Apr 13 19:48:32 2022 +0000 +++ b/mqppep_anova_script.Rmd Thu Jun 30 16:16:32 2022 +0000 @@ -1,40 +1,100 @@ --- -title: "MaxQuant Phospho-Proteomic Enrichment Pipeline ANOVA" -author: "Larry Cheng; Art Eschenlauer" -date: "May 28, 2018; Mar 16, 2022" +title: "MaxQuant Phosphoproteomic Enrichment Pipeline ANOVA/KSEA" +author: +- "Nick Graham^[ORCiD 0000-0002-6811-1941, University of Southern California: Los Angeles, CA, US]" +- "Larry Cheng^[ORCiD 0000-0002-6922-6433, Rutgers School of Graduate Studies: New Brunswick, NJ, US]" +- "Art Eschenlauer^[ORCiD 0000-0002-2882-0508, University of Minnesota: Minneapolis, Minnesota, US]" +date: +- "May 28, 2018" +- "; revised June 23, 2022" output: pdf_document: toc: true - latex_document: - toc: true + toc_depth: 3 + keep_tex: true +header-includes: + - \usepackage{longtable} + - \newcommand\T{\rule{0pt}{2.6ex}} % Top strut + - \newcommand\B{\rule[-1.2ex]{0pt}{0pt}} % Bottom strut params: - alphaFile: "test-data/alpha_levels.tabular" - inputFile: "test-data/UT_Phospho_ST_Sites.preproc.tabular" - firstDataColumn: "^Intensity[^_]" - imputationMethod: !r c("group-median", "median", "mean", "random")[4] - meanPercentile: 1 - sdPercentile: 1.0 - regexSampleNames: "\\.\\d+[A-Z]$" - regexSampleGrouping: "\\d+" - imputedDataFilename: "test-data/limbo/imputedDataFilename.txt" - imputedQNLTDataFile: "test-data/limbo/imputedQNLTDataFile.txt" - show_toc: true + alphaFile: "test-data/alpha_levels.tabular" + inputFile: "test-data/test_input_for_anova.tabular" + preprocDb: "test-data/test_input_for_anova.sqlite" + kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] + show_toc: true + firstDataColumn: "^Intensity[^_]" + imputationMethod: !r c("group-median", "median", "mean", "random")[1] + meanPercentile: 1 + sdPercentile: 1.0 + regexSampleNames: "\\.\\d+[A-Z]$" + regexSampleGrouping: "\\d+" + imputedDataFilename: "test-data/limbo/imputedDataFilename.txt" + imputedQNLTDataFile: "test-data/limbo/imputedQNLTDataFile.txt" + anovaKseaMetadata: "test-data/limbo/anovaKseaMetadata.txt" + oneWayManyCategories: !r c("aov", "kruskal.test", "oneway.test")[1] + oneWayTwoCategories: !r c("aov", "kruskal.test", "oneway.test")[3] + kseaCutoffStatistic: !r c("p.value", "FDR")[2] + kseaCutoffThreshold: !r c( 0.1, 0.05)[2] + kseaMinKinaseCount: 1 + intensityHeatmapRows: 75 --- <!-- - alphaFile: "test-data/alpha_levels.tabular" - inputFile: "test-data/test_input_for_anova.tabular" - inputFile: "test-data/UT_Phospho_ST_Sites.preproc.tabular" - inputFile: "test-data/density_failure.preproc_tab.tabular" + kseaCutoffStatistic: !r c("p.value", "FDR")[2] + kseaCutoffThreshold: !r c(0.05, 0.1)[1] + + alphaFile: "test-data/alpha_levels.tabular" + inputFile: "test-data/test_input_for_anova.tabular" + preprocDb: "test-data/test_input_for_anova.sqlite" + kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] + + alphaFile: "test-data/alpha_levels.tabular" + inputFile: "test-data/UT_phospho_ST_sites.preproc.tabular" + preprocDb: "test-data/UT_phospho_ST_sites.preproc.sqlite" + kseaAppPrepDb: !r c(":memory:", "test-data/UT_phospho_ST_sites.ksea.sqlite")[2] + + alphaFile: "test-data/alpha_levels.tabular" + inputFile: "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.tabular" + preprocDb: "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.sqlite" + kseaAppPrepDb: !r c(":memory:", "test-data/pST_Sites_NancyDu.ksea.sqlite")[2] + + alphaFile: "test-data/alpha_levels.tabular" + inputFile: "test-data/pST_Sites_NancyDu.txt.preproc.tabular" + preprocDb: "test-data/pST_Sites_NancyDu.txt.preproc.sqlite" + kseaAppPrepDb: !r c(":memory:", "test-data/pST_Sites_NancyDu.ksea.sqlite")[2] + + inputFile: "test-data/density_failure.preproc_tab.tabular" + kseaAppPrepDb: !r c(":memory:", "mqppep.sqlite")[2] latex_document: default --> ```{r setup, include = FALSE} +#ref for debugging: https://yihui.org/tinytex/r/#debugging +options(tinytex.verbose = TRUE) + # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285 +# ref for top and bottom struts: https://tex.stackexchange.com/a/50355 knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10)) # freeze the random number generator so the same results will be produced # from run to run set.seed(28571) +### LIBRARIES +library(gplots) +library(DBI) +library(RSQLite) +# Suppress "Warning: no DISPLAY variable so Tk is not available" +suppressWarnings(suppressMessages(library(sqldf))) + +# required but not added to search list: +# - DBI +# - RSQLite +# - ggplot2 +# - knitr +# - latex2exp +# - preprocessCore +# - reshape2 +# - vioplot + ### CONSTANTS const_parfin <- par("fin") @@ -44,20 +104,54 @@ sqrt(const_stripchart_cex * const_stripchart_cex / 2) const_stripchart_jitter <- 0.3 const_write_debug_files <- FALSE -const_table_anchor <- "tbp" +const_table_anchor_bp <- "bp" +const_table_anchor_ht <- "ht" +const_table_anchor_p <- "p" +const_table_anchor_tbp <- "tbp" + + +const_ksea_astrsk_kinases <- 1 +const_ksea_nonastrsk_kinases <- 2 +const_ksea_all_kinases <- 3 + +const_log10_e <- log10(exp(1)) ### FUNCTIONS -#ANOVA filter function -anova_func <- function(x, grouping_factor) { - x_aov <- aov(as.numeric(x) ~ grouping_factor) - pvalue <- summary(x_aov)[[1]][["Pr(>F)"]][1] - pvalue +# from `demo(error.catching)` +##' Catch *and* save both errors and warnings, and in the case of +##' a warning, also keep the computed result. +##' +##' @title tryCatch both warnings (with value) and errors +##' @param expr an \R expression to evaluate +##' @return a list with 'value' and 'warning', where +##' 'value' may be an error caught. +##' @author Martin Maechler; +##' Copyright (C) 2010-2012 The R Core Team +try_catch_w_e <- function(expr) { + wrn <- NULL + # warning handler + w_handler <- function(w) { + wrn <<- w + invokeRestart("muffleWarning") + } + list( + value = withCallingHandlers( + tryCatch( + expr, + error = function(e) e + ), + warning = w_handler + ), + warning = wrn + ) } + write_debug_file <- function(s) { if (const_write_debug_files) { s_path <- sprintf("test-data/%s.txt", deparse(substitute(s))) + print(sprintf("DEBUG writing file %s", spath)) write.table( s, file = s_path, @@ -69,28 +163,72 @@ } } -latex_collapsed_vector <- function(collapse_string, v) { +# ref: http://adv-r.had.co.nz/Environments.html +# "When creating your own environment, note that you should set its parent +# environment to be the empty environment. This ensures you don't +# accidentally inherit objects from somewhere else." +# Caution: this prevents `with(my_env, expr)` from working when `expr` +# contains anything from the global environment, even operators! +# Hence, `x <- 1; get("x", new_env())` fails by design. +new_env <- function() { + new.env(parent = emptyenv()) +} + +### numerical/statistical helper functions + +any_nan <- function(x) { + !any(x == "NaN") +} + +# determine standard deviation of quantile to impute +sd_finite <- function(x) { + ok <- is.finite(x) + sd(x[ok]) +} + +anova_func <- function(x, grouping_factor, one_way_f) { + subject <- data.frame( + intensity = x + ) + x_aov <- + one_way_f( + formula = intensity ~ grouping_factor, + data = subject + ) + pvalue <- + if (identical(one_way_f, aov)) + summary(x_aov)[[1]][["Pr(>F)"]][1] + else + pvalue <- x_aov$p.value + pvalue +} + + +### LaTeX functions + +latex_collapsed_vector <- function(collapse_string, v, underscore_whack = TRUE) { + v_sub <- if (underscore_whack) gsub("_", "\\\\_", v) else v cat( paste0( - gsub("_", "\\\\_", v), + v_sub, collapse = collapse_string ) ) } -latex_itemized_collapsed <- function(collapse_string, v) { +latex_itemized_collapsed <- function(collapse_string, v, underscore_whack = TRUE) { cat("\\begin{itemize}\n\\item ") - latex_collapsed_vector(collapse_string, v) + latex_collapsed_vector(collapse_string, v, underscore_whack) cat("\n\\end{itemize}\n") } -latex_itemized_list <- function(v) { - latex_itemized_collapsed("\n\\item ", v) +latex_itemized_list <- function(v, underscore_whack = TRUE) { + latex_itemized_collapsed("\n\\item ", v, underscore_whack) } -latex_enumerated_collapsed <- function(collapse_string, v) { +latex_enumerated_collapsed <- function(collapse_string, v, underscore_whack = TRUE) { cat("\\begin{enumerate}\n\\item ") - latex_collapsed_vector(collapse_string, v) + latex_collapsed_vector(collapse_string, v, underscore_whack) cat("\n\\end{enumerate}\n") } @@ -98,8 +236,9 @@ latex_enumerated_collapsed("\n\\item ", v) } -latex_table_row <- function(v) { - latex_collapsed_vector(" & ", v) +latex_table_row <- function(v, extra = "", underscore_whack = TRUE) { + latex_collapsed_vector(" & ", v, underscore_whack) + cat(extra) cat(" \\\\\n") } @@ -118,10 +257,12 @@ justification = NULL, # TRUE to center on page centered = TRUE, - # optional capttion + # optional caption caption = NULL, # h(inline); b(bottom); t (top) or p (separate page) - anchor = "h" + anchor = "h", + # set underscore_whack to TRUE to escape underscores + underscore_whack = TRUE ) { if (is.null(justification)) justification <- @@ -144,12 +285,13 @@ "\n", sep = "" ) - } - else if (n == 0L) { + } else if (n == 0L) { cat("0 rows for:\n") - latex_itemized_list(names(x)) - } - else { + latex_itemized_list( + v = names(x), + underscore_whack = underscore_whack + ) + } else { if (is.null(max)) max <- getOption("max.print", 99999L) if (!is.finite(max)) @@ -178,12 +320,22 @@ sep = "" ) ) + # ref: https://tex.stackexchange.com/a/50353 + # Describes use of \rule{0pt}{3ex} if (!is.null(caption)) - cat(" \\hline\\hline\n") - latex_table_row(colnames(m)) + cat("\\B \\\\ \\hline\\hline\n") + # ref for top and bottom struts: https://tex.stackexchange.com/a/50355 + latex_table_row( + v = colnames(m), + extra = "\\T\\B", + underscore_whack = underscore_whack + ) cat("\\hline\n") for (i in seq_len(length(m[, 1]))) { - latex_table_row(m[i, ]) + latex_table_row( + v = m[i, ], + underscore_whack = underscore_whack + ) } cat( paste( @@ -199,13 +351,835 @@ invisible(x) } +hypersub <- + function(s) { + hyper <- tolower(s) + hyper <- gsub("[^a-z0-9]+", "-", hyper) + hyper <- gsub("[-]+", "-", hyper) + hyper <- sub("^[-]", "", hyper) + hyper <- sub("[-]$", "", hyper) + return(hyper) + } + +subsection_header <- + function(s) { + hyper <- hypersub(s) + cat( + sprintf( + "\\hypertarget{%s}\n{\\subsection{%s}\\label{%s}}\n", + hyper, s, hyper + ) + ) + } + +subsubsection_header <- + function(s) { + hyper <- hypersub(s) + cat( + sprintf( + "\\hypertarget{%s}\n{\\subsubsection{%s}\\label{%s}}\n", + hyper, s, hyper + ) + ) + } + +### SQLite functions + +ddl_exec <- function(db, sql) { + discard <- DBI::dbExecute(conn = db, statement = sql) + if (FALSE && discard != 0) { + need_newpage <- TRUE + if (need_newpage) { + need_newpage <<- FALSE + cat("\\newpage\n") + } + o_file <- stdout() + cat("\n\\begin{verbatim}\n") + cat(sql, file = o_file) + cat(sprintf("\n%d rows affected by DDL\n", discard), file = o_file) + cat("\n\\end{verbatim}\n") + } +} + +dml_no_rows_exec <- function(db, sql) { + discard <- DBI::dbExecute(conn = db, statement = sql) + if (discard != 0) { + need_newpage <- TRUE + if (need_newpage) { + need_newpage <<- FALSE + cat("\\newpage\n") + } + cat("\n\\begin{verbatim}\n") + o_file <- stdout() + cat(sql, file = o_file) + cat(sprintf("\n%d rows affected by DML\n", discard), file = o_file) + cat("\n\\end{verbatim}\n") + } +} + +### KSEA functions and helpers + +# Adapted from KSEAapp::KSEA.Scores to allow retrieval of: +# - maximum log2(FC) +ksea_scores <- function( + + # For human data, typically, ksdata = KSEAapp::ksdata + ksdata, + + # Input data file having columns: + # - Protein : abbreviated protein name + # - Gene : HUGO gene name + # - Peptide : peptide sequence without indications of phosphorylation + # - Reside.Both : position(s) of phosphorylation within Gene sequence + # - First letter designates AA that is modified + # - Numbers indicate position within Gene + # - Multiple values are separated by semicolons + # - p : p-value + # - FC : fold-change + px, + + # A binary input of TRUE or FALSE, indicating whether or not to include + # NetworKIN predictions + networkin, + + # A numeric value between 1 and infinity setting the minimum NetworKIN + # score (can be left out if networkin = FALSE) + networkin_cutoff + +) { + if (length(grep(";", px$Residue.Both)) == 0) { + # There are no Residue.Both entries having semicolons, so new is + # simply px except two columns are renamed and a column is added + # for log2(abs(fold-change)) + new <- px + colnames(new)[c(2, 4)] <- c("SUB_GENE", "SUB_MOD_RSD") + new$log2_fc <- log2(abs(as.numeric(as.character(new$FC)))) + new <- new[complete.cases(new$log2_fc), ] + } else { + # Split each row having semicolons in Residue.Both into rows that are + # duplicated in all respects except that each row has a single + # member of the set "split-on-semicolon-Residue.Both" + px_double <- px[grep(";", px$Residue.Both), ] + residues <- as.character(px_double$Residue.Both) + residues <- as.matrix(residues, ncol = 1) + split <- strsplit(residues, split = ";") + # x gets count of residues in each row, + # i.e., 1 + count of semicolons + x <- sapply(split, length) + # Here is the set of split rows + px_single <- data.frame( + Protein = rep(px_double$Protein, x), + Gene = rep(px_double$Gene, x), + Peptide = rep(px_double$Peptide, x), + Residue.Both = unlist(split), + p = rep(px_double$p, x), + FC = rep(px_double$FC, x) + ) + # new first gets the split rows + new <- px[-grep(";", px$Residue.Both), ] + # to new, append the rows that didn't need splitting in the first place + new <- rbind(new, px_single) + # map Gene to SUB_GENE + # map Residue.Both to SUB_MOD_RSD + colnames(new)[c(2, 4)] <- c("SUB_GENE", "SUB_MOD_RSD") + # Eliminate any non-positive values to prevent introduction of + # infinite or NaN values + new[(0 <= new$log2_fc), "log2_fc"] <- NA + # Because of preceding step, there is no need for abs in the next line + new$log2_fc <- log2(as.numeric(as.character(new$FC))) + # Convert any illegal values from NaN to NA + new[is.nan(new$log2_fc), "log2_fc"] <- NA + # Eliminate rows having missing values (e.g., non-imputed data) + new <- new[complete.cases(new$log2_fc), ] + } + if (networkin == TRUE) { + # When NetworKIN is true, filter on NetworKIN.cutoff which includes + # PhosphoSitePlus data *because its networkin_score is set to Inf* + ksdata_filtered <- ksdata[grep("[a-z]", ksdata$Source), ] + ksdata_filtered <- ksdata_filtered[ + (ksdata_filtered$networkin_score >= networkin_cutoff), ] + } else { + # Otherwise, simply use PhosphSitePlus rows + ksdata_filtered <- ksdata[ + grep("PhosphoSitePlus", ksdata$Source), ] + } + # Join the two data.frames on common columns SUB_GENE and SUB_MOD_RSD + # colnames of ksdata_filtered: + # "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE" "SUB_GENE_ID" + # "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD" "SITE_GRP_ID" + # "SITE_...7_AA" "networkin_score" "Source" + # colnames of new: + # "Protein" "SUB_GENE" "Peptide" "SUB_MOD_RSD" "p" "FC" "log2_fc" + # Equivalent to: + # SELECT a.*. b.Protein, b.Peptide, b.p, b.FC, b.log2_fc + # FROM ksdata_filtered a + # INNER JOIN new b + # ON a.SUB_GENE = b.SUB_GENE + # AND a.SUB_MOD_RSD = b.SUB_MOD_RSD + ksdata_dataset <- base::merge(ksdata_filtered, new) + # colnames of ksdata_dataset: + # "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE" + # "SUB_GENE_ID" "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD" + # "SITE_GRP_ID" "SITE_...7_AA" "networkin_score" "Source" "Protein" + # "Peptide" "p" "FC" "log2_fc" (uniprot_no_isoform) + # Re-order dataset; prior to accounting for isoforms + ksdata_dataset <- ksdata_dataset[order(ksdata_dataset$GENE), ] + # Extract non-isoform accession in UniProtKB + ksdata_dataset$uniprot_no_isoform <- sapply( + ksdata_dataset$KIN_ACC_ID, + function(x) unlist(strsplit(as.character(x), split = "-"))[1] + ) + # Discard previous results while selecting interesting columns ... + ksdata_dataset_abbrev <- ksdata_dataset[, c(5, 1, 2, 16:19, 14)] + # Column names are now: + # "GENE" "SUB_GENE" "SUB_MOD_RSD" "Peptide" "p" + # "FC" "log2_fc" "Source" + # Make column names human-readable + colnames(ksdata_dataset_abbrev) <- c( + "Kinase.Gene", "Substrate.Gene", "Substrate.Mod", "Peptide", "p", + "FC", "log2FC", "Source" + ) + # SELECT * FROM ksdata_dataset_abbrev + # ORDER BY Kinase.Gene, Substrate.Gene, Substrate.Mod, p + ksdata_dataset_abbrev <- + ksdata_dataset_abbrev[ + order( + ksdata_dataset_abbrev$Kinase.Gene, + ksdata_dataset_abbrev$Substrate.Gene, + ksdata_dataset_abbrev$Substrate.Mod, + ksdata_dataset_abbrev$p), + ] + # First aggregation step to account for multiply phosphorylated peptides + # and differing peptide sequences; the goal here is to combine results + # for all measurements of the same substrate. + # SELECT `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`, + # `Source`, avg(log2FC) AS log2FC + # FROM ksdata_dataset_abbrev + # GROUP BY `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`, + # `Source` + # ORDER BY `Kinase.Gene`; + # in two steps: + # (1) compute average log_2(fold-change) + ksdata_dataset_abbrev <- aggregate( + log2FC ~ Kinase.Gene + Substrate.Gene + Substrate.Mod + Source, + data = ksdata_dataset_abbrev, + FUN = mean + ) + # (2) order by Kinase.Gene + ksdata_dataset_abbrev <- + ksdata_dataset_abbrev[order(ksdata_dataset_abbrev$Kinase.Gene), ] + # SELECT `Kinase.Gene`, count(*) + # FROM ksdata_dataset_abbrev + # GROUP BY `Kinase.Gene`; + # in two steps: + # (1) Extract the list of Kinase.Gene names + kinase_list <- as.vector(ksdata_dataset_abbrev$Kinase.Gene) + # (2) Convert to a named list of counts of kinases in ksdata_dataset_abrev, + # named by Kinase.Gene + kinase_list <- as.matrix(table(kinase_list)) + # Second aggregation step to account for all substrates per kinase + # CREATE TABLE mean_fc + # AS + # SELECT `Kinase.Gene`, avg(log2FC) AS log2FC + # FROM ksdata_dataset_abbrev + # GROUP BY `Kinase.Gene` + mean_fc <- aggregate( + log2FC ~ Kinase.Gene, + data = ksdata_dataset_abbrev, + FUN = mean + ) + # mean_fc columns: "Kinase.Gene", "log2FC" + if (FALSE) { + # I need to re-think this; I was trying to find the most-represented + # peptide, but that horse has already left the barn + # SELECT `Kinase.Gene`, max(abs(log2FC)) AS log2FC + # FROM ksdata_dataset_abbrev + # GROUP BY `Kinase.Gene` + max_fc <- aggregate( + log2FC ~ Kinase.Gene, + data = ksdata_dataset_abbrev, + FUN = function(r) max(abs(r)) + ) + } + + # Create column 3: mS + mean_fc$m_s <- mean_fc[, 2] + # Create column 4: Enrichment + mean_fc$enrichment <- mean_fc$m_s / abs(mean(new$log2_fc, na.rm = TRUE)) + # Create column 5: m, count of substrates + mean_fc$m <- kinase_list + # Create column 6: z-score + mean_fc$z_score <- ( + (mean_fc$m_s - mean(new$log2_fc, na.rm = TRUE)) * + sqrt(mean_fc$m)) / sd(new$log2_fc, na.rm = TRUE) + # Create column 7: p-value, deduced from z-score + mean_fc$p_value <- pnorm(-abs(mean_fc$z_score)) + # Create column 8: FDR, deduced by Benjamini-Hochberg adustment from p-value + mean_fc$fdr <- p.adjust(mean_fc$p_value, method = "fdr") + + # Remove log2FC column, which is duplicated as mS + mean_fc <- mean_fc[order(mean_fc$Kinase.Gene), -2] + # Correct the column names which we had to hack because of the linter... + colnames(mean_fc) <- c( + "Kinase.Gene", "mS", "Enrichment", "m", "z.score", "p.value", "FDR" + ) + return(mean_fc) +} + +low_fdr_barplot <- function( + rslt, + i_cntrst, + i, + a_level, + b_level, + fold_change, + caption +) { + rslt_score_list_i <- rslt$score_list[[i]] + if (!is.null(rslt_score_list_i)) { + rslt_score_list_i_nrow <- nrow(rslt_score_list_i) + k <- data.frame( + contrast = as.integer(i_cntrst), + a_level = rep.int(a_level, rslt_score_list_i_nrow), + b_level = rep.int(b_level, rslt_score_list_i_nrow), + kinase_gene = rslt_score_list_i$Kinase.Gene, + mean_log2_fc = rslt_score_list_i$mS, + enrichment = rslt_score_list_i$Enrichment, + substrate_count = rslt_score_list_i$m, + z_score = rslt_score_list_i$z.score, + p_value = rslt_score_list_i$p.value, + fdr = rslt_score_list_i$FDR + ) + selector <- switch( + ksea_cutoff_statistic, + "FDR" = { + k$fdr + }, + "p.value" = { + k$p_value + }, + stop( + sprintf( + "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", + ksea_cutoff_statistic + ) + ) + ) + + k <- k[selector < ksea_cutoff_threshold, ] + + if (nrow(k) > 1) { + op <- par(mai = c(1, 1.5, 0.4, 0.4)) + numeric_z_score <- as.numeric(k$z_score) + z_score_order <- order(numeric_z_score) + kinase_name <- k$kinase_gene + long_caption <- + sprintf( + "Kinase z-score, %s < %s, %s", + ksea_cutoff_statistic, + ksea_cutoff_threshold, + caption + ) + my_cex_caption <- 65.0 / max(65.0, nchar(long_caption)) + cat("\n\\clearpage\n") + barplot( + height = numeric_z_score[z_score_order], + border = NA, + xpd = FALSE, + cex.names = 1.0, + cex.axis = 1.0, + main = long_caption, + cex.main = my_cex_caption, + names.arg = kinase_name[z_score_order], + horiz = TRUE, + srt = 45, + las = 1) + par(op) + } + } +} + +# note that this adds elements to the global variable `ksea_asterisk_hash` + +low_fdr_print <- function( + rslt, + i_cntrst, + i, + a_level, + b_level, + fold_change, + caption +) { + rslt_score_list_i <- rslt$score_list[[i]] + if (!is.null(rslt_score_list_i)) { + rslt_score_list_i_nrow <- nrow(rslt_score_list_i) + k <- contrast_ksea_scores <- data.frame( + contrast = as.integer(i_cntrst), + a_level = rep.int(a_level, rslt_score_list_i_nrow), + b_level = rep.int(b_level, rslt_score_list_i_nrow), + kinase_gene = rslt_score_list_i$Kinase.Gene, + mean_log2_fc = rslt_score_list_i$mS, + enrichment = rslt_score_list_i$Enrichment, + substrate_count = rslt_score_list_i$m, + z_score = rslt_score_list_i$z.score, + p_value = rslt_score_list_i$p.value, + fdr = rslt_score_list_i$FDR + ) + + selector <- switch( + ksea_cutoff_statistic, + "FDR" = { + k$fdr + }, + "p.value" = { + k$p_value + }, + stop( + sprintf( + "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", + ksea_cutoff_statistic + ) + ) + ) + + k <- k[selector < ksea_cutoff_threshold, ] + # save kinase names to ksea_asterisk_hash + for (kinase_name in k$kinase_gene) { + ksea_asterisk_hash[[kinase_name]] <- 1 + } + + db_write_table_overwrite <- (i_cntrst < 2) + db_write_table_append <- !db_write_table_overwrite + RSQLite::dbWriteTable( + conn = db, + name = "contrast_ksea_scores", + value = contrast_ksea_scores, + append = db_write_table_append + ) + selector <- switch( + ksea_cutoff_statistic, + "FDR" = { + contrast_ksea_scores$fdr + }, + "p.value" = { + contrast_ksea_scores$p_value + }, + stop( + sprintf( + "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", + ksea_cutoff_statistic + ) + ) + ) + output_df <- contrast_ksea_scores[ + selector < ksea_cutoff_threshold, + c("kinase_gene", "mean_log2_fc", "enrichment", "substrate_count", + "z_score", "p_value", "fdr") + ] + output_order <- with(output_df, order(mean_log2_fc, kinase_gene)) + output_df <- output_df[output_order, ] + colnames(output_df) <- + c( + colnames(output_df)[1], + colnames(output_df)[2], + "enrichment", + "m_s", + "z_score", + "p_value", + "fdr" + ) + output_df$fdr <- sprintf("%0.4f", output_df$fdr) + output_df$p_value <- sprintf("%0.2e", output_df$p_value) + output_df$z_score <- sprintf("%0.2f", output_df$z_score) + output_df$m_s <- sprintf("%d", output_df$m_s) + output_df$enrichment <- sprintf("%0.2f", output_df$enrichment) + output_ncol <- ncol(output_df) + colnames(output_df) <- + c( + "Kinase", + "\\(\\overline{\\log_2 (|\\text{fold-change}|)}\\)", + "Enrichment", + "Substrates", + "z-score", + "p-value", + "FDR" + ) + selector <- switch( + ksea_cutoff_statistic, + "FDR" = { + rslt$score_list[[i]]$FDR + }, + "p.value" = { + rslt$score_list[[i]]$p.value + }, + stop( + sprintf( + "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", + ksea_cutoff_statistic + ) + ) + ) + if (sum(selector < ksea_cutoff_threshold) > 0) { + math_caption <- gsub("{", "\\{", caption, fixed = TRUE) + math_caption <- gsub("}", "\\}", math_caption, fixed = TRUE) + data_frame_latex( + x = output_df, + justification = "l c c c c c c", + centered = TRUE, + caption = sprintf( + "\\text{%s}, %s < %s", + math_caption, + ksea_cutoff_statistic, + ksea_cutoff_threshold + ), + anchor = const_table_anchor_p + ) + } else { + cat( + sprintf( + "\\break + No kinases had + \\(\\text{%s}_\\text{enrichment} < %s\\) + for contrast %s\\hfill\\break\n", + ksea_cutoff_statistic, + ksea_cutoff_threshold, + caption + ) + ) + } + } +} + +# create_breaks is a helper for ksea_heatmap +create_breaks <- function(merged_scores) { + if (min(merged_scores, na.rm = TRUE) < -1.6) { + breaks_neg <- seq(-1.6, 0, length.out = 30) + breaks_neg <- + append( + seq(min(merged_scores, na.rm = TRUE), -1.6, length.out = 10), + breaks_neg + ) + breaks_neg <- sort(unique(breaks_neg)) + } else { + breaks_neg <- seq(-1.6, 0, length.out = 30) + } + if (max(merged_scores, na.rm = TRUE) > 1.6) { + breaks_pos <- seq(0, 1.6, length.out = 30) + breaks_pos <- + append( + breaks_pos, + seq(1.6, max(merged_scores, na.rm = TRUE), + length.out = 10) + ) + breaks_pos <- sort(unique(breaks_pos)) + } else { + breaks_pos <- seq(0, 1.6, length.out = 30) + } + breaks_all <- unique(append(breaks_neg, breaks_pos)) + mycol_neg <- + gplots::colorpanel(n = length(breaks_neg), + low = "blue", + high = "white") + mycol_pos <- + gplots::colorpanel(n = length(breaks_pos) - 1, + low = "white", + high = "red") + mycol <- unique(append(mycol_neg, mycol_pos)) + color_breaks <- list(breaks_all, mycol) + return(color_breaks) +} + +# draw_kseaapp_summary_heatmap is a helper function for ksea_heatmap +draw_kseaapp_summary_heatmap <- function( + x, + sample_cluster, + merged_asterisk, + my_cex_row, + color_breaks, + margins, + ... +) { + merged_scores <- x + if (!is.matrix(x)) { + cat( + paste0( + "No plot because \\texttt{typeof(x)} is '", + typeof(x), + "' rather than 'matrix'.\n\n" + ) + ) + } else if (nrow(x) < 2) { + cat("No plot because matrix x has ", nrow(x), " rows.\n\n") + cat("\\begin{verbatim}\n") + str(x) + cat("\\end{verbatim}\n") + } else if (ncol(x) < 2) { + cat("No plot because matrix x has ", ncol(x), " columns.\n\n") + cat("\\begin{verbatim}\n") + str(x) + cat("\\end{verbatim}\n") + } else { + gplots::heatmap.2( + x = merged_scores, + Colv = sample_cluster, + scale = "none", + cellnote = merged_asterisk, + notecol = "white", + cexCol = 0.9, + # Heuristically assign size of row labels + cexRow = min(1.0, ((3 * my_cex_row) ^ 1.7) / 2.25), + srtCol = 45, + srtRow = 45, + notecex = 3 * my_cex_row, + col = color_breaks[[2]], + density.info = "none", + trace = "none", + breaks = color_breaks[[1]], + lmat = rbind(c(0, 3), c(2, 1), c(0, 4)), + lhei = c(0.4, 8.0, 1.1), + lwid = c(0.5, 3), + key = FALSE, + margins = margins, + ... + ) + } +} + +# Adapted from KSEAapp::KSEA.Heatmap +ksea_heatmap <- function( + # the data frame outputs from the KSEA.Scores() function, in list format + score_list, + # a character vector of all the sample names for heatmap annotation: + # - the names must be in the same order as the data in score_list + # - please avoid long names, as they may get cropped in the final image + sample_labels, + # character string of either "p.value" or "FDR" indicating the data column + # to use for marking statistically significant scores + stats, + # a numeric value between 0 and infinity indicating the min. number of + # substrates a kinase must have to be included in the heatmap + m_cutoff, + # a numeric value between 0 and 1 indicating the p-value/FDR cutoff + # for indicating significant kinases in the heatmap + p_cutoff = + stop("argument 'p_cutoff' is required for function 'ksea_heatmap'"), + # a binary input of TRUE or FALSE, indicating whether or not to perform + # hierarchical clustering of the sample columns + sample_cluster, + # a binary input of TRUE or FALSE, indicating whether or not to export + # the heatmap as a .png image into the working directory + export = FALSE, + # bottom and right margins; adjust as needed if contrast names are too long + margins = c(6, 20), + # print which kinases? + # - Mandatory argument, must be one of const_ksea_.*_kinases + which_kinases, + # additional arguments to gplots::heatmap.2, such as: + # - main: main title of plot + # - xlab: x-axis label + # - ylab: y-axis label + ... +) { + filter_m <- function(dataset, m_cutoff) { + filtered <- dataset[(dataset$m >= m_cutoff), ] + return(filtered) + } + score_list_m <- lapply(score_list, function(...) filter_m(..., m_cutoff)) + for (i in seq_len(length(score_list_m))) { + names <- colnames(score_list_m[[i]])[c(2:7)] + colnames(score_list_m[[i]])[c(2:7)] <- + paste(names, i, sep = ".") + } + master <- + Reduce( + f = function(...) { + base::merge(..., by = "Kinase.Gene", all = FALSE) + }, + x = score_list_m + ) + + row.names(master) <- master$Kinase.Gene + columns <- as.character(colnames(master)) + merged_scores <- as.matrix(master[, grep("z.score", columns), drop = FALSE]) + colnames(merged_scores) <- sample_labels + merged_stats <- as.matrix(master[, grep(stats, columns)]) + asterisk <- function(mtrx, p_cutoff) { + new <- data.frame() + for (i in seq_len(nrow(mtrx))) { + for (j in seq_len(ncol(mtrx))) { + my_value <- mtrx[i, j] + if (!is.na(my_value) && my_value < p_cutoff) { + new[i, j] <- "*" + } else { + new[i, j] <- "" + } + } + } + return(new) + } + merged_asterisk <- as.matrix(asterisk(merged_stats, p_cutoff)) + + # begin hack to print only significant rows + asterisk_rows <- rowSums(merged_asterisk == "*") > 0 + all_rows <- rownames(merged_stats) + names(asterisk_rows) <- all_rows + non_asterisk_rows <- names(asterisk_rows[asterisk_rows == FALSE]) + asterisk_rows <- names(asterisk_rows[asterisk_rows == TRUE]) + merged_scores_asterisk <- merged_scores[names(asterisk_rows), ] + merged_scores_non_asterisk <- merged_scores[names(non_asterisk_rows), ] + # end hack to print only significant rows + + row_list <- list() + row_list[[const_ksea_astrsk_kinases]] <- asterisk_rows + row_list[[const_ksea_all_kinases]] <- all_rows + row_list[[const_ksea_nonastrsk_kinases]] <- non_asterisk_rows + + i <- which_kinases + my_row_names <- row_list[[i]] + scrs <- merged_scores[my_row_names, ] + stts <- merged_stats[my_row_names, ] + merged_asterisk <- as.matrix(asterisk(stts, p_cutoff)) + + color_breaks <- create_breaks(scrs) + plot_height <- nrow(scrs) ^ 0.55 + plot_width <- ncol(scrs) ^ 0.7 + my_cex_row <- 0.25 * 16 / plot_height + if (export == "TRUE") { + png( + "KSEA.Merged.Heatmap.png", + width = plot_width * 300, + height = 2 * plot_height * 300, + res = 300, + pointsize = 14 + ) + } + draw_kseaapp_summary_heatmap( + x = scrs, + sample_cluster = sample_cluster, + merged_asterisk = merged_asterisk, + my_cex_row = my_cex_row, + color_breaks = color_breaks, + margins = margins + ) + if (export == "TRUE") { + dev.off() + } + return(my_row_names) +} + +# helper for heatmaps of phosphopeptide intensities + +draw_intensity_heatmap <- + function( + m, # matrix with rownames already formatted + cutoff, # cutoff used by hm_heading_function + hm_heading_function, # construct and cat heading from m and cutoff + hm_main_title, # main title for plot (drawn below heading) + suppress_row_dendrogram = TRUE, # set to false to show dendrogram + max_peptide_count # experimental: + = intensity_hm_rows, # values of 50 and 75 worked well + ... # passthru parameters for heatmap + ) { + peptide_count <- 0 + # emit the heading for the heatmap + if (hm_heading_function(m, cutoff)) { + peptide_count <- min(max_peptide_count, nrow(m)) + if (nrow(m) > 1) { + m_margin <- m[peptide_count:1, ] + # Margin setting was heuristically derived + margins <- + c(0.5, # col + max(80, sqrt(nchar(rownames(m_margin)))) * 5 / 16 # row + ) + } + if (nrow(m) > 1) { + tryCatch( + { + old_oma <- par("oma") + par(cex.main = 0.6) + # Heuristically determined character size adjustment formula + char_contractor <- + 250000 / ( + max(4500, (nchar(rownames(m_margin)))^2) * intensity_hm_rows + ) + heatmap( + m[peptide_count:1, ], + Rowv = if (suppress_row_dendrogram) NA else NULL, + Colv = NA, + cexRow = char_contractor, + cexCol = char_contractor * 50 / max_peptide_count, + scale = "row", + margins = margins, + main = + "Unimputed, unnormalized log(intensities)", + xlab = "", + las = 1, + ... + ) + }, + error = function(e) { + cat( + sprintf( + "\nCould not draw heatmap, possibly because of too many missing values. Internal message: %s\n", + e$message + ) + ) + }, + finally = par(old_oma) + ) + } + } + return(peptide_count) + } ``` -## Purpose - -Perform imputation of missing values, quantile normalization, and ANOVA. +```{r, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} +cat("\\listoftables\n") +``` +# Purpose + +To perform for phosphopeptides: + +- imputation of missing values, +- quantile normalization, +- ANOVA (using the R stats::`r params$oneWayManyCategories` function), and +- KSEA (Kinase-Substrate Enrichment Analysis) using code adapted from the CRAN `KSEAapp` package to search for kinase substrates from the following databases: + - PhosphoSitesPlus [https://www.phosphosite.org](https://www.phosphosite.org) + - The Human Proteome Database [http://hprd.org](http://hprd.org) + - NetworKIN [http://networkin.science/](http://networkin.science/) + - Phosida [http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx](http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx) ```{r include = FALSE} + +### GLOBAL VARIABLES + +# parameters for KSEA + +ksea_cutoff_statistic <- params$kseaCutoffStatistic +ksea_cutoff_threshold <- params$kseaCutoffThreshold +ksea_min_kinase_count <- params$kseaMinKinaseCount + +ksea_heatmap_titles <- list() +ksea_heatmap_titles[[const_ksea_astrsk_kinases]] <- + sprintf( + "Summary for all kinases enriched in one or more contrasts at %s < %s", + ksea_cutoff_statistic, + ksea_cutoff_threshold + ) +ksea_heatmap_titles[[const_ksea_all_kinases]] <- + "Summary figure for all contrasts and all kinases" +ksea_heatmap_titles[[const_ksea_nonastrsk_kinases]] <- + sprintf( + "Summary for all kinases not enriched at %s < %s in any contrast", + ksea_cutoff_statistic, + ksea_cutoff_threshold + ) +# hash to hold names of significantly enriched kinases +ksea_asterisk_hash <- new_env() + +# READ PARAMETERS (mostly) + +intensity_hm_rows <- params$intensityHeatmapRows # Input Filename input_file <- params$inputFile @@ -220,7 +1194,7 @@ # False discovery rate adjustment for ANOVA # Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05 val_fdr <- - read.table(file = params$alphaFile, sep = "\t", header = F, quote = "") + read.table(file = params$alphaFile, sep = "\t", header = FALSE, quote = "") if ( ncol(val_fdr) != 1 || @@ -235,8 +1209,8 @@ #Imputed Data filename imputed_data_filename <- params$imputedDataFilename imp_qn_lt_data_filenm <- params$imputedQNLTDataFile - -#ANOVA data filename +anova_ksea_mtdt_file <- params$anovaKseaMetadata + ``` ```{r echo = FALSE} @@ -255,32 +1229,71 @@ regex_sample_names <- params$regexSampleNames # Regular expression to extract Sample Grouping from Sample Name; -# if error occurs, compare sample_factor_levels and sample_name_matches +# if error occurs, compare sample_treatment_levels vs. sample_name_matches # to see if groupings/pairs line up # e.g., "(\\d+)" regex_sample_grouping <- params$regexSampleGrouping +one_way_all_categories_fname <- params$oneWayManyCategories +one_way_all_categories <- try_catch_w_e( + match.fun(one_way_all_categories_fname)) +if (!is.function(one_way_all_categories$value)) { + write("fatal error for parameter oneWayManyCategories:", stderr()) + write(one_way_all_categories$value$message, stderr()) + if (sys.nframe() > 0) quit(save = "no", status = 1) + stop("Cannot continue. Goodbye.") +} +one_way_all_categories <- one_way_all_categories$value + +one_way_two_categories_fname <- params$oneWayManyCategories +one_way_two_categories <- try_catch_w_e( + match.fun(one_way_two_categories_fname)) +if (!is.function(one_way_two_categories$value)) { + cat("fatal error for parameter oneWayTwoCategories: \n") + cat(one_way_two_categories$value$message, fill = TRUE) + if (sys.nframe() > 0) quit(save = "no", status = 1) + stop("Cannot continue. Goodbye.") +} +one_way_two_categories <- one_way_two_categories$value + +preproc_db <- params$preprocDb +ksea_app_prep_db <- params$kseaAppPrepDb +result <- file.copy( + from = preproc_db, + to = ksea_app_prep_db, + overwrite = TRUE + ) +if (!result) { + write( + sprintf( + "fatal error copying initial database '%s' to output '%s'", + preproc_db, + ksea_app_prep_db, + ), + stderr() + ) + if (sys.nframe() > 0) quit(save = "no", status = 1) + stop("Cannot continue. Goodbye.") +} ``` ```{r echo = FALSE} ### READ DATA -library(data.table) - # read.table reads a file in table format and creates a data frame from it. # - note that `quote = ""` means that quotation marks are treated literally. full_data <- read.table( file = input_file, sep = "\t", - header = T, + header = TRUE, quote = "", check.names = FALSE ) ``` -## Extract Sample Names and Factor Levels - -Column names parsed from input file are shown in Table 1; sample names and factor levels, in Table 2. +# Extract Sample Names and Treatment Levels + +Column names parsed from input file are shown in Table 1; sample names and treatment levels, in Table 2. ```{r echo = FALSE, results = 'asis'} @@ -297,7 +1310,7 @@ cat( sprintf( paste( - "\n\nPeptide-intensity data for each sample is", + "\n\nThe input data file has peptide-intensity data for each sample", "in one of columns %d through %d.\n\n" ), min(data_column_indices), @@ -308,29 +1321,30 @@ # Write column names as a LaTeX enumerated list. column_name_df <- data.frame( column = seq_len(length(colnames(full_data))), - name = colnames(full_data) + name = paste0("\\verb@", colnames(full_data), "@") ) data_frame_latex( x = column_name_df, justification = "l l", centered = TRUE, - caption = "Input data column name", - anchor = const_table_anchor + caption = "Input data column names", + anchor = const_table_anchor_bp, + underscore_whack = FALSE ) ``` ```{r echo = FALSE, results = 'asis'} -#```{r echo = FALSE, results = 'asis'} quant_data <- full_data[first_data_column:length(full_data)] quant_data[quant_data == 0] <- NA -rownames(quant_data) <- full_data$Phosphopeptide -# Get factors -> group replicates (as indicated by terminal letter) -# by the preceding digits; -# Assuming that regex_sample_names <- "\\.(\\d+)[A-Z]$" -# get factors -> -# group runs (samples) by ignoring terminal [A-Z] in sample names -# e.g. +rownames(quant_data) <- rownames(full_data) <- full_data$Phosphopeptide +# Extract factors and trt-replicates using regular expressions. +# Typically: +# regex_sample_names is "\\.\\d+[A-Z]$" +# regex_sample_grouping is "\\d+" +# This would distinguish trt-replicates by terminal letter [A-Z] +# in sample names and group them into trts by the preceding digits. +# e.g.: # group .1A .1B .1C into group 1; # group .2A .2B .2C, into group 2; # etc. @@ -340,26 +1354,28 @@ write_debug_file(quant_data) -m2 <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE) -sample_factor_levels <- as.factor(regmatches(sample_name_matches, m2)) +rx_match <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE) +sample_treatment_levels <- as.factor(regmatches(sample_name_matches, rx_match)) number_of_samples <- length(sample_name_matches) -sample_factor_df <- data.frame( - sample = sample_name_matches, - level = sample_factor_levels +sample_treatment_df <- data.frame( + level = sample_treatment_levels, + sample = sample_name_matches ) data_frame_latex( - x = sample_factor_df, - justification = "c c", + x = sample_treatment_df, + justification = "rp{0.2\\linewidth} lp{0.3\\linewidth}", centered = TRUE, - caption = "Factor level", - anchor = const_table_anchor + caption = "Treatment levels", + anchor = const_table_anchor_tbp, + underscore_whack = FALSE ) ``` + ```{r echo = FALSE, results = 'asis'} cat("\\newpage\n") ``` -### Are the log-transformed sample distributions similar? +## Are the log-transformed sample distributions similar? ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} @@ -394,24 +1410,18 @@ ``` ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'} -library(ggplot2) if (nrow(quant_data_log) > 1) { quant_data_log_stack <- stack(quant_data_log) - ggplot( - quant_data_log_stack, - aes(x = values) - ) + xlab(latex2exp::TeX("$log_{10}$(peptide intensity)")) + - ylab("Probability density") + - geom_density( - aes(group = ind, colour = ind), - na.rm = TRUE - ) + ggplot2::ggplot(quant_data_log_stack, ggplot2::aes(x = values)) + + ggplot2::xlab(latex2exp::TeX("$log_{10}$(peptide intensity)")) + + ggplot2::ylab("Probability density") + + ggplot2::geom_density(ggplot2::aes(group = ind, colour = ind), na.rm = TRUE) } else { cat("No density plot because there are too few peptides.\n\n") } ``` -### Globally, are peptide intensities are approximately unimodal? +## Globally, are peptide intensities are approximately unimodal? <!-- # bquote could be used as an alternative to latex2exp::TeX below particularly @@ -444,22 +1454,17 @@ ) ``` -### Distribution of standard deviations of $log_{10}(\text{intensity})$, ignoring missing values: +## Distribution of standard deviations of $log_{10}(\text{intensity})$, ignoring missing values ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'} # determine quantile q1 <- quantile(logvalues, probs = mean_percentile)[1] -# determine standard deviation of quantile to impute -sd_finite <- function(x) { - ok <- is.finite(x) - sd(x[ok]) -} # 1 = row of matrix (ie, phosphopeptide) sds <- apply(quant_data_log, 1, sd_finite) if (sum(!is.na(sds)) > 2) { plot( - density(sds, na.rm = T) + density(sds, na.rm = TRUE) , main = "Smoothed estimated probability density vs. std. deviation" , sub = "(probability estimation made with Gaussian smoothing)" , ylab = "Probability density" @@ -490,12 +1495,12 @@ # prep for trt-median based imputation ``` -## Impute Missing Values +# Impute Missing Values ```{r echo = FALSE} -imp_smry_potential_before <- length(logvalues) -imp_smry_missing_before <- number_to_impute +imp_smry_pot_peptides_before <- nrow(quant_data_log) +imp_smry_missing_values_before <- number_to_impute imp_smry_pct_missing <- pct_missing_values ``` @@ -506,11 +1511,8 @@ ``` ```{r echo = FALSE} -#Impute data -quant_data_imp <- quant_data - # Identify which values are missing and need to be imputed -ind <- which(is.na(quant_data_imp), arr.ind = TRUE) +ind <- which(is.na(quant_data), arr.ind = TRUE) ``` ```{r echo = FALSE, results = 'asis'} @@ -519,37 +1521,74 @@ switch( imputation_method , "group-median" = { + quant_data_imp <- quant_data imputation_method_description <- paste("Substitute missing value with", - "median peptide intensity for sample group.\n" + "median peptide-intensity for sample group.\n" ) - sample_level_integers <- as.integer(sample_factor_levels) - for (i in seq_len(length(levels(sample_factor_levels)))) { + sample_level_integers <- as.integer(sample_treatment_levels) + # Take the accurate ln(x+1) because the data are log-normally distributed + # and because median can involve an average of two measurements. + quant_data_imp <- log1p(quant_data_imp) + for (i in seq_len(length(levels(sample_treatment_levels)))) { + # Determine the columns for this factor-level level_cols <- i == sample_level_integers - ind <- which(is.na(quant_data_imp[, level_cols]), arr.ind = TRUE) - quant_data_imp[ind, level_cols] <- - apply(quant_data_imp[, level_cols], 1, median, na.rm = T)[ind[, 1]] + # Extract those columns + lvlsbst <- quant_data_imp[, level_cols, drop = FALSE] + # assign to ind the row-column pairs corresponding to each NA + ind <- which(is.na(lvlsbst), arr.ind = TRUE) + # No group-median exists if there is only one sample + # a given ppep has no measurement; otherwise, proceed. + if (ncol(lvlsbst) > 1) { + the_centers <- + apply(lvlsbst, 1, median, na.rm = TRUE) + for (j in seq_len(nrow(lvlsbst))) { + for (k in seq_len(ncol(lvlsbst))) { + if (is.na(lvlsbst[j, k])) { + lvlsbst[j, k] <- the_centers[j] + } + } + } + quant_data_imp[, level_cols] <- lvlsbst + } } + # Take the accurate e^x - 1 to match scaling of original input. + quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp)) good_rows <- !is.na(rowMeans(quant_data_imp)) } , "median" = { + quant_data_imp <- quant_data imputation_method_description <- paste("Substitute missing value with", - "median peptide intensity across all sample classes.\n" + "median peptide-intensity across all sample classes.\n" ) - quant_data_imp[ind] <- apply(quant_data_imp, 1, median, na.rm = T)[ind[, 1]] - good_rows <- !is.na(rowMeans(quant_data_imp)) + # Take the accurate ln(x+1) because the data are log-normally distributed + # and because median can involve an average of two measurements. + quant_data_imp <- log1p(quant_data_imp) + quant_data_imp[ind] <- apply(quant_data_imp, 1, median, na.rm = TRUE)[ind[, 1]] + # Take the accurate e^x - 1 to match scaling of original input. + quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp)) + good_rows <- !is.nan(rowMeans(quant_data_imp)) } , "mean" = { + quant_data_imp <- quant_data imputation_method_description <- paste("Substitute missing value with", - "mean peptide intensity across all sample classes.\n" + "geometric-mean peptide intensity across all sample classes.\n" ) - quant_data_imp[ind] <- apply(quant_data_imp, 1, mean, na.rm = T)[ind[, 1]] - good_rows <- !is.na(rowMeans(quant_data_imp)) + # Take the accurate ln(x+1) because the data are log-normally distributed, + # so arguments to mean should be previously transformed. + # this will have to be + quant_data_imp <- log1p(quant_data_imp) + # Assign to NA cells the mean for the row + quant_data_imp[ind] <- apply(quant_data_imp, 1, mean, na.rm = TRUE)[ind[, 1]] + # Take the accurate e^x - 1 to match scaling of original input. + quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp)) + good_rows <- !is.nan(rowMeans(quant_data_imp)) } , "random" = { - m1 <- median(sds, na.rm = T) * sd_percentile #sd to be used is the median sd + quant_data_imp <- quant_data + m1 <- median(sds, na.rm = TRUE) * sd_percentile #sd to be used is the median sd # If you want results to be reproducible, you will want to call # base::set.seed before calling stats::rnorm imputation_method_description <- @@ -565,9 +1604,11 @@ sd_percentile)) quant_data_imp[ind] <- 10 ^ rnorm(number_to_impute, mean = q1, sd = m1) - good_rows <- !is.na(rowMeans(quant_data_imp)) + quant_data_imp_ln <- log1p(quant_data_imp) + good_rows <- !is.nan(rowMeans(quant_data_imp)) } ) +quant_data_imp_log10 <- quant_data_imp_ln * const_log10_e if (length(good_rows) < 1) { print("ERROR: Cannot impute data; there are no good rows!") @@ -581,13 +1622,13 @@ ```{r echo = FALSE} -imp_smry_potential_after <- sum(good_rows) +imp_smry_pot_peptides_after <- sum(good_rows) imp_smry_rejected_after <- sum(!good_rows) -imp_smry_missing_after <- sum(is.na(quant_data_imp[good_rows, ])) +imp_smry_missing_values_after <- sum(is.na(quant_data_imp[good_rows, ])) ``` ```{r echo = FALSE, results = 'asis'} # ref: http://www1.maths.leeds.ac.uk/latex/TableHelp1.pdf -tabular_lines <- paste( +tabular_lines_fmt <- paste( "\\begin{table}[hb]", # h(inline); b(bottom); t (top) or p (separate page) " \\caption{Imputation Results}", " \\centering", # \centering centers the table on the page @@ -606,13 +1647,13 @@ ) tabular_lines <- sprintf( - tabular_lines, - imp_smry_potential_before, - imp_smry_missing_before, + tabular_lines_fmt, + imp_smry_pot_peptides_before, + imp_smry_missing_values_before, imp_smry_pct_missing, "%", - imp_smry_potential_after, - imp_smry_missing_after, + imp_smry_pot_peptides_after, + imp_smry_missing_values_after, imp_smry_rejected_after ) cat(tabular_lines) @@ -624,9 +1665,8 @@ full_data <- full_data [good_rows, ] quant_data <- quant_data [good_rows, ] +quant_data_imp <- quant_data_imp[good_rows, ] write_debug_file(quant_data_imp) - -quant_data_imp <- quant_data_imp[good_rows, ] quant_data_imp_good_rows <- quant_data_imp write_debug_file(quant_data_imp_good_rows) @@ -648,9 +1688,13 @@ ) ) -if (sum(!is.na(d_combined)) > 2 && sum(!is.na(d_original)) > 2) { +if (sum(!is.na(d_original)) > 2) { + d_original <- density(d_original) +} else { + can_plot_before_after_imp <- FALSE +} +if (can_plot_before_after_imp && sum(is.na(d_combined)) < 1) { d_combined <- density(d_combined) - d_original <- density(d_original) } else { can_plot_before_after_imp <- FALSE } @@ -663,7 +1707,7 @@ log10(quant_data_imp[is.na(quant_data)]) ) ) - if (sum(!is.na(d_combined)) > 2) { + if (can_plot_before_after_imp && sum(is.na(d_imputed)) < 1) { d_imputed <- (density(d_imputed)) } else { can_plot_before_after_imp <- FALSE @@ -676,24 +1720,53 @@ ``` ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} +zero_sd_rownames <- + rownames(quant_data_imp)[ + is.na((apply(quant_data_imp, 1, sd, na.rm = TRUE)) == 0) + ] + +if (length(zero_sd_rownames) >= nrow(quant_data_imp)) { + stop("All peptides have zero standard deviation. Cannot continue.") +} +if (length(zero_sd_rownames) > 0) { + cat( + sprintf("%d peptides with zero variance were removed from statistical consideration", + length(zero_sd_rownames) + ) + ) + zap_named_rows <- function(df, nms) { + return(df[!(row.names(df) %in% nms), ]) + } + quant_data_imp <- zap_named_rows(quant_data_imp, zero_sd_rownames) + quant_data <- zap_named_rows(quant_data, zero_sd_rownames) + full_data <- zap_named_rows(full_data, zero_sd_rownames) +} + if (sum(is.na(quant_data)) > 0) { cat("\\leavevmode\\newpage\n") # data visualization old_par <- par( mai = par("mai") + c(0.5, 0, 0, 0) ) + # Copy quant data to x x <- quant_data - x <- blue_dots <- x / x - blue_dots <- log10(blue_dots * quant_data) + # x gets to have values of: + # - NA for observed values + # - 1 for missing values x[is.na(x)] <- 0 - x[x == 1] <- NA + x[x > 1] <- NA x[x == 0] <- 1 - quant_data_imp_log10 <- - log10(quant_data_imp) + # Log-transform imputed data + # update variable because rows may have been eliminated from quant_data_imp + quant_data_imp_log10 <- log10(quant_data_imp) write_debug_file(quant_data_imp_log10) + # Set blue_dots to log of quant data or NA for NA quant data + blue_dots <- log10(quant_data) + # Set red_dots to log of imputed data or NA for observed quant data red_dots <- quant_data_imp_log10 * x + count_red <- sum(!is.na(red_dots)) count_blue <- sum(!is.na(blue_dots)) ylim_save <- ylim <- c( @@ -715,7 +1788,7 @@ , las = 1 # "always horizontal" , col = const_boxplot_fill , ylim = ylim - , main = "Peptide intensities before and after imputation" + , main = "Peptide intensities after eliminating unusable peptides" , sub = boxplot_sub , xlab = "Sample" , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") @@ -746,8 +1819,9 @@ add = TRUE # Add it over ) - } else { - # violin plot + } + if (TRUE) { + # show measured values in blue on left half-violin plot cat("\\leavevmode\n\\quad\n\n\\quad\n\n") vioplot::vioplot( x = lapply(blue_dots, function(x) x[!is.na(x)]), @@ -760,12 +1834,24 @@ xlab = "Sample", ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") ) + red_violins <- lapply(red_dots, function(x) x[!is.na(x)]) + cols_to_delete <- c() + for (ix in seq_len(length(red_violins))) { + if (length(red_violins[[ix]]) < 1) { + cols_to_delete <- c(cols_to_delete, ix) + } + } + # destroy any unimputable columns + if (!is.null(cols_to_delete)) { + red_violins <- red_violins[-cols_to_delete] + } + # plot imputed values in red on right half-violin plot vioplot::vioplot( - x = lapply(red_dots, function(x) x[!is.na(x)]), + x = red_violins, col = "lightpink1", side = "right", plotCentre = "line", - add = T + add = TRUE ) } @@ -774,7 +1860,15 @@ # density plot cat("\\leavevmode\n\n\n\n\n\n\n") if (can_plot_before_after_imp) { - ylim <- c(0, max(d_combined$y, d_original$y, d_imputed$y)) + ylim <- c( + 0, + max( + if (is.list(d_combined)) d_combined$y else d_combined, + if (is.list(d_original)) d_original$y else d_original, + if (is.list(d_imputed)) d_imputed$y else d_imputed, + na.rm = TRUE + ) + ) plot( d_combined, ylim = ylim, @@ -799,7 +1893,24 @@ } ``` -## Perform Quantile Normalization +# Perform Quantile Normalization + +The excellent `normalize.quantiles` function from +*[the `preprocessCore` Bioconductor package](http://bioconductor.org/packages/release/bioc/html/preprocessCore.html)* +performs "quantile normalization" as described Bolstad *et al.* (2003), +DOI *[10.1093/bioinformatics/19.2.185](https://doi.org/10.1093%2Fbioinformatics%2F19.2.185)* +and *its supplementary material [http://bmbolstad.com/misc/normalize/normalize.html](http://bmbolstad.com/misc/normalize/normalize.html)*, +i.e., it assumes that the goal is to detect +subtle differences among grossly similar samples (having similar distributions) +by equailzing intra-quantile quantitations. +Unfortunately, one software library upon which it depends +*[suffers from a concurrency defect](https://support.bioconductor.org/p/122925/#9135989)* +that requires that a specific, non-concurrent version of the library be +installed. The installation command equivalent to what was used to install the library to produce the results presented here is: +``` + conda install bioconductor-preprocesscore openblas=0.3.3 +``` + <!-- # Apply quantile normalization using preprocessCore::normalize.quantiles @@ -814,10 +1925,14 @@ # conda install bioconductor-preprocesscore openblas=0.3.3 # ... # --- -# normalize.quantiles {preprocessCore} -- Quantile Normalization +# normalize.quantiles {preprocessCore} -- Quantile Normalization # # Description: -# Using a normalization based upon quantiles, this function normalizes a matrix of probe level intensities. +# Using a normalization based upon quantiles, this function normalizes a +# matrix of probe level intensities. +# +# THIS FUNCTIONS WILL HANDLE MISSING DATA (ie NA values), based on the +# assumption that the data is missing at random. # # Usage: # normalize.quantiles(x, copy = TRUE, keep.names = FALSE) @@ -859,55 +1974,42 @@ # ... --> ```{r echo = FALSE, results = 'asis'} -library(preprocessCore) if (nrow(quant_data_imp) > 0) { - quant_data_imp_qn <- normalize.quantiles(as.matrix(quant_data_imp)) + quant_data_imp_qn <- preprocessCore::normalize.quantiles( + as.matrix(quant_data_imp), keep.names = TRUE + ) } else { quant_data_imp_qn <- as.matrix(quant_data_imp) } quant_data_imp_qn <- as.data.frame(quant_data_imp_qn) -names(quant_data_imp_qn) <- names(quant_data_imp) write_debug_file(quant_data_imp_qn) quant_data_imp_qn_log <- log10(quant_data_imp_qn) -rownames(quant_data_imp_qn_log) <- rownames(quant_data_imp) write_debug_file(quant_data_imp_qn_log) quant_data_imp_qn_ls <- t(scale(t(log10(quant_data_imp_qn)))) -any_nan <- function(x) { - !any(x == "NaN") -} sel <- apply(quant_data_imp_qn_ls, 1, any_nan) -quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls[which(sel), ] +quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls + +quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls2[which(sel), ] quant_data_imp_qn_ls2 <- as.data.frame(quant_data_imp_qn_ls2) -rownames(quant_data_imp_qn_ls2) <- rownames(quant_data_imp)[sel] quant_data_imp_qn_ls <- as.data.frame(quant_data_imp_qn_ls) -rownames(quant_data_imp_qn_ls) <- rownames(quant_data_imp) write_debug_file(quant_data_imp_qn_ls) write_debug_file(quant_data_imp_qn_ls2) -#output quantile normalized data +# Create data.frame used by ANOVA analysis data_table_imp_qn_lt <- cbind(full_data[1:9], quant_data_imp_qn_log) -write.table( - data_table_imp_qn_lt, - file = imp_qn_lt_data_filenm, - sep = "\t", - col.names = TRUE, - row.names = FALSE, - quote = FALSE -) - ``` <!-- ACE insertion begin --> -### Checking that normalized, imputed, log-transformed sample distributions are similar: +## Are normalized, imputed, log-transformed sample distributions similar? ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} @@ -919,18 +2021,6 @@ quant_data_imp_qn[quant_data_imp_qn == 0] <- .000000001 quant_data_log <- log10(quant_data_imp_qn) -# Output with imputed, un-normalized data - -data_table_imputed <- cbind(full_data[1:9], quant_data_imp) -write.table( - data_table_imputed - , file = imputed_data_filename - , sep = "\t" - , col.names = TRUE - , row.names = FALSE - , quote = FALSE - ) - how_many_peptides <- nrow(quant_data_log) if ((how_many_peptides) > 0) { @@ -970,15 +2060,10 @@ ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'} if (nrow(quant_data_log) > 1) { quant_data_log_stack <- stack(quant_data_log) - ggplot( - quant_data_log_stack, - aes(x = values) - ) + xlab(latex2exp::TeX("$log_{10}$(peptide intensity)")) + - ylab("Probability density") + - geom_density( - aes(group = ind, colour = ind), - na.rm = TRUE - ) + ggplot2::ggplot(quant_data_log_stack, ggplot2::aes(x = values)) + + ggplot2::xlab(latex2exp::TeX("$log_{10}$(peptide intensity)")) + + ggplot2::ylab("Probability density") + + ggplot2::geom_density(ggplot2::aes(group = ind, colour = ind), na.rm = TRUE) } else { cat("No density plot because there are fewer than two peptides to plot.\n\n") } @@ -987,7 +2072,7 @@ cat("\\leavevmode\\newpage\n") ``` -## Perform ANOVA Filters +# ANOVA Analysis ```{r, echo = FALSE} # Make new data frame containing only Phosphopeptides @@ -1000,8 +2085,8 @@ ``` ```{r echo = FALSE, fig.dim = c(9, 10), results = 'asis'} -count_of_factor_levels <- length(levels(sample_factor_levels)) -if (count_of_factor_levels < 2) { +count_of_treatment_levels <- length(levels(sample_treatment_levels)) +if (count_of_treatment_levels < 2) { nuke_control_sequences <- function(s) { s <- gsub("[\\]", "xyzzy_plugh", s) @@ -1054,16 +2139,18 @@ cat("\n\n\n") cat("Sample group assignments are:\n", "\\begin{quote}\n", - paste(regmatches(sample_name_matches, m2), collapse = "\n\n\n"), + paste(regmatches(sample_name_matches, rx_match), collapse = "\n\n\n"), "\n\\end{quote}\n\n") } else { + p_value_data_anova_ps <- apply( quant_data_imp_qn_log, 1, anova_func, - grouping_factor = sample_factor_levels + grouping_factor = sample_treatment_levels, + one_way_f = one_way_all_categories ) p_value_data_anova_ps_fdr <- @@ -1081,8 +2168,9 @@ # becomes "Outpufile_pST_ANOVA_STEP5_FDR0.05.txt" # Re-output datasets to include p-values + metadata_plus_p <- cbind(full_data[1:9], p_value_data[, 2:3]) write.table( - cbind(full_data[1:9], p_value_data[, 2:3], quant_data_imp), + cbind(metadata_plus_p, quant_data_imp), file = imputed_data_filename, sep = "\t", col.names = TRUE, @@ -1091,7 +2179,7 @@ ) write.table( - cbind(full_data[1:9], p_value_data[, 2:3], quant_data_imp_qn_log), + cbind(metadata_plus_p, quant_data_imp_qn_log), file = imp_qn_lt_data_filenm, sep = "\t", col.names = TRUE, @@ -1135,15 +2223,22 @@ if (nrow(filtered_p) && nrow(filtered_data_filtered) > 0) { if (first_page_suppress == 1) { first_page_suppress <- 0 - } - else { + } else { cat("\\newpage\n") } - cat(sprintf( - "Intensities for %d peptides whose adjusted p-value < %0.2f:\n", - nrow(filtered_data_filtered), - cutoff - )) + if (nrow(filtered_data_filtered) > 1) { + subsection_header(sprintf( + "Intensity distributions for %d phosphopeptides whose adjusted p-value < %0.2f\n", + nrow(filtered_data_filtered), + cutoff + )) + } else { + subsection_header(sprintf( + "Intensity distribution for one phosphopeptide (%s) whose adjusted p-value < %0.2f\n", + rownames(filtered_data_filtered)[1], + cutoff + )) + } cat("\n\n\n") cat("\n\n\n") @@ -1158,12 +2253,15 @@ # ref: https://r-charts.com/distribution/add-points-boxplot/ # Vertical plot colnames(filtered_data_filtered) <- sample_name_matches - boxplot( - filtered_data_filtered, - main = "Imputed, normalized intensities", # no line plot - las = 1, - col = const_boxplot_fill, - ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") + tryCatch( + boxplot( + filtered_data_filtered, + main = "Imputed, normalized intensities", # no line plot + las = 1, + col = const_boxplot_fill, + ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") + ), + error = function(e) print(e) ) par(old_par) } else { @@ -1175,8 +2273,12 @@ } if (nrow(filtered_data_filtered) > 0) { - #Add Phosphopeptide column to anova_filtered table - anova_filtered_merge <- merge( + # Add Phosphopeptide column to anova_filtered table + # The assumption here is that the first intensity is unique; + # this is a hokey assumption but almost definitely will + # be true in the real world unless there is a computation + # error upstream. + anova_filtered_merge <- base::merge( x = connect_df, y = filtered_data_filtered, by.x = "Intensity", @@ -1184,6 +2286,25 @@ ) anova_filtered_merge_order <- rownames(filtered_p) + anova_filtered <- data.frame( + ppep = anova_filtered_merge$Phosphopeptide, + intense = anova_filtered_merge$Intensity, + data = anova_filtered_merge[, 2:number_of_samples + 1] + ) + colnames(anova_filtered) <- + c("Phosphopeptide", colnames(filtered_data_filtered)) + + # Merge qualitative columns into the ANOVA data + output_table <- data.frame(anova_filtered$Phosphopeptide) + output_table <- base::merge( + x = output_table, + y = data_table_imp_qn_lt, + by.x = "anova_filtered.Phosphopeptide", + by.y = "Phosphopeptide" + ) + + # Produce heatmap to visualize significance and the effect of imputation + anova_filtered_merge_format <- sapply( X = filtered_p$fdr_adjusted_anova_p , @@ -1195,35 +2316,36 @@ } ) - anova_filtered <- data.table( - anova_filtered_merge$Phosphopeptide - , - anova_filtered_merge$Intensity - , - anova_filtered_merge[, 2:number_of_samples + 1] - ) - colnames(anova_filtered) <- - c("Phosphopeptide", colnames(filtered_data_filtered)) - - # Merge qualitative columns into the ANOVA data - output_table <- data.frame(anova_filtered$Phosphopeptide) - output_table <- merge( - x = output_table, - y = data_table_imp_qn_lt, - by.x = "anova_filtered.Phosphopeptide", - by.y = "Phosphopeptide" - ) - - # Produce heatmap to visualize significance and the effect of imputation + cat_hm_heading <- function(m, cutoff) { + cat("\\newpage\n") + if (nrow(m) > intensity_hm_rows) { + subsection_header( + paste( + sprintf("Heatmap for the %d most-significant peptides", + intensity_hm_rows), + sprintf("whose adjusted p-value < %0.2f\n", cutoff) + ) + ) + } else { + if (nrow(m) == 1) { + return(FALSE) + } else { + subsection_header( + paste( + sprintf("Heatmap for %d usable peptides whose", nrow(m)), + sprintf("adjusted p-value < %0.2f\n", cutoff) + ) + ) + } + } + cat("\n\n\n") + cat("\n\n\n") + return(TRUE) + } + + # construct matrix with appropriate rownames m <- as.matrix(unimputed_quant_data_log[anova_filtered_merge_order, ]) - m_nan_rows <- rowSums( - matrix( - as.integer(is.na(m)), - nrow = nrow(m) - ) - ) - m <- m[!m_nan_rows, , drop = FALSE] if (nrow(m) > 0) { rownames_m <- rownames(m) rownames(m) <- sapply( @@ -1237,57 +2359,1178 @@ ) } ) - how_many_peptides <- min(50, nrow(m)) - number_of_peptides_found <- how_many_peptides - if (nrow(m) > 1) { - m_margin <- m[how_many_peptides:1, ] - margins <- - c(max(nchar(colnames(m_margin))) * 10 / 16 # col - , max(nchar(rownames(m_margin))) * 5 / 16 # row - ) - } - - cat("\\newpage\n") - if (nrow(m) > 50) { - cat("Heatmap for the 50 most-significant peptides", - sprintf( - "whose adjusted p-value < %0.2f\n", - cutoff) - ) - } else { - if (nrow(m) == 1) { - next - } else { - cat( - sprintf("Heatmap for %d usable peptides whose", nrow(m)), - sprintf("adjusted p-value < %0.2f\n", cutoff) + } + # draw the heading and heatmap + if (nrow(m) > 0) { + number_of_peptides_found <- + draw_intensity_heatmap( + m = m, + cutoff = cutoff, + hm_heading_function = cat_hm_heading, + hm_main_title = "Unimputed, unnormalized log(intensities)", + suppress_row_dendrogram = FALSE ) - } - } - cat("\n\n\n") - cat("\n\n\n") - try( - if (nrow(m) > 1) { - old_oma <- par("oma") - par(cex.main = 0.6) - heatmap( - m[how_many_peptides:1, ], - Rowv = NA, - Colv = NA, - cexRow = 0.7, - cexCol = 0.8, - scale = "row", - margins = margins, - main = - "Unimputed, unnormalized intensities", - xlab = "", - las = 1 #, fin = c(9, 5.5) - ) - } - ) } } } } cat("\\leavevmode\n\n\n") ``` + +```{r sqlite, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} + +if (count_of_treatment_levels > 1) { + # Prepare two-way contrasts with adjusted p-values + # Strategy: + # - use imputed, log-transformed data: + # - remember this when computing log2(fold-change) + # - each contrast is between a combination of trt levels + # - for each contrast, compute samples that are members + # - compute one-way test: + # - use `oneway.test` (Welch test) if numbers of samples + # are not equivalent between trt levels + # - otherwise, aov is fine but offers no advantage + # - adjust p-value, assuming that + # (# of pppeps)*(# of contrasts) tests were performed + + # Each contrast is between a combination of trt levels + m2 <- combn( + x = seq_len(length(levels(sample_treatment_levels))), + m = 2, + simplify = TRUE + ) + contrast_count <- ncol(m2) + + # For each contrast, compute samples that are members + # - local function to construct a data.frame for each contrast + # - the contrast in the first "column" + f_m2 <- + function(cntrst, lvl1, lvl2) { + return( + data.frame( + contrast = cntrst, + level = sample_treatment_levels[ + sample_treatment_levels %in% + levels(sample_treatment_levels)[c(lvl1, lvl2)] + ], + label = sample_name_matches[ + sample_treatment_levels %in% + levels(sample_treatment_levels)[c(lvl1, lvl2)] + ] + ) + ) + } + # - compute a df for each contrast + sample_level_dfs <- lapply( + X = 1:contrast_count, + FUN = function(i) f_m2(i, m2[1, i], m2[2, i]) + ) + # - compute a single df for all contrasts + combined_contrast_df <- Reduce(f = rbind, x = sample_level_dfs) + + # - dispose objects to free resources + rm(sample_level_dfs) + + # - write the df to a DB for later join-per-contrast + db <- RSQLite::dbConnect(RSQLite::SQLite(), ksea_app_prep_db) + + RSQLite::dbWriteTable( + conn = db, + name = "contrast", + value = combined_contrast_df, + overwrite = TRUE + ) + + # Create UK for insert + ddl_exec(db, " + CREATE UNIQUE INDEX IF NOT EXISTS contrast__uk__idx + ON contrast(contrast, label); + " + ) + # Create indexes for join + ddl_exec(db, " + -- index for join in contrast_ppep_smpl_qnlt on a.label < b.label + CREATE INDEX IF NOT EXISTS contrast__label__idx + ON contrast(label); + " + ) + ddl_exec(db, " + -- index for joining two contrast_lvl_ppep_avg_quant on contrast + CREATE INDEX IF NOT EXISTS contrast__contrast__idx + ON contrast(contrast); + " + ) + ddl_exec(db, " + -- index for joining two contrast_lvl_ppep_avg_quant on phophospep + CREATE INDEX IF NOT EXISTS contrast__level__idx + ON contrast(level); + " + ) + # - dispose objects to free resources + rm(combined_contrast_df) + + # Use imputed, log-transformed data + # - remember that this was donoe when computing log2(fold-change) + # - melt data matrix for use in later join-per-contrast + casted <- cbind( + data.frame(vrbl = rownames(quant_data_imp_qn_log)), + quant_data_imp_qn_log + ) + quant_data_imp_qn_log_melted <- reshape2::melt( + casted, + id.vars = "vrbl" + ) + colnames(quant_data_imp_qn_log_melted) <- + c("phosphopep", "sample", "quant") + # - dispose objects to free resources + rm(casted) + + # - write the df to a DB for use in later join-per-contrast + RSQLite::dbWriteTable( + conn = db, + name = "ppep_smpl_qnlt", + value = quant_data_imp_qn_log_melted, + overwrite = TRUE + ) + # Create UK for insert + ddl_exec(db, " + CREATE UNIQUE INDEX IF NOT EXISTS ppep_smpl_qnlt__uk__idx + ON ppep_smpl_qnlt(phosphopep, sample); + " + ) + # Create index for join + ddl_exec(db, " + -- index for join in contrast_ppep_smpl_qnlt + CREATE INDEX IF NOT EXISTS ppep_smpl_qnlt__sample__idx + ON ppep_smpl_qnlt(sample); + " + ) + ddl_exec(db, " + -- index for joining two contrast_lvl_ppep_avg_quant on phopho.pep + CREATE INDEX IF NOT EXISTS ppep_smpl_qnlt__phosphopep__idx + ON ppep_smpl_qnlt(phosphopep); + " + ) + # - dispose objects to free resources + rm(quant_data_imp_qn_log_melted) + + # - drop views if exist + ddl_exec(db, " + -- drop view dependent on contrast_lvl_ppep_avg_quant + DROP VIEW IF EXISTS v_contrast_log2_fc; + " + ) + ddl_exec(db, " + -- drop table dependent on contrast_ppep_smpl_qnlt + DROP TABLE IF EXISTS contrast_lvl_ppep_avg_quant; + " + ) + ddl_exec(db, " + DROP TABLE IF EXISTS contrast_lvl_lvl_metadata; + " + ) + ddl_exec(db, " + DROP VIEW IF EXISTS v_contrast_lvl_metadata; + " + ) + ddl_exec(db, " + -- drop view dependent on contrast_ppep_smpl_qnlt + DROP VIEW IF EXISTS v_contrast_lvl_ppep_avg_quant; + " + ) + ddl_exec(db, " + DROP VIEW IF EXISTS v_contrast_lvl_lvl; + " + ) + ddl_exec(db, " + -- drop view upon which other views depend + DROP VIEW IF EXISTS contrast_ppep_smpl_qnlt; + " + ) + # - create view + dml_no_rows_exec(db, " + -- view contrast_ppep_smpl_qnlt is used for each phopshopep to + -- compute p-value for test of trt effect for two trt levels + CREATE VIEW contrast_ppep_smpl_qnlt + AS + SELECT contrast, + level, + phosphopep, + sample, + quant + FROM contrast AS c, + ppep_smpl_qnlt AS q + WHERE q.sample = c.label + ORDER BY contrast, level, phosphopep + ; + " + ) + # - create simplification views + dml_no_rows_exec(db, " + CREATE VIEW v_contrast_lvl_metadata + AS + SELECT contrast, + level, + group_concat(label, ';') AS samples + FROM contrast + GROUP BY contrast, level + /* view v_contrast_lvl_metadata is used + to simplify creation of table contrast_lvl_lvl_metadata */ + ; + " + ) + dml_no_rows_exec(db, " + CREATE VIEW v_contrast_lvl_ppep_avg_quant + AS + SELECT contrast, + level, + phosphopep, + avg(quant) AS avg_quant + FROM contrast_ppep_smpl_qnlt + GROUP BY contrast, level, phosphopep + /* view v_contrast_lvl_ppep_avg_quant is used + to simplify view v_contrast_log2_fc */ + ; + " + ) + + # - create contrast-metadata table + dml_no_rows_exec(db, " + CREATE TABLE contrast_lvl_lvl_metadata + AS + SELECT DISTINCT + a.contrast AS ab_contrast, + a.level AS a_level, + b.level AS b_level, + a.samples AS a_samples, + b.samples AS b_samples, + 'log2(level_'||a.level||'/level_'||b.level||')' + AS fc_description + FROM v_contrast_lvl_metadata AS a, + v_contrast_lvl_metadata AS b + WHERE a.contrast = b.contrast + AND a.level > b.level + /* view v_contrast_lvl_lvl is used + to simplify view v_contrast_log2_fc */ + ; + " + ) + # - create pseudo-materialized view table + dml_no_rows_exec(db, " + CREATE VIEW v_contrast_lvl_lvl + AS + SELECT DISTINCT + a.contrast AS ab_contrast, + a.level AS a_level, + b.level AS b_level + FROM contrast AS a, + contrast AS b + WHERE a.contrast = b.contrast + AND a.level > b.level + /* view v_contrast_lvl_lvl is used + to simplify view v_contrast_log2_fc */ + ; + " + ) + + # - create view to compute log2(fold-change) + dml_no_rows_exec(db, " + CREATE VIEW v_contrast_log2_fc + AS + SELECT ab.ab_contrast AS contrast, + m.a_level AS a_level, + c.avg_quant AS a_quant, + m.a_samples AS a_samples, + ab.b_level AS b_level, + d.avg_quant AS b_quant, + m.b_samples AS b_samples, + m.fc_description AS fc_description, + 3.32193 * ( d.avg_quant - c.avg_quant) AS log2_fc, + d.phosphopep AS phosphopep + FROM contrast_lvl_lvl_metadata AS m, + v_contrast_lvl_ppep_avg_quant AS d, + v_contrast_lvl_lvl AS ab + INNER JOIN v_contrast_lvl_ppep_avg_quant AS c + ON c.contrast = ab.ab_contrast + AND c.level = ab.a_level + WHERE d.contrast = ab.ab_contrast + AND m.ab_contrast = ab.ab_contrast + AND d.level = ab.b_level + AND d.phosphopep = c.phosphopep + /* view to compute log2(fold-change) */ + ; + " + ) + + # For each contrast, compute samples that are members + # compute one-way test: + # - use `oneway.test` (Welch test) if numbers of samples + # are not equivalent between trt levels + # - otherwise, aov is fine but offers no advantage + for (contrast in contrast_count:2) { + invisible(contrast) + } + for (contrast in 1:contrast_count) { + contrast_df <- sqldf::sqldf( + x = paste0(" + SELECT level, phosphopep, sample, quant + FROM contrast_ppep_smpl_qnlt + WHERE contrast = ", contrast, " + ORDER BY phosphopep, level, sample + "), + connection = db + ) + contrast_cast <- reshape2::dcast( + data = contrast_df, + formula = phosphopep ~ sample, + value.var = "quant" + ) + contrast_cast_ncol <- ncol(contrast_cast) + contrast_cast_data <- contrast_cast[, 2:contrast_cast_ncol] + contrast_cast_samples <- colnames(contrast_cast_data) + + # - order grouping_factor by order of sample columns of contrast_cast_data + grouping_factor <- sqldf::sqldf( + x = paste0(" + SELECT sample, level + FROM contrast_ppep_smpl_qnlt + WHERE contrast = ", contrast, " + ORDER BY phosphopep, level, sample + LIMIT ", contrast_cast_ncol - 1 + ), + connection = db + ) + rownames(grouping_factor) <- grouping_factor$sample + grouping_factor <- grouping_factor[, "level", drop = FALSE] + + # - run the two-level (one-way) test + p_value_data_contrast_ps <- + apply( + X = contrast_cast_data, + MARGIN = 1, # apply to rows + FUN = anova_func, + grouping_factor = + as.factor(as.numeric(grouping_factor$level)), # anova_func arg2 + one_way_f = one_way_two_categories, # anova_func arg3 + simplify = TRUE # TRUE is the default for simplify + ) + contrast_data_adj_p_values <- p.adjust( + p = p_value_data_contrast_ps, + method = "fdr", + n = length(p_value_data_contrast_ps) # this is the default, length(p) + ) + # - compute the fold-change + contrast_p_df <- + data.frame( + contrast = contrast, + phosphopep = contrast_cast$phosphopep, + p_value_raw = p_value_data_contrast_ps, + p_value_adj = contrast_data_adj_p_values + ) + db_write_table_overwrite <- (contrast < 2) + db_write_table_append <- !db_write_table_overwrite + RSQLite::dbWriteTable( + conn = db, + name = "contrast_ppep_p_val", + value = contrast_p_df, + append = db_write_table_append + ) + # Create UK for insert + ddl_exec(db, " + CREATE UNIQUE INDEX IF NOT EXISTS contrast_ppep_p_val__uk__idx + ON contrast_ppep_p_val(phosphopep, contrast); + " + ) + } + # Perhaps this could be done more elegantly using unique keys + # or creating the tables before saving data to them, but this + # is fast and, if the database exists on disk rather than in + # memory, it doesn't stress memory. + dml_no_rows_exec(db, " + CREATE TEMP table contrast_log2_fc + AS + SELECT * + FROM v_contrast_log2_fc + ORDER BY contrast, phosphopep + ; + " + ) + dml_no_rows_exec(db, " + CREATE TEMP table ppep_p_val + AS + SELECT p_value_raw, + p_value_adj, + contrast AS p_val_contrast, + phosphopep AS p_val_ppep + FROM contrast_ppep_p_val + ORDER BY contrast, phosphopep + ; + " + ) + dml_no_rows_exec(db, " + DROP TABLE IF EXISTS contrast_log2_fc_p_val + ; + " + ) + dml_no_rows_exec(db, " + CREATE TABLE contrast_log2_fc_p_val + AS + SELECT a.*, + b.p_value_raw, + b.p_value_adj, + b.p_val_contrast, + b.p_val_ppep + FROM contrast_log2_fc a, ppep_p_val b + WHERE a.rowid = b.rowid + AND a.phosphopep = b.p_val_ppep + ; + " + ) + # Create UK + ddl_exec(db, " + CREATE UNIQUE INDEX IF NOT EXISTS contrast_log2_fc_p_val__uk__idx + ON contrast_log2_fc_p_val(phosphopep, contrast); + " + ) + # Create indices for future queries + ddl_exec(db, " + CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__contrast__idx + ON contrast_log2_fc_p_val(contrast); + " + ) + ddl_exec(db, " + CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__phosphopep__idx + ON contrast_log2_fc_p_val(phosphopep); + " + ) + ddl_exec(db, " + CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__p_value_raw__idx + ON contrast_log2_fc_p_val(p_value_raw); + " + ) + ddl_exec(db, " + CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__p_value_adj__idx + ON contrast_log2_fc_p_val(p_value_adj); + " + ) + dml_no_rows_exec(db, " + DROP VIEW IF EXISTS v_contrast_log2_fc_p_val + ; + " + ) + dml_no_rows_exec(db, " + CREATE VIEW v_contrast_log2_fc_p_val + AS + SELECT contrast, + a_level, + a_samples, + b_level, + b_samples, + a_quant, + b_quant, + fc_description, + log2_fc, + p_value_raw, + p_value_adj, + phosphopep + FROM contrast_log2_fc_p_val + ORDER BY contrast, phosphopep + ; + " + ) + ddl_exec(db, " + DROP TABLE IF EXISTS kseaapp_metadata + ; + " + ) + dml_no_rows_exec(db, " + CREATE TABLE kseaapp_metadata + AS + WITH extended(deppep, ppep, gene_name, uniprot_id, phosphoresidue) AS ( + SELECT DISTINCT + deppep.seq, + ppep.seq, + GeneName||';', + UniProtID||';', + PhosphoResidue||';' + FROM + ppep, deppep, mrgfltr_metadata + WHERE + mrgfltr_metadata.ppep_id = ppep.id + AND + ppep.deppep_id = deppep.id + ) + SELECT + ppep AS `ppep`, + SUBSTR(uniprot_id, 1, INSTR(uniprot_id,';') - 1 ) AS `Protein`, + SUBSTR(gene_name, 1, INSTR(gene_name,';') - 1 ) AS `Gene`, + deppep AS `Peptide`, + REPLACE( + REPLACE( + SUBSTR(phosphoresidue, 1, INSTR(phosphoresidue,';') - 1 ), + 'p', + '' + ), + ', ', + ';' + ) AS `Residue.Both` + FROM extended + ; + " + ) + # Create indexes for join + ddl_exec(db, " + CREATE INDEX IF NOT EXISTS kseaapp_metadata__ppep__idx + ON kseaapp_metadata(ppep); + " + ) + ddl_exec(db, " + DROP VIEW IF EXISTS v_kseaapp_contrast + ; + " + ) + dml_no_rows_exec(db, " + CREATE VIEW v_kseaapp_contrast + AS + SELECT a.*, b.Protein, b.Gene, b.Peptide, b.`Residue.Both` + FROM v_contrast_log2_fc_p_val a, kseaapp_metadata b + WHERE b.ppep = a.phosphopep + ; + " + ) + ddl_exec(db, " + DROP VIEW IF EXISTS v_kseaapp_input + ; + " + ) + dml_no_rows_exec(db, " + CREATE VIEW v_kseaapp_input + AS + SELECT v.contrast, + v.phosphopep, + m.`Protein`, + m.`Gene`, + m.`Peptide`, + m.`Residue.Both`, + v.p_value_raw AS `p`, + v.log2_fc AS `FC` + FROM kseaapp_metadata AS m, + v_contrast_log2_fc_p_val AS v + WHERE m.ppep = v.phosphopep + AND NOT m.`Gene` = 'No_Gene_Name' + AND NOT v.log2_fc = 0 + ; + " + ) +} +``` + +```{r echo = FALSE, results = 'asis'} +cat("\\newpage\n") +``` + +# KSEA Analysis + +Results of Kinase-Substrate Enrichment Analysis are presented here, if the substrates for any kinases are relatively enriched. Enrichments are found by the CRAN `KSEAapp` package: + +- The package is available on CRAN, at https:/cran.r-project.org/package=KSEAapp +- The method used is described in Casado et al. (2013) [doi:10.1126/scisignal.2003573](https:/doi.org/10.1126/scisignal.2003573) and Wiredja et al (2017) [doi:10.1093/bioinformatics/btx415](https:/doi.org/10.1093/bioinformatics/btx415). +- An online alternative (supporting only analysis of human data) is available at [https:/casecpb.shinyapps.io/ksea/](https:/casecpb.shinyapps.io/ksea/). + +For each kinase, $i$, and each two-way contrast of treatments, $j$, an enrichment $z$-score is computed as: + +$$ +\text{kinase enrichment score}_{j,i} = \frac{(\overline{s}_{j,i} - \overline{p}_j)\sqrt{m_{j,i}}}{\delta_j} +$$ + +and fold-enrichment is computed as: + +$$ +\text{Enrichment}_{j,i} = \frac{\overline{s}_{j,i}}{\overline{p}_j} +$$ + +where: + +- $\overline{s}_{j,i}$ is the mean $\log_2 (|\text{fold-change|})$ in intensities (for contrast $j$) of known substrates of the kinase $i$, +- $\overline{p}_j$ is the mean $\log_2 (|\text{fold-change}|)$ of all phosphosites identified in contrast $j$, and +- $m_{j,i}$ is the total number of phosphosite substrates of kinase $i$ identified in contrast $j$, +- $\delta_j$ is the standard deviation of the $\log_2 (|\text{fold-change}|)$ for contrast $j$ across all phosphosites in the dataset. +- Note that the absolute value of fold-change is used so that both increased and decreased substrates of a kinase will contribute to its enrichment score. + +$\text{FDR}_{j,i}$ is computed from the $p$-value for the z-score using the R `stats::p.adjust` function, applying the False Discovery Rate correction from Benjamini and Hochberg (1995) [doi:10.1111/j.2517-6161.1995.tb02031.x](https:/doi.org/10.1111/j.2517-6161.1995.tb02031.x) + +Color intensity in heatmaps reflects magnitude of $z$-score for enrichment of respective kinase in respective contrast; hue reflects the sign of the $z$-score (blue, negative; red, positive). + +Asterisks in heatmaps reflect enrichments that are significant at `r ksea_cutoff_statistic` < `r ksea_cutoff_threshold`. + +- Kinase names are generally as presented at Phospho.ELM [http://phospho.elm.eu.org/kinases.html](http://phospho.elm.eu.org/kinases.html) (when available), although Phospho.ELM data are not yet incorporated into this analysis. +- Kinase names having the suffix '(HPRD)' are as presented at [http://hprd.org/serine_motifs](http://hprd.org/serine_motifs) and [http://hprd.org/tyrosine_motifs](http://hprd.org/tyrosine_motifs) and are as originally reported in the Amanchy et al., 2007 (doi: [10.1038/nbt0307-285](https://doi.org/10.1038/nbt0307-285)). +- Kinase-strate deata were also taken from [http://networkin.science/download.shtml](http://networkin.science/download.shtml) and from PhosphoSitePlus [https://www.phosphosite.org/staticDownloads](https://www.phosphosite.org/staticDownloads). + +```{r ksea, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} + +db <- RSQLite::dbConnect(RSQLite::SQLite(), ksea_app_prep_db) + +# -- eliminate the table that's about to be defined +ddl_exec(db, " +DROP TABLE IF EXISTS site_metadata; +") + +# -- define the site_metadata table +ddl_exec(db, " +CREATE TABLE site_metadata( + id INTEGER PRIMARY KEY +, site_type_id INTEGER REFERENCES site_type(id) +, full TEXT UNIQUE ON CONFLICT IGNORE +, abbrev TEXT +, pattern TEXT +, motif TEXT +); +") + +# -- populate the table with initial values +ddl_exec(db, " +INSERT INTO site_metadata(full, abbrev, site_type_id) + SELECT DISTINCT kinase_map, kinase_map, site_type_id + FROM ppep_gene_site + ORDER BY kinase_map; +") + +# -- drop bogus KSData view if exists +ddl_exec(db, " +DROP VIEW IF EXISTS ks_data_v; +") + +# -- create view to serve as an impostor for KSEAapp::KSData +ddl_exec(db, " +CREATE VIEW IF NOT EXISTS ks_data_v +AS +SELECT + 'NA' AS KINASE, + 'NA' AS KIN_ACC_ID, + kinase_map AS GENE, + 'NA' AS KIN_ORGANISM, + 'NA' AS SUBSTRATE, + 0 AS SUB_GENE_ID, + 'NA' AS SUB_ACC_ID, + gene_names AS SUB_GENE, + 'NA' AS SUB_ORGANISM, + phospho_peptide AS SUB_MOD_RSD, + 0 AS SITE_GROUP_ID, + 'NA' AS 'SITE_7AA', + 2 AS networkin_score, + type_name AS Source +FROM ppep_gene_site_view; +") + +contrast_metadata_df <- + sqldf::sqldf("select * from contrast_lvl_lvl_metadata", connection = db) +rslt <- new_env() +rslt$score_list <- list() +rslt$name_list <- list() +rslt$longname_list <- list() + +ddl_exec(db, " + DROP TABLE IF EXISTS contrast_ksea_scores; + " +) + +next_index <- 0 +err_na_subscr_df_const <- + "missing values are not allowed in subscripted assignments of data frames" + +for (i_cntrst in seq_len(nrow(contrast_metadata_df))) { + cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"] + cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"] + cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6] + contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level) + contrast_longlabel <- ( + sprintf( + "Trt %s {%s} -> Trt %s {%s}", + contrast_metadata_df[i_cntrst, "b_level"], + gsub( + pattern = ";", + replacement = ", ", + x = contrast_metadata_df[i_cntrst, "b_samples"], + fixed = TRUE + ), + contrast_metadata_df[i_cntrst, "a_level"], + gsub( + pattern = ";", + replacement = ", ", + x = contrast_metadata_df[i_cntrst, "a_samples"], + fixed = TRUE + ) + ) + ) + kseaapp_input <- + sqldf::sqldf( + x = sprintf(" + SELECT `Protein`, `Gene`, `Peptide`, phosphopep AS `Residue.Both`, `p`, `FC` + FROM v_kseaapp_input + WHERE contrast = %d + ", + i_cntrst + ), + connection = db + ) + + pseudo_ksdata <- dbReadTable(db, "ks_data_v") + + # This hack is because SQL table has the log2-transformed values + kseaapp_input[, "FC"] <- 2 ** kseaapp_input[, "FC", drop = TRUE] + main_title <- ( + sprintf( + "Change from treatment %s to treatment %s", + contrast_metadata_df[i_cntrst, "b_level"], + contrast_metadata_df[i_cntrst, "a_level"] + ) + ) + sub_title <- contrast_longlabel + tryCatch( + expr = { + ksea_scores_rslt <- + ksea_scores( + ksdata = pseudo_ksdata, # KSEAapp::KSData, + px = kseaapp_input, + networkin = TRUE, + networkin_cutoff = 2 + ) + + if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) { + next_index <- 1 + next_index + rslt$score_list[[next_index]] <- ksea_scores_rslt + rslt$name_list[[next_index]] <- contrast_label + rslt$longname_list[[next_index]] <- contrast_longlabel + low_fdr_print( + rslt = rslt, + i_cntrst = i_cntrst, + i = next_index, + a_level = cntrst_a_level, + b_level = cntrst_b_level, + fold_change = cntrst_fold_change, + caption = contrast_longlabel + ) + } + }, + error = function(e) str(e) + ) +} + +plotted_kinases <- NULL +if (length(rslt$score_list) > 1) { + for (i in seq_len(length(ksea_heatmap_titles))) { + hdr <- ksea_heatmap_titles[[i]] + which_kinases <- i + + cat("\\clearpage\n\\begin{center}\n") + if (i == const_ksea_astrsk_kinases) { + subsection_header(hdr) + } else { + subsection_header(hdr) + } + cat("\\end{center}\n") + + plotted_kinases <- ksea_heatmap( + # the data frame outputs from the KSEA.Scores() function, in list format + score_list = rslt$score_list, + # a character vector of all the sample names for heatmap annotation: + # - the names must be in the same order as the data in score_list + # - please avoid long names, as they may get cropped in the final image + sample_labels = rslt$name_list, + # character string of either "p.value" or "FDR" indicating the data column + # to use for marking statistically significant scores + stats = c("p.value", "FDR")[2], + # a numeric value between 0 and infinity indicating the min. number of + # substrates a kinase must have to be included in the heatmap + m_cutoff = 1, + # a numeric value between 0 and 1 indicating the p-value/FDR cutoff + # for indicating significant kinases in the heatmap + p_cutoff = 0.05, + # a binary input of TRUE or FALSE, indicating whether or not to perform + # hierarchical clustering of the sample columns + sample_cluster = TRUE, + # a binary input of TRUE or FALSE, indicating whether or not to export + # the heatmap as a .png image into the working directory + export = FALSE, + # additional arguments to gplots::heatmap.2, such as: + # - main: main title of plot + # - xlab: x-axis label + # - ylab: y-axis label + xlab = "Contrast", + ylab = "Kinase", + # print which kinases: + # - 1 : all kinases + # - 2 : significant kinases + # - 3 : non-significant kinases + which_kinases = which_kinases + ) + cat("\\begin{center}\n") + cat("Color intensities reflects $z$-score magnitudes; hue reflects $z$-score sign. Asterisks reflect significance.\n") + cat("\\end{center}\n") + } # end for (i in ... +} # end if (length ... + +for (i_cntrst in seq_len(length(rslt$score_list))) { + next_index <- i_cntrst + cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"] + cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"] + cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6] + contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level) + contrast_longlabel <- ( + sprintf( + "Trt %s {%s} -> Trt %s {%s}", + contrast_metadata_df[i_cntrst, "b_level"], + gsub( + pattern = ";", + replacement = ", ", + x = contrast_metadata_df[i_cntrst, "b_samples"], + fixed = TRUE + ), + contrast_metadata_df[i_cntrst, "a_level"], + gsub( + pattern = ";", + replacement = ", ", + x = contrast_metadata_df[i_cntrst, "a_samples"], + fixed = TRUE + ) + ) + ) + main_title <- ( + sprintf( + "Change from treatment %s to treatment %s", + contrast_metadata_df[i_cntrst, "b_level"], + contrast_metadata_df[i_cntrst, "a_level"] + ) + ) + sub_title <- contrast_longlabel + tryCatch( + expr = { + ksea_scores_rslt <- rslt$score_list[[next_index]] + + if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) { + low_fdr_barplot( + rslt = rslt, + i_cntrst = i_cntrst, + i = next_index, + a_level = cntrst_a_level, + b_level = cntrst_b_level, + fold_change = cntrst_fold_change, + caption = contrast_longlabel + ) + } + }, + error = function(e) str(e) + ) +} +``` + +```{r enriched, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} + +# Use enriched kinases to find enriched kinase-substrate pairs +enriched_kinases <- data.frame(kinase = ls(ksea_asterisk_hash)) +all_enriched_substrates <- sqldf(" + SELECT + gene AS kinase, + ppep, + '('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label + FROM ( + SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep + FROM pseudo_ksdata + WHERE GENE IN (SELECT kinase FROM enriched_kinases) + ) + GROUP BY ppep + ") + +# helper used to label per-kinase substrate enrichment figure +cat_enriched_heading <- function(m, cut_args) { + cutoff <- cut_args$cutoff + kinase <- cut_args$kinase + statistic <- cut_args$statistic + threshold <- cut_args$threshold + cat("\\newpage\n") + if (nrow(m) > intensity_hm_rows) { + subsection_header( + paste( + sprintf( + "Lowest p-valued %d (of %d) enriched %s-substrates,", + intensity_hm_rows, + nrow(m), + kinase + ), + sprintf(" KSEA %s < %0.2f\n", statistic, threshold) + ) + ) + } else { + if (nrow(m) == 1) { + return(FALSE) + } else { + subsection_header( + paste( + sprintf( + "%d enriched %s-substrates,", + nrow(m), + kinase + ), + sprintf( + " KSEA %s < %0.2f\n", + statistic, + threshold + ) + ) + ) + } + } + cat("\n\n\n") + cat("\n\n\n") + return(TRUE) +} + +# Disabling heatmaps for substrates pending decision whether to eliminate them altogether +if (FALSE) + for (kinase_name in sort(enriched_kinases$kinase)) { + enriched_substrates <- + all_enriched_substrates[ + all_enriched_substrates$kinase == kinase_name, + , + drop = FALSE + ] + # Get the intensity values for the heatmap + enriched_intensities <- + as.matrix(unimputed_quant_data_log[enriched_substrates$ppep, , drop = FALSE]) + # Remove rows having too many NA values to be relevant + na_counter <- is.na(enriched_intensities) + na_counts <- apply(na_counter, 1, sum) + enriched_intensities <- + enriched_intensities[na_counts < ncol(enriched_intensities) / 2, , drop = FALSE] + # Rename the rows with the display-name for the heatmap + rownames(enriched_intensities) <- + sapply( + X = rownames(enriched_intensities), + FUN = function(rn) { + enriched_substrates[enriched_substrates$ppep == rn, "label"] + } + ) + # Format as matrix for heatmap + m <- as.matrix(enriched_intensities) + # Draw the heading and heatmap + if (nrow(m) > 0) { + cut_args <- new_env() + cut_args$cutoff <- cutoff + cut_args$kinase <- kinase_name + cut_args$statistic <- ksea_cutoff_statistic + cut_args$threshold <- ksea_cutoff_threshold + number_of_peptides_found <- + draw_intensity_heatmap( + m = m, + cutoff = cut_args, + hm_heading_function = cat_enriched_heading, + hm_main_title + = "Unnormalized (zero-imputed) intensities of enriched kinase-substrates", + suppress_row_dendrogram = FALSE + ) + } + } + +# Write output tabular files + +# get kinase, ppep, concat(kinase) tuples for enriched kinases + +kinase_ppep_label <- sqldf(" + WITH + t(ppep, label) AS + ( + SELECT DISTINCT + SUB_MOD_RSD AS ppep, + group_concat(gene, '; ') AS label + FROM pseudo_ksdata + WHERE GENE IN (SELECT kinase FROM enriched_kinases) + GROUP BY ppep + ), + k(kinase, ppep_join) AS + ( + SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep_join + FROM pseudo_ksdata + WHERE GENE IN (SELECT kinase FROM enriched_kinases) + ) + SELECT k.kinase, t.ppep, t.label + FROM t, k + WHERE t.ppep = k.ppep_join + ORDER BY k.kinase, t.ppep + ") + +# extract what we need from full_data +impish <- cbind(rownames(quant_data_imp), quant_data_imp) +colnames(impish)[1] <- "Phosphopeptide" +data_table_imputed_sql <- " + SELECT + f.*, + k.label AS KSEA_enrichments, + q.* + FROM + metadata_plus_p f + LEFT JOIN kinase_ppep_label k + ON f.Phosphopeptide = k.ppep, + impish q + WHERE + f.Phosphopeptide = q.Phosphopeptide + " +data_table_imputed <- sqldf(data_table_imputed_sql) +# Zap the duplicated 'Phosphopeptide' column named 'ppep' +data_table_imputed <- + data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))] + +# Output with imputed, un-normalized data + +write.table( + data_table_imputed + , file = imputed_data_filename + , sep = "\t" + , col.names = TRUE + , row.names = FALSE + , quote = FALSE + ) + + +#output quantile normalized data +impish <- cbind(rownames(quant_data_imp_qn_log), quant_data_imp_qn_log) +colnames(impish)[1] <- "Phosphopeptide" +data_table_imputed <- sqldf(data_table_imputed_sql) +# Zap the duplicated 'Phosphopeptide' column named 'ppep' +data_table_imputed <- + data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))] +write.table( + data_table_imputed, + file = imp_qn_lt_data_filenm, + sep = "\t", + col.names = TRUE, + row.names = FALSE, + quote = FALSE +) + +ppep_kinase <- sqldf(" + SELECT DISTINCT k.ppep, k.kinase + FROM ( + SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep + FROM pseudo_ksdata + WHERE GENE IN (SELECT kinase FROM enriched_kinases) + ) k + ORDER BY k.ppep, k.kinase + ") + +RSQLite::dbWriteTable( + conn = db, + name = "ksea_enriched_ks", + value = ppep_kinase, + append = FALSE + ) + +RSQLite::dbWriteTable( + conn = db, + name = "anova_signif", + value = p_value_data, + append = FALSE + ) + + ddl_exec(db, " + DROP VIEW IF EXISTS stats_metadata_v; + " + ) + dml_no_rows_exec(db, " + CREATE VIEW stats_metadata_v + AS + SELECT DISTINCT m.*, + p.raw_anova_p, + p.fdr_adjusted_anova_p, + kek.kinase AS ksea_enrichments + FROM + mrgfltr_metadata_view m + LEFT JOIN anova_signif p + ON m.phospho_peptide = p.phosphopeptide + LEFT JOIN ksea_enriched_ks kek + ON m.phospho_peptide = kek.ppep + ; + " + ) + +write.table( + dbReadTable(db, "stats_metadata_v"), + file = anova_ksea_mtdt_file, + sep = "\t", + col.names = TRUE, + row.names = FALSE, + quote = FALSE + ) + + +``` + +```{r parmlist, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} +cat("\\leavevmode\n\n\n") + +# write parameters to report + +param_unlist <- unlist(as.list(params)) +param_df <- data.frame( + parameter = paste0("\\verb@", names(param_unlist), "@"), + value = paste0("\\verb@", gsub("$", "\\$", param_unlist, fixed = TRUE), "@") + ) + +data_frame_latex( + x = param_df, + justification = "p{0.35\\linewidth} p{0.6\\linewidth}", + centered = TRUE, + caption = "Input parameters", + anchor = const_table_anchor_bp, + underscore_whack = FALSE + ) + +# write parameters to SQLite output + +mqppep_anova_script_param_df <- data.frame( + script = "mqppep_anova_script.Rmd", + parameter = names(param_unlist), + value = param_unlist + ) +ddl_exec(db, " + DROP TABLE IF EXISTS script_parameter; + " +) +ddl_exec(db, " + CREATE TABLE IF NOT EXISTS script_parameter( + script TEXT, + parameter TEXT, + value ANY, + UNIQUE (script, parameter) ON CONFLICT REPLACE + ) + ; + " +) +RSQLite::dbWriteTable( + conn = db, + name = "script_parameter", + value = mqppep_anova_script_param_df, + append = TRUE +) + +# We are done with output +RSQLite::dbDisconnect(db) +``` +<!-- +There's gotta be a better way... + +loaded_packages_df <- sessioninfo::package_info("loaded") +loaded_packages_df[, "library"] <- as.character(loaded_packages_df$library) +loaded_packages_df <- data.frame( + package = loaded_packages_df$package, + version = loaded_packages_df$loadedversion, + date = loaded_packages_df$date + ) +data_frame_latex( + x = loaded_packages_df, + justification = "l | l l", + centered = FALSE, + caption = "Loaded R packages", + anchor = const_table_anchor_bp + ) +-->
--- a/test-data/test_input_for_anova.tabular Wed Apr 13 19:48:32 2022 +0000 +++ b/test-data/test_input_for_anova.tabular Thu Jun 30 16:16:32 2022 +0000 @@ -1,23 +1,24 @@ Phosphopeptide Sequence10 Sequence7 Gene_Name Phosphoresidue UniProt_ID Description Function Phosphoresidue(PSP=PhosphoSitePlus.org) Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/Phosphatases/Binding Domains Intensity.shL.1A Intensity.shL.1B Intensity.shL.1C Intensity.shR.2A Intensity.shR.2B Intensity.shR.2C -AAAAPDSRVpSEEENLK MAAAAPDSRVpSEEENLKKTPK AAPDSRVsEEENLKK RRP15 pS11 Q9Y3B9 RRP15_HUMAN RRP15-like protein OS=Homo sapiens OX=9606 GN=RRP15 PE=1 SV=2 N/A CK2alpha | Casein kinase II substrate | G protein-coupled receptor kinase 1 substrate | PKC kinase substrate | PKA kinase substrate | BARD1 BRCT domain binding | PKA | CK1 | CK2 38150000 39445000 56305000 55338000 7010600 70203000 +AAAAPDSRVpSEEENLK MAAAAPDSRVpSEEENLKKTPK AAPDSRVsEEENLKK RRP15 pS11 Q9Y3B9 RRP15_HUMAN RRP15-like protein OS=Homo sapiens OX=9606 GN=RRP15 PE=1 SV=2 N/A CK2alpha | BARD1 Q99728 38150000 39445000 56305000 55338000 7010600 70203000 AAAITDMADLEELSRLpSPLPPGpSPGSAAR MADLEELSRLpSPLPPGSPGSA; LSRLSPLPPGpSPGSAARGRAE LEELSRLsPLPPGSP | LSPLPPGsPGSAARG AEBP2; AEBP2 pS18, pS24; pS18, pS24 Q6ZN18; Q6ZN18-2 AEBP2_HUMAN Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2 PE=1 SV=2; AEBP2_HUMAN Isoform 2 of Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2 N/A N/A 5416400 7101800 385280000 208060000 41426000 352400000 ADALQAGASQFETpSAAK LQAGASQFETpSAAKLKRKYWW GASQFETsAAKLKRK VAMP2; VAMP3 pS80; pS63 P63027; Q15836 VAMP2_HUMAN_Vesicle-associated membrane protein 2 OS=Homo sapiens OX=9606 GN=VAMP2 PE=1 SV=3; VAMP3_HUMAN_Vesicle-associated membrane protein 3 OS=Homo sapiens OX=9606 GN=VAMP3 PE=1 SV=3 N/A PKD3 | PKCiota 44627000 41445000 69094000 42521000 5738000 61819000 -DQKLpSELDDR DKVLERDQKLpSELDDRADALQ LERDQKLsELDDRAD VAMP1; VAMP1; VAMP1; VAMP2; VAMP3 pS63; pS63; pS63; pS61; pS44 P23763; P23763-2; P23763-3; P63027; Q15836 VAMP1_HUMAN_Vesicle-associated membrane protein 1 OS=Homo sapiens OX=9606 GN=VAMP1 PE=1 SV=1; VAMP1_HUMAN_Isoform 3 of Vesicle-associated membrane protein 1 OS=Homo sapiens OX=9606 GN=VAMP1; VAMP1_HUMAN_Isoform 2 of Vesicle-associated membrane protein 1 OS=Homo sapiens OX=9606 GN=VAMP1; VAMP2_HUMAN_Vesicle-associated membrane protein 2 OS=Homo sapiens OX=9606 GN=VAMP2 PE=1 SV=3; VAMP3_HUMAN_Vesicle-associated membrane protein 3 OS=Homo sapiens OX=9606 GN=VAMP3 PE=1 SV=3 N/A CK2alpha | PKAbeta | PKAgamma | PKCiota | Casein kinase II substrate | G protein-coupled receptor kinase 1 substrate | PKC kinase substrate | PKA kinase substrate | Pyruvate dehydrogenase kinase substrate 75542000 44814000 32924000 35016000 11023000 4669900 -EFVpSSDESSSGENK SESFKSKEFVpSSDESSSGENK FKSKEFVsSDESSSG SSRP1 pS667 Q08945 SSRP1_HUMAN FACT complex subunit SSRP1 OS=Homo sapiens OX=9606 GN=SSRP1 PE=1 SV=1 N/A CK2alpha | CK2a2 | CDK7 | Casein kinase II substrate | G protein-coupled receptor kinase 1 substrate | Casein Kinase I substrate | CK2 | GSK3 12562000 16302000 23000000 7857800 0 18830000 -EGMNPSYDEYADpSDEDQHDAYLER MNPSYDEYADpSDEDQHDAYLE SYDEYADsDEDQHDA SSRP1 pS444 Q08945 SSRP1_HUMAN FACT complex subunit SSRP1 OS=Homo sapiens OX=9606 GN=SSRP1 PE=1 SV=1 N/A CK2alpha | CK2a2 | CDK7 | CK1alpha | Casein kinase II substrate | b-Adrenergic Receptor kinase substrate | Pyruvate dehydrogenase kinase substrate 0 0 0 0 0 0 -IGNEEpSDLEEACILPHpSPINVDK DDEEKIGNEEpSDLEEACILPH; DLEEACILPHpSPINVDKRPIA EKIGNEEsDLEEACI | EACILPHsPINVDKR HERC2 pS1577, pS1588 O95714 HERC2_HUMAN E3 ubiquitin-protein ligase HERC2 OS=Homo sapiens OX=9606 GN=HERC2 PE=1 SV=2 N/A CK2alpha | Casein kinase II substrate | ERK1, ERK2 Kinase substrate | GSK-3, ERK1, ERK2, CDK5 substrate | b-Adrenergic Receptor kinase substrate | WW domain binding | ERK/MAPK | CK2 | NEK6 167764000 121218000 155736000 140640000 83642000 128468000 -IRAEEEDLAAVPFLApSDNEEEEDEK EDLAAVPFLApSDNEEEEDEKG AAVPFLAsDNEEEED HERC2 pS2928 O95714 HERC2_HUMAN E3 ubiquitin-protein ligase HERC2 OS=Homo sapiens OX=9606 GN=HERC2 PE=1 SV=2 N/A CK2alpha | Casein kinase II substrate | CK2 22562000 18225000 9119700 11689000 0 0 +DQKLpSELDDR DKVLERDQKLpSELDDRADALQ LERDQKLsELDDRAD VAMP1; VAMP1; VAMP1; VAMP2; VAMP3 pS63; pS63; pS63; pS61; pS44 P23763; P23763-2; P23763-3; P63027; Q15836 VAMP1_HUMAN_Vesicle-associated membrane protein 1 OS=Homo sapiens OX=9606 GN=VAMP1 PE=1 SV=1; VAMP1_HUMAN_Isoform 3 of Vesicle-associated membrane protein 1 OS=Homo sapiens OX=9606 GN=VAMP1; VAMP1_HUMAN_Isoform 2 of Vesicle-associated membrane protein 1 OS=Homo sapiens OX=9606 GN=VAMP1; VAMP2_HUMAN_Vesicle-associated membrane protein 2 OS=Homo sapiens OX=9606 GN=VAMP2 PE=1 SV=3; VAMP3_HUMAN_Vesicle-associated membrane protein 3 OS=Homo sapiens OX=9606 GN=VAMP3 PE=1 SV=3 N/A CK2alpha | PKAbeta | PKAgamma | PKCiota | PDHK1 75542000 44814000 32924000 35016000 11023000 4669900 +EFVpSSDESSSGENK SESFKSKEFVpSSDESSSGENK FKSKEFVsSDESSSG SSRP1 pS667 Q08945 SSRP1_HUMAN FACT complex subunit SSRP1 OS=Homo sapiens OX=9606 GN=SSRP1 PE=1 SV=1 N/A CK2alpha | CK2a2 | CDK7 | GSK3 12562000 16302000 23000000 7857800 0 18830000 +EGMNPSYDEYADpSDEDQHDAYLER MNPSYDEYADpSDEDQHDAYLE SYDEYADsDEDQHDA SSRP1 pS444 Q08945 SSRP1_HUMAN FACT complex subunit SSRP1 OS=Homo sapiens OX=9606 GN=SSRP1 PE=1 SV=1 N/A CK2alpha | CK2a2 | CDK7 | CK1alpha | GRK-2 | PDHK1 0 0 0 0 0 0 +IGNEEpSDLEEACILPHpSPINVDK DDEEKIGNEEpSDLEEACILPH; DLEEACILPHpSPINVDKRPIA EKIGNEEsDLEEACI | EACILPHsPINVDKR HERC2 pS1577, pS1588 O95714 HERC2_HUMAN E3 ubiquitin-protein ligase HERC2 OS=Homo sapiens OX=9606 GN=HERC2 PE=1 SV=2 N/A CK2alpha | GRK-2 | DOC_WW_Pin1_4 | NEK6 167764000 121218000 155736000 140640000 83642000 128468000 +IRAEEEDLAAVPFLApSDNEEEEDEK EDLAAVPFLApSDNEEEEDEKG AAVPFLAsDNEEEED HERC2 pS2928 O95714 HERC2_HUMAN E3 ubiquitin-protein ligase HERC2 OS=Homo sapiens OX=9606 GN=HERC2 PE=1 SV=2 N/A CK2alpha 22562000 18225000 9119700 11689000 0 0 KGLLApTpSGNDGTIR VWCNKKGLLApTSGNDGTIRVW; WCNKKGLLATpSGNDGTIRVWN NKKGLLAtSGNDGTI | KKGLLATsGNDGTIR HERC1 pT3445, pS3446 Q15751 HERC1_HUMAN Probable E3 ubiquitin-protein ligase HERC1 OS=Homo sapiens OX=9606 GN=HERC1 PE=1 SV=2 N/A N/A 7843600 0 241700000 0 0 10042600 -KpSSLVTSK PTPQDLPQRKpSSLVTSKLAGG; PTPQDLPQRKpSSLVTSKLAG QDLPQRKsSLVTSKL ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA pS108; pS108; pS124; pS131; pS104; pS104; pS120; pS124 O43768; O43768-2; O43768-3; O43768-4; O43768-5; O43768-6; O43768-7; O43768-9 ENSA_HUMAN Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA PE=1 SV=1; ENSA_HUMAN Isoform 2 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 3 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 4 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 5 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 6 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 7 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 9 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA N/A G protein-coupled receptor kinase 1 substrate 0 0 18629000 0 0 0 -KSpSLVTSK TPQDLPQRKSpSLVTSKLAGGQ; TPQDLPQRKSpSLVTSKLAG DLPQRKSsLVTSKLA ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA pS109; pS109; pS125; pS132; pS105; pS105; pS121; pS125 O43768; O43768-2; O43768-3; O43768-4; O43768-5; O43768-6; O43768-7; O43768-9 ENSA_HUMAN Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA PE=1 SV=1; ENSA_HUMAN Isoform 2 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 3 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 4 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 5 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 6 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 7 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 9 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA molecular association, regulation; protein conformation; SNCA(DISRUPTS) G protein-coupled receptor kinase 1 substrate | PKC kinase substrate | PKA kinase substrate | Casein Kinase I substrate | MDC1 BRCT domain binding | GSK3 | AURORA 7090300 8341200 9691500 10030000 1675200 9952100 -LpSPNPWQEK MLAVDIEDRLpSPNPWQEKREI VDIEDRLsPNPWQEK HERC2 pS3462 O95714 HERC2_HUMAN E3 ubiquitin-protein ligase HERC2 OS=Homo sapiens OX=9606 GN=HERC2 PE=1 SV=2 N/A ERK1, ERK2 Kinase substrate | GSK-3, ERK1, ERK2, CDK5 substrate | WW domain binding 0 11706000 12495000 0 7273000 8877800 -NLLEDDpSDEEEDFFLR SERRNLLEDDpSDEEEDFFLRG RNLLEDDsDEEEDFF VAMP4 pS30 O75379 VAMP4_HUMAN_Vesicle-associated membrane protein 4 OS=Homo sapiens OX=9606 GN=VAMP4 PE=1 SV=2 N/A CK2alpha | Casein kinase II substrate | Casein Kinase I substrate | b-Adrenergic Receptor kinase substrate | BARD1 BRCT domain binding | CK2 | Csnk2a1 1592100000 973800000 1011600000 1450300000 631970000 878760000 -pSQKQEEENPAEETGEEK MpSQKQEEENPAE ______MsQKQEEEN ENSA; ENSA; ENSA; ENSA; ENSA; ENSA pS2; pS2; pS2; pS2; pS2; pS2 O43768; O43768-2; O43768-3; O43768-4; O43768-8; O43768-9 ENSA_HUMAN Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA PE=1 SV=1; ENSA_HUMAN Isoform 2 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 3 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 4 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 8 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 9 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA N/A ATM kinase substrate | PKC kinase substrate | PKA kinase substrate 0 0 8765300 0 2355900 14706000 -QLSEpSFK SKSSSRQLSEpSFKSKEFVSSD SSRQLSEsFKSKEFV SSRP1 pS659 Q08945 SSRP1_HUMAN FACT complex subunit SSRP1 OS=Homo sapiens OX=9606 GN=SSRP1 PE=1 SV=1 N/A CK2a2 | CDK7 | PKCalpha | PKCbeta | DNAPK | PKC kinase substrate | PKA kinase substrate | NEK6 68201000 87774000 138300000 95357000 19966000 149110000 -RGpSLEMSSDGEPLSR SSATSGGRRGpSLEMSSDGEPL TSGGRRGsLEMSSDG AEBP2; AEBP2 pS206; pS206 Q6ZN18; Q6ZN18-2 AEBP2_HUMAN Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2 PE=1 SV=2; AEBP2_HUMAN Isoform 2 of Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2 N/A Casein Kinase II substrate | G protein-coupled receptor kinase 1 substrate | PKC kinase substrate | PKA kinase substrate | PKA | GSK3 | AURORA 19262000 11103000 19454000 0 1816900 22028000 -SDGpSLEDGDDVHR IEDGGARSDGpSLEDGDDVHRA GGARSDGsLEDGDDV SERINC1 pS364 Q9NRX5 SERC1_HUMAN Serine incorporator 1 OS=Homo sapiens OX=9606 GN=SERINC1 PE=1 SV=1 N/A Casein kinase II substrate | Plk1 kinase substrate | Pyruvate dehydrogenase kinase substrate | CK1 | PLK | PLK1 31407000 17665000 20892000 23194000 5132400 54893000 -SEpSLTAESR EGGGLMTRSEpSLTAESRLVHT GLMTRSEsLTAESRL HERC1 pS1491 Q15751 HERC1_HUMAN Probable E3 ubiquitin-protein ligase HERC1 OS=Homo sapiens OX=9606 GN=HERC1 PE=1 SV=2 N/A b-Adrenergic Receptor kinase substrate 11766000 13176000 20540000 16963000 4364700 21308000 -STGPTAATGpSNRR MSTGPTAATGpSNRRLQQTQNQ GPTAATGsNRRLQQT VAMP3 pS11 Q15836 VAMP3_HUMAN_Vesicle-associated membrane protein 3 OS=Homo sapiens OX=9606 GN=VAMP3 PE=1 SV=3 N/A PKCalpha | PKCbeta | PKCzeta | PKC kinase substrate | PKA kinase substrate 3057100 4718800 12052000 5047700 1070900 8333500 -TEDLEATpSEHFK RNKTEDLEATpSEHFKTTSQKV TEDLEATsEHFKTTS VAMP8 pS55 Q9BV40 VAMP8_HUMAN_Vesicle-associated membrane protein 8 OS=Homo sapiens OX=9606 GN=VAMP8 PE=1 SV=1 activity, inhibited; abolish function in SNARE complex during mast cell secretion, reduces in vitro ensemble vesicle fusion G protein-coupled receptor kinase 1 substrate | Casein Kinase I substrate 20400000 9738500 7862300 0 0 76518000 -TFWpSPELK SSMNSIKTFWpSPELKKERVLR NSIKTFWsPELKKER ERC2 pS187 O15083 ERC2_HUMAN ERC protein 2 OS=Homo sapiens OX=9606 GN=ERC2 PE=1 SV=3 N/A IKKalpha | IKKbeta | HIPK2 | Casein Kinase II substrate | ERK1, ERK2 Kinase substrate | GSK-3, ERK1, ERK2, CDK5 substrate | WW domain binding 29764000 20957000 24855000 30752000 8304800 23771000 -YFDpSGDYNMAK CADEMQKYFDpSGDYNMAKAKM; RLQKGQKYFDpSGDYNMAKAKM; MKSVEQKYFDpSGDYNMAKAKM EMQKYFDsGDYNMAK | KGQKYFDsGDYNMAK | VEQKYFDsGDYNMAK ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA pS67; pS67; pS83; pS90; pS63; pS63; pS79; pS83 O43768; O43768-2; O43768-3; O43768-4; O43768-5; O43768-6; O43768-7; O43768-9 ENSA_HUMAN Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA PE=1 SV=1; ENSA_HUMAN Isoform 2 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 3 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 4 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 5 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 6 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 7 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 9 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA molecular association, regulation; cell cycle regulation; PPP2CA(INDUCES) b-Adrenergic Receptor kinase substrate 323250000 127970000 0 67123000 12790000 71378000 +KpSSLVTSK PTPQDLPQRKpSSLVTSKLAGG; PTPQDLPQRKpSSLVTSKLAG QDLPQRKsSLVTSKL ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA pS108; pS108; pS124; pS131; pS104; pS104; pS120; pS124 O43768; O43768-2; O43768-3; O43768-4; O43768-5; O43768-6; O43768-7; O43768-9 ENSA_HUMAN Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA PE=1 SV=1; ENSA_HUMAN Isoform 2 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 3 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 4 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 5 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 6 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 7 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 9 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA N/A N/A 0 0 18629000 0 0 0 +KSpSLVTSK TPQDLPQRKSpSLVTSKLAGGQ; TPQDLPQRKSpSLVTSKLAG DLPQRKSsLVTSKLA ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA pS109; pS109; pS125; pS132; pS105; pS105; pS121; pS125 O43768; O43768-2; O43768-3; O43768-4; O43768-5; O43768-6; O43768-7; O43768-9 ENSA_HUMAN Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA PE=1 SV=1; ENSA_HUMAN Isoform 2 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 3 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 4 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 5 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 6 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 7 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 9 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA molecular association, regulation; protein conformation; SNCA(DISRUPTS) MDC1 FHA | GSK3 | PLK1 PBD 7090300 8341200 9691500 10030000 1675200 9952100 +LpSPNPWQEK MLAVDIEDRLpSPNPWQEKREI VDIEDRLsPNPWQEK HERC2 pS3462 O95714 HERC2_HUMAN E3 ubiquitin-protein ligase HERC2 OS=Homo sapiens OX=9606 GN=HERC2 PE=1 SV=2 N/A DOC_WW_Pin1_4 0 11706000 12495000 0 7273000 8877800 +NLLEDDpSDEEEDFFLR SERRNLLEDDpSDEEEDFFLRG RNLLEDDsDEEEDFF VAMP4 pS30 O75379 VAMP4_HUMAN_Vesicle-associated membrane protein 4 OS=Homo sapiens OX=9606 GN=VAMP4 PE=1 SV=2 N/A CK2alpha | GRK-2 | BARD1 Q99728 | Csnk2a1 1592100000 973800000 1011600000 1450300000 631970000 878760000 +pSQKQEEENPAEETGEEK MpSQKQEEENPAE ______MsQKQEEEN ENSA; ENSA; ENSA; ENSA; ENSA; ENSA pS2; pS2; pS2; pS2; pS2; pS2 O43768; O43768-2; O43768-3; O43768-4; O43768-8; O43768-9 ENSA_HUMAN Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA PE=1 SV=1; ENSA_HUMAN Isoform 2 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 3 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 4 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 8 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 9 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA N/A N/A 0 0 8765300 0 2355900 14706000 +pTYVDPFTpYEDPNQAVR EEKHLNQGVRpTYVDPFTYEDP; GVRTYVDPFTpYEDPNQAVREF HLNQGVRtYVDPFTY | TYVDPFTyEDPNQAV EPHA4; EPHA4 pT595, pY602; pT544, pY551 P54764; P54764-2 EPHA4_HUMAN Ephrin type-A receptor 4 OS=Homo sapiens OX=9606 GN=EPHA4 PE=1 SV=1; EPHA4_HUMAN Isoform 2 of Ephrin type-A receptor 4 OS=Homo sapiens OX=9606 GN=EPHA4 N/A EPHA4 | EphA1 | EphA2 | EphA3 | EphA5 | EphA7 | EphA6 | Abl | EphA8 | Fgr | Yes | BLK | HCK | EphB6 | EphB3 725460 0 1651300 655850 646420 0 +QLSEpSFK SKSSSRQLSEpSFKSKEFVSSD SSRQLSEsFKSKEFV SSRP1 pS659 Q08945 SSRP1_HUMAN FACT complex subunit SSRP1 OS=Homo sapiens OX=9606 GN=SSRP1 PE=1 SV=1 N/A CK2a2 | CDK7 | PKCalpha | PKCbeta | DNAPK | NEK6 68201000 87774000 138300000 95357000 19966000 149110000 +RGpSLEMSSDGEPLSR SSATSGGRRGpSLEMSSDGEPL TSGGRRGsLEMSSDG AEBP2; AEBP2 pS206; pS206 Q6ZN18; Q6ZN18-2 AEBP2_HUMAN Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2 PE=1 SV=2; AEBP2_HUMAN Isoform 2 of Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2 N/A GSK3 19262000 11103000 19454000 0 1816900 22028000 +SDGpSLEDGDDVHR IEDGGARSDGpSLEDGDDVHRA GGARSDGsLEDGDDV SERINC1 pS364 Q9NRX5 SERC1_HUMAN Serine incorporator 1 OS=Homo sapiens OX=9606 GN=SERINC1 PE=1 SV=1 N/A PLK1 | PDHK1 31407000 17665000 20892000 23194000 5132400 54893000 +SEpSLTAESR EGGGLMTRSEpSLTAESRLVHT GLMTRSEsLTAESRL HERC1 pS1491 Q15751 HERC1_HUMAN Probable E3 ubiquitin-protein ligase HERC1 OS=Homo sapiens OX=9606 GN=HERC1 PE=1 SV=2 N/A GRK-2 11766000 13176000 20540000 16963000 4364700 21308000 +STGPTAATGpSNRR MSTGPTAATGpSNRRLQQTQNQ GPTAATGsNRRLQQT VAMP3 pS11 Q15836 VAMP3_HUMAN_Vesicle-associated membrane protein 3 OS=Homo sapiens OX=9606 GN=VAMP3 PE=1 SV=3 N/A PKCalpha | PKCbeta | PKCzeta 3057100 4718800 12052000 5047700 1070900 8333500 +TEDLEATpSEHFK RNKTEDLEATpSEHFKTTSQKV TEDLEATsEHFKTTS VAMP8 pS55 Q9BV40 VAMP8_HUMAN_Vesicle-associated membrane protein 8 OS=Homo sapiens OX=9606 GN=VAMP8 PE=1 SV=1 activity, inhibited; abolish function in SNARE complex during mast cell secretion, reduces in vitro ensemble vesicle fusion N/A 20400000 9738500 7862300 0 0 76518000 +TFWpSPELK SSMNSIKTFWpSPELKKERVLR NSIKTFWsPELKKER ERC2 pS187 O15083 ERC2_HUMAN ERC protein 2 OS=Homo sapiens OX=9606 GN=ERC2 PE=1 SV=3 N/A IKKalpha | IKKbeta | HIPK2 | DOC_WW_Pin1_4 29764000 20957000 24855000 30752000 8304800 23771000 +YFDpSGDYNMAK CADEMQKYFDpSGDYNMAKAKM; RLQKGQKYFDpSGDYNMAKAKM; MKSVEQKYFDpSGDYNMAKAKM EMQKYFDsGDYNMAK | KGQKYFDsGDYNMAK | VEQKYFDsGDYNMAK ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA pS67; pS67; pS83; pS90; pS63; pS63; pS79; pS83 O43768; O43768-2; O43768-3; O43768-4; O43768-5; O43768-6; O43768-7; O43768-9 ENSA_HUMAN Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA PE=1 SV=1; ENSA_HUMAN Isoform 2 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 3 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 4 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 5 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 6 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 7 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 9 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA molecular association, regulation; cell cycle regulation; PPP2CA(INDUCES) GRK-2 323250000 127970000 0 67123000 12790000 71378000
--- a/workflow/ppenrich_suite_wf.ga Wed Apr 13 19:48:32 2022 +0000 +++ b/workflow/ppenrich_suite_wf.ga Thu Jun 30 16:16:32 2022 +0000 @@ -390,7 +390,7 @@ }, "11": { "annotation": "Transform the output of MaxQuant for phosphoproteome-enriched samples to prepare it for statistical anlaysis.", - "content_id": "testtoolshed.g2.bx.psu.edu/repos/eschen42/mqppep_preproc/mqppep_preproc/0.1.9+galaxy0", + "content_id": "mqppep_preproc", "errors": null, "id": 11, "input_connections": { @@ -792,7 +792,7 @@ }, "13": { "annotation": "Perform ANOVA. For imputing missing values, create random values.", - "content_id": "testtoolshed.g2.bx.psu.edu/repos/eschen42/mqppep_anova/mqppep_anova/0.1.9+galaxy0", + "content_id": "mqppep_anova", "errors": null, "id": 13, "input_connections": {