Mercurial > repos > eschen42 > mqppep_anova
diff mqppep_anova_script.Rmd @ 24:8582a9797c18 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit c9e47049958ea3b12e30b9bd8884d48147c45edd
author | eschen42 |
---|---|
date | Thu, 14 Jul 2022 02:12:33 +0000 |
parents | 61adb8801b73 |
children | f9cd87ac8006 |
line wrap: on
line diff
--- a/mqppep_anova_script.Rmd Mon Jul 11 13:51:14 2022 +0000 +++ b/mqppep_anova_script.Rmd Thu Jul 14 02:12:33 2022 +0000 @@ -21,13 +21,13 @@ inputFile: "test-data/test_input_for_anova.tabular" preprocDb: "test-data/test_input_for_anova.sqlite" kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] + regexSampleNames: "\\.\\d+[A-Z]$" + regexSampleGrouping: "\\d+" show_toc: true firstDataColumn: "^Intensity[^_]" - imputationMethod: !r c("group-median", "median", "mean", "random")[1] + imputationMethod: !r c("group-median", "median", "mean", "random")[4] meanPercentile: 1 sdPercentile: 1.0 - regexSampleNames: "\\.\\d+[A-Z]$" - regexSampleGrouping: "\\d+" imputedDataFilename: "test-data/limbo/imputedDataFilename.txt" imputedQNLTDataFile: "test-data/limbo/imputedQNLTDataFile.txt" anovaKseaMetadata: "test-data/limbo/anovaKseaMetadata.txt" @@ -39,23 +39,36 @@ intensityHeatmapRows: 75 --- <!-- - kseaCutoffStatistic: !r c("p.value", "FDR")[2] - kseaCutoffThreshold: !r c(0.05, 0.1)[1] - alphaFile: "test-data/alpha_levels.tabular" inputFile: "test-data/test_input_for_anova.tabular" preprocDb: "test-data/test_input_for_anova.sqlite" kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] + regexSampleNames: "\\.\\d+[A-Z]$" + regexSampleGrouping: "\\d+" + + alphaFile: "test-data/alpha_levels.tabular" + inputFile: "test-data/PDX_pST_by_trt.ppep_intensities.ppep_map.preproc_tab.tabular" + preprocDb: "test-data/PDX_pST_by_trt.ppep_intensities.ppep_map.preproc_sqlite.sqlite" + kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] + regexSampleNames: "\\.\\w+\\.\\d+[A-Z]$" + regexSampleGrouping: "\\w+" + + kseaCutoffStatistic: !r c("p.value", "FDR")[2] + kseaCutoffThreshold: !r c(0.05, 0.1)[1] alphaFile: "test-data/alpha_levels.tabular" inputFile: "test-data/UT_phospho_ST_sites.preproc.tabular" preprocDb: "test-data/UT_phospho_ST_sites.preproc.sqlite" kseaAppPrepDb: !r c(":memory:", "test-data/UT_phospho_ST_sites.ksea.sqlite")[2] + regexSampleNames: "\\.\\d+[A-Z]$" + regexSampleGrouping: "\\d+" alphaFile: "test-data/alpha_levels.tabular" inputFile: "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.tabular" preprocDb: "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.sqlite" - kseaAppPrepDb: !r c(":memory:", "test-data/pST_Sites_NancyDu.ksea.sqlite")[2] + kseaAppPrepDb: !r c(":memory:", "test-data/pY_Sites_NancyDu.ksea.sqlite")[2] + regexSampleNames: "\\.\\d+[A-Z]$" + regexSampleGrouping: "\\d+" alphaFile: "test-data/alpha_levels.tabular" inputFile: "test-data/pST_Sites_NancyDu.txt.preproc.tabular" @@ -668,7 +681,7 @@ k <- k[selector < ksea_cutoff_threshold, ] - if (nrow(k) > 1) { + if (nrow(k) > 0) { op <- par(mai = c(1, 1.5, 0.4, 0.4)) numeric_z_score <- as.numeric(k$z_score) z_score_order <- order(numeric_z_score) @@ -687,13 +700,14 @@ border = NA, xpd = FALSE, cex.names = 1.0, - cex.axis = 1.0, main = long_caption, cex.main = my_cex_caption, names.arg = kinase_name[z_score_order], horiz = TRUE, srt = 45, - las = 1) + las = 1, + cex.axis = 0.9 + ) par(op) } } @@ -852,6 +866,8 @@ # create_breaks is a helper for ksea_heatmap create_breaks <- function(merged_scores) { + if (sum(!is.na(merged_scores)) < 2) + return(NULL) if (min(merged_scores, na.rm = TRUE) < -1.6) { breaks_neg <- seq(-1.6, 0, length.out = 30) breaks_neg <- @@ -909,39 +925,41 @@ ) ) } else if (nrow(x) < 2) { - cat("No plot because matrix x has ", nrow(x), " rows.\n\n") - cat("\\begin{verbatim}\n") - str(x) - cat("\\end{verbatim}\n") + cat("No plot because matrix has ", nrow(x), " rows.\n\n") + return(FALSE) } else if (ncol(x) < 2) { cat("No plot because matrix x has ", ncol(x), " columns.\n\n") - cat("\\begin{verbatim}\n") - str(x) - cat("\\end{verbatim}\n") + return(FALSE) } else { + my_limit <- 25 + my_cex_col <- my_limit / (my_limit + ncol(x)) + my_cex_row <- my_limit / (my_limit + nrow(x)) + my_scale <- 12.0 + if (ncol(x) < 10 && nrow(x) < 10) + my_scale <- my_scale * 10 / (10 - nrow(x)) * 10 / (10 - ncol(x)) gplots::heatmap.2( x = merged_scores, Colv = sample_cluster, - scale = "none", + breaks = color_breaks[[1]], cellnote = merged_asterisk, + cexCol = 0.9 * my_cex_col, + cexRow = 2 * my_cex_row, + col = color_breaks[[2]], + density.info = "none", + key = FALSE, + lhei = c(0.4, 8.0, 1.1), + lmat = rbind(c(0, 3), c(2, 1), c(0, 4)), + lwid = c(0.5, 3), + margins = margins, + notecex = my_scale * my_cex_row * my_cex_col, notecol = "white", - cexCol = 0.9, - # Heuristically assign size of row labels - cexRow = min(1.0, ((3 * my_cex_row) ^ 1.7) / 2.25), + scale = "none", srtCol = 45, srtRow = 45, - notecex = 3 * my_cex_row, - col = color_breaks[[2]], - density.info = "none", trace = "none", - breaks = color_breaks[[1]], - lmat = rbind(c(0, 3), c(2, 1), c(0, 4)), - lhei = c(0.4, 8.0, 1.1), - lwid = c(0.5, 3), - key = FALSE, - margins = margins, ... ) + return(TRUE) } } @@ -993,7 +1011,7 @@ master <- Reduce( f = function(...) { - base::merge(..., by = "Kinase.Gene", all = FALSE) + base::merge(..., by = "Kinase.Gene", all = TRUE) }, x = score_list_m ) @@ -1025,8 +1043,8 @@ names(asterisk_rows) <- all_rows non_asterisk_rows <- names(asterisk_rows[asterisk_rows == FALSE]) asterisk_rows <- names(asterisk_rows[asterisk_rows == TRUE]) - merged_scores_asterisk <- merged_scores[names(asterisk_rows), ] - merged_scores_non_asterisk <- merged_scores[names(non_asterisk_rows), ] + merged_scores_asterisk <- merged_scores[names(asterisk_rows), , drop = FALSE] + merged_scores_non_asterisk <- merged_scores[names(non_asterisk_rows), , drop = FALSE] # end hack to print only significant rows row_list <- list() @@ -1036,11 +1054,15 @@ i <- which_kinases my_row_names <- row_list[[i]] - scrs <- merged_scores[my_row_names, ] - stts <- merged_stats[my_row_names, ] + scrs <- merged_scores[my_row_names, , drop = FALSE] + stts <- merged_stats[my_row_names, , drop = FALSE] merged_asterisk <- as.matrix(asterisk(stts, p_cutoff)) color_breaks <- create_breaks(scrs) + if (is.null(color_breaks)) { + cat("No plot because matrix has too many missing values.\n\n") + return(NULL) + } plot_height <- nrow(scrs) ^ 0.55 plot_width <- ncol(scrs) ^ 0.7 my_cex_row <- 0.25 * 16 / plot_height @@ -1053,7 +1075,7 @@ pointsize = 14 ) } - draw_kseaapp_summary_heatmap( + did_draw <- draw_kseaapp_summary_heatmap( x = scrs, sample_cluster = sample_cluster, merged_asterisk = merged_asterisk, @@ -1064,12 +1086,14 @@ if (export == "TRUE") { dev.off() } + if (!did_draw) + return(NULL) return(my_row_names) } # helper for heatmaps of phosphopeptide intensities -draw_intensity_heatmap <- +draw_ppep_heatmap <- function( m, # matrix with rownames already formatted cutoff, # cutoff used by hm_heading_function @@ -1084,7 +1108,7 @@ # emit the heading for the heatmap if (hm_heading_function(m, cutoff)) { peptide_count <- min(max_peptide_count, nrow(m)) - if (nrow(m) > 1) { + if (nrow(m) > 0) { m_margin <- m[peptide_count:1, ] # Margin setting was heuristically derived margins <- @@ -1092,39 +1116,82 @@ max(80, sqrt(nchar(rownames(m_margin)))) * 5 / 16 # row ) } - if (nrow(m) > 1) { + if (nrow(m) > 0) { + hm_call <- NULL tryCatch( { old_oma <- par("oma") par(cex.main = 0.6) # Heuristically determined character size adjustment formula - char_contractor <- + my_cex_row <- 250000 / ( max(4500, (nchar(rownames(m_margin)))^2) * intensity_hm_rows ) - heatmap( - m[peptide_count:1, ], - Rowv = if (suppress_row_dendrogram) NA else NULL, - Colv = NA, - cexRow = char_contractor, - cexCol = char_contractor * 50 / max_peptide_count, - scale = "row", - margins = margins, - main = - "Unimputed, unnormalized log(intensities)", - xlab = "", - las = 1, - ... - ) + m_hm <- m[peptide_count:1, , drop = FALSE] + my_limit <- 60 + my_cex_col <- 0.75 * my_limit / (my_limit + ncol(m_hm)) + hm_call <- function(x, scaling, title) { + heatmap( + x, + Rowv = if (suppress_row_dendrogram) NA else NULL, + Colv = NA, + cexRow = my_cex_row, + cexCol = my_cex_col, + scale = scaling, + margins = margins, + main = title, + xlab = "", + las = 1, + ... + ) + } + if (sum(rowSums(!is.na(m_hm)) < 2)) + hm_call( + m_hm, + "none", + "log(intensities), unscaled, unimputed, and unnormalized" + ) + else + hm_call( + m_hm, + "row", + "log(intensities), row-scaled, unimputed, and unnormalized" + ) }, error = function(e) { - cat( - sprintf( - "\nCould not draw heatmap, possibly because of too many missing values. Internal message: %s\n", - e$message + if (!is.null(hm_call)) { + m_hm[is.na(m_hm)] <- 0 + tryCatch( + { + if (nrow(m_hm) > 1) + hm_call( + m_hm, + "none", + paste( + "log(intensities), unscaled, unimputed,", + "NAs zeroed, unnormalized" + ) + ) + else + cat("\nThere are too few peptides to produce a heatmap.\n") + }, + error = function(r) { + cat( + sprintf( + "\n%s %s Internal message: %s\n", + "Could not draw heatmap,", + "possibly because of too many missing values.", + r$message + ) + ) + } + ) + } else { + cat( + "\nCould not draw heatmap, possibly because of too many missing values.\n" ) - ) - }, + } + }, finally = par(old_oma) ) } @@ -1291,9 +1358,9 @@ ) ``` -# Extract Sample Names and Treatment Levels - -Column names parsed from input file are shown in Table 1; sample names and treatment levels, in Table 2. +# Extract Sample Classes and Names + +Column names parsed from input file are shown in Table 1; sample classes and names, in Table 2. ```{r echo = FALSE, results = 'asis'} @@ -1323,6 +1390,7 @@ column = seq_len(length(colnames(full_data))), name = paste0("\\verb@", colnames(full_data), "@") ) +cat("\n\\begin{tiny}\n") data_frame_latex( x = column_name_df, justification = "l l", @@ -1331,6 +1399,7 @@ anchor = const_table_anchor_bp, underscore_whack = FALSE ) +cat("\n\\end{tiny}\n") ``` @@ -1358,17 +1427,29 @@ sample_treatment_levels <- as.factor(regmatches(sample_name_matches, rx_match)) number_of_samples <- length(sample_name_matches) sample_treatment_df <- data.frame( - level = sample_treatment_levels, + class = sample_treatment_levels, + sample = sample_name_matches + ) +# reorder data +if (TRUE) { + my_order <- with(sample_treatment_df, order(class, sample)) + quant_data <- quant_data[, my_order] + sample_name_matches <- sample_name_matches[my_order] + sample_treatment_levels <- sample_treatment_levels[my_order] +} +sample_treatment_df <- data.frame( + class = sample_treatment_levels, sample = sample_name_matches ) data_frame_latex( x = sample_treatment_df, justification = "rp{0.2\\linewidth} lp{0.3\\linewidth}", centered = TRUE, - caption = "Treatment levels", + caption = "Sample classes", anchor = const_table_anchor_tbp, underscore_whack = FALSE ) +sample_name_shrink <- 10 / (10 + max(nchar(sample_name_matches))) ``` ```{r echo = FALSE, results = 'asis'} @@ -1395,7 +1476,8 @@ # Vertical plot boxplot( quant_data_log -, las = 1 +, las = 2 +, cex.axis = 0.9 * sample_name_shrink , col = const_boxplot_fill , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") , xlab = "Sample" @@ -1785,7 +1867,8 @@ colnames(blue_dots) <- sample_name_matches boxplot( blue_dots - , las = 1 # "always horizontal" + , las = 2 # "always vertical" + , cex.axis = 0.9 * sample_name_shrink , col = const_boxplot_fill , ylim = ylim , main = "Peptide intensities after eliminating unusable peptides" @@ -1831,6 +1914,8 @@ ylim = ylim_save, main = "Distributions of observed and imputed data", sub = "Light blue = observed data; Pink = imputed data", + las = 2, + cex.axis = 0.9 * sample_name_shrink, xlab = "Sample", ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") ) @@ -2043,7 +2128,8 @@ colnames(quant_data_log) <- sample_name_matches boxplot( quant_data_log - , las = 1 + , las = 2 + , cex.axis = 0.9 * sample_name_shrink , col = const_boxplot_fill , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") , xlab = "Sample" @@ -2084,7 +2170,7 @@ colnames(connect_df) <- c("Phosphopeptide", "Intensity") ``` -```{r echo = FALSE, fig.dim = c(9, 10), results = 'asis'} +```{r anova, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} count_of_treatment_levels <- length(levels(sample_treatment_levels)) if (count_of_treatment_levels < 2) { nuke_control_sequences <- @@ -2156,10 +2242,8 @@ p_value_data_anova_ps_fdr <- p.adjust(p_value_data_anova_ps, method = "fdr") p_value_data <- data.frame( - phosphopeptide = full_data[, 1] - , - raw_anova_p = p_value_data_anova_ps - , + phosphopeptide = full_data[, 1], + raw_anova_p = p_value_data_anova_ps, fdr_adjusted_anova_p = p_value_data_anova_ps_fdr ) @@ -2257,7 +2341,8 @@ boxplot( filtered_data_filtered, main = "Imputed, normalized intensities", # no line plot - las = 1, + las = 2, + cex.axis = 0.9 * sample_name_shrink, col = const_boxplot_fill, ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") ), @@ -2309,16 +2394,16 @@ X = filtered_p$fdr_adjusted_anova_p , FUN = function(x) { - if (x > 0.0001) - paste0("(%0.", 1 + ceiling(-log10(x)), "f) %s") + if (x > 0.01) + paste0("%s (%0.", 1 + ceiling(-log10(x)), "f)") else - paste0("(%0.4e) %s") + paste0("%s (%0.2e)") } ) cat_hm_heading <- function(m, cutoff) { - cat("\\newpage\n") if (nrow(m) > intensity_hm_rows) { + cat("\\newpage\n") subsection_header( paste( sprintf("Heatmap for the %d most-significant peptides", @@ -2327,7 +2412,7 @@ ) ) } else { - if (nrow(m) == 1) { + if (nrow(m) == 0) { return(FALSE) } else { subsection_header( @@ -2354,8 +2439,8 @@ FUN = function(i) { sprintf( anova_filtered_merge_format[i], - filtered_p$fdr_adjusted_anova_p[i], - rownames_m[i] + rownames_m[i], + signif(filtered_p$fdr_adjusted_anova_p[i], 2) ) } ) @@ -2363,18 +2448,21 @@ # draw the heading and heatmap if (nrow(m) > 0) { number_of_peptides_found <- - draw_intensity_heatmap( + draw_ppep_heatmap( m = m, cutoff = cutoff, hm_heading_function = cat_hm_heading, - hm_main_title = "Unimputed, unnormalized log(intensities)", + hm_main_title = + "log(intensities), row-scaled, unimputed, unnormalized", suppress_row_dendrogram = FALSE ) } } } } -cat("\\leavevmode\n\n\n") +cat("\\leavevmode\n") +cat("The adjusted ANOVA \\textit{p}-value is shown in parentheses + after the phosphopeptide sequence.\n\n") ``` ```{r sqlite, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} @@ -2710,7 +2798,7 @@ MARGIN = 1, # apply to rows FUN = anova_func, grouping_factor = - as.factor(as.numeric(grouping_factor$level)), # anova_func arg2 + as.factor(grouping_factor$level), # anova_func arg2 one_way_f = one_way_two_categories, # anova_func arg3 simplify = TRUE # TRUE is the default for simplify ) @@ -3046,21 +3134,9 @@ contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level) contrast_longlabel <- ( sprintf( - "Trt %s {%s} -> Trt %s {%s}", + "Class %s -> Class %s", contrast_metadata_df[i_cntrst, "b_level"], - gsub( - pattern = ";", - replacement = ", ", - x = contrast_metadata_df[i_cntrst, "b_samples"], - fixed = TRUE - ), - contrast_metadata_df[i_cntrst, "a_level"], - gsub( - pattern = ";", - replacement = ", ", - x = contrast_metadata_df[i_cntrst, "a_samples"], - fixed = TRUE - ) + contrast_metadata_df[i_cntrst, "a_level"] ) ) kseaapp_input <- @@ -3165,9 +3241,11 @@ # - 3 : non-significant kinases which_kinases = which_kinases ) - cat("\\begin{center}\n") - cat("Color intensities reflects $z$-score magnitudes; hue reflects $z$-score sign. Asterisks reflect significance.\n") - cat("\\end{center}\n") + if (!is.null(plotted_kinases)) { + cat("\\begin{center}\n") + cat("Color intensity reflects $z$-score magnitudes; hue reflects $z$-score sign. Asterisks reflect significance.\n") + cat("\\end{center}\n") + } } # end for (i in ... } # end if (length ... @@ -3179,21 +3257,9 @@ contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level) contrast_longlabel <- ( sprintf( - "Trt %s {%s} -> Trt %s {%s}", + "Class %s -> Class %s", contrast_metadata_df[i_cntrst, "b_level"], - gsub( - pattern = ";", - replacement = ", ", - x = contrast_metadata_df[i_cntrst, "b_samples"], - fixed = TRUE - ), - contrast_metadata_df[i_cntrst, "a_level"], - gsub( - pattern = ";", - replacement = ", ", - x = contrast_metadata_df[i_cntrst, "a_samples"], - fixed = TRUE - ) + contrast_metadata_df[i_cntrst, "a_level"] ) ) main_title <- ( @@ -3233,13 +3299,18 @@ SELECT gene AS kinase, ppep, - '('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label + sub_gene, + '('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label, + fdr_adjusted_anova_p FROM ( SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep FROM pseudo_ksdata - WHERE GENE IN (SELECT kinase FROM enriched_kinases) - ) + WHERE gene IN (SELECT kinase FROM enriched_kinases) + ), + p_value_data + WHERE ppep = phosphopeptide GROUP BY ppep + ORDER BY fdr_adjusted_anova_p ") # helper used to label per-kinase substrate enrichment figure @@ -3262,7 +3333,7 @@ ) ) } else { - if (nrow(m) == 1) { + if (nrow(m) == 0) { return(FALSE) } else { subsection_header( @@ -3287,7 +3358,7 @@ } # Disabling heatmaps for substrates pending decision whether to eliminate them altogether -if (FALSE) +if (TRUE) for (kinase_name in sort(enriched_kinases$kinase)) { enriched_substrates <- all_enriched_substrates[ @@ -3295,14 +3366,20 @@ , drop = FALSE ] + enriched_substrates$label <- with( + enriched_substrates, + sprintf( + "(%s-%s) %s (%0.2g)", + kinase, + sub("$FAILED_MATCH_GENE_NAME", "unidentified", sub_gene, fixed = TRUE), + ppep, + fdr_adjusted_anova_p + ) + ) # Get the intensity values for the heatmap enriched_intensities <- as.matrix(unimputed_quant_data_log[enriched_substrates$ppep, , drop = FALSE]) # Remove rows having too many NA values to be relevant - na_counter <- is.na(enriched_intensities) - na_counts <- apply(na_counter, 1, sum) - enriched_intensities <- - enriched_intensities[na_counts < ncol(enriched_intensities) / 2, , drop = FALSE] # Rename the rows with the display-name for the heatmap rownames(enriched_intensities) <- sapply( @@ -3321,7 +3398,7 @@ cut_args$statistic <- ksea_cutoff_statistic cut_args$threshold <- ksea_cutoff_threshold number_of_peptides_found <- - draw_intensity_heatmap( + draw_ppep_heatmap( m = m, cutoff = cut_args, hm_heading_function = cat_enriched_heading, @@ -3329,6 +3406,25 @@ = "Unnormalized (zero-imputed) intensities of enriched kinase-substrates", suppress_row_dendrogram = FALSE ) + if (number_of_peptides_found > 1) { + cat("\\leavevmode\n") + cat("The kinase-subsrate pair is shown in parentheses + before the phosphopeptide sequence.\n\n") + cat("The adjusted ANOVA \\textit{p}-value is shown in parentheses + after the phosphopeptide sequence.\n\n") + } + if (nrow(m) == 1) { + cat( + sprintf( + "\n\nSubstrate is %s, + \nphopshopeptide is %s, + \n\nand adjusted ANOVA \\textit{p}-value is %0.2g.\n", + enriched_substrates[1, "sub_gene"], + enriched_substrates[1, "ppep"], + enriched_substrates[1, "fdr_adjusted_anova_p"] + ) + ) + } } } @@ -3473,7 +3569,11 @@ param_unlist <- unlist(as.list(params)) param_df <- data.frame( parameter = paste0("\\verb@", names(param_unlist), "@"), - value = paste0("\\verb@", gsub("$", "\\$", param_unlist, fixed = TRUE), "@") + value = paste0( + "\n\\begin{tiny}\n\\verb@", + gsub("$", "\\$", param_unlist, fixed = TRUE), + "@\n\\end{tiny}" + ) ) data_frame_latex(