comparison mutational_patterns.R @ 0:d585f369bfad draft

"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mutational_patterns commit 518fb067e8206ecafbf673a5e4cf375ccead11e3"
author artbio
date Fri, 04 Jun 2021 22:35:17 +0000
parents
children c73aa7cc4c4b
comparison
equal deleted inserted replaced
-1:000000000000 0:d585f369bfad
1 # load packages that are provided in the conda env
2 options(show.error.messages = F,
3 error = function() {
4 cat(geterrmessage(), file = stderr()); q("no", 1, F)
5 }
6 )
7 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
8 warnings()
9 library(optparse)
10 library(rjson)
11 library(grid)
12 library(gridExtra)
13 library(scales)
14 library(RColorBrewer)
15
16 # Arguments
17 option_list <- list(
18 make_option(
19 "--inputs",
20 default = NA,
21 type = "character",
22 help = "json formatted dictionary of datasets and their paths"
23 ),
24 make_option(
25 "--genome",
26 default = NA,
27 type = "character",
28 help = "genome name in the BSgenome bioconductor package"
29 ),
30 make_option(
31 "--levels",
32 default = NA,
33 type = "character",
34 help = "path to the tab separated file describing the levels in function of datasets"
35 ),
36 make_option(
37 "--cosmic_version",
38 default = "v2",
39 type = "character",
40 help = "Version of the Cosmic Signature set to be used to express mutational patterns"
41 ),
42 make_option(
43 "--signum",
44 default = 2,
45 type = "integer",
46 help = "selects the N most significant signatures in samples to express mutational patterns"
47 ),
48 make_option(
49 "--nrun",
50 default = 2,
51 type = "integer",
52 help = "Number of runs to fit signatures"
53 ),
54 make_option(
55 "--rank",
56 default = 2,
57 type = "integer",
58 help = "number of ranks to display for parameter optimization"
59 ),
60 make_option(
61 "--newsignum",
62 default = 2,
63 type = "integer",
64 help = "Number of new signatures to be captured"
65 ),
66 make_option(
67 "--output_spectrum",
68 default = NA,
69 type = "character",
70 help = "path to output dataset"
71 ),
72 make_option(
73 "--output_denovo",
74 default = NA,
75 type = "character",
76 help = "path to output dataset"
77 ),
78 make_option(
79 "--sigmatrix",
80 default = NA,
81 type = "character",
82 help = "path to signature matrix"
83 ),
84 make_option(
85 "--output_cosmic",
86 default = NA,
87 type = "character",
88 help = "path to output dataset"
89 ),
90 make_option(
91 "--sig_contrib_matrix",
92 default = NA,
93 type = "character",
94 help = "path to signature contribution matrix"
95 ),
96 make_option(
97 c("-r", "--rdata"),
98 type = "character",
99 default = NULL,
100 help = "Path to RData output file")
101 )
102
103 opt <- parse_args(OptionParser(option_list = option_list),
104 args = commandArgs(trailingOnly = TRUE))
105
106 ################ Manage input data ####################
107 json_dict <- opt$inputs
108 parser <- newJSONParser()
109 parser$addData(json_dict)
110 fileslist <- parser$getObject()
111 vcf_paths <- attr(fileslist, "names")
112 element_identifiers <- unname(unlist(fileslist))
113 ref_genome <- opt$genome
114 vcf_table <- data.frame(element_identifier = as.character(element_identifiers), path = vcf_paths)
115
116 library(MutationalPatterns)
117 library(ref_genome, character.only = TRUE)
118 library(ggplot2)
119
120 # Load the VCF files into a GRangesList:
121 vcfs <- read_vcfs_as_granges(vcf_paths, element_identifiers, ref_genome)
122 library(plyr)
123 if (!is.na(opt$levels)[1]) { # manage levels if there are
124 levels_table <- read.delim(opt$levels, header = FALSE,
125 col.names = c("element_identifier", "level"))
126 } else {
127 levels_table <- data.frame(element_identifier = vcf_table$element_identifier,
128 level = rep("nolabels", length(vcf_table$element_identifier)))
129 }
130 metadata_table <- join(vcf_table, levels_table, by = "element_identifier")
131 tissue <- as.vector(metadata_table$level)
132 detach(package:plyr)
133
134 ##### This is done for any section ######
135 mut_mat <- mut_matrix(vcf_list = vcfs, ref_genome = ref_genome)
136 qual_col_pals <- brewer.pal.info[brewer.pal.info$category == "qual", ]
137 col_vector <- unique(unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals))))
138 col_vector <- col_vector[c(-32, -34, -39)] # 67-color palette
139
140 ###### Section 1 Mutation characteristics and spectrums #############
141 if (!is.na(opt$output_spectrum)[1]) {
142 pdf(opt$output_spectrum, paper = "special", width = 11.69, height = 11.69)
143 type_occurrences <- mut_type_occurrences(vcfs, ref_genome)
144
145 # mutation spectrum, total or by sample
146
147 if (length(levels(factor(levels_table$level))) == 1) {
148 p1 <- plot_spectrum(type_occurrences, CT = TRUE, legend = TRUE)
149 plot(p1)
150 } else {
151 p2 <- plot_spectrum(type_occurrences, by = tissue, CT = TRUE) # by levels
152 p3 <- plot_spectrum(type_occurrences, CT = TRUE, legend = TRUE) # total
153 grid.arrange(p2, p3, ncol = 2, widths = c(4, 2.3), heights = c(4, 1))
154 }
155 plot_96_profile(mut_mat, condensed = TRUE)
156 dev.off()
157 }
158
159 ###### Section 2: De novo mutational signature extraction using NMF #######
160 # opt$rank cannot be higher than the number of samples and
161 # likewise, opt$signum cannot be higher thant the number of samples
162 if (!is.na(opt$output_denovo)[1]) {
163
164 if (opt$rank > length(element_identifiers)) {
165 opt$rank <- length(element_identifiers)
166 }
167 if (opt$signum > length(element_identifiers)) {
168 opt$signum <- length(element_identifiers)
169 }
170 pseudo_mut_mat <- mut_mat + 0.0001 # First add a small pseudocount to the mutation count matrix
171 # Use the NMF package to generate an estimate rank plot
172 library("NMF")
173 estimate <- nmf(pseudo_mut_mat, rank = 1:opt$rank, method = "brunet", nrun = opt$nrun, seed = 123456)
174 # And plot it
175 pdf(opt$output_denovo, paper = "special", width = 11.69, height = 11.69)
176 p4 <- plot(estimate)
177 grid.arrange(p4)
178 # Extract 4 (PARAMETIZE) mutational signatures from the mutation count matrix with extract_signatures
179 # (For larger datasets it is wise to perform more iterations by changing the nrun parameter
180 # to achieve stability and avoid local minima)
181 nmf_res <- extract_signatures(pseudo_mut_mat, rank = opt$newsignum, nrun = opt$nrun)
182 # Assign signature names
183 colnames(nmf_res$signatures) <- paste0("NewSig_", 1:opt$newsignum)
184 rownames(nmf_res$contribution) <- paste0("NewSig_", 1:opt$newsignum)
185 # Plot the 96-profile of the signatures:
186 p5 <- plot_96_profile(nmf_res$signatures, condensed = TRUE)
187 new_sig_matrix <- reshape2::dcast(p5$data, substitution + context ~ variable, value.var = "value")
188 new_sig_matrix <- format(new_sig_matrix, scientific = TRUE)
189 write.table(new_sig_matrix, file = opt$sigmatrix, quote = FALSE, row.names = FALSE, sep = "\t")
190 grid.arrange(p5)
191 # Visualize the contribution of the signatures in a barplot
192 pc1 <- plot_contribution(nmf_res$contribution, nmf_res$signature, mode = "relative", coord_flip = TRUE)
193 # Visualize the contribution of the signatures in absolute number of mutations
194 pc2 <- plot_contribution(nmf_res$contribution, nmf_res$signature, mode = "absolute", coord_flip = TRUE)
195 # Combine the two plots:
196 grid.arrange(pc1, pc2)
197
198 # The relative contribution of each signature for each sample can also be plotted as a heatmap with
199 # plot_contribution_heatmap, which might be easier to interpret and compare than stacked barplots.
200 # The samples can be hierarchically clustered based on their euclidean dis- tance. The signatures
201 # can be plotted in a user-specified order.
202 # Plot signature contribution as a heatmap with sample clustering dendrogram and a specified signature order:
203 pch1 <- plot_contribution_heatmap(nmf_res$contribution,
204 sig_order = paste0("NewSig_", 1:opt$newsignum))
205 # Plot signature contribution as a heatmap without sample clustering:
206 pch2 <- plot_contribution_heatmap(nmf_res$contribution, cluster_samples = FALSE)
207 #Combine the plots into one figure:
208 grid.arrange(pch1, pch2, ncol = 2, widths = c(2, 1.6))
209
210 # Compare the reconstructed mutational profile with the original mutational profile:
211 plot_compare_profiles(pseudo_mut_mat[, 1],
212 nmf_res$reconstructed[, 1],
213 profile_names = c("Original", "Reconstructed"),
214 condensed = TRUE)
215 dev.off()
216 }
217 ##### Section 3: Find optimal contribution of known signatures: COSMIC mutational signatures ####
218
219 if (!is.na(opt$output_cosmic)[1]) {
220 pdf(opt$output_cosmic, paper = "special", width = 11.69, height = 11.69)
221 pseudo_mut_mat <- mut_mat + 0.0001 # First add a small psuedocount to the mutation count matrix
222 if (opt$cosmic_version == "v2") {
223 sp_url <- paste("https://cancer.sanger.ac.uk/cancergenome/assets/", "signatures_probabilities.txt", sep = "")
224 cancer_signatures <- read.table(sp_url, sep = "\t", header = TRUE)
225 new_order <- match(row.names(pseudo_mut_mat), cancer_signatures$Somatic.Mutation.Type)
226 cancer_signatures <- cancer_signatures[as.vector(new_order), ]
227 row.names(cancer_signatures) <- cancer_signatures$Somatic.Mutation.Type
228 cancer_signatures <- as.matrix(cancer_signatures[, 4:33])
229 colnames(cancer_signatures) <- gsub("Signature.", "", colnames(cancer_signatures)) # shorten signature labels
230 cosmic_tag <- "Signatures (Cosmic v2, March 2015)"
231 cosmic_colors <- col_vector[1:30]
232 } else {
233 sp_url <- "https://raw.githubusercontent.com/ARTbio/startbio/master/sigProfiler_SBS_signatures_2019_05_22.tsv"
234 cancer_signatures <- read.table(sp_url, sep = "\t", header = TRUE)
235 new_order <- match(row.names(pseudo_mut_mat), cancer_signatures$Somatic.Mutation.Type)
236 cancer_signatures <- cancer_signatures[as.vector(new_order), ]
237 row.names(cancer_signatures) <- cancer_signatures$Somatic.Mutation.Type
238 cancer_signatures <- as.matrix(cancer_signatures[, 4:70])
239 colnames(cancer_signatures) <- gsub("SBS", "", colnames(cancer_signatures)) # shorten signature labels
240 cosmic_tag <- "Signatures (Cosmic v3, May 2019)"
241 cosmic_colors <- col_vector[1:67]
242 }
243
244 # Plot mutational profiles of the COSMIC signatures
245 if (opt$cosmic_version == "v2") {
246 p6 <- plot_96_profile(cancer_signatures, condensed = TRUE, ymax = 0.3)
247 grid.arrange(p6, top = textGrob("COSMIC signature profiles", gp = gpar(fontsize = 12, font = 3)))
248 } else {
249 p6 <- plot_96_profile(cancer_signatures[, 1:33], condensed = TRUE, ymax = 0.3)
250 p6bis <- plot_96_profile(cancer_signatures[, 34:67], condensed = TRUE, ymax = 0.3)
251 grid.arrange(p6, top = textGrob("COSMIC signature profiles (on two pages)",
252 gp = gpar(fontsize = 12, font = 3)))
253 grid.arrange(p6bis, top = textGrob("COSMIC signature profiles (continued)",
254 gp = gpar(fontsize = 12, font = 3)))
255 }
256
257
258 # Find optimal contribution of COSMIC signatures to reconstruct 96 mutational profiles
259 fit_res <- fit_to_signatures(pseudo_mut_mat, cancer_signatures)
260
261 # Plot contribution barplots
262 pc3 <- plot_contribution(fit_res$contribution, cancer_signatures, coord_flip = T, mode = "absolute")
263 pc4 <- plot_contribution(fit_res$contribution, cancer_signatures, coord_flip = T, mode = "relative")
264 pc3_data <- pc3$data
265 pc3 <- ggplot(pc3_data, aes(x = Sample, y = Contribution, fill = as.factor(Signature))) +
266 geom_bar(stat = "identity", position = "stack") +
267 coord_flip() +
268 scale_fill_manual(name = "Cosmic\nSignatures", values = cosmic_colors) +
269 labs(x = "Samples", y = "Absolute contribution") + theme_bw() +
270 theme(panel.grid.minor.x = element_blank(),
271 panel.grid.major.x = element_blank(),
272 legend.position = "right",
273 text = element_text(size = 8),
274 axis.text.x = element_text(angle = 90, hjust = 1))
275 pc4_data <- pc4$data
276 pc4 <- ggplot(pc4_data, aes(x = Sample, y = Contribution, fill = as.factor(Signature))) +
277 geom_bar(stat = "identity", position = "fill") +
278 coord_flip() +
279 scale_fill_manual(name = "Cosmic\nSignatures", values = cosmic_colors) +
280 scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
281 labs(x = "Samples", y = "Relative contribution") + theme_bw() +
282 theme(panel.grid.minor.x = element_blank(), panel.grid.major.x = element_blank(), legend.position = "right",
283 text = element_text(size = 8),
284 axis.text.x = element_text(angle = 90, hjust = 1))
285 #####
286 # ggplot2 alternative
287 if (!is.na(opt$levels)[1]) { # if there are levels to display in graphs
288 pc3_data <- pc3$data
289 pc3_data <- merge(pc3_data, metadata_table[, c(1, 3)], by.x = "Sample", by.y = "element_identifier")
290 pc3 <- ggplot(pc3_data, aes(x = Sample, y = Contribution, fill = as.factor(Signature))) +
291 geom_bar(stat = "identity", position = "stack") +
292 scale_fill_manual(name = "Cosmic\nSignatures", values = cosmic_colors) +
293 labs(x = "Samples", y = "Absolute contribution") + theme_bw() +
294 theme(panel.grid.minor.x = element_blank(),
295 panel.grid.major.x = element_blank(),
296 legend.position = "right",
297 text = element_text(size = 8),
298 axis.text.x = element_text(angle = 90, hjust = 1)) +
299 facet_grid(~level, scales = "free_x", space = "free")
300 pc4_data <- pc4$data
301 pc4_data <- merge(pc4_data, metadata_table[, c(1, 3)], by.x = "Sample", by.y = "element_identifier")
302 pc4 <- ggplot(pc4_data, aes(x = Sample, y = Contribution, fill = as.factor(Signature))) +
303 geom_bar(stat = "identity", position = "fill") +
304 scale_fill_manual(name = "Cosmic\nSignatures", values = cosmic_colors) +
305 scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
306 labs(x = "Samples", y = "Relative contribution") + theme_bw() +
307 theme(panel.grid.minor.x = element_blank(),
308 panel.grid.major.x = element_blank(),
309 legend.position = "right",
310 text = element_text(size = 8),
311 axis.text.x = element_text(angle = 90, hjust = 1)) +
312 facet_grid(~level, scales = "free_x", space = "free")
313 }
314 # Combine the two plots:
315 grid.arrange(pc3, pc4,
316 top = textGrob("Absolute and Relative Contributions of Cosmic signatures to mutational patterns",
317 gp = gpar(fontsize = 12, font = 3)))
318
319 #### pie charts of comic signatures contributions in samples ###
320 library(reshape2)
321 library(dplyr)
322 if (length(levels(factor(levels_table$level))) < 2) {
323 fit_res_contrib <- as.data.frame(fit_res$contribution)
324 worklist <- cbind(signature = rownames(fit_res$contribution),
325 level = rep("nolabels", length(fit_res_contrib[, 1])),
326 fit_res_contrib,
327 sum = rowSums(fit_res_contrib))
328 worklist <- worklist[order(worklist[, "sum"], decreasing = T), ]
329 worklist <- worklist[1:opt$signum, ]
330 worklist <- worklist[, -length(worklist[1, ])]
331 worklist <- melt(worklist)
332 worklist <- worklist[, c(1, 3, 4, 2)]
333 } else {
334 worklist <- list()
335 for (i in levels(factor(levels_table$level))) {
336 fit_res$contribution[, levels_table$element_identifier[levels_table$level == i]] -> worklist[[i]]
337 sum <- rowSums(as.data.frame(worklist[[i]]))
338 worklist[[i]] <- cbind(worklist[[i]], sum)
339 worklist[[i]] <- worklist[[i]][order(worklist[[i]][, "sum"], decreasing = T), ]
340 worklist[[i]] <- worklist[[i]][1:opt$signum, ]
341 worklist[[i]] <- worklist[[i]][, -length(as.data.frame(worklist[[i]]))]
342 }
343 worklist <- as.data.frame(melt(worklist))
344 worklist[, 2] <- paste0(worklist[, 4], " - ", worklist[, 2])
345 head(worklist)
346 }
347
348 colnames(worklist) <- c("signature", "sample", "value", "level")
349 worklist <- as.data.frame(worklist %>% group_by(sample) %>% mutate(value = value / sum(value) * 100))
350 worklist$pos <- cumsum(worklist$value) - worklist$value / 2
351 worklist$label <- factor(worklist$signature)
352 worklist$signature <- factor(worklist$signature)
353 p7 <- ggplot(worklist, aes(x = "", y = value, group = signature, fill = signature)) +
354 geom_bar(width = 1, stat = "identity") +
355 geom_text(aes(label = label), position = position_stack(vjust = 0.5), color = "black", size = 3) +
356 coord_polar("y", start = 0) + facet_wrap(.~sample) +
357 labs(x = "", y = "Samples", fill = cosmic_tag) +
358 scale_fill_manual(name = paste0(opt$signum, " most contributing\nsignatures\n(in each label/tissue)"),
359 values = cosmic_colors[as.numeric(levels(factor(worklist$label)))]) +
360 theme(axis.text = element_blank(),
361 axis.ticks = element_blank(),
362 panel.grid = element_blank())
363 grid.arrange(p7)
364
365 # Plot relative contribution of the cancer signatures in each sample as a heatmap with sample clustering
366 if (length(vcf_paths) > 1) {
367 p8 <- plot_contribution_heatmap(fit_res$contribution, cluster_samples = TRUE, method = "complete")
368 grid.arrange(p8)
369 }
370
371 # export relative contribution matrix
372 if (!is.na(opt$sig_contrib_matrix)) {
373 output_table <- t(fit_res$contribution) / rowSums(t(fit_res$contribution))
374 colnames(output_table) <- paste0("s", colnames(output_table))
375 if (length(levels(factor(levels_table$level))) > 1) {
376 output_table <- data.frame(sample = paste0(metadata_table[metadata_table$element_identifier == colnames(fit_res$contribution),
377 3], "-", colnames(fit_res$contribution)),
378 output_table)
379 } else {
380 output_table <- data.frame(sample = rownames(output_table), output_table)
381 }
382 write.table(output_table, file = opt$sig_contrib_matrix, sep = "\t", quote = F, row.names = F)
383 }
384
385 # calculate all pairwise cosine similarities
386 cos_sim_ori_rec <- cos_sim_matrix(pseudo_mut_mat, fit_res$reconstructed)
387 # extract cosine similarities per sample between original and reconstructed
388 cos_sim_ori_rec <- as.data.frame(diag(cos_sim_ori_rec))
389
390 # We can use ggplot to make a barplot of the cosine similarities between the original and
391 # reconstructed mutational profile of each sample. This clearly shows how well each mutational
392 # profile can be reconstructed with the COSMIC mutational signatures. Two identical profiles
393 # have a cosine similarity of 1. The lower the cosine similarity between original and
394 # reconstructed, the less well the original mutational profile can be reconstructed with
395 # the COSMIC signatures. You could use, for example, cosine similarity of 0.95 as a cutoff.
396
397 # Adjust data frame for plotting with gpplot
398 colnames(cos_sim_ori_rec) <- "cos_sim"
399 cos_sim_ori_rec$sample <- row.names(cos_sim_ori_rec)
400 # Make barplot
401 p9 <- ggplot(cos_sim_ori_rec, aes(y = cos_sim, x = sample)) +
402 geom_bar(stat = "identity", fill = "skyblue4") +
403 coord_cartesian(ylim = c(0.8, 1)) +
404 # coord_flip(ylim=c(0.8,1)) +
405 ylab("Cosine similarity\n original VS reconstructed") +
406 xlab("") +
407 # Reverse order of the samples such that first is up
408 # xlim(rev(levels(factor(cos_sim_ori_rec$sample)))) +
409 theme_bw() +
410 theme(panel.grid.minor.y = element_blank(),
411 panel.grid.major.y = element_blank()) +
412 # Add cut.off line
413 geom_hline(aes(yintercept = .95))
414 grid.arrange(p9, top = textGrob("Similarity between true and reconstructed profiles (with all Cosmic sig.)", gp = gpar(fontsize = 12, font = 3)))
415 dev.off()
416 }
417
418
419 # Output RData file
420 if (!is.null(opt$rdata)) {
421 save.image(file = opt$rdata)
422 }