Mercurial > repos > artbio > mutational_patterns
comparison mutational_patterns.R @ 0:d585f369bfad draft
"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mutational_patterns commit 518fb067e8206ecafbf673a5e4cf375ccead11e3"
author | artbio |
---|---|
date | Fri, 04 Jun 2021 22:35:17 +0000 |
parents | |
children | c73aa7cc4c4b |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:d585f369bfad |
---|---|
1 # load packages that are provided in the conda env | |
2 options(show.error.messages = F, | |
3 error = function() { | |
4 cat(geterrmessage(), file = stderr()); q("no", 1, F) | |
5 } | |
6 ) | |
7 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") | |
8 warnings() | |
9 library(optparse) | |
10 library(rjson) | |
11 library(grid) | |
12 library(gridExtra) | |
13 library(scales) | |
14 library(RColorBrewer) | |
15 | |
16 # Arguments | |
17 option_list <- list( | |
18 make_option( | |
19 "--inputs", | |
20 default = NA, | |
21 type = "character", | |
22 help = "json formatted dictionary of datasets and their paths" | |
23 ), | |
24 make_option( | |
25 "--genome", | |
26 default = NA, | |
27 type = "character", | |
28 help = "genome name in the BSgenome bioconductor package" | |
29 ), | |
30 make_option( | |
31 "--levels", | |
32 default = NA, | |
33 type = "character", | |
34 help = "path to the tab separated file describing the levels in function of datasets" | |
35 ), | |
36 make_option( | |
37 "--cosmic_version", | |
38 default = "v2", | |
39 type = "character", | |
40 help = "Version of the Cosmic Signature set to be used to express mutational patterns" | |
41 ), | |
42 make_option( | |
43 "--signum", | |
44 default = 2, | |
45 type = "integer", | |
46 help = "selects the N most significant signatures in samples to express mutational patterns" | |
47 ), | |
48 make_option( | |
49 "--nrun", | |
50 default = 2, | |
51 type = "integer", | |
52 help = "Number of runs to fit signatures" | |
53 ), | |
54 make_option( | |
55 "--rank", | |
56 default = 2, | |
57 type = "integer", | |
58 help = "number of ranks to display for parameter optimization" | |
59 ), | |
60 make_option( | |
61 "--newsignum", | |
62 default = 2, | |
63 type = "integer", | |
64 help = "Number of new signatures to be captured" | |
65 ), | |
66 make_option( | |
67 "--output_spectrum", | |
68 default = NA, | |
69 type = "character", | |
70 help = "path to output dataset" | |
71 ), | |
72 make_option( | |
73 "--output_denovo", | |
74 default = NA, | |
75 type = "character", | |
76 help = "path to output dataset" | |
77 ), | |
78 make_option( | |
79 "--sigmatrix", | |
80 default = NA, | |
81 type = "character", | |
82 help = "path to signature matrix" | |
83 ), | |
84 make_option( | |
85 "--output_cosmic", | |
86 default = NA, | |
87 type = "character", | |
88 help = "path to output dataset" | |
89 ), | |
90 make_option( | |
91 "--sig_contrib_matrix", | |
92 default = NA, | |
93 type = "character", | |
94 help = "path to signature contribution matrix" | |
95 ), | |
96 make_option( | |
97 c("-r", "--rdata"), | |
98 type = "character", | |
99 default = NULL, | |
100 help = "Path to RData output file") | |
101 ) | |
102 | |
103 opt <- parse_args(OptionParser(option_list = option_list), | |
104 args = commandArgs(trailingOnly = TRUE)) | |
105 | |
106 ################ Manage input data #################### | |
107 json_dict <- opt$inputs | |
108 parser <- newJSONParser() | |
109 parser$addData(json_dict) | |
110 fileslist <- parser$getObject() | |
111 vcf_paths <- attr(fileslist, "names") | |
112 element_identifiers <- unname(unlist(fileslist)) | |
113 ref_genome <- opt$genome | |
114 vcf_table <- data.frame(element_identifier = as.character(element_identifiers), path = vcf_paths) | |
115 | |
116 library(MutationalPatterns) | |
117 library(ref_genome, character.only = TRUE) | |
118 library(ggplot2) | |
119 | |
120 # Load the VCF files into a GRangesList: | |
121 vcfs <- read_vcfs_as_granges(vcf_paths, element_identifiers, ref_genome) | |
122 library(plyr) | |
123 if (!is.na(opt$levels)[1]) { # manage levels if there are | |
124 levels_table <- read.delim(opt$levels, header = FALSE, | |
125 col.names = c("element_identifier", "level")) | |
126 } else { | |
127 levels_table <- data.frame(element_identifier = vcf_table$element_identifier, | |
128 level = rep("nolabels", length(vcf_table$element_identifier))) | |
129 } | |
130 metadata_table <- join(vcf_table, levels_table, by = "element_identifier") | |
131 tissue <- as.vector(metadata_table$level) | |
132 detach(package:plyr) | |
133 | |
134 ##### This is done for any section ###### | |
135 mut_mat <- mut_matrix(vcf_list = vcfs, ref_genome = ref_genome) | |
136 qual_col_pals <- brewer.pal.info[brewer.pal.info$category == "qual", ] | |
137 col_vector <- unique(unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))) | |
138 col_vector <- col_vector[c(-32, -34, -39)] # 67-color palette | |
139 | |
140 ###### Section 1 Mutation characteristics and spectrums ############# | |
141 if (!is.na(opt$output_spectrum)[1]) { | |
142 pdf(opt$output_spectrum, paper = "special", width = 11.69, height = 11.69) | |
143 type_occurrences <- mut_type_occurrences(vcfs, ref_genome) | |
144 | |
145 # mutation spectrum, total or by sample | |
146 | |
147 if (length(levels(factor(levels_table$level))) == 1) { | |
148 p1 <- plot_spectrum(type_occurrences, CT = TRUE, legend = TRUE) | |
149 plot(p1) | |
150 } else { | |
151 p2 <- plot_spectrum(type_occurrences, by = tissue, CT = TRUE) # by levels | |
152 p3 <- plot_spectrum(type_occurrences, CT = TRUE, legend = TRUE) # total | |
153 grid.arrange(p2, p3, ncol = 2, widths = c(4, 2.3), heights = c(4, 1)) | |
154 } | |
155 plot_96_profile(mut_mat, condensed = TRUE) | |
156 dev.off() | |
157 } | |
158 | |
159 ###### Section 2: De novo mutational signature extraction using NMF ####### | |
160 # opt$rank cannot be higher than the number of samples and | |
161 # likewise, opt$signum cannot be higher thant the number of samples | |
162 if (!is.na(opt$output_denovo)[1]) { | |
163 | |
164 if (opt$rank > length(element_identifiers)) { | |
165 opt$rank <- length(element_identifiers) | |
166 } | |
167 if (opt$signum > length(element_identifiers)) { | |
168 opt$signum <- length(element_identifiers) | |
169 } | |
170 pseudo_mut_mat <- mut_mat + 0.0001 # First add a small pseudocount to the mutation count matrix | |
171 # Use the NMF package to generate an estimate rank plot | |
172 library("NMF") | |
173 estimate <- nmf(pseudo_mut_mat, rank = 1:opt$rank, method = "brunet", nrun = opt$nrun, seed = 123456) | |
174 # And plot it | |
175 pdf(opt$output_denovo, paper = "special", width = 11.69, height = 11.69) | |
176 p4 <- plot(estimate) | |
177 grid.arrange(p4) | |
178 # Extract 4 (PARAMETIZE) mutational signatures from the mutation count matrix with extract_signatures | |
179 # (For larger datasets it is wise to perform more iterations by changing the nrun parameter | |
180 # to achieve stability and avoid local minima) | |
181 nmf_res <- extract_signatures(pseudo_mut_mat, rank = opt$newsignum, nrun = opt$nrun) | |
182 # Assign signature names | |
183 colnames(nmf_res$signatures) <- paste0("NewSig_", 1:opt$newsignum) | |
184 rownames(nmf_res$contribution) <- paste0("NewSig_", 1:opt$newsignum) | |
185 # Plot the 96-profile of the signatures: | |
186 p5 <- plot_96_profile(nmf_res$signatures, condensed = TRUE) | |
187 new_sig_matrix <- reshape2::dcast(p5$data, substitution + context ~ variable, value.var = "value") | |
188 new_sig_matrix <- format(new_sig_matrix, scientific = TRUE) | |
189 write.table(new_sig_matrix, file = opt$sigmatrix, quote = FALSE, row.names = FALSE, sep = "\t") | |
190 grid.arrange(p5) | |
191 # Visualize the contribution of the signatures in a barplot | |
192 pc1 <- plot_contribution(nmf_res$contribution, nmf_res$signature, mode = "relative", coord_flip = TRUE) | |
193 # Visualize the contribution of the signatures in absolute number of mutations | |
194 pc2 <- plot_contribution(nmf_res$contribution, nmf_res$signature, mode = "absolute", coord_flip = TRUE) | |
195 # Combine the two plots: | |
196 grid.arrange(pc1, pc2) | |
197 | |
198 # The relative contribution of each signature for each sample can also be plotted as a heatmap with | |
199 # plot_contribution_heatmap, which might be easier to interpret and compare than stacked barplots. | |
200 # The samples can be hierarchically clustered based on their euclidean dis- tance. The signatures | |
201 # can be plotted in a user-specified order. | |
202 # Plot signature contribution as a heatmap with sample clustering dendrogram and a specified signature order: | |
203 pch1 <- plot_contribution_heatmap(nmf_res$contribution, | |
204 sig_order = paste0("NewSig_", 1:opt$newsignum)) | |
205 # Plot signature contribution as a heatmap without sample clustering: | |
206 pch2 <- plot_contribution_heatmap(nmf_res$contribution, cluster_samples = FALSE) | |
207 #Combine the plots into one figure: | |
208 grid.arrange(pch1, pch2, ncol = 2, widths = c(2, 1.6)) | |
209 | |
210 # Compare the reconstructed mutational profile with the original mutational profile: | |
211 plot_compare_profiles(pseudo_mut_mat[, 1], | |
212 nmf_res$reconstructed[, 1], | |
213 profile_names = c("Original", "Reconstructed"), | |
214 condensed = TRUE) | |
215 dev.off() | |
216 } | |
217 ##### Section 3: Find optimal contribution of known signatures: COSMIC mutational signatures #### | |
218 | |
219 if (!is.na(opt$output_cosmic)[1]) { | |
220 pdf(opt$output_cosmic, paper = "special", width = 11.69, height = 11.69) | |
221 pseudo_mut_mat <- mut_mat + 0.0001 # First add a small psuedocount to the mutation count matrix | |
222 if (opt$cosmic_version == "v2") { | |
223 sp_url <- paste("https://cancer.sanger.ac.uk/cancergenome/assets/", "signatures_probabilities.txt", sep = "") | |
224 cancer_signatures <- read.table(sp_url, sep = "\t", header = TRUE) | |
225 new_order <- match(row.names(pseudo_mut_mat), cancer_signatures$Somatic.Mutation.Type) | |
226 cancer_signatures <- cancer_signatures[as.vector(new_order), ] | |
227 row.names(cancer_signatures) <- cancer_signatures$Somatic.Mutation.Type | |
228 cancer_signatures <- as.matrix(cancer_signatures[, 4:33]) | |
229 colnames(cancer_signatures) <- gsub("Signature.", "", colnames(cancer_signatures)) # shorten signature labels | |
230 cosmic_tag <- "Signatures (Cosmic v2, March 2015)" | |
231 cosmic_colors <- col_vector[1:30] | |
232 } else { | |
233 sp_url <- "https://raw.githubusercontent.com/ARTbio/startbio/master/sigProfiler_SBS_signatures_2019_05_22.tsv" | |
234 cancer_signatures <- read.table(sp_url, sep = "\t", header = TRUE) | |
235 new_order <- match(row.names(pseudo_mut_mat), cancer_signatures$Somatic.Mutation.Type) | |
236 cancer_signatures <- cancer_signatures[as.vector(new_order), ] | |
237 row.names(cancer_signatures) <- cancer_signatures$Somatic.Mutation.Type | |
238 cancer_signatures <- as.matrix(cancer_signatures[, 4:70]) | |
239 colnames(cancer_signatures) <- gsub("SBS", "", colnames(cancer_signatures)) # shorten signature labels | |
240 cosmic_tag <- "Signatures (Cosmic v3, May 2019)" | |
241 cosmic_colors <- col_vector[1:67] | |
242 } | |
243 | |
244 # Plot mutational profiles of the COSMIC signatures | |
245 if (opt$cosmic_version == "v2") { | |
246 p6 <- plot_96_profile(cancer_signatures, condensed = TRUE, ymax = 0.3) | |
247 grid.arrange(p6, top = textGrob("COSMIC signature profiles", gp = gpar(fontsize = 12, font = 3))) | |
248 } else { | |
249 p6 <- plot_96_profile(cancer_signatures[, 1:33], condensed = TRUE, ymax = 0.3) | |
250 p6bis <- plot_96_profile(cancer_signatures[, 34:67], condensed = TRUE, ymax = 0.3) | |
251 grid.arrange(p6, top = textGrob("COSMIC signature profiles (on two pages)", | |
252 gp = gpar(fontsize = 12, font = 3))) | |
253 grid.arrange(p6bis, top = textGrob("COSMIC signature profiles (continued)", | |
254 gp = gpar(fontsize = 12, font = 3))) | |
255 } | |
256 | |
257 | |
258 # Find optimal contribution of COSMIC signatures to reconstruct 96 mutational profiles | |
259 fit_res <- fit_to_signatures(pseudo_mut_mat, cancer_signatures) | |
260 | |
261 # Plot contribution barplots | |
262 pc3 <- plot_contribution(fit_res$contribution, cancer_signatures, coord_flip = T, mode = "absolute") | |
263 pc4 <- plot_contribution(fit_res$contribution, cancer_signatures, coord_flip = T, mode = "relative") | |
264 pc3_data <- pc3$data | |
265 pc3 <- ggplot(pc3_data, aes(x = Sample, y = Contribution, fill = as.factor(Signature))) + | |
266 geom_bar(stat = "identity", position = "stack") + | |
267 coord_flip() + | |
268 scale_fill_manual(name = "Cosmic\nSignatures", values = cosmic_colors) + | |
269 labs(x = "Samples", y = "Absolute contribution") + theme_bw() + | |
270 theme(panel.grid.minor.x = element_blank(), | |
271 panel.grid.major.x = element_blank(), | |
272 legend.position = "right", | |
273 text = element_text(size = 8), | |
274 axis.text.x = element_text(angle = 90, hjust = 1)) | |
275 pc4_data <- pc4$data | |
276 pc4 <- ggplot(pc4_data, aes(x = Sample, y = Contribution, fill = as.factor(Signature))) + | |
277 geom_bar(stat = "identity", position = "fill") + | |
278 coord_flip() + | |
279 scale_fill_manual(name = "Cosmic\nSignatures", values = cosmic_colors) + | |
280 scale_y_continuous(labels = scales::percent_format(accuracy = 1)) + | |
281 labs(x = "Samples", y = "Relative contribution") + theme_bw() + | |
282 theme(panel.grid.minor.x = element_blank(), panel.grid.major.x = element_blank(), legend.position = "right", | |
283 text = element_text(size = 8), | |
284 axis.text.x = element_text(angle = 90, hjust = 1)) | |
285 ##### | |
286 # ggplot2 alternative | |
287 if (!is.na(opt$levels)[1]) { # if there are levels to display in graphs | |
288 pc3_data <- pc3$data | |
289 pc3_data <- merge(pc3_data, metadata_table[, c(1, 3)], by.x = "Sample", by.y = "element_identifier") | |
290 pc3 <- ggplot(pc3_data, aes(x = Sample, y = Contribution, fill = as.factor(Signature))) + | |
291 geom_bar(stat = "identity", position = "stack") + | |
292 scale_fill_manual(name = "Cosmic\nSignatures", values = cosmic_colors) + | |
293 labs(x = "Samples", y = "Absolute contribution") + theme_bw() + | |
294 theme(panel.grid.minor.x = element_blank(), | |
295 panel.grid.major.x = element_blank(), | |
296 legend.position = "right", | |
297 text = element_text(size = 8), | |
298 axis.text.x = element_text(angle = 90, hjust = 1)) + | |
299 facet_grid(~level, scales = "free_x", space = "free") | |
300 pc4_data <- pc4$data | |
301 pc4_data <- merge(pc4_data, metadata_table[, c(1, 3)], by.x = "Sample", by.y = "element_identifier") | |
302 pc4 <- ggplot(pc4_data, aes(x = Sample, y = Contribution, fill = as.factor(Signature))) + | |
303 geom_bar(stat = "identity", position = "fill") + | |
304 scale_fill_manual(name = "Cosmic\nSignatures", values = cosmic_colors) + | |
305 scale_y_continuous(labels = scales::percent_format(accuracy = 1)) + | |
306 labs(x = "Samples", y = "Relative contribution") + theme_bw() + | |
307 theme(panel.grid.minor.x = element_blank(), | |
308 panel.grid.major.x = element_blank(), | |
309 legend.position = "right", | |
310 text = element_text(size = 8), | |
311 axis.text.x = element_text(angle = 90, hjust = 1)) + | |
312 facet_grid(~level, scales = "free_x", space = "free") | |
313 } | |
314 # Combine the two plots: | |
315 grid.arrange(pc3, pc4, | |
316 top = textGrob("Absolute and Relative Contributions of Cosmic signatures to mutational patterns", | |
317 gp = gpar(fontsize = 12, font = 3))) | |
318 | |
319 #### pie charts of comic signatures contributions in samples ### | |
320 library(reshape2) | |
321 library(dplyr) | |
322 if (length(levels(factor(levels_table$level))) < 2) { | |
323 fit_res_contrib <- as.data.frame(fit_res$contribution) | |
324 worklist <- cbind(signature = rownames(fit_res$contribution), | |
325 level = rep("nolabels", length(fit_res_contrib[, 1])), | |
326 fit_res_contrib, | |
327 sum = rowSums(fit_res_contrib)) | |
328 worklist <- worklist[order(worklist[, "sum"], decreasing = T), ] | |
329 worklist <- worklist[1:opt$signum, ] | |
330 worklist <- worklist[, -length(worklist[1, ])] | |
331 worklist <- melt(worklist) | |
332 worklist <- worklist[, c(1, 3, 4, 2)] | |
333 } else { | |
334 worklist <- list() | |
335 for (i in levels(factor(levels_table$level))) { | |
336 fit_res$contribution[, levels_table$element_identifier[levels_table$level == i]] -> worklist[[i]] | |
337 sum <- rowSums(as.data.frame(worklist[[i]])) | |
338 worklist[[i]] <- cbind(worklist[[i]], sum) | |
339 worklist[[i]] <- worklist[[i]][order(worklist[[i]][, "sum"], decreasing = T), ] | |
340 worklist[[i]] <- worklist[[i]][1:opt$signum, ] | |
341 worklist[[i]] <- worklist[[i]][, -length(as.data.frame(worklist[[i]]))] | |
342 } | |
343 worklist <- as.data.frame(melt(worklist)) | |
344 worklist[, 2] <- paste0(worklist[, 4], " - ", worklist[, 2]) | |
345 head(worklist) | |
346 } | |
347 | |
348 colnames(worklist) <- c("signature", "sample", "value", "level") | |
349 worklist <- as.data.frame(worklist %>% group_by(sample) %>% mutate(value = value / sum(value) * 100)) | |
350 worklist$pos <- cumsum(worklist$value) - worklist$value / 2 | |
351 worklist$label <- factor(worklist$signature) | |
352 worklist$signature <- factor(worklist$signature) | |
353 p7 <- ggplot(worklist, aes(x = "", y = value, group = signature, fill = signature)) + | |
354 geom_bar(width = 1, stat = "identity") + | |
355 geom_text(aes(label = label), position = position_stack(vjust = 0.5), color = "black", size = 3) + | |
356 coord_polar("y", start = 0) + facet_wrap(.~sample) + | |
357 labs(x = "", y = "Samples", fill = cosmic_tag) + | |
358 scale_fill_manual(name = paste0(opt$signum, " most contributing\nsignatures\n(in each label/tissue)"), | |
359 values = cosmic_colors[as.numeric(levels(factor(worklist$label)))]) + | |
360 theme(axis.text = element_blank(), | |
361 axis.ticks = element_blank(), | |
362 panel.grid = element_blank()) | |
363 grid.arrange(p7) | |
364 | |
365 # Plot relative contribution of the cancer signatures in each sample as a heatmap with sample clustering | |
366 if (length(vcf_paths) > 1) { | |
367 p8 <- plot_contribution_heatmap(fit_res$contribution, cluster_samples = TRUE, method = "complete") | |
368 grid.arrange(p8) | |
369 } | |
370 | |
371 # export relative contribution matrix | |
372 if (!is.na(opt$sig_contrib_matrix)) { | |
373 output_table <- t(fit_res$contribution) / rowSums(t(fit_res$contribution)) | |
374 colnames(output_table) <- paste0("s", colnames(output_table)) | |
375 if (length(levels(factor(levels_table$level))) > 1) { | |
376 output_table <- data.frame(sample = paste0(metadata_table[metadata_table$element_identifier == colnames(fit_res$contribution), | |
377 3], "-", colnames(fit_res$contribution)), | |
378 output_table) | |
379 } else { | |
380 output_table <- data.frame(sample = rownames(output_table), output_table) | |
381 } | |
382 write.table(output_table, file = opt$sig_contrib_matrix, sep = "\t", quote = F, row.names = F) | |
383 } | |
384 | |
385 # calculate all pairwise cosine similarities | |
386 cos_sim_ori_rec <- cos_sim_matrix(pseudo_mut_mat, fit_res$reconstructed) | |
387 # extract cosine similarities per sample between original and reconstructed | |
388 cos_sim_ori_rec <- as.data.frame(diag(cos_sim_ori_rec)) | |
389 | |
390 # We can use ggplot to make a barplot of the cosine similarities between the original and | |
391 # reconstructed mutational profile of each sample. This clearly shows how well each mutational | |
392 # profile can be reconstructed with the COSMIC mutational signatures. Two identical profiles | |
393 # have a cosine similarity of 1. The lower the cosine similarity between original and | |
394 # reconstructed, the less well the original mutational profile can be reconstructed with | |
395 # the COSMIC signatures. You could use, for example, cosine similarity of 0.95 as a cutoff. | |
396 | |
397 # Adjust data frame for plotting with gpplot | |
398 colnames(cos_sim_ori_rec) <- "cos_sim" | |
399 cos_sim_ori_rec$sample <- row.names(cos_sim_ori_rec) | |
400 # Make barplot | |
401 p9 <- ggplot(cos_sim_ori_rec, aes(y = cos_sim, x = sample)) + | |
402 geom_bar(stat = "identity", fill = "skyblue4") + | |
403 coord_cartesian(ylim = c(0.8, 1)) + | |
404 # coord_flip(ylim=c(0.8,1)) + | |
405 ylab("Cosine similarity\n original VS reconstructed") + | |
406 xlab("") + | |
407 # Reverse order of the samples such that first is up | |
408 # xlim(rev(levels(factor(cos_sim_ori_rec$sample)))) + | |
409 theme_bw() + | |
410 theme(panel.grid.minor.y = element_blank(), | |
411 panel.grid.major.y = element_blank()) + | |
412 # Add cut.off line | |
413 geom_hline(aes(yintercept = .95)) | |
414 grid.arrange(p9, top = textGrob("Similarity between true and reconstructed profiles (with all Cosmic sig.)", gp = gpar(fontsize = 12, font = 3))) | |
415 dev.off() | |
416 } | |
417 | |
418 | |
419 # Output RData file | |
420 if (!is.null(opt$rdata)) { | |
421 save.image(file = opt$rdata) | |
422 } |