comparison high_dim_visu.R @ 0:241dd93219d7 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/gsc_high_dimension_visualization commit 1282ac9de7c926ab251f88afb2453f52c8b14200
author artbio
date Thu, 11 Jul 2019 12:31:00 -0400
parents
children 8e6ce12edd90
comparison
equal deleted inserted replaced
-1:000000000000 0:241dd93219d7
1 # load packages that are provided in the conda env
2 options( show.error.messages=F,
3 error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
4 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
5 requiredPackages = c('optparse', 'Rtsne', 'ggplot2', 'ggfortify')
6 warnings()
7 library(optparse)
8 library(FactoMineR)
9 library(factoextra)
10 library(Rtsne)
11 library(ggplot2)
12 library(ggfortify)
13 library(RColorBrewer)
14 library(ClusterR)
15
16 # Arguments
17 option_list = list(
18 make_option(
19 "--data",
20 default = NA,
21 type = 'character',
22 help = "Input file that contains expression value to visualise"
23 ),
24 make_option(
25 "--sep",
26 default = '\t',
27 type = 'character',
28 help = "File separator [default : '%default' ]"
29 ),
30 make_option(
31 "--colnames",
32 default = TRUE,
33 type = 'logical',
34 help = "Consider first line as header ? [default : '%default' ]"
35 ),
36 make_option(
37 "--out",
38 default = "res.tab",
39 type = 'character',
40 help = "Output name [default : '%default' ]"
41 ),
42 make_option(
43 "--labels",
44 default = FALSE,
45 type = 'logical',
46 help = "add labels in scatter plots [default : '%default' ]"
47 ),
48 make_option(
49 "--factor",
50 default = '',
51 type = 'character',
52 help = "A two column table that specifies factor levels for contrasting data [default : '%default' ]"
53 ),
54 make_option(
55 "--visu_choice",
56 default = 'PCA',
57 type = 'character',
58 help = "visualisation method ('PCA', 'tSNE', 'HCPC') [default : '%default' ]"
59 ),
60 make_option(
61 "--table_coordinates",
62 default = '',
63 type = 'character',
64 help = "Table with plot coordinates [default : '%default' ]"
65 ),
66 make_option(
67 "--Rtsne_seed",
68 default = 42,
69 type = 'integer',
70 help = "Seed value for reproducibility [default : '%default' ]"
71 ),
72 make_option(
73 "--Rtsne_dims",
74 default = 2,
75 type = 'integer',
76 help = "Output dimensionality [default : '%default' ]"
77 ),
78 make_option(
79 "--Rtsne_initial_dims",
80 default = 50,
81 type = 'integer',
82 help = "The number of dimensions that should be retained in the initial PCA step [default : '%default' ]"
83 ),
84 make_option(
85 "--Rtsne_perplexity",
86 default = 5.0,
87 type = 'numeric',
88 help = "perplexity [default : '%default' ]"
89 ),
90 make_option(
91 "--Rtsne_theta",
92 default = 1.0,
93 type = 'numeric',
94 help = "theta [default : '%default' ]"
95 ),
96 make_option(
97 "--Rtsne_max_iter",
98 default = 1000,
99 type = 'integer',
100 help = "max_iter [default : '%default' ]"
101 ),
102 make_option(
103 "--Rtsne_pca",
104 default = TRUE,
105 type = 'logical',
106 help = "Whether an initial PCA step should be performed [default : '%default' ]"
107 ),
108 make_option(
109 "--Rtsne_pca_center",
110 default = TRUE,
111 type = 'logical',
112 help = "Should data be centered before pca is applied? [default : '%default' ]"
113 ),
114 make_option(
115 "--Rtsne_pca_scale",
116 default = FALSE,
117 type = 'logical',
118 help = "Should data be scaled before pca is applied? [default : '%default' ]"
119 ),
120 make_option(
121 "--Rtsne_normalize",
122 default = TRUE,
123 type = 'logical',
124 help = "Should data be normalized internally prior to distance calculations? [default : '%default' ]"
125 ),
126 make_option(
127 "--Rtsne_exaggeration_factor",
128 default = 12.0,
129 type = 'numeric',
130 help = " Exaggeration factor used to multiply the P matrix in the first part of the optimization [default : '%default' ]"
131 ),
132 make_option(
133 "--PCA_npc",
134 default = 5,
135 type = 'integer',
136 help = "number of dimensions kept in the results [default : '%default' ]"
137 ),
138 make_option(
139 "--PCA_x_axis",
140 default = 1,
141 type = 'integer',
142 help = "PC to plot in the x axis [default : '%default' ]"
143 ),
144 make_option(
145 "--PCA_y_axis",
146 default = 2,
147 type = 'integer',
148 help = "PC to plot in the y axis [default : '%default' ]"
149 ),
150 make_option(
151 "--HCPC_ncluster",
152 default = -1,
153 type = 'numeric',
154 help = "nb.clust, number of clusters to consider in the hierarchical clustering. [default : -1 let HCPC to optimize the number]"
155 ),
156 make_option(
157 "--HCPC_npc",
158 default = 5,
159 type = 'integer',
160 help = "npc, number of dimensions which are kept for HCPC analysis [default : '%default' ]"
161 ),
162 make_option(
163 "--HCPC_metric",
164 default = 'euclidian',
165 type = 'character',
166 help = "Metric to be used for calculating dissimilarities between observations, available 'euclidian' or 'manhattan' [default : '%default' ]"
167 ),
168 make_option(
169 "--HCPC_method",
170 default = 'ward',
171 type = 'character',
172 help = "Clustering method between 'ward','average','single', 'complete', 'weighted' [default :'%default']"
173 ),
174 make_option(
175 "--pdf_out",
176 default = "out.pdf",
177 type = 'character',
178 help = "pdf of plots [default : '%default' ]"
179 ),
180 make_option(
181 "--HCPC_consol",
182 default = 'TRUE',
183 type = 'logical',
184 help = "If TRUE, a k-means consolidation is performed [default :'%default']"
185 ),
186 make_option(
187 "--HCPC_itermax",
188 default = '10',
189 type = 'integer',
190 help = "The maximum number of iterations for the consolidation [default :'%default']"
191 ),
192 make_option(
193 "--HCPC_min",
194 default = '3',
195 type = 'integer',
196 help = "The least possible number of clusters suggested [default :'%default']"
197 ),
198 make_option(
199 "--HCPC_max",
200 default = -1,
201 type = 'integer',
202 help = "The higher possible number of clusters suggested [default :'%default']"
203 ),
204 make_option(
205 "--HCPC_clusterCA",
206 default = 'rows',
207 type = 'character',
208 help = "A string equals to 'rows' or 'columns' for the clustering of Correspondence Analysis results [default :'%default']"
209 ),
210 make_option(
211 "--HCPC_kk",
212 default = -1,
213 type = 'numeric',
214 help = "The maximum number of iterations for the consolidation [default :'%default']"
215 ),
216 make_option(
217 "--HCPC_clust",
218 default = "",
219 type = 'character',
220 help = "Output result of HCPC clustering : two column table (cell identifiers and clusters) [default :'%default']"
221 ),
222 make_option(
223 "--mutual_info",
224 default = "",
225 type = "character",
226 help = "Output file of external validation of HCPC clustering with factor levels [default :'%default']"
227 )
228 )
229
230 opt = parse_args(OptionParser(option_list = option_list),
231 args = commandArgs(trailingOnly = TRUE))
232
233 if (opt$sep == "tab") {opt$sep <- "\t"}
234 if (opt$sep == "comma") {opt$sep <- ","}
235 if(opt$HCPC_max == -1) {opt$HCPC_max <- NULL}
236 if(opt$HCPC_kk == -1) {opt$HCPC_kk <- Inf}
237
238 ##Add legend to plot()
239 legend.col <- function(col, lev){
240
241 opar <- par
242
243 n <- length(col)
244
245 bx <- par("usr")
246
247 box.cx <- c(bx[2] + (bx[2] - bx[1]) / 1000,
248 bx[2] + (bx[2] - bx[1]) / 1000 + (bx[2] - bx[1]) / 50)
249 box.cy <- c(bx[3], bx[3])
250 box.sy <- (bx[4] - bx[3]) / n
251
252 xx <- rep(box.cx, each = 2)
253
254 par(xpd = TRUE)
255 for(i in 1:n){
256
257 yy <- c(box.cy[1] + (box.sy * (i - 1)),
258 box.cy[1] + (box.sy * (i)),
259 box.cy[1] + (box.sy * (i)),
260 box.cy[1] + (box.sy * (i - 1)))
261 polygon(xx, yy, col = col[i], border = col[i])
262
263 }
264 par(new = TRUE)
265 plot(0, 0, type = "n",
266 ylim = c(min(lev), max(lev)),
267 yaxt = "n", ylab = "",
268 xaxt = "n", xlab = "",
269 frame.plot = FALSE)
270 axis(side = 4, las = 2, tick = FALSE, line = .25)
271 par <- opar
272 }
273
274
275 data = read.table(
276 opt$data,
277 check.names = FALSE,
278 header = opt$colnames,
279 row.names = 1,
280 sep = opt$sep
281 )
282
283 # Contrasting factor and its colors
284 if (opt$factor != '') {
285 contrasting_factor <- read.delim(
286 opt$factor,
287 header = TRUE
288 )
289 rownames(contrasting_factor) <- contrasting_factor[,1]
290 contrasting_factor <- contrasting_factor[colnames(data),]
291 colnames(contrasting_factor) <- c("id","factor")
292 if(is.numeric(contrasting_factor$factor)){
293 factor_cols <- rev(brewer.pal(n = 11, name = "RdYlGn"))[contrasting_factor$factor]
294 } else {
295 contrasting_factor$factor <- as.factor(contrasting_factor$factor)
296 if(nlevels(contrasting_factor$factor) == 2){
297 colors_labels <- c("#E41A1C", "#377EB8")
298 } else {
299 colors_labels <- brewer.pal(nlevels(contrasting_factor$factor), name = 'Paired')
300 }
301 factorColors <-
302 with(contrasting_factor,
303 data.frame(factor = levels(contrasting_factor$factor),
304 color = I(colors_labels)
305 )
306 )
307 factor_cols <- factorColors$color[match(contrasting_factor$factor,
308 factorColors$factor)]
309 }
310 } else {
311 factor_cols <- rep("deepskyblue4", length(rownames(data)))
312 }
313
314 ################ t-SNE ####################
315 if (opt$visu_choice == 'tSNE') {
316 # filter and transpose df for tsne and pca
317 tdf = t(data)
318 # make tsne and plot results
319 set.seed(opt$Rtsne_seed) ## Sets seed for reproducibility
320
321 tsne_out <- Rtsne(tdf,
322 dims = opt$Rtsne_dims,
323 initial_dims = opt$Rtsne_initial_dims,
324 perplexity = opt$Rtsne_perplexity ,
325 theta = opt$Rtsne_theta,
326 max_iter = opt$Rtsne_max_iter,
327 pca = opt$Rtsne_pca,
328 pca_center = opt$Rtsne_pca_center,
329 pca_scale = opt$Rtsne_pca_scale,
330 normalize = opt$Rtsne_normalize,
331 exaggeration_factor=opt$Rtsne_exaggeration_factor)
332
333 embedding <- as.data.frame(tsne_out$Y[,1:2])
334 embedding$Class <- as.factor(rownames(tdf))
335 gg_legend = theme(legend.position="right")
336 if (opt$factor == '') {
337 ggplot(embedding, aes(x=V1, y=V2)) +
338 geom_point(size=1, color='deepskyblue4') +
339 gg_legend +
340 xlab("t-SNE 1") +
341 ylab("t-SNE 2") +
342 ggtitle('t-SNE') +
343 if (opt$labels) {
344 geom_text(aes(label=Class),hjust=-0.2, vjust=-0.5, size=1.5, color='deepskyblue4')
345 }
346 } else {
347 if(is.numeric(contrasting_factor$factor)){
348 embedding$factor <- contrasting_factor$factor
349 } else {
350 embedding$factor <- as.factor(contrasting_factor$factor)
351 }
352
353 ggplot(embedding, aes(x=V1, y=V2, color=factor)) +
354 geom_point(size=1) + #, color=factor_cols
355 gg_legend +
356 xlab("t-SNE 1") +
357 ylab("t-SNE 2") +
358 ggtitle('t-SNE') +
359 if (opt$labels) {
360 geom_text(aes(label=Class, colour=factor),hjust=-0.2, vjust=-0.5, size=1.5)
361 }
362 }
363 ggsave(file=opt$pdf_out, device="pdf")
364
365 #save coordinates table
366 if(opt$table_coordinates != ''){
367 coord_table <- cbind(rownames(tdf), round(as.data.frame(tsne_out$Y), 6))
368 colnames(coord_table)=c("Cells",paste0("DIM",(1:opt$Rtsne_dims)))
369 }
370 }
371
372
373 ######### make PCA with FactoMineR #################
374 if (opt$visu_choice == 'PCA') {
375 pca <- PCA(t(data), ncp=opt$PCA_npc, graph=FALSE)
376 pdf(opt$pdf_out)
377 if (opt$labels == FALSE) {
378 plot(pca, axes = c(opt$PCA_x_axis,opt$PCA_y_axis), label="none" , col.ind = factor_cols)
379 } else {
380 plot(pca, axes = c(opt$PCA_x_axis,opt$PCA_y_axis), cex=0.2 , col.ind = factor_cols)
381 }
382 if (opt$factor != '') {
383 if(is.factor(contrasting_factor$factor)) {
384 legend(x = 'topright',
385 legend = as.character(factorColors$factor),
386 col = factorColors$color, pch = 16, bty = 'n', xjust = 1, cex=0.7)
387 } else {
388 legend.col(col = rev(brewer.pal(n = 11, name = "RdYlGn")), lev = cut(contrasting_factor$factor, 11, label = FALSE))
389 }
390 }
391 dev.off()
392
393 #save coordinates table
394 if(opt$table_coordinates != ''){
395 coord_table <- cbind(rownames(pca$ind$coord), round(as.data.frame(pca$ind$coord), 6))
396 colnames(coord_table)=c("Cells",paste0("DIM",(1:opt$PCA_npc)))
397 }
398
399 }
400
401 ########### make HCPC with FactoMineR ##########
402 if (opt$visu_choice == 'HCPC') {
403
404 # HCPC starts with a PCA
405 pca <- PCA(
406 t(data),
407 ncp = opt$HCPC_npc,
408 graph = FALSE,
409 )
410
411 PCA_IndCoord = as.data.frame(pca$ind$coord) # coordinates of observations in PCA
412
413 # Hierarchical Clustering On Principal Components Followed By Kmean Clustering
414 res.hcpc <- HCPC(pca,
415 nb.clust=opt$HCPC_ncluster, metric=opt$HCPC_metric, method=opt$HCPC_method,
416 graph=F,consol=opt$HCPC_consol,iter.max=opt$HCPC_itermax,min=opt$HCPC_min,max=opt$HCPC_max,
417 cluster.CA=opt$HCPC_clusterCA,kk=opt$HCPC_kk)
418 # HCPC plots
419 dims <- head(as.data.frame(res.hcpc$call$t$res$eig),2) # dims variances in column 2
420 pdf(opt$pdf_out)
421 plot(res.hcpc, choice="tree")
422 plot(res.hcpc, choice="bar")
423 plot(res.hcpc, choice="3D.map")
424 if (opt$labels == FALSE) {
425 plot(res.hcpc, choice="map", label="none")
426 } else {
427 plot(res.hcpc, choice="map")
428 }
429
430 # user contrasts on the pca
431 if (opt$factor != '') {
432 plot(pca, label="none", col.ind = factor_cols)
433 if(is.factor(contrasting_factor$factor)) {
434 legend(x = 'topright',
435 legend = as.character(factorColors$factor),
436 col = factorColors$color, pch = 16, bty = 'n', xjust = 1, cex=0.7)
437
438 ## Normalized Mutual Information
439 sink(opt$mutual_info)
440 res <- external_validation(
441 true_labels = as.numeric(contrasting_factor$factor),
442 clusters = as.numeric(res.hcpc$data.clust$clust),
443 summary_stats = TRUE
444 )
445 sink()
446
447 } else {
448 legend.col(col = rev(brewer.pal(n = 11, name = "RdYlGn")), lev = cut(contrasting_factor$factor, 11, label = FALSE))
449 }
450 }
451 ## Clusters to which individual observations belong # used ?
452 # Clust <- data.frame(Cluster = res.hcpc$data.clust[, (nrow(data) + 1)],
453 # Observation = rownames(res.hcpc$data.clust))
454 # metadata <- data.frame(Observation=colnames(data), row.names=colnames(data))
455 # metadata = merge(y = metadata,
456 # x = Clust,
457 # by = "Observation")
458
459 # unclear utility
460 # ObsNumberPerCluster = as.data.frame(table(metadata$Cluster))
461 # colnames(ObsNumberPerCluster) = c("Cluster", "ObsNumber")
462 #
463 # ## Silhouette Plot # not used
464 # hc.cut = hcut(PCA_IndCoord,
465 # k = nlevels(metadata$Cluster),
466 # hc_method = "ward.D2")
467 #
468 # Sil = fviz_silhouette(hc.cut)
469 # sil1 = as.data.frame(Sil$data)
470
471 dev.off()
472
473 if(opt$table_coordinates != ''){
474 coord_table <- cbind(Cell=rownames(res.hcpc$call$X),
475 round(as.data.frame(res.hcpc$call$X[, -length(res.hcpc$call$X)]), 6),
476 as.data.frame(res.hcpc$call$X[, length(res.hcpc$call$X)])
477 )
478 colnames(coord_table)=c("Cells",paste0("DIM",(1:opt$HCPC_npc)),"Cluster")
479 }
480
481 if(opt$HCPC_clust != ""){
482 res_clustering <- data.frame(Cell = rownames(res.hcpc$data.clust),
483 Cluster = res.hcpc$data.clust$clust)
484
485 }
486
487 }
488
489 ## Return coordinates file to user
490
491 if(opt$table_coordinates != ''){
492 write.table(
493 coord_table,
494 file = opt$table_coordinates,
495 sep = "\t",
496 quote = F,
497 col.names = T,
498 row.names = F
499 )
500 }
501
502
503 if(opt$HCPC_clust != ""){
504 write.table(
505 res_clustering,
506 file = opt$HCPC_clust,
507 sep = "\t",
508 quote = F,
509 col.names = T,
510 row.names = F
511 )
512 }
513
514
515
516
517
518
519
520
521
522
523