Mercurial > repos > artbio > gsc_high_dimensions_visualisation
comparison high_dim_visu.R @ 0:241dd93219d7 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/gsc_high_dimension_visualization commit 1282ac9de7c926ab251f88afb2453f52c8b14200
| author | artbio |
|---|---|
| date | Thu, 11 Jul 2019 12:31:00 -0400 |
| parents | |
| children | 8e6ce12edd90 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:241dd93219d7 |
|---|---|
| 1 # load packages that are provided in the conda env | |
| 2 options( show.error.messages=F, | |
| 3 error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) | |
| 4 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") | |
| 5 requiredPackages = c('optparse', 'Rtsne', 'ggplot2', 'ggfortify') | |
| 6 warnings() | |
| 7 library(optparse) | |
| 8 library(FactoMineR) | |
| 9 library(factoextra) | |
| 10 library(Rtsne) | |
| 11 library(ggplot2) | |
| 12 library(ggfortify) | |
| 13 library(RColorBrewer) | |
| 14 library(ClusterR) | |
| 15 | |
| 16 # Arguments | |
| 17 option_list = list( | |
| 18 make_option( | |
| 19 "--data", | |
| 20 default = NA, | |
| 21 type = 'character', | |
| 22 help = "Input file that contains expression value to visualise" | |
| 23 ), | |
| 24 make_option( | |
| 25 "--sep", | |
| 26 default = '\t', | |
| 27 type = 'character', | |
| 28 help = "File separator [default : '%default' ]" | |
| 29 ), | |
| 30 make_option( | |
| 31 "--colnames", | |
| 32 default = TRUE, | |
| 33 type = 'logical', | |
| 34 help = "Consider first line as header ? [default : '%default' ]" | |
| 35 ), | |
| 36 make_option( | |
| 37 "--out", | |
| 38 default = "res.tab", | |
| 39 type = 'character', | |
| 40 help = "Output name [default : '%default' ]" | |
| 41 ), | |
| 42 make_option( | |
| 43 "--labels", | |
| 44 default = FALSE, | |
| 45 type = 'logical', | |
| 46 help = "add labels in scatter plots [default : '%default' ]" | |
| 47 ), | |
| 48 make_option( | |
| 49 "--factor", | |
| 50 default = '', | |
| 51 type = 'character', | |
| 52 help = "A two column table that specifies factor levels for contrasting data [default : '%default' ]" | |
| 53 ), | |
| 54 make_option( | |
| 55 "--visu_choice", | |
| 56 default = 'PCA', | |
| 57 type = 'character', | |
| 58 help = "visualisation method ('PCA', 'tSNE', 'HCPC') [default : '%default' ]" | |
| 59 ), | |
| 60 make_option( | |
| 61 "--table_coordinates", | |
| 62 default = '', | |
| 63 type = 'character', | |
| 64 help = "Table with plot coordinates [default : '%default' ]" | |
| 65 ), | |
| 66 make_option( | |
| 67 "--Rtsne_seed", | |
| 68 default = 42, | |
| 69 type = 'integer', | |
| 70 help = "Seed value for reproducibility [default : '%default' ]" | |
| 71 ), | |
| 72 make_option( | |
| 73 "--Rtsne_dims", | |
| 74 default = 2, | |
| 75 type = 'integer', | |
| 76 help = "Output dimensionality [default : '%default' ]" | |
| 77 ), | |
| 78 make_option( | |
| 79 "--Rtsne_initial_dims", | |
| 80 default = 50, | |
| 81 type = 'integer', | |
| 82 help = "The number of dimensions that should be retained in the initial PCA step [default : '%default' ]" | |
| 83 ), | |
| 84 make_option( | |
| 85 "--Rtsne_perplexity", | |
| 86 default = 5.0, | |
| 87 type = 'numeric', | |
| 88 help = "perplexity [default : '%default' ]" | |
| 89 ), | |
| 90 make_option( | |
| 91 "--Rtsne_theta", | |
| 92 default = 1.0, | |
| 93 type = 'numeric', | |
| 94 help = "theta [default : '%default' ]" | |
| 95 ), | |
| 96 make_option( | |
| 97 "--Rtsne_max_iter", | |
| 98 default = 1000, | |
| 99 type = 'integer', | |
| 100 help = "max_iter [default : '%default' ]" | |
| 101 ), | |
| 102 make_option( | |
| 103 "--Rtsne_pca", | |
| 104 default = TRUE, | |
| 105 type = 'logical', | |
| 106 help = "Whether an initial PCA step should be performed [default : '%default' ]" | |
| 107 ), | |
| 108 make_option( | |
| 109 "--Rtsne_pca_center", | |
| 110 default = TRUE, | |
| 111 type = 'logical', | |
| 112 help = "Should data be centered before pca is applied? [default : '%default' ]" | |
| 113 ), | |
| 114 make_option( | |
| 115 "--Rtsne_pca_scale", | |
| 116 default = FALSE, | |
| 117 type = 'logical', | |
| 118 help = "Should data be scaled before pca is applied? [default : '%default' ]" | |
| 119 ), | |
| 120 make_option( | |
| 121 "--Rtsne_normalize", | |
| 122 default = TRUE, | |
| 123 type = 'logical', | |
| 124 help = "Should data be normalized internally prior to distance calculations? [default : '%default' ]" | |
| 125 ), | |
| 126 make_option( | |
| 127 "--Rtsne_exaggeration_factor", | |
| 128 default = 12.0, | |
| 129 type = 'numeric', | |
| 130 help = " Exaggeration factor used to multiply the P matrix in the first part of the optimization [default : '%default' ]" | |
| 131 ), | |
| 132 make_option( | |
| 133 "--PCA_npc", | |
| 134 default = 5, | |
| 135 type = 'integer', | |
| 136 help = "number of dimensions kept in the results [default : '%default' ]" | |
| 137 ), | |
| 138 make_option( | |
| 139 "--PCA_x_axis", | |
| 140 default = 1, | |
| 141 type = 'integer', | |
| 142 help = "PC to plot in the x axis [default : '%default' ]" | |
| 143 ), | |
| 144 make_option( | |
| 145 "--PCA_y_axis", | |
| 146 default = 2, | |
| 147 type = 'integer', | |
| 148 help = "PC to plot in the y axis [default : '%default' ]" | |
| 149 ), | |
| 150 make_option( | |
| 151 "--HCPC_ncluster", | |
| 152 default = -1, | |
| 153 type = 'numeric', | |
| 154 help = "nb.clust, number of clusters to consider in the hierarchical clustering. [default : -1 let HCPC to optimize the number]" | |
| 155 ), | |
| 156 make_option( | |
| 157 "--HCPC_npc", | |
| 158 default = 5, | |
| 159 type = 'integer', | |
| 160 help = "npc, number of dimensions which are kept for HCPC analysis [default : '%default' ]" | |
| 161 ), | |
| 162 make_option( | |
| 163 "--HCPC_metric", | |
| 164 default = 'euclidian', | |
| 165 type = 'character', | |
| 166 help = "Metric to be used for calculating dissimilarities between observations, available 'euclidian' or 'manhattan' [default : '%default' ]" | |
| 167 ), | |
| 168 make_option( | |
| 169 "--HCPC_method", | |
| 170 default = 'ward', | |
| 171 type = 'character', | |
| 172 help = "Clustering method between 'ward','average','single', 'complete', 'weighted' [default :'%default']" | |
| 173 ), | |
| 174 make_option( | |
| 175 "--pdf_out", | |
| 176 default = "out.pdf", | |
| 177 type = 'character', | |
| 178 help = "pdf of plots [default : '%default' ]" | |
| 179 ), | |
| 180 make_option( | |
| 181 "--HCPC_consol", | |
| 182 default = 'TRUE', | |
| 183 type = 'logical', | |
| 184 help = "If TRUE, a k-means consolidation is performed [default :'%default']" | |
| 185 ), | |
| 186 make_option( | |
| 187 "--HCPC_itermax", | |
| 188 default = '10', | |
| 189 type = 'integer', | |
| 190 help = "The maximum number of iterations for the consolidation [default :'%default']" | |
| 191 ), | |
| 192 make_option( | |
| 193 "--HCPC_min", | |
| 194 default = '3', | |
| 195 type = 'integer', | |
| 196 help = "The least possible number of clusters suggested [default :'%default']" | |
| 197 ), | |
| 198 make_option( | |
| 199 "--HCPC_max", | |
| 200 default = -1, | |
| 201 type = 'integer', | |
| 202 help = "The higher possible number of clusters suggested [default :'%default']" | |
| 203 ), | |
| 204 make_option( | |
| 205 "--HCPC_clusterCA", | |
| 206 default = 'rows', | |
| 207 type = 'character', | |
| 208 help = "A string equals to 'rows' or 'columns' for the clustering of Correspondence Analysis results [default :'%default']" | |
| 209 ), | |
| 210 make_option( | |
| 211 "--HCPC_kk", | |
| 212 default = -1, | |
| 213 type = 'numeric', | |
| 214 help = "The maximum number of iterations for the consolidation [default :'%default']" | |
| 215 ), | |
| 216 make_option( | |
| 217 "--HCPC_clust", | |
| 218 default = "", | |
| 219 type = 'character', | |
| 220 help = "Output result of HCPC clustering : two column table (cell identifiers and clusters) [default :'%default']" | |
| 221 ), | |
| 222 make_option( | |
| 223 "--mutual_info", | |
| 224 default = "", | |
| 225 type = "character", | |
| 226 help = "Output file of external validation of HCPC clustering with factor levels [default :'%default']" | |
| 227 ) | |
| 228 ) | |
| 229 | |
| 230 opt = parse_args(OptionParser(option_list = option_list), | |
| 231 args = commandArgs(trailingOnly = TRUE)) | |
| 232 | |
| 233 if (opt$sep == "tab") {opt$sep <- "\t"} | |
| 234 if (opt$sep == "comma") {opt$sep <- ","} | |
| 235 if(opt$HCPC_max == -1) {opt$HCPC_max <- NULL} | |
| 236 if(opt$HCPC_kk == -1) {opt$HCPC_kk <- Inf} | |
| 237 | |
| 238 ##Add legend to plot() | |
| 239 legend.col <- function(col, lev){ | |
| 240 | |
| 241 opar <- par | |
| 242 | |
| 243 n <- length(col) | |
| 244 | |
| 245 bx <- par("usr") | |
| 246 | |
| 247 box.cx <- c(bx[2] + (bx[2] - bx[1]) / 1000, | |
| 248 bx[2] + (bx[2] - bx[1]) / 1000 + (bx[2] - bx[1]) / 50) | |
| 249 box.cy <- c(bx[3], bx[3]) | |
| 250 box.sy <- (bx[4] - bx[3]) / n | |
| 251 | |
| 252 xx <- rep(box.cx, each = 2) | |
| 253 | |
| 254 par(xpd = TRUE) | |
| 255 for(i in 1:n){ | |
| 256 | |
| 257 yy <- c(box.cy[1] + (box.sy * (i - 1)), | |
| 258 box.cy[1] + (box.sy * (i)), | |
| 259 box.cy[1] + (box.sy * (i)), | |
| 260 box.cy[1] + (box.sy * (i - 1))) | |
| 261 polygon(xx, yy, col = col[i], border = col[i]) | |
| 262 | |
| 263 } | |
| 264 par(new = TRUE) | |
| 265 plot(0, 0, type = "n", | |
| 266 ylim = c(min(lev), max(lev)), | |
| 267 yaxt = "n", ylab = "", | |
| 268 xaxt = "n", xlab = "", | |
| 269 frame.plot = FALSE) | |
| 270 axis(side = 4, las = 2, tick = FALSE, line = .25) | |
| 271 par <- opar | |
| 272 } | |
| 273 | |
| 274 | |
| 275 data = read.table( | |
| 276 opt$data, | |
| 277 check.names = FALSE, | |
| 278 header = opt$colnames, | |
| 279 row.names = 1, | |
| 280 sep = opt$sep | |
| 281 ) | |
| 282 | |
| 283 # Contrasting factor and its colors | |
| 284 if (opt$factor != '') { | |
| 285 contrasting_factor <- read.delim( | |
| 286 opt$factor, | |
| 287 header = TRUE | |
| 288 ) | |
| 289 rownames(contrasting_factor) <- contrasting_factor[,1] | |
| 290 contrasting_factor <- contrasting_factor[colnames(data),] | |
| 291 colnames(contrasting_factor) <- c("id","factor") | |
| 292 if(is.numeric(contrasting_factor$factor)){ | |
| 293 factor_cols <- rev(brewer.pal(n = 11, name = "RdYlGn"))[contrasting_factor$factor] | |
| 294 } else { | |
| 295 contrasting_factor$factor <- as.factor(contrasting_factor$factor) | |
| 296 if(nlevels(contrasting_factor$factor) == 2){ | |
| 297 colors_labels <- c("#E41A1C", "#377EB8") | |
| 298 } else { | |
| 299 colors_labels <- brewer.pal(nlevels(contrasting_factor$factor), name = 'Paired') | |
| 300 } | |
| 301 factorColors <- | |
| 302 with(contrasting_factor, | |
| 303 data.frame(factor = levels(contrasting_factor$factor), | |
| 304 color = I(colors_labels) | |
| 305 ) | |
| 306 ) | |
| 307 factor_cols <- factorColors$color[match(contrasting_factor$factor, | |
| 308 factorColors$factor)] | |
| 309 } | |
| 310 } else { | |
| 311 factor_cols <- rep("deepskyblue4", length(rownames(data))) | |
| 312 } | |
| 313 | |
| 314 ################ t-SNE #################### | |
| 315 if (opt$visu_choice == 'tSNE') { | |
| 316 # filter and transpose df for tsne and pca | |
| 317 tdf = t(data) | |
| 318 # make tsne and plot results | |
| 319 set.seed(opt$Rtsne_seed) ## Sets seed for reproducibility | |
| 320 | |
| 321 tsne_out <- Rtsne(tdf, | |
| 322 dims = opt$Rtsne_dims, | |
| 323 initial_dims = opt$Rtsne_initial_dims, | |
| 324 perplexity = opt$Rtsne_perplexity , | |
| 325 theta = opt$Rtsne_theta, | |
| 326 max_iter = opt$Rtsne_max_iter, | |
| 327 pca = opt$Rtsne_pca, | |
| 328 pca_center = opt$Rtsne_pca_center, | |
| 329 pca_scale = opt$Rtsne_pca_scale, | |
| 330 normalize = opt$Rtsne_normalize, | |
| 331 exaggeration_factor=opt$Rtsne_exaggeration_factor) | |
| 332 | |
| 333 embedding <- as.data.frame(tsne_out$Y[,1:2]) | |
| 334 embedding$Class <- as.factor(rownames(tdf)) | |
| 335 gg_legend = theme(legend.position="right") | |
| 336 if (opt$factor == '') { | |
| 337 ggplot(embedding, aes(x=V1, y=V2)) + | |
| 338 geom_point(size=1, color='deepskyblue4') + | |
| 339 gg_legend + | |
| 340 xlab("t-SNE 1") + | |
| 341 ylab("t-SNE 2") + | |
| 342 ggtitle('t-SNE') + | |
| 343 if (opt$labels) { | |
| 344 geom_text(aes(label=Class),hjust=-0.2, vjust=-0.5, size=1.5, color='deepskyblue4') | |
| 345 } | |
| 346 } else { | |
| 347 if(is.numeric(contrasting_factor$factor)){ | |
| 348 embedding$factor <- contrasting_factor$factor | |
| 349 } else { | |
| 350 embedding$factor <- as.factor(contrasting_factor$factor) | |
| 351 } | |
| 352 | |
| 353 ggplot(embedding, aes(x=V1, y=V2, color=factor)) + | |
| 354 geom_point(size=1) + #, color=factor_cols | |
| 355 gg_legend + | |
| 356 xlab("t-SNE 1") + | |
| 357 ylab("t-SNE 2") + | |
| 358 ggtitle('t-SNE') + | |
| 359 if (opt$labels) { | |
| 360 geom_text(aes(label=Class, colour=factor),hjust=-0.2, vjust=-0.5, size=1.5) | |
| 361 } | |
| 362 } | |
| 363 ggsave(file=opt$pdf_out, device="pdf") | |
| 364 | |
| 365 #save coordinates table | |
| 366 if(opt$table_coordinates != ''){ | |
| 367 coord_table <- cbind(rownames(tdf), round(as.data.frame(tsne_out$Y), 6)) | |
| 368 colnames(coord_table)=c("Cells",paste0("DIM",(1:opt$Rtsne_dims))) | |
| 369 } | |
| 370 } | |
| 371 | |
| 372 | |
| 373 ######### make PCA with FactoMineR ################# | |
| 374 if (opt$visu_choice == 'PCA') { | |
| 375 pca <- PCA(t(data), ncp=opt$PCA_npc, graph=FALSE) | |
| 376 pdf(opt$pdf_out) | |
| 377 if (opt$labels == FALSE) { | |
| 378 plot(pca, axes = c(opt$PCA_x_axis,opt$PCA_y_axis), label="none" , col.ind = factor_cols) | |
| 379 } else { | |
| 380 plot(pca, axes = c(opt$PCA_x_axis,opt$PCA_y_axis), cex=0.2 , col.ind = factor_cols) | |
| 381 } | |
| 382 if (opt$factor != '') { | |
| 383 if(is.factor(contrasting_factor$factor)) { | |
| 384 legend(x = 'topright', | |
| 385 legend = as.character(factorColors$factor), | |
| 386 col = factorColors$color, pch = 16, bty = 'n', xjust = 1, cex=0.7) | |
| 387 } else { | |
| 388 legend.col(col = rev(brewer.pal(n = 11, name = "RdYlGn")), lev = cut(contrasting_factor$factor, 11, label = FALSE)) | |
| 389 } | |
| 390 } | |
| 391 dev.off() | |
| 392 | |
| 393 #save coordinates table | |
| 394 if(opt$table_coordinates != ''){ | |
| 395 coord_table <- cbind(rownames(pca$ind$coord), round(as.data.frame(pca$ind$coord), 6)) | |
| 396 colnames(coord_table)=c("Cells",paste0("DIM",(1:opt$PCA_npc))) | |
| 397 } | |
| 398 | |
| 399 } | |
| 400 | |
| 401 ########### make HCPC with FactoMineR ########## | |
| 402 if (opt$visu_choice == 'HCPC') { | |
| 403 | |
| 404 # HCPC starts with a PCA | |
| 405 pca <- PCA( | |
| 406 t(data), | |
| 407 ncp = opt$HCPC_npc, | |
| 408 graph = FALSE, | |
| 409 ) | |
| 410 | |
| 411 PCA_IndCoord = as.data.frame(pca$ind$coord) # coordinates of observations in PCA | |
| 412 | |
| 413 # Hierarchical Clustering On Principal Components Followed By Kmean Clustering | |
| 414 res.hcpc <- HCPC(pca, | |
| 415 nb.clust=opt$HCPC_ncluster, metric=opt$HCPC_metric, method=opt$HCPC_method, | |
| 416 graph=F,consol=opt$HCPC_consol,iter.max=opt$HCPC_itermax,min=opt$HCPC_min,max=opt$HCPC_max, | |
| 417 cluster.CA=opt$HCPC_clusterCA,kk=opt$HCPC_kk) | |
| 418 # HCPC plots | |
| 419 dims <- head(as.data.frame(res.hcpc$call$t$res$eig),2) # dims variances in column 2 | |
| 420 pdf(opt$pdf_out) | |
| 421 plot(res.hcpc, choice="tree") | |
| 422 plot(res.hcpc, choice="bar") | |
| 423 plot(res.hcpc, choice="3D.map") | |
| 424 if (opt$labels == FALSE) { | |
| 425 plot(res.hcpc, choice="map", label="none") | |
| 426 } else { | |
| 427 plot(res.hcpc, choice="map") | |
| 428 } | |
| 429 | |
| 430 # user contrasts on the pca | |
| 431 if (opt$factor != '') { | |
| 432 plot(pca, label="none", col.ind = factor_cols) | |
| 433 if(is.factor(contrasting_factor$factor)) { | |
| 434 legend(x = 'topright', | |
| 435 legend = as.character(factorColors$factor), | |
| 436 col = factorColors$color, pch = 16, bty = 'n', xjust = 1, cex=0.7) | |
| 437 | |
| 438 ## Normalized Mutual Information | |
| 439 sink(opt$mutual_info) | |
| 440 res <- external_validation( | |
| 441 true_labels = as.numeric(contrasting_factor$factor), | |
| 442 clusters = as.numeric(res.hcpc$data.clust$clust), | |
| 443 summary_stats = TRUE | |
| 444 ) | |
| 445 sink() | |
| 446 | |
| 447 } else { | |
| 448 legend.col(col = rev(brewer.pal(n = 11, name = "RdYlGn")), lev = cut(contrasting_factor$factor, 11, label = FALSE)) | |
| 449 } | |
| 450 } | |
| 451 ## Clusters to which individual observations belong # used ? | |
| 452 # Clust <- data.frame(Cluster = res.hcpc$data.clust[, (nrow(data) + 1)], | |
| 453 # Observation = rownames(res.hcpc$data.clust)) | |
| 454 # metadata <- data.frame(Observation=colnames(data), row.names=colnames(data)) | |
| 455 # metadata = merge(y = metadata, | |
| 456 # x = Clust, | |
| 457 # by = "Observation") | |
| 458 | |
| 459 # unclear utility | |
| 460 # ObsNumberPerCluster = as.data.frame(table(metadata$Cluster)) | |
| 461 # colnames(ObsNumberPerCluster) = c("Cluster", "ObsNumber") | |
| 462 # | |
| 463 # ## Silhouette Plot # not used | |
| 464 # hc.cut = hcut(PCA_IndCoord, | |
| 465 # k = nlevels(metadata$Cluster), | |
| 466 # hc_method = "ward.D2") | |
| 467 # | |
| 468 # Sil = fviz_silhouette(hc.cut) | |
| 469 # sil1 = as.data.frame(Sil$data) | |
| 470 | |
| 471 dev.off() | |
| 472 | |
| 473 if(opt$table_coordinates != ''){ | |
| 474 coord_table <- cbind(Cell=rownames(res.hcpc$call$X), | |
| 475 round(as.data.frame(res.hcpc$call$X[, -length(res.hcpc$call$X)]), 6), | |
| 476 as.data.frame(res.hcpc$call$X[, length(res.hcpc$call$X)]) | |
| 477 ) | |
| 478 colnames(coord_table)=c("Cells",paste0("DIM",(1:opt$HCPC_npc)),"Cluster") | |
| 479 } | |
| 480 | |
| 481 if(opt$HCPC_clust != ""){ | |
| 482 res_clustering <- data.frame(Cell = rownames(res.hcpc$data.clust), | |
| 483 Cluster = res.hcpc$data.clust$clust) | |
| 484 | |
| 485 } | |
| 486 | |
| 487 } | |
| 488 | |
| 489 ## Return coordinates file to user | |
| 490 | |
| 491 if(opt$table_coordinates != ''){ | |
| 492 write.table( | |
| 493 coord_table, | |
| 494 file = opt$table_coordinates, | |
| 495 sep = "\t", | |
| 496 quote = F, | |
| 497 col.names = T, | |
| 498 row.names = F | |
| 499 ) | |
| 500 } | |
| 501 | |
| 502 | |
| 503 if(opt$HCPC_clust != ""){ | |
| 504 write.table( | |
| 505 res_clustering, | |
| 506 file = opt$HCPC_clust, | |
| 507 sep = "\t", | |
| 508 quote = F, | |
| 509 col.names = T, | |
| 510 row.names = F | |
| 511 ) | |
| 512 } | |
| 513 | |
| 514 | |
| 515 | |
| 516 | |
| 517 | |
| 518 | |
| 519 | |
| 520 | |
| 521 | |
| 522 | |
| 523 |
