Mercurial > repos > artbio > gsc_high_dimensions_visualisation
diff high_dim_visu.R @ 0:241dd93219d7 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/gsc_high_dimension_visualization commit 1282ac9de7c926ab251f88afb2453f52c8b14200
| author | artbio | 
|---|---|
| date | Thu, 11 Jul 2019 12:31:00 -0400 | 
| parents | |
| children | 8e6ce12edd90 | 
line wrap: on
 line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/high_dim_visu.R Thu Jul 11 12:31:00 2019 -0400 @@ -0,0 +1,523 @@ +# load packages that are provided in the conda env +options( show.error.messages=F, + error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) +loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") +requiredPackages = c('optparse', 'Rtsne', 'ggplot2', 'ggfortify') +warnings() +library(optparse) +library(FactoMineR) +library(factoextra) +library(Rtsne) +library(ggplot2) +library(ggfortify) +library(RColorBrewer) +library(ClusterR) + +# Arguments +option_list = list( + make_option( + "--data", + default = NA, + type = 'character', + help = "Input file that contains expression value to visualise" + ), + make_option( + "--sep", + default = '\t', + type = 'character', + help = "File separator [default : '%default' ]" + ), + make_option( + "--colnames", + default = TRUE, + type = 'logical', + help = "Consider first line as header ? [default : '%default' ]" + ), + make_option( + "--out", + default = "res.tab", + type = 'character', + help = "Output name [default : '%default' ]" + ), + make_option( + "--labels", + default = FALSE, + type = 'logical', + help = "add labels in scatter plots [default : '%default' ]" + ), + make_option( + "--factor", + default = '', + type = 'character', + help = "A two column table that specifies factor levels for contrasting data [default : '%default' ]" + ), + make_option( + "--visu_choice", + default = 'PCA', + type = 'character', + help = "visualisation method ('PCA', 'tSNE', 'HCPC') [default : '%default' ]" + ), + make_option( + "--table_coordinates", + default = '', + type = 'character', + help = "Table with plot coordinates [default : '%default' ]" + ), + make_option( + "--Rtsne_seed", + default = 42, + type = 'integer', + help = "Seed value for reproducibility [default : '%default' ]" + ), + make_option( + "--Rtsne_dims", + default = 2, + type = 'integer', + help = "Output dimensionality [default : '%default' ]" + ), + make_option( + "--Rtsne_initial_dims", + default = 50, + type = 'integer', + help = "The number of dimensions that should be retained in the initial PCA step [default : '%default' ]" + ), + make_option( + "--Rtsne_perplexity", + default = 5.0, + type = 'numeric', + help = "perplexity [default : '%default' ]" + ), + make_option( + "--Rtsne_theta", + default = 1.0, + type = 'numeric', + help = "theta [default : '%default' ]" + ), + make_option( + "--Rtsne_max_iter", + default = 1000, + type = 'integer', + help = "max_iter [default : '%default' ]" + ), + make_option( + "--Rtsne_pca", + default = TRUE, + type = 'logical', + help = "Whether an initial PCA step should be performed [default : '%default' ]" + ), + make_option( + "--Rtsne_pca_center", + default = TRUE, + type = 'logical', + help = "Should data be centered before pca is applied? [default : '%default' ]" + ), + make_option( + "--Rtsne_pca_scale", + default = FALSE, + type = 'logical', + help = "Should data be scaled before pca is applied? [default : '%default' ]" + ), + make_option( + "--Rtsne_normalize", + default = TRUE, + type = 'logical', + help = "Should data be normalized internally prior to distance calculations? [default : '%default' ]" + ), + make_option( + "--Rtsne_exaggeration_factor", + default = 12.0, + type = 'numeric', + help = " Exaggeration factor used to multiply the P matrix in the first part of the optimization [default : '%default' ]" + ), + make_option( + "--PCA_npc", + default = 5, + type = 'integer', + help = "number of dimensions kept in the results [default : '%default' ]" + ), + make_option( + "--PCA_x_axis", + default = 1, + type = 'integer', + help = "PC to plot in the x axis [default : '%default' ]" + ), + make_option( + "--PCA_y_axis", + default = 2, + type = 'integer', + help = "PC to plot in the y axis [default : '%default' ]" + ), + make_option( + "--HCPC_ncluster", + default = -1, + type = 'numeric', + help = "nb.clust, number of clusters to consider in the hierarchical clustering. [default : -1 let HCPC to optimize the number]" + ), + make_option( + "--HCPC_npc", + default = 5, + type = 'integer', + help = "npc, number of dimensions which are kept for HCPC analysis [default : '%default' ]" + ), + make_option( + "--HCPC_metric", + default = 'euclidian', + type = 'character', + help = "Metric to be used for calculating dissimilarities between observations, available 'euclidian' or 'manhattan' [default : '%default' ]" + ), + make_option( + "--HCPC_method", + default = 'ward', + type = 'character', + help = "Clustering method between 'ward','average','single', 'complete', 'weighted' [default :'%default']" + ), + make_option( + "--pdf_out", + default = "out.pdf", + type = 'character', + help = "pdf of plots [default : '%default' ]" + ), + make_option( + "--HCPC_consol", + default = 'TRUE', + type = 'logical', + help = "If TRUE, a k-means consolidation is performed [default :'%default']" + ), + make_option( + "--HCPC_itermax", + default = '10', + type = 'integer', + help = "The maximum number of iterations for the consolidation [default :'%default']" + ), + make_option( + "--HCPC_min", + default = '3', + type = 'integer', + help = "The least possible number of clusters suggested [default :'%default']" + ), + make_option( + "--HCPC_max", + default = -1, + type = 'integer', + help = "The higher possible number of clusters suggested [default :'%default']" + ), + make_option( + "--HCPC_clusterCA", + default = 'rows', + type = 'character', + help = "A string equals to 'rows' or 'columns' for the clustering of Correspondence Analysis results [default :'%default']" + ), + make_option( + "--HCPC_kk", + default = -1, + type = 'numeric', + help = "The maximum number of iterations for the consolidation [default :'%default']" + ), + make_option( + "--HCPC_clust", + default = "", + type = 'character', + help = "Output result of HCPC clustering : two column table (cell identifiers and clusters) [default :'%default']" + ), + make_option( + "--mutual_info", + default = "", + type = "character", + help = "Output file of external validation of HCPC clustering with factor levels [default :'%default']" + ) +) + +opt = parse_args(OptionParser(option_list = option_list), + args = commandArgs(trailingOnly = TRUE)) + +if (opt$sep == "tab") {opt$sep <- "\t"} +if (opt$sep == "comma") {opt$sep <- ","} +if(opt$HCPC_max == -1) {opt$HCPC_max <- NULL} +if(opt$HCPC_kk == -1) {opt$HCPC_kk <- Inf} + +##Add legend to plot() +legend.col <- function(col, lev){ + +opar <- par + +n <- length(col) + +bx <- par("usr") + +box.cx <- c(bx[2] + (bx[2] - bx[1]) / 1000, +bx[2] + (bx[2] - bx[1]) / 1000 + (bx[2] - bx[1]) / 50) +box.cy <- c(bx[3], bx[3]) +box.sy <- (bx[4] - bx[3]) / n + +xx <- rep(box.cx, each = 2) + +par(xpd = TRUE) +for(i in 1:n){ + +yy <- c(box.cy[1] + (box.sy * (i - 1)), +box.cy[1] + (box.sy * (i)), +box.cy[1] + (box.sy * (i)), +box.cy[1] + (box.sy * (i - 1))) +polygon(xx, yy, col = col[i], border = col[i]) + +} +par(new = TRUE) +plot(0, 0, type = "n", +ylim = c(min(lev), max(lev)), +yaxt = "n", ylab = "", +xaxt = "n", xlab = "", +frame.plot = FALSE) +axis(side = 4, las = 2, tick = FALSE, line = .25) +par <- opar +} + + +data = read.table( + opt$data, + check.names = FALSE, + header = opt$colnames, + row.names = 1, + sep = opt$sep +) + +# Contrasting factor and its colors +if (opt$factor != '') { + contrasting_factor <- read.delim( + opt$factor, + header = TRUE + ) + rownames(contrasting_factor) <- contrasting_factor[,1] + contrasting_factor <- contrasting_factor[colnames(data),] + colnames(contrasting_factor) <- c("id","factor") + if(is.numeric(contrasting_factor$factor)){ + factor_cols <- rev(brewer.pal(n = 11, name = "RdYlGn"))[contrasting_factor$factor] + } else { + contrasting_factor$factor <- as.factor(contrasting_factor$factor) + if(nlevels(contrasting_factor$factor) == 2){ + colors_labels <- c("#E41A1C", "#377EB8") + } else { + colors_labels <- brewer.pal(nlevels(contrasting_factor$factor), name = 'Paired') + } + factorColors <- + with(contrasting_factor, + data.frame(factor = levels(contrasting_factor$factor), + color = I(colors_labels) + ) + ) + factor_cols <- factorColors$color[match(contrasting_factor$factor, + factorColors$factor)] + } +} else { + factor_cols <- rep("deepskyblue4", length(rownames(data))) +} + +################ t-SNE #################### +if (opt$visu_choice == 'tSNE') { + # filter and transpose df for tsne and pca + tdf = t(data) + # make tsne and plot results + set.seed(opt$Rtsne_seed) ## Sets seed for reproducibility + + tsne_out <- Rtsne(tdf, + dims = opt$Rtsne_dims, + initial_dims = opt$Rtsne_initial_dims, + perplexity = opt$Rtsne_perplexity , + theta = opt$Rtsne_theta, + max_iter = opt$Rtsne_max_iter, + pca = opt$Rtsne_pca, + pca_center = opt$Rtsne_pca_center, + pca_scale = opt$Rtsne_pca_scale, + normalize = opt$Rtsne_normalize, + exaggeration_factor=opt$Rtsne_exaggeration_factor) + + embedding <- as.data.frame(tsne_out$Y[,1:2]) + embedding$Class <- as.factor(rownames(tdf)) + gg_legend = theme(legend.position="right") + if (opt$factor == '') { + ggplot(embedding, aes(x=V1, y=V2)) + + geom_point(size=1, color='deepskyblue4') + + gg_legend + + xlab("t-SNE 1") + + ylab("t-SNE 2") + + ggtitle('t-SNE') + + if (opt$labels) { + geom_text(aes(label=Class),hjust=-0.2, vjust=-0.5, size=1.5, color='deepskyblue4') + } + } else { + if(is.numeric(contrasting_factor$factor)){ + embedding$factor <- contrasting_factor$factor + } else { + embedding$factor <- as.factor(contrasting_factor$factor) + } + + ggplot(embedding, aes(x=V1, y=V2, color=factor)) + + geom_point(size=1) + #, color=factor_cols + gg_legend + + xlab("t-SNE 1") + + ylab("t-SNE 2") + + ggtitle('t-SNE') + + if (opt$labels) { + geom_text(aes(label=Class, colour=factor),hjust=-0.2, vjust=-0.5, size=1.5) + } + } + ggsave(file=opt$pdf_out, device="pdf") + + #save coordinates table + if(opt$table_coordinates != ''){ + coord_table <- cbind(rownames(tdf), round(as.data.frame(tsne_out$Y), 6)) + colnames(coord_table)=c("Cells",paste0("DIM",(1:opt$Rtsne_dims))) + } +} + + +######### make PCA with FactoMineR ################# +if (opt$visu_choice == 'PCA') { + pca <- PCA(t(data), ncp=opt$PCA_npc, graph=FALSE) + pdf(opt$pdf_out) + if (opt$labels == FALSE) { + plot(pca, axes = c(opt$PCA_x_axis,opt$PCA_y_axis), label="none" , col.ind = factor_cols) + } else { + plot(pca, axes = c(opt$PCA_x_axis,opt$PCA_y_axis), cex=0.2 , col.ind = factor_cols) + } +if (opt$factor != '') { + if(is.factor(contrasting_factor$factor)) { + legend(x = 'topright', + legend = as.character(factorColors$factor), + col = factorColors$color, pch = 16, bty = 'n', xjust = 1, cex=0.7) + } else { + legend.col(col = rev(brewer.pal(n = 11, name = "RdYlGn")), lev = cut(contrasting_factor$factor, 11, label = FALSE)) + } +} +dev.off() + + #save coordinates table + if(opt$table_coordinates != ''){ + coord_table <- cbind(rownames(pca$ind$coord), round(as.data.frame(pca$ind$coord), 6)) + colnames(coord_table)=c("Cells",paste0("DIM",(1:opt$PCA_npc))) + } + +} + +########### make HCPC with FactoMineR ########## +if (opt$visu_choice == 'HCPC') { + +# HCPC starts with a PCA +pca <- PCA( + t(data), + ncp = opt$HCPC_npc, + graph = FALSE, +) + +PCA_IndCoord = as.data.frame(pca$ind$coord) # coordinates of observations in PCA + +# Hierarchical Clustering On Principal Components Followed By Kmean Clustering +res.hcpc <- HCPC(pca, + nb.clust=opt$HCPC_ncluster, metric=opt$HCPC_metric, method=opt$HCPC_method, + graph=F,consol=opt$HCPC_consol,iter.max=opt$HCPC_itermax,min=opt$HCPC_min,max=opt$HCPC_max, + cluster.CA=opt$HCPC_clusterCA,kk=opt$HCPC_kk) +# HCPC plots +dims <- head(as.data.frame(res.hcpc$call$t$res$eig),2) # dims variances in column 2 +pdf(opt$pdf_out) +plot(res.hcpc, choice="tree") +plot(res.hcpc, choice="bar") +plot(res.hcpc, choice="3D.map") +if (opt$labels == FALSE) { +plot(res.hcpc, choice="map", label="none") +} else { +plot(res.hcpc, choice="map") +} + +# user contrasts on the pca +if (opt$factor != '') { + plot(pca, label="none", col.ind = factor_cols) + if(is.factor(contrasting_factor$factor)) { + legend(x = 'topright', + legend = as.character(factorColors$factor), + col = factorColors$color, pch = 16, bty = 'n', xjust = 1, cex=0.7) + + ## Normalized Mutual Information + sink(opt$mutual_info) + res <- external_validation( + true_labels = as.numeric(contrasting_factor$factor), + clusters = as.numeric(res.hcpc$data.clust$clust), + summary_stats = TRUE + ) + sink() + + } else { + legend.col(col = rev(brewer.pal(n = 11, name = "RdYlGn")), lev = cut(contrasting_factor$factor, 11, label = FALSE)) + } +} +## Clusters to which individual observations belong # used ? +# Clust <- data.frame(Cluster = res.hcpc$data.clust[, (nrow(data) + 1)], +# Observation = rownames(res.hcpc$data.clust)) +# metadata <- data.frame(Observation=colnames(data), row.names=colnames(data)) +# metadata = merge(y = metadata, +# x = Clust, +# by = "Observation") + +# unclear utility +# ObsNumberPerCluster = as.data.frame(table(metadata$Cluster)) +# colnames(ObsNumberPerCluster) = c("Cluster", "ObsNumber") +# +# ## Silhouette Plot # not used +# hc.cut = hcut(PCA_IndCoord, +# k = nlevels(metadata$Cluster), +# hc_method = "ward.D2") +# +# Sil = fviz_silhouette(hc.cut) +# sil1 = as.data.frame(Sil$data) + +dev.off() + + if(opt$table_coordinates != ''){ + coord_table <- cbind(Cell=rownames(res.hcpc$call$X), + round(as.data.frame(res.hcpc$call$X[, -length(res.hcpc$call$X)]), 6), + as.data.frame(res.hcpc$call$X[, length(res.hcpc$call$X)]) + ) + colnames(coord_table)=c("Cells",paste0("DIM",(1:opt$HCPC_npc)),"Cluster") + } + + if(opt$HCPC_clust != ""){ + res_clustering <- data.frame(Cell = rownames(res.hcpc$data.clust), + Cluster = res.hcpc$data.clust$clust) + + } + +} + +## Return coordinates file to user + +if(opt$table_coordinates != ''){ + write.table( + coord_table, + file = opt$table_coordinates, + sep = "\t", + quote = F, + col.names = T, + row.names = F + ) +} + + +if(opt$HCPC_clust != ""){ + write.table( + res_clustering, + file = opt$HCPC_clust, + sep = "\t", + quote = F, + col.names = T, + row.names = F + ) +} + + + + + + + + + + +
