Mercurial > repos > peter-waltman > ucsc_cluster_tools2
comparison cluster.tools/consensus.clustering.R @ 0:0decf3fd54bc draft
Uploaded
| author | peter-waltman | 
|---|---|
| date | Thu, 28 Feb 2013 01:45:39 -0500 | 
| parents | |
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| -1:000000000000 | 0:0decf3fd54bc | 
|---|---|
| 1 #!/usr/bin/env Rscript | |
| 2 ## Consensus Clustering Script by Peter Waltman | |
| 3 ## May 31, 2011 | |
| 4 ## License under Creative Commons Attribution 3.0 Unported (CC BY 3.0) | |
| 5 ## | |
| 6 #usage, options and doc goes here | |
| 7 argspec <- c("consensus.clustering.R takes a clustering from ConsensusClusterPlus and clinical survival data | |
| 8 and generates a KM-plot, along with the log-rank p-values | |
| 9 | |
| 10 Usage: | |
| 11 consensus.clustering.R -d <data.file> | |
| 12 Optional: | |
| 13 -o <output.name> | |
| 14 -a <cluster.alg> ## must be either 'hc' or 'km' | |
| 15 -m <distance.metric> ## must be one supported by ConsensusClusterPlus | |
| 16 -k <max.k> | |
| 17 -r <reps> | |
| 18 -f <filter> ## filter, o/w no filtering | |
| 19 | |
| 20 \n\n") | |
| 21 args <- commandArgs(TRUE) | |
| 22 if ( length( args ) == 1 && args =="--help") { | |
| 23 write(argspec, stderr()) | |
| 24 q(); | |
| 25 } | |
| 26 | |
| 27 lib.load.quiet <- function( package ) { | |
| 28 package <- as.character(substitute(package)) | |
| 29 suppressPackageStartupMessages( do.call( "library", list( package=package ) ) ) | |
| 30 } | |
| 31 lib.load.quiet(getopt) | |
| 32 lib.load.quiet( gplots ) | |
| 33 lib.load.quiet( amap ) | |
| 34 ## if any of the faster clustering methods are available on this system, load them | |
| 35 if ( any( c( 'flashClust', 'fastcluster' ) %in% installed.packages() ) ) { | |
| 36 if ( 'flashClust' %in% installed.packages() ) { | |
| 37 lib.load.quiet( flashClust ) | |
| 38 } else { | |
| 39 if ( 'fastcluster' %in% installed.packages() ) { | |
| 40 lib.load.quiet( fastcluster ) | |
| 41 } | |
| 42 } | |
| 43 } | |
| 44 ##lib.load.quiet(ConsensusClusterPlus) | |
| 45 lib.load.quiet( amap ) | |
| 46 lib.load.quiet( cluster ) | |
| 47 | |
| 48 ################### | |
| 49 ## code borrowed/updated from ConsensusClusterPlus | |
| 50 ################### | |
| 51 | |
| 52 ConsensusClusterPlus <- function( d=NULL, | |
| 53 maxK = 3, | |
| 54 reps=10, | |
| 55 pItem=0.8, | |
| 56 pFeature=1, | |
| 57 clusterAlg="hc", | |
| 58 title="untitled_consensus_cluster", | |
| 59 innerLinkage="average", | |
| 60 finalLinkage="average", | |
| 61 distance=ifelse( inherits(d,"dist"), attr( d, "method" ), "euclidean" ), | |
| 62 ml=NULL, | |
| 63 tmyPal=NULL, | |
| 64 seed=NULL, | |
| 65 plot=NULL, | |
| 66 writeTable=FALSE, | |
| 67 weightsItem=NULL, | |
| 68 weightsFeature=NULL, | |
| 69 verbose=F ) { | |
| 70 ##description: runs consensus subsamples | |
| 71 | |
| 72 | |
| 73 if(is.null(seed)==TRUE){ | |
| 74 seed=timeSeed = as.numeric(Sys.time()) | |
| 75 } | |
| 76 set.seed(seed) | |
| 77 | |
| 78 if(is.null(ml)==TRUE){ | |
| 79 | |
| 80 if ( inherits( distance, "dist" ) ) { | |
| 81 stop( "If you want to pass in a pre-calculated distance object, pass it in as the data, rather than the distance parameter\n" ) | |
| 82 } | |
| 83 | |
| 84 if ( ! class( d ) %in% c( "dist", "matrix", "ExpressionSet" ) ) { | |
| 85 stop("d must be a matrix, distance object or ExpressionSet (eset object)") | |
| 86 } | |
| 87 | |
| 88 if ( inherits( d, "dist" ) ) { | |
| 89 ## if d is a distance matrix, fix a few things so that they don't cause problems with the analysis | |
| 90 ## Note, assumption is that if d is a distance matrix, the user doesn't want to sample over the row features | |
| 91 if ( is.null( attr( d, "method" ) ) ) { | |
| 92 attr( d, "method" ) <- distance <- "unknown - user-specified" | |
| 93 } | |
| 94 if ( is.null( distance ) || ( distance != attr( d, "method" ) ) ) { | |
| 95 distance <- attr( d, "method" ) | |
| 96 } | |
| 97 | |
| 98 if ( ( ! is.null( pFeature ) ) && ( pFeature < 1 ) ) { | |
| 99 if ( verbose ) warning( "Cannot use the pFeatures parameter when specifying a distance matrix as the data object\n" ) | |
| 100 pFeature <- 1 | |
| 101 } | |
| 102 if ( ! is.null( weightsFeature ) ) { | |
| 103 if ( verbose ) warning( "Cannot use the weightsFeature parameter when specifying a distance matrix as the data object\n" ) | |
| 104 weightsFeature <- NULL | |
| 105 } | |
| 106 if ( clusterAlg == "km" ) { | |
| 107 if ( verbose ) warning( "You are asking CCPLUS to use K-means to cluster a distance matrix (rather than the data itself) - this may produce unintended results. We suggest using PAM if you want to use alternate distance metrics/objects\n" ) | |
| 108 ##d <- as.matrix( d ) #this is now done w/in ccRun | |
| 109 } | |
| 110 } else { | |
| 111 if ( is.null( distance ) ) { | |
| 112 ## we should never get here, but just in case | |
| 113 distance <- "pearson" | |
| 114 } | |
| 115 } | |
| 116 | |
| 117 if ( ( clusterAlg == "km" ) && inherits( distance, "character" ) && ( distance != "euclidean" ) ) { | |
| 118 warning( "WARNING: kmeans can only use the euclidean distance metric. If you would like to use an alternate metric, we suggest using PAM or HC clustering instead. This parameter combinationwill use k-means, but will NOT use the specified distance metric\n" ) | |
| 119 distance <- 'euclidean' | |
| 120 } | |
| 121 | |
| 122 | |
| 123 if ( inherits( d,"ExpressionSet" ) ) { | |
| 124 d <- exprs(d) | |
| 125 } | |
| 126 | |
| 127 ml <- ccRun( d=d, | |
| 128 maxK=maxK, | |
| 129 repCount=reps, | |
| 130 diss=inherits(d,"dist"), | |
| 131 pItem=pItem, | |
| 132 pFeature=pFeature, | |
| 133 innerLinkage=innerLinkage, | |
| 134 clusterAlg=clusterAlg, | |
| 135 weightsFeature=weightsFeature, | |
| 136 weightsItem=weightsItem, | |
| 137 distance=distance, | |
| 138 verbose=verbose) | |
| 139 } | |
| 140 res=list(); | |
| 141 | |
| 142 ##make results directory | |
| 143 if((is.null(plot)==FALSE | writeTable) & !file.exists(paste(title,sep=""))){ | |
| 144 dir.create(paste(title,sep="")) | |
| 145 } | |
| 146 | |
| 147 ##write log file | |
| 148 log <- matrix( ncol=2, | |
| 149 byrow=T, | |
| 150 c("title",title, | |
| 151 "maxK",maxK, | |
| 152 "input matrix rows",ifelse ( inherits( d, "matrix" ), nrow(d), "dist-mat" ), | |
| 153 "input matric columns",ifelse ( inherits( d, "matrix" ), ncol(d), ncol( as.matrix(d) ) ), | |
| 154 "number of bootstraps",reps, | |
| 155 "item subsampling proportion",pItem, | |
| 156 "feature subsampling proportion",ifelse( is.null(pFeature), 1, pFeature ), | |
| 157 "cluster algorithm",clusterAlg, | |
| 158 "inner linkage type",innerLinkage, | |
| 159 "final linkage type",finalLinkage, | |
| 160 "correlation method",distance, | |
| 161 "plot",if(is.null(plot)) NA else plot, | |
| 162 "seed",if(is.null(seed)) NA else seed)) | |
| 163 colnames(log) = c("option","value") | |
| 164 if(writeTable){ | |
| 165 write.csv(file=paste(title,"/",title,".log.csv",sep=""), log,row.names=F) | |
| 166 } | |
| 167 if(is.null(plot)){ | |
| 168 ##nothing | |
| 169 }else if(plot=="png"){ | |
| 170 png(paste(title,"/","consensus%03d.png",sep="")) | |
| 171 }else if (plot=="pdf"){ | |
| 172 pdf(onefile=TRUE, paste(title,"/","consensus.pdf",sep="")) | |
| 173 }else if (plot=="ps"){ | |
| 174 postscript(onefile=TRUE, paste(title,"/","consensus.ps",sep="")) | |
| 175 } | |
| 176 | |
| 177 colorList=list() | |
| 178 colorM = rbind() #matrix of colors. | |
| 179 | |
| 180 #18 colors for marking different clusters | |
| 181 thisPal <- c("#A6CEE3","#1F78B4","#B2DF8A","#33A02C","#FB9A99","#E31A1C","#FDBF6F","#FF7F00","#CAB2D6","#6A3D9A","#FFFF99","#B15928", | |
| 182 "#bd18ea", #magenta | |
| 183 "#2ef4ca", #aqua | |
| 184 "#f4cced", #pink, | |
| 185 "#f4cc03", #lightorange | |
| 186 "#05188a", #navy, | |
| 187 "#e5a25a", #light brown | |
| 188 "#06f106", #bright green | |
| 189 "#85848f", #med gray | |
| 190 "#000000", #black | |
| 191 "#076f25", #dark green | |
| 192 "#93cd7f",#lime green | |
| 193 "#4d0776", #dark purple | |
| 194 "#ffffff" #white | |
| 195 ) | |
| 196 | |
| 197 ##plot scale | |
| 198 colBreaks=NA | |
| 199 if(is.null(tmyPal)==TRUE){ | |
| 200 colBreaks=10 | |
| 201 tmyPal = myPal(colBreaks) | |
| 202 }else{ | |
| 203 colBreaks=length(tmyPal) | |
| 204 } | |
| 205 sc = cbind(seq(0,1,by=1/( colBreaks) )); rownames(sc) = sc[,1] | |
| 206 sc = cbind(sc,sc) | |
| 207 heatmap(sc, Colv=NA, Rowv=NA, symm=FALSE, scale='none', col=tmyPal, na.rm=TRUE,labRow=rownames(sc),labCol=F,main="consensus matrix legend") | |
| 208 | |
| 209 for (tk in 2:maxK){ | |
| 210 if(verbose){ | |
| 211 message(paste("consensus ",tk)) | |
| 212 } | |
| 213 fm = ml[[tk]] | |
| 214 hc=hclust( as.dist( 1 - fm ), method=finalLinkage); | |
| 215 message("clustered") | |
| 216 ct = cutree(hc,tk) | |
| 217 names(ct) = colnames(d) | |
| 218 c = fm | |
| 219 ##colnames(c) = colnames(d) | |
| 220 ##rownames(c) = colnames(d) | |
| 221 | |
| 222 colorList = setClusterColors(res[[tk-1]][[3]],ct,thisPal,colorList) | |
| 223 | |
| 224 pc = c | |
| 225 pc=pc[hc$order,] #***pc is matrix for plotting, same as c but is row-ordered and has names and extra row of zeros. | |
| 226 pc = rbind(pc,0) | |
| 227 | |
| 228 heatmap(pc, Colv=as.dendrogram(hc), Rowv=NA, symm=FALSE, scale='none', col=tmyPal, na.rm=TRUE,labRow=F,labCol=F,mar=c(5,5),main=paste("consensus matrix k=",tk,sep="") , ColSideCol=colorList[[1]]) | |
| 229 legend("topright",legend=unique(ct),fill=unique(colorList[[1]]),horiz=FALSE ) | |
| 230 | |
| 231 res[[tk]] = list(consensusMatrix=c,consensusTree=hc,consensusClass=ct,ml=ml[[tk]],clrs=colorList) | |
| 232 colorM = rbind(colorM,colorList[[1]]) | |
| 233 } | |
| 234 CDF(ml) | |
| 235 clusterTrackingPlot(colorM[,res[[length(res)]]$consensusTree$order]) | |
| 236 if(is.null(plot)==FALSE){ | |
| 237 dev.off(); | |
| 238 } | |
| 239 res[[1]] = colorM | |
| 240 if(writeTable){ | |
| 241 for(i in 2:length(res)){ | |
| 242 write.csv(file=paste(title,"/",title,".k=",i,".consensusMatrix.csv",sep=""), res[[i]]$consensusMatrix) | |
| 243 write.table(file=paste(title,"/",title,".k=",i,".consensusClass.csv",sep=""), res[[i]]$consensusClass,col.names = F,sep=",") | |
| 244 } | |
| 245 } | |
| 246 return(res) | |
| 247 } | |
| 248 | |
| 249 | |
| 250 calcICL = function(res,title="untitled_consensus_cluster",plot=NULL,writeTable=FALSE){ | |
| 251 #calculates and plots cluster consensus and item consensus | |
| 252 cc=rbind() | |
| 253 cci = rbind() | |
| 254 sumRes=list() | |
| 255 colorsArr=c() | |
| 256 | |
| 257 #make results directory | |
| 258 if((is.null(plot)==FALSE | writeTable) & !file.exists(paste(title,sep=""))){ | |
| 259 dir.create(paste(title,sep="")) | |
| 260 } | |
| 261 if(is.null(plot)){ | |
| 262 #to screen | |
| 263 }else if(plot=="pdf"){ | |
| 264 pdf(onefile=TRUE, paste(title,"/","icl.pdf",sep="")) | |
| 265 }else if(plot=="ps"){ | |
| 266 postscript(onefile=TRUE, paste(title,"/","icl.ps",sep="")) | |
| 267 }else if (plot=="png"){ | |
| 268 png(paste(title,"/","icl%03d.png",sep="")) | |
| 269 } | |
| 270 | |
| 271 par(mfrow=c(3,1),mar=c(4,3,2,0)) | |
| 272 | |
| 273 for (k in 2:length(res)){ #each k | |
| 274 eiCols = c(); | |
| 275 o = res[[k]] | |
| 276 m = o$consensusMatrix | |
| 277 m = triangle(m,mode=2) | |
| 278 for (ci in sort(unique(o$consensusClass))){ #each cluster in k | |
| 279 items = which(o$consensusClass==ci) | |
| 280 nk = length(items) | |
| 281 mk = sum( m[items,items], na.rm=T)/((nk*(nk-1))/2) | |
| 282 cc=rbind(cc,c(k,ci,mk)) #cluster-consensus | |
| 283 | |
| 284 for (ei in rev(res[[2]]$consensusTree$order) ){ | |
| 285 denom = if (ei %in% items) { nk - 1} else { nk } | |
| 286 mei = sum( c(m[ei,items],m[items,ei]), na.rm=T)/denom # mean item consensus to a cluster. | |
| 287 cci = rbind(cci,c(k,ci,ei,mei)) #cluster, cluster index, item index, item-consensus | |
| 288 } | |
| 289 eiCols = c(eiCols, rep(ci,length(o$consensusClass)) ) | |
| 290 } | |
| 291 | |
| 292 cck = cci[which(cci[,1]==k),] #only plot the new k data. | |
| 293 | |
| 294 #group by item, order by cluster i | |
| 295 w=lapply(split(cck,cck[,3]), function(x) { y=matrix(unlist(x),ncol=4); y[order(y[,2]),4] }) | |
| 296 q = matrix(as.numeric(unlist(w)),ncol=length(w),byrow=F) | |
| 297 q = q[,res[[2]]$consensusTree$order] #order by leave order of k=2 | |
| 298 #q is a matrix of k rows and sample columns, values are item consensus of sample to the cluster. | |
| 299 | |
| 300 thisColors = unique(cbind(res[[k]]$consensusClass,res[[k]]$clrs[[1]])) | |
| 301 thisColors=thisColors[order(as.numeric(thisColors[,1])),2] | |
| 302 colorsArr=c(colorsArr,thisColors) | |
| 303 sumRes[[k]] = rankedBarPlot(q,thisColors,cc=res[[k]]$consensusClass[res[[2]]$consensusTree$order],paste("k=",k,sep="") ) | |
| 304 } | |
| 305 | |
| 306 ys=cs=lab=c() | |
| 307 lastk=cc[1,1] | |
| 308 for(i in 1:length(colorsArr)){ | |
| 309 if(lastk != cc[i,1]){ | |
| 310 ys=c(ys,0,0) | |
| 311 cs=c(cs,NA,NA) | |
| 312 lastk=cc[i,1] | |
| 313 lab=c(lab,NA,NA) | |
| 314 } | |
| 315 ys=c(ys,cc[i,3]) | |
| 316 cs=c(cs,colorsArr[i]) | |
| 317 lab=c(lab,cc[i,1]) | |
| 318 } | |
| 319 names(ys) = lab | |
| 320 par(mfrow=c(3,1),mar=c(4,3,2,0)) | |
| 321 barplot(ys,col=cs,border=cs,main="cluster-consensus",ylim=c(0,1),las=1) | |
| 322 if(is.null(plot)==FALSE){ | |
| 323 dev.off() | |
| 324 } | |
| 325 colnames(cc) = c("k","cluster","clusterConsensus") | |
| 326 colnames(cci) = c("k","cluster","item","itemConsensus") | |
| 327 cci[,"item"] = names(res[[2]]$consensusClass)[ cci[,"item"] ] | |
| 328 #type cci | |
| 329 cci = data.frame( k=as.numeric(cci[,"k"]), cluster=as.numeric(cci[,"cluster"]), item=cci[,"item"], itemConsensus=as.numeric(cci[,"itemConsensus"])) | |
| 330 | |
| 331 #write to file. | |
| 332 if(writeTable){ | |
| 333 write.csv(file=paste(title,"/",title,".summary.cluster.consensus.csv",sep=""),row.names=F, cc) | |
| 334 write.csv(file=paste(title,"/",title,".summary.item.consensus.csv",sep=""), row.names=F, cc) | |
| 335 } | |
| 336 return(list(clusterConsensus=cc,itemConsensus=cci)) | |
| 337 } | |
| 338 | |
| 339 | |
| 340 ccRun <- function( d=d, | |
| 341 maxK=NULL, | |
| 342 repCount=NULL, | |
| 343 diss=inherits( d, "dist" ), | |
| 344 pItem=NULL, | |
| 345 pFeature=NULL, | |
| 346 innerLinkage=NULL, | |
| 347 distance=ifelse( inherits(d,"dist"), attr( d, "method" ), "euclidean" ), | |
| 348 clusterAlg=NULL, | |
| 349 weightsItem=NULL, | |
| 350 weightsFeature=NULL, | |
| 351 verbose=NULL) { | |
| 352 m = vector(mode='list', repCount) | |
| 353 ml = vector(mode="list",maxK) | |
| 354 n <- ifelse( diss, ncol( as.matrix(d) ), ncol(d) ) | |
| 355 mCount = mConsist = matrix(c(0),ncol=n,nrow=n) | |
| 356 ml[[1]] = c(0); | |
| 357 | |
| 358 if (is.null( distance ) ) distance <- 'euclidean' ## necessary if d is a dist object and attr( d, "method" ) == NULLa | |
| 359 | |
| 360 require( amap ) | |
| 361 ## we're going to use the amap Dist function, but they misname their correlation | |
| 362 ## functions, so re-name them correctly | |
| 363 amap.distance <- c( "euclidean", "maximum", "manhattan", "canberra", "binary", | |
| 364 "pearson", "abspearson", "correlation", "abscorrelation", "spearman", "kendall" ) | |
| 365 names( amap.distance ) <- c( "euclidean", "maximum", "manhattan", "canberra", "binary", | |
| 366 "cosine", "abscosine", "pearson", "abspearson", "spearman", "kendall" ) | |
| 367 main.dist.obj <- NULL | |
| 368 ##browser() | |
| 369 if ( diss ){ | |
| 370 main.dist.obj <- d | |
| 371 | |
| 372 ## reset the pFeature & weightsFeature params if they've been set (irrelevant if d is a dist matrix) | |
| 373 if ( ( !is.null(pFeature) ) && | |
| 374 ( pFeature < 1 ) ) { | |
| 375 if (verbose) warning( "user-supplied data is a distance matrix; ignoring user-specified pFeature parameter\n" ) | |
| 376 pFeature <- 1 # set it to 1 to avoid problems with sampleCols | |
| 377 } | |
| 378 if ( ! is.null( weightsFeature ) ) { | |
| 379 if (verbose) warning( "user-supplied data is a distance matrix; ignoring user-specified weightsFeature parameter\n" ) | |
| 380 weightsFeature <- NULL # set it to NULL to avoid problems with sampleCols | |
| 381 } | |
| 382 } else { ## d is a data matrix | |
| 383 ## we're not sampling over the features | |
| 384 if ( ( clusterAlg != "km" ) && | |
| 385 ( is.null( pFeature ) || | |
| 386 ( ( pFeature == 1 ) && is.null( weightsFeature ) ) ) ) { | |
| 387 ## only generate a main.dist.object IFF 1) d is a matrix, 2) we're not sampling the features, and 3) the algorithm isn't 'km' | |
| 388 if ( inherits( distance, "character" ) ) { | |
| 389 if ( ! distance %in% names( amap.distance ) ) stop("unsupported distance.") | |
| 390 | |
| 391 main.dist.obj <- Dist( t(d), method=as.character( amap.distance[ distance ] ) ) | |
| 392 ## now fix dumb amap naming convention for distance metrics | |
| 393 attr( main.dist.obj, "method" ) <- as.character( amap.distance[ distance ] ) | |
| 394 } else stop("unsupported distance specified.") | |
| 395 } else { | |
| 396 ## pFeature < 1 or a weightsFeature != NULL | |
| 397 ## since d is a data matrix, the user wants to sample over the gene features, so main.dist.obj is left as NULL | |
| 398 } | |
| 399 } | |
| 400 | |
| 401 | |
| 402 for (i in 1:repCount){ | |
| 403 ##browser() | |
| 404 if(verbose){ | |
| 405 message(paste("random subsample",i)); | |
| 406 } | |
| 407 ## take expression matrix sample, samples and genes | |
| 408 sample_x = sampleCols( d, pItem, pFeature, weightsItem, weightsFeature ) | |
| 409 | |
| 410 this_dist = NA | |
| 411 if ( ! is.null( main.dist.obj ) ) { | |
| 412 boot.cols <- sample_x$subcols | |
| 413 this_dist <- as.matrix( main.dist.obj )[ boot.cols, boot.cols ] | |
| 414 if ( clusterAlg != "km" ) { | |
| 415 ## if this isn't kmeans, then convert to a distance object | |
| 416 this_dist <- as.dist( this_dist ) | |
| 417 attr( this_dist, "method" ) <- attr( main.dist.obj, "method" ) | |
| 418 } | |
| 419 } else { | |
| 420 ## if main.dist.obj is NULL, then d is a data matrix, and either: | |
| 421 ## 1) clusterAlg is 'km' | |
| 422 ## 2) pFeatures < 1 or weightsFeatures have been specified, or | |
| 423 ## 3) both | |
| 424 ## so we can't use a main distance object and for every iteration, we will have to re-calculate either | |
| 425 ## 1) the distance matrix (because we're also sampling the features as well), or | |
| 426 ## 2) the submat (if using km) | |
| 427 | |
| 428 if ( clusterAlg != "km" ) { | |
| 429 if ( ! distance %in% names( amap.distance ) ) stop("unsupported distance.") | |
| 430 ## good, we have a supported distance type | |
| 431 this_dist <- Dist( t( sample_x$submat ), method=as.character( amap.distance[ distance ] ) ) | |
| 432 ## now fix dumb amap naming convention for distance metrics | |
| 433 attr( this_dist, "method" ) <- as.character( amap.distance[ distance ] ) | |
| 434 } else { | |
| 435 ##browser() | |
| 436 ##clusterAlg == "km" | |
| 437 ## if we're not sampling the features, then grab the colslice | |
| 438 if ( is.null( pFeature ) || | |
| 439 ( ( pFeature == 1 ) && is.null( weightsFeature ) ) ) { | |
| 440 this_dist <- d[, sample_x$subcols ] | |
| 441 } else { | |
| 442 if ( is.na( sample_x$submat ) ) { | |
| 443 save( "ccrun.submat.eq.na.dbg.rda" ) | |
| 444 stop( "Houston, we have a problem. sample_x$submat is NA in ccRun when it should be specified - saving state\n" ) | |
| 445 } | |
| 446 | |
| 447 this_dist <- sample_x$submat | |
| 448 } | |
| 449 } | |
| 450 } | |
| 451 | |
| 452 ## cluster samples for HC. | |
| 453 this_cluster=NA | |
| 454 if(clusterAlg=="hc"){ | |
| 455 this_cluster = hclust( this_dist, method=innerLinkage) | |
| 456 } | |
| 457 ##browser() | |
| 458 ##mCount is possible number of times that two sample occur in same random sample, independent of k | |
| 459 ##mCount stores number of times a sample pair was sampled together. | |
| 460 mCount <- connectivityMatrix( rep( 1,length(sample_x[[3]])), | |
| 461 mCount, | |
| 462 sample_x[[3]] ) | |
| 463 | |
| 464 ##use samples for each k | |
| 465 for (k in 2:maxK){ | |
| 466 if(verbose){ | |
| 467 message(paste(" k =",k)) | |
| 468 } | |
| 469 if (i==1){ | |
| 470 ml[[k]] = mConsist #initialize | |
| 471 } | |
| 472 this_assignment=NA | |
| 473 if(clusterAlg=="hc"){ | |
| 474 ##prune to k for hc | |
| 475 this_assignment = cutree(this_cluster,k) | |
| 476 ##browser() | |
| 477 }else if(clusterAlg=="km"){ | |
| 478 ##this_dist should now be a matrix corresponding to the result from sampleCols | |
| 479 this_assignment <- kmeans( t( this_dist ), | |
| 480 k, | |
| 481 iter.max = 10, | |
| 482 nstart = 1, | |
| 483 algorithm = c("Hartigan-Wong") )$cluster | |
| 484 }else if ( clusterAlg == "pam" ) { | |
| 485 require( cluster ) | |
| 486 this_assignment <- pam( x=this_dist, | |
| 487 k, | |
| 488 diss=TRUE, | |
| 489 metric=distance, | |
| 490 cluster.only=TRUE ) | |
| 491 } else{ | |
| 492 ##optional cluterArg Hook. | |
| 493 this_assignment <- get(clusterAlg)(this_dist, k) | |
| 494 } | |
| 495 ##add to tally | |
| 496 ml[[k]] <- connectivityMatrix( this_assignment, | |
| 497 ml[[k]], | |
| 498 sample_x[[3]] ) | |
| 499 } | |
| 500 } | |
| 501 | |
| 502 | |
| 503 ##consensus fraction | |
| 504 res = vector(mode="list",maxK) | |
| 505 for (k in 2:maxK){ | |
| 506 ##fill in other half of matrix for tally and count. | |
| 507 tmp = triangle(ml[[k]],mode=3) | |
| 508 tmpCount = triangle(mCount,mode=3) | |
| 509 res[[k]] = tmp / tmpCount | |
| 510 res[[k]][which(tmpCount==0)] = 0 | |
| 511 } | |
| 512 message("end fraction") | |
| 513 return(res) | |
| 514 } | |
| 515 | |
| 516 | |
| 517 connectivityMatrix <- function( clusterAssignments, m, sampleKey){ | |
| 518 ##input: named vector of cluster assignments, matrix to add connectivities | |
| 519 ##output: connectivity matrix | |
| 520 names( clusterAssignments ) <- sampleKey | |
| 521 cls <- lapply( unique( clusterAssignments ), function(i) as.numeric( names( clusterAssignments[ clusterAssignments %in% i ] ) ) ) | |
| 522 | |
| 523 for ( i in 1:length( cls ) ) { | |
| 524 nelts <- 1:ncol( m ) | |
| 525 cl <- as.numeric( nelts %in% cls[[i]] ) ## produces a binary vector | |
| 526 updt <- outer( cl, cl ) | |
| 527 m <- m + updt | |
| 528 } | |
| 529 return(m) | |
| 530 } | |
| 531 | |
| 532 ## returns a list with the sample columns, as well as the sub-matrix & sample features (if necessary) | |
| 533 ## if no sampling over the features is performed, the submatrix & sample features are returned as NAs | |
| 534 ## to reduce memory overhead | |
| 535 sampleCols <- function( d, | |
| 536 pSamp=NULL, | |
| 537 pRow=NULL, | |
| 538 weightsItem=NULL, | |
| 539 weightsFeature=NULL ){ | |
| 540 space <- ifelse( inherits( d, "dist" ), ncol( as.matrix(d) ), ncol(d) ) | |
| 541 sampleN <- floor(space*pSamp) | |
| 542 sampCols <- sort( sample(space, sampleN, replace = FALSE, prob = weightsItem) ) | |
| 543 | |
| 544 this_sample <- sampRows <- NA | |
| 545 if ( inherits( d, "matrix" ) ) { | |
| 546 if ( (! is.null( pRow ) ) && | |
| 547 ( (pRow < 1 ) || (! is.null( weightsFeature ) ) ) ) { | |
| 548 ## only sample the rows and generate a sub-matrix if we're sampling over the row/gene/features | |
| 549 space = nrow(d) | |
| 550 sampleN = floor(space*pRow) | |
| 551 sampRows = sort( sample(space, sampleN, replace = FALSE, prob = weightsFeature) ) | |
| 552 this_sample <- d[sampRows,sampCols] | |
| 553 dimnames(this_sample) <- NULL | |
| 554 } else { | |
| 555 ## do nothing | |
| 556 } | |
| 557 } | |
| 558 return( list( submat=this_sample, | |
| 559 subrows=sampRows, | |
| 560 subcols=sampCols ) ) | |
| 561 } | |
| 562 | |
| 563 CDF=function(ml,breaks=100){ | |
| 564 #plot CDF distribution | |
| 565 plot(c(0),xlim=c(0,1),ylim=c(0,1),col="white",bg="white",xlab="consensus index",ylab="CDF",main="consensus CDF", las=2) | |
| 566 k=length(ml) | |
| 567 this_colors = rainbow(k-1) | |
| 568 areaK = c() | |
| 569 for (i in 2:length(ml)){ | |
| 570 v=triangle(ml[[i]],mode=1) | |
| 571 | |
| 572 #empirical CDF distribution. default number of breaks is 100 | |
| 573 h = hist(v, plot=FALSE, breaks=seq(0,1,by=1/breaks)) | |
| 574 h$counts = cumsum(h$counts)/sum(h$counts) | |
| 575 | |
| 576 #calculate area under CDF curve, by histogram method. | |
| 577 thisArea=0 | |
| 578 for (bi in 1:(length(h$breaks)-1)){ | |
| 579 thisArea = thisArea + h$counts[bi]*(h$breaks[bi+1]-h$breaks[bi]) #increment by height by width | |
| 580 bi = bi + 1 | |
| 581 } | |
| 582 areaK = c(areaK,thisArea) | |
| 583 lines(h$mids,h$counts,col=this_colors[i-1],lwd=2,type='l') | |
| 584 } | |
| 585 legend(0.8,0.5,legend=paste(rep("",k-1),seq(2,k,by=1),sep=""),fill=this_colors) | |
| 586 | |
| 587 #plot area under CDF change. | |
| 588 deltaK=areaK[1] #initial auc at k=2 | |
| 589 for(i in 2:(length(areaK))){ | |
| 590 #proportional increase relative to prior K. | |
| 591 deltaK = c(deltaK,( areaK[i] - areaK[i-1])/areaK[i-1]) | |
| 592 } | |
| 593 plot(1+(1:length(deltaK)),y=deltaK,xlab="k",ylab="relative change in area under CDF curve",main="Delta area",type="b") | |
| 594 } | |
| 595 | |
| 596 | |
| 597 myPal = function(n=10){ | |
| 598 #returns n colors | |
| 599 seq = rev(seq(0,255,by=255/(n))) | |
| 600 palRGB = cbind(seq,seq,255) | |
| 601 rgb(palRGB,maxColorValue=255) | |
| 602 } | |
| 603 | |
| 604 setClusterColors = function(past_ct,ct,colorU,colorList){ | |
| 605 #description: sets common color of clusters between different K | |
| 606 newColors = c() | |
| 607 if(length(colorList)==0){ | |
| 608 #k==2 | |
| 609 newColors = colorU[ct] | |
| 610 colori=2 | |
| 611 }else{ | |
| 612 newColors = rep(NULL,length(ct)) | |
| 613 colori = colorList[[2]] | |
| 614 mo=table(past_ct,ct) | |
| 615 m=mo/apply(mo,1,sum) | |
| 616 for(tci in 1:ncol(m)){ # for each cluster | |
| 617 maxC = max(m[,tci]) | |
| 618 pci = which(m[,tci] == maxC) | |
| 619 if( sum(m[,tci]==maxC)==1 & max(m[pci,])==maxC & sum(m[pci,]==maxC)==1 ) { | |
| 620 #if new column maximum is unique, same cell is row maximum and is also unique | |
| 621 ##Note: the greatest of the prior clusters' members are the greatest in a current cluster's members. | |
| 622 newColors[which(ct==tci)] = unique(colorList[[1]][which(past_ct==pci)]) # one value | |
| 623 }else{ #add new color. | |
| 624 colori=colori+1 | |
| 625 newColors[which(ct==tci)] = colorU[colori] | |
| 626 } | |
| 627 } | |
| 628 } | |
| 629 return(list(newColors,colori,unique(newColors) )) | |
| 630 } | |
| 631 | |
| 632 clusterTrackingPlot = function(m){ | |
| 633 #description: plots cluster tracking plot | |
| 634 #input: m - matrix where rows are k, columns are samples, and values are cluster assignments. | |
| 635 plot(NULL,xlim=c(-0.1,1),ylim=c(0,1),axes=FALSE,xlab="samples",ylab="k",main="tracking plot") | |
| 636 for(i in 1:nrow(m)){ | |
| 637 rect( xleft=seq(0,1-1/ncol(m),by=1/ncol(m)), ybottom=rep(1-i/nrow(m),ncol(m)) , xright=seq(1/ncol(m),1,by=1/ncol(m)), ytop=rep(1-(i-1)/nrow(m),ncol(m)), col=m[i,],border=NA) | |
| 638 } | |
| 639 #hatch lines to indicate samples | |
| 640 xl = seq(0,1-1/ncol(m),by=1/ncol(m)) | |
| 641 segments( xl, rep(-0.1,ncol(m)) , xl, rep(0,ncol(m)), col="black") #** alt white and black color? | |
| 642 ypos = seq(1,0,by=-1/nrow(m))-1/(2*nrow(m)) | |
| 643 text(x=-0.1,y=ypos[-length(ypos)],labels=seq(2,nrow(m)+1,by=1)) | |
| 644 } | |
| 645 | |
| 646 triangle = function(m,mode=1){ | |
| 647 #mode=1 for CDF, vector of lower triangle. | |
| 648 #mode==3 for full matrix. | |
| 649 #mode==2 for calcICL; nonredundant half matrix coun | |
| 650 #mode!=1 for summary | |
| 651 n=dim(m)[1] | |
| 652 nm = matrix(0,ncol=n,nrow=n) | |
| 653 fm = m | |
| 654 | |
| 655 | |
| 656 nm[upper.tri(nm)] = m[upper.tri(m)] #only upper half | |
| 657 | |
| 658 fm = t(nm)+nm | |
| 659 diag(fm) = diag(m) | |
| 660 | |
| 661 nm=fm | |
| 662 nm[upper.tri(nm)] = NA | |
| 663 diag(nm) = NA | |
| 664 vm = m[lower.tri(nm)] | |
| 665 | |
| 666 if(mode==1){ | |
| 667 return(vm) #vector | |
| 668 }else if(mode==3){ | |
| 669 return(fm) #return full matrix | |
| 670 }else if(mode == 2){ | |
| 671 return(nm) #returns lower triangle and no diagonal. no double counts. | |
| 672 } | |
| 673 | |
| 674 } | |
| 675 | |
| 676 | |
| 677 rankedBarPlot=function(d,myc,cc,title){ | |
| 678 colors = rbind() #each row is a barplot series | |
| 679 byRank = cbind() | |
| 680 | |
| 681 spaceh = 0.1 #space between bars | |
| 682 for(i in 1:ncol(d)){ | |
| 683 byRank = cbind(byRank,sort(d[,i],na.last=F)) | |
| 684 colors = rbind(colors,order(d[,i],na.last=F)) | |
| 685 } | |
| 686 maxH = max(c(1.5,apply(byRank,2,sum)),na.rm=T) #maximum height of graph | |
| 687 | |
| 688 #barplot largest to smallest so that smallest is in front. | |
| 689 barp = barplot( apply(byRank,2,sum) , col=myc[colors[,1]] ,space=spaceh,ylim=c(0,maxH),main=paste("item-consensus", title),border=NA,las=1 ) | |
| 690 for(i in 2:nrow(byRank)){ | |
| 691 barplot( apply(matrix(byRank[i:nrow(byRank),],ncol=ncol(byRank)) ,2,sum), space=spaceh,col=myc[colors[,i]],ylim=c(0,maxH), add=T,border=NA,las=1 ) | |
| 692 } | |
| 693 xr=seq(spaceh,ncol(d)+ncol(d)*spaceh,(ncol(d)+ncol(d)*spaceh)/ncol(d) ) | |
| 694 #class labels as asterisks | |
| 695 text("*",x=xr+0.5,y=maxH,col=myc[cc],cex=1.4) #rect(xr,1.4,xr+1,1.5,col=myc[cc] ) | |
| 696 } | |
| 697 | |
| 698 | |
| 699 | |
| 700 ###################################################################3333 | |
| 701 ## RESTART MY SCRIPTS HERE | |
| 702 | |
| 703 spec <- matrix( c( "data.fname", "d", 1, "character", | |
| 704 "direction", "n", 2, "character", | |
| 705 "output.name", "o", 2, "character", | |
| 706 "cluster.alg", "a", 2, "character", | |
| 707 "distance.metric", "m", 2, "character", | |
| 708 "max.k", "k", 2, "integer", | |
| 709 "reps", "r", 2, "integer", | |
| 710 "innerLinkage", "i", 1, "character", | |
| 711 "finalLinkage", "f", 1, "character", | |
| 712 "out.report.dir", "p", 2, "character", | |
| 713 "out.report.html", "h", 2, "character" | |
| 714 ), | |
| 715 nc=4, | |
| 716 byrow=TRUE | |
| 717 ) | |
| 718 | |
| 719 opt <- getopt( spec=spec ) | |
| 720 | |
| 721 ## default params for non-required params | |
| 722 if ( is.null( opt$direction ) ) { opt$direction <- "cols" } | |
| 723 if ( is.null( opt$cluster.alg ) ) { opt$cluster.alg <- "pam" } | |
| 724 if ( is.null( opt$output.name ) ) { opt$output.name <- "consensus.cluster.result" } | |
| 725 if ( is.null( opt$distance.metric ) ) { opt$distance.metric <- "cosine" } | |
| 726 if ( is.null( opt$max.k ) ) { opt$max.k <- 10 } | |
| 727 if ( is.null( opt$reps ) ) { opt$reps <- 1000 } | |
| 728 if ( is.null( opt$innerLinkage ) ) { opt$innerLinkage <- "average" } | |
| 729 if ( is.null( opt$finalLinkage ) ) { opt$finalLinkage <- "average" } | |
| 730 | |
| 731 if ( is.null( opt$out.report.dir ) ) { opt$out.report.dir <- "report" } | |
| 732 if ( is.null( opt$out.report.html ) ) { opt$out.report.html <- file.path( "report", "index.html" ) } | |
| 733 | |
| 734 ## validate params here (make sure set to valid values) | |
| 735 if ( !opt$cluster.alg %in% c( "hc", "km", "pam" ) ) { | |
| 736 stop( "invalid clustering algorithm specified", cluster.alg ) | |
| 737 } | |
| 738 | |
| 739 | |
| 740 data <- as.matrix( read.delim( opt$data.fname, header=T, row.names=1 , check.names=FALSE ) ) | |
| 741 ## transpose the matrix if we want to cluster the rows (genes) | |
| 742 if ( opt$direction == "rows" ) { | |
| 743 data <- t( data ) | |
| 744 } | |
| 745 | |
| 746 title <- paste( opt$cluster.alg, opt$output.name, sep="." ) | |
| 747 results <- ConsensusClusterPlus( data, | |
| 748 maxK=opt$max.k, | |
| 749 reps=opt$reps, | |
| 750 pItem=0.8, | |
| 751 pFeature=1, | |
| 752 title=opt$out.report.dir, | |
| 753 clusterAlg=opt$cluster.alg, | |
| 754 distance=opt$distance.metric, | |
| 755 innerLinkage=opt$innerLinkage, | |
| 756 finalLinkage=opt$finalLinkage, | |
| 757 plot='png', | |
| 758 writeTable=FALSE, | |
| 759 seed=100, | |
| 760 ##weightsFeature=abs( rnorm( nrow( orig.data ) ) ), | |
| 761 verbose=FALSE ) | |
| 762 | |
| 763 pngs = list.files(path=opt$out.report.dir, patt="png") | |
| 764 html.out <- paste( "<html>", | |
| 765 paste( paste( "<div><img src=\'", pngs, sep="" ), "\'/></div>", sep="" ), | |
| 766 "</html>" ) | |
| 767 cat( html.out, file=opt$out.report.html ) | |
| 768 | |
| 769 | |
| 770 ## re-transpose the matrix back if we've clustered the rows (genes) | |
| 771 if ( opt$direction == "rows" ) { | |
| 772 data <- t( data ) | |
| 773 } | |
| 774 save( file=opt$output.name, data, results) | 
