| 2 | 1 ##!/usr/bin/env Rscript | 
|  | 2 ## Consensus Clustering Script by Peter Waltman | 
|  | 3 ## May 31, 2011 | 
|  | 4 ## License under Creative Commons Attribution 3.0 Unported (CC BY 3.0) | 
|  | 5 ## | 
|  | 6 #usage, options and doc goes here | 
|  | 7 argspec <- c("consensus.clustering.R takes a clustering from ConsensusClusterPlus and clinical survival data | 
|  | 8 and generates a KM-plot, along with the log-rank p-values | 
|  | 9 | 
|  | 10         Usage: | 
|  | 11                 consensus.clustering.R -d <data.file> | 
|  | 12         Optional: | 
|  | 13                 -o <output.name> | 
|  | 14                 -a <cluster.alg>  ## must be either 'hc' or 'km' | 
|  | 15                 -m <distance.metric> ## must be one supported by ConsensusClusterPlus | 
|  | 16                 -k <max.k> | 
|  | 17                 -r <reps> | 
|  | 18                 -f <filter>            ## filter, o/w no filtering | 
|  | 19 | 
|  | 20                 \n\n") | 
|  | 21 args <- commandArgs(TRUE) | 
|  | 22 if ( length( args ) == 1 && args =="--help") { | 
|  | 23   write(argspec, stderr()) | 
|  | 24   q(); | 
|  | 25 } | 
|  | 26 | 
|  | 27 require(getopt) | 
|  | 28 ##require(ConsensusClusterPlus) | 
|  | 29 ##  if any of the faster clustering methods are available on this system, load them | 
|  | 30 require( amap ) | 
|  | 31 require( cluster ) | 
|  | 32 if ( any( c( 'flashClust', 'fastcluster' ) %in% installed.packages() ) ) { | 
|  | 33   if ( 'flashClust' %in% installed.packages() ) { | 
|  | 34     require( flashClust ) | 
|  | 35   } else { | 
|  | 36     if ( 'fastcluster' %in% installed.packages() ) { | 
|  | 37       require( fastcluster ) | 
|  | 38     } | 
|  | 39   } | 
|  | 40 } | 
|  | 41 | 
|  | 42 ################### | 
|  | 43 ## code borrowed/updated from ConsensusClusterPlus | 
|  | 44 ################### | 
|  | 45 | 
|  | 46 ConsensusClusterPlus <- function( d=NULL, | 
|  | 47                                   maxK = 3, | 
|  | 48                                   reps=10, | 
|  | 49                                   pItem=0.8, | 
|  | 50                                   pFeature=1, | 
|  | 51                                   clusterAlg="hc", | 
|  | 52                                   title="untitled_consensus_cluster", | 
|  | 53                                   innerLinkage="average", | 
|  | 54                                   finalLinkage="average", | 
|  | 55                                   distance=ifelse( inherits(d,"dist"), attr( d, "method" ), "euclidean" ), | 
|  | 56                                   ml=NULL, | 
|  | 57                                   tmyPal=NULL, | 
|  | 58                                   seed=NULL, | 
|  | 59                                   plot=NULL, | 
|  | 60                                   writeTable=FALSE, | 
|  | 61                                   weightsItem=NULL, | 
|  | 62                                   weightsFeature=NULL, | 
|  | 63                                   verbose=F ) { | 
|  | 64   ##description: runs consensus subsamples | 
|  | 65 | 
|  | 66 | 
|  | 67   if(is.null(seed)==TRUE){ | 
|  | 68     seed=timeSeed = as.numeric(Sys.time()) | 
|  | 69   } | 
|  | 70   set.seed(seed) | 
|  | 71 | 
|  | 72   if(is.null(ml)==TRUE){ | 
|  | 73 | 
|  | 74     if ( inherits( distance, "dist" ) ) { | 
|  | 75       stop( "If you want to pass in a pre-calculated distance object, pass it in as the data, rather than the distance parameter\n" ) | 
|  | 76     } | 
|  | 77 | 
|  | 78     if ( ! class( d ) %in% c( "dist", "matrix", "ExpressionSet" ) ) { | 
|  | 79       stop("d must be a matrix, distance object or ExpressionSet (eset object)") | 
|  | 80     } | 
|  | 81 | 
|  | 82     if ( inherits( d, "dist" ) ) { | 
|  | 83       ## if d is a distance matrix, fix a few things so that they don't cause problems with the analysis | 
|  | 84       ##  Note, assumption is that if d is a distance matrix, the user doesn't want to sample over the row features | 
|  | 85       if ( is.null( attr( d, "method" ) ) ) { | 
|  | 86         attr( d, "method" ) <- distance <- "unknown - user-specified" | 
|  | 87       } | 
|  | 88       if ( is.null( distance ) || ( distance != attr( d, "method" ) ) ) { | 
|  | 89         distance <- attr( d, "method" ) | 
|  | 90       } | 
|  | 91 | 
|  | 92       if ( ( ! is.null( pFeature ) ) && ( pFeature < 1 ) ) { | 
|  | 93         if ( verbose ) warning( "Cannot use the pFeatures parameter when specifying a distance matrix as the data object\n" ) | 
|  | 94         pFeature <- 1 | 
|  | 95       } | 
|  | 96       if ( ! is.null( weightsFeature ) ) { | 
|  | 97         if ( verbose ) warning( "Cannot use the weightsFeature parameter when specifying a distance matrix as the data object\n" ) | 
|  | 98         weightsFeature <- NULL | 
|  | 99       } | 
|  | 100       if ( clusterAlg == "km" ) { | 
|  | 101         if ( verbose ) warning( "You are asking CCPLUS to use K-means to cluster a distance matrix (rather than the data itself) - this may produce unintended results. We suggest using PAM if you want to use alternate distance metrics/objects\n" ) | 
|  | 102         ##d <- as.matrix( d )  #this is now done w/in ccRun | 
|  | 103       } | 
|  | 104     } else { | 
|  | 105       if ( is.null( distance ) ) { | 
|  | 106         ## we should never get here, but just in case | 
|  | 107         distance <- "pearson" | 
|  | 108       } | 
|  | 109     } | 
|  | 110 | 
|  | 111     if ( ( clusterAlg == "km" ) && inherits( distance, "character" ) && ( distance != "euclidean" ) ) { | 
|  | 112       warning( "WARNING: kmeans can only use the euclidean distance metric.  If you would like to use an alternate metric, we suggest using PAM or HC clustering instead. This parameter combinationwill use k-means, but will NOT use the specified distance metric\n" ) | 
|  | 113       distance <- 'euclidean' | 
|  | 114     } | 
|  | 115 | 
|  | 116 | 
|  | 117     if ( inherits( d,"ExpressionSet" ) ) { | 
|  | 118       d <- exprs(d) | 
|  | 119     } | 
|  | 120 | 
|  | 121     ml <- ccRun( d=d, | 
|  | 122                  maxK=maxK, | 
|  | 123                  repCount=reps, | 
|  | 124                  diss=inherits(d,"dist"), | 
|  | 125                  pItem=pItem, | 
|  | 126                  pFeature=pFeature, | 
|  | 127                  innerLinkage=innerLinkage, | 
|  | 128                  clusterAlg=clusterAlg, | 
|  | 129                  weightsFeature=weightsFeature, | 
|  | 130                  weightsItem=weightsItem, | 
|  | 131                  distance=distance, | 
|  | 132                  verbose=verbose) | 
|  | 133   } | 
|  | 134   res=list(); | 
|  | 135 | 
|  | 136   ##make results directory | 
|  | 137   if((is.null(plot)==FALSE | writeTable) & !file.exists(paste(title,sep=""))){ | 
|  | 138     dir.create(paste(title,sep="")) | 
|  | 139   } | 
|  | 140 | 
|  | 141   ##write log file | 
|  | 142   log <- matrix( ncol=2, | 
|  | 143                  byrow=T, | 
|  | 144                  c("title",title, | 
|  | 145                    "maxK",maxK, | 
|  | 146                    "input matrix rows",ifelse ( inherits( d, "matrix" ), nrow(d), "dist-mat" ), | 
|  | 147                    "input matric columns",ifelse ( inherits( d, "matrix" ), ncol(d), ncol( as.matrix(d) ) ), | 
|  | 148                    "number of bootstraps",reps, | 
|  | 149                    "item subsampling proportion",pItem, | 
|  | 150                    "feature subsampling proportion",ifelse( is.null(pFeature), 1, pFeature ), | 
|  | 151                    "cluster algorithm",clusterAlg, | 
|  | 152                    "inner linkage type",innerLinkage, | 
|  | 153                    "final linkage type",finalLinkage, | 
|  | 154                    "correlation method",distance, | 
|  | 155                    "plot",if(is.null(plot)) NA else plot, | 
|  | 156                    "seed",if(is.null(seed)) NA else seed)) | 
|  | 157   colnames(log) = c("option","value") | 
|  | 158   if(writeTable){ | 
|  | 159     write.csv(file=paste(title,"/",title,".log.csv",sep=""), log,row.names=F) | 
|  | 160   } | 
|  | 161   if(is.null(plot)){ | 
|  | 162     ##nothing | 
|  | 163   }else if(plot=="png"){ | 
|  | 164     png(paste(title,"/","consensus%03d.png",sep="")) | 
|  | 165   }else if (plot=="pdf"){ | 
|  | 166     pdf(onefile=TRUE, paste(title,"/","consensus.pdf",sep="")) | 
|  | 167   }else if (plot=="ps"){ | 
|  | 168     postscript(onefile=TRUE, paste(title,"/","consensus.ps",sep="")) | 
|  | 169   } | 
|  | 170 | 
|  | 171   colorList=list() | 
|  | 172   colorM = rbind() #matrix of colors. | 
|  | 173 | 
|  | 174                                         #18 colors for marking different clusters | 
|  | 175   thisPal <- c("#A6CEE3","#1F78B4","#B2DF8A","#33A02C","#FB9A99","#E31A1C","#FDBF6F","#FF7F00","#CAB2D6","#6A3D9A","#FFFF99","#B15928", | 
|  | 176                "#bd18ea", #magenta | 
|  | 177                "#2ef4ca", #aqua | 
|  | 178                "#f4cced", #pink, | 
|  | 179                "#f4cc03", #lightorange | 
|  | 180                "#05188a", #navy, | 
|  | 181                "#e5a25a", #light brown | 
|  | 182                "#06f106", #bright green | 
|  | 183                "#85848f", #med gray | 
|  | 184                "#000000", #black | 
|  | 185                "#076f25", #dark green | 
|  | 186                "#93cd7f",#lime green | 
|  | 187                "#4d0776", #dark purple | 
|  | 188                "#ffffff" #white | 
|  | 189                ) | 
|  | 190 | 
|  | 191   ##plot scale | 
|  | 192   colBreaks=NA | 
|  | 193   if(is.null(tmyPal)==TRUE){ | 
|  | 194     colBreaks=10 | 
|  | 195     tmyPal = myPal(colBreaks) | 
|  | 196   }else{ | 
|  | 197     colBreaks=length(tmyPal) | 
|  | 198   } | 
|  | 199   sc = cbind(seq(0,1,by=1/( colBreaks) )); rownames(sc) = sc[,1] | 
|  | 200   sc = cbind(sc,sc) | 
|  | 201   heatmap(sc, Colv=NA, Rowv=NA, symm=FALSE, scale='none', col=tmyPal, na.rm=TRUE,labRow=rownames(sc),labCol=F,main="consensus matrix legend") | 
|  | 202 | 
|  | 203   for (tk in 2:maxK){ | 
|  | 204     if(verbose){ | 
|  | 205       message(paste("consensus ",tk)) | 
|  | 206     } | 
|  | 207     fm = ml[[tk]] | 
|  | 208     hc=hclust( as.dist( 1 - fm ), method=finalLinkage); | 
|  | 209     message("clustered") | 
|  | 210     ct = cutree(hc,tk) | 
|  | 211     names(ct) = colnames(d) | 
|  | 212     c = fm | 
|  | 213     ##colnames(c) = colnames(d) | 
|  | 214     ##rownames(c) = colnames(d) | 
|  | 215 | 
|  | 216     colorList = setClusterColors(res[[tk-1]][[3]],ct,thisPal,colorList) | 
|  | 217 | 
|  | 218     pc = c | 
|  | 219     pc=pc[hc$order,] #***pc is matrix for plotting, same as c but is row-ordered and has names and extra row of zeros. | 
|  | 220     pc = rbind(pc,0) | 
|  | 221 | 
|  | 222     heatmap(pc, Colv=as.dendrogram(hc), Rowv=NA, symm=FALSE, scale='none', col=tmyPal, na.rm=TRUE,labRow=F,labCol=F,mar=c(5,5),main=paste("consensus matrix k=",tk,sep="") , ColSideCol=colorList[[1]]) | 
|  | 223     legend("topright",legend=unique(ct),fill=unique(colorList[[1]]),horiz=FALSE ) | 
|  | 224 | 
|  | 225     res[[tk]] = list(consensusMatrix=c,consensusTree=hc,consensusClass=ct,ml=ml[[tk]],clrs=colorList) | 
|  | 226     colorM = rbind(colorM,colorList[[1]]) | 
|  | 227   } | 
|  | 228   CDF(ml) | 
|  | 229   clusterTrackingPlot(colorM[,res[[length(res)]]$consensusTree$order]) | 
|  | 230   if(is.null(plot)==FALSE){ | 
|  | 231     dev.off(); | 
|  | 232   } | 
|  | 233   res[[1]] = colorM | 
|  | 234   if(writeTable){ | 
|  | 235     for(i in 2:length(res)){ | 
|  | 236       write.csv(file=paste(title,"/",title,".k=",i,".consensusMatrix.csv",sep=""), res[[i]]$consensusMatrix) | 
|  | 237       write.table(file=paste(title,"/",title,".k=",i,".consensusClass.csv",sep=""), res[[i]]$consensusClass,col.names = F,sep=",") | 
|  | 238     } | 
|  | 239   } | 
|  | 240   return(res) | 
|  | 241 } | 
|  | 242 | 
|  | 243 | 
|  | 244 calcICL = function(res,title="untitled_consensus_cluster",plot=NULL,writeTable=FALSE){ | 
|  | 245   #calculates and plots cluster consensus and item consensus | 
|  | 246   cc=rbind() | 
|  | 247   cci = rbind() | 
|  | 248   sumRes=list() | 
|  | 249   colorsArr=c() | 
|  | 250 | 
|  | 251   #make results directory | 
|  | 252   if((is.null(plot)==FALSE | writeTable) & !file.exists(paste(title,sep=""))){ | 
|  | 253 	dir.create(paste(title,sep="")) | 
|  | 254   } | 
|  | 255   if(is.null(plot)){ | 
|  | 256     #to screen | 
|  | 257   }else if(plot=="pdf"){ | 
|  | 258     pdf(onefile=TRUE, paste(title,"/","icl.pdf",sep="")) | 
|  | 259   }else if(plot=="ps"){ | 
|  | 260     postscript(onefile=TRUE, paste(title,"/","icl.ps",sep="")) | 
|  | 261   }else if (plot=="png"){ | 
|  | 262     png(paste(title,"/","icl%03d.png",sep="")) | 
|  | 263   } | 
|  | 264 | 
|  | 265   par(mfrow=c(3,1),mar=c(4,3,2,0)) | 
|  | 266 | 
|  | 267   for (k in 2:length(res)){ #each k | 
|  | 268     eiCols = c(); | 
|  | 269     o = res[[k]] | 
|  | 270     m = o$consensusMatrix | 
|  | 271     m = triangle(m,mode=2) | 
|  | 272     for (ci in sort(unique(o$consensusClass))){ #each cluster in k | 
|  | 273 	items = which(o$consensusClass==ci) | 
|  | 274 	nk = length(items) | 
|  | 275 	mk = sum( m[items,items], na.rm=T)/((nk*(nk-1))/2) | 
|  | 276 	cc=rbind(cc,c(k,ci,mk)) #cluster-consensus | 
|  | 277 | 
|  | 278       for (ei in rev(res[[2]]$consensusTree$order) ){ | 
|  | 279 		denom = if (ei %in% items) { nk - 1} else { nk } | 
|  | 280         	mei = sum( c(m[ei,items],m[items,ei]), na.rm=T)/denom  # mean item consensus to a cluster. | 
|  | 281 		cci = rbind(cci,c(k,ci,ei,mei)) #cluster, cluster index, item index, item-consensus | 
|  | 282       } | 
|  | 283       eiCols = c(eiCols, rep(ci,length(o$consensusClass)) ) | 
|  | 284     } | 
|  | 285 | 
|  | 286 	  cck = cci[which(cci[,1]==k),] #only plot the new k data. | 
|  | 287 | 
|  | 288 	  #group by item, order by cluster i | 
|  | 289 	  w=lapply(split(cck,cck[,3]), function(x) { y=matrix(unlist(x),ncol=4); y[order(y[,2]),4] }) | 
|  | 290 	  q = matrix(as.numeric(unlist(w)),ncol=length(w),byrow=F) | 
|  | 291 	  q = q[,res[[2]]$consensusTree$order] #order by leave order of k=2 | 
|  | 292  	  #q is a matrix of k rows and sample columns, values are item consensus of sample to the cluster. | 
|  | 293 | 
|  | 294 	  thisColors = unique(cbind(res[[k]]$consensusClass,res[[k]]$clrs[[1]])) | 
|  | 295 	  thisColors=thisColors[order(as.numeric(thisColors[,1])),2] | 
|  | 296 	  colorsArr=c(colorsArr,thisColors) | 
|  | 297 	  sumRes[[k]] = rankedBarPlot(q,thisColors,cc=res[[k]]$consensusClass[res[[2]]$consensusTree$order],paste("k=",k,sep="") ) | 
|  | 298   } | 
|  | 299 | 
|  | 300   ys=cs=lab=c() | 
|  | 301   lastk=cc[1,1] | 
|  | 302   for(i in 1:length(colorsArr)){ | 
|  | 303     if(lastk != cc[i,1]){ | 
|  | 304       ys=c(ys,0,0) | 
|  | 305       cs=c(cs,NA,NA) | 
|  | 306       lastk=cc[i,1] | 
|  | 307       lab=c(lab,NA,NA) | 
|  | 308     } | 
|  | 309     ys=c(ys,cc[i,3]) | 
|  | 310     cs=c(cs,colorsArr[i]) | 
|  | 311     lab=c(lab,cc[i,1]) | 
|  | 312   } | 
|  | 313   names(ys) = lab | 
|  | 314   par(mfrow=c(3,1),mar=c(4,3,2,0)) | 
|  | 315   barplot(ys,col=cs,border=cs,main="cluster-consensus",ylim=c(0,1),las=1) | 
|  | 316   if(is.null(plot)==FALSE){ | 
|  | 317 	  dev.off() | 
|  | 318   } | 
|  | 319   colnames(cc) = c("k","cluster","clusterConsensus") | 
|  | 320   colnames(cci) = c("k","cluster","item","itemConsensus") | 
|  | 321   cci[,"item"] = names(res[[2]]$consensusClass)[ cci[,"item"] ] | 
|  | 322   #type cci | 
|  | 323   cci = data.frame( k=as.numeric(cci[,"k"]), cluster=as.numeric(cci[,"cluster"]), item=cci[,"item"], itemConsensus=as.numeric(cci[,"itemConsensus"])) | 
|  | 324 | 
|  | 325   #write to file. | 
|  | 326   if(writeTable){ | 
|  | 327 	write.csv(file=paste(title,"/",title,".summary.cluster.consensus.csv",sep=""),row.names=F, cc) | 
|  | 328 	write.csv(file=paste(title,"/",title,".summary.item.consensus.csv",sep=""), row.names=F, cc) | 
|  | 329   } | 
|  | 330   return(list(clusterConsensus=cc,itemConsensus=cci)) | 
|  | 331 } | 
|  | 332 | 
|  | 333 | 
|  | 334 ccRun <- function( d=d, | 
|  | 335                    maxK=NULL, | 
|  | 336                    repCount=NULL, | 
|  | 337                    diss=inherits( d, "dist" ), | 
|  | 338                    pItem=NULL, | 
|  | 339                    pFeature=NULL, | 
|  | 340                    innerLinkage=NULL, | 
|  | 341                    distance=ifelse( inherits(d,"dist"), attr( d, "method" ), "euclidean" ), | 
|  | 342                    clusterAlg=NULL, | 
|  | 343                    weightsItem=NULL, | 
|  | 344                    weightsFeature=NULL, | 
|  | 345                    verbose=NULL) { | 
|  | 346   m = vector(mode='list', repCount) | 
|  | 347   ml = vector(mode="list",maxK) | 
|  | 348   n <- ifelse( diss, ncol( as.matrix(d) ), ncol(d) ) | 
|  | 349   mCount = mConsist = matrix(c(0),ncol=n,nrow=n) | 
|  | 350   ml[[1]] = c(0); | 
|  | 351 | 
|  | 352   if (is.null( distance ) ) distance <- 'euclidean'  ## necessary if d is a dist object and attr( d, "method" ) == NULLa | 
|  | 353 | 
|  | 354   require( amap ) | 
|  | 355   ##  we're going to use the amap Dist function, but they misname their correlation | 
|  | 356   ##  functions, so re-name them correctly | 
|  | 357   amap.distance <- c( "euclidean", "maximum", "manhattan", "canberra", "binary", | 
|  | 358                       "pearson", "abspearson", "correlation", "abscorrelation", "spearman", "kendall" ) | 
|  | 359   names( amap.distance ) <- c( "euclidean", "maximum", "manhattan", "canberra", "binary", | 
|  | 360                                "cosine", "abscosine", "pearson", "abspearson", "spearman", "kendall" ) | 
|  | 361   main.dist.obj <- NULL | 
|  | 362   ##browser() | 
|  | 363   if ( diss ){ | 
|  | 364     main.dist.obj <- d | 
|  | 365 | 
|  | 366     ## reset the pFeature & weightsFeature params if they've been set (irrelevant if d is a dist matrix) | 
|  | 367     if ( ( !is.null(pFeature) ) && | 
|  | 368          ( pFeature < 1 ) ) { | 
|  | 369       if (verbose) warning( "user-supplied data is a distance matrix; ignoring user-specified pFeature parameter\n" ) | 
|  | 370       pFeature <- 1 # set it to 1 to avoid problems with sampleCols | 
|  | 371     } | 
|  | 372     if ( ! is.null( weightsFeature ) ) { | 
|  | 373       if (verbose) warning( "user-supplied data is a distance matrix; ignoring user-specified weightsFeature parameter\n" ) | 
|  | 374       weightsFeature <- NULL  # set it to NULL to avoid problems with sampleCols | 
|  | 375     } | 
|  | 376   } else { ## d is a data matrix | 
|  | 377     ## we're not sampling over the features | 
|  | 378     if ( ( clusterAlg != "km" ) && | 
|  | 379          ( is.null( pFeature ) || | 
|  | 380            ( ( pFeature == 1 ) && is.null( weightsFeature ) ) ) ) { | 
|  | 381       ## only generate a main.dist.object IFF 1) d is a matrix, 2) we're not sampling the features, and 3) the algorithm isn't 'km' | 
|  | 382       if ( inherits( distance, "character" ) ) { | 
|  | 383         if ( ! distance %in% names( amap.distance ) ) stop("unsupported distance.") | 
|  | 384 | 
|  | 385         main.dist.obj <- Dist( t(d), method=as.character( amap.distance[ distance ] ) ) | 
|  | 386         ## now fix dumb amap naming convention for distance metrics | 
|  | 387         attr( main.dist.obj, "method" ) <- as.character( amap.distance[ distance ] ) | 
|  | 388       } else stop("unsupported distance specified.") | 
|  | 389     } else { | 
|  | 390       ## pFeature < 1 or a weightsFeature != NULL | 
|  | 391       ## since d is a data matrix, the user wants to sample over the gene features, so main.dist.obj is left as NULL | 
|  | 392     } | 
|  | 393   } | 
|  | 394 | 
|  | 395 | 
|  | 396   for (i in 1:repCount){ | 
|  | 397     ##browser() | 
|  | 398     if(verbose){ | 
|  | 399       message(paste("random subsample",i)); | 
|  | 400     } | 
|  | 401     ## take expression matrix sample, samples and genes | 
|  | 402     sample_x = sampleCols( d, pItem, pFeature, weightsItem, weightsFeature ) | 
|  | 403 | 
|  | 404     this_dist = NA | 
|  | 405     if ( ! is.null( main.dist.obj ) ) { | 
|  | 406       boot.cols <- sample_x$subcols | 
|  | 407       this_dist <- as.matrix( main.dist.obj )[ boot.cols, boot.cols ] | 
|  | 408       if ( clusterAlg != "km" ) { | 
|  | 409         ## if this isn't kmeans, then convert to a distance object | 
|  | 410         this_dist <- as.dist( this_dist ) | 
|  | 411         attr( this_dist, "method" ) <- attr( main.dist.obj, "method" ) | 
|  | 412       } | 
|  | 413     } else { | 
|  | 414       ## if main.dist.obj is NULL, then d is a data matrix, and either: | 
|  | 415       ##   1) clusterAlg is 'km' | 
|  | 416       ##   2) pFeatures < 1 or weightsFeatures have been specified, or | 
|  | 417       ##   3) both | 
|  | 418       ## so we can't use a main distance object and for every iteration, we will have to re-calculate either | 
|  | 419       ##   1) the distance matrix (because we're also sampling the features as well), or | 
|  | 420       ##   2) the submat (if using km) | 
|  | 421 | 
|  | 422       if ( clusterAlg != "km" )  { | 
|  | 423         if ( ! distance %in% names( amap.distance ) ) stop("unsupported distance.") | 
|  | 424         ## good, we have a supported distance type | 
|  | 425         this_dist <- Dist( t( sample_x$submat ), method=as.character( amap.distance[ distance ] ) ) | 
|  | 426         ## now fix dumb amap naming convention for distance metrics | 
|  | 427         attr( this_dist, "method" ) <- as.character( amap.distance[ distance ] ) | 
|  | 428       } else { | 
|  | 429         ##browser() | 
|  | 430         ##clusterAlg == "km" | 
|  | 431         ## if we're not sampling the features, then grab the colslice | 
|  | 432         if ( is.null( pFeature ) || | 
|  | 433             ( ( pFeature == 1 ) && is.null( weightsFeature ) ) ) { | 
|  | 434           this_dist <- d[, sample_x$subcols ] | 
|  | 435         } else { | 
|  | 436           if ( is.na( sample_x$submat ) ) { | 
|  | 437             save( "ccrun.submat.eq.na.dbg.rda" ) | 
|  | 438             stop( "Houston, we have a problem.  sample_x$submat is NA in ccRun when it should be specified - saving state\n" ) | 
|  | 439           } | 
|  | 440 | 
|  | 441           this_dist <- sample_x$submat | 
|  | 442         } | 
|  | 443       } | 
|  | 444     } | 
|  | 445 | 
|  | 446     ## cluster samples for HC. | 
|  | 447     this_cluster=NA | 
|  | 448     if(clusterAlg=="hc"){ | 
|  | 449       this_cluster = hclust( this_dist, method=innerLinkage) | 
|  | 450     } | 
|  | 451     ##browser() | 
|  | 452     ##mCount is possible number of times that two sample occur in same random sample, independent of k | 
|  | 453     ##mCount stores number of times a sample pair was sampled together. | 
|  | 454     mCount <- connectivityMatrix( rep( 1,length(sample_x[[3]])), | 
|  | 455                                   mCount, | 
|  | 456                                   sample_x[[3]] ) | 
|  | 457 | 
|  | 458     ##use samples for each k | 
|  | 459     for (k in 2:maxK){ | 
|  | 460       if(verbose){ | 
|  | 461         message(paste("  k =",k)) | 
|  | 462       } | 
|  | 463       if (i==1){ | 
|  | 464         ml[[k]] = mConsist #initialize | 
|  | 465       } | 
|  | 466       this_assignment=NA | 
|  | 467       if(clusterAlg=="hc"){ | 
|  | 468         ##prune to k for hc | 
|  | 469         this_assignment = cutree(this_cluster,k) | 
|  | 470         ##browser() | 
|  | 471       }else if(clusterAlg=="km"){ | 
|  | 472         ##this_dist should now be a matrix corresponding to the result from sampleCols | 
|  | 473         this_assignment <- kmeans( t( this_dist ), | 
|  | 474                                    k, | 
|  | 475                                    iter.max = 10, | 
|  | 476                                    nstart = 1, | 
|  | 477                                    algorithm = c("Hartigan-Wong") )$cluster | 
|  | 478       }else if ( clusterAlg == "pam" ) { | 
|  | 479         require( cluster ) | 
|  | 480         this_assignment <- pam( x=this_dist, | 
|  | 481                                 k, | 
|  | 482                                 diss=TRUE, | 
|  | 483                                 metric=distance, | 
|  | 484                                 cluster.only=TRUE ) | 
|  | 485       } else{ | 
|  | 486         ##optional cluterArg Hook. | 
|  | 487         this_assignment <- get(clusterAlg)(this_dist, k) | 
|  | 488       } | 
|  | 489       ##add to tally | 
|  | 490       ml[[k]] <- connectivityMatrix( this_assignment, | 
|  | 491                                      ml[[k]], | 
|  | 492                                      sample_x[[3]] ) | 
|  | 493     } | 
|  | 494   } | 
|  | 495 | 
|  | 496 | 
|  | 497   ##consensus fraction | 
|  | 498   res = vector(mode="list",maxK) | 
|  | 499   for (k in 2:maxK){ | 
|  | 500     ##fill in other half of matrix for tally and count. | 
|  | 501     tmp = triangle(ml[[k]],mode=3) | 
|  | 502     tmpCount = triangle(mCount,mode=3) | 
|  | 503     res[[k]] = tmp / tmpCount | 
|  | 504     res[[k]][which(tmpCount==0)] = 0 | 
|  | 505   } | 
|  | 506   message("end fraction") | 
|  | 507   return(res) | 
|  | 508 } | 
|  | 509 | 
|  | 510 | 
|  | 511 connectivityMatrix <- function( clusterAssignments, m, sampleKey){ | 
|  | 512   ##input: named vector of cluster assignments, matrix to add connectivities | 
|  | 513   ##output: connectivity matrix | 
|  | 514   names( clusterAssignments ) <- sampleKey | 
|  | 515   cls <- lapply( unique( clusterAssignments ), function(i) as.numeric( names( clusterAssignments[ clusterAssignments %in% i ] ) ) ) | 
|  | 516 | 
|  | 517   for ( i in 1:length( cls ) ) { | 
|  | 518     nelts <- 1:ncol( m ) | 
|  | 519     cl <- as.numeric( nelts %in% cls[[i]] ) ## produces a binary vector | 
|  | 520     updt <- outer( cl, cl ) | 
|  | 521     m <- m + updt | 
|  | 522   } | 
|  | 523   return(m) | 
|  | 524 } | 
|  | 525 | 
|  | 526 ## returns a list with the sample columns, as well as the sub-matrix & sample features (if necessary) | 
|  | 527 ##  if no sampling over the features is performed, the submatrix & sample features are returned as NAs | 
|  | 528 ##  to reduce memory overhead | 
|  | 529 sampleCols <- function( d, | 
|  | 530                         pSamp=NULL, | 
|  | 531                         pRow=NULL, | 
|  | 532                         weightsItem=NULL, | 
|  | 533                         weightsFeature=NULL ){ | 
|  | 534   space <- ifelse( inherits( d, "dist" ), ncol( as.matrix(d) ), ncol(d) ) | 
|  | 535   sampleN <- floor(space*pSamp) | 
|  | 536   sampCols <- sort( sample(space, sampleN, replace = FALSE, prob = weightsItem) ) | 
|  | 537 | 
|  | 538   this_sample <- sampRows <- NA | 
|  | 539   if ( inherits( d, "matrix" ) ) { | 
|  | 540     if ( (! is.null( pRow ) ) && | 
|  | 541          ( (pRow < 1 ) || (! is.null( weightsFeature ) ) ) ) { | 
|  | 542       ## only sample the rows and generate a sub-matrix if we're sampling over the row/gene/features | 
|  | 543       space = nrow(d) | 
|  | 544       sampleN = floor(space*pRow) | 
|  | 545       sampRows = sort( sample(space, sampleN, replace = FALSE, prob = weightsFeature) ) | 
|  | 546       this_sample <- d[sampRows,sampCols] | 
|  | 547       dimnames(this_sample) <- NULL | 
|  | 548     } else { | 
|  | 549       ## do nothing | 
|  | 550     } | 
|  | 551   } | 
|  | 552   return( list( submat=this_sample, | 
|  | 553                 subrows=sampRows, | 
|  | 554                 subcols=sampCols ) ) | 
|  | 555 } | 
|  | 556 | 
|  | 557 CDF=function(ml,breaks=100){ | 
|  | 558   #plot CDF distribution | 
|  | 559   plot(c(0),xlim=c(0,1),ylim=c(0,1),col="white",bg="white",xlab="consensus index",ylab="CDF",main="consensus CDF", las=2) | 
|  | 560   k=length(ml) | 
|  | 561   this_colors = rainbow(k-1) | 
|  | 562   areaK = c() | 
|  | 563   for (i in 2:length(ml)){ | 
|  | 564     v=triangle(ml[[i]],mode=1) | 
|  | 565 | 
|  | 566     #empirical CDF distribution. default number of breaks is 100 | 
|  | 567     h = hist(v, plot=FALSE, breaks=seq(0,1,by=1/breaks)) | 
|  | 568     h$counts = cumsum(h$counts)/sum(h$counts) | 
|  | 569 | 
|  | 570     #calculate area under CDF curve, by histogram method. | 
|  | 571     thisArea=0 | 
|  | 572     for (bi in 1:(length(h$breaks)-1)){ | 
|  | 573        thisArea = thisArea + h$counts[bi]*(h$breaks[bi+1]-h$breaks[bi]) #increment by height by width | 
|  | 574        bi = bi + 1 | 
|  | 575     } | 
|  | 576     areaK = c(areaK,thisArea) | 
|  | 577     lines(h$mids,h$counts,col=this_colors[i-1],lwd=2,type='l') | 
|  | 578   } | 
|  | 579   legend(0.8,0.5,legend=paste(rep("",k-1),seq(2,k,by=1),sep=""),fill=this_colors) | 
|  | 580 | 
|  | 581   #plot area under CDF change. | 
|  | 582   deltaK=areaK[1] #initial auc at k=2 | 
|  | 583   for(i in 2:(length(areaK))){ | 
|  | 584     #proportional increase relative to prior K. | 
|  | 585     deltaK = c(deltaK,( areaK[i] - areaK[i-1])/areaK[i-1]) | 
|  | 586   } | 
|  | 587   plot(1+(1:length(deltaK)),y=deltaK,xlab="k",ylab="relative change in area under CDF curve",main="Delta area",type="b") | 
|  | 588 } | 
|  | 589 | 
|  | 590 | 
|  | 591 myPal = function(n=10){ | 
|  | 592   #returns n colors | 
|  | 593   seq = rev(seq(0,255,by=255/(n))) | 
|  | 594   palRGB = cbind(seq,seq,255) | 
|  | 595   rgb(palRGB,maxColorValue=255) | 
|  | 596 } | 
|  | 597 | 
|  | 598 setClusterColors = function(past_ct,ct,colorU,colorList){ | 
|  | 599 	#description: sets common color of clusters between different K | 
|  | 600 	newColors = c() | 
|  | 601 	if(length(colorList)==0){ | 
|  | 602 		#k==2 | 
|  | 603 		newColors = colorU[ct] | 
|  | 604 		colori=2 | 
|  | 605 	}else{ | 
|  | 606 		newColors = rep(NULL,length(ct)) | 
|  | 607 		colori = colorList[[2]] | 
|  | 608 		mo=table(past_ct,ct) | 
|  | 609 		m=mo/apply(mo,1,sum) | 
|  | 610 			for(tci in 1:ncol(m)){ # for each cluster | 
|  | 611 				maxC = max(m[,tci]) | 
|  | 612 				pci = which(m[,tci] == maxC) | 
|  | 613 				if( sum(m[,tci]==maxC)==1 & max(m[pci,])==maxC & sum(m[pci,]==maxC)==1  )  { | 
|  | 614 				#if new column maximum is unique, same cell is row maximum and is also unique | 
|  | 615 				##Note: the greatest of the prior clusters' members are the greatest in a current cluster's members. | 
|  | 616 					newColors[which(ct==tci)] = unique(colorList[[1]][which(past_ct==pci)]) # one value | 
|  | 617 				}else{ #add new color. | 
|  | 618 					colori=colori+1 | 
|  | 619 					newColors[which(ct==tci)] = colorU[colori] | 
|  | 620 				} | 
|  | 621 			} | 
|  | 622 	} | 
|  | 623 	return(list(newColors,colori,unique(newColors) )) | 
|  | 624 } | 
|  | 625 | 
|  | 626 clusterTrackingPlot = function(m){ | 
|  | 627   #description: plots cluster tracking plot | 
|  | 628   #input: m - matrix where rows are k, columns are samples, and values are cluster assignments. | 
|  | 629   plot(NULL,xlim=c(-0.1,1),ylim=c(0,1),axes=FALSE,xlab="samples",ylab="k",main="tracking plot") | 
|  | 630   for(i in 1:nrow(m)){ | 
|  | 631     rect(  xleft=seq(0,1-1/ncol(m),by=1/ncol(m)),  ybottom=rep(1-i/nrow(m),ncol(m)) , xright=seq(1/ncol(m),1,by=1/ncol(m)), ytop=rep(1-(i-1)/nrow(m),ncol(m)), col=m[i,],border=NA) | 
|  | 632   } | 
|  | 633   #hatch lines to indicate samples | 
|  | 634   xl = seq(0,1-1/ncol(m),by=1/ncol(m)) | 
|  | 635   segments(  xl, rep(-0.1,ncol(m)) , xl, rep(0,ncol(m)), col="black")    #** alt white and black color? | 
|  | 636   ypos = seq(1,0,by=-1/nrow(m))-1/(2*nrow(m)) | 
|  | 637   text(x=-0.1,y=ypos[-length(ypos)],labels=seq(2,nrow(m)+1,by=1)) | 
|  | 638 } | 
|  | 639 | 
|  | 640 triangle = function(m,mode=1){ | 
|  | 641   #mode=1 for CDF, vector of lower triangle. | 
|  | 642   #mode==3 for full matrix. | 
|  | 643   #mode==2 for calcICL; nonredundant half matrix coun | 
|  | 644   #mode!=1 for summary | 
|  | 645   n=dim(m)[1] | 
|  | 646   nm = matrix(0,ncol=n,nrow=n) | 
|  | 647   fm = m | 
|  | 648 | 
|  | 649 | 
|  | 650   nm[upper.tri(nm)] = m[upper.tri(m)] #only upper half | 
|  | 651 | 
|  | 652   fm = t(nm)+nm | 
|  | 653   diag(fm) = diag(m) | 
|  | 654 | 
|  | 655   nm=fm | 
|  | 656   nm[upper.tri(nm)] = NA | 
|  | 657   diag(nm) = NA | 
|  | 658   vm = m[lower.tri(nm)] | 
|  | 659 | 
|  | 660   if(mode==1){ | 
|  | 661     return(vm) #vector | 
|  | 662   }else if(mode==3){ | 
|  | 663     return(fm) #return full matrix | 
|  | 664   }else if(mode == 2){ | 
|  | 665     return(nm) #returns lower triangle and no diagonal. no double counts. | 
|  | 666   } | 
|  | 667 | 
|  | 668 } | 
|  | 669 | 
|  | 670 | 
|  | 671 rankedBarPlot=function(d,myc,cc,title){ | 
|  | 672 	colors = rbind() #each row is a barplot series | 
|  | 673 	byRank = cbind() | 
|  | 674 | 
|  | 675 	spaceh = 0.1 #space between bars | 
|  | 676 	for(i in 1:ncol(d)){ | 
|  | 677 	  byRank = cbind(byRank,sort(d[,i],na.last=F)) | 
|  | 678 	  colors = rbind(colors,order(d[,i],na.last=F)) | 
|  | 679 	} | 
|  | 680 	maxH = max(c(1.5,apply(byRank,2,sum)),na.rm=T) #maximum height of graph | 
|  | 681 | 
|  | 682 	#barplot largest to smallest so that smallest is in front. | 
|  | 683 	barp = barplot( apply(byRank,2,sum) ,  col=myc[colors[,1]] ,space=spaceh,ylim=c(0,maxH),main=paste("item-consensus", title),border=NA,las=1  ) | 
|  | 684 	for(i in 2:nrow(byRank)){ | 
|  | 685 	  barplot( apply(matrix(byRank[i:nrow(byRank),],ncol=ncol(byRank))  ,2,sum), space=spaceh,col=myc[colors[,i]],ylim=c(0,maxH), add=T,border=NA,las=1  ) | 
|  | 686 	} | 
|  | 687 	xr=seq(spaceh,ncol(d)+ncol(d)*spaceh,(ncol(d)+ncol(d)*spaceh)/ncol(d)  ) | 
|  | 688 	#class labels as asterisks | 
|  | 689 	text("*",x=xr+0.5,y=maxH,col=myc[cc],cex=1.4) #rect(xr,1.4,xr+1,1.5,col=myc[cc] ) | 
|  | 690 } | 
|  | 691 | 
|  | 692 | 
|  | 693 | 
|  | 694 ###################################################################3333 | 
|  | 695 ## RESTART MY SCRIPTS HERE | 
|  | 696 ##save.image( '/home/waltman/work.local/tmp/new.ccplus.R.dbg' ) | 
|  | 697 stop( "phw forced stop\n") | 
|  | 698 spec <- matrix( c( "data.fname",         "d", 1, "character", | 
|  | 699                    "direction",          "n", 2, "character", | 
|  | 700                    "output.name",        "o", 2, "character", | 
|  | 701                    "cluster.alg",        "a", 2, "character", ## must be either 'hc' or 'km' | 
|  | 702                    "distance.metric",    "m", 2, "character", ## must be one supported by ConsensusClusterPlus | 
|  | 703                    "max.k",              "k", 2, "integer", | 
|  | 704                    "reps",               "r", 2, "integer", | 
|  | 705                    "innerLinkage",       "i", 1, "character", | 
|  | 706                    "finalLinkage",       "f", 1, "character", | 
|  | 707                    "out.report.dir",     "p", 2, "character", | 
|  | 708                    "out.report.html",    "h", 2, "character" | 
|  | 709                    ), | 
|  | 710                 nc=4, | 
|  | 711                 byrow=TRUE | 
|  | 712                ) | 
|  | 713 | 
|  | 714 opt <- getopt( spec=spec ) | 
|  | 715 | 
|  | 716 ## default params for non-required params | 
|  | 717 if ( is.null( opt$direction ) ) { opt$direction <- "cols"  } | 
|  | 718 if ( is.null( opt$cluster.alg ) ) { opt$cluster.alg <- "pam" } | 
|  | 719 if ( is.null( opt$output.name ) ) { opt$output.name <- "consensus.cluster.result" } | 
|  | 720 if ( is.null( opt$distance.metric ) ) { opt$distance.metric <- "cosine" } | 
|  | 721 if ( is.null( opt$max.k ) ) { opt$max.k <- 10 } | 
|  | 722 if ( is.null( opt$reps ) ) { opt$reps <- 1000 } | 
|  | 723 if ( is.null( opt$innerLinkage ) ) { opt$innerLinkage <- "average" } | 
|  | 724 if ( is.null( opt$finalLinkage ) ) { opt$finalLinkage <- "average" } | 
|  | 725 | 
|  | 726 if ( is.null( opt$out.report.dir ) ) { opt$out.report.dir <- "report" } | 
|  | 727 if ( is.null( opt$out.report.html ) ) { opt$out.report.html <- file.path( "report", "index.html" ) } | 
|  | 728 | 
|  | 729 ## validate params here (make sure set to valid values) | 
|  | 730 if ( !opt$cluster.alg %in% c( "hc", "km", "pam" ) ) { | 
|  | 731   stop( "invalid clustering algorithm specified", cluster.alg ) | 
|  | 732 } | 
|  | 733 | 
|  | 734 | 
|  | 735 data <- as.matrix( read.delim( opt$data.fname, header=T, row.names=1 , check.names=FALSE ) ) | 
|  | 736 ## transpose the matrix if we want to cluster the rows (genes) | 
|  | 737 if ( opt$direction == "rows" ) { | 
|  | 738   data <- t( data ) | 
|  | 739 } | 
|  | 740 | 
|  | 741 | 
|  | 742 title <- paste( opt$cluster.alg, opt$output.name, sep="." ) | 
|  | 743 ##source( '~/bin/galaxy-dist/tools/ucsc.cancer.tools/cluster.tools/new.ccplus.R' ) | 
|  | 744 results <- ConsensusClusterPlus( data, | 
|  | 745                                  maxK=opt$max.k, | 
|  | 746                                  reps=opt$reps, | 
|  | 747                                  pItem=0.8, | 
|  | 748                                  ##pFeature=NULL, | 
|  | 749                                  pFeature=0.5, | 
|  | 750                                  title=opt$out.report.dir, | 
|  | 751                                  clusterAlg=opt$cluster.alg, | 
|  | 752                                  distance=opt$distance.metric, | 
|  | 753                                  innerLinkage=opt$innerLinkage, | 
|  | 754                                  finalLinkage=opt$finalLinkage, | 
|  | 755                                  plot='pdf', | 
|  | 756                                  writeTable=FALSE, | 
|  | 757                                  seed=100, | 
|  | 758                                  weightsFeature=abs( rnorm( nrow( orig.data ) ) ), | 
|  | 759                                  ##verbose=FALSE ) | 
|  | 760                                  verbose=TRUE ) | 
|  | 761 | 
|  | 762 pngs = list.files(path=opt$out.report.dir, patt="png") | 
|  | 763 html.out <- paste( "<html>", | 
|  | 764                    paste( paste( "<div><img src=\'", pngs, sep="" ), "\'/></div>", sep="" ), | 
|  | 765                    "</html>" ) | 
|  | 766 cat( html.out, file=opt$out.report.html ) | 
|  | 767 | 
|  | 768 | 
|  | 769 ## re-transpose the matrix back if we've clustered the rows (genes) | 
|  | 770 if ( opt$direction == "rows" ) { | 
|  | 771   data <- t( data ) | 
|  | 772 } | 
|  | 773 save( file=opt$output.name, data, results) |