#!/usr/bin/env Rscript
## Consensus Clustering Script by Peter Waltman
## May 31, 2011
## License under Creative Commons Attribution 3.0 Unported (CC BY 3.0)
##
#usage, options and doc goes here
argspec <- c("consensus.clustering.R takes a tab-delimited data matrix and runs consensus clustering
on it (using the copy of the ConsensusClusterPlus routines bundled below), writing consensus
heatmaps, CDF/delta-area plots and a tracking plot to a report directory, and saving the
clustering results to an R data file.

        Usage:
                consensus.clustering.R -d <data.file>
        Optional:
                -o <output.name>      ## name of the output R data file
                -n <direction>        ## 'cols' (default) or 'rows' -- which dimension to cluster
                -a <cluster.alg>      ## must be one of 'hc', 'km' or 'pam' (default 'pam')
                -m <distance.metric>  ## must be one supported by amap's Dist() (default 'cosine')
                -k <max.k>            ## maximum number of clusters to evaluate (default 10)
                -r <reps>             ## number of subsampling iterations (default 1000)
                -i <innerLinkage>     ## linkage used on each subsample (default 'average')
                -f <finalLinkage>     ## linkage used on the consensus matrix (default 'average')
                -p <out.report.dir>   ## directory for the PNG report (default 'report')
                -h <out.report.html>  ## HTML index for the report (default 'report/index.html')
\n\n")
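
## Illustrative invocation (the file names here are hypothetical placeholders):
##   Rscript consensus.clustering.R -d expression.tab -o my.result.rda -a pam -m pearson -k 6 -r 250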
args <- commandArgs(TRUE)
if ( length( args ) == 1 && args == "--help") {
    write(argspec, stderr())
    q();
}

lib.load.quiet <- function( package ) {
    package <- as.character(substitute(package))
    suppressPackageStartupMessages( do.call( "library", list( package=package ) ) )
}
lib.load.quiet(getopt)
lib.load.quiet( gplots )
lib.load.quiet( amap )
## if any of the faster clustering methods are available on this system, load them
if ( any( c( 'flashClust', 'fastcluster' ) %in% installed.packages() ) ) {
    if ( 'flashClust' %in% installed.packages() ) {
        lib.load.quiet( flashClust )
    } else {
        if ( 'fastcluster' %in% installed.packages() ) {
            lib.load.quiet( fastcluster )
        }
    }
}
##lib.load.quiet(ConsensusClusterPlus)
lib.load.quiet( amap )
lib.load.quiet( cluster )

###################
## code borrowed/updated from ConsensusClusterPlus
###################

ConsensusClusterPlus <- function( d=NULL,
                                  maxK = 3,
                                  reps=10,
                                  pItem=0.8,
                                  pFeature=1,
                                  clusterAlg="hc",
                                  title="untitled_consensus_cluster",
                                  innerLinkage="average",
                                  finalLinkage="average",
                                  distance=ifelse( inherits(d,"dist"), attr( d, "method" ), "euclidean" ),
                                  ml=NULL,
                                  tmyPal=NULL,
                                  seed=NULL,
                                  plot=NULL,
                                  writeTable=FALSE,
                                  weightsItem=NULL,
                                  weightsFeature=NULL,
                                  verbose=F ) {
    ##description: runs consensus subsamples


    if(is.null(seed)==TRUE){
        seed=timeSeed = as.numeric(Sys.time())
    }
    set.seed(seed)

    if(is.null(ml)==TRUE){

        if ( inherits( distance, "dist" ) ) {
            stop( "If you want to pass in a pre-calculated distance object, pass it in as the data, rather than the distance parameter\n" )
        }

        if ( ! class( d ) %in% c( "dist", "matrix", "ExpressionSet" ) ) {
            stop("d must be a matrix, distance object or ExpressionSet (eset object)")
        }

        if ( inherits( d, "dist" ) ) {
            ## if d is a distance matrix, fix a few things so that they don't cause problems with the analysis
            ## Note, assumption is that if d is a distance matrix, the user doesn't want to sample over the row features
            if ( is.null( attr( d, "method" ) ) ) {
                attr( d, "method" ) <- distance <- "unknown - user-specified"
            }
            if ( is.null( distance ) || ( distance != attr( d, "method" ) ) ) {
                distance <- attr( d, "method" )
            }

            if ( ( ! is.null( pFeature ) ) && ( pFeature < 1 ) ) {
                if ( verbose ) warning( "Cannot use the pFeatures parameter when specifying a distance matrix as the data object\n" )
                pFeature <- 1
            }
            if ( ! is.null( weightsFeature ) ) {
                if ( verbose ) warning( "Cannot use the weightsFeature parameter when specifying a distance matrix as the data object\n" )
                weightsFeature <- NULL
            }
            if ( clusterAlg == "km" ) {
                if ( verbose ) warning( "You are asking CCPLUS to use K-means to cluster a distance matrix (rather than the data itself) - this may produce unintended results. We suggest using PAM if you want to use alternate distance metrics/objects\n" )
                ##d <- as.matrix( d ) #this is now done w/in ccRun
            }
        } else {
            if ( is.null( distance ) ) {
                ## we should never get here, but just in case
                distance <- "pearson"
            }
        }

        if ( ( clusterAlg == "km" ) && inherits( distance, "character" ) && ( distance != "euclidean" ) ) {
            warning( "WARNING: kmeans can only use the euclidean distance metric. If you would like to use an alternate metric, we suggest using PAM or HC clustering instead. This parameter combination will use k-means, but will NOT use the specified distance metric\n" )
            distance <- 'euclidean'
        }


        if ( inherits( d,"ExpressionSet" ) ) {
            d <- exprs(d)
        }

        ml <- ccRun( d=d,
                     maxK=maxK,
                     repCount=reps,
                     diss=inherits(d,"dist"),
                     pItem=pItem,
                     pFeature=pFeature,
                     innerLinkage=innerLinkage,
                     clusterAlg=clusterAlg,
                     weightsFeature=weightsFeature,
                     weightsItem=weightsItem,
                     distance=distance,
                     verbose=verbose)
    }
    res=list();

    ##make results directory
    if((is.null(plot)==FALSE | writeTable) & !file.exists(paste(title,sep=""))){
        dir.create(paste(title,sep=""))
    }

    ##write log file
    log <- matrix( ncol=2,
                   byrow=T,
                   c("title",title,
                     "maxK",maxK,
                     "input matrix rows",ifelse ( inherits( d, "matrix" ), nrow(d), "dist-mat" ),
                     "input matrix columns",ifelse ( inherits( d, "matrix" ), ncol(d), ncol( as.matrix(d) ) ),
                     "number of bootstraps",reps,
                     "item subsampling proportion",pItem,
                     "feature subsampling proportion",ifelse( is.null(pFeature), 1, pFeature ),
                     "cluster algorithm",clusterAlg,
                     "inner linkage type",innerLinkage,
                     "final linkage type",finalLinkage,
                     "correlation method",distance,
                     "plot",if(is.null(plot)) NA else plot,
                     "seed",if(is.null(seed)) NA else seed))
    colnames(log) = c("option","value")
    if(writeTable){
        write.csv(file=paste(title,"/",title,".log.csv",sep=""), log,row.names=F)
    }
    if(is.null(plot)){
        ##nothing
    }else if(plot=="png"){
        png(paste(title,"/","consensus%03d.png",sep=""))
    }else if (plot=="pdf"){
        pdf(onefile=TRUE, paste(title,"/","consensus.pdf",sep=""))
    }else if (plot=="ps"){
        postscript(onefile=TRUE, paste(title,"/","consensus.ps",sep=""))
    }

    colorList=list()
    colorM = rbind() #matrix of colors.

    #18 colors for marking different clusters
    thisPal <- c("#A6CEE3","#1F78B4","#B2DF8A","#33A02C","#FB9A99","#E31A1C","#FDBF6F","#FF7F00","#CAB2D6","#6A3D9A","#FFFF99","#B15928",
                 "#bd18ea", #magenta
                 "#2ef4ca", #aqua
                 "#f4cced", #pink,
                 "#f4cc03", #lightorange
                 "#05188a", #navy,
                 "#e5a25a", #light brown
                 "#06f106", #bright green
                 "#85848f", #med gray
                 "#000000", #black
                 "#076f25", #dark green
                 "#93cd7f", #lime green
                 "#4d0776", #dark purple
                 "#ffffff" #white
                 )

    ##plot scale
    colBreaks=NA
    if(is.null(tmyPal)==TRUE){
        colBreaks=10
        tmyPal = myPal(colBreaks)
    }else{
        colBreaks=length(tmyPal)
    }
    sc = cbind(seq(0,1,by=1/( colBreaks) )); rownames(sc) = sc[,1]
    sc = cbind(sc,sc)
    heatmap(sc, Colv=NA, Rowv=NA, symm=FALSE, scale='none', col=tmyPal, na.rm=TRUE,labRow=rownames(sc),labCol=F,main="consensus matrix legend")

    for (tk in 2:maxK){
        if(verbose){
            message(paste("consensus ",tk))
        }
        fm = ml[[tk]]
        hc=hclust( as.dist( 1 - fm ), method=finalLinkage);
        message("clustered")
        ct = cutree(hc,tk)
        names(ct) = colnames(d)
        c = fm
        ##colnames(c) = colnames(d)
        ##rownames(c) = colnames(d)

        colorList = setClusterColors(res[[tk-1]][[3]],ct,thisPal,colorList)

        pc = c
        pc=pc[hc$order,] #***pc is matrix for plotting, same as c but is row-ordered and has names and extra row of zeros.
        pc = rbind(pc,0)

        heatmap(pc, Colv=as.dendrogram(hc), Rowv=NA, symm=FALSE, scale='none', col=tmyPal, na.rm=TRUE,labRow=F,labCol=F,mar=c(5,5),main=paste("consensus matrix k=",tk,sep="") , ColSideCol=colorList[[1]])
        legend("topright",legend=unique(ct),fill=unique(colorList[[1]]),horiz=FALSE )

        res[[tk]] = list(consensusMatrix=c,consensusTree=hc,consensusClass=ct,ml=ml[[tk]],clrs=colorList)
        colorM = rbind(colorM,colorList[[1]])
    }
    CDF(ml)
    clusterTrackingPlot(colorM[,res[[length(res)]]$consensusTree$order])
    if(is.null(plot)==FALSE){
        dev.off();
    }
    res[[1]] = colorM
    if(writeTable){
        for(i in 2:length(res)){
            write.csv(file=paste(title,"/",title,".k=",i,".consensusMatrix.csv",sep=""), res[[i]]$consensusMatrix)
            write.table(file=paste(title,"/",title,".k=",i,".consensusClass.csv",sep=""), res[[i]]$consensusClass,col.names = F,sep=",")
        }
    }
    return(res)
}


calcICL = function(res,title="untitled_consensus_cluster",plot=NULL,writeTable=FALSE){
    #calculates and plots cluster consensus and item consensus
    cc=rbind()
    cci = rbind()
    sumRes=list()
    colorsArr=c()

    #make results directory
    if((is.null(plot)==FALSE | writeTable) & !file.exists(paste(title,sep=""))){
        dir.create(paste(title,sep=""))
    }
    if(is.null(plot)){
        #to screen
    }else if(plot=="pdf"){
        pdf(onefile=TRUE, paste(title,"/","icl.pdf",sep=""))
    }else if(plot=="ps"){
        postscript(onefile=TRUE, paste(title,"/","icl.ps",sep=""))
    }else if (plot=="png"){
        png(paste(title,"/","icl%03d.png",sep=""))
    }

    par(mfrow=c(3,1),mar=c(4,3,2,0))

    for (k in 2:length(res)){ #each k
        eiCols = c();
        o = res[[k]]
        m = o$consensusMatrix
        m = triangle(m,mode=2)
        for (ci in sort(unique(o$consensusClass))){ #each cluster in k
            items = which(o$consensusClass==ci)
            nk = length(items)
            mk = sum( m[items,items], na.rm=T)/((nk*(nk-1))/2)
            cc=rbind(cc,c(k,ci,mk)) #cluster-consensus

            for (ei in rev(res[[2]]$consensusTree$order) ){
                denom = if (ei %in% items) { nk - 1} else { nk }
                mei = sum( c(m[ei,items],m[items,ei]), na.rm=T)/denom # mean item consensus to a cluster.
                cci = rbind(cci,c(k,ci,ei,mei)) #cluster, cluster index, item index, item-consensus
            }
            eiCols = c(eiCols, rep(ci,length(o$consensusClass)) )
        }

        cck = cci[which(cci[,1]==k),] #only plot the new k data.

        #group by item, order by cluster i
        w=lapply(split(cck,cck[,3]), function(x) { y=matrix(unlist(x),ncol=4); y[order(y[,2]),4] })
        q = matrix(as.numeric(unlist(w)),ncol=length(w),byrow=F)
        q = q[,res[[2]]$consensusTree$order] #order by leave order of k=2
        #q is a matrix of k rows and sample columns, values are item consensus of sample to the cluster.

        thisColors = unique(cbind(res[[k]]$consensusClass,res[[k]]$clrs[[1]]))
        thisColors=thisColors[order(as.numeric(thisColors[,1])),2]
        colorsArr=c(colorsArr,thisColors)
        sumRes[[k]] = rankedBarPlot(q,thisColors,cc=res[[k]]$consensusClass[res[[2]]$consensusTree$order],paste("k=",k,sep="") )
    }

    ys=cs=lab=c()
    lastk=cc[1,1]
    for(i in 1:length(colorsArr)){
        if(lastk != cc[i,1]){
            ys=c(ys,0,0)
            cs=c(cs,NA,NA)
            lastk=cc[i,1]
            lab=c(lab,NA,NA)
        }
        ys=c(ys,cc[i,3])
        cs=c(cs,colorsArr[i])
        lab=c(lab,cc[i,1])
    }
    names(ys) = lab
    par(mfrow=c(3,1),mar=c(4,3,2,0))
    barplot(ys,col=cs,border=cs,main="cluster-consensus",ylim=c(0,1),las=1)
    if(is.null(plot)==FALSE){
        dev.off()
    }
    colnames(cc) = c("k","cluster","clusterConsensus")
    colnames(cci) = c("k","cluster","item","itemConsensus")
    cci[,"item"] = names(res[[2]]$consensusClass)[ cci[,"item"] ]
    #type cci
    cci = data.frame( k=as.numeric(cci[,"k"]), cluster=as.numeric(cci[,"cluster"]), item=cci[,"item"], itemConsensus=as.numeric(cci[,"itemConsensus"]))

    #write to file.
    if(writeTable){
        write.csv(file=paste(title,"/",title,".summary.cluster.consensus.csv",sep=""),row.names=F, cc)
        write.csv(file=paste(title,"/",title,".summary.item.consensus.csv",sep=""), row.names=F, cci)
    }
    return(list(clusterConsensus=cc,itemConsensus=cci))
}
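
## calcICL is not called by the driver code at the bottom of this script; below is a
## minimal sketch of how it could be applied to the object returned by
## ConsensusClusterPlus (kept inside if (FALSE) so it is never executed):
if ( FALSE ) {
    icl <- calcICL( results, title=title, plot="png", writeTable=FALSE )
    head( icl$clusterConsensus )   # one row per (k, cluster): cluster consensus
    head( icl$itemConsensus )      # one row per (k, cluster, item): item consensus
}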


ccRun <- function( d=d,
                   maxK=NULL,
                   repCount=NULL,
                   diss=inherits( d, "dist" ),
                   pItem=NULL,
                   pFeature=NULL,
                   innerLinkage=NULL,
                   distance=ifelse( inherits(d,"dist"), attr( d, "method" ), "euclidean" ),
                   clusterAlg=NULL,
                   weightsItem=NULL,
                   weightsFeature=NULL,
                   verbose=NULL) {
    m = vector(mode='list', repCount)
    ml = vector(mode="list",maxK)
    n <- ifelse( diss, ncol( as.matrix(d) ), ncol(d) )
    mCount = mConsist = matrix(c(0),ncol=n,nrow=n)
    ml[[1]] = c(0);

    if (is.null( distance ) ) distance <- 'euclidean' ## necessary if d is a dist object and attr( d, "method" ) == NULL
359
|
|
360 require( amap )
|
|
361 ## we're going to use the amap Dist function, but they misname their correlation
|
|
362 ## functions, so re-name them correctly
|
|
363 amap.distance <- c( "euclidean", "maximum", "manhattan", "canberra", "binary",
|
|
364 "pearson", "abspearson", "correlation", "abscorrelation", "spearman", "kendall" )
|
|
365 names( amap.distance ) <- c( "euclidean", "maximum", "manhattan", "canberra", "binary",
|
|
366 "cosine", "abscosine", "pearson", "abspearson", "spearman", "kendall" )
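    ## Note: the *names* of amap.distance are the metric names this script accepts
    ## (e.g. -m cosine, -m pearson), while the *values* are the method names passed to
    ## amap::Dist(); e.g. a user-specified "pearson" is mapped to amap's "correlation".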
    main.dist.obj <- NULL
    ##browser()
    if ( diss ){
        main.dist.obj <- d

        ## reset the pFeature & weightsFeature params if they've been set (irrelevant if d is a dist matrix)
        if ( ( !is.null(pFeature) ) &&
             ( pFeature < 1 ) ) {
            if (verbose) warning( "user-supplied data is a distance matrix; ignoring user-specified pFeature parameter\n" )
            pFeature <- 1 # set it to 1 to avoid problems with sampleCols
        }
        if ( ! is.null( weightsFeature ) ) {
            if (verbose) warning( "user-supplied data is a distance matrix; ignoring user-specified weightsFeature parameter\n" )
            weightsFeature <- NULL # set it to NULL to avoid problems with sampleCols
        }
    } else { ## d is a data matrix
        ## we're not sampling over the features
        if ( ( clusterAlg != "km" ) &&
             ( is.null( pFeature ) ||
               ( ( pFeature == 1 ) && is.null( weightsFeature ) ) ) ) {
            ## only generate a main.dist.object IFF 1) d is a matrix, 2) we're not sampling the features, and 3) the algorithm isn't 'km'
            if ( inherits( distance, "character" ) ) {
                if ( ! distance %in% names( amap.distance ) ) stop("unsupported distance.")

                main.dist.obj <- Dist( t(d), method=as.character( amap.distance[ distance ] ) )
                ## now fix dumb amap naming convention for distance metrics
                attr( main.dist.obj, "method" ) <- as.character( amap.distance[ distance ] )
            } else stop("unsupported distance specified.")
        } else {
            ## pFeature < 1 or a weightsFeature != NULL
            ## since d is a data matrix, the user wants to sample over the gene features, so main.dist.obj is left as NULL
        }
    }


    for (i in 1:repCount){
        ##browser()
        if(verbose){
            message(paste("random subsample",i));
        }
        ## take expression matrix sample, samples and genes
        sample_x = sampleCols( d, pItem, pFeature, weightsItem, weightsFeature )

        this_dist = NA
        if ( ! is.null( main.dist.obj ) ) {
            boot.cols <- sample_x$subcols
            this_dist <- as.matrix( main.dist.obj )[ boot.cols, boot.cols ]
            if ( clusterAlg != "km" ) {
                ## if this isn't kmeans, then convert to a distance object
                this_dist <- as.dist( this_dist )
                attr( this_dist, "method" ) <- attr( main.dist.obj, "method" )
            }
        } else {
            ## if main.dist.obj is NULL, then d is a data matrix, and either:
            ## 1) clusterAlg is 'km'
            ## 2) pFeatures < 1 or weightsFeatures have been specified, or
            ## 3) both
            ## so we can't use a main distance object and for every iteration, we will have to re-calculate either
            ## 1) the distance matrix (because we're also sampling the features as well), or
            ## 2) the submat (if using km)

            if ( clusterAlg != "km" ) {
                if ( ! distance %in% names( amap.distance ) ) stop("unsupported distance.")
                ## good, we have a supported distance type
                this_dist <- Dist( t( sample_x$submat ), method=as.character( amap.distance[ distance ] ) )
                ## now fix dumb amap naming convention for distance metrics
                attr( this_dist, "method" ) <- as.character( amap.distance[ distance ] )
            } else {
                ##browser()
                ##clusterAlg == "km"
                ## if we're not sampling the features, then grab the colslice
                if ( is.null( pFeature ) ||
                     ( ( pFeature == 1 ) && is.null( weightsFeature ) ) ) {
                    this_dist <- d[, sample_x$subcols ]
                } else {
                    if ( all( is.na( sample_x$submat ) ) ) {
                        save( sample_x, file="ccrun.submat.eq.na.dbg.rda" )
                        stop( "Houston, we have a problem. sample_x$submat is NA in ccRun when it should be specified - saving state\n" )
                    }

                    this_dist <- sample_x$submat
                }
            }
        }

        ## cluster samples for HC.
        this_cluster=NA
        if(clusterAlg=="hc"){
            this_cluster = hclust( this_dist, method=innerLinkage)
        }
        ##browser()
        ##mCount is possible number of times that two sample occur in same random sample, independent of k
        ##mCount stores number of times a sample pair was sampled together.
        mCount <- connectivityMatrix( rep( 1,length(sample_x[[3]])),
                                      mCount,
                                      sample_x[[3]] )

        ##use samples for each k
        for (k in 2:maxK){
            if(verbose){
                message(paste(" k =",k))
            }
            if (i==1){
                ml[[k]] = mConsist #initialize
            }
            this_assignment=NA
            if(clusterAlg=="hc"){
                ##prune to k for hc
                this_assignment = cutree(this_cluster,k)
                ##browser()
            }else if(clusterAlg=="km"){
                ##this_dist should now be a matrix corresponding to the result from sampleCols
                this_assignment <- kmeans( t( this_dist ),
                                           k,
                                           iter.max = 10,
                                           nstart = 1,
                                           algorithm = c("Hartigan-Wong") )$cluster
            }else if ( clusterAlg == "pam" ) {
                require( cluster )
                this_assignment <- pam( x=this_dist,
                                        k,
                                        diss=TRUE,
                                        metric=distance,
                                        cluster.only=TRUE )
            } else{
                ##optional clusterAlg hook.
                this_assignment <- get(clusterAlg)(this_dist, k)
            }
            ##add to tally
            ml[[k]] <- connectivityMatrix( this_assignment,
                                           ml[[k]],
                                           sample_x[[3]] )
        }
    }


    ##consensus fraction
    res = vector(mode="list",maxK)
    for (k in 2:maxK){
        ##fill in other half of matrix for tally and count.
        tmp = triangle(ml[[k]],mode=3)
        tmpCount = triangle(mCount,mode=3)
        res[[k]] = tmp / tmpCount
        res[[k]][which(tmpCount==0)] = 0
    }
    message("end fraction")
    return(res)
}


connectivityMatrix <- function( clusterAssignments, m, sampleKey){
    ##input: named vector of cluster assignments, matrix to add connectivities
    ##output: connectivity matrix
    names( clusterAssignments ) <- sampleKey
    cls <- lapply( unique( clusterAssignments ), function(i) as.numeric( names( clusterAssignments[ clusterAssignments %in% i ] ) ) )

    for ( i in 1:length( cls ) ) {
        nelts <- 1:ncol( m )
        cl <- as.numeric( nelts %in% cls[[i]] ) ## produces a binary vector
        updt <- outer( cl, cl )
        m <- m + updt
    }
    return(m)
}
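
## Small worked example of the connectivity update above (hypothetical toy values;
## wrapped in if (FALSE) so it never runs):
if ( FALSE ) {
    ## 4 samples in total; samples 1, 2 and 4 were drawn in this subsample and were
    ## assigned to clusters 1, 1 and 2 respectively
    m0 <- matrix( 0, ncol=4, nrow=4 )
    connectivityMatrix( c( 1, 1, 2 ), m0, c( 1, 2, 4 ) )
    ## only entries (1,2)/(2,1) plus the diagonal entries of the sampled columns
    ## (1,1), (2,2) and (4,4) are incremented, i.e. co-clustered sampled pairs
}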

## returns a list with the sample columns, as well as the sub-matrix & sample features (if necessary)
## if no sampling over the features is performed, the submatrix & sample features are returned as NAs
## to reduce memory overhead
sampleCols <- function( d,
                        pSamp=NULL,
                        pRow=NULL,
                        weightsItem=NULL,
                        weightsFeature=NULL ){
    space <- ifelse( inherits( d, "dist" ), ncol( as.matrix(d) ), ncol(d) )
    sampleN <- floor(space*pSamp)
    sampCols <- sort( sample(space, sampleN, replace = FALSE, prob = weightsItem) )

    this_sample <- sampRows <- NA
    if ( inherits( d, "matrix" ) ) {
        if ( (! is.null( pRow ) ) &&
             ( (pRow < 1 ) || (! is.null( weightsFeature ) ) ) ) {
            ## only sample the rows and generate a sub-matrix if we're sampling over the row/gene/features
            space = nrow(d)
            sampleN = floor(space*pRow)
            sampRows = sort( sample(space, sampleN, replace = FALSE, prob = weightsFeature) )
            this_sample <- d[sampRows,sampCols]
            dimnames(this_sample) <- NULL
        } else {
            ## do nothing
        }
    }
    return( list( submat=this_sample,
                  subrows=sampRows,
                  subcols=sampCols ) )
}

CDF=function(ml,breaks=100){
    #plot CDF distribution
    plot(c(0),xlim=c(0,1),ylim=c(0,1),col="white",bg="white",xlab="consensus index",ylab="CDF",main="consensus CDF", las=2)
    k=length(ml)
    this_colors = rainbow(k-1)
    areaK = c()
    for (i in 2:length(ml)){
        v=triangle(ml[[i]],mode=1)

        #empirical CDF distribution. default number of breaks is 100
        h = hist(v, plot=FALSE, breaks=seq(0,1,by=1/breaks))
        h$counts = cumsum(h$counts)/sum(h$counts)

        #calculate area under CDF curve, by histogram method.
        thisArea=0
        for (bi in 1:(length(h$breaks)-1)){
            thisArea = thisArea + h$counts[bi]*(h$breaks[bi+1]-h$breaks[bi]) #increment by height by width
            bi = bi + 1
        }
        areaK = c(areaK,thisArea)
        lines(h$mids,h$counts,col=this_colors[i-1],lwd=2,type='l')
    }
    legend(0.8,0.5,legend=paste(rep("",k-1),seq(2,k,by=1),sep=""),fill=this_colors)

    #plot area under CDF change.
    deltaK=areaK[1] #initial auc at k=2
    for(i in 2:(length(areaK))){
        #proportional increase relative to prior K.
        deltaK = c(deltaK,( areaK[i] - areaK[i-1])/areaK[i-1])
    }
    plot(1+(1:length(deltaK)),y=deltaK,xlab="k",ylab="relative change in area under CDF curve",main="Delta area",type="b")
}
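
## Interpretation note: areaK[i] approximates the area under the consensus CDF for
## k = i + 1, and deltaK[i] is the relative gain (areaK[i] - areaK[i-1]) / areaK[i-1]
## (with deltaK[1] set to areaK[1] itself); k is usually chosen where this gain
## flattens out in the "Delta area" plot.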


myPal = function(n=10){
    #returns n colors
    seq = rev(seq(0,255,by=255/(n)))
    palRGB = cbind(seq,seq,255)
    rgb(palRGB,maxColorValue=255)
}

setClusterColors = function(past_ct,ct,colorU,colorList){
    #description: sets common color of clusters between different K
    newColors = c()
    if(length(colorList)==0){
        #k==2
        newColors = colorU[ct]
        colori=2
    }else{
        newColors = rep(NULL,length(ct))
        colori = colorList[[2]]
        mo=table(past_ct,ct)
        m=mo/apply(mo,1,sum)
        for(tci in 1:ncol(m)){ # for each cluster
            maxC = max(m[,tci])
            pci = which(m[,tci] == maxC)
            if( sum(m[,tci]==maxC)==1 & max(m[pci,])==maxC & sum(m[pci,]==maxC)==1 ) {
                #if new column maximum is unique, same cell is row maximum and is also unique
                ##Note: the greatest of the prior clusters' members are the greatest in a current cluster's members.
                newColors[which(ct==tci)] = unique(colorList[[1]][which(past_ct==pci)]) # one value
            }else{ #add new color.
                colori=colori+1
                newColors[which(ct==tci)] = colorU[colori]
            }
        }
    }
    return(list(newColors,colori,unique(newColors) ))
}

clusterTrackingPlot = function(m){
    #description: plots cluster tracking plot
    #input: m - matrix where rows are k, columns are samples, and values are cluster assignments.
    plot(NULL,xlim=c(-0.1,1),ylim=c(0,1),axes=FALSE,xlab="samples",ylab="k",main="tracking plot")
    for(i in 1:nrow(m)){
        rect( xleft=seq(0,1-1/ncol(m),by=1/ncol(m)), ybottom=rep(1-i/nrow(m),ncol(m)) , xright=seq(1/ncol(m),1,by=1/ncol(m)), ytop=rep(1-(i-1)/nrow(m),ncol(m)), col=m[i,],border=NA)
    }
    #hatch lines to indicate samples
    xl = seq(0,1-1/ncol(m),by=1/ncol(m))
    segments( xl, rep(-0.1,ncol(m)) , xl, rep(0,ncol(m)), col="black") #** alt white and black color?
    ypos = seq(1,0,by=-1/nrow(m))-1/(2*nrow(m))
    text(x=-0.1,y=ypos[-length(ypos)],labels=seq(2,nrow(m)+1,by=1))
}

triangle = function(m,mode=1){
    #mode=1 for CDF, vector of lower triangle.
    #mode==3 for full matrix.
    #mode==2 for calcICL; nonredundant half matrix count
    #mode!=1 for summary
    n=dim(m)[1]
    nm = matrix(0,ncol=n,nrow=n)
    fm = m


    nm[upper.tri(nm)] = m[upper.tri(m)] #only upper half

    fm = t(nm)+nm
    diag(fm) = diag(m)

    nm=fm
    nm[upper.tri(nm)] = NA
    diag(nm) = NA
    vm = m[lower.tri(nm)]

    if(mode==1){
        return(vm) #vector
    }else if(mode==3){
        return(fm) #return full matrix
    }else if(mode == 2){
        return(nm) #returns lower triangle and no diagonal. no double counts.
    }

}
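
## Quick illustration of the three triangle() modes (hypothetical 3x3 input; wrapped
## in if (FALSE) so it never runs):
if ( FALSE ) {
    x <- matrix( 1:9, ncol=3 )
    triangle( x, mode=1 )   # vector of the input's lower-triangle values
    triangle( x, mode=2 )   # symmetrised matrix with upper half and diagonal set to NA
    triangle( x, mode=3 )   # full symmetrised matrix built from the upper triangle
}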


rankedBarPlot=function(d,myc,cc,title){
    colors = rbind() #each row is a barplot series
    byRank = cbind()

    spaceh = 0.1 #space between bars
    for(i in 1:ncol(d)){
        byRank = cbind(byRank,sort(d[,i],na.last=F))
        colors = rbind(colors,order(d[,i],na.last=F))
    }
    maxH = max(c(1.5,apply(byRank,2,sum)),na.rm=T) #maximum height of graph

    #barplot largest to smallest so that smallest is in front.
    barp = barplot( apply(byRank,2,sum) , col=myc[colors[,1]] ,space=spaceh,ylim=c(0,maxH),main=paste("item-consensus", title),border=NA,las=1 )
    for(i in 2:nrow(byRank)){
        barplot( apply(matrix(byRank[i:nrow(byRank),],ncol=ncol(byRank)) ,2,sum), space=spaceh,col=myc[colors[,i]],ylim=c(0,maxH), add=T,border=NA,las=1 )
    }
    xr=seq(spaceh,ncol(d)+ncol(d)*spaceh,(ncol(d)+ncol(d)*spaceh)/ncol(d) )
    #class labels as asterisks
    text("*",x=xr+0.5,y=maxH,col=myc[cc],cex=1.4) #rect(xr,1.4,xr+1,1.5,col=myc[cc] )
}

########################################################################
## End of the code borrowed from ConsensusClusterPlus; the script's own
## driver code resumes here.
702
|
|
703 spec <- matrix( c( "data.fname", "d", 1, "character",
|
|
704 "direction", "n", 2, "character",
|
|
705 "output.name", "o", 2, "character",
|
|
706 "cluster.alg", "a", 2, "character",
|
|
707 "distance.metric", "m", 2, "character",
|
|
708 "max.k", "k", 2, "integer",
|
|
709 "reps", "r", 2, "integer",
|
|
710 "innerLinkage", "i", 1, "character",
|
|
711 "finalLinkage", "f", 1, "character",
|
|
712 "out.report.dir", "p", 2, "character",
|
|
713 "out.report.html", "h", 2, "character"
|
|
714 ),
|
|
715 nc=4,
|
|
716 byrow=TRUE
|
|
717 )
|
|
718
|
|
719 opt <- getopt( spec=spec )
|
|
720
|
|
721 ## default params for non-required params
|
|
722 if ( is.null( opt$direction ) ) { opt$direction <- "cols" }
|
|
723 if ( is.null( opt$cluster.alg ) ) { opt$cluster.alg <- "pam" }
|
|
724 if ( is.null( opt$output.name ) ) { opt$output.name <- "consensus.cluster.result" }
|
|
725 if ( is.null( opt$distance.metric ) ) { opt$distance.metric <- "cosine" }
|
|
726 if ( is.null( opt$max.k ) ) { opt$max.k <- 10 }
|
|
727 if ( is.null( opt$reps ) ) { opt$reps <- 1000 }
|
|
728 if ( is.null( opt$innerLinkage ) ) { opt$innerLinkage <- "average" }
|
|
729 if ( is.null( opt$finalLinkage ) ) { opt$finalLinkage <- "average" }
|
|
730
|
|
731 if ( is.null( opt$out.report.dir ) ) { opt$out.report.dir <- "report" }
|
|
732 if ( is.null( opt$out.report.html ) ) { opt$out.report.html <- file.path( "report", "index.html" ) }
|
|
733
|
|
734 ## validate params here (make sure set to valid values)
|
|
735 if ( !opt$cluster.alg %in% c( "hc", "km", "pam" ) ) {
|
|
    stop( "invalid clustering algorithm specified: ", opt$cluster.alg )
}


data <- as.matrix( read.delim( opt$data.fname, header=T, row.names=1 , check.names=FALSE ) )
## transpose the matrix if we want to cluster the rows (genes)
if ( opt$direction == "rows" ) {
    data <- t( data )
}

title <- paste( opt$cluster.alg, opt$output.name, sep="." )
results <- ConsensusClusterPlus( data,
                                 maxK=opt$max.k,
                                 reps=opt$reps,
                                 pItem=0.8,
                                 pFeature=1,
                                 title=opt$out.report.dir,
                                 clusterAlg=opt$cluster.alg,
                                 distance=opt$distance.metric,
                                 innerLinkage=opt$innerLinkage,
                                 finalLinkage=opt$finalLinkage,
                                 plot='png',
                                 writeTable=FALSE,
                                 seed=100,
                                 ##weightsFeature=abs( rnorm( nrow( orig.data ) ) ),
                                 verbose=FALSE )

pngs = list.files(path=opt$out.report.dir, patt="png")
html.out <- paste( "<html>",
                   paste( paste( "<div><img src=\'", pngs, sep="" ), "\'/></div>", sep="" ),
                   "</html>" )
cat( html.out, file=opt$out.report.html )


## re-transpose the matrix back if we've clustered the rows (genes)
if ( opt$direction == "rows" ) {
    data <- t( data )
}
save( file=opt$output.name, data, results)