cluster.tools/consensus.clustering.R @ 0:0decf3fd54bc (draft)

commit: Uploaded
author: peter-waltman
date:   Thu, 28 Feb 2013 01:45:39 -0500
#!/usr/bin/env Rscript
## Consensus Clustering Script by Peter Waltman
## May 31, 2011
## License under Creative Commons Attribution 3.0 Unported (CC BY 3.0)
##
## usage, options and documentation go here
argspec <- c("consensus.clustering.R takes a data matrix (or a pre-computed distance matrix) and runs
consensus clustering on it via ConsensusClusterPlus, saving the clustering results to an R data file
and generating an HTML report of the consensus plots.

        Usage:
                consensus.clustering.R -d <data.file>
        Optional:
                -o <output.name>
                -a <cluster.alg>          ## must be 'hc', 'km' or 'pam'
                -m <distance.metric>      ## must be one supported by ConsensusClusterPlus
                -k <max.k>
                -r <reps>
                -n <direction>            ## 'cols' (default) or 'rows'
                -i <innerLinkage>
                -f <finalLinkage>
                -p <out.report.dir>
                -h <out.report.html>

\n\n")
args <- commandArgs(TRUE)
if ( length( args ) == 1 && args == "--help" ) {
  write(argspec, stderr())
  q();
}
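
## Example invocation (file names and option values here are illustrative only; see the
## getopt spec at the bottom of this script for the full flag list):
##   Rscript consensus.clustering.R -d expression.tab -a pam -m pearson -k 6 -r 250 \
##       -o my.consensus.result -p report -h report/index.html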

lib.load.quiet <- function( package ) {
  package <- as.character(substitute(package))
  suppressPackageStartupMessages( do.call( "library", list( package=package ) ) )
}
lib.load.quiet( getopt )
lib.load.quiet( gplots )
lib.load.quiet( amap )
## if either of the faster hierarchical-clustering packages is available on this system, load it
if ( any( c( 'flashClust', 'fastcluster' ) %in% installed.packages() ) ) {
  if ( 'flashClust' %in% installed.packages() ) {
    lib.load.quiet( flashClust )
  } else if ( 'fastcluster' %in% installed.packages() ) {
    lib.load.quiet( fastcluster )
  }
}
##lib.load.quiet(ConsensusClusterPlus)
lib.load.quiet( cluster )

###################
## code borrowed/updated from ConsensusClusterPlus
###################

ConsensusClusterPlus <- function( d=NULL,
                                  maxK=3,
                                  reps=10,
                                  pItem=0.8,
                                  pFeature=1,
                                  clusterAlg="hc",
                                  title="untitled_consensus_cluster",
                                  innerLinkage="average",
                                  finalLinkage="average",
                                  distance=ifelse( inherits(d,"dist"), attr( d, "method" ), "euclidean" ),
                                  ml=NULL,
                                  tmyPal=NULL,
                                  seed=NULL,
                                  plot=NULL,
                                  writeTable=FALSE,
                                  weightsItem=NULL,
                                  weightsFeature=NULL,
                                  verbose=F ) {
  ## description: runs consensus subsamples

  if ( is.null(seed) ) {
    seed <- timeSeed <- as.numeric(Sys.time())
  }
  set.seed(seed)

  if ( is.null(ml) ) {

    if ( inherits( distance, "dist" ) ) {
      stop( "If you want to pass in a pre-calculated distance object, pass it in as the data, rather than the distance parameter\n" )
    }

    if ( ! class( d ) %in% c( "dist", "matrix", "ExpressionSet" ) ) {
      stop("d must be a matrix, distance object or ExpressionSet (eset object)")
    }

    if ( inherits( d, "dist" ) ) {
      ## if d is a distance matrix, fix a few things so that they don't cause problems with the analysis
      ## Note, the assumption is that if d is a distance matrix, the user doesn't want to sample over the row features
      if ( is.null( attr( d, "method" ) ) ) {
        attr( d, "method" ) <- distance <- "unknown - user-specified"
      }
      if ( is.null( distance ) || ( distance != attr( d, "method" ) ) ) {
        distance <- attr( d, "method" )
      }

      if ( ( ! is.null( pFeature ) ) && ( pFeature < 1 ) ) {
        if ( verbose ) warning( "Cannot use the pFeature parameter when specifying a distance matrix as the data object\n" )
        pFeature <- 1
      }
      if ( ! is.null( weightsFeature ) ) {
        if ( verbose ) warning( "Cannot use the weightsFeature parameter when specifying a distance matrix as the data object\n" )
        weightsFeature <- NULL
      }
      if ( clusterAlg == "km" ) {
        if ( verbose ) warning( "You are asking CCPLUS to use k-means to cluster a distance matrix (rather than the data itself) - this may produce unintended results. We suggest using PAM if you want to use alternate distance metrics/objects\n" )
        ##d <- as.matrix( d ) #this is now done w/in ccRun
      }
    } else {
      if ( is.null( distance ) ) {
        ## we should never get here, but just in case
        distance <- "pearson"
      }
    }

    if ( ( clusterAlg == "km" ) && inherits( distance, "character" ) && ( distance != "euclidean" ) ) {
      warning( "WARNING: kmeans can only use the euclidean distance metric. If you would like to use an alternate metric, we suggest using PAM or HC clustering instead. This parameter combination will use k-means, but will NOT use the specified distance metric\n" )
      distance <- 'euclidean'
    }


    if ( inherits( d, "ExpressionSet" ) ) {
      d <- exprs(d)
    }

    ml <- ccRun( d=d,
                 maxK=maxK,
                 repCount=reps,
                 diss=inherits(d,"dist"),
                 pItem=pItem,
                 pFeature=pFeature,
                 innerLinkage=innerLinkage,
                 clusterAlg=clusterAlg,
                 weightsFeature=weightsFeature,
                 weightsItem=weightsItem,
                 distance=distance,
                 verbose=verbose)
  }
  res=list();

  ##make results directory
  if((is.null(plot)==FALSE | writeTable) & !file.exists(paste(title,sep=""))){
    dir.create(paste(title,sep=""))
  }

  ##write log file
  log <- matrix( ncol=2,
                 byrow=T,
                 c("title",title,
                   "maxK",maxK,
                   "input matrix rows",ifelse ( inherits( d, "matrix" ), nrow(d), "dist-mat" ),
                   "input matrix columns",ifelse ( inherits( d, "matrix" ), ncol(d), ncol( as.matrix(d) ) ),
                   "number of bootstraps",reps,
                   "item subsampling proportion",pItem,
                   "feature subsampling proportion",ifelse( is.null(pFeature), 1, pFeature ),
                   "cluster algorithm",clusterAlg,
                   "inner linkage type",innerLinkage,
                   "final linkage type",finalLinkage,
                   "correlation method",distance,
                   "plot",if(is.null(plot)) NA else plot,
                   "seed",if(is.null(seed)) NA else seed))
  colnames(log) = c("option","value")
  if(writeTable){
    write.csv(file=paste(title,"/",title,".log.csv",sep=""), log,row.names=F)
  }
  if(is.null(plot)){
    ##nothing - plot to screen
  }else if(plot=="png"){
    png(paste(title,"/","consensus%03d.png",sep=""))
  }else if (plot=="pdf"){
    pdf(onefile=TRUE, paste(title,"/","consensus.pdf",sep=""))
  }else if (plot=="ps"){
    postscript(onefile=TRUE, paste(title,"/","consensus.ps",sep=""))
  }

  colorList=list()
  colorM = rbind() #matrix of colors.

  ## palette of colors for marking the different clusters
  thisPal <- c("#A6CEE3","#1F78B4","#B2DF8A","#33A02C","#FB9A99","#E31A1C","#FDBF6F","#FF7F00","#CAB2D6","#6A3D9A","#FFFF99","#B15928",
               "#bd18ea", #magenta
               "#2ef4ca", #aqua
               "#f4cced", #pink
               "#f4cc03", #light orange
               "#05188a", #navy
               "#e5a25a", #light brown
               "#06f106", #bright green
               "#85848f", #med gray
               "#000000", #black
               "#076f25", #dark green
               "#93cd7f", #lime green
               "#4d0776", #dark purple
               "#ffffff"  #white
               )

  ##plot scale
  colBreaks=NA
  if(is.null(tmyPal)){
    colBreaks=10
    tmyPal = myPal(colBreaks)
  }else{
    colBreaks=length(tmyPal)
  }
  sc = cbind(seq(0,1,by=1/( colBreaks) )); rownames(sc) = sc[,1]
  sc = cbind(sc,sc)
  heatmap(sc, Colv=NA, Rowv=NA, symm=FALSE, scale='none', col=tmyPal, na.rm=TRUE,labRow=rownames(sc),labCol=F,main="consensus matrix legend")

  for (tk in 2:maxK){
    if(verbose){
      message(paste("consensus ",tk))
    }
    fm = ml[[tk]]
    hc=hclust( as.dist( 1 - fm ), method=finalLinkage);
    message("clustered")
    ct = cutree(hc,tk)
    names(ct) = colnames(d)
    c = fm
    ##colnames(c) = colnames(d)
    ##rownames(c) = colnames(d)

    colorList = setClusterColors(res[[tk-1]][[3]],ct,thisPal,colorList)

    pc = c
    pc=pc[hc$order,] #***pc is the matrix for plotting: same as c, but row-ordered and with an extra row of zeros appended below.
    pc = rbind(pc,0)

    heatmap(pc, Colv=as.dendrogram(hc), Rowv=NA, symm=FALSE, scale='none', col=tmyPal, na.rm=TRUE,labRow=F,labCol=F,mar=c(5,5),main=paste("consensus matrix k=",tk,sep="") , ColSideCol=colorList[[1]])
    legend("topright",legend=unique(ct),fill=unique(colorList[[1]]),horiz=FALSE )

    res[[tk]] = list(consensusMatrix=c,consensusTree=hc,consensusClass=ct,ml=ml[[tk]],clrs=colorList)
    colorM = rbind(colorM,colorList[[1]])
  }
  CDF(ml)
  clusterTrackingPlot(colorM[,res[[length(res)]]$consensusTree$order])
  if(is.null(plot)==FALSE){
    dev.off();
  }
  res[[1]] = colorM
  if(writeTable){
    for(i in 2:length(res)){
      write.csv(file=paste(title,"/",title,".k=",i,".consensusMatrix.csv",sep=""), res[[i]]$consensusMatrix)
      write.table(file=paste(title,"/",title,".k=",i,".consensusClass.csv",sep=""), res[[i]]$consensusClass,col.names = F,sep=",")
    }
  }
  return(res)
}


calcICL = function(res,title="untitled_consensus_cluster",plot=NULL,writeTable=FALSE){
  #calculates and plots cluster consensus and item consensus
  cc=rbind()
  cci = rbind()
  sumRes=list()
  colorsArr=c()

  #make results directory
  if((is.null(plot)==FALSE | writeTable) & !file.exists(paste(title,sep=""))){
    dir.create(paste(title,sep=""))
  }
  if(is.null(plot)){
    #to screen
  }else if(plot=="pdf"){
    pdf(onefile=TRUE, paste(title,"/","icl.pdf",sep=""))
  }else if(plot=="ps"){
    postscript(onefile=TRUE, paste(title,"/","icl.ps",sep=""))
  }else if (plot=="png"){
    png(paste(title,"/","icl%03d.png",sep=""))
  }

  par(mfrow=c(3,1),mar=c(4,3,2,0))

  for (k in 2:length(res)){ #each k
    eiCols = c();
    o = res[[k]]
    m = o$consensusMatrix
    m = triangle(m,mode=2)
    for (ci in sort(unique(o$consensusClass))){ #each cluster in k
      items = which(o$consensusClass==ci)
      nk = length(items)
      mk = sum( m[items,items], na.rm=T)/((nk*(nk-1))/2)
      cc=rbind(cc,c(k,ci,mk)) #cluster-consensus

      for (ei in rev(res[[2]]$consensusTree$order) ){
        denom = if (ei %in% items) { nk - 1 } else { nk }
        mei = sum( c(m[ei,items],m[items,ei]), na.rm=T)/denom # mean item consensus to a cluster.
        cci = rbind(cci,c(k,ci,ei,mei)) #k, cluster index, item index, item-consensus
      }
      eiCols = c(eiCols, rep(ci,length(o$consensusClass)) )
    }

    cck = cci[which(cci[,1]==k),] #only plot the new k data.

    #group by item, order by cluster i
    w=lapply(split(cck,cck[,3]), function(x) { y=matrix(unlist(x),ncol=4); y[order(y[,2]),4] })
    q = matrix(as.numeric(unlist(w)),ncol=length(w),byrow=F)
    q = q[,res[[2]]$consensusTree$order] #order by leaf order of k=2
    #q is a matrix of k rows and sample columns; values are the item consensus of each sample to the cluster.

    thisColors = unique(cbind(res[[k]]$consensusClass,res[[k]]$clrs[[1]]))
    thisColors=thisColors[order(as.numeric(thisColors[,1])),2]
    colorsArr=c(colorsArr,thisColors)
    sumRes[[k]] = rankedBarPlot(q,thisColors,cc=res[[k]]$consensusClass[res[[2]]$consensusTree$order],paste("k=",k,sep="") )
  }

  ys=cs=lab=c()
  lastk=cc[1,1]
  for(i in 1:length(colorsArr)){
    if(lastk != cc[i,1]){
      ys=c(ys,0,0)
      cs=c(cs,NA,NA)
      lastk=cc[i,1]
      lab=c(lab,NA,NA)
    }
    ys=c(ys,cc[i,3])
    cs=c(cs,colorsArr[i])
    lab=c(lab,cc[i,1])
  }
  names(ys) = lab
  par(mfrow=c(3,1),mar=c(4,3,2,0))
  barplot(ys,col=cs,border=cs,main="cluster-consensus",ylim=c(0,1),las=1)
  if(is.null(plot)==FALSE){
    dev.off()
  }
  colnames(cc) = c("k","cluster","clusterConsensus")
  colnames(cci) = c("k","cluster","item","itemConsensus")
  cci[,"item"] = names(res[[2]]$consensusClass)[ cci[,"item"] ]
  #convert cci to a typed data.frame
  cci = data.frame( k=as.numeric(cci[,"k"]), cluster=as.numeric(cci[,"cluster"]), item=cci[,"item"], itemConsensus=as.numeric(cci[,"itemConsensus"]))

  #write to file.
  if(writeTable){
    write.csv(file=paste(title,"/",title,".summary.cluster.consensus.csv",sep=""),row.names=F, cc)
    write.csv(file=paste(title,"/",title,".summary.item.consensus.csv",sep=""), row.names=F, cci)
  }
  return(list(clusterConsensus=cc,itemConsensus=cci))
}


ccRun <- function( d=d,
                   maxK=NULL,
                   repCount=NULL,
                   diss=inherits( d, "dist" ),
                   pItem=NULL,
                   pFeature=NULL,
                   innerLinkage=NULL,
                   distance=ifelse( inherits(d,"dist"), attr( d, "method" ), "euclidean" ),
                   clusterAlg=NULL,
                   weightsItem=NULL,
                   weightsFeature=NULL,
                   verbose=NULL) {
  m = vector(mode='list', repCount)
  ml = vector(mode="list",maxK)
  n <- ifelse( diss, ncol( as.matrix(d) ), ncol(d) )
  mCount = mConsist = matrix(c(0),ncol=n,nrow=n)
  ml[[1]] = c(0);

  if (is.null( distance ) ) distance <- 'euclidean' ## necessary if d is a dist object and attr( d, "method" ) is NULL

  require( amap )
  ## we're going to use the amap Dist function, but it misnames its correlation
  ## methods, so map the user-facing metric names to amap's method names
  amap.distance <- c( "euclidean", "maximum", "manhattan", "canberra", "binary",
                      "pearson", "abspearson", "correlation", "abscorrelation", "spearman", "kendall" )
  names( amap.distance ) <- c( "euclidean", "maximum", "manhattan", "canberra", "binary",
                               "cosine", "abscosine", "pearson", "abspearson", "spearman", "kendall" )
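  ## e.g. a user-requested "pearson" (centered correlation) is passed to amap::Dist as
  ## method="correlation", while a requested "cosine" maps to amap's (uncentered) "pearson":
  ##   as.character( amap.distance[ "pearson" ] )  ## "correlation"
  ##   as.character( amap.distance[ "cosine" ] )   ## "pearson"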
  main.dist.obj <- NULL
  ##browser()
  if ( diss ){
    main.dist.obj <- d

    ## reset the pFeature & weightsFeature params if they've been set (irrelevant if d is a dist matrix)
    if ( ( !is.null(pFeature) ) &&
         ( pFeature < 1 ) ) {
      if (verbose) warning( "user-supplied data is a distance matrix; ignoring user-specified pFeature parameter\n" )
      pFeature <- 1 # set it to 1 to avoid problems with sampleCols
    }
    if ( ! is.null( weightsFeature ) ) {
      if (verbose) warning( "user-supplied data is a distance matrix; ignoring user-specified weightsFeature parameter\n" )
      weightsFeature <- NULL # set it to NULL to avoid problems with sampleCols
    }
  } else { ## d is a data matrix
    ## we're not sampling over the features
    if ( ( clusterAlg != "km" ) &&
         ( is.null( pFeature ) ||
           ( ( pFeature == 1 ) && is.null( weightsFeature ) ) ) ) {
      ## only generate a main.dist.object IFF 1) d is a matrix, 2) we're not sampling the features, and 3) the algorithm isn't 'km'
      if ( inherits( distance, "character" ) ) {
        if ( ! distance %in% names( amap.distance ) ) stop("unsupported distance.")

        main.dist.obj <- Dist( t(d), method=as.character( amap.distance[ distance ] ) )
        ## now fix dumb amap naming convention for distance metrics
        attr( main.dist.obj, "method" ) <- as.character( amap.distance[ distance ] )
      } else stop("unsupported distance specified.")
    } else {
      ## pFeature < 1 or a weightsFeature != NULL
      ## since d is a data matrix, the user wants to sample over the gene features, so main.dist.obj is left as NULL
    }
  }


  for (i in 1:repCount){
    ##browser()
    if(verbose){
      message(paste("random subsample",i));
    }
    ## take expression matrix sample, samples and genes
    sample_x = sampleCols( d, pItem, pFeature, weightsItem, weightsFeature )

    this_dist = NA
    if ( ! is.null( main.dist.obj ) ) {
      boot.cols <- sample_x$subcols
      this_dist <- as.matrix( main.dist.obj )[ boot.cols, boot.cols ]
      if ( clusterAlg != "km" ) {
        ## if this isn't kmeans, then convert to a distance object
        this_dist <- as.dist( this_dist )
        attr( this_dist, "method" ) <- attr( main.dist.obj, "method" )
      }
    } else {
      ## if main.dist.obj is NULL, then d is a data matrix, and either:
      ##   1) clusterAlg is 'km',
      ##   2) pFeature < 1 or weightsFeature has been specified, or
      ##   3) both
      ## so we can't use a main distance object, and for every iteration we have to re-calculate either
      ##   1) the distance matrix (because we're also sampling the features), or
      ##   2) the submat (if using km)

      if ( clusterAlg != "km" ) {
        if ( ! distance %in% names( amap.distance ) ) stop("unsupported distance.")
        ## good, we have a supported distance type
        this_dist <- Dist( t( sample_x$submat ), method=as.character( amap.distance[ distance ] ) )
        ## now fix dumb amap naming convention for distance metrics
        attr( this_dist, "method" ) <- as.character( amap.distance[ distance ] )
      } else {
        ##browser()
        ##clusterAlg == "km"
        ## if we're not sampling the features, then grab the colslice
        if ( is.null( pFeature ) ||
             ( ( pFeature == 1 ) && is.null( weightsFeature ) ) ) {
          this_dist <- d[, sample_x$subcols ]
        } else {
          if ( length( sample_x$submat ) == 1 && is.na( sample_x$submat ) ) {
            save( sample_x, file="ccrun.submat.eq.na.dbg.rda" )
            stop( "Houston, we have a problem. sample_x$submat is NA in ccRun when it should be specified - saving state\n" )
          }

          this_dist <- sample_x$submat
        }
      }
    }

    ## cluster samples for HC.
    this_cluster=NA
    if(clusterAlg=="hc"){
      this_cluster = hclust( this_dist, method=innerLinkage)
    }
    ##browser()
    ##mCount is the possible number of times that two samples occur in the same random subsample, independent of k
    ##mCount stores the number of times a sample pair was sampled together.
    mCount <- connectivityMatrix( rep( 1,length(sample_x[[3]])),
                                  mCount,
                                  sample_x[[3]] )

    ##use samples for each k
    for (k in 2:maxK){
      if(verbose){
        message(paste(" k =",k))
      }
      if (i==1){
        ml[[k]] = mConsist #initialize
      }
      this_assignment=NA
      if(clusterAlg=="hc"){
        ##prune to k for hc
        this_assignment = cutree(this_cluster,k)
        ##browser()
      }else if(clusterAlg=="km"){
        ##this_dist should now be a matrix corresponding to the result from sampleCols
        this_assignment <- kmeans( t( this_dist ),
                                   k,
                                   iter.max = 10,
                                   nstart = 1,
                                   algorithm = c("Hartigan-Wong") )$cluster
      }else if ( clusterAlg == "pam" ) {
        require( cluster )
        this_assignment <- pam( x=this_dist,
                                k,
                                diss=TRUE,
                                metric=distance,
                                cluster.only=TRUE )
      } else{
        ##optional clusterAlg hook.
        this_assignment <- get(clusterAlg)(this_dist, k)
      }
      ##add to tally
      ml[[k]] <- connectivityMatrix( this_assignment,
                                     ml[[k]],
                                     sample_x[[3]] )
    }
  }


  ##consensus fraction
  res = vector(mode="list",maxK)
  for (k in 2:maxK){
    ##fill in other half of matrix for tally and count.
    tmp = triangle(ml[[k]],mode=3)
    tmpCount = triangle(mCount,mode=3)
    res[[k]] = tmp / tmpCount
    res[[k]][which(tmpCount==0)] = 0
  }
  message("end fraction")
  return(res)
}


connectivityMatrix <- function( clusterAssignments, m, sampleKey){
  ##input: named vector of cluster assignments, matrix to add connectivities
  ##output: connectivity matrix
  names( clusterAssignments ) <- sampleKey
  cls <- lapply( unique( clusterAssignments ), function(i) as.numeric( names( clusterAssignments[ clusterAssignments %in% i ] ) ) )

  for ( i in 1:length( cls ) ) {
    nelts <- 1:ncol( m )
    cl <- as.numeric( nelts %in% cls[[i]] ) ## produces a binary vector
    updt <- outer( cl, cl )
    m <- m + updt
  }
  return(m)
}
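## A minimal illustration (hypothetical values): with m a 4x4 zero matrix,
##   connectivityMatrix( c(1,1,2), m, sampleKey=c(2,3,4) )
## adds 1 to entries (2,2),(2,3),(3,2),(3,3) (samples 2 and 3 co-clustered) and to (4,4);
## sample 1 was not part of this subsample, so its row/column are left untouched.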

## returns a list with the sample columns, as well as the sub-matrix & sample features (if necessary)
## if no sampling over the features is performed, the submatrix & sample features are returned as NAs
## to reduce memory overhead
sampleCols <- function( d,
                        pSamp=NULL,
                        pRow=NULL,
                        weightsItem=NULL,
                        weightsFeature=NULL ){
  space <- ifelse( inherits( d, "dist" ), ncol( as.matrix(d) ), ncol(d) )
  sampleN <- floor(space*pSamp)
  sampCols <- sort( sample(space, sampleN, replace = FALSE, prob = weightsItem) )

  this_sample <- sampRows <- NA
  if ( inherits( d, "matrix" ) ) {
    if ( (! is.null( pRow ) ) &&
         ( (pRow < 1 ) || (! is.null( weightsFeature ) ) ) ) {
      ## only sample the rows and generate a sub-matrix if we're sampling over the row/gene/features
      space = nrow(d)
      sampleN = floor(space*pRow)
      sampRows = sort( sample(space, sampleN, replace = FALSE, prob = weightsFeature) )
      this_sample <- d[sampRows,sampCols]
      dimnames(this_sample) <- NULL
    } else {
      ## do nothing
    }
  }
  return( list( submat=this_sample,
                subrows=sampRows,
                subcols=sampCols ) )
}
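## A minimal illustration (hypothetical data): for a 10x20 data matrix dat,
##   sampleCols( dat, pSamp=0.8, pRow=1 )
## returns 16 sorted column indices in $subcols and leaves $submat and $subrows as NA,
## since no feature (row) sampling was requested.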

CDF=function(ml,breaks=100){
  #plot CDF distribution
  plot(c(0),xlim=c(0,1),ylim=c(0,1),col="white",bg="white",xlab="consensus index",ylab="CDF",main="consensus CDF", las=2)
  k=length(ml)
  this_colors = rainbow(k-1)
  areaK = c()
  for (i in 2:length(ml)){
    v=triangle(ml[[i]],mode=1)

    #empirical CDF distribution. default number of breaks is 100
    h = hist(v, plot=FALSE, breaks=seq(0,1,by=1/breaks))
    h$counts = cumsum(h$counts)/sum(h$counts)

    #calculate area under CDF curve, by histogram method.
    thisArea=0
    for (bi in 1:(length(h$breaks)-1)){
      thisArea = thisArea + h$counts[bi]*(h$breaks[bi+1]-h$breaks[bi]) #increment area by height times width
    }
    areaK = c(areaK,thisArea)
    lines(h$mids,h$counts,col=this_colors[i-1],lwd=2,type='l')
  }
  legend(0.8,0.5,legend=paste(rep("",k-1),seq(2,k,by=1),sep=""),fill=this_colors)

  #plot area under CDF change.
  deltaK=areaK[1] #initial auc at k=2
  for(i in 2:(length(areaK))){
    #proportional increase relative to prior K.
    deltaK = c(deltaK,( areaK[i] - areaK[i-1])/areaK[i-1])
  }
  plot(1+(1:length(deltaK)),y=deltaK,xlab="k",ylab="relative change in area under CDF curve",main="Delta area",type="b")
}


myPal = function(n=10){
  #returns n colors
  seq = rev(seq(0,255,by=255/(n)))
  palRGB = cbind(seq,seq,255)
  rgb(palRGB,maxColorValue=255)
}

setClusterColors = function(past_ct,ct,colorU,colorList){
  #description: sets common color of clusters between different K
  newColors = c()
  if(length(colorList)==0){
    #k==2
    newColors = colorU[ct]
    colori=2
  }else{
    newColors = rep(NULL,length(ct))
    colori = colorList[[2]]
    mo=table(past_ct,ct)
    m=mo/apply(mo,1,sum)
    for(tci in 1:ncol(m)){ # for each cluster
      maxC = max(m[,tci])
      pci = which(m[,tci] == maxC)
      if( sum(m[,tci]==maxC)==1 & max(m[pci,])==maxC & sum(m[pci,]==maxC)==1 ) {
        #if the new column maximum is unique, and the same cell is also the unique row maximum
        ##Note: the prior cluster that contributes most to this cluster also sends most of its members here, so keep its color.
        newColors[which(ct==tci)] = unique(colorList[[1]][which(past_ct==pci)]) # one value
      }else{ #add new color.
        colori=colori+1
        newColors[which(ct==tci)] = colorU[colori]
      }
    }
  }
  return(list(newColors,colori,unique(newColors) ))
}

clusterTrackingPlot = function(m){
  #description: plots cluster tracking plot
  #input: m - matrix where rows are k, columns are samples, and values are cluster assignments.
  plot(NULL,xlim=c(-0.1,1),ylim=c(0,1),axes=FALSE,xlab="samples",ylab="k",main="tracking plot")
  for(i in 1:nrow(m)){
    rect( xleft=seq(0,1-1/ncol(m),by=1/ncol(m)), ybottom=rep(1-i/nrow(m),ncol(m)) , xright=seq(1/ncol(m),1,by=1/ncol(m)), ytop=rep(1-(i-1)/nrow(m),ncol(m)), col=m[i,],border=NA)
  }
  #hatch lines to indicate samples
  xl = seq(0,1-1/ncol(m),by=1/ncol(m))
  segments( xl, rep(-0.1,ncol(m)) , xl, rep(0,ncol(m)), col="black") #** alt white and black color?
  ypos = seq(1,0,by=-1/nrow(m))-1/(2*nrow(m))
  text(x=-0.1,y=ypos[-length(ypos)],labels=seq(2,nrow(m)+1,by=1))
}

triangle = function(m,mode=1){
  #mode==1 for CDF: vector of the lower-triangle values.
  #mode==2 for calcICL: nonredundant half matrix (no double counts).
  #mode==3 for the full (symmetrized) matrix.
  n=dim(m)[1]
  nm = matrix(0,ncol=n,nrow=n)
  fm = m

  nm[upper.tri(nm)] = m[upper.tri(m)] #only upper half

  fm = t(nm)+nm
  diag(fm) = diag(m)

  nm=fm
  nm[upper.tri(nm)] = NA
  diag(nm) = NA
  vm = m[lower.tri(nm)]

  if(mode==1){
    return(vm) #vector
  }else if(mode==3){
    return(fm) #return full matrix
  }else if(mode == 2){
    return(nm) #returns lower triangle and no diagonal. no double counts.
  }

}
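## A small illustration (hypothetical 3x3 consensus matrix m):
##   triangle(m, mode=1)  ## numeric vector of the 3 lower-triangle entries (used for the CDF)
##   triangle(m, mode=2)  ## half matrix: upper triangle and diagonal set to NA
##   triangle(m, mode=3)  ## full symmetric matrix rebuilt from the upper triangle, original diagonal kept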


rankedBarPlot=function(d,myc,cc,title){
  colors = rbind() #each row is a barplot series
  byRank = cbind()

  spaceh = 0.1 #space between bars
  for(i in 1:ncol(d)){
    byRank = cbind(byRank,sort(d[,i],na.last=F))
    colors = rbind(colors,order(d[,i],na.last=F))
  }
  maxH = max(c(1.5,apply(byRank,2,sum)),na.rm=T) #maximum height of graph

  #barplot largest to smallest so that smallest is in front.
  barp = barplot( apply(byRank,2,sum) , col=myc[colors[,1]] ,space=spaceh,ylim=c(0,maxH),main=paste("item-consensus", title),border=NA,las=1 )
  for(i in 2:nrow(byRank)){
    barplot( apply(matrix(byRank[i:nrow(byRank),],ncol=ncol(byRank)) ,2,sum), space=spaceh,col=myc[colors[,i]],ylim=c(0,maxH), add=T,border=NA,las=1 )
  }
  xr=seq(spaceh,ncol(d)+ncol(d)*spaceh,(ncol(d)+ncol(d)*spaceh)/ncol(d) )
  #class labels as asterisks
  text("*",x=xr+0.5,y=maxH,col=myc[cc],cex=1.4) #rect(xr,1.4,xr+1,1.5,col=myc[cc] )
}



######################################################################
## main script: parse options, run the consensus clustering and write the report

spec <- matrix( c( "data.fname",      "d", 1, "character",
                   "direction",       "n", 2, "character",
                   "output.name",     "o", 2, "character",
                   "cluster.alg",     "a", 2, "character",
                   "distance.metric", "m", 2, "character",
                   "max.k",           "k", 2, "integer",
                   "reps",            "r", 2, "integer",
                   "innerLinkage",    "i", 1, "character",
                   "finalLinkage",    "f", 1, "character",
                   "out.report.dir",  "p", 2, "character",
                   "out.report.html", "h", 2, "character"
                   ),
                nc=4,
                byrow=TRUE
                )

opt <- getopt( spec=spec )

## defaults for the non-required params
if ( is.null( opt$direction ) )       { opt$direction <- "cols" }
if ( is.null( opt$cluster.alg ) )     { opt$cluster.alg <- "pam" }
if ( is.null( opt$output.name ) )     { opt$output.name <- "consensus.cluster.result" }
if ( is.null( opt$distance.metric ) ) { opt$distance.metric <- "cosine" }
if ( is.null( opt$max.k ) )           { opt$max.k <- 10 }
if ( is.null( opt$reps ) )            { opt$reps <- 1000 }
if ( is.null( opt$innerLinkage ) )    { opt$innerLinkage <- "average" }
if ( is.null( opt$finalLinkage ) )    { opt$finalLinkage <- "average" }

if ( is.null( opt$out.report.dir ) )  { opt$out.report.dir <- "report" }
if ( is.null( opt$out.report.html ) ) { opt$out.report.html <- file.path( "report", "index.html" ) }

## validate params here (make sure they are set to valid values)
if ( ! opt$cluster.alg %in% c( "hc", "km", "pam" ) ) {
  stop( "invalid clustering algorithm specified: ", opt$cluster.alg )
}


data <- as.matrix( read.delim( opt$data.fname, header=T, row.names=1, check.names=FALSE ) )
## transpose the matrix if we want to cluster the rows (genes)
if ( opt$direction == "rows" ) {
  data <- t( data )
}

title <- paste( opt$cluster.alg, opt$output.name, sep="." )
results <- ConsensusClusterPlus( data,
                                 maxK=opt$max.k,
                                 reps=opt$reps,
                                 pItem=0.8,
                                 pFeature=1,
                                 title=opt$out.report.dir,
                                 clusterAlg=opt$cluster.alg,
                                 distance=opt$distance.metric,
                                 innerLinkage=opt$innerLinkage,
                                 finalLinkage=opt$finalLinkage,
                                 plot='png',
                                 writeTable=FALSE,
                                 seed=100,
                                 ##weightsFeature=abs( rnorm( nrow( orig.data ) ) ),
                                 verbose=FALSE )

pngs = list.files(path=opt$out.report.dir, patt="png")
html.out <- paste( "<html>",
                   paste( paste( "<div><img src=\'", pngs, sep="" ), "\'/></div>", sep="" ),
                   "</html>" )
cat( html.out, file=opt$out.report.html )


## re-transpose the matrix back if we've clustered the rows (genes)
if ( opt$direction == "rows" ) {
  data <- t( data )
}
save( file=opt$output.name, data, results)
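
## Downstream use (illustrative only): load() restores the objects under the names they
## were saved with above, e.g.
##   load( "consensus.cluster.result" )      ## or whatever -o was set to
##   results[[3]]$consensusClass             ## per-sample cluster assignments at k=3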