comparison cluster.tools/new.ccplus.R @ 2:b442996b66ae draft

Uploaded
author peter-waltman
date Wed, 27 Feb 2013 20:17:04 -0500
parents
children
comparison
equal deleted inserted replaced
1:e25d2bece0a2 2:b442996b66ae
1 ##!/usr/bin/env Rscript
2 ## Consensus Clustering Script by Peter Waltman
3 ## May 31, 2011
4 ## License under Creative Commons Attribution 3.0 Unported (CC BY 3.0)
5 ##
6 #usage, options and doc goes here
7 argspec <- c("consensus.clustering.R takes a clustering from ConsensusClusterPlus and clinical survival data
8 and generates a KM-plot, along with the log-rank p-values
9
10 Usage:
11 consensus.clustering.R -d <data.file>
12 Optional:
13 -o <output.name>
14 -a <cluster.alg> ## must be either 'hc' or 'km'
15 -m <distance.metric> ## must be one supported by ConsensusClusterPlus
16 -k <max.k>
17 -r <reps>
18 -f <filter> ## filter, o/w no filtering
19
20 \n\n")
21 args <- commandArgs(TRUE)
22 if ( length( args ) == 1 && args =="--help") {
23 write(argspec, stderr())
24 q();
25 }
26
27 require(getopt)
28 ##require(ConsensusClusterPlus)
29 ## if any of the faster clustering methods are available on this system, load them
30 require( amap )
31 require( cluster )
32 if ( any( c( 'flashClust', 'fastcluster' ) %in% installed.packages() ) ) {
33 if ( 'flashClust' %in% installed.packages() ) {
34 require( flashClust )
35 } else {
36 if ( 'fastcluster' %in% installed.packages() ) {
37 require( fastcluster )
38 }
39 }
40 }
41
42 ###################
43 ## code borrowed/updated from ConsensusClusterPlus
44 ###################
45
46 ConsensusClusterPlus <- function( d=NULL,
47 maxK = 3,
48 reps=10,
49 pItem=0.8,
50 pFeature=1,
51 clusterAlg="hc",
52 title="untitled_consensus_cluster",
53 innerLinkage="average",
54 finalLinkage="average",
55 distance=ifelse( inherits(d,"dist"), attr( d, "method" ), "euclidean" ),
56 ml=NULL,
57 tmyPal=NULL,
58 seed=NULL,
59 plot=NULL,
60 writeTable=FALSE,
61 weightsItem=NULL,
62 weightsFeature=NULL,
63 verbose=F ) {
64 ##description: runs consensus subsamples
65
66
67 if(is.null(seed)==TRUE){
68 seed=timeSeed = as.numeric(Sys.time())
69 }
70 set.seed(seed)
71
72 if(is.null(ml)==TRUE){
73
74 if ( inherits( distance, "dist" ) ) {
75 stop( "If you want to pass in a pre-calculated distance object, pass it in as the data, rather than the distance parameter\n" )
76 }
77
78 if ( ! class( d ) %in% c( "dist", "matrix", "ExpressionSet" ) ) {
79 stop("d must be a matrix, distance object or ExpressionSet (eset object)")
80 }
81
82 if ( inherits( d, "dist" ) ) {
83 ## if d is a distance matrix, fix a few things so that they don't cause problems with the analysis
84 ## Note, assumption is that if d is a distance matrix, the user doesn't want to sample over the row features
85 if ( is.null( attr( d, "method" ) ) ) {
86 attr( d, "method" ) <- distance <- "unknown - user-specified"
87 }
88 if ( is.null( distance ) || ( distance != attr( d, "method" ) ) ) {
89 distance <- attr( d, "method" )
90 }
91
92 if ( ( ! is.null( pFeature ) ) && ( pFeature < 1 ) ) {
93 if ( verbose ) warning( "Cannot use the pFeatures parameter when specifying a distance matrix as the data object\n" )
94 pFeature <- 1
95 }
96 if ( ! is.null( weightsFeature ) ) {
97 if ( verbose ) warning( "Cannot use the weightsFeature parameter when specifying a distance matrix as the data object\n" )
98 weightsFeature <- NULL
99 }
100 if ( clusterAlg == "km" ) {
101 if ( verbose ) warning( "You are asking CCPLUS to use K-means to cluster a distance matrix (rather than the data itself) - this may produce unintended results. We suggest using PAM if you want to use alternate distance metrics/objects\n" )
102 ##d <- as.matrix( d ) #this is now done w/in ccRun
103 }
104 } else {
105 if ( is.null( distance ) ) {
106 ## we should never get here, but just in case
107 distance <- "pearson"
108 }
109 }
110
111 if ( ( clusterAlg == "km" ) && inherits( distance, "character" ) && ( distance != "euclidean" ) ) {
112 warning( "WARNING: kmeans can only use the euclidean distance metric. If you would like to use an alternate metric, we suggest using PAM or HC clustering instead. This parameter combinationwill use k-means, but will NOT use the specified distance metric\n" )
113 distance <- 'euclidean'
114 }
115
116
117 if ( inherits( d,"ExpressionSet" ) ) {
118 d <- exprs(d)
119 }
120
121 ml <- ccRun( d=d,
122 maxK=maxK,
123 repCount=reps,
124 diss=inherits(d,"dist"),
125 pItem=pItem,
126 pFeature=pFeature,
127 innerLinkage=innerLinkage,
128 clusterAlg=clusterAlg,
129 weightsFeature=weightsFeature,
130 weightsItem=weightsItem,
131 distance=distance,
132 verbose=verbose)
133 }
134 res=list();
135
136 ##make results directory
137 if((is.null(plot)==FALSE | writeTable) & !file.exists(paste(title,sep=""))){
138 dir.create(paste(title,sep=""))
139 }
140
141 ##write log file
142 log <- matrix( ncol=2,
143 byrow=T,
144 c("title",title,
145 "maxK",maxK,
146 "input matrix rows",ifelse ( inherits( d, "matrix" ), nrow(d), "dist-mat" ),
147 "input matric columns",ifelse ( inherits( d, "matrix" ), ncol(d), ncol( as.matrix(d) ) ),
148 "number of bootstraps",reps,
149 "item subsampling proportion",pItem,
150 "feature subsampling proportion",ifelse( is.null(pFeature), 1, pFeature ),
151 "cluster algorithm",clusterAlg,
152 "inner linkage type",innerLinkage,
153 "final linkage type",finalLinkage,
154 "correlation method",distance,
155 "plot",if(is.null(plot)) NA else plot,
156 "seed",if(is.null(seed)) NA else seed))
157 colnames(log) = c("option","value")
158 if(writeTable){
159 write.csv(file=paste(title,"/",title,".log.csv",sep=""), log,row.names=F)
160 }
161 if(is.null(plot)){
162 ##nothing
163 }else if(plot=="png"){
164 png(paste(title,"/","consensus%03d.png",sep=""))
165 }else if (plot=="pdf"){
166 pdf(onefile=TRUE, paste(title,"/","consensus.pdf",sep=""))
167 }else if (plot=="ps"){
168 postscript(onefile=TRUE, paste(title,"/","consensus.ps",sep=""))
169 }
170
171 colorList=list()
172 colorM = rbind() #matrix of colors.
173
174 #18 colors for marking different clusters
175 thisPal <- c("#A6CEE3","#1F78B4","#B2DF8A","#33A02C","#FB9A99","#E31A1C","#FDBF6F","#FF7F00","#CAB2D6","#6A3D9A","#FFFF99","#B15928",
176 "#bd18ea", #magenta
177 "#2ef4ca", #aqua
178 "#f4cced", #pink,
179 "#f4cc03", #lightorange
180 "#05188a", #navy,
181 "#e5a25a", #light brown
182 "#06f106", #bright green
183 "#85848f", #med gray
184 "#000000", #black
185 "#076f25", #dark green
186 "#93cd7f",#lime green
187 "#4d0776", #dark purple
188 "#ffffff" #white
189 )
190
191 ##plot scale
192 colBreaks=NA
193 if(is.null(tmyPal)==TRUE){
194 colBreaks=10
195 tmyPal = myPal(colBreaks)
196 }else{
197 colBreaks=length(tmyPal)
198 }
199 sc = cbind(seq(0,1,by=1/( colBreaks) )); rownames(sc) = sc[,1]
200 sc = cbind(sc,sc)
201 heatmap(sc, Colv=NA, Rowv=NA, symm=FALSE, scale='none', col=tmyPal, na.rm=TRUE,labRow=rownames(sc),labCol=F,main="consensus matrix legend")
202
203 for (tk in 2:maxK){
204 if(verbose){
205 message(paste("consensus ",tk))
206 }
207 fm = ml[[tk]]
208 hc=hclust( as.dist( 1 - fm ), method=finalLinkage);
209 message("clustered")
210 ct = cutree(hc,tk)
211 names(ct) = colnames(d)
212 c = fm
213 ##colnames(c) = colnames(d)
214 ##rownames(c) = colnames(d)
215
216 colorList = setClusterColors(res[[tk-1]][[3]],ct,thisPal,colorList)
217
218 pc = c
219 pc=pc[hc$order,] #***pc is matrix for plotting, same as c but is row-ordered and has names and extra row of zeros.
220 pc = rbind(pc,0)
221
222 heatmap(pc, Colv=as.dendrogram(hc), Rowv=NA, symm=FALSE, scale='none', col=tmyPal, na.rm=TRUE,labRow=F,labCol=F,mar=c(5,5),main=paste("consensus matrix k=",tk,sep="") , ColSideCol=colorList[[1]])
223 legend("topright",legend=unique(ct),fill=unique(colorList[[1]]),horiz=FALSE )
224
225 res[[tk]] = list(consensusMatrix=c,consensusTree=hc,consensusClass=ct,ml=ml[[tk]],clrs=colorList)
226 colorM = rbind(colorM,colorList[[1]])
227 }
228 CDF(ml)
229 clusterTrackingPlot(colorM[,res[[length(res)]]$consensusTree$order])
230 if(is.null(plot)==FALSE){
231 dev.off();
232 }
233 res[[1]] = colorM
234 if(writeTable){
235 for(i in 2:length(res)){
236 write.csv(file=paste(title,"/",title,".k=",i,".consensusMatrix.csv",sep=""), res[[i]]$consensusMatrix)
237 write.table(file=paste(title,"/",title,".k=",i,".consensusClass.csv",sep=""), res[[i]]$consensusClass,col.names = F,sep=",")
238 }
239 }
240 return(res)
241 }
242
243
244 calcICL = function(res,title="untitled_consensus_cluster",plot=NULL,writeTable=FALSE){
245 #calculates and plots cluster consensus and item consensus
246 cc=rbind()
247 cci = rbind()
248 sumRes=list()
249 colorsArr=c()
250
251 #make results directory
252 if((is.null(plot)==FALSE | writeTable) & !file.exists(paste(title,sep=""))){
253 dir.create(paste(title,sep=""))
254 }
255 if(is.null(plot)){
256 #to screen
257 }else if(plot=="pdf"){
258 pdf(onefile=TRUE, paste(title,"/","icl.pdf",sep=""))
259 }else if(plot=="ps"){
260 postscript(onefile=TRUE, paste(title,"/","icl.ps",sep=""))
261 }else if (plot=="png"){
262 png(paste(title,"/","icl%03d.png",sep=""))
263 }
264
265 par(mfrow=c(3,1),mar=c(4,3,2,0))
266
267 for (k in 2:length(res)){ #each k
268 eiCols = c();
269 o = res[[k]]
270 m = o$consensusMatrix
271 m = triangle(m,mode=2)
272 for (ci in sort(unique(o$consensusClass))){ #each cluster in k
273 items = which(o$consensusClass==ci)
274 nk = length(items)
275 mk = sum( m[items,items], na.rm=T)/((nk*(nk-1))/2)
276 cc=rbind(cc,c(k,ci,mk)) #cluster-consensus
277
278 for (ei in rev(res[[2]]$consensusTree$order) ){
279 denom = if (ei %in% items) { nk - 1} else { nk }
280 mei = sum( c(m[ei,items],m[items,ei]), na.rm=T)/denom # mean item consensus to a cluster.
281 cci = rbind(cci,c(k,ci,ei,mei)) #cluster, cluster index, item index, item-consensus
282 }
283 eiCols = c(eiCols, rep(ci,length(o$consensusClass)) )
284 }
285
286 cck = cci[which(cci[,1]==k),] #only plot the new k data.
287
288 #group by item, order by cluster i
289 w=lapply(split(cck,cck[,3]), function(x) { y=matrix(unlist(x),ncol=4); y[order(y[,2]),4] })
290 q = matrix(as.numeric(unlist(w)),ncol=length(w),byrow=F)
291 q = q[,res[[2]]$consensusTree$order] #order by leave order of k=2
292 #q is a matrix of k rows and sample columns, values are item consensus of sample to the cluster.
293
294 thisColors = unique(cbind(res[[k]]$consensusClass,res[[k]]$clrs[[1]]))
295 thisColors=thisColors[order(as.numeric(thisColors[,1])),2]
296 colorsArr=c(colorsArr,thisColors)
297 sumRes[[k]] = rankedBarPlot(q,thisColors,cc=res[[k]]$consensusClass[res[[2]]$consensusTree$order],paste("k=",k,sep="") )
298 }
299
300 ys=cs=lab=c()
301 lastk=cc[1,1]
302 for(i in 1:length(colorsArr)){
303 if(lastk != cc[i,1]){
304 ys=c(ys,0,0)
305 cs=c(cs,NA,NA)
306 lastk=cc[i,1]
307 lab=c(lab,NA,NA)
308 }
309 ys=c(ys,cc[i,3])
310 cs=c(cs,colorsArr[i])
311 lab=c(lab,cc[i,1])
312 }
313 names(ys) = lab
314 par(mfrow=c(3,1),mar=c(4,3,2,0))
315 barplot(ys,col=cs,border=cs,main="cluster-consensus",ylim=c(0,1),las=1)
316 if(is.null(plot)==FALSE){
317 dev.off()
318 }
319 colnames(cc) = c("k","cluster","clusterConsensus")
320 colnames(cci) = c("k","cluster","item","itemConsensus")
321 cci[,"item"] = names(res[[2]]$consensusClass)[ cci[,"item"] ]
322 #type cci
323 cci = data.frame( k=as.numeric(cci[,"k"]), cluster=as.numeric(cci[,"cluster"]), item=cci[,"item"], itemConsensus=as.numeric(cci[,"itemConsensus"]))
324
325 #write to file.
326 if(writeTable){
327 write.csv(file=paste(title,"/",title,".summary.cluster.consensus.csv",sep=""),row.names=F, cc)
328 write.csv(file=paste(title,"/",title,".summary.item.consensus.csv",sep=""), row.names=F, cc)
329 }
330 return(list(clusterConsensus=cc,itemConsensus=cci))
331 }
332
333
334 ccRun <- function( d=d,
335 maxK=NULL,
336 repCount=NULL,
337 diss=inherits( d, "dist" ),
338 pItem=NULL,
339 pFeature=NULL,
340 innerLinkage=NULL,
341 distance=ifelse( inherits(d,"dist"), attr( d, "method" ), "euclidean" ),
342 clusterAlg=NULL,
343 weightsItem=NULL,
344 weightsFeature=NULL,
345 verbose=NULL) {
346 m = vector(mode='list', repCount)
347 ml = vector(mode="list",maxK)
348 n <- ifelse( diss, ncol( as.matrix(d) ), ncol(d) )
349 mCount = mConsist = matrix(c(0),ncol=n,nrow=n)
350 ml[[1]] = c(0);
351
352 if (is.null( distance ) ) distance <- 'euclidean' ## necessary if d is a dist object and attr( d, "method" ) == NULLa
353
354 require( amap )
355 ## we're going to use the amap Dist function, but they misname their correlation
356 ## functions, so re-name them correctly
357 amap.distance <- c( "euclidean", "maximum", "manhattan", "canberra", "binary",
358 "pearson", "abspearson", "correlation", "abscorrelation", "spearman", "kendall" )
359 names( amap.distance ) <- c( "euclidean", "maximum", "manhattan", "canberra", "binary",
360 "cosine", "abscosine", "pearson", "abspearson", "spearman", "kendall" )
361 main.dist.obj <- NULL
362 ##browser()
363 if ( diss ){
364 main.dist.obj <- d
365
366 ## reset the pFeature & weightsFeature params if they've been set (irrelevant if d is a dist matrix)
367 if ( ( !is.null(pFeature) ) &&
368 ( pFeature < 1 ) ) {
369 if (verbose) warning( "user-supplied data is a distance matrix; ignoring user-specified pFeature parameter\n" )
370 pFeature <- 1 # set it to 1 to avoid problems with sampleCols
371 }
372 if ( ! is.null( weightsFeature ) ) {
373 if (verbose) warning( "user-supplied data is a distance matrix; ignoring user-specified weightsFeature parameter\n" )
374 weightsFeature <- NULL # set it to NULL to avoid problems with sampleCols
375 }
376 } else { ## d is a data matrix
377 ## we're not sampling over the features
378 if ( ( clusterAlg != "km" ) &&
379 ( is.null( pFeature ) ||
380 ( ( pFeature == 1 ) && is.null( weightsFeature ) ) ) ) {
381 ## only generate a main.dist.object IFF 1) d is a matrix, 2) we're not sampling the features, and 3) the algorithm isn't 'km'
382 if ( inherits( distance, "character" ) ) {
383 if ( ! distance %in% names( amap.distance ) ) stop("unsupported distance.")
384
385 main.dist.obj <- Dist( t(d), method=as.character( amap.distance[ distance ] ) )
386 ## now fix dumb amap naming convention for distance metrics
387 attr( main.dist.obj, "method" ) <- as.character( amap.distance[ distance ] )
388 } else stop("unsupported distance specified.")
389 } else {
390 ## pFeature < 1 or a weightsFeature != NULL
391 ## since d is a data matrix, the user wants to sample over the gene features, so main.dist.obj is left as NULL
392 }
393 }
394
395
396 for (i in 1:repCount){
397 ##browser()
398 if(verbose){
399 message(paste("random subsample",i));
400 }
401 ## take expression matrix sample, samples and genes
402 sample_x = sampleCols( d, pItem, pFeature, weightsItem, weightsFeature )
403
404 this_dist = NA
405 if ( ! is.null( main.dist.obj ) ) {
406 boot.cols <- sample_x$subcols
407 this_dist <- as.matrix( main.dist.obj )[ boot.cols, boot.cols ]
408 if ( clusterAlg != "km" ) {
409 ## if this isn't kmeans, then convert to a distance object
410 this_dist <- as.dist( this_dist )
411 attr( this_dist, "method" ) <- attr( main.dist.obj, "method" )
412 }
413 } else {
414 ## if main.dist.obj is NULL, then d is a data matrix, and either:
415 ## 1) clusterAlg is 'km'
416 ## 2) pFeatures < 1 or weightsFeatures have been specified, or
417 ## 3) both
418 ## so we can't use a main distance object and for every iteration, we will have to re-calculate either
419 ## 1) the distance matrix (because we're also sampling the features as well), or
420 ## 2) the submat (if using km)
421
422 if ( clusterAlg != "km" ) {
423 if ( ! distance %in% names( amap.distance ) ) stop("unsupported distance.")
424 ## good, we have a supported distance type
425 this_dist <- Dist( t( sample_x$submat ), method=as.character( amap.distance[ distance ] ) )
426 ## now fix dumb amap naming convention for distance metrics
427 attr( this_dist, "method" ) <- as.character( amap.distance[ distance ] )
428 } else {
429 ##browser()
430 ##clusterAlg == "km"
431 ## if we're not sampling the features, then grab the colslice
432 if ( is.null( pFeature ) ||
433 ( ( pFeature == 1 ) && is.null( weightsFeature ) ) ) {
434 this_dist <- d[, sample_x$subcols ]
435 } else {
436 if ( is.na( sample_x$submat ) ) {
437 save( "ccrun.submat.eq.na.dbg.rda" )
438 stop( "Houston, we have a problem. sample_x$submat is NA in ccRun when it should be specified - saving state\n" )
439 }
440
441 this_dist <- sample_x$submat
442 }
443 }
444 }
445
446 ## cluster samples for HC.
447 this_cluster=NA
448 if(clusterAlg=="hc"){
449 this_cluster = hclust( this_dist, method=innerLinkage)
450 }
451 ##browser()
452 ##mCount is possible number of times that two sample occur in same random sample, independent of k
453 ##mCount stores number of times a sample pair was sampled together.
454 mCount <- connectivityMatrix( rep( 1,length(sample_x[[3]])),
455 mCount,
456 sample_x[[3]] )
457
458 ##use samples for each k
459 for (k in 2:maxK){
460 if(verbose){
461 message(paste(" k =",k))
462 }
463 if (i==1){
464 ml[[k]] = mConsist #initialize
465 }
466 this_assignment=NA
467 if(clusterAlg=="hc"){
468 ##prune to k for hc
469 this_assignment = cutree(this_cluster,k)
470 ##browser()
471 }else if(clusterAlg=="km"){
472 ##this_dist should now be a matrix corresponding to the result from sampleCols
473 this_assignment <- kmeans( t( this_dist ),
474 k,
475 iter.max = 10,
476 nstart = 1,
477 algorithm = c("Hartigan-Wong") )$cluster
478 }else if ( clusterAlg == "pam" ) {
479 require( cluster )
480 this_assignment <- pam( x=this_dist,
481 k,
482 diss=TRUE,
483 metric=distance,
484 cluster.only=TRUE )
485 } else{
486 ##optional cluterArg Hook.
487 this_assignment <- get(clusterAlg)(this_dist, k)
488 }
489 ##add to tally
490 ml[[k]] <- connectivityMatrix( this_assignment,
491 ml[[k]],
492 sample_x[[3]] )
493 }
494 }
495
496
497 ##consensus fraction
498 res = vector(mode="list",maxK)
499 for (k in 2:maxK){
500 ##fill in other half of matrix for tally and count.
501 tmp = triangle(ml[[k]],mode=3)
502 tmpCount = triangle(mCount,mode=3)
503 res[[k]] = tmp / tmpCount
504 res[[k]][which(tmpCount==0)] = 0
505 }
506 message("end fraction")
507 return(res)
508 }
509
510
511 connectivityMatrix <- function( clusterAssignments, m, sampleKey){
512 ##input: named vector of cluster assignments, matrix to add connectivities
513 ##output: connectivity matrix
514 names( clusterAssignments ) <- sampleKey
515 cls <- lapply( unique( clusterAssignments ), function(i) as.numeric( names( clusterAssignments[ clusterAssignments %in% i ] ) ) )
516
517 for ( i in 1:length( cls ) ) {
518 nelts <- 1:ncol( m )
519 cl <- as.numeric( nelts %in% cls[[i]] ) ## produces a binary vector
520 updt <- outer( cl, cl )
521 m <- m + updt
522 }
523 return(m)
524 }
525
526 ## returns a list with the sample columns, as well as the sub-matrix & sample features (if necessary)
527 ## if no sampling over the features is performed, the submatrix & sample features are returned as NAs
528 ## to reduce memory overhead
529 sampleCols <- function( d,
530 pSamp=NULL,
531 pRow=NULL,
532 weightsItem=NULL,
533 weightsFeature=NULL ){
534 space <- ifelse( inherits( d, "dist" ), ncol( as.matrix(d) ), ncol(d) )
535 sampleN <- floor(space*pSamp)
536 sampCols <- sort( sample(space, sampleN, replace = FALSE, prob = weightsItem) )
537
538 this_sample <- sampRows <- NA
539 if ( inherits( d, "matrix" ) ) {
540 if ( (! is.null( pRow ) ) &&
541 ( (pRow < 1 ) || (! is.null( weightsFeature ) ) ) ) {
542 ## only sample the rows and generate a sub-matrix if we're sampling over the row/gene/features
543 space = nrow(d)
544 sampleN = floor(space*pRow)
545 sampRows = sort( sample(space, sampleN, replace = FALSE, prob = weightsFeature) )
546 this_sample <- d[sampRows,sampCols]
547 dimnames(this_sample) <- NULL
548 } else {
549 ## do nothing
550 }
551 }
552 return( list( submat=this_sample,
553 subrows=sampRows,
554 subcols=sampCols ) )
555 }
556
557 CDF=function(ml,breaks=100){
558 #plot CDF distribution
559 plot(c(0),xlim=c(0,1),ylim=c(0,1),col="white",bg="white",xlab="consensus index",ylab="CDF",main="consensus CDF", las=2)
560 k=length(ml)
561 this_colors = rainbow(k-1)
562 areaK = c()
563 for (i in 2:length(ml)){
564 v=triangle(ml[[i]],mode=1)
565
566 #empirical CDF distribution. default number of breaks is 100
567 h = hist(v, plot=FALSE, breaks=seq(0,1,by=1/breaks))
568 h$counts = cumsum(h$counts)/sum(h$counts)
569
570 #calculate area under CDF curve, by histogram method.
571 thisArea=0
572 for (bi in 1:(length(h$breaks)-1)){
573 thisArea = thisArea + h$counts[bi]*(h$breaks[bi+1]-h$breaks[bi]) #increment by height by width
574 bi = bi + 1
575 }
576 areaK = c(areaK,thisArea)
577 lines(h$mids,h$counts,col=this_colors[i-1],lwd=2,type='l')
578 }
579 legend(0.8,0.5,legend=paste(rep("",k-1),seq(2,k,by=1),sep=""),fill=this_colors)
580
581 #plot area under CDF change.
582 deltaK=areaK[1] #initial auc at k=2
583 for(i in 2:(length(areaK))){
584 #proportional increase relative to prior K.
585 deltaK = c(deltaK,( areaK[i] - areaK[i-1])/areaK[i-1])
586 }
587 plot(1+(1:length(deltaK)),y=deltaK,xlab="k",ylab="relative change in area under CDF curve",main="Delta area",type="b")
588 }
589
590
591 myPal = function(n=10){
592 #returns n colors
593 seq = rev(seq(0,255,by=255/(n)))
594 palRGB = cbind(seq,seq,255)
595 rgb(palRGB,maxColorValue=255)
596 }
597
598 setClusterColors = function(past_ct,ct,colorU,colorList){
599 #description: sets common color of clusters between different K
600 newColors = c()
601 if(length(colorList)==0){
602 #k==2
603 newColors = colorU[ct]
604 colori=2
605 }else{
606 newColors = rep(NULL,length(ct))
607 colori = colorList[[2]]
608 mo=table(past_ct,ct)
609 m=mo/apply(mo,1,sum)
610 for(tci in 1:ncol(m)){ # for each cluster
611 maxC = max(m[,tci])
612 pci = which(m[,tci] == maxC)
613 if( sum(m[,tci]==maxC)==1 & max(m[pci,])==maxC & sum(m[pci,]==maxC)==1 ) {
614 #if new column maximum is unique, same cell is row maximum and is also unique
615 ##Note: the greatest of the prior clusters' members are the greatest in a current cluster's members.
616 newColors[which(ct==tci)] = unique(colorList[[1]][which(past_ct==pci)]) # one value
617 }else{ #add new color.
618 colori=colori+1
619 newColors[which(ct==tci)] = colorU[colori]
620 }
621 }
622 }
623 return(list(newColors,colori,unique(newColors) ))
624 }
625
626 clusterTrackingPlot = function(m){
627 #description: plots cluster tracking plot
628 #input: m - matrix where rows are k, columns are samples, and values are cluster assignments.
629 plot(NULL,xlim=c(-0.1,1),ylim=c(0,1),axes=FALSE,xlab="samples",ylab="k",main="tracking plot")
630 for(i in 1:nrow(m)){
631 rect( xleft=seq(0,1-1/ncol(m),by=1/ncol(m)), ybottom=rep(1-i/nrow(m),ncol(m)) , xright=seq(1/ncol(m),1,by=1/ncol(m)), ytop=rep(1-(i-1)/nrow(m),ncol(m)), col=m[i,],border=NA)
632 }
633 #hatch lines to indicate samples
634 xl = seq(0,1-1/ncol(m),by=1/ncol(m))
635 segments( xl, rep(-0.1,ncol(m)) , xl, rep(0,ncol(m)), col="black") #** alt white and black color?
636 ypos = seq(1,0,by=-1/nrow(m))-1/(2*nrow(m))
637 text(x=-0.1,y=ypos[-length(ypos)],labels=seq(2,nrow(m)+1,by=1))
638 }
639
640 triangle = function(m,mode=1){
641 #mode=1 for CDF, vector of lower triangle.
642 #mode==3 for full matrix.
643 #mode==2 for calcICL; nonredundant half matrix coun
644 #mode!=1 for summary
645 n=dim(m)[1]
646 nm = matrix(0,ncol=n,nrow=n)
647 fm = m
648
649
650 nm[upper.tri(nm)] = m[upper.tri(m)] #only upper half
651
652 fm = t(nm)+nm
653 diag(fm) = diag(m)
654
655 nm=fm
656 nm[upper.tri(nm)] = NA
657 diag(nm) = NA
658 vm = m[lower.tri(nm)]
659
660 if(mode==1){
661 return(vm) #vector
662 }else if(mode==3){
663 return(fm) #return full matrix
664 }else if(mode == 2){
665 return(nm) #returns lower triangle and no diagonal. no double counts.
666 }
667
668 }
669
670
671 rankedBarPlot=function(d,myc,cc,title){
672 colors = rbind() #each row is a barplot series
673 byRank = cbind()
674
675 spaceh = 0.1 #space between bars
676 for(i in 1:ncol(d)){
677 byRank = cbind(byRank,sort(d[,i],na.last=F))
678 colors = rbind(colors,order(d[,i],na.last=F))
679 }
680 maxH = max(c(1.5,apply(byRank,2,sum)),na.rm=T) #maximum height of graph
681
682 #barplot largest to smallest so that smallest is in front.
683 barp = barplot( apply(byRank,2,sum) , col=myc[colors[,1]] ,space=spaceh,ylim=c(0,maxH),main=paste("item-consensus", title),border=NA,las=1 )
684 for(i in 2:nrow(byRank)){
685 barplot( apply(matrix(byRank[i:nrow(byRank),],ncol=ncol(byRank)) ,2,sum), space=spaceh,col=myc[colors[,i]],ylim=c(0,maxH), add=T,border=NA,las=1 )
686 }
687 xr=seq(spaceh,ncol(d)+ncol(d)*spaceh,(ncol(d)+ncol(d)*spaceh)/ncol(d) )
688 #class labels as asterisks
689 text("*",x=xr+0.5,y=maxH,col=myc[cc],cex=1.4) #rect(xr,1.4,xr+1,1.5,col=myc[cc] )
690 }
691
692
693
694 ###################################################################3333
695 ## RESTART MY SCRIPTS HERE
696 ##save.image( '/home/waltman/work.local/tmp/new.ccplus.R.dbg' )
697 stop( "phw forced stop\n")
698 spec <- matrix( c( "data.fname", "d", 1, "character",
699 "direction", "n", 2, "character",
700 "output.name", "o", 2, "character",
701 "cluster.alg", "a", 2, "character", ## must be either 'hc' or 'km'
702 "distance.metric", "m", 2, "character", ## must be one supported by ConsensusClusterPlus
703 "max.k", "k", 2, "integer",
704 "reps", "r", 2, "integer",
705 "innerLinkage", "i", 1, "character",
706 "finalLinkage", "f", 1, "character",
707 "out.report.dir", "p", 2, "character",
708 "out.report.html", "h", 2, "character"
709 ),
710 nc=4,
711 byrow=TRUE
712 )
713
714 opt <- getopt( spec=spec )
715
716 ## default params for non-required params
717 if ( is.null( opt$direction ) ) { opt$direction <- "cols" }
718 if ( is.null( opt$cluster.alg ) ) { opt$cluster.alg <- "pam" }
719 if ( is.null( opt$output.name ) ) { opt$output.name <- "consensus.cluster.result" }
720 if ( is.null( opt$distance.metric ) ) { opt$distance.metric <- "cosine" }
721 if ( is.null( opt$max.k ) ) { opt$max.k <- 10 }
722 if ( is.null( opt$reps ) ) { opt$reps <- 1000 }
723 if ( is.null( opt$innerLinkage ) ) { opt$innerLinkage <- "average" }
724 if ( is.null( opt$finalLinkage ) ) { opt$finalLinkage <- "average" }
725
726 if ( is.null( opt$out.report.dir ) ) { opt$out.report.dir <- "report" }
727 if ( is.null( opt$out.report.html ) ) { opt$out.report.html <- file.path( "report", "index.html" ) }
728
729 ## validate params here (make sure set to valid values)
730 if ( !opt$cluster.alg %in% c( "hc", "km", "pam" ) ) {
731 stop( "invalid clustering algorithm specified", cluster.alg )
732 }
733
734
735 data <- as.matrix( read.delim( opt$data.fname, header=T, row.names=1 , check.names=FALSE ) )
736 ## transpose the matrix if we want to cluster the rows (genes)
737 if ( opt$direction == "rows" ) {
738 data <- t( data )
739 }
740
741
742 title <- paste( opt$cluster.alg, opt$output.name, sep="." )
743 ##source( '~/bin/galaxy-dist/tools/ucsc.cancer.tools/cluster.tools/new.ccplus.R' )
744 results <- ConsensusClusterPlus( data,
745 maxK=opt$max.k,
746 reps=opt$reps,
747 pItem=0.8,
748 ##pFeature=NULL,
749 pFeature=0.5,
750 title=opt$out.report.dir,
751 clusterAlg=opt$cluster.alg,
752 distance=opt$distance.metric,
753 innerLinkage=opt$innerLinkage,
754 finalLinkage=opt$finalLinkage,
755 plot='pdf',
756 writeTable=FALSE,
757 seed=100,
758 weightsFeature=abs( rnorm( nrow( orig.data ) ) ),
759 ##verbose=FALSE )
760 verbose=TRUE )
761
762 pngs = list.files(path=opt$out.report.dir, patt="png")
763 html.out <- paste( "<html>",
764 paste( paste( "<div><img src=\'", pngs, sep="" ), "\'/></div>", sep="" ),
765 "</html>" )
766 cat( html.out, file=opt$out.report.html )
767
768
769 ## re-transpose the matrix back if we've clustered the rows (genes)
770 if ( opt$direction == "rows" ) {
771 data <- t( data )
772 }
773 save( file=opt$output.name, data, results)