diff cluster.tools/consensus.clustering.R @ 0:0decf3fd54bc draft
Uploaded
author   | peter-waltman
date     | Thu, 28 Feb 2013 01:45:39 -0500
parents  |
children |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/consensus.clustering.R	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,774 @@
#!/usr/bin/env Rscript
## Consensus Clustering Script by Peter Waltman
## May 31, 2011
## License under Creative Commons Attribution 3.0 Unported (CC BY 3.0)
##
## usage, options and doc go here
argspec <- c("consensus.clustering.R takes a tab-delimited data matrix and runs consensus clustering on it
(via a bundled copy of the ConsensusClusterPlus routines), writing the clustering results and report plots.

        Usage:
                consensus.clustering.R -d <data.file>
        Optional:
                -o <output.name>
                -a <cluster.alg>        ## must be 'hc', 'km' or 'pam'
                -m <distance.metric>    ## must be one supported by ConsensusClusterPlus
                -k <max.k>
                -r <reps>
                -n <direction>          ## 'cols' (default) or 'rows'
                -i <inner.linkage>
                -f <final.linkage>
                -p <out.report.dir>
                -h <out.report.html>
              \n\n")
args <- commandArgs(TRUE)
if ( length( args ) == 1 && args == "--help" ) {
  write(argspec, stderr())
  q();
}

lib.load.quiet <- function( package ) {
  package <- as.character(substitute(package))
  suppressPackageStartupMessages( do.call( "library", list( package=package ) ) )
}
lib.load.quiet(getopt)
lib.load.quiet( gplots )
lib.load.quiet( amap )
## if any of the faster clustering packages are available on this system, load them
if ( any( c( 'flashClust', 'fastcluster' ) %in% installed.packages() ) ) {
  if ( 'flashClust' %in% installed.packages() ) {
    lib.load.quiet( flashClust )
  } else {
    if ( 'fastcluster' %in% installed.packages() ) {
      lib.load.quiet( fastcluster )
    }
  }
}
##lib.load.quiet(ConsensusClusterPlus)
lib.load.quiet( cluster )

###################
## code borrowed/updated from ConsensusClusterPlus
###################

ConsensusClusterPlus <- function( d=NULL,
                                  maxK=3,
                                  reps=10,
                                  pItem=0.8,
                                  pFeature=1,
                                  clusterAlg="hc",
                                  title="untitled_consensus_cluster",
                                  innerLinkage="average",
                                  finalLinkage="average",
                                  distance=ifelse( inherits(d,"dist"), attr( d, "method" ), "euclidean" ),
                                  ml=NULL,
                                  tmyPal=NULL,
                                  seed=NULL,
                                  plot=NULL,
                                  writeTable=FALSE,
                                  weightsItem=NULL,
                                  weightsFeature=NULL,
                                  verbose=F ) {
  ## description: runs the consensus subsamples

  if(is.null(seed)==TRUE){
    seed=timeSeed = as.numeric(Sys.time())
  }
  set.seed(seed)

  if(is.null(ml)==TRUE){

    if ( inherits( distance, "dist" ) ) {
      stop( "If you want to pass in a pre-calculated distance object, pass it in as the data, rather than the distance parameter\n" )
    }

    if ( ! class( d ) %in% c( "dist", "matrix", "ExpressionSet" ) ) {
      stop("d must be a matrix, distance object or ExpressionSet (eset object)")
    }

    if ( inherits( d, "dist" ) ) {
      ## if d is a distance matrix, fix a few things so that they don't cause problems with the analysis
      ## Note: the assumption is that if d is a distance matrix, the user doesn't want to sample over the row features
      if ( is.null( attr( d, "method" ) ) ) {
        attr( d, "method" ) <- distance <- "unknown - user-specified"
      }
      if ( is.null( distance ) || ( distance != attr( d, "method" ) ) ) {
        distance <- attr( d, "method" )
      }

      if ( ( ! is.null( pFeature ) ) && ( pFeature < 1 ) ) {
        if ( verbose ) warning( "Cannot use the pFeature parameter when specifying a distance matrix as the data object\n" )
        pFeature <- 1
      }
      if ( ! is.null( weightsFeature ) ) {
        if ( verbose ) warning( "Cannot use the weightsFeature parameter when specifying a distance matrix as the data object\n" )
        weightsFeature <- NULL
      }
      if ( clusterAlg == "km" ) {
        if ( verbose ) warning( "You are asking CCPLUS to use K-means to cluster a distance matrix (rather than the data itself) - this may produce unintended results. We suggest using PAM if you want to use alternate distance metrics/objects\n" )
        ##d <- as.matrix( d )  # this is now done w/in ccRun
      }
    } else {
      if ( is.null( distance ) ) {
        ## we should never get here, but just in case
        distance <- "pearson"
      }
    }

    if ( ( clusterAlg == "km" ) && inherits( distance, "character" ) && ( distance != "euclidean" ) ) {
      warning( "WARNING: kmeans can only use the euclidean distance metric. If you would like to use an alternate metric, we suggest using PAM or HC clustering instead. This parameter combination will use k-means, but will NOT use the specified distance metric\n" )
      distance <- 'euclidean'
    }

    if ( inherits( d, "ExpressionSet" ) ) {
      d <- exprs(d)
    }

    ml <- ccRun( d=d,
                 maxK=maxK,
                 repCount=reps,
                 diss=inherits(d,"dist"),
                 pItem=pItem,
                 pFeature=pFeature,
                 innerLinkage=innerLinkage,
                 clusterAlg=clusterAlg,
                 weightsFeature=weightsFeature,
                 weightsItem=weightsItem,
                 distance=distance,
                 verbose=verbose)
  }
  res=list();

  ## make results directory
  if((is.null(plot)==FALSE | writeTable) & !file.exists(paste(title,sep=""))){
    dir.create(paste(title,sep=""))
  }

  ## write log file
  log <- matrix( ncol=2,
                 byrow=T,
                 c("title",title,
                   "maxK",maxK,
                   "input matrix rows",ifelse ( inherits( d, "matrix" ), nrow(d), "dist-mat" ),
                   "input matrix columns",ifelse ( inherits( d, "matrix" ), ncol(d), ncol( as.matrix(d) ) ),
                   "number of bootstraps",reps,
                   "item subsampling proportion",pItem,
                   "feature subsampling proportion",ifelse( is.null(pFeature), 1, pFeature ),
                   "cluster algorithm",clusterAlg,
                   "inner linkage type",innerLinkage,
                   "final linkage type",finalLinkage,
                   "correlation method",distance,
                   "plot",if(is.null(plot)) NA else plot,
                   "seed",if(is.null(seed)) NA else seed))
  colnames(log) = c("option","value")
  if(writeTable){
    write.csv(file=paste(title,"/",title,".log.csv",sep=""), log, row.names=F)
  }
  if(is.null(plot)){
    ## nothing - plot to screen
  }else if(plot=="png"){
    png(paste(title,"/","consensus%03d.png",sep=""))
  }else if (plot=="pdf"){
    pdf(onefile=TRUE, paste(title,"/","consensus.pdf",sep=""))
  }else if (plot=="ps"){
    postscript(onefile=TRUE, paste(title,"/","consensus.ps",sep=""))
  }

  colorList=list()
  colorM = rbind()  # matrix of colors
  ## 25 colors for marking different clusters
  thisPal <- c("#A6CEE3","#1F78B4","#B2DF8A","#33A02C","#FB9A99","#E31A1C","#FDBF6F","#FF7F00","#CAB2D6","#6A3D9A","#FFFF99","#B15928",
               "#bd18ea", # magenta
               "#2ef4ca", # aqua
               "#f4cced", # pink
               "#f4cc03", # light orange
               "#05188a", # navy
               "#e5a25a", # light brown
               "#06f106", # bright green
               "#85848f", # med gray
               "#000000", # black
               "#076f25", # dark green
               "#93cd7f", # lime green
               "#4d0776", # dark purple
               "#ffffff"  # white
               )

  ## plot scale
  colBreaks=NA
  if(is.null(tmyPal)==TRUE){
    colBreaks=10
    tmyPal = myPal(colBreaks)
  }else{
    colBreaks=length(tmyPal)
  }
  sc = cbind(seq(0,1,by=1/( colBreaks) )); rownames(sc) = sc[,1]
  sc = cbind(sc,sc)
  heatmap(sc, Colv=NA, Rowv=NA, symm=FALSE, scale='none', col=tmyPal, na.rm=TRUE, labRow=rownames(sc), labCol=F, main="consensus matrix legend")

  for (tk in 2:maxK){
    if(verbose){
      message(paste("consensus ",tk))
    }
    fm = ml[[tk]]
    hc=hclust( as.dist( 1 - fm ), method=finalLinkage);
    message("clustered")
    ct = cutree(hc,tk)
    names(ct) = colnames(d)
    c = fm
    ##colnames(c) = colnames(d)
    ##rownames(c) = colnames(d)

    colorList = setClusterColors(res[[tk-1]][[3]],ct,thisPal,colorList)

    pc = c
    pc = pc[hc$order,]  # pc is the matrix for plotting: same as c, but row-ordered, with an extra row of zeros added below
    pc = rbind(pc,0)

    heatmap(pc, Colv=as.dendrogram(hc), Rowv=NA, symm=FALSE, scale='none', col=tmyPal, na.rm=TRUE, labRow=F, labCol=F, mar=c(5,5), main=paste("consensus matrix k=",tk,sep=""), ColSideCol=colorList[[1]])
    legend("topright", legend=unique(ct), fill=unique(colorList[[1]]), horiz=FALSE )

    res[[tk]] = list(consensusMatrix=c, consensusTree=hc, consensusClass=ct, ml=ml[[tk]], clrs=colorList)
    colorM = rbind(colorM, colorList[[1]])
  }
  CDF(ml)
  clusterTrackingPlot(colorM[,res[[length(res)]]$consensusTree$order])
  if(is.null(plot)==FALSE){
    dev.off();
  }
  res[[1]] = colorM
  if(writeTable){
    for(i in 2:length(res)){
      write.csv(file=paste(title,"/",title,".k=",i,".consensusMatrix.csv",sep=""), res[[i]]$consensusMatrix)
      write.table(file=paste(title,"/",title,".k=",i,".consensusClass.csv",sep=""), res[[i]]$consensusClass, col.names=F, sep=",")
    }
  }
  return(res)
}


calcICL = function(res, title="untitled_consensus_cluster", plot=NULL, writeTable=FALSE){
  # calculates and plots cluster consensus and item consensus
  cc=rbind()
  cci = rbind()
  sumRes=list()
  colorsArr=c()

  # make results directory
  if((is.null(plot)==FALSE | writeTable) & !file.exists(paste(title,sep=""))){
    dir.create(paste(title,sep=""))
  }
  if(is.null(plot)){
    # to screen
  }else if(plot=="pdf"){
    pdf(onefile=TRUE, paste(title,"/","icl.pdf",sep=""))
  }else if(plot=="ps"){
    postscript(onefile=TRUE, paste(title,"/","icl.ps",sep=""))
  }else if (plot=="png"){
    png(paste(title,"/","icl%03d.png",sep=""))
  }

  par(mfrow=c(3,1),mar=c(4,3,2,0))

  for (k in 2:length(res)){  # each k
    eiCols = c();
    o = res[[k]]
    m = o$consensusMatrix
    m = triangle(m,mode=2)
    for (ci in sort(unique(o$consensusClass))){  # each cluster in k
      items = which(o$consensusClass==ci)
      nk = length(items)
      mk = sum( m[items,items], na.rm=T)/((nk*(nk-1))/2)
      cc=rbind(cc,c(k,ci,mk))  # cluster-consensus

      for (ei in rev(res[[2]]$consensusTree$order) ){
        denom = if (ei %in% items) { nk - 1 } else { nk }
        mei = sum( c(m[ei,items],m[items,ei]), na.rm=T)/denom  # mean item consensus to a cluster
        cci = rbind(cci,c(k,ci,ei,mei))  # k, cluster index, item index, item-consensus
      }
      eiCols = c(eiCols, rep(ci,length(o$consensusClass)) )
    }

    cck = cci[which(cci[,1]==k),]  # only plot the new k data

    # group by item, order by cluster i
    w=lapply(split(cck,cck[,3]), function(x) { y=matrix(unlist(x),ncol=4); y[order(y[,2]),4] })
    q = matrix(as.numeric(unlist(w)),ncol=length(w),byrow=F)
    q = q[,res[[2]]$consensusTree$order]  # order by leaf order of k=2
    # q is a matrix of k rows and sample columns; values are the item consensus of each sample to the cluster

    thisColors = unique(cbind(res[[k]]$consensusClass,res[[k]]$clrs[[1]]))
    thisColors = thisColors[order(as.numeric(thisColors[,1])),2]
    colorsArr=c(colorsArr,thisColors)
    sumRes[[k]] = rankedBarPlot(q, thisColors, cc=res[[k]]$consensusClass[res[[2]]$consensusTree$order], paste("k=",k,sep="") )
  }

  ys=cs=lab=c()
  lastk=cc[1,1]
  for(i in 1:length(colorsArr)){
    if(lastk != cc[i,1]){
      ys=c(ys,0,0)
      cs=c(cs,NA,NA)
      lastk=cc[i,1]
      lab=c(lab,NA,NA)
    }
    ys=c(ys,cc[i,3])
    cs=c(cs,colorsArr[i])
    lab=c(lab,cc[i,1])
  }
  names(ys) = lab
  par(mfrow=c(3,1),mar=c(4,3,2,0))
  barplot(ys, col=cs, border=cs, main="cluster-consensus", ylim=c(0,1), las=1)
  if(is.null(plot)==FALSE){
    dev.off()
  }
  colnames(cc) = c("k","cluster","clusterConsensus")
  colnames(cci) = c("k","cluster","item","itemConsensus")
  cci[,"item"] = names(res[[2]]$consensusClass)[ cci[,"item"] ]
  # re-type cci as a data frame
  cci = data.frame( k=as.numeric(cci[,"k"]), cluster=as.numeric(cci[,"cluster"]), item=cci[,"item"], itemConsensus=as.numeric(cci[,"itemConsensus"]))

  # write to file
  if(writeTable){
    write.csv(file=paste(title,"/",title,".summary.cluster.consensus.csv",sep=""), row.names=F, cc)
    write.csv(file=paste(title,"/",title,".summary.item.consensus.csv",sep=""), row.names=F, cci)
  }
  return(list(clusterConsensus=cc,itemConsensus=cci))
}


ccRun <- function( d=d,
                   maxK=NULL,
                   repCount=NULL,
                   diss=inherits( d, "dist" ),
                   pItem=NULL,
                   pFeature=NULL,
                   innerLinkage=NULL,
                   distance=ifelse( inherits(d,"dist"), attr( d, "method" ), "euclidean" ),
                   clusterAlg=NULL,
                   weightsItem=NULL,
                   weightsFeature=NULL,
                   verbose=NULL) {
  m = vector(mode='list', repCount)
  ml = vector(mode="list",maxK)
  n <- ifelse( diss, ncol( as.matrix(d) ), ncol(d) )
  mCount = mConsist = matrix(c(0),ncol=n,nrow=n)
  ml[[1]] = c(0);

  if (is.null( distance ) ) distance <- 'euclidean'  ## necessary if d is a dist object and attr( d, "method" ) == NULL

  require( amap )
  ## we're going to use the amap Dist function, but it misnames its correlation
  ## functions, so re-name them correctly
  amap.distance <- c( "euclidean", "maximum", "manhattan", "canberra", "binary",
                      "pearson", "abspearson", "correlation", "abscorrelation", "spearman", "kendall" )
  names( amap.distance ) <- c( "euclidean", "maximum", "manhattan", "canberra", "binary",
                               "cosine", "abscosine", "pearson", "abspearson", "spearman", "kendall" )
  main.dist.obj <- NULL
  ##browser()
  if ( diss ){
    main.dist.obj <- d

    ## reset the pFeature & weightsFeature params if they've been set (irrelevant if d is a dist matrix)
    if ( ( !is.null(pFeature) ) &&
         ( pFeature < 1 ) ) {
      if (verbose) warning( "user-supplied data is a distance matrix; ignoring user-specified pFeature parameter\n" )
      pFeature <- 1  # set it to 1 to avoid problems with sampleCols
    }
    if ( ! is.null( weightsFeature ) ) {
      if (verbose) warning( "user-supplied data is a distance matrix; ignoring user-specified weightsFeature parameter\n" )
      weightsFeature <- NULL  # set it to NULL to avoid problems with sampleCols
    }
  } else {  ## d is a data matrix
    ## we're not sampling over the features
    if ( ( clusterAlg != "km" ) &&
         ( is.null( pFeature ) ||
           ( ( pFeature == 1 ) && is.null( weightsFeature ) ) ) ) {
      ## only generate a main.dist.obj IFF 1) d is a matrix, 2) we're not sampling the features, and 3) the algorithm isn't 'km'
      if ( inherits( distance, "character" ) ) {
        if ( ! distance %in% names( amap.distance ) ) stop("unsupported distance.")

        main.dist.obj <- Dist( t(d), method=as.character( amap.distance[ distance ] ) )
        ## now fix the amap naming convention for distance metrics
        attr( main.dist.obj, "method" ) <- as.character( amap.distance[ distance ] )
      } else stop("unsupported distance specified.")
    } else {
      ## pFeature < 1 or weightsFeature != NULL
      ## since d is a data matrix, the user wants to sample over the gene features, so main.dist.obj is left as NULL
    }
  }


  for (i in 1:repCount){
    ##browser()
    if(verbose){
      message(paste("random subsample",i));
    }
    ## take expression matrix sample, samples and genes
    sample_x = sampleCols( d, pItem, pFeature, weightsItem, weightsFeature )

    this_dist = NA
    if ( ! is.null( main.dist.obj ) ) {
      boot.cols <- sample_x$subcols
      this_dist <- as.matrix( main.dist.obj )[ boot.cols, boot.cols ]
      if ( clusterAlg != "km" ) {
        ## if this isn't kmeans, then convert to a distance object
        this_dist <- as.dist( this_dist )
        attr( this_dist, "method" ) <- attr( main.dist.obj, "method" )
      }
    } else {
      ## if main.dist.obj is NULL, then d is a data matrix, and either:
      ##   1) clusterAlg is 'km',
      ##   2) pFeature < 1 or weightsFeature has been specified, or
      ##   3) both
      ## so we can't use a main distance object, and for every iteration we have to re-calculate either
      ##   1) the distance matrix (because we're also sampling the features), or
      ##   2) the submat (if using km)

      if ( clusterAlg != "km" ) {
        if ( ! distance %in% names( amap.distance ) ) stop("unsupported distance.")
        ## good, we have a supported distance type
        this_dist <- Dist( t( sample_x$submat ), method=as.character( amap.distance[ distance ] ) )
        ## now fix the amap naming convention for distance metrics
        attr( this_dist, "method" ) <- as.character( amap.distance[ distance ] )
      } else {
        ##browser()
        ## clusterAlg == "km"
        ## if we're not sampling the features, then grab the column slice
        if ( is.null( pFeature ) ||
             ( ( pFeature == 1 ) && is.null( weightsFeature ) ) ) {
          this_dist <- d[, sample_x$subcols ]
        } else {
          if ( length( sample_x$submat ) == 1 && is.na( sample_x$submat ) ) {
            save( sample_x, file="ccrun.submat.eq.na.dbg.rda" )
            stop( "Houston, we have a problem. sample_x$submat is NA in ccRun when it should be specified - saving state\n" )
          }

          this_dist <- sample_x$submat
        }
      }
    }

    ## cluster samples for HC
    this_cluster=NA
    if(clusterAlg=="hc"){
      this_cluster = hclust( this_dist, method=innerLinkage)
    }
    ##browser()
    ## mCount is the possible number of times that two samples occur in the same random subsample, independent of k
    ## mCount stores the number of times a sample pair was sampled together
    mCount <- connectivityMatrix( rep( 1, length(sample_x[[3]]) ),
                                  mCount,
                                  sample_x[[3]] )

    ## use samples for each k
    for (k in 2:maxK){
      if(verbose){
        message(paste(" k =",k))
      }
      if (i==1){
        ml[[k]] = mConsist  # initialize
      }
      this_assignment=NA
      if(clusterAlg=="hc"){
        ## prune to k for hc
        this_assignment = cutree(this_cluster,k)
        ##browser()
      }else if(clusterAlg=="km"){
        ## this_dist should now be a matrix corresponding to the result from sampleCols
        this_assignment <- kmeans( t( this_dist ),
                                   k,
                                   iter.max = 10,
                                   nstart = 1,
                                   algorithm = c("Hartigan-Wong") )$cluster
      }else if ( clusterAlg == "pam" ) {
        require( cluster )
        this_assignment <- pam( x=this_dist,
                                k,
                                diss=TRUE,
                                metric=distance,
                                cluster.only=TRUE )
      } else{
        ## optional clusterAlg hook
        this_assignment <- get(clusterAlg)(this_dist, k)
      }
      ## add to tally
      ml[[k]] <- connectivityMatrix( this_assignment,
                                     ml[[k]],
                                     sample_x[[3]] )
    }
  }


  ## consensus fraction
  res = vector(mode="list",maxK)
  for (k in 2:maxK){
    ## fill in other half of matrix for tally and count
    tmp = triangle(ml[[k]],mode=3)
    tmpCount = triangle(mCount,mode=3)
    res[[k]] = tmp / tmpCount
    res[[k]][which(tmpCount==0)] = 0
  }
  message("end fraction")
  return(res)
}


connectivityMatrix <- function( clusterAssignments, m, sampleKey){
  ## input: named vector of cluster assignments, matrix to add connectivities to
  ## output: connectivity matrix
  names( clusterAssignments ) <- sampleKey
  cls <- lapply( unique( clusterAssignments ), function(i) as.numeric( names( clusterAssignments[ clusterAssignments %in% i ] ) ) )

  for ( i in 1:length( cls ) ) {
    nelts <- 1:ncol( m )
    cl <- as.numeric( nelts %in% cls[[i]] )  ## produces a binary vector
    updt <- outer( cl, cl )
    m <- m + updt
  }
  return(m)
}
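## Illustrative sketch (guarded so it never runs as part of the tool; the toy values are
## hypothetical): how connectivityMatrix accumulates co-clustering counts. For a subsample
## containing columns 1, 2 and 4 of a 4-sample data set, every pair of samples assigned to
## the same cluster gets +1 in the tally; ccRun later divides the per-k tally (ml[[k]]) by
## the analogous sampling tally (mCount) to obtain the consensus fraction.
if ( FALSE ) {
  m.toy <- matrix( 0, ncol=4, nrow=4 )
  ## columns 1, 2 and 4 were drawn; columns 1 & 2 land in cluster 1, column 4 in cluster 2
  m.toy <- connectivityMatrix( c( 1, 1, 2 ), m.toy, c( 1, 2, 4 ) )
  ## m.toy[1,2] and m.toy[2,1] are now 1; row/column 3 stays 0 because sample 3 wasn't drawn
}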
## returns a list with the sample columns, as well as the sub-matrix & sample features (if necessary)
## if no sampling over the features is performed, the submatrix & sample features are returned as NAs
## to reduce memory overhead
sampleCols <- function( d,
                        pSamp=NULL,
                        pRow=NULL,
                        weightsItem=NULL,
                        weightsFeature=NULL ){
  space <- ifelse( inherits( d, "dist" ), ncol( as.matrix(d) ), ncol(d) )
  sampleN <- floor(space*pSamp)
  sampCols <- sort( sample(space, sampleN, replace = FALSE, prob = weightsItem) )

  this_sample <- sampRows <- NA
  if ( inherits( d, "matrix" ) ) {
    if ( (! is.null( pRow ) ) &&
         ( (pRow < 1 ) || (! is.null( weightsFeature ) ) ) ) {
      ## only sample the rows and generate a sub-matrix if we're sampling over the row/gene features
      space = nrow(d)
      sampleN = floor(space*pRow)
      sampRows = sort( sample(space, sampleN, replace = FALSE, prob = weightsFeature) )
      this_sample <- d[sampRows,sampCols]
      dimnames(this_sample) <- NULL
    } else {
      ## do nothing
    }
  }
  return( list( submat=this_sample,
                subrows=sampRows,
                subcols=sampCols ) )
}

CDF=function(ml,breaks=100){
  # plot the CDF distribution
  plot(c(0),xlim=c(0,1),ylim=c(0,1),col="white",bg="white",xlab="consensus index",ylab="CDF",main="consensus CDF", las=2)
  k=length(ml)
  this_colors = rainbow(k-1)
  areaK = c()
  for (i in 2:length(ml)){
    v=triangle(ml[[i]],mode=1)

    # empirical CDF distribution; default number of breaks is 100
    h = hist(v, plot=FALSE, breaks=seq(0,1,by=1/breaks))
    h$counts = cumsum(h$counts)/sum(h$counts)

    # calculate area under the CDF curve, by the histogram method
    thisArea=0
    for (bi in 1:(length(h$breaks)-1)){
      thisArea = thisArea + h$counts[bi]*(h$breaks[bi+1]-h$breaks[bi])  # increment by height times width
      bi = bi + 1
    }
    areaK = c(areaK,thisArea)
    lines(h$mids,h$counts,col=this_colors[i-1],lwd=2,type='l')
  }
  legend(0.8,0.5,legend=paste(rep("",k-1),seq(2,k,by=1),sep=""),fill=this_colors)

  # plot the change in area under the CDF curve
  deltaK=areaK[1]  # initial AUC at k=2
  for(i in 2:(length(areaK))){
    # proportional increase relative to the prior k
    deltaK = c(deltaK,( areaK[i] - areaK[i-1])/areaK[i-1])
  }
  plot(1+(1:length(deltaK)),y=deltaK,xlab="k",ylab="relative change in area under CDF curve",main="Delta area",type="b")
}


myPal = function(n=10){
  # returns n colors
  seq = rev(seq(0,255,by=255/(n)))
  palRGB = cbind(seq,seq,255)
  rgb(palRGB,maxColorValue=255)
}

setClusterColors = function(past_ct,ct,colorU,colorList){
  # description: assigns a consistent color to matching clusters across different values of k
  newColors = c()
  if(length(colorList)==0){
    # k==2
    newColors = colorU[ct]
    colori=2
  }else{
    newColors = rep(NULL,length(ct))
    colori = colorList[[2]]
    mo=table(past_ct,ct)
    m=mo/apply(mo,1,sum)
    for(tci in 1:ncol(m)){  # for each cluster
      maxC = max(m[,tci])
      pci = which(m[,tci] == maxC)
      if( sum(m[,tci]==maxC)==1 & max(m[pci,])==maxC & sum(m[pci,]==maxC)==1 )  {
        # if the new column maximum is unique, the same cell is the row maximum, and it is also unique
        ## Note: the prior cluster that contributes most to this new cluster also has this new cluster as its own best match
        newColors[which(ct==tci)] = unique(colorList[[1]][which(past_ct==pci)])  # one value
      }else{  # add a new color
        colori=colori+1
        newColors[which(ct==tci)] = colorU[colori]
      }
    }
  }
  return(list(newColors,colori,unique(newColors) ))
}

clusterTrackingPlot = function(m){
  # description: plots the cluster tracking plot
  # input: m - matrix where rows are k, columns are samples, and values are cluster assignments
  plot(NULL,xlim=c(-0.1,1),ylim=c(0,1),axes=FALSE,xlab="samples",ylab="k",main="tracking plot")
  for(i in 1:nrow(m)){
    rect( xleft=seq(0,1-1/ncol(m),by=1/ncol(m)), ybottom=rep(1-i/nrow(m),ncol(m)), xright=seq(1/ncol(m),1,by=1/ncol(m)), ytop=rep(1-(i-1)/nrow(m),ncol(m)), col=m[i,], border=NA)
  }
  # hatch lines to indicate samples
  xl = seq(0,1-1/ncol(m),by=1/ncol(m))
  segments( xl, rep(-0.1,ncol(m)), xl, rep(0,ncol(m)), col="black")  #** alternate white and black color?
  ypos = seq(1,0,by=-1/nrow(m))-1/(2*nrow(m))
  text(x=-0.1,y=ypos[-length(ypos)],labels=seq(2,nrow(m)+1,by=1))
}

triangle = function(m,mode=1){
  # mode==1 for CDF: vector of the lower triangle
  # mode==2 for calcICL: non-redundant half matrix count
  # mode==3 for the full (symmetrized) matrix
  n=dim(m)[1]
  nm = matrix(0,ncol=n,nrow=n)
  fm = m

  nm[upper.tri(nm)] = m[upper.tri(m)]  # only the upper half

  fm = t(nm)+nm
  diag(fm) = diag(m)

  nm=fm
  nm[upper.tri(nm)] = NA
  diag(nm) = NA
  vm = m[lower.tri(nm)]

  if(mode==1){
    return(vm)  # vector
  }else if(mode==3){
    return(fm)  # full matrix
  }else if(mode == 2){
    return(nm)  # lower triangle, no diagonal, no double counts
  }
}


rankedBarPlot=function(d,myc,cc,title){
  colors = rbind()  # each row is a barplot series
  byRank = cbind()

  spaceh = 0.1  # space between bars
  for(i in 1:ncol(d)){
    byRank = cbind(byRank,sort(d[,i],na.last=F))
    colors = rbind(colors,order(d[,i],na.last=F))
  }
  maxH = max(c(1.5,apply(byRank,2,sum)),na.rm=T)  # maximum height of graph

  # barplot largest to smallest so that the smallest is in front
  barp = barplot( apply(byRank,2,sum), col=myc[colors[,1]], space=spaceh, ylim=c(0,maxH), main=paste("item-consensus", title), border=NA, las=1 )
  for(i in 2:nrow(byRank)){
    barplot( apply(matrix(byRank[i:nrow(byRank),],ncol=ncol(byRank)),2,sum), space=spaceh, col=myc[colors[,i]], ylim=c(0,maxH), add=T, border=NA, las=1 )
  }
  xr=seq(spaceh,ncol(d)+ncol(d)*spaceh,(ncol(d)+ncol(d)*spaceh)/ncol(d) )
  # class labels as asterisks
  text("*",x=xr+0.5,y=maxH,col=myc[cc],cex=1.4)  #rect(xr,1.4,xr+1,1.5,col=myc[cc] )
}



#######################################################################
## RESTART MY SCRIPTS HERE

spec <- matrix( c( "data.fname",      "d", 1, "character",
                   "direction",       "n", 2, "character",
                   "output.name",     "o", 2, "character",
                   "cluster.alg",     "a", 2, "character",
                   "distance.metric", "m", 2, "character",
                   "max.k",           "k", 2, "integer",
                   "reps",            "r", 2, "integer",
                   "innerLinkage",    "i", 1, "character",
                   "finalLinkage",    "f", 1, "character",
                   "out.report.dir",  "p", 2, "character",
                   "out.report.html", "h", 2, "character"
                   ),
                nc=4,
                byrow=TRUE
                )

opt <- getopt( spec=spec )

## defaults for the non-required params
if ( is.null( opt$direction ) )       { opt$direction <- "cols" }
if ( is.null( opt$cluster.alg ) )     { opt$cluster.alg <- "pam" }
if ( is.null( opt$output.name ) )     { opt$output.name <- "consensus.cluster.result" }
if ( is.null( opt$distance.metric ) ) { opt$distance.metric <- "cosine" }
if ( is.null( opt$max.k ) )           { opt$max.k <- 10 }
if ( is.null( opt$reps ) )            { opt$reps <- 1000 }
if ( is.null( opt$innerLinkage ) )    { opt$innerLinkage <- "average" }
if ( is.null( opt$finalLinkage ) )    { opt$finalLinkage <- "average" }

if ( is.null( opt$out.report.dir ) )  { opt$out.report.dir <- "report" }
if ( is.null( opt$out.report.html ) ) { opt$out.report.html <- file.path( "report", "index.html" ) }

## validate params here (make sure they're set to valid values)
if ( ! opt$cluster.alg %in% c( "hc", "km", "pam" ) ) {
  stop( "invalid clustering algorithm specified: ", opt$cluster.alg )
}


data <- as.matrix( read.delim( opt$data.fname, header=T, row.names=1, check.names=FALSE ) )
## transpose the matrix if we want to cluster the rows (genes)
if ( opt$direction == "rows" ) {
  data <- t( data )
}

title <- paste( opt$cluster.alg, opt$output.name, sep="." )
results <- ConsensusClusterPlus( data,
                                 maxK=opt$max.k,
                                 reps=opt$reps,
                                 pItem=0.8,
                                 pFeature=1,
                                 title=opt$out.report.dir,
                                 clusterAlg=opt$cluster.alg,
                                 distance=opt$distance.metric,
                                 innerLinkage=opt$innerLinkage,
                                 finalLinkage=opt$finalLinkage,
                                 plot='png',
                                 writeTable=FALSE,
                                 seed=100,
                                 ##weightsFeature=abs( rnorm( nrow( orig.data ) ) ),
                                 verbose=FALSE )

pngs = list.files(path=opt$out.report.dir, patt="png")
html.out <- paste( "<html>",
                   paste( paste( "<div><img src=\'", pngs, sep="" ), "\'/></div>", sep="" ),
                   "</html>" )
cat( html.out, file=opt$out.report.html )


## re-transpose the matrix back if we've clustered the rows (genes)
if ( opt$direction == "rows" ) {
  data <- t( data )
}
save( file=opt$output.name, data, results )
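## Illustrative sketch (guarded so it never runs as part of the tool; file names and the chosen k
## are hypothetical): one way to invoke this script outside of Galaxy and to inspect the saved
## output afterwards. The loaded objects 'data' and 'results' are the ones written by save() above.
if ( FALSE ) {
  ## from a shell:
  ##   Rscript consensus.clustering.R -d expression.tab -a pam -m cosine -k 6 -r 250 -o my.result.rda
  load( "my.result.rda" )                                 # restores 'data' and 'results'
  k.chosen <- 4                                           # e.g. guided by the CDF / delta-area plots
  cluster.labels <- results[[ k.chosen ]]$consensusClass  # named vector: sample -> cluster id
  table( cluster.labels )                                 # cluster sizes
}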