Mercurial > repos > peter-waltman > ucsc_cluster_tools2
view cluster.tools/consensus.clustering.R @ 0:0decf3fd54bc draft
Uploaded
author | peter-waltman |
---|---|
date | Thu, 28 Feb 2013 01:45:39 -0500 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env Rscript ## Consensus Clustering Script by Peter Waltman ## May 31, 2011 ## License under Creative Commons Attribution 3.0 Unported (CC BY 3.0) ## #usage, options and doc goes here argspec <- c("consensus.clustering.R takes a clustering from ConsensusClusterPlus and clinical survival data and generates a KM-plot, along with the log-rank p-values Usage: consensus.clustering.R -d <data.file> Optional: -o <output.name> -a <cluster.alg> ## must be either 'hc' or 'km' -m <distance.metric> ## must be one supported by ConsensusClusterPlus -k <max.k> -r <reps> -f <filter> ## filter, o/w no filtering \n\n") args <- commandArgs(TRUE) if ( length( args ) == 1 && args =="--help") { write(argspec, stderr()) q(); } lib.load.quiet <- function( package ) { package <- as.character(substitute(package)) suppressPackageStartupMessages( do.call( "library", list( package=package ) ) ) } lib.load.quiet(getopt) lib.load.quiet( gplots ) lib.load.quiet( amap ) ## if any of the faster clustering methods are available on this system, load them if ( any( c( 'flashClust', 'fastcluster' ) %in% installed.packages() ) ) { if ( 'flashClust' %in% installed.packages() ) { lib.load.quiet( flashClust ) } else { if ( 'fastcluster' %in% installed.packages() ) { lib.load.quiet( fastcluster ) } } } ##lib.load.quiet(ConsensusClusterPlus) lib.load.quiet( amap ) lib.load.quiet( cluster ) ################### ## code borrowed/updated from ConsensusClusterPlus ################### ConsensusClusterPlus <- function( d=NULL, maxK = 3, reps=10, pItem=0.8, pFeature=1, clusterAlg="hc", title="untitled_consensus_cluster", innerLinkage="average", finalLinkage="average", distance=ifelse( inherits(d,"dist"), attr( d, "method" ), "euclidean" ), ml=NULL, tmyPal=NULL, seed=NULL, plot=NULL, writeTable=FALSE, weightsItem=NULL, weightsFeature=NULL, verbose=F ) { ##description: runs consensus subsamples if(is.null(seed)==TRUE){ seed=timeSeed = as.numeric(Sys.time()) } set.seed(seed) if(is.null(ml)==TRUE){ if ( inherits( distance, "dist" ) ) { stop( "If you want to pass in a pre-calculated distance object, pass it in as the data, rather than the distance parameter\n" ) } if ( ! class( d ) %in% c( "dist", "matrix", "ExpressionSet" ) ) { stop("d must be a matrix, distance object or ExpressionSet (eset object)") } if ( inherits( d, "dist" ) ) { ## if d is a distance matrix, fix a few things so that they don't cause problems with the analysis ## Note, assumption is that if d is a distance matrix, the user doesn't want to sample over the row features if ( is.null( attr( d, "method" ) ) ) { attr( d, "method" ) <- distance <- "unknown - user-specified" } if ( is.null( distance ) || ( distance != attr( d, "method" ) ) ) { distance <- attr( d, "method" ) } if ( ( ! is.null( pFeature ) ) && ( pFeature < 1 ) ) { if ( verbose ) warning( "Cannot use the pFeatures parameter when specifying a distance matrix as the data object\n" ) pFeature <- 1 } if ( ! is.null( weightsFeature ) ) { if ( verbose ) warning( "Cannot use the weightsFeature parameter when specifying a distance matrix as the data object\n" ) weightsFeature <- NULL } if ( clusterAlg == "km" ) { if ( verbose ) warning( "You are asking CCPLUS to use K-means to cluster a distance matrix (rather than the data itself) - this may produce unintended results. We suggest using PAM if you want to use alternate distance metrics/objects\n" ) ##d <- as.matrix( d ) #this is now done w/in ccRun } } else { if ( is.null( distance ) ) { ## we should never get here, but just in case distance <- "pearson" } } if ( ( clusterAlg == "km" ) && inherits( distance, "character" ) && ( distance != "euclidean" ) ) { warning( "WARNING: kmeans can only use the euclidean distance metric. If you would like to use an alternate metric, we suggest using PAM or HC clustering instead. This parameter combinationwill use k-means, but will NOT use the specified distance metric\n" ) distance <- 'euclidean' } if ( inherits( d,"ExpressionSet" ) ) { d <- exprs(d) } ml <- ccRun( d=d, maxK=maxK, repCount=reps, diss=inherits(d,"dist"), pItem=pItem, pFeature=pFeature, innerLinkage=innerLinkage, clusterAlg=clusterAlg, weightsFeature=weightsFeature, weightsItem=weightsItem, distance=distance, verbose=verbose) } res=list(); ##make results directory if((is.null(plot)==FALSE | writeTable) & !file.exists(paste(title,sep=""))){ dir.create(paste(title,sep="")) } ##write log file log <- matrix( ncol=2, byrow=T, c("title",title, "maxK",maxK, "input matrix rows",ifelse ( inherits( d, "matrix" ), nrow(d), "dist-mat" ), "input matric columns",ifelse ( inherits( d, "matrix" ), ncol(d), ncol( as.matrix(d) ) ), "number of bootstraps",reps, "item subsampling proportion",pItem, "feature subsampling proportion",ifelse( is.null(pFeature), 1, pFeature ), "cluster algorithm",clusterAlg, "inner linkage type",innerLinkage, "final linkage type",finalLinkage, "correlation method",distance, "plot",if(is.null(plot)) NA else plot, "seed",if(is.null(seed)) NA else seed)) colnames(log) = c("option","value") if(writeTable){ write.csv(file=paste(title,"/",title,".log.csv",sep=""), log,row.names=F) } if(is.null(plot)){ ##nothing }else if(plot=="png"){ png(paste(title,"/","consensus%03d.png",sep="")) }else if (plot=="pdf"){ pdf(onefile=TRUE, paste(title,"/","consensus.pdf",sep="")) }else if (plot=="ps"){ postscript(onefile=TRUE, paste(title,"/","consensus.ps",sep="")) } colorList=list() colorM = rbind() #matrix of colors. #18 colors for marking different clusters thisPal <- c("#A6CEE3","#1F78B4","#B2DF8A","#33A02C","#FB9A99","#E31A1C","#FDBF6F","#FF7F00","#CAB2D6","#6A3D9A","#FFFF99","#B15928", "#bd18ea", #magenta "#2ef4ca", #aqua "#f4cced", #pink, "#f4cc03", #lightorange "#05188a", #navy, "#e5a25a", #light brown "#06f106", #bright green "#85848f", #med gray "#000000", #black "#076f25", #dark green "#93cd7f",#lime green "#4d0776", #dark purple "#ffffff" #white ) ##plot scale colBreaks=NA if(is.null(tmyPal)==TRUE){ colBreaks=10 tmyPal = myPal(colBreaks) }else{ colBreaks=length(tmyPal) } sc = cbind(seq(0,1,by=1/( colBreaks) )); rownames(sc) = sc[,1] sc = cbind(sc,sc) heatmap(sc, Colv=NA, Rowv=NA, symm=FALSE, scale='none', col=tmyPal, na.rm=TRUE,labRow=rownames(sc),labCol=F,main="consensus matrix legend") for (tk in 2:maxK){ if(verbose){ message(paste("consensus ",tk)) } fm = ml[[tk]] hc=hclust( as.dist( 1 - fm ), method=finalLinkage); message("clustered") ct = cutree(hc,tk) names(ct) = colnames(d) c = fm ##colnames(c) = colnames(d) ##rownames(c) = colnames(d) colorList = setClusterColors(res[[tk-1]][[3]],ct,thisPal,colorList) pc = c pc=pc[hc$order,] #***pc is matrix for plotting, same as c but is row-ordered and has names and extra row of zeros. pc = rbind(pc,0) heatmap(pc, Colv=as.dendrogram(hc), Rowv=NA, symm=FALSE, scale='none', col=tmyPal, na.rm=TRUE,labRow=F,labCol=F,mar=c(5,5),main=paste("consensus matrix k=",tk,sep="") , ColSideCol=colorList[[1]]) legend("topright",legend=unique(ct),fill=unique(colorList[[1]]),horiz=FALSE ) res[[tk]] = list(consensusMatrix=c,consensusTree=hc,consensusClass=ct,ml=ml[[tk]],clrs=colorList) colorM = rbind(colorM,colorList[[1]]) } CDF(ml) clusterTrackingPlot(colorM[,res[[length(res)]]$consensusTree$order]) if(is.null(plot)==FALSE){ dev.off(); } res[[1]] = colorM if(writeTable){ for(i in 2:length(res)){ write.csv(file=paste(title,"/",title,".k=",i,".consensusMatrix.csv",sep=""), res[[i]]$consensusMatrix) write.table(file=paste(title,"/",title,".k=",i,".consensusClass.csv",sep=""), res[[i]]$consensusClass,col.names = F,sep=",") } } return(res) } calcICL = function(res,title="untitled_consensus_cluster",plot=NULL,writeTable=FALSE){ #calculates and plots cluster consensus and item consensus cc=rbind() cci = rbind() sumRes=list() colorsArr=c() #make results directory if((is.null(plot)==FALSE | writeTable) & !file.exists(paste(title,sep=""))){ dir.create(paste(title,sep="")) } if(is.null(plot)){ #to screen }else if(plot=="pdf"){ pdf(onefile=TRUE, paste(title,"/","icl.pdf",sep="")) }else if(plot=="ps"){ postscript(onefile=TRUE, paste(title,"/","icl.ps",sep="")) }else if (plot=="png"){ png(paste(title,"/","icl%03d.png",sep="")) } par(mfrow=c(3,1),mar=c(4,3,2,0)) for (k in 2:length(res)){ #each k eiCols = c(); o = res[[k]] m = o$consensusMatrix m = triangle(m,mode=2) for (ci in sort(unique(o$consensusClass))){ #each cluster in k items = which(o$consensusClass==ci) nk = length(items) mk = sum( m[items,items], na.rm=T)/((nk*(nk-1))/2) cc=rbind(cc,c(k,ci,mk)) #cluster-consensus for (ei in rev(res[[2]]$consensusTree$order) ){ denom = if (ei %in% items) { nk - 1} else { nk } mei = sum( c(m[ei,items],m[items,ei]), na.rm=T)/denom # mean item consensus to a cluster. cci = rbind(cci,c(k,ci,ei,mei)) #cluster, cluster index, item index, item-consensus } eiCols = c(eiCols, rep(ci,length(o$consensusClass)) ) } cck = cci[which(cci[,1]==k),] #only plot the new k data. #group by item, order by cluster i w=lapply(split(cck,cck[,3]), function(x) { y=matrix(unlist(x),ncol=4); y[order(y[,2]),4] }) q = matrix(as.numeric(unlist(w)),ncol=length(w),byrow=F) q = q[,res[[2]]$consensusTree$order] #order by leave order of k=2 #q is a matrix of k rows and sample columns, values are item consensus of sample to the cluster. thisColors = unique(cbind(res[[k]]$consensusClass,res[[k]]$clrs[[1]])) thisColors=thisColors[order(as.numeric(thisColors[,1])),2] colorsArr=c(colorsArr,thisColors) sumRes[[k]] = rankedBarPlot(q,thisColors,cc=res[[k]]$consensusClass[res[[2]]$consensusTree$order],paste("k=",k,sep="") ) } ys=cs=lab=c() lastk=cc[1,1] for(i in 1:length(colorsArr)){ if(lastk != cc[i,1]){ ys=c(ys,0,0) cs=c(cs,NA,NA) lastk=cc[i,1] lab=c(lab,NA,NA) } ys=c(ys,cc[i,3]) cs=c(cs,colorsArr[i]) lab=c(lab,cc[i,1]) } names(ys) = lab par(mfrow=c(3,1),mar=c(4,3,2,0)) barplot(ys,col=cs,border=cs,main="cluster-consensus",ylim=c(0,1),las=1) if(is.null(plot)==FALSE){ dev.off() } colnames(cc) = c("k","cluster","clusterConsensus") colnames(cci) = c("k","cluster","item","itemConsensus") cci[,"item"] = names(res[[2]]$consensusClass)[ cci[,"item"] ] #type cci cci = data.frame( k=as.numeric(cci[,"k"]), cluster=as.numeric(cci[,"cluster"]), item=cci[,"item"], itemConsensus=as.numeric(cci[,"itemConsensus"])) #write to file. if(writeTable){ write.csv(file=paste(title,"/",title,".summary.cluster.consensus.csv",sep=""),row.names=F, cc) write.csv(file=paste(title,"/",title,".summary.item.consensus.csv",sep=""), row.names=F, cc) } return(list(clusterConsensus=cc,itemConsensus=cci)) } ccRun <- function( d=d, maxK=NULL, repCount=NULL, diss=inherits( d, "dist" ), pItem=NULL, pFeature=NULL, innerLinkage=NULL, distance=ifelse( inherits(d,"dist"), attr( d, "method" ), "euclidean" ), clusterAlg=NULL, weightsItem=NULL, weightsFeature=NULL, verbose=NULL) { m = vector(mode='list', repCount) ml = vector(mode="list",maxK) n <- ifelse( diss, ncol( as.matrix(d) ), ncol(d) ) mCount = mConsist = matrix(c(0),ncol=n,nrow=n) ml[[1]] = c(0); if (is.null( distance ) ) distance <- 'euclidean' ## necessary if d is a dist object and attr( d, "method" ) == NULLa require( amap ) ## we're going to use the amap Dist function, but they misname their correlation ## functions, so re-name them correctly amap.distance <- c( "euclidean", "maximum", "manhattan", "canberra", "binary", "pearson", "abspearson", "correlation", "abscorrelation", "spearman", "kendall" ) names( amap.distance ) <- c( "euclidean", "maximum", "manhattan", "canberra", "binary", "cosine", "abscosine", "pearson", "abspearson", "spearman", "kendall" ) main.dist.obj <- NULL ##browser() if ( diss ){ main.dist.obj <- d ## reset the pFeature & weightsFeature params if they've been set (irrelevant if d is a dist matrix) if ( ( !is.null(pFeature) ) && ( pFeature < 1 ) ) { if (verbose) warning( "user-supplied data is a distance matrix; ignoring user-specified pFeature parameter\n" ) pFeature <- 1 # set it to 1 to avoid problems with sampleCols } if ( ! is.null( weightsFeature ) ) { if (verbose) warning( "user-supplied data is a distance matrix; ignoring user-specified weightsFeature parameter\n" ) weightsFeature <- NULL # set it to NULL to avoid problems with sampleCols } } else { ## d is a data matrix ## we're not sampling over the features if ( ( clusterAlg != "km" ) && ( is.null( pFeature ) || ( ( pFeature == 1 ) && is.null( weightsFeature ) ) ) ) { ## only generate a main.dist.object IFF 1) d is a matrix, 2) we're not sampling the features, and 3) the algorithm isn't 'km' if ( inherits( distance, "character" ) ) { if ( ! distance %in% names( amap.distance ) ) stop("unsupported distance.") main.dist.obj <- Dist( t(d), method=as.character( amap.distance[ distance ] ) ) ## now fix dumb amap naming convention for distance metrics attr( main.dist.obj, "method" ) <- as.character( amap.distance[ distance ] ) } else stop("unsupported distance specified.") } else { ## pFeature < 1 or a weightsFeature != NULL ## since d is a data matrix, the user wants to sample over the gene features, so main.dist.obj is left as NULL } } for (i in 1:repCount){ ##browser() if(verbose){ message(paste("random subsample",i)); } ## take expression matrix sample, samples and genes sample_x = sampleCols( d, pItem, pFeature, weightsItem, weightsFeature ) this_dist = NA if ( ! is.null( main.dist.obj ) ) { boot.cols <- sample_x$subcols this_dist <- as.matrix( main.dist.obj )[ boot.cols, boot.cols ] if ( clusterAlg != "km" ) { ## if this isn't kmeans, then convert to a distance object this_dist <- as.dist( this_dist ) attr( this_dist, "method" ) <- attr( main.dist.obj, "method" ) } } else { ## if main.dist.obj is NULL, then d is a data matrix, and either: ## 1) clusterAlg is 'km' ## 2) pFeatures < 1 or weightsFeatures have been specified, or ## 3) both ## so we can't use a main distance object and for every iteration, we will have to re-calculate either ## 1) the distance matrix (because we're also sampling the features as well), or ## 2) the submat (if using km) if ( clusterAlg != "km" ) { if ( ! distance %in% names( amap.distance ) ) stop("unsupported distance.") ## good, we have a supported distance type this_dist <- Dist( t( sample_x$submat ), method=as.character( amap.distance[ distance ] ) ) ## now fix dumb amap naming convention for distance metrics attr( this_dist, "method" ) <- as.character( amap.distance[ distance ] ) } else { ##browser() ##clusterAlg == "km" ## if we're not sampling the features, then grab the colslice if ( is.null( pFeature ) || ( ( pFeature == 1 ) && is.null( weightsFeature ) ) ) { this_dist <- d[, sample_x$subcols ] } else { if ( is.na( sample_x$submat ) ) { save( "ccrun.submat.eq.na.dbg.rda" ) stop( "Houston, we have a problem. sample_x$submat is NA in ccRun when it should be specified - saving state\n" ) } this_dist <- sample_x$submat } } } ## cluster samples for HC. this_cluster=NA if(clusterAlg=="hc"){ this_cluster = hclust( this_dist, method=innerLinkage) } ##browser() ##mCount is possible number of times that two sample occur in same random sample, independent of k ##mCount stores number of times a sample pair was sampled together. mCount <- connectivityMatrix( rep( 1,length(sample_x[[3]])), mCount, sample_x[[3]] ) ##use samples for each k for (k in 2:maxK){ if(verbose){ message(paste(" k =",k)) } if (i==1){ ml[[k]] = mConsist #initialize } this_assignment=NA if(clusterAlg=="hc"){ ##prune to k for hc this_assignment = cutree(this_cluster,k) ##browser() }else if(clusterAlg=="km"){ ##this_dist should now be a matrix corresponding to the result from sampleCols this_assignment <- kmeans( t( this_dist ), k, iter.max = 10, nstart = 1, algorithm = c("Hartigan-Wong") )$cluster }else if ( clusterAlg == "pam" ) { require( cluster ) this_assignment <- pam( x=this_dist, k, diss=TRUE, metric=distance, cluster.only=TRUE ) } else{ ##optional cluterArg Hook. this_assignment <- get(clusterAlg)(this_dist, k) } ##add to tally ml[[k]] <- connectivityMatrix( this_assignment, ml[[k]], sample_x[[3]] ) } } ##consensus fraction res = vector(mode="list",maxK) for (k in 2:maxK){ ##fill in other half of matrix for tally and count. tmp = triangle(ml[[k]],mode=3) tmpCount = triangle(mCount,mode=3) res[[k]] = tmp / tmpCount res[[k]][which(tmpCount==0)] = 0 } message("end fraction") return(res) } connectivityMatrix <- function( clusterAssignments, m, sampleKey){ ##input: named vector of cluster assignments, matrix to add connectivities ##output: connectivity matrix names( clusterAssignments ) <- sampleKey cls <- lapply( unique( clusterAssignments ), function(i) as.numeric( names( clusterAssignments[ clusterAssignments %in% i ] ) ) ) for ( i in 1:length( cls ) ) { nelts <- 1:ncol( m ) cl <- as.numeric( nelts %in% cls[[i]] ) ## produces a binary vector updt <- outer( cl, cl ) m <- m + updt } return(m) } ## returns a list with the sample columns, as well as the sub-matrix & sample features (if necessary) ## if no sampling over the features is performed, the submatrix & sample features are returned as NAs ## to reduce memory overhead sampleCols <- function( d, pSamp=NULL, pRow=NULL, weightsItem=NULL, weightsFeature=NULL ){ space <- ifelse( inherits( d, "dist" ), ncol( as.matrix(d) ), ncol(d) ) sampleN <- floor(space*pSamp) sampCols <- sort( sample(space, sampleN, replace = FALSE, prob = weightsItem) ) this_sample <- sampRows <- NA if ( inherits( d, "matrix" ) ) { if ( (! is.null( pRow ) ) && ( (pRow < 1 ) || (! is.null( weightsFeature ) ) ) ) { ## only sample the rows and generate a sub-matrix if we're sampling over the row/gene/features space = nrow(d) sampleN = floor(space*pRow) sampRows = sort( sample(space, sampleN, replace = FALSE, prob = weightsFeature) ) this_sample <- d[sampRows,sampCols] dimnames(this_sample) <- NULL } else { ## do nothing } } return( list( submat=this_sample, subrows=sampRows, subcols=sampCols ) ) } CDF=function(ml,breaks=100){ #plot CDF distribution plot(c(0),xlim=c(0,1),ylim=c(0,1),col="white",bg="white",xlab="consensus index",ylab="CDF",main="consensus CDF", las=2) k=length(ml) this_colors = rainbow(k-1) areaK = c() for (i in 2:length(ml)){ v=triangle(ml[[i]],mode=1) #empirical CDF distribution. default number of breaks is 100 h = hist(v, plot=FALSE, breaks=seq(0,1,by=1/breaks)) h$counts = cumsum(h$counts)/sum(h$counts) #calculate area under CDF curve, by histogram method. thisArea=0 for (bi in 1:(length(h$breaks)-1)){ thisArea = thisArea + h$counts[bi]*(h$breaks[bi+1]-h$breaks[bi]) #increment by height by width bi = bi + 1 } areaK = c(areaK,thisArea) lines(h$mids,h$counts,col=this_colors[i-1],lwd=2,type='l') } legend(0.8,0.5,legend=paste(rep("",k-1),seq(2,k,by=1),sep=""),fill=this_colors) #plot area under CDF change. deltaK=areaK[1] #initial auc at k=2 for(i in 2:(length(areaK))){ #proportional increase relative to prior K. deltaK = c(deltaK,( areaK[i] - areaK[i-1])/areaK[i-1]) } plot(1+(1:length(deltaK)),y=deltaK,xlab="k",ylab="relative change in area under CDF curve",main="Delta area",type="b") } myPal = function(n=10){ #returns n colors seq = rev(seq(0,255,by=255/(n))) palRGB = cbind(seq,seq,255) rgb(palRGB,maxColorValue=255) } setClusterColors = function(past_ct,ct,colorU,colorList){ #description: sets common color of clusters between different K newColors = c() if(length(colorList)==0){ #k==2 newColors = colorU[ct] colori=2 }else{ newColors = rep(NULL,length(ct)) colori = colorList[[2]] mo=table(past_ct,ct) m=mo/apply(mo,1,sum) for(tci in 1:ncol(m)){ # for each cluster maxC = max(m[,tci]) pci = which(m[,tci] == maxC) if( sum(m[,tci]==maxC)==1 & max(m[pci,])==maxC & sum(m[pci,]==maxC)==1 ) { #if new column maximum is unique, same cell is row maximum and is also unique ##Note: the greatest of the prior clusters' members are the greatest in a current cluster's members. newColors[which(ct==tci)] = unique(colorList[[1]][which(past_ct==pci)]) # one value }else{ #add new color. colori=colori+1 newColors[which(ct==tci)] = colorU[colori] } } } return(list(newColors,colori,unique(newColors) )) } clusterTrackingPlot = function(m){ #description: plots cluster tracking plot #input: m - matrix where rows are k, columns are samples, and values are cluster assignments. plot(NULL,xlim=c(-0.1,1),ylim=c(0,1),axes=FALSE,xlab="samples",ylab="k",main="tracking plot") for(i in 1:nrow(m)){ rect( xleft=seq(0,1-1/ncol(m),by=1/ncol(m)), ybottom=rep(1-i/nrow(m),ncol(m)) , xright=seq(1/ncol(m),1,by=1/ncol(m)), ytop=rep(1-(i-1)/nrow(m),ncol(m)), col=m[i,],border=NA) } #hatch lines to indicate samples xl = seq(0,1-1/ncol(m),by=1/ncol(m)) segments( xl, rep(-0.1,ncol(m)) , xl, rep(0,ncol(m)), col="black") #** alt white and black color? ypos = seq(1,0,by=-1/nrow(m))-1/(2*nrow(m)) text(x=-0.1,y=ypos[-length(ypos)],labels=seq(2,nrow(m)+1,by=1)) } triangle = function(m,mode=1){ #mode=1 for CDF, vector of lower triangle. #mode==3 for full matrix. #mode==2 for calcICL; nonredundant half matrix coun #mode!=1 for summary n=dim(m)[1] nm = matrix(0,ncol=n,nrow=n) fm = m nm[upper.tri(nm)] = m[upper.tri(m)] #only upper half fm = t(nm)+nm diag(fm) = diag(m) nm=fm nm[upper.tri(nm)] = NA diag(nm) = NA vm = m[lower.tri(nm)] if(mode==1){ return(vm) #vector }else if(mode==3){ return(fm) #return full matrix }else if(mode == 2){ return(nm) #returns lower triangle and no diagonal. no double counts. } } rankedBarPlot=function(d,myc,cc,title){ colors = rbind() #each row is a barplot series byRank = cbind() spaceh = 0.1 #space between bars for(i in 1:ncol(d)){ byRank = cbind(byRank,sort(d[,i],na.last=F)) colors = rbind(colors,order(d[,i],na.last=F)) } maxH = max(c(1.5,apply(byRank,2,sum)),na.rm=T) #maximum height of graph #barplot largest to smallest so that smallest is in front. barp = barplot( apply(byRank,2,sum) , col=myc[colors[,1]] ,space=spaceh,ylim=c(0,maxH),main=paste("item-consensus", title),border=NA,las=1 ) for(i in 2:nrow(byRank)){ barplot( apply(matrix(byRank[i:nrow(byRank),],ncol=ncol(byRank)) ,2,sum), space=spaceh,col=myc[colors[,i]],ylim=c(0,maxH), add=T,border=NA,las=1 ) } xr=seq(spaceh,ncol(d)+ncol(d)*spaceh,(ncol(d)+ncol(d)*spaceh)/ncol(d) ) #class labels as asterisks text("*",x=xr+0.5,y=maxH,col=myc[cc],cex=1.4) #rect(xr,1.4,xr+1,1.5,col=myc[cc] ) } ###################################################################3333 ## RESTART MY SCRIPTS HERE spec <- matrix( c( "data.fname", "d", 1, "character", "direction", "n", 2, "character", "output.name", "o", 2, "character", "cluster.alg", "a", 2, "character", "distance.metric", "m", 2, "character", "max.k", "k", 2, "integer", "reps", "r", 2, "integer", "innerLinkage", "i", 1, "character", "finalLinkage", "f", 1, "character", "out.report.dir", "p", 2, "character", "out.report.html", "h", 2, "character" ), nc=4, byrow=TRUE ) opt <- getopt( spec=spec ) ## default params for non-required params if ( is.null( opt$direction ) ) { opt$direction <- "cols" } if ( is.null( opt$cluster.alg ) ) { opt$cluster.alg <- "pam" } if ( is.null( opt$output.name ) ) { opt$output.name <- "consensus.cluster.result" } if ( is.null( opt$distance.metric ) ) { opt$distance.metric <- "cosine" } if ( is.null( opt$max.k ) ) { opt$max.k <- 10 } if ( is.null( opt$reps ) ) { opt$reps <- 1000 } if ( is.null( opt$innerLinkage ) ) { opt$innerLinkage <- "average" } if ( is.null( opt$finalLinkage ) ) { opt$finalLinkage <- "average" } if ( is.null( opt$out.report.dir ) ) { opt$out.report.dir <- "report" } if ( is.null( opt$out.report.html ) ) { opt$out.report.html <- file.path( "report", "index.html" ) } ## validate params here (make sure set to valid values) if ( !opt$cluster.alg %in% c( "hc", "km", "pam" ) ) { stop( "invalid clustering algorithm specified", cluster.alg ) } data <- as.matrix( read.delim( opt$data.fname, header=T, row.names=1 , check.names=FALSE ) ) ## transpose the matrix if we want to cluster the rows (genes) if ( opt$direction == "rows" ) { data <- t( data ) } title <- paste( opt$cluster.alg, opt$output.name, sep="." ) results <- ConsensusClusterPlus( data, maxK=opt$max.k, reps=opt$reps, pItem=0.8, pFeature=1, title=opt$out.report.dir, clusterAlg=opt$cluster.alg, distance=opt$distance.metric, innerLinkage=opt$innerLinkage, finalLinkage=opt$finalLinkage, plot='png', writeTable=FALSE, seed=100, ##weightsFeature=abs( rnorm( nrow( orig.data ) ) ), verbose=FALSE ) pngs = list.files(path=opt$out.report.dir, patt="png") html.out <- paste( "<html>", paste( paste( "<div><img src=\'", pngs, sep="" ), "\'/></div>", sep="" ), "</html>" ) cat( html.out, file=opt$out.report.html ) ## re-transpose the matrix back if we've clustered the rows (genes) if ( opt$direction == "rows" ) { data <- t( data ) } save( file=opt$output.name, data, results)