Mercurial > repos > peter-waltman > ucsc_cluster_tools2
diff cluster.tools/select.k.from.consensus.cluster.R @ 0:0decf3fd54bc draft
Uploaded
author | peter-waltman |
---|---|
date | Thu, 28 Feb 2013 01:45:39 -0500 |
parents | |
children | a58527c632b7 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cluster.tools/select.k.from.consensus.cluster.R Thu Feb 28 01:45:39 2013 -0500 @@ -0,0 +1,338 @@ +#!/usr/bin/env Rscript +# Consensus Clustering Script by Peter Waltman +# June 1, 2012 +# License under Creative Commons Attribution 3.0 Unported (CC BY 3.0) +# +##usage, options and doc goes here +argspec <- c("select.k.from.consensus.clust4er.R takes a clustering from ConsensusClusterPlus +and clinical survival data and determines the right k to use. + + Usage: + select.k.from.consensus.cluster.R -r <results_file> + Optional: + -o output.png # default is stdout + -c change.min + -m metric (must be either: + rel.change, + angle, + silhouette (must specify data matrix) + survival (must specify survival data; uses minimal cummulative log-rank p-value) + -d data ## for calculating silhouette plots (plotted, but not used unless specified) + -s survival.data.fname (plotted, but not used unless specified) + -e survival.comp (can be either all, one or both - see the mode param for gen.survival.curves for explanation) + -z survival analysis script to be called (defaults to galaxy.gen.survival.curves.R) +\n\n") +args <- commandArgs(TRUE) +if ( length( args ) == 1 && args =="--help") { + write( argspec, stderr() ) + q(); +} + +lib.load.quiet <- function( package ) { + package <- as.character(substitute(package)) + suppressPackageStartupMessages( do.call( "library", list( package=package ) ) ) +} +lib.load.quiet(getopt) +lib.load.quiet( amap ) +lib.load.quiet( cluster ) + +spec <- matrix( c( "results.file", "r", 1, "character", + "change.min", "c", 2, "double", + "metric", "m", 2, "character", + "survival.data", "s", 2, "character", + "survival.comp", "e", 2, "character", + "survival.script", "z", 2, "character", + "output.format", "f", 2, "character", + "cluster.class.out", "o", 2, "character", + "output.report.dir", "p", 2, "character", + "output.report.html", "h", 2, "character" + ), + nc=4, + byrow=T + ) +opt <- getopt( spec=spec ) + +## default params for non-required params +if ( is.null( opt$output.report.dir ) ) { opt$output.report.dir <- "report" } +if ( is.null( opt$output.report.html ) ) { opt$output.report.html <- "report/index.html" } + +if ( is.null( opt$change.min ) ) { opt$change.min <- 0.075 } +if ( is.null( opt$metric ) ) { opt$metric <- "difference" } ## alternate is angle } +if ( is.null( opt$survival.comp ) ) { opt$survival.comp <- "all" } ## alternate is one or both } +if ( is.null( opt$survival.script ) ) { opt$survival.script <- "galaxy.gen.survival.curves.R" } ## alternate is one or both } +if ( is.null( opt$cluster.class.out) ) { opt$cluster.class.out <- "select.cls.rda" } + +if ( !file.exists( opt$output.report.dir ) ){ + dir.create(opt$output.report.dir) +} + +if ( ! opt$metric %in% c( "difference", "angle", "silhouette", "survival" ) ) { + stop( "invalid metric specified ", opt$metric, "\n" ) +} + + +opt$change.min <- as.numeric( opt$change.min ) +if ( abs( opt$change.min ) > 1 ) { + stop( "invalid angle specified:", opt$change.min, "Please specify angle in rangel [-1,0]\n" ) +} +if ( opt$metric=="angle" && opt$change.min > 0 ) { + opt$change.min <- -opt$change.min + cat( "Using", opt$change.min, "for minimum angle\n" ) +} + +if ( opt$metric == "survival" && + ( is.null( opt$survival.data ) || + (! file.exists( opt$survival.data ) ) ) + ) { + stop( "Must provide valid survival file in order to use survival as metric\n" ) +} + + +## From the ConsensusClusterPlust package - modified by phw +CDF <- function( ml, + breaks=1000, + plot.it=TRUE ){ + if ( class(ml[[1]])=="matrix" && ( names( ml[1] ) =="2" ) ) { + ml <- c( c(0), ml ) + } + ##plot CDF distribution + if ( plot.it ) { + plot( c(0), + xlim=c(0,1), + ylim=c(0,1), + col="white", + bg="white", + xlab="consensus index", + ylab="CDF", + main="consensus CDF", + las=2 ) + } + + k=length(ml) + this_colors <- rainbow(k-1) + areaK <- c() + for (i in 2:length( ml ) ) { + v <- ml[[i]] + v <- v[ lower.tri(v) ] + + #empirical CDF distribution. default number of breaks is 100 + h = hist(v, plot=FALSE, breaks=seq(0,1,by=1/breaks)) + h$counts = cumsum(h$counts)/sum(h$counts) + + #calculate area under CDF curve, by histogram method. + thisArea=0 + for (bi in 1:(length(h$breaks)-1)){ + thisArea = thisArea + h$counts[bi]*(h$breaks[bi+1]-h$breaks[bi]) #increment by height by width + } + areaK = c(areaK,thisArea) + if ( plot.it ) lines(h$mids,h$counts,col=this_colors[i-1],lwd=2,type='l') + } + if ( plot.it ) legend(0.8,0.5,legend=paste(rep("",k-1),seq(2,k,by=1),sep=""),fill=this_colors) + + #Calc area under CDF change. + deltaK=areaK[1] #initial auc at k=2 + for(i in 2:(length(areaK))){ + #proportional increase relative to prior K. + deltaK = c(deltaK,( areaK[i] - areaK[i-1])/areaK[i-1]) + } + return ( list( areaK=areaK, deltaK=deltaK ) ) +} + + +load( opt$results.file ) + +if ( opt$metric == "silhouette" ) { + if ( ! exists( 'data' ) && ( class( data ) != "matrix" ) ) { + stop( "Must provide valid data matrix in order to use silhouette as metric\n" ) + } +} +cons.matrices <- lapply( results[ 2:length(results) ], '[[', 'consensusMatrix' ) +cls <- lapply( results[ 2:length(results) ], function( res ) return( res$consensusClass[ res$consensusTree$order ] ) ) ##'[[', 'consensusClass' ) +names( cons.matrices ) <- names( cls ) <- 2:length( results ) + +png.fname <- file.path( opt$output.report.dir, "consensus.sel.criteria.CDF.png") +plot.dev <- png( png.fname, + width=11, + height=8.5, + units='in', + res=72 ) +## this will calculate the CDF, plus plot them +rel.delta <- CDF( cons.matrices, breaks=1000, plot.it=TRUE )$deltaK +dev.off() +names( rel.delta ) <- seq( from=2, by=1, length=length( rel.delta ) ) +vector.of.metric.changes <- rel.delta + +main.txt <- ", per Size K" +ylab.txt <- "" + +main.txt <- paste( "Relative Change in Area", main.txt, sep="" ) +ylab.txt <- paste( "relative change in area under CDF curve", ylab.txt, sep="" ) +png.fname <- file.path( opt$output.report.dir, "consensus.sel.criteria.diff.png") + +plot.dev <- png( png.fname, + width=11, + height=8.5, + units='in', + res=72 ) +plot( as.numeric( names( vector.of.metric.changes ) ), + vector.of.metric.changes, + main=main.txt, + ylab=ylab.txt, + xlab="Cluster size (K)", + type='b' ) +dev.off() + +k.select <- vector.of.metric.changes[ vector.of.metric.changes < opt$change.min ] +if ( length( k.select ) > 1 ) { + k.select <- k.select[1] +} else { + if ( length( k.select ) == 0 ) { + k.select <- vector.of.metric.changes[ length( vector.of.metric.changes ) ] + } else { + ## do nothing + } +} +k.select <- as.numeric( names( k.select ) ) +## find the search range +k.search.range <- (k.select-2):(k.select+2) +k.search.range <- k.search.range[ k.search.range %in% as.numeric( names( vector.of.metric.changes ) ) ] +k.search.range <- vector.of.metric.changes[ as.character( k.search.range ) ] +k.search.range <- k.search.range[ k.search.range < 0.25 ] +k.search.range <- k.search.range[ k.search.range > 0.025 ] +k.search.range <- names( k.search.range ) + + +if ( exists("data") ) { + ## what direction is the clustering in? rows or cols? + elts <- unique( names( results[[2]]$consensusClass ) ) + if ( all( elts %in% colnames( data ) ) ) { + ## sample clusters + data.dist <- dist( t( data ) ) + cls <- lapply( cls, function( x ) return( x[ colnames( data ) ] ) ) + } else if ( all( elts %in% rownames( data ) ) ) { + data.dist <- dist( data ) + cls <- lapply( cls, function( x ) return( x[ rownames( data ) ] ) ) + } else { + stop( "incompatible cluster results and data matrix\n" ) + } + + + sils <- lapply( cls, + silhouette, + dist=data.dist ) + sils <- sapply( sils, + function(x) { + return( summary( x )$avg.width ) + } + ) + + png.fname <- file.path( opt$output.report.dir, "consensus.sel.silhouette.png") + + plot.dev <- png( png.fname, + width=11, + height=8.5, + units='in', + res=72 ) + plot( as.numeric( names( sils ) ), + sils, + main="Average Silhouette Widths, per Cluster Size K", + ylab="average silhouette width (correlation distance)", + xlab="Cluster size (K)", + type='b' ) + dev.off() + + ## if the metric is silhouette, use that (but only over the k's that are on the rel-change "elbow" + if ( opt$metric == "silhouette" ) { + names( sils ) <- names( cls ) + + sils <- sils[ k.search.range ] + k.select <- as.numeric( names( sils[ sils == max( sils, na.rm=T ) ] ) ) + } +} + +if ( ! is.null( opt$survival.data ) ) { + if ( ! file.exists( opt$survival.data ) ) { + stop( 'specified clinical/survival file can not be found:', opt$survival.data, "\n" ) + } + + if ( opt$metric == "survival" ) { + pvals <- numeric() + + for ( cl in cls ) { + + cons.class.file <- tempfile( "tmp.class.rdata" ) + save( file=cons.class.file, cl ) + + cmd.string <- opt$survival.script + + ## get the consensusClass file that's associated with the k.select + cmd.string <- paste( cmd.string, "-C", cons.class.file ) + cmd.string <- paste( cmd.string, "-S", opt$survival.data ) + cmd.string <- paste( cmd.string, "-M", opt$survival.comp ) + cmd.string <- paste( cmd.string, "-P" ) + pvals <- c( pvals, as.numeric( system( cmd.string, intern=T ) ) ) + } + names( pvals ) <- names( cls ) + + + png.fname <- file.path( opt$output.report.dir, "consensus.sel.criteria.survival.png" ) + + plot.dev <- png( png.fname, + width=11, + height=8.5, + units='in', + res=72 ) + plot( as.numeric( names( pvals ) ), + -log( pvals ), + main="Average Log-rank p-values (-log), per Cluster Size K", + ylab="Average log-rank p-values (-log)", + xlab="Cluster size (K)", + type='b' ) + dev.off() + + + pvals <- pvals[ k.search.range ] + k.select <- as.numeric( names( pvals[ pvals == min( pvals, na.rm=T ) ] ) ) + } + + cmd.string <- opt$survival.script + + ## get the consensusClass file that's associated with the k.select + cl <- cls[[ as.character( k.select ) ]] + cl <- cbind( names( cl ), as.integer(cl) ) + colnames( cl ) <- c( "ID", "class" ) + write.table( cl, opt$cluster.class.out, sep="\t", row.names=FALSE, quote=FALSE ) + + cmd.string <- paste( cmd.string, "-c", opt$cluster.class.out ) + cmd.string <- paste( cmd.string, "-s", opt$survival.data ) + cmd.string <- paste( cmd.string, "-m", opt$survival.comp ) + + survival.out.file <- paste( opt$output.report.dir, "survival.png", sep="/" ) + cmd.string <- paste( cmd.string, "-o", survival.out.file ) + output <- system( cmd.string, intern=T ) + cat( output, sep="\n" ) +} else { + ## get the consensusClass file that's associated with the k.select + cl <- cls[[ as.character( k.select ) ]] + cl <- cbind( names( cl ), as.integer(cl) ) + colnames( cl ) <- c( "ID", "class" ) + write.table( cl, opt$cluster.class.out, sep="\t", row.names=FALSE, quote=FALSE ) +} + +treecl.res <- results[[ k.select ]]$consensusTree +## cl should already exist, but re-create it just in case +cl <- cls[[ as.character( k.select ) ]] + + +select.result <- results[[ k.select ]] +## over-write the tabular version of the opt$cluster.class.out with an RData file +save( file=opt$cluster.class.out, treecl.res, cl, select.result, data ) + +report_str = paste( "k selected by consensus clustering and user-specified metric, ", opt$metric, ", is ", k.select, "\n", sep="" ) + +pngs = list.files(path=opt$output.report.dir, patt="png") +html.out <- paste( "<html>", report_str, + paste( paste( paste( "<div><img src=\'", pngs, sep="" ), "\'/></div>", sep="" ), collapse=""), + "</html>" ) +cat( html.out, file=opt$output.report.html ) +