diff cluster.tools/select.k.from.consensus.cluster.R @ 2:b442996b66ae draft

Uploaded
author peter-waltman
date Wed, 27 Feb 2013 20:17:04 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/select.k.from.consensus.cluster.R	Wed Feb 27 20:17:04 2013 -0500
@@ -0,0 +1,340 @@
+#!/usr/bin/env Rscript
+# Consensus Clustering Script by Peter Waltman
+# June 1, 2012
+# License under Creative Commons Attribution 3.0 Unported (CC BY 3.0)
+#
+##usage, options and doc goes here
+argspec <- c("select.k.from.consensus.clust4er.R takes a clustering from ConsensusClusterPlus
+and clinical survival data and determines the right k to use.
+
+        Usage: 
+                select.k.from.consensus.cluster.R -r <results_file>
+        Optional:
+                -o output.png # default is stdout
+                -c change.min
+                -m metric (must be either:
+                                        rel.change,
+                                        angle,
+                                        silhouette (must specify data matrix)
+                                        survival (must specify survival data; uses minimal cummulative log-rank p-value)
+                -d data ## for calculating silhouette plots (plotted, but not used unless specified)
+                -s survival.data.fname (plotted, but not used unless specified)
+                -e survival.comp (can be either all, one or both - see the mode param for gen.survival.curves for explanation)
+                -z survival analysis script to be called (defaults to galaxy.gen.survival.curves.R)
+\n\n")
+args <- commandArgs(TRUE)
+if ( length( args ) == 1 && args =="--help") { 
+  write( argspec, stderr() )
+  q();
+}
+
+lib.load.quiet <- function( package ) {
+   package <- as.character(substitute(package))
+   suppressPackageStartupMessages( do.call( "library", list( package=package ) ) )
+}
+lib.load.quiet(getopt)
+lib.load.quiet( amap )
+lib.load.quiet( cluster )
+
+spec <- matrix( c( "results.file",        "r", 1, "character",
+                   "change.min",          "c", 2, "double",
+                   "metric",              "m", 2, "character",
+                   "survival.data",       "s", 2, "character",
+                   "survival.comp",       "e", 2, "character",
+                   "survival.script",     "z", 2, "character",
+                   "output.format",       "f", 2, "character",
+                   "cluster.class.out",   "o", 2, "character",
+                   "output.report.dir",   "p", 2, "character",
+                   "output.report.html",  "h", 2, "character"
+                  ),
+                nc=4,
+                byrow=T
+               )
+opt <- getopt( spec=spec )
+
+## default params for non-required params
+if ( is.null( opt$output.report.dir ) ) { opt$output.report.dir <- "report" }
+if ( is.null( opt$output.report.html ) ) { opt$output.report.html <- "report/index.html" }
+
+if ( is.null( opt$change.min ) ) { opt$change.min <- 0.075 }
+if ( is.null( opt$metric ) ) { opt$metric <- "difference" } ## alternate is angle }
+if ( is.null( opt$survival.comp ) ) { opt$survival.comp <- "all" } ## alternate is one or both }
+if ( is.null( opt$survival.script ) ) { opt$survival.script <- "galaxy.gen.survival.curves.R" } ## alternate is one or both }
+if ( is.null( opt$cluster.class.out) ) { opt$cluster.class.out <- "select.cls.rda" }
+
+if ( !file.exists( opt$output.report.dir ) ){
+    dir.create(opt$output.report.dir)
+}
+
+if ( ! opt$metric %in% c( "difference", "angle", "silhouette", "survival" ) ) {
+  stop( "invalid metric specified ", opt$metric, "\n" )
+}
+
+                               
+opt$change.min <- as.numeric( opt$change.min )
+if ( abs( opt$change.min ) > 1 ) {
+  stop( "invalid angle specified:", opt$change.min, "Please specify angle in rangel [-1,0]\n" )
+}
+if ( opt$metric=="angle" && opt$change.min > 0 ) {
+  opt$change.min <- -opt$change.min
+  cat( "Using", opt$change.min, "for minimum angle\n" )
+}
+
+if ( opt$metric == "survival" &&
+     ( is.null( opt$survival.data ) ||
+       (! file.exists( opt$survival.data ) ) )
+    ) {
+  stop( "Must provide valid survival file in order to use survival as metric\n" )
+}
+
+
+## From the ConsensusClusterPlust package - modified by phw
+CDF <- function( ml,
+                 breaks=1000,
+                 plot.it=TRUE ){
+  if ( class(ml[[1]])=="matrix" && ( names( ml[1] ) =="2" ) ) {
+    ml <- c( c(0), ml )
+  }
+  ##plot CDF distribution
+  if ( plot.it ) {
+    plot( c(0),
+          xlim=c(0,1),
+          ylim=c(0,1),
+          col="white",
+          bg="white",
+          xlab="consensus index",
+          ylab="CDF",
+          main="consensus CDF",
+          las=2 )
+  }
+  
+  k=length(ml)
+  this_colors <- rainbow(k-1)
+  areaK <- c()
+  for (i in 2:length( ml ) ) {
+    v <- ml[[i]]
+    v <- v[ lower.tri(v) ]
+
+    #empirical CDF distribution. default number of breaks is 100    
+    h = hist(v, plot=FALSE, breaks=seq(0,1,by=1/breaks))
+    h$counts = cumsum(h$counts)/sum(h$counts)
+
+    #calculate area under CDF curve, by histogram method.
+    thisArea=0
+    for (bi in 1:(length(h$breaks)-1)){
+       thisArea = thisArea + h$counts[bi]*(h$breaks[bi+1]-h$breaks[bi]) #increment by height by width
+    }
+    areaK = c(areaK,thisArea)
+    if ( plot.it ) lines(h$mids,h$counts,col=this_colors[i-1],lwd=2,type='l')
+  }
+  if ( plot.it ) legend(0.8,0.5,legend=paste(rep("",k-1),seq(2,k,by=1),sep=""),fill=this_colors)
+
+  #Calc area under CDF change.
+  deltaK=areaK[1] #initial auc at k=2
+  for(i in 2:(length(areaK))){
+    #proportional increase relative to prior K.
+    deltaK = c(deltaK,( areaK[i] - areaK[i-1])/areaK[i-1])
+  }
+  return ( list( areaK=areaK, deltaK=deltaK ) )
+}
+
+
+load( opt$results.file )
+save.image( '/home/waltman/work.local/tmp/select.dbg.rda' )
+
+if ( opt$metric == "silhouette" ) {
+  if ( ! exists( 'data' ) && ( class( data ) != "matrix" ) ) {
+    stop( "Must provide valid data matrix in order to use silhouette as metric\n" )
+  }
+}
+cons.matrices <- lapply( results[ 2:length(results) ], '[[', 'consensusMatrix' )
+cls <- lapply( results[ 2:length(results) ], function( res ) return( res$consensusClass[ res$consensusTree$order ] ) )  ##'[[', 'consensusClass' )
+names( cons.matrices ) <- names( cls ) <- 2:length( results )
+
+png.fname <- file.path( opt$output.report.dir, "consensus.sel.criteria.CDF.png")
+plot.dev <- png( png.fname,
+                 width=11,
+                 height=8.5,
+                 units='in',
+                 res=72 )
+##  this will calculate the CDF, plus plot them
+rel.delta <- CDF( cons.matrices, breaks=1000, plot.it=TRUE )$deltaK
+dev.off()
+names( rel.delta ) <- seq( from=2, by=1, length=length( rel.delta ) )
+vector.of.metric.changes <- rel.delta
+
+main.txt <- ", per Size K"
+ylab.txt <- ""
+
+main.txt <- paste( "Relative Change in Area", main.txt, sep="" )
+ylab.txt <- paste( "relative change in area under CDF curve", ylab.txt, sep="" )
+png.fname <- file.path( opt$output.report.dir, "consensus.sel.criteria.diff.png")
+
+plot.dev <- png( png.fname,
+                 width=11,
+                 height=8.5,
+                 units='in',
+                 res=72 )
+plot( as.numeric( names( vector.of.metric.changes ) ),
+      vector.of.metric.changes,
+      main=main.txt,
+      ylab=ylab.txt,
+      xlab="Cluster size (K)",
+      type='b' )
+dev.off()
+
+k.select <- vector.of.metric.changes[ vector.of.metric.changes < opt$change.min ]
+if ( length( k.select ) > 1 ) {
+  k.select <- k.select[1]
+} else {
+  if ( length( k.select ) == 0 ) {
+    k.select <- vector.of.metric.changes[ length( vector.of.metric.changes ) ]
+  } else {
+    ## do nothing
+  }
+}
+k.select <- as.numeric( names( k.select ) )
+## find the search range
+k.search.range <- (k.select-2):(k.select+2)
+k.search.range <- k.search.range[ k.search.range %in% as.numeric( names( vector.of.metric.changes ) ) ]
+k.search.range <- vector.of.metric.changes[ as.character( k.search.range ) ]
+k.search.range <- k.search.range[ k.search.range  < 0.25 ]
+k.search.range <- k.search.range[ k.search.range > 0.025 ]
+k.search.range <- names( k.search.range )
+
+
+if ( exists("data") ) {
+  ## what direction is the clustering in? rows or cols?
+  elts <- unique( names( results[[2]]$consensusClass ) )
+  if ( all( elts %in% colnames( data ) ) ) {
+    ## sample clusters
+    data.dist <- dist( t( data ) )
+    cls <- lapply( cls, function( x ) return( x[ colnames( data ) ] ) )
+  } else if ( all( elts  %in% rownames( data ) ) ) {
+    data.dist <- dist( data )
+    cls <- lapply( cls, function( x ) return( x[ rownames( data ) ] ) )
+  } else {
+    stop( "incompatible cluster results and data matrix\n" )
+  }
+
+  
+  sils <- lapply( cls,
+                  silhouette,
+                  dist=data.dist )
+  sils <- sapply( sils,
+                  function(x) {
+                    return( summary( x )$avg.width )
+                  }
+                 )
+
+  png.fname <- file.path( opt$output.report.dir, "consensus.sel.silhouette.png")
+
+  plot.dev <- png( png.fname,
+                   width=11,
+                   height=8.5,
+                   units='in',
+                   res=72 )
+  plot( as.numeric( names( sils ) ),
+        sils,
+        main="Average Silhouette Widths, per Cluster Size K",
+        ylab="average silhouette width (correlation distance)",
+        xlab="Cluster size (K)",
+        type='b' )
+  dev.off()
+
+  ## if the metric is silhouette, use that (but only over the k's that are on the rel-change "elbow"
+  if ( opt$metric == "silhouette" ) {
+    names( sils ) <- names( cls )
+
+    sils <- sils[ k.search.range ]
+    k.select <- as.numeric( names( sils[ sils == max( sils, na.rm=T ) ] ) )
+  }
+}
+
+if ( ! is.null( opt$survival.data ) ) {
+  if ( ! file.exists( opt$survival.data ) ) {
+    stop( 'specified clinical/survival file can not be found:', opt$survival.data, "\n" )
+  }
+
+  if ( opt$metric == "survival" ) {
+    pvals <- numeric()
+
+    for ( cl in cls ) {
+      cons.class.file <- "tmp.class.tab"
+      tmp.cl <- cbind( gsub( "\\.", "-", names( cl ) ), as.integer(cl) )
+      colnames( tmp.cl ) <- c( "ID", "class" )
+      write.table( tmp.cl, cons.class.file, sep="\t", row.names=FALSE, quote=FALSE )
+      
+      cmd.string <- opt$survival.script
+
+      ## get the consensusClass file that's associated with the k.select
+      cmd.string <- paste( cmd.string, "-c", cons.class.file )
+      cmd.string <- paste( cmd.string, "-s", opt$survival.data )
+      cmd.string <- paste( cmd.string, "-m", opt$survival.comp )
+      cmd.string <- paste( cmd.string, "-p" )
+      pvals <- c( pvals, as.numeric( system( cmd.string, intern=T ) ) )
+    }
+    names( pvals ) <- names( cls )
+    
+
+    png.fname <- file.path( opt$output.report.dir, "consensus.sel.criteria.survival.png" )
+
+    plot.dev <- png( png.fname,
+                     width=11,
+                     height=8.5,
+                     units='in',
+                     res=72 )
+    plot( as.numeric( names( pvals ) ),
+          -log( pvals ),
+          main="Average Log-rank p-values (-log), per Cluster Size K",
+          ylab="Average log-rank p-values (-log)",
+          xlab="Cluster size (K)",
+          type='b' )
+    dev.off()
+    
+
+    pvals <- pvals[ ksearch.range ]
+    k.select <- as.numeric( names( pvals[ pvals == min( pvals, na.rm=T ) ] ) )
+  }
+  
+  cmd.string <- opt$survival.script
+
+  ## get the consensusClass file that's associated with the k.select 
+  cl <- cls[[ as.character( k.select ) ]] 
+  cl <- cbind( names( cl ), as.integer(cl) )
+  colnames( cl ) <- c( "ID", "class" )
+  write.table( cl, opt$cluster.class.out, sep="\t", row.names=FALSE, quote=FALSE )
+
+  cmd.string <- paste( cmd.string, "-c", opt$cluster.class.out )
+  cmd.string <- paste( cmd.string, "-s", opt$survival.data )
+  cmd.string <- paste( cmd.string, "-m", opt$survival.comp )
+  
+  survival.out.file <- paste( opt$output.report.dir, "survival.png", sep="/" )
+  cmd.string <- paste( cmd.string, "-o", survival.out.file )
+  output <- system( cmd.string, intern=T )
+  cat( output, sep="\n" )
+} else {
+  ## get the consensusClass file that's associated with the k.select 
+  cl <- cls[[ as.character( k.select ) ]] 
+  cl <- cbind( names( cl ), as.integer(cl) )
+  colnames( cl ) <- c( "ID", "class" )
+  write.table( cl, opt$cluster.class.out, sep="\t", row.names=FALSE, quote=FALSE )
+}
+
+treecl.res <- results[[ k.select ]]$consensusTree
+## cl should already exist, but re-create it just in case
+cl <- cls[[ as.character( k.select ) ]] 
+
+
+select.result <- results[[ k.select ]]
+## over-write the tabular version of the opt$cluster.class.out with an RData file
+save( file=opt$cluster.class.out, treecl.res, cl, select.result, data ) 
+
+report_str = paste( "k selected by consensus clustering and user-specified metric, ", opt$metric, ", is ", k.select, "\n", sep="" )
+
+pngs = list.files(path=opt$output.report.dir, patt="png")
+html.out <- paste( "<html>", report_str,
+                   paste( paste( paste( "<div><img src=\'", pngs, sep="" ), "\'/></div>", sep="" ), collapse=""),
+                   "</html>" )
+cat( html.out, file=opt$output.report.html )
+