Mercurial > repos > peter-waltman > ucsc_cluster_tools2
comparison cluster.tools/select.k.from.consensus.cluster.R @ 0:0decf3fd54bc draft
Uploaded
| author | peter-waltman |
|---|---|
| date | Thu, 28 Feb 2013 01:45:39 -0500 |
| parents | |
| children | a58527c632b7 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:0decf3fd54bc |
|---|---|
| 1 #!/usr/bin/env Rscript | |
| 2 # Consensus Clustering Script by Peter Waltman | |
| 3 # June 1, 2012 | |
| 4 # License under Creative Commons Attribution 3.0 Unported (CC BY 3.0) | |
| 5 # | |
| 6 ##usage, options and doc goes here | |
| 7 argspec <- c("select.k.from.consensus.clust4er.R takes a clustering from ConsensusClusterPlus | |
| 8 and clinical survival data and determines the right k to use. | |
| 9 | |
| 10 Usage: | |
| 11 select.k.from.consensus.cluster.R -r <results_file> | |
| 12 Optional: | |
| 13 -o output.png # default is stdout | |
| 14 -c change.min | |
| 15 -m metric (must be either: | |
| 16 rel.change, | |
| 17 angle, | |
| 18 silhouette (must specify data matrix) | |
| 19 survival (must specify survival data; uses minimal cummulative log-rank p-value) | |
| 20 -d data ## for calculating silhouette plots (plotted, but not used unless specified) | |
| 21 -s survival.data.fname (plotted, but not used unless specified) | |
| 22 -e survival.comp (can be either all, one or both - see the mode param for gen.survival.curves for explanation) | |
| 23 -z survival analysis script to be called (defaults to galaxy.gen.survival.curves.R) | |
| 24 \n\n") | |
| 25 args <- commandArgs(TRUE) | |
| 26 if ( length( args ) == 1 && args =="--help") { | |
| 27 write( argspec, stderr() ) | |
| 28 q(); | |
| 29 } | |
| 30 | |
| 31 lib.load.quiet <- function( package ) { | |
| 32 package <- as.character(substitute(package)) | |
| 33 suppressPackageStartupMessages( do.call( "library", list( package=package ) ) ) | |
| 34 } | |
| 35 lib.load.quiet(getopt) | |
| 36 lib.load.quiet( amap ) | |
| 37 lib.load.quiet( cluster ) | |
| 38 | |
| 39 spec <- matrix( c( "results.file", "r", 1, "character", | |
| 40 "change.min", "c", 2, "double", | |
| 41 "metric", "m", 2, "character", | |
| 42 "survival.data", "s", 2, "character", | |
| 43 "survival.comp", "e", 2, "character", | |
| 44 "survival.script", "z", 2, "character", | |
| 45 "output.format", "f", 2, "character", | |
| 46 "cluster.class.out", "o", 2, "character", | |
| 47 "output.report.dir", "p", 2, "character", | |
| 48 "output.report.html", "h", 2, "character" | |
| 49 ), | |
| 50 nc=4, | |
| 51 byrow=T | |
| 52 ) | |
| 53 opt <- getopt( spec=spec ) | |
| 54 | |
| 55 ## default params for non-required params | |
| 56 if ( is.null( opt$output.report.dir ) ) { opt$output.report.dir <- "report" } | |
| 57 if ( is.null( opt$output.report.html ) ) { opt$output.report.html <- "report/index.html" } | |
| 58 | |
| 59 if ( is.null( opt$change.min ) ) { opt$change.min <- 0.075 } | |
| 60 if ( is.null( opt$metric ) ) { opt$metric <- "difference" } ## alternate is angle } | |
| 61 if ( is.null( opt$survival.comp ) ) { opt$survival.comp <- "all" } ## alternate is one or both } | |
| 62 if ( is.null( opt$survival.script ) ) { opt$survival.script <- "galaxy.gen.survival.curves.R" } ## alternate is one or both } | |
| 63 if ( is.null( opt$cluster.class.out) ) { opt$cluster.class.out <- "select.cls.rda" } | |
| 64 | |
| 65 if ( !file.exists( opt$output.report.dir ) ){ | |
| 66 dir.create(opt$output.report.dir) | |
| 67 } | |
| 68 | |
| 69 if ( ! opt$metric %in% c( "difference", "angle", "silhouette", "survival" ) ) { | |
| 70 stop( "invalid metric specified ", opt$metric, "\n" ) | |
| 71 } | |
| 72 | |
| 73 | |
| 74 opt$change.min <- as.numeric( opt$change.min ) | |
| 75 if ( abs( opt$change.min ) > 1 ) { | |
| 76 stop( "invalid angle specified:", opt$change.min, "Please specify angle in rangel [-1,0]\n" ) | |
| 77 } | |
| 78 if ( opt$metric=="angle" && opt$change.min > 0 ) { | |
| 79 opt$change.min <- -opt$change.min | |
| 80 cat( "Using", opt$change.min, "for minimum angle\n" ) | |
| 81 } | |
| 82 | |
| 83 if ( opt$metric == "survival" && | |
| 84 ( is.null( opt$survival.data ) || | |
| 85 (! file.exists( opt$survival.data ) ) ) | |
| 86 ) { | |
| 87 stop( "Must provide valid survival file in order to use survival as metric\n" ) | |
| 88 } | |
| 89 | |
| 90 | |
| 91 ## From the ConsensusClusterPlust package - modified by phw | |
| 92 CDF <- function( ml, | |
| 93 breaks=1000, | |
| 94 plot.it=TRUE ){ | |
| 95 if ( class(ml[[1]])=="matrix" && ( names( ml[1] ) =="2" ) ) { | |
| 96 ml <- c( c(0), ml ) | |
| 97 } | |
| 98 ##plot CDF distribution | |
| 99 if ( plot.it ) { | |
| 100 plot( c(0), | |
| 101 xlim=c(0,1), | |
| 102 ylim=c(0,1), | |
| 103 col="white", | |
| 104 bg="white", | |
| 105 xlab="consensus index", | |
| 106 ylab="CDF", | |
| 107 main="consensus CDF", | |
| 108 las=2 ) | |
| 109 } | |
| 110 | |
| 111 k=length(ml) | |
| 112 this_colors <- rainbow(k-1) | |
| 113 areaK <- c() | |
| 114 for (i in 2:length( ml ) ) { | |
| 115 v <- ml[[i]] | |
| 116 v <- v[ lower.tri(v) ] | |
| 117 | |
| 118 #empirical CDF distribution. default number of breaks is 100 | |
| 119 h = hist(v, plot=FALSE, breaks=seq(0,1,by=1/breaks)) | |
| 120 h$counts = cumsum(h$counts)/sum(h$counts) | |
| 121 | |
| 122 #calculate area under CDF curve, by histogram method. | |
| 123 thisArea=0 | |
| 124 for (bi in 1:(length(h$breaks)-1)){ | |
| 125 thisArea = thisArea + h$counts[bi]*(h$breaks[bi+1]-h$breaks[bi]) #increment by height by width | |
| 126 } | |
| 127 areaK = c(areaK,thisArea) | |
| 128 if ( plot.it ) lines(h$mids,h$counts,col=this_colors[i-1],lwd=2,type='l') | |
| 129 } | |
| 130 if ( plot.it ) legend(0.8,0.5,legend=paste(rep("",k-1),seq(2,k,by=1),sep=""),fill=this_colors) | |
| 131 | |
| 132 #Calc area under CDF change. | |
| 133 deltaK=areaK[1] #initial auc at k=2 | |
| 134 for(i in 2:(length(areaK))){ | |
| 135 #proportional increase relative to prior K. | |
| 136 deltaK = c(deltaK,( areaK[i] - areaK[i-1])/areaK[i-1]) | |
| 137 } | |
| 138 return ( list( areaK=areaK, deltaK=deltaK ) ) | |
| 139 } | |
| 140 | |
| 141 | |
| 142 load( opt$results.file ) | |
| 143 | |
| 144 if ( opt$metric == "silhouette" ) { | |
| 145 if ( ! exists( 'data' ) && ( class( data ) != "matrix" ) ) { | |
| 146 stop( "Must provide valid data matrix in order to use silhouette as metric\n" ) | |
| 147 } | |
| 148 } | |
| 149 cons.matrices <- lapply( results[ 2:length(results) ], '[[', 'consensusMatrix' ) | |
| 150 cls <- lapply( results[ 2:length(results) ], function( res ) return( res$consensusClass[ res$consensusTree$order ] ) ) ##'[[', 'consensusClass' ) | |
| 151 names( cons.matrices ) <- names( cls ) <- 2:length( results ) | |
| 152 | |
| 153 png.fname <- file.path( opt$output.report.dir, "consensus.sel.criteria.CDF.png") | |
| 154 plot.dev <- png( png.fname, | |
| 155 width=11, | |
| 156 height=8.5, | |
| 157 units='in', | |
| 158 res=72 ) | |
| 159 ## this will calculate the CDF, plus plot them | |
| 160 rel.delta <- CDF( cons.matrices, breaks=1000, plot.it=TRUE )$deltaK | |
| 161 dev.off() | |
| 162 names( rel.delta ) <- seq( from=2, by=1, length=length( rel.delta ) ) | |
| 163 vector.of.metric.changes <- rel.delta | |
| 164 | |
| 165 main.txt <- ", per Size K" | |
| 166 ylab.txt <- "" | |
| 167 | |
| 168 main.txt <- paste( "Relative Change in Area", main.txt, sep="" ) | |
| 169 ylab.txt <- paste( "relative change in area under CDF curve", ylab.txt, sep="" ) | |
| 170 png.fname <- file.path( opt$output.report.dir, "consensus.sel.criteria.diff.png") | |
| 171 | |
| 172 plot.dev <- png( png.fname, | |
| 173 width=11, | |
| 174 height=8.5, | |
| 175 units='in', | |
| 176 res=72 ) | |
| 177 plot( as.numeric( names( vector.of.metric.changes ) ), | |
| 178 vector.of.metric.changes, | |
| 179 main=main.txt, | |
| 180 ylab=ylab.txt, | |
| 181 xlab="Cluster size (K)", | |
| 182 type='b' ) | |
| 183 dev.off() | |
| 184 | |
| 185 k.select <- vector.of.metric.changes[ vector.of.metric.changes < opt$change.min ] | |
| 186 if ( length( k.select ) > 1 ) { | |
| 187 k.select <- k.select[1] | |
| 188 } else { | |
| 189 if ( length( k.select ) == 0 ) { | |
| 190 k.select <- vector.of.metric.changes[ length( vector.of.metric.changes ) ] | |
| 191 } else { | |
| 192 ## do nothing | |
| 193 } | |
| 194 } | |
| 195 k.select <- as.numeric( names( k.select ) ) | |
| 196 ## find the search range | |
| 197 k.search.range <- (k.select-2):(k.select+2) | |
| 198 k.search.range <- k.search.range[ k.search.range %in% as.numeric( names( vector.of.metric.changes ) ) ] | |
| 199 k.search.range <- vector.of.metric.changes[ as.character( k.search.range ) ] | |
| 200 k.search.range <- k.search.range[ k.search.range < 0.25 ] | |
| 201 k.search.range <- k.search.range[ k.search.range > 0.025 ] | |
| 202 k.search.range <- names( k.search.range ) | |
| 203 | |
| 204 | |
| 205 if ( exists("data") ) { | |
| 206 ## what direction is the clustering in? rows or cols? | |
| 207 elts <- unique( names( results[[2]]$consensusClass ) ) | |
| 208 if ( all( elts %in% colnames( data ) ) ) { | |
| 209 ## sample clusters | |
| 210 data.dist <- dist( t( data ) ) | |
| 211 cls <- lapply( cls, function( x ) return( x[ colnames( data ) ] ) ) | |
| 212 } else if ( all( elts %in% rownames( data ) ) ) { | |
| 213 data.dist <- dist( data ) | |
| 214 cls <- lapply( cls, function( x ) return( x[ rownames( data ) ] ) ) | |
| 215 } else { | |
| 216 stop( "incompatible cluster results and data matrix\n" ) | |
| 217 } | |
| 218 | |
| 219 | |
| 220 sils <- lapply( cls, | |
| 221 silhouette, | |
| 222 dist=data.dist ) | |
| 223 sils <- sapply( sils, | |
| 224 function(x) { | |
| 225 return( summary( x )$avg.width ) | |
| 226 } | |
| 227 ) | |
| 228 | |
| 229 png.fname <- file.path( opt$output.report.dir, "consensus.sel.silhouette.png") | |
| 230 | |
| 231 plot.dev <- png( png.fname, | |
| 232 width=11, | |
| 233 height=8.5, | |
| 234 units='in', | |
| 235 res=72 ) | |
| 236 plot( as.numeric( names( sils ) ), | |
| 237 sils, | |
| 238 main="Average Silhouette Widths, per Cluster Size K", | |
| 239 ylab="average silhouette width (correlation distance)", | |
| 240 xlab="Cluster size (K)", | |
| 241 type='b' ) | |
| 242 dev.off() | |
| 243 | |
| 244 ## if the metric is silhouette, use that (but only over the k's that are on the rel-change "elbow" | |
| 245 if ( opt$metric == "silhouette" ) { | |
| 246 names( sils ) <- names( cls ) | |
| 247 | |
| 248 sils <- sils[ k.search.range ] | |
| 249 k.select <- as.numeric( names( sils[ sils == max( sils, na.rm=T ) ] ) ) | |
| 250 } | |
| 251 } | |
| 252 | |
| 253 if ( ! is.null( opt$survival.data ) ) { | |
| 254 if ( ! file.exists( opt$survival.data ) ) { | |
| 255 stop( 'specified clinical/survival file can not be found:', opt$survival.data, "\n" ) | |
| 256 } | |
| 257 | |
| 258 if ( opt$metric == "survival" ) { | |
| 259 pvals <- numeric() | |
| 260 | |
| 261 for ( cl in cls ) { | |
| 262 | |
| 263 cons.class.file <- tempfile( "tmp.class.rdata" ) | |
| 264 save( file=cons.class.file, cl ) | |
| 265 | |
| 266 cmd.string <- opt$survival.script | |
| 267 | |
| 268 ## get the consensusClass file that's associated with the k.select | |
| 269 cmd.string <- paste( cmd.string, "-C", cons.class.file ) | |
| 270 cmd.string <- paste( cmd.string, "-S", opt$survival.data ) | |
| 271 cmd.string <- paste( cmd.string, "-M", opt$survival.comp ) | |
| 272 cmd.string <- paste( cmd.string, "-P" ) | |
| 273 pvals <- c( pvals, as.numeric( system( cmd.string, intern=T ) ) ) | |
| 274 } | |
| 275 names( pvals ) <- names( cls ) | |
| 276 | |
| 277 | |
| 278 png.fname <- file.path( opt$output.report.dir, "consensus.sel.criteria.survival.png" ) | |
| 279 | |
| 280 plot.dev <- png( png.fname, | |
| 281 width=11, | |
| 282 height=8.5, | |
| 283 units='in', | |
| 284 res=72 ) | |
| 285 plot( as.numeric( names( pvals ) ), | |
| 286 -log( pvals ), | |
| 287 main="Average Log-rank p-values (-log), per Cluster Size K", | |
| 288 ylab="Average log-rank p-values (-log)", | |
| 289 xlab="Cluster size (K)", | |
| 290 type='b' ) | |
| 291 dev.off() | |
| 292 | |
| 293 | |
| 294 pvals <- pvals[ k.search.range ] | |
| 295 k.select <- as.numeric( names( pvals[ pvals == min( pvals, na.rm=T ) ] ) ) | |
| 296 } | |
| 297 | |
| 298 cmd.string <- opt$survival.script | |
| 299 | |
| 300 ## get the consensusClass file that's associated with the k.select | |
| 301 cl <- cls[[ as.character( k.select ) ]] | |
| 302 cl <- cbind( names( cl ), as.integer(cl) ) | |
| 303 colnames( cl ) <- c( "ID", "class" ) | |
| 304 write.table( cl, opt$cluster.class.out, sep="\t", row.names=FALSE, quote=FALSE ) | |
| 305 | |
| 306 cmd.string <- paste( cmd.string, "-c", opt$cluster.class.out ) | |
| 307 cmd.string <- paste( cmd.string, "-s", opt$survival.data ) | |
| 308 cmd.string <- paste( cmd.string, "-m", opt$survival.comp ) | |
| 309 | |
| 310 survival.out.file <- paste( opt$output.report.dir, "survival.png", sep="/" ) | |
| 311 cmd.string <- paste( cmd.string, "-o", survival.out.file ) | |
| 312 output <- system( cmd.string, intern=T ) | |
| 313 cat( output, sep="\n" ) | |
| 314 } else { | |
| 315 ## get the consensusClass file that's associated with the k.select | |
| 316 cl <- cls[[ as.character( k.select ) ]] | |
| 317 cl <- cbind( names( cl ), as.integer(cl) ) | |
| 318 colnames( cl ) <- c( "ID", "class" ) | |
| 319 write.table( cl, opt$cluster.class.out, sep="\t", row.names=FALSE, quote=FALSE ) | |
| 320 } | |
| 321 | |
| 322 treecl.res <- results[[ k.select ]]$consensusTree | |
| 323 ## cl should already exist, but re-create it just in case | |
| 324 cl <- cls[[ as.character( k.select ) ]] | |
| 325 | |
| 326 | |
| 327 select.result <- results[[ k.select ]] | |
| 328 ## over-write the tabular version of the opt$cluster.class.out with an RData file | |
| 329 save( file=opt$cluster.class.out, treecl.res, cl, select.result, data ) | |
| 330 | |
| 331 report_str = paste( "k selected by consensus clustering and user-specified metric, ", opt$metric, ", is ", k.select, "\n", sep="" ) | |
| 332 | |
| 333 pngs = list.files(path=opt$output.report.dir, patt="png") | |
| 334 html.out <- paste( "<html>", report_str, | |
| 335 paste( paste( paste( "<div><img src=\'", pngs, sep="" ), "\'/></div>", sep="" ), collapse=""), | |
| 336 "</html>" ) | |
| 337 cat( html.out, file=opt$output.report.html ) | |
| 338 |
