comparison cluster.tools/select.k.from.consensus.cluster.R @ 2:b442996b66ae draft

Uploaded
author peter-waltman
date Wed, 27 Feb 2013 20:17:04 -0500
parents
children
comparison
equal deleted inserted replaced
1:e25d2bece0a2 2:b442996b66ae
1 #!/usr/bin/env Rscript
2 # Consensus Clustering Script by Peter Waltman
3 # June 1, 2012
4 # License under Creative Commons Attribution 3.0 Unported (CC BY 3.0)
5 #
6 ##usage, options and doc goes here
7 argspec <- c("select.k.from.consensus.clust4er.R takes a clustering from ConsensusClusterPlus
8 and clinical survival data and determines the right k to use.
9
10 Usage:
11 select.k.from.consensus.cluster.R -r <results_file>
12 Optional:
13 -o output.png # default is stdout
14 -c change.min
15 -m metric (must be either:
16 rel.change,
17 angle,
18 silhouette (must specify data matrix)
19 survival (must specify survival data; uses minimal cummulative log-rank p-value)
20 -d data ## for calculating silhouette plots (plotted, but not used unless specified)
21 -s survival.data.fname (plotted, but not used unless specified)
22 -e survival.comp (can be either all, one or both - see the mode param for gen.survival.curves for explanation)
23 -z survival analysis script to be called (defaults to galaxy.gen.survival.curves.R)
24 \n\n")
25 args <- commandArgs(TRUE)
26 if ( length( args ) == 1 && args =="--help") {
27 write( argspec, stderr() )
28 q();
29 }
30
31 lib.load.quiet <- function( package ) {
32 package <- as.character(substitute(package))
33 suppressPackageStartupMessages( do.call( "library", list( package=package ) ) )
34 }
35 lib.load.quiet(getopt)
36 lib.load.quiet( amap )
37 lib.load.quiet( cluster )
38
39 spec <- matrix( c( "results.file", "r", 1, "character",
40 "change.min", "c", 2, "double",
41 "metric", "m", 2, "character",
42 "survival.data", "s", 2, "character",
43 "survival.comp", "e", 2, "character",
44 "survival.script", "z", 2, "character",
45 "output.format", "f", 2, "character",
46 "cluster.class.out", "o", 2, "character",
47 "output.report.dir", "p", 2, "character",
48 "output.report.html", "h", 2, "character"
49 ),
50 nc=4,
51 byrow=T
52 )
53 opt <- getopt( spec=spec )
54
55 ## default params for non-required params
56 if ( is.null( opt$output.report.dir ) ) { opt$output.report.dir <- "report" }
57 if ( is.null( opt$output.report.html ) ) { opt$output.report.html <- "report/index.html" }
58
59 if ( is.null( opt$change.min ) ) { opt$change.min <- 0.075 }
60 if ( is.null( opt$metric ) ) { opt$metric <- "difference" } ## alternate is angle }
61 if ( is.null( opt$survival.comp ) ) { opt$survival.comp <- "all" } ## alternate is one or both }
62 if ( is.null( opt$survival.script ) ) { opt$survival.script <- "galaxy.gen.survival.curves.R" } ## alternate is one or both }
63 if ( is.null( opt$cluster.class.out) ) { opt$cluster.class.out <- "select.cls.rda" }
64
65 if ( !file.exists( opt$output.report.dir ) ){
66 dir.create(opt$output.report.dir)
67 }
68
69 if ( ! opt$metric %in% c( "difference", "angle", "silhouette", "survival" ) ) {
70 stop( "invalid metric specified ", opt$metric, "\n" )
71 }
72
73
74 opt$change.min <- as.numeric( opt$change.min )
75 if ( abs( opt$change.min ) > 1 ) {
76 stop( "invalid angle specified:", opt$change.min, "Please specify angle in rangel [-1,0]\n" )
77 }
78 if ( opt$metric=="angle" && opt$change.min > 0 ) {
79 opt$change.min <- -opt$change.min
80 cat( "Using", opt$change.min, "for minimum angle\n" )
81 }
82
83 if ( opt$metric == "survival" &&
84 ( is.null( opt$survival.data ) ||
85 (! file.exists( opt$survival.data ) ) )
86 ) {
87 stop( "Must provide valid survival file in order to use survival as metric\n" )
88 }
89
90
91 ## From the ConsensusClusterPlust package - modified by phw
92 CDF <- function( ml,
93 breaks=1000,
94 plot.it=TRUE ){
95 if ( class(ml[[1]])=="matrix" && ( names( ml[1] ) =="2" ) ) {
96 ml <- c( c(0), ml )
97 }
98 ##plot CDF distribution
99 if ( plot.it ) {
100 plot( c(0),
101 xlim=c(0,1),
102 ylim=c(0,1),
103 col="white",
104 bg="white",
105 xlab="consensus index",
106 ylab="CDF",
107 main="consensus CDF",
108 las=2 )
109 }
110
111 k=length(ml)
112 this_colors <- rainbow(k-1)
113 areaK <- c()
114 for (i in 2:length( ml ) ) {
115 v <- ml[[i]]
116 v <- v[ lower.tri(v) ]
117
118 #empirical CDF distribution. default number of breaks is 100
119 h = hist(v, plot=FALSE, breaks=seq(0,1,by=1/breaks))
120 h$counts = cumsum(h$counts)/sum(h$counts)
121
122 #calculate area under CDF curve, by histogram method.
123 thisArea=0
124 for (bi in 1:(length(h$breaks)-1)){
125 thisArea = thisArea + h$counts[bi]*(h$breaks[bi+1]-h$breaks[bi]) #increment by height by width
126 }
127 areaK = c(areaK,thisArea)
128 if ( plot.it ) lines(h$mids,h$counts,col=this_colors[i-1],lwd=2,type='l')
129 }
130 if ( plot.it ) legend(0.8,0.5,legend=paste(rep("",k-1),seq(2,k,by=1),sep=""),fill=this_colors)
131
132 #Calc area under CDF change.
133 deltaK=areaK[1] #initial auc at k=2
134 for(i in 2:(length(areaK))){
135 #proportional increase relative to prior K.
136 deltaK = c(deltaK,( areaK[i] - areaK[i-1])/areaK[i-1])
137 }
138 return ( list( areaK=areaK, deltaK=deltaK ) )
139 }
140
141
142 load( opt$results.file )
143 save.image( '/home/waltman/work.local/tmp/select.dbg.rda' )
144
145 if ( opt$metric == "silhouette" ) {
146 if ( ! exists( 'data' ) && ( class( data ) != "matrix" ) ) {
147 stop( "Must provide valid data matrix in order to use silhouette as metric\n" )
148 }
149 }
150 cons.matrices <- lapply( results[ 2:length(results) ], '[[', 'consensusMatrix' )
151 cls <- lapply( results[ 2:length(results) ], function( res ) return( res$consensusClass[ res$consensusTree$order ] ) ) ##'[[', 'consensusClass' )
152 names( cons.matrices ) <- names( cls ) <- 2:length( results )
153
154 png.fname <- file.path( opt$output.report.dir, "consensus.sel.criteria.CDF.png")
155 plot.dev <- png( png.fname,
156 width=11,
157 height=8.5,
158 units='in',
159 res=72 )
160 ## this will calculate the CDF, plus plot them
161 rel.delta <- CDF( cons.matrices, breaks=1000, plot.it=TRUE )$deltaK
162 dev.off()
163 names( rel.delta ) <- seq( from=2, by=1, length=length( rel.delta ) )
164 vector.of.metric.changes <- rel.delta
165
166 main.txt <- ", per Size K"
167 ylab.txt <- ""
168
169 main.txt <- paste( "Relative Change in Area", main.txt, sep="" )
170 ylab.txt <- paste( "relative change in area under CDF curve", ylab.txt, sep="" )
171 png.fname <- file.path( opt$output.report.dir, "consensus.sel.criteria.diff.png")
172
173 plot.dev <- png( png.fname,
174 width=11,
175 height=8.5,
176 units='in',
177 res=72 )
178 plot( as.numeric( names( vector.of.metric.changes ) ),
179 vector.of.metric.changes,
180 main=main.txt,
181 ylab=ylab.txt,
182 xlab="Cluster size (K)",
183 type='b' )
184 dev.off()
185
186 k.select <- vector.of.metric.changes[ vector.of.metric.changes < opt$change.min ]
187 if ( length( k.select ) > 1 ) {
188 k.select <- k.select[1]
189 } else {
190 if ( length( k.select ) == 0 ) {
191 k.select <- vector.of.metric.changes[ length( vector.of.metric.changes ) ]
192 } else {
193 ## do nothing
194 }
195 }
196 k.select <- as.numeric( names( k.select ) )
197 ## find the search range
198 k.search.range <- (k.select-2):(k.select+2)
199 k.search.range <- k.search.range[ k.search.range %in% as.numeric( names( vector.of.metric.changes ) ) ]
200 k.search.range <- vector.of.metric.changes[ as.character( k.search.range ) ]
201 k.search.range <- k.search.range[ k.search.range < 0.25 ]
202 k.search.range <- k.search.range[ k.search.range > 0.025 ]
203 k.search.range <- names( k.search.range )
204
205
206 if ( exists("data") ) {
207 ## what direction is the clustering in? rows or cols?
208 elts <- unique( names( results[[2]]$consensusClass ) )
209 if ( all( elts %in% colnames( data ) ) ) {
210 ## sample clusters
211 data.dist <- dist( t( data ) )
212 cls <- lapply( cls, function( x ) return( x[ colnames( data ) ] ) )
213 } else if ( all( elts %in% rownames( data ) ) ) {
214 data.dist <- dist( data )
215 cls <- lapply( cls, function( x ) return( x[ rownames( data ) ] ) )
216 } else {
217 stop( "incompatible cluster results and data matrix\n" )
218 }
219
220
221 sils <- lapply( cls,
222 silhouette,
223 dist=data.dist )
224 sils <- sapply( sils,
225 function(x) {
226 return( summary( x )$avg.width )
227 }
228 )
229
230 png.fname <- file.path( opt$output.report.dir, "consensus.sel.silhouette.png")
231
232 plot.dev <- png( png.fname,
233 width=11,
234 height=8.5,
235 units='in',
236 res=72 )
237 plot( as.numeric( names( sils ) ),
238 sils,
239 main="Average Silhouette Widths, per Cluster Size K",
240 ylab="average silhouette width (correlation distance)",
241 xlab="Cluster size (K)",
242 type='b' )
243 dev.off()
244
245 ## if the metric is silhouette, use that (but only over the k's that are on the rel-change "elbow"
246 if ( opt$metric == "silhouette" ) {
247 names( sils ) <- names( cls )
248
249 sils <- sils[ k.search.range ]
250 k.select <- as.numeric( names( sils[ sils == max( sils, na.rm=T ) ] ) )
251 }
252 }
253
254 if ( ! is.null( opt$survival.data ) ) {
255 if ( ! file.exists( opt$survival.data ) ) {
256 stop( 'specified clinical/survival file can not be found:', opt$survival.data, "\n" )
257 }
258
259 if ( opt$metric == "survival" ) {
260 pvals <- numeric()
261
262 for ( cl in cls ) {
263 cons.class.file <- "tmp.class.tab"
264 tmp.cl <- cbind( gsub( "\\.", "-", names( cl ) ), as.integer(cl) )
265 colnames( tmp.cl ) <- c( "ID", "class" )
266 write.table( tmp.cl, cons.class.file, sep="\t", row.names=FALSE, quote=FALSE )
267
268 cmd.string <- opt$survival.script
269
270 ## get the consensusClass file that's associated with the k.select
271 cmd.string <- paste( cmd.string, "-c", cons.class.file )
272 cmd.string <- paste( cmd.string, "-s", opt$survival.data )
273 cmd.string <- paste( cmd.string, "-m", opt$survival.comp )
274 cmd.string <- paste( cmd.string, "-p" )
275 pvals <- c( pvals, as.numeric( system( cmd.string, intern=T ) ) )
276 }
277 names( pvals ) <- names( cls )
278
279
280 png.fname <- file.path( opt$output.report.dir, "consensus.sel.criteria.survival.png" )
281
282 plot.dev <- png( png.fname,
283 width=11,
284 height=8.5,
285 units='in',
286 res=72 )
287 plot( as.numeric( names( pvals ) ),
288 -log( pvals ),
289 main="Average Log-rank p-values (-log), per Cluster Size K",
290 ylab="Average log-rank p-values (-log)",
291 xlab="Cluster size (K)",
292 type='b' )
293 dev.off()
294
295
296 pvals <- pvals[ ksearch.range ]
297 k.select <- as.numeric( names( pvals[ pvals == min( pvals, na.rm=T ) ] ) )
298 }
299
300 cmd.string <- opt$survival.script
301
302 ## get the consensusClass file that's associated with the k.select
303 cl <- cls[[ as.character( k.select ) ]]
304 cl <- cbind( names( cl ), as.integer(cl) )
305 colnames( cl ) <- c( "ID", "class" )
306 write.table( cl, opt$cluster.class.out, sep="\t", row.names=FALSE, quote=FALSE )
307
308 cmd.string <- paste( cmd.string, "-c", opt$cluster.class.out )
309 cmd.string <- paste( cmd.string, "-s", opt$survival.data )
310 cmd.string <- paste( cmd.string, "-m", opt$survival.comp )
311
312 survival.out.file <- paste( opt$output.report.dir, "survival.png", sep="/" )
313 cmd.string <- paste( cmd.string, "-o", survival.out.file )
314 output <- system( cmd.string, intern=T )
315 cat( output, sep="\n" )
316 } else {
317 ## get the consensusClass file that's associated with the k.select
318 cl <- cls[[ as.character( k.select ) ]]
319 cl <- cbind( names( cl ), as.integer(cl) )
320 colnames( cl ) <- c( "ID", "class" )
321 write.table( cl, opt$cluster.class.out, sep="\t", row.names=FALSE, quote=FALSE )
322 }
323
324 treecl.res <- results[[ k.select ]]$consensusTree
325 ## cl should already exist, but re-create it just in case
326 cl <- cls[[ as.character( k.select ) ]]
327
328
329 select.result <- results[[ k.select ]]
330 ## over-write the tabular version of the opt$cluster.class.out with an RData file
331 save( file=opt$cluster.class.out, treecl.res, cl, select.result, data )
332
333 report_str = paste( "k selected by consensus clustering and user-specified metric, ", opt$metric, ", is ", k.select, "\n", sep="" )
334
335 pngs = list.files(path=opt$output.report.dir, patt="png")
336 html.out <- paste( "<html>", report_str,
337 paste( paste( paste( "<div><img src=\'", pngs, sep="" ), "\'/></div>", sep="" ), collapse=""),
338 "</html>" )
339 cat( html.out, file=opt$output.report.html )
340