Mercurial > repos > peter-waltman > ucsc_cluster_tools2
comparison cluster.tools/select.k.from.consensus.cluster.R @ 0:0decf3fd54bc draft
Uploaded
author | peter-waltman |
---|---|
date | Thu, 28 Feb 2013 01:45:39 -0500 |
parents | |
children | a58527c632b7 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:0decf3fd54bc |
---|---|
1 #!/usr/bin/env Rscript | |
2 # Consensus Clustering Script by Peter Waltman | |
3 # June 1, 2012 | |
4 # License under Creative Commons Attribution 3.0 Unported (CC BY 3.0) | |
5 # | |
6 ##usage, options and doc goes here | |
7 argspec <- c("select.k.from.consensus.clust4er.R takes a clustering from ConsensusClusterPlus | |
8 and clinical survival data and determines the right k to use. | |
9 | |
10 Usage: | |
11 select.k.from.consensus.cluster.R -r <results_file> | |
12 Optional: | |
13 -o output.png # default is stdout | |
14 -c change.min | |
15 -m metric (must be either: | |
16 rel.change, | |
17 angle, | |
18 silhouette (must specify data matrix) | |
19 survival (must specify survival data; uses minimal cummulative log-rank p-value) | |
20 -d data ## for calculating silhouette plots (plotted, but not used unless specified) | |
21 -s survival.data.fname (plotted, but not used unless specified) | |
22 -e survival.comp (can be either all, one or both - see the mode param for gen.survival.curves for explanation) | |
23 -z survival analysis script to be called (defaults to galaxy.gen.survival.curves.R) | |
24 \n\n") | |
25 args <- commandArgs(TRUE) | |
26 if ( length( args ) == 1 && args =="--help") { | |
27 write( argspec, stderr() ) | |
28 q(); | |
29 } | |
30 | |
31 lib.load.quiet <- function( package ) { | |
32 package <- as.character(substitute(package)) | |
33 suppressPackageStartupMessages( do.call( "library", list( package=package ) ) ) | |
34 } | |
35 lib.load.quiet(getopt) | |
36 lib.load.quiet( amap ) | |
37 lib.load.quiet( cluster ) | |
38 | |
39 spec <- matrix( c( "results.file", "r", 1, "character", | |
40 "change.min", "c", 2, "double", | |
41 "metric", "m", 2, "character", | |
42 "survival.data", "s", 2, "character", | |
43 "survival.comp", "e", 2, "character", | |
44 "survival.script", "z", 2, "character", | |
45 "output.format", "f", 2, "character", | |
46 "cluster.class.out", "o", 2, "character", | |
47 "output.report.dir", "p", 2, "character", | |
48 "output.report.html", "h", 2, "character" | |
49 ), | |
50 nc=4, | |
51 byrow=T | |
52 ) | |
53 opt <- getopt( spec=spec ) | |
54 | |
55 ## default params for non-required params | |
56 if ( is.null( opt$output.report.dir ) ) { opt$output.report.dir <- "report" } | |
57 if ( is.null( opt$output.report.html ) ) { opt$output.report.html <- "report/index.html" } | |
58 | |
59 if ( is.null( opt$change.min ) ) { opt$change.min <- 0.075 } | |
60 if ( is.null( opt$metric ) ) { opt$metric <- "difference" } ## alternate is angle } | |
61 if ( is.null( opt$survival.comp ) ) { opt$survival.comp <- "all" } ## alternate is one or both } | |
62 if ( is.null( opt$survival.script ) ) { opt$survival.script <- "galaxy.gen.survival.curves.R" } ## alternate is one or both } | |
63 if ( is.null( opt$cluster.class.out) ) { opt$cluster.class.out <- "select.cls.rda" } | |
64 | |
65 if ( !file.exists( opt$output.report.dir ) ){ | |
66 dir.create(opt$output.report.dir) | |
67 } | |
68 | |
69 if ( ! opt$metric %in% c( "difference", "angle", "silhouette", "survival" ) ) { | |
70 stop( "invalid metric specified ", opt$metric, "\n" ) | |
71 } | |
72 | |
73 | |
74 opt$change.min <- as.numeric( opt$change.min ) | |
75 if ( abs( opt$change.min ) > 1 ) { | |
76 stop( "invalid angle specified:", opt$change.min, "Please specify angle in rangel [-1,0]\n" ) | |
77 } | |
78 if ( opt$metric=="angle" && opt$change.min > 0 ) { | |
79 opt$change.min <- -opt$change.min | |
80 cat( "Using", opt$change.min, "for minimum angle\n" ) | |
81 } | |
82 | |
83 if ( opt$metric == "survival" && | |
84 ( is.null( opt$survival.data ) || | |
85 (! file.exists( opt$survival.data ) ) ) | |
86 ) { | |
87 stop( "Must provide valid survival file in order to use survival as metric\n" ) | |
88 } | |
89 | |
90 | |
91 ## From the ConsensusClusterPlust package - modified by phw | |
92 CDF <- function( ml, | |
93 breaks=1000, | |
94 plot.it=TRUE ){ | |
95 if ( class(ml[[1]])=="matrix" && ( names( ml[1] ) =="2" ) ) { | |
96 ml <- c( c(0), ml ) | |
97 } | |
98 ##plot CDF distribution | |
99 if ( plot.it ) { | |
100 plot( c(0), | |
101 xlim=c(0,1), | |
102 ylim=c(0,1), | |
103 col="white", | |
104 bg="white", | |
105 xlab="consensus index", | |
106 ylab="CDF", | |
107 main="consensus CDF", | |
108 las=2 ) | |
109 } | |
110 | |
111 k=length(ml) | |
112 this_colors <- rainbow(k-1) | |
113 areaK <- c() | |
114 for (i in 2:length( ml ) ) { | |
115 v <- ml[[i]] | |
116 v <- v[ lower.tri(v) ] | |
117 | |
118 #empirical CDF distribution. default number of breaks is 100 | |
119 h = hist(v, plot=FALSE, breaks=seq(0,1,by=1/breaks)) | |
120 h$counts = cumsum(h$counts)/sum(h$counts) | |
121 | |
122 #calculate area under CDF curve, by histogram method. | |
123 thisArea=0 | |
124 for (bi in 1:(length(h$breaks)-1)){ | |
125 thisArea = thisArea + h$counts[bi]*(h$breaks[bi+1]-h$breaks[bi]) #increment by height by width | |
126 } | |
127 areaK = c(areaK,thisArea) | |
128 if ( plot.it ) lines(h$mids,h$counts,col=this_colors[i-1],lwd=2,type='l') | |
129 } | |
130 if ( plot.it ) legend(0.8,0.5,legend=paste(rep("",k-1),seq(2,k,by=1),sep=""),fill=this_colors) | |
131 | |
132 #Calc area under CDF change. | |
133 deltaK=areaK[1] #initial auc at k=2 | |
134 for(i in 2:(length(areaK))){ | |
135 #proportional increase relative to prior K. | |
136 deltaK = c(deltaK,( areaK[i] - areaK[i-1])/areaK[i-1]) | |
137 } | |
138 return ( list( areaK=areaK, deltaK=deltaK ) ) | |
139 } | |
140 | |
141 | |
142 load( opt$results.file ) | |
143 | |
144 if ( opt$metric == "silhouette" ) { | |
145 if ( ! exists( 'data' ) && ( class( data ) != "matrix" ) ) { | |
146 stop( "Must provide valid data matrix in order to use silhouette as metric\n" ) | |
147 } | |
148 } | |
149 cons.matrices <- lapply( results[ 2:length(results) ], '[[', 'consensusMatrix' ) | |
150 cls <- lapply( results[ 2:length(results) ], function( res ) return( res$consensusClass[ res$consensusTree$order ] ) ) ##'[[', 'consensusClass' ) | |
151 names( cons.matrices ) <- names( cls ) <- 2:length( results ) | |
152 | |
153 png.fname <- file.path( opt$output.report.dir, "consensus.sel.criteria.CDF.png") | |
154 plot.dev <- png( png.fname, | |
155 width=11, | |
156 height=8.5, | |
157 units='in', | |
158 res=72 ) | |
159 ## this will calculate the CDF, plus plot them | |
160 rel.delta <- CDF( cons.matrices, breaks=1000, plot.it=TRUE )$deltaK | |
161 dev.off() | |
162 names( rel.delta ) <- seq( from=2, by=1, length=length( rel.delta ) ) | |
163 vector.of.metric.changes <- rel.delta | |
164 | |
165 main.txt <- ", per Size K" | |
166 ylab.txt <- "" | |
167 | |
168 main.txt <- paste( "Relative Change in Area", main.txt, sep="" ) | |
169 ylab.txt <- paste( "relative change in area under CDF curve", ylab.txt, sep="" ) | |
170 png.fname <- file.path( opt$output.report.dir, "consensus.sel.criteria.diff.png") | |
171 | |
172 plot.dev <- png( png.fname, | |
173 width=11, | |
174 height=8.5, | |
175 units='in', | |
176 res=72 ) | |
177 plot( as.numeric( names( vector.of.metric.changes ) ), | |
178 vector.of.metric.changes, | |
179 main=main.txt, | |
180 ylab=ylab.txt, | |
181 xlab="Cluster size (K)", | |
182 type='b' ) | |
183 dev.off() | |
184 | |
185 k.select <- vector.of.metric.changes[ vector.of.metric.changes < opt$change.min ] | |
186 if ( length( k.select ) > 1 ) { | |
187 k.select <- k.select[1] | |
188 } else { | |
189 if ( length( k.select ) == 0 ) { | |
190 k.select <- vector.of.metric.changes[ length( vector.of.metric.changes ) ] | |
191 } else { | |
192 ## do nothing | |
193 } | |
194 } | |
195 k.select <- as.numeric( names( k.select ) ) | |
196 ## find the search range | |
197 k.search.range <- (k.select-2):(k.select+2) | |
198 k.search.range <- k.search.range[ k.search.range %in% as.numeric( names( vector.of.metric.changes ) ) ] | |
199 k.search.range <- vector.of.metric.changes[ as.character( k.search.range ) ] | |
200 k.search.range <- k.search.range[ k.search.range < 0.25 ] | |
201 k.search.range <- k.search.range[ k.search.range > 0.025 ] | |
202 k.search.range <- names( k.search.range ) | |
203 | |
204 | |
205 if ( exists("data") ) { | |
206 ## what direction is the clustering in? rows or cols? | |
207 elts <- unique( names( results[[2]]$consensusClass ) ) | |
208 if ( all( elts %in% colnames( data ) ) ) { | |
209 ## sample clusters | |
210 data.dist <- dist( t( data ) ) | |
211 cls <- lapply( cls, function( x ) return( x[ colnames( data ) ] ) ) | |
212 } else if ( all( elts %in% rownames( data ) ) ) { | |
213 data.dist <- dist( data ) | |
214 cls <- lapply( cls, function( x ) return( x[ rownames( data ) ] ) ) | |
215 } else { | |
216 stop( "incompatible cluster results and data matrix\n" ) | |
217 } | |
218 | |
219 | |
220 sils <- lapply( cls, | |
221 silhouette, | |
222 dist=data.dist ) | |
223 sils <- sapply( sils, | |
224 function(x) { | |
225 return( summary( x )$avg.width ) | |
226 } | |
227 ) | |
228 | |
229 png.fname <- file.path( opt$output.report.dir, "consensus.sel.silhouette.png") | |
230 | |
231 plot.dev <- png( png.fname, | |
232 width=11, | |
233 height=8.5, | |
234 units='in', | |
235 res=72 ) | |
236 plot( as.numeric( names( sils ) ), | |
237 sils, | |
238 main="Average Silhouette Widths, per Cluster Size K", | |
239 ylab="average silhouette width (correlation distance)", | |
240 xlab="Cluster size (K)", | |
241 type='b' ) | |
242 dev.off() | |
243 | |
244 ## if the metric is silhouette, use that (but only over the k's that are on the rel-change "elbow" | |
245 if ( opt$metric == "silhouette" ) { | |
246 names( sils ) <- names( cls ) | |
247 | |
248 sils <- sils[ k.search.range ] | |
249 k.select <- as.numeric( names( sils[ sils == max( sils, na.rm=T ) ] ) ) | |
250 } | |
251 } | |
252 | |
253 if ( ! is.null( opt$survival.data ) ) { | |
254 if ( ! file.exists( opt$survival.data ) ) { | |
255 stop( 'specified clinical/survival file can not be found:', opt$survival.data, "\n" ) | |
256 } | |
257 | |
258 if ( opt$metric == "survival" ) { | |
259 pvals <- numeric() | |
260 | |
261 for ( cl in cls ) { | |
262 | |
263 cons.class.file <- tempfile( "tmp.class.rdata" ) | |
264 save( file=cons.class.file, cl ) | |
265 | |
266 cmd.string <- opt$survival.script | |
267 | |
268 ## get the consensusClass file that's associated with the k.select | |
269 cmd.string <- paste( cmd.string, "-C", cons.class.file ) | |
270 cmd.string <- paste( cmd.string, "-S", opt$survival.data ) | |
271 cmd.string <- paste( cmd.string, "-M", opt$survival.comp ) | |
272 cmd.string <- paste( cmd.string, "-P" ) | |
273 pvals <- c( pvals, as.numeric( system( cmd.string, intern=T ) ) ) | |
274 } | |
275 names( pvals ) <- names( cls ) | |
276 | |
277 | |
278 png.fname <- file.path( opt$output.report.dir, "consensus.sel.criteria.survival.png" ) | |
279 | |
280 plot.dev <- png( png.fname, | |
281 width=11, | |
282 height=8.5, | |
283 units='in', | |
284 res=72 ) | |
285 plot( as.numeric( names( pvals ) ), | |
286 -log( pvals ), | |
287 main="Average Log-rank p-values (-log), per Cluster Size K", | |
288 ylab="Average log-rank p-values (-log)", | |
289 xlab="Cluster size (K)", | |
290 type='b' ) | |
291 dev.off() | |
292 | |
293 | |
294 pvals <- pvals[ k.search.range ] | |
295 k.select <- as.numeric( names( pvals[ pvals == min( pvals, na.rm=T ) ] ) ) | |
296 } | |
297 | |
298 cmd.string <- opt$survival.script | |
299 | |
300 ## get the consensusClass file that's associated with the k.select | |
301 cl <- cls[[ as.character( k.select ) ]] | |
302 cl <- cbind( names( cl ), as.integer(cl) ) | |
303 colnames( cl ) <- c( "ID", "class" ) | |
304 write.table( cl, opt$cluster.class.out, sep="\t", row.names=FALSE, quote=FALSE ) | |
305 | |
306 cmd.string <- paste( cmd.string, "-c", opt$cluster.class.out ) | |
307 cmd.string <- paste( cmd.string, "-s", opt$survival.data ) | |
308 cmd.string <- paste( cmd.string, "-m", opt$survival.comp ) | |
309 | |
310 survival.out.file <- paste( opt$output.report.dir, "survival.png", sep="/" ) | |
311 cmd.string <- paste( cmd.string, "-o", survival.out.file ) | |
312 output <- system( cmd.string, intern=T ) | |
313 cat( output, sep="\n" ) | |
314 } else { | |
315 ## get the consensusClass file that's associated with the k.select | |
316 cl <- cls[[ as.character( k.select ) ]] | |
317 cl <- cbind( names( cl ), as.integer(cl) ) | |
318 colnames( cl ) <- c( "ID", "class" ) | |
319 write.table( cl, opt$cluster.class.out, sep="\t", row.names=FALSE, quote=FALSE ) | |
320 } | |
321 | |
322 treecl.res <- results[[ k.select ]]$consensusTree | |
323 ## cl should already exist, but re-create it just in case | |
324 cl <- cls[[ as.character( k.select ) ]] | |
325 | |
326 | |
327 select.result <- results[[ k.select ]] | |
328 ## over-write the tabular version of the opt$cluster.class.out with an RData file | |
329 save( file=opt$cluster.class.out, treecl.res, cl, select.result, data ) | |
330 | |
331 report_str = paste( "k selected by consensus clustering and user-specified metric, ", opt$metric, ", is ", k.select, "\n", sep="" ) | |
332 | |
333 pngs = list.files(path=opt$output.report.dir, patt="png") | |
334 html.out <- paste( "<html>", report_str, | |
335 paste( paste( paste( "<div><img src=\'", pngs, sep="" ), "\'/></div>", sep="" ), collapse=""), | |
336 "</html>" ) | |
337 cat( html.out, file=opt$output.report.html ) | |
338 |