# HG changeset patch # User peter-waltman # Date 1362388288 18000 # Node ID 2efa1a284546a8df9791ce6aa8a9c7ccaa3b0e34 # Parent 3d3a8595b981daac91ba2ec2df9c48382da9c6e5 Uploaded diff -r 3d3a8595b981 -r 2efa1a284546 cluster.tools/dichotomize.sample.clusters.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cluster.tools/dichotomize.sample.clusters.R Mon Mar 04 04:11:28 2013 -0500 @@ -0,0 +1,121 @@ +#!/usr/bin/env Rscript +argspec <- c("tab.2.cdt.R converts a data matrix to cdt format + + Usage: + tab.2.cdt.R -d + Optional: + -o + \n\n") +args <- commandArgs(TRUE) +if ( length( args ) == 1 && args =="--help") { + write(argspec, stderr()) + q(); +} + +## some helper fn's +write.2.tab <- function( mat, + fname ) { + mat <- rbind( colnames( mat ), mat ) + mat <- cbind( c( "ID", rownames( mat )[-1] ), + mat ) + write.table( mat, fname, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE ) +} + +lib.load.quiet <- function( package ) { + package <- as.character(substitute(package)) + suppressPackageStartupMessages( do.call( "library", list( package=package ) ) ) +} + +lib.load.quiet( getopt ) +lib.load.quiet( ctc ) +if ( any( c( 'flashClust', 'fastcluster' ) %in% installed.packages() ) ) { + if ( 'flashClust' %in% installed.packages() ) { + lib.load.quiet( flashClust ) + } else { + if ( 'fastcluster' %in% installed.packages() ) { + lib.load.quiet( fastcluster ) + } + } +} + +spec <- matrix( c( "dataset", "d", 1, "character", + "num.k", "k", 2, "character", + "output.fname", "o", 2, "character" + ), + nc=4, + byrow=TRUE + ) + + +opt <- getopt( spec=spec ) +if ( is.null( opt$output.fname ) ) { opt$output.fname <- file.path( opt$output.report.dir, paste( "data", opt$output.format, sep="." ) ) } +if ( is.null( opt$num.k ) ) { + opt$num.k <- -1 +} else { + num.k <- as.integer( eval( parse( text=paste( "c(",gsub( "-", ":", gsub( ", |,", ",", opt$num.k ) ), ")" ) ) ) ) + num.k <- num.k[ ! is.na( num.k ) ] + if ( length( opt$num.k ) == 0 ) stop( 'invalid input for k_range specified:', opt$num.k, "\n" ) + + num.k <- num.k[ ! num.k %in% 1 ] # strip out a k==1 since that doesn't make any sense + opt$num.k <- num.k; rm( num.k ) +} + +load( opt$dataset ) +## if this is a clustering result w/cluster assignments ('raw' CCPLUS does not) +if ( exists( 'cl' ) ) { + k <- max( as.numeric( cl ) ) + cl <- matrix( cl, nc=1, dimnames=list( names(cl), k ) ) + if ( (length(opt$num.k)==1) && (opt$num.k == -1 ) ) opt$num.k <- k + + ## if this is a one-off to produce a phenotype for the number of clusters that the user originally proposed + if ( !opt$num.k[1] %in% c( -1, k ) ) { + + if ( exists( 'partcl.res' ) || exists( 'select.result' ) ) { + + if ( exists( 'partcl.res' ) ) { + warning( 'The k_range value(s) specified are:', + opt$num.k, + "however k_range vals can not specify alternate k values for partition clusters. Using the K value that corresponds to this result instead\n" ) + } else { + warning( 'The k_range value(s) specified are:', + opt$num.k, + "however k_range vals can not specify alternate k values for specific cluster results from CCPLUS (i.e. those from the Select K or Extract tools). To get alternate K values, re-run the dichotomizer on the 'raw' CCPLUS results. Using the K value that corresponds to this result instead\n" ) + } + + opt$num.k <- k + cl <- matrix( cl, nr=1, dimnames=list( k, names(cl) ) ) + } else { + ## handle if this is a hclust result + + opt$num.k <- opt$num.k[ opt$num.k < length( cl ) ] + cl.samps <- rownames( cl ) + cl <- sapply( opt$num.k, function(i) cutree( treecl.res, i )[ cl.samps ] ) + colnames( cl ) <- opt$num.k + } + } +} else if ( exists( 'results' ) ) { + ## handle if this is a ccplus-raw result + opt$num.k <- opt$num.k[ opt$num.k <= length( results ) ] + cl <- sapply( results[ opt$num.k ], '[[', 'consensusClass' ) + colnames( cl ) <- opt$num.k +} + +pheno.mat <- lapply( 1:ncol(cl), + function(i) { + x <- cl[,i] + cls <- ks <- sort( unique(x) ) + cls <- sapply( cls, function(y) as.numeric( x %in% y ) ) + colnames(cls) <- paste( "CLeq", ks, sep="" ) + rownames(cls) <- names(x) + return(cls) + } + ) +names( pheno.mat ) <- opt$num.k + +final.mat <- matrix( NA, nc=0, nrow=nrow(cl), dimnames=list( names(cl), NULL ) ) +for ( i in names( pheno.mat ) ) { + colnames( pheno.mat[[i]] ) <- paste( "Keq", i, "_", colnames( pheno.mat[[i]] ), sep="" ) + final.mat <- cbind( final.mat, pheno.mat[[i]] ) +} + +write.2.tab( final.mat, opt$output.fname ) diff -r 3d3a8595b981 -r 2efa1a284546 cluster.tools/dichotomize.sample.clusters.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cluster.tools/dichotomize.sample.clusters.py Mon Mar 04 04:11:28 2013 -0500 @@ -0,0 +1,14 @@ +#!/usr/bin/env python +import os +import sys +import subprocess + +select_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "dichotomize.sample.clusters.R") + +cmd_args = [ "Rscript", select_script_path ] + sys.argv[1:] + +proc = subprocess.Popen( cmd_args, stderr=subprocess.PIPE, stdout=subprocess.PIPE ) +(stdoutdata, stderrdata) = proc.communicate() +if proc.returncode: + sys.stderr.write(stderrdata) +sys.stdout.write(stdoutdata) diff -r 3d3a8595b981 -r 2efa1a284546 cluster.tools/dichotomize.sample.clusters.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cluster.tools/dichotomize.sample.clusters.xml Mon Mar 04 04:11:28 2013 -0500 @@ -0,0 +1,35 @@ + + dichotomize.sample.clusters.py +-d $dataset +#if str($k_range) != "-1": +-k ${k_range} +#end if +-o ${output_fname} + + + + + + + + + + +- **Sample Clustering Classification Result** - Specify the cluster result to analayze (MUST BE IN rdata format), and must contain the same objects that are produced by the 'Partition Clustering,' 'Hierarchical Clustering (HAC),' or 'Consensus Clustering' tools. + +- **Range of K to Dichotomize** Specify the maximum number of clusters to use; use -1 to indicate to use the default cluster size (i.e. if K-means and K was 5, 5 will be used) + * **To specify a range of K's to explore** users can use characters like commas (,), hyphens (-) and colons (:) to specify ranges. + * **Examples:** + * 2,4,6 - would generate a dictotomy for K's equal to 2, 4 and 6 + * 2,4:6 - would generate a dictotomy for K's equal to 2, 4, 5 and 6 + * 2,4-6 - would generate a dictotomy for K's equal to 2, 4, 5 and 6 + * 2, 5:7, 9, 11-14 - would generate a dictotomy for K's equal to 2, 5, 6, 7, 9, 11, 12, 13 and 14. + +- **NOTE,** however, the following limitations: + * IF cluster result is a **Partition** clustering (K-Means or PAM) - users **CAN NOT** specify alternate choices of K. + * IF cluster result is a result from the **Select K from Consensus Clustering** or **Extract Clustering from Consensus Clustering** tools - users **CAN NOT** specify alternate choices of K. + * IF cluster results is a "raw" result from Consensus Clustering, only the range of K's that Consensus Clustering considered can be used as alternate choices of K. Alternate choices of K outside of that range will be ignored. + * IF cluster result is a **Hierarchical** clustering - alternate choices of K that are greater than the total number of samples will be ignored + + + diff -r 3d3a8595b981 -r 2efa1a284546 cluster.tools/extract.cons.cluster.from.result.R --- a/cluster.tools/extract.cons.cluster.from.result.R Fri Mar 01 19:54:08 2013 -0500 +++ b/cluster.tools/extract.cons.cluster.from.result.R Mon Mar 04 04:11:28 2013 -0500 @@ -37,7 +37,7 @@ cons.matrices <- lapply( results[ 2:length(results) ], '[[', 'consensusMatrix' ) cls <- lapply( results[ 2:length(results) ], '[[', 'consensusClass' ) names( cons.matrices ) <- names( cls ) <- 2:length( results ) - +save.image( '~/tmp/extract.dbg.rda' ) ch.k.select <- as.character( opt$k.select ) if ( ch.k.select %in% names( cls ) ) { ## get the consensusClass file that's associated with the k.select @@ -51,6 +51,15 @@ ## re-order the samples to follow the cluster assignment treecl.res <- results[[ opt$k.select ]]$consensusTree select.result <- results[[ opt$k.select ]] + + if ( length(cl) == ncol(data) ) { + names( cl ) <- treecl.res$labels <- select.result$consensusTree$labels <- colnames(data) + } else if ( length(cl) == nrow(data) ) { + names( cl ) <- treecl.res$labels <- select.result$consensusTree$labels <- rownames(data) + } else { + stop( "Number of clustered elements not equal to either number of rows or columns of data matrix\n" ) + } + save( file=opt$output.select.rdata, treecl.res, cl, select.result, data ) } else { stop( 'no valid output format specified\n' )