Mercurial > repos > peter-waltman > ucsc_cluster_tools2

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/dichotomize.sample.clusters.R	Mon Mar 04 04:11:28 2013 -0500
@@ -0,0 +1,121 @@
+#!/usr/bin/env Rscript
+argspec <- c("tab.2.cdt.R converts a data matrix to cdt format
+
+        Usage:
+                tab.2.cdt.R -d <data.file>
+        Optional:
+                            -o <output_file>
+                \n\n")
+args <- commandArgs(TRUE)
+if ( length( args ) == 1 && args =="--help") {
+  write(argspec, stderr())
+  q();
+}
+
+## some helper fn's
+write.2.tab <- function( mat,
+                         fname ) {
+  mat <- rbind( colnames( mat ), mat )
+  mat <- cbind( c( "ID", rownames( mat )[-1] ),
+                      mat )
+  write.table( mat, fname, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE )
+}
+
+lib.load.quiet <- function( package ) {
+   package <- as.character(substitute(package))
+   suppressPackageStartupMessages( do.call( "library", list( package=package ) ) )
+}
+
+lib.load.quiet( getopt )
+lib.load.quiet( ctc )
+if ( any( c( 'flashClust', 'fastcluster' ) %in% installed.packages() ) ) {
+  if ( 'flashClust' %in% installed.packages() ) {
+    lib.load.quiet( flashClust )
+  } else {
+    if ( 'fastcluster' %in% installed.packages() ) {
+      lib.load.quiet( fastcluster )
+    }
+  }
+}
+
+spec <- matrix( c( "dataset",             "d", 1, "character",
+                   "num.k",               "k", 2, "character",
+                   "output.fname",        "o", 2, "character"
+                   ),
+                nc=4,
+                byrow=TRUE
+               )
+
+
+opt <- getopt( spec=spec )
+if ( is.null( opt$output.fname ) ) { opt$output.fname <- file.path( opt$output.report.dir, paste( "data", opt$output.format, sep="." ) ) }
+if ( is.null( opt$num.k ) ) {
+  opt$num.k <- -1
+} else {
+  num.k <- as.integer( eval( parse( text=paste( "c(",gsub( "-", ":", gsub( ", |,", ",", opt$num.k ) ), ")" ) ) ) )
+  num.k <- num.k[ ! is.na( num.k ) ]
+  if ( length( opt$num.k ) == 0 ) stop( 'invalid input for k_range specified:', opt$num.k, "\n" )
+
+  num.k <- num.k[ ! num.k %in% 1 ]  # strip out a k==1 since that doesn't make any sense
+  opt$num.k <- num.k; rm( num.k )
+}
+
+load( opt$dataset )
+## if this is a clustering result w/cluster assignments ('raw' CCPLUS does not)
+if ( exists( 'cl' ) ) {
+  k <- max( as.numeric( cl ) )
+  cl <- matrix( cl, nc=1, dimnames=list( names(cl), k ) )
+  if ( (length(opt$num.k)==1) && (opt$num.k == -1 ) ) opt$num.k <- k
+
+  ## if this is a one-off to produce a phenotype for the number of clusters that the user originally proposed
+  if ( !opt$num.k[1] %in% c( -1, k ) ) {
+
+    if ( exists( 'partcl.res' ) || exists( 'select.result' ) ) {
+
+      if ( exists( 'partcl.res' ) ) {
+        warning( 'The k_range value(s) specified are:',
+                 opt$num.k,
+                 "however k_range vals can not specify alternate k values for partition clusters. Using the K value that corresponds to this result instead\n" )
+      } else {
+        warning( 'The k_range value(s) specified are:',
+                 opt$num.k,
+                 "however k_range vals can not specify alternate k values for specific cluster results from CCPLUS (i.e. those from the Select K or Extract tools). To get alternate K values, re-run the dichotomizer on the 'raw' CCPLUS results. Using the K value that corresponds to this result instead\n" )
+      }
+
+      opt$num.k <- k
+      cl <- matrix( cl, nr=1, dimnames=list( k, names(cl) ) )
+    } else {
+      ## handle if this is a hclust result
+
+      opt$num.k <- opt$num.k[ opt$num.k < length( cl ) ]
+      cl.samps <- rownames( cl )
+      cl <- sapply( opt$num.k, function(i) cutree( treecl.res, i )[ cl.samps ] )
+      colnames( cl ) <- opt$num.k
+    }
+  }
+} else if ( exists( 'results' ) ) {
+  ## handle if this is a ccplus-raw result
+  opt$num.k <- opt$num.k[ opt$num.k <= length( results ) ]
+  cl <- sapply( results[ opt$num.k ], '[[', 'consensusClass' )
+  colnames( cl ) <- opt$num.k
+}
+
+pheno.mat <- lapply( 1:ncol(cl),
+                     function(i) {
+                       x <- cl[,i]
+                       cls <- ks <- sort( unique(x) )
+                       cls <- sapply( cls, function(y) as.numeric( x %in% y ) )
+                       colnames(cls) <- paste( "CLeq", ks, sep="" )
+                       rownames(cls) <- names(x)
+                       return(cls)
+                     }
+                    )
+names( pheno.mat ) <- opt$num.k
+
+final.mat <- matrix( NA, nc=0, nrow=nrow(cl), dimnames=list( names(cl), NULL ) )
+for ( i in names( pheno.mat ) ) {
+  colnames( pheno.mat[[i]] ) <- paste( "Keq", i, "_", colnames( pheno.mat[[i]] ), sep="" )
+  final.mat <- cbind( final.mat, pheno.mat[[i]] )
+}
+
+write.2.tab( final.mat, opt$output.fname )
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/dichotomize.sample.clusters.py	Mon Mar 04 04:11:28 2013 -0500
@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+import os
+import sys
+import subprocess
+
+select_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "dichotomize.sample.clusters.R")
+
+cmd_args = [ "Rscript", select_script_path ] + sys.argv[1:]
+
+proc = subprocess.Popen( cmd_args, stderr=subprocess.PIPE, stdout=subprocess.PIPE )
+(stdoutdata, stderrdata) = proc.communicate()
+if proc.returncode:
+	sys.stderr.write(stderrdata)
+sys.stdout.write(stdoutdata)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/dichotomize.sample.clusters.xml	Mon Mar 04 04:11:28 2013 -0500
@@ -0,0 +1,35 @@
+<tool id="dichotomize_sample_clusters" name="Dichotomize Previous Sample Cluster Result" force_history_refresh="True">
+  <command interpreter="python">dichotomize.sample.clusters.py
+-d $dataset
+#if str($k_range) != "-1":
+-k ${k_range}
+#end if
+-o ${output_fname}
+
+</command>
+    <inputs>
+      <param format="rdata" name="dataset" type="data" label="Sample Clustering Classification" help="Cluster result file from CCPLUS, HAC, or PAM"/>
+      <param name="k_range" type="text" label="Range of K to Dichotomize" value="-1" help="Specify the number of clusters to use (-1 to use default; see help below)"/>
+    </inputs>
+    <outputs>
+        <data format="tabular" name="output_fname" label="Dichotomized Sample Clusters (phenotype)"/>
+    </outputs>
+<help>
+- **Sample Clustering Classification Result** - Specify the cluster result to analayze (MUST BE IN rdata format), and must contain the same objects that are produced by the 'Partition Clustering,' 'Hierarchical Clustering (HAC),' or 'Consensus Clustering' tools.
+
+- **Range of K to Dichotomize** Specify the maximum number of clusters to use; use -1 to indicate to use the default cluster size (i.e. if K-means and K was 5, 5 will be used)
+       * **To specify a range of K's to explore** users can use characters like commas (,), hyphens (-) and colons (:) to specify ranges.
+       * **Examples:**
+               * 2,4,6 - would generate a dictotomy for K's equal to 2, 4 and 6
+	       * 2,4:6 - would generate a dictotomy for K's equal to 2, 4, 5 and 6
+	       * 2,4-6 - would generate a dictotomy for K's equal to 2, 4, 5 and 6
+	       * 2, 5:7, 9, 11-14 - would generate a dictotomy for K's equal to 2, 5, 6, 7, 9, 11, 12, 13 and 14.
+
+- **NOTE,** however, the following limitations:
+       * IF cluster result is a **Partition** clustering (K-Means or PAM) - users **CAN NOT** specify alternate choices of K.
+       * IF cluster result is a result from the **Select K from Consensus Clustering** or **Extract Clustering from Consensus Clustering** tools - users **CAN NOT** specify alternate choices of K.
+       * IF cluster results is a "raw" result from Consensus Clustering, only the range of K's that Consensus Clustering considered can be used as alternate choices of K.  Alternate choices of K outside of that range will be ignored.
+       * IF cluster result is a **Hierarchical** clustering - alternate choices of K that are greater than the total number of samples will be ignored
+
+</help>
+</tool>
--- a/cluster.tools/extract.cons.cluster.from.result.R	Fri Mar 01 19:54:08 2013 -0500
+++ b/cluster.tools/extract.cons.cluster.from.result.R	Mon Mar 04 04:11:28 2013 -0500
@@ -37,7 +37,7 @@
 cons.matrices <- lapply( results[ 2:length(results) ], '[[', 'consensusMatrix' )
 cls <- lapply( results[ 2:length(results) ], '[[', 'consensusClass' )
 names( cons.matrices ) <- names( cls ) <- 2:length( results )
-
+save.image( '~/tmp/extract.dbg.rda' )
 ch.k.select <- as.character( opt$k.select )
 if ( ch.k.select %in% names( cls ) ) {
   ## get the consensusClass file that's associated with the k.select
@@ -51,6 +51,15 @@
     ## re-order the samples to follow the cluster assignment
     treecl.res <- results[[ opt$k.select ]]$consensusTree
     select.result <- results[[ opt$k.select ]]
+
+    if ( length(cl) == ncol(data) ) {
+      names( cl ) <- treecl.res$labels <- select.result$consensusTree$labels <- colnames(data)
+    } else if ( length(cl) == nrow(data) ) {
+      names( cl ) <- treecl.res$labels <- select.result$consensusTree$labels <- rownames(data)
+    } else {
+      stop( "Number of clustered elements not equal to either number of rows or columns of data matrix\n" )
+    }
+
     save( file=opt$output.select.rdata, treecl.res, cl, select.result, data )
   } else {
     stop( 'no valid output format specified\n' )