diff cluster.tools/hclust.R @ 0:0decf3fd54bc draft

Uploaded
author peter-waltman
date Thu, 28 Feb 2013 01:45:39 -0500
parents
children a58527c632b7
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/hclust.R	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,87 @@
+#!/usr/bin/env Rscript
+
+argspec <- c("hclust.R help TBD
+                \n\n")
+args <- commandArgs(TRUE)
+if ( length( args ) == 1 && args =="--help") { 
+  write(argspec, stderr())
+  q();
+}
+
+lib.load.quiet <- function( package ) {
+   package <- as.character(substitute(package))
+   suppressPackageStartupMessages( do.call( "library", list( package=package ) ) )
+}
+lib.load.quiet(getopt)
+lib.load.quiet( amap )
+##  if any of the faster clustering methods are available on this system, load them
+if ( any( c( 'flashClust', 'fastcluster' ) %in% installed.packages() ) ) {
+  if ( 'flashClust' %in% installed.packages() ) {
+    lib.load.quiet( flashClust )
+  } else {
+    if ( 'fastcluster' %in% installed.packages() ) {
+      lib.load.quiet( fastcluster )
+    }
+  }
+}
+
+spec <- matrix( c( "data.fname",         "d", 1, "character",
+                   "num.k",              "k", 1, "integer",
+                   "distance.metric",    "m", 2, "character", 
+                   "dist.obj",           "D", 2, "logical",
+                   "direction",          "n", 2, "character",
+                   "linkage",            "l", 2, "character",
+                   "output.name",        "o", 2, "character"
+                   ),
+                nc=4,
+                byrow=TRUE
+               )
+
+opt <- getopt( spec=spec )
+
+if ( is.null( opt$distance.metric ) ) { opt$distance.metric <- "euclidean" }
+if ( is.null( opt$dist.obj ) ) { opt$dist.obj <- FALSE }
+if ( is.null( opt$direction ) ) { opt$direction <- "cols"  }
+if ( is.null( opt$linkage ) ) { opt$linkage <- "average" }
+if ( is.null( opt$num.k ) ) { opt$num.k <- 10 }
+if ( is.null( opt$output.name ) ) { opt$output.name <- "hclust.result.rda" }
+
+data <- as.matrix( read.delim( opt$data.fname, header=T, row.names=1 , check.names=FALSE ) )
+if ( opt$direction == "cols" ) {
+  ## need to transpose b/c both kmeans & pam cluster the rows
+  ## this shouldn't have an effect upon a distance matrix
+  data <- t( data )
+}
+if ( opt$num.k > nrow( data ) ) {
+  err.msg <- paste( "K specified is greater than the number of elements (", opt$direction, ") in data matrix to be clustereed\n", sep="" )
+  stop( err.msg )
+}
+
+if ( opt$dist.obj ) {
+  dist.mat <- as.dist( data )  
+} else {
+  ##  we're going to use the amap Dist function, but they misname their correlation
+  ##  functions, so re-name them correctly
+  amap.distance <- c( "euclidean", "maximum", "manhattan", "canberra", "binary",
+                      "pearson", "abspearson", "correlation", "abscorrelation", "spearman", "kendall" )
+  names( amap.distance ) <- c( "euclidean", "maximum", "manhattan", "canberra", "binary",
+                               "cosine", "abscosine", "pearson", "abspearson", "spearman", "kendall" )
+
+  if ( ! opt$distance.metric %in% names( amap.distance ) ) stop("unsupported distance.")
+  dist.mat <- Dist( data, method=as.character( amap.distance[ opt$distance.metric ] ) )
+  attr( dist.mat, "method" ) <- opt$distance.metric
+}
+
+##  now, do the clustering
+treecl.res <- hclust( dist.mat, method=opt$linkage )
+cutree.res <- cutree( treecl.res, k=opt$num.k )
+##cl <- cbind( names( cutree.res ), as.numeric( cutree.res ) )
+##colnames( cl ) <- c( "ID", "class" )
+
+if ( opt$direction == "cols" ) {
+  ## need to re-transpose the data back to it's original dimensionality
+  data <- t( data )
+}
+
+cl <- cutree.res
+save( file=opt$output.name, treecl.res, cl, data )