# HG changeset patch
# User peter-waltman
# Date 1362033939 18000
# Node ID 0decf3fd54bc4570a98078388015db0c88eb6747

Uploaded

diff -r 000000000000 -r 0decf3fd54bc cluster.tools/cluster.2.centroid.R
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/cluster.2.centroid.R	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,137 @@
+#!/usr/bin/env Rscript
+argspec <- c("cluster.2.centroid.R computes the matrix of cluster centroids for a cluster result
+        Usage: 
+                cluster.2.centroid.R -d <cluster.result.rdata>
+        Optional:
+                -g <yes|no>            ## re-calculate a dendrogram for the centroid matrix (default: no)
+                -o <output.fname>      ## RData file to write (default: report/data.RData)
+                \n\n")
+args <- commandArgs(TRUE)  ## trailing command-line arguments only
+if ( length( args ) == 1 && args == "--help" ) {  ## a lone --help prints the usage text and quits
+  write( argspec, stderr() )
+  q()
+}
+
+lib.load.quiet <- function( package ) {
+   pkg.name <- as.character( substitute( package ) )  ## takes a bare (unquoted) package name
+   suppressPackageStartupMessages( library( pkg.name, character.only=TRUE ) )  ## attach without the startup chatter
+}
+
+lib.load.quiet( getopt )
+lib.load.quiet( amap )
+
+##  prefer a faster hclust implementation when one is installed (flashClust first, then fastcluster);
+##  match on package names only, not on every cell of the installed.packages() matrix
+pkgs.available <- rownames( installed.packages() )
+if ( 'flashClust' %in% pkgs.available ) {
+  lib.load.quiet( flashClust )
+} else if ( 'fastcluster' %in% pkgs.available ) {
+  lib.load.quiet( fastcluster )
+}
+
+spec <- matrix( c( "dataset",             "d", 1, "character",
+                   "gen.new.dgram",       "g", 2, "character",
+                   "output.fname",        "o", 2, "character"
+                   ),
+                ncol=4,
+                byrow=TRUE
+               )
+
+
+opt <- getopt( spec=spec )
+if ( is.null( opt$dataset ) ) stop( "no cluster result specified; use -d <rdata file>\n" )
+##  spec defines no report-dir flag, so this always falls back to ./report (or ./heatmap.report
+##  when something that is not a directory is already called 'report')
+if ( is.null( opt$output.report.dir ) ) {
+  opt$output.report.dir <- "report"
+  if ( ! file.exists( opt$output.report.dir ) ) {
+    dir.create( opt$output.report.dir )
+  } else if ( ! file.info( 'report' )$isdir ) {
+    opt$output.report.dir <- 'heatmap.report'
+    dir.create( opt$output.report.dir )
+  }
+}
+if ( is.null( opt$output.fname ) ) { opt$output.fname <- file.path( opt$output.report.dir, "data.RData" ) }
+if ( is.null( opt$gen.new.dgram ) ) {
+  opt$gen.new.dgram <- FALSE
+} else {
+  if ( ! opt$gen.new.dgram %in% c( "no", "yes" ) ) {
+    stop( "invalid input to gen.new.dgram param: ", opt$gen.new.dgram, "\n" )
+  }
+  ##  set to TRUE/FALSE
+  opt$gen.new.dgram <- ( opt$gen.new.dgram == "yes" )
+}
+
+
+##  objects are loaded into this (top-level) environment; the exists() checks below look only there
+load( opt$dataset )  ## should load the cl, treecl.res (or partcl.res) and data
+
+if ( ! exists( 'data', inherits=FALSE ) ) stop( "No data object in the rdata file provided!!\n" )  ## inherits=FALSE: utils::data() must not count
+if ( inherits( data, "dist" ) ) stop( "data provided is a distance matrix - not a data matrix.  Can't calculate centroids w/distance matrices!\n" )
+if ( ! exists( 'cl', inherits=FALSE ) ) stop( "No cl (cluster assignment) object in the rdata file provided!!\n" )
+
+##  pick the cluster result object: a tree result takes precedence over a partition result
+direction <- NULL
+if ( exists( 'treecl.res' ) ) {
+  ##  default the metric BEFORE copying, so that cl.res (read below for the new dendrogram) carries it too
+  if ( is.null( treecl.res$dist.method ) ) treecl.res$dist.method <- 'euclidean'  # just set it to some stub so that the ctc fn's don't complain
+  cl.res <- treecl.res
+} else if ( exists( 'partcl.res' ) ) {
+  cl.res <- partcl.res
+} else {
+  stop( 'could not find a valid cluster result to use for primary direction\n' )
+}
+
+##  work out which dimension was clustered; from here on the clustered items are always the rows
+if ( all( names( cl ) %in% rownames( data ) ) ) {
+  direction <- "rows"
+} else if ( all( names( cl ) %in% colnames( data ) ) ) {
+  direction <- "cols"
+  data <- t( data )
+} else {
+  stop( "Specified cluster result does not come from this data set\n" )
+}
+
+
+centroids <- NULL
+cl <- sort( cl )
+cl.ids <- unique( cl )  ## sorted cluster labels, one per centroid row
+if ( inherits( cl.res, "kmeans" ) ) {
+  ## already comes pre-calculated for us!!
+  centroids <- cl.res$centers
+} else {
+  data <- data[ names( cl ), , drop=FALSE ]
+  ##  one row of column means per cluster; rbind keeps a clusters x features matrix even when
+  ##  data has a single column (the previous sapply + t collapsed that case)
+  centroids <- do.call( rbind,
+                        lapply( cl.ids,
+                                function(i) colMeans( data[ names( cl )[ cl %in% i ], , drop=FALSE ], na.rm=TRUE ) ) )
+}
+
+data <- centroids
+rownames( data ) <- sprintf( "cluster-%02d", cl.ids )  ## label rows by the actual cluster ids, not 1:max( cl )
+
+if ( opt$gen.new.dgram ) {
+  distance <- 'euclidean'
+  if ( inherits( cl.res, 'hclust' ) ) {
+    distance <- cl.res$dist.method
+  }
+  amap.distance <- c( "euclidean", "maximum", "manhattan", "canberra", "binary",
+                      "pearson", "abspearson", "correlation", "abscorrelation", "spearman", "kendall" )
+  names( amap.distance ) <- c( "euclidean", "maximum", "manhattan", "canberra", "binary",
+                               "cosine", "abscosine", "pearson", "abspearson", "spearman", "kendall" )
+
+  if ( ! distance %in% names( amap.distance ) ) stop("unsupported distance.")
+  dist.mat <- Dist( data, method=as.character( amap.distance[ distance ] ) )
+  treecl.res <- hclust( dist.mat )
+  cl <- cutree( treecl.res, nrow(data) )
+}
+
+if ( direction == "cols" ) {
+  data <- t( data )
+}
+
+##  treecl.res only exists when the input held a tree result or a new dendrogram was built above,
+##  so save just the objects that are actually present instead of failing on a missing one
+to.save <- Filter( function(nm) exists( nm, envir=globalenv(), inherits=FALSE ), c( "treecl.res", "cl", "data" ) )
+save( file=opt$output.fname, list=to.save )
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/cluster.2.centroid.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/cluster.2.centroid.py	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+# Galaxy wrapper: runs cluster.2.centroid.R (located beside this file) with our command-line args.
+import os
+import subprocess
+import sys
+select_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "cluster.2.centroid.R")
+cmd_args = ["Rscript", select_script_path] + sys.argv[1:]
+# universal_newlines=True makes the pipes return text, so the writes below work on Python 2 and 3.
+proc = subprocess.Popen(cmd_args, stderr=subprocess.PIPE, stdout=subprocess.PIPE, universal_newlines=True)
+(stdoutdata, stderrdata) = proc.communicate()
+if proc.returncode:  # R's stderr is forwarded only on failure, as before
+    sys.stderr.write(stderrdata)
+sys.stdout.write(stdoutdata)
+sys.exit(proc.returncode)  # propagate the R exit status instead of always exiting 0
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/cluster.2.centroid.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/cluster.2.centroid.xml	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,41 @@
+<tool id="cluster_2_centroid" name="Calculate Matrix of Cluster Centroids (Mean Euclidean)" force_history_refresh="True">
+    <command interpreter="python">cluster.2.centroid.py
+-d $dataset 
+-g ${gen_new_dgram}
+-o ${output_fname}
+
+</command>
+    <inputs>
+    	<param name="dataset" type="data" format='rdata' label="Cluster Result" help="Specify the cluster result to analyze (MUST BE IN rdata format; see help below)"/>
+	<param name="gen_new_dgram" type='select' display="radio" label="Re-calculate cluster tree for new matrix ?" help="Specify whether or not to re-calculate a dendrogram for the cluster centroid matrix">
+	  <option value="no">No</option>
+	  <option value="yes" selected='true' >Yes</option>
+	</param>
+
+    </inputs>
+    <outputs>
+        <data format="rdata" name="output_fname" label="Cluster Centroid Matrix (RData)" />
+    </outputs>
+<help>
+.. class:: infomark
+     
+**Calculate Matrix of Cluster Centroids** - Tool to calculate a new matrix containing the cluster centroids.  NOTE: this tool will automatically detect the dimensionality of the clusters (rows or columns).
+
+**OUTPUT:**  A new RData file containing the matrix of cluster centroids
+
+----
+
+**Parameters**
+
+- **Cluster Result** - Specify the cluster result to analyze (MUST BE IN rdata format); it must contain the same objects that are produced by the 'Partition Clustering,' 'Hierarchical Clustering (HAC),' or 'Consensus Clustering' tools.  Specifically, it must contain the following objects
+
+         * A 'treecl.res' or 'partcl.res' object - corresponding to whether the cluster results is from a partition or tree clustering method
+         * A 'data' object that contains the data that was passed into the clustering method.  NOTE, it is better for this to be the actual data passed in, rather than the data prior to the pre-processing that was performed prior to the actual clustering.
+
+- **Re-calculate cluster tree for new matrix?** - Specify whether or not to re-calculate a dendrogram for the cluster centroid matrix.
+        * IF the cluster result was a tree cluster, the new dendrogram will use the distance method from the original clustering
+	* IF the cluster result was a partition cluster, the dendrogram will use 'Euclidean' distance
+
+
+</help>
+</tool>
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/consensus.clustering.R
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/consensus.clustering.R	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,774 @@
+#!/usr/bin/env Rscript
+## Consensus Clustering Script by Peter Waltman
+## May 31, 2011
+## License under Creative Commons Attribution 3.0 Unported (CC BY 3.0)
+##
+#usage, options and doc goes here
+argspec <- c("consensus.clustering.R runs consensus clustering (code adapted from ConsensusClusterPlus)
+on the supplied data file
+
+        Usage: 
+                consensus.clustering.R -d <data.file> 
+        Optional:
+                -o <output.name>
+                -a <cluster.alg>  ## must be either 'hc' or 'km'
+                -m <distance.metric> ## must be one supported by ConsensusClusterPlus
+                -k <max.k>
+                -r <reps>
+                -f <filter>            ## filter, o/w no filtering
+
+                \n\n")
+args <- commandArgs(TRUE)  ## trailing command-line arguments only
+if ( length( args ) == 1 && args == "--help" ) {  ## a lone --help prints the usage text and quits
+  write( argspec, stderr() )
+  q()
+}
+
+lib.load.quiet <- function( package ) {
+   pkg.name <- as.character( substitute( package ) )  ## takes a bare (unquoted) package name
+   suppressPackageStartupMessages( library( pkg.name, character.only=TRUE ) )  ## attach without the startup chatter
+}
+lib.load.quiet(getopt)
+lib.load.quiet( gplots )
+lib.load.quiet( amap )
+##  if any of the faster clustering methods are available on this system, load them
+##  (flashClust is preferred, fastcluster is the fallback).  Match on package names only,
+##  not on every cell of the installed.packages() matrix.
+pkgs.available <- rownames( installed.packages() )
+if ( 'flashClust' %in% pkgs.available ) {
+  lib.load.quiet( flashClust )
+} else if ( 'fastcluster' %in% pkgs.available ) {
+  lib.load.quiet( fastcluster )
+}
+##lib.load.quiet(ConsensusClusterPlus)
+##  amap is already attached above, so it is not loaded a second time here
+##  cluster provides pam(), which ccRun calls when clusterAlg == "pam"
+lib.load.quiet( cluster )
+
+###################
+## code borrowed/updated from ConsensusClusterPlus
+###################
+
+ConsensusClusterPlus <- function( d=NULL,
+                                  maxK = 3,
+                                  reps=10,
+                                  pItem=0.8,
+                                  pFeature=1,
+                                  clusterAlg="hc",
+                                  title="untitled_consensus_cluster",
+                                  innerLinkage="average",
+                                  finalLinkage="average",
+                                  distance=ifelse( inherits(d,"dist"), attr( d, "method" ), "euclidean" ),
+                                  ml=NULL,
+                                  tmyPal=NULL,
+                                  seed=NULL,
+                                  plot=NULL,
+                                  writeTable=FALSE,
+                                  weightsItem=NULL,
+                                  weightsFeature=NULL,
+                                  verbose=FALSE ) {
+  ## Runs consensus clustering for k = 2..maxK: ccRun() builds the consensus matrices (unless 'ml' is
+  ## supplied), each one is clustered with hclust( method=finalLinkage ) and plotted.  Returns a list whose
+  ## element 1 is the colour-tracking matrix and element k is list( consensusMatrix, consensusTree, consensusClass, ml, clrs ).
+  if ( is.null( seed ) ) {
+    seed <- timeSeed <- as.numeric( Sys.time() )
+  }
+  set.seed(seed)
+
+  if ( is.null( ml ) ) {
+
+    if ( inherits( distance, "dist" ) ) {
+      stop( "If you want to pass in a pre-calculated distance object, pass it in as the data, rather than the distance parameter\n" )
+    }
+    ## inherits(), not class( d ) %in% ...: a matrix has class c("matrix","array") in R >= 4.0, which made the old test a length-2 condition
+    if ( ! inherits( d, c( "dist", "matrix", "ExpressionSet" ) ) ) {
+      stop("d must be a matrix, distance object or ExpressionSet (eset object)")
+    }
+
+    if ( inherits( d, "dist" ) ) {
+      ## if d is a distance matrix, fix a few things so that they don't cause problems with the analysis
+      ##  Note, assumption is that if d is a distance matrix, the user doesn't want to sample over the row features
+      if ( is.null( attr( d, "method" ) ) ) {
+        attr( d, "method" ) <- distance <- "unknown - user-specified"
+      }
+      if ( is.null( distance ) || ( distance != attr( d, "method" ) ) ) {
+        distance <- attr( d, "method" )
+      }
+
+      if ( ( ! is.null( pFeature ) ) && ( pFeature < 1 ) ) {
+        if ( verbose ) warning( "Cannot use the pFeatures parameter when specifying a distance matrix as the data object\n" )
+        pFeature <- 1
+      }
+      if ( ! is.null( weightsFeature ) ) {
+        if ( verbose ) warning( "Cannot use the weightsFeature parameter when specifying a distance matrix as the data object\n" )
+        weightsFeature <- NULL
+      }
+      if ( clusterAlg == "km" ) {
+        if ( verbose ) warning( "You are asking CCPLUS to use K-means to cluster a distance matrix (rather than the data itself) - this may produce unintended results. We suggest using PAM if you want to use alternate distance metrics/objects\n" )
+        ##d <- as.matrix( d )  #this is now done w/in ccRun
+      }
+    } else {
+      if ( is.null( distance ) ) {
+        ## we should never get here, but just in case
+        distance <- "pearson"
+      }
+    }
+
+    if ( ( clusterAlg == "km" ) && inherits( distance, "character" ) && ( distance != "euclidean" ) ) {
+      warning( "WARNING: kmeans can only use the euclidean distance metric.  If you would like to use an alternate metric, we suggest using PAM or HC clustering instead. This parameter combination will use k-means, but will NOT use the specified distance metric\n" )
+      distance <- 'euclidean'
+    }
+
+
+    if ( inherits( d,"ExpressionSet" ) ) {
+      d <- exprs(d)
+    }
+
+    ml <- ccRun( d=d,
+                 maxK=maxK,
+                 repCount=reps,
+                 diss=inherits(d,"dist"),
+                 pItem=pItem,
+                 pFeature=pFeature,
+                 innerLinkage=innerLinkage,
+                 clusterAlg=clusterAlg,
+                 weightsFeature=weightsFeature,
+                 weightsItem=weightsItem,
+                 distance=distance,
+                 verbose=verbose)
+  }
+  res <- list()
+
+  ##make results directory (scalar || and && : this is a single yes/no decision)
+  if ( ( ! is.null( plot ) || writeTable ) && ! file.exists( paste(title,sep="") ) ) {
+    dir.create(paste(title,sep=""))
+  }
+
+  ##write log file
+  log <- matrix( ncol=2,
+                 byrow=TRUE,
+                 c("title",title,
+                   "maxK",maxK,
+                   "input matrix rows",ifelse ( inherits( d, "matrix" ), nrow(d), "dist-mat" ), 
+                   "input matric columns",ifelse ( inherits( d, "matrix" ), ncol(d), ncol( as.matrix(d) ) ), 
+                   "number of bootstraps",reps,
+                   "item subsampling proportion",pItem,
+                   "feature subsampling proportion",ifelse( is.null(pFeature), 1, pFeature ),
+                   "cluster algorithm",clusterAlg,
+                   "inner linkage type",innerLinkage,
+                   "final linkage type",finalLinkage,
+                   "correlation method",distance,
+                   "plot",if(is.null(plot)) NA else plot,
+                   "seed",if(is.null(seed)) NA else seed))
+  colnames(log) <- c("option","value")
+  if(writeTable){
+    write.csv(file=paste(title,"/",title,".log.csv",sep=""), log,row.names=FALSE)
+  }
+  if(is.null(plot)){
+    ##nothing: plots go to whatever graphics device is current
+  }else if(plot=="png"){
+    png(paste(title,"/","consensus%03d.png",sep=""))
+  }else if (plot=="pdf"){
+    pdf(onefile=TRUE, paste(title,"/","consensus.pdf",sep=""))
+  }else if (plot=="ps"){
+    postscript(onefile=TRUE, paste(title,"/","consensus.ps",sep=""))
+  }
+
+  colorList <- list()
+  colorM <- rbind() #matrix of colors: one row per k, one column per item.
+
+  ##colors for marking different clusters
+  thisPal <- c("#A6CEE3","#1F78B4","#B2DF8A","#33A02C","#FB9A99","#E31A1C","#FDBF6F","#FF7F00","#CAB2D6","#6A3D9A","#FFFF99","#B15928",
+               "#bd18ea", #magenta
+               "#2ef4ca", #aqua
+               "#f4cced", #pink,
+               "#f4cc03", #lightorange
+               "#05188a", #navy,
+               "#e5a25a", #light brown
+               "#06f106", #bright green
+               "#85848f", #med gray
+               "#000000", #black
+               "#076f25", #dark green
+               "#93cd7f",#lime green
+               "#4d0776", #dark purple
+               "#ffffff" #white
+               )
+
+  ##plot scale
+  colBreaks <- NA
+  if ( is.null( tmyPal ) ) {
+    colBreaks <- 10
+    tmyPal <- myPal(colBreaks)
+  }else{
+    colBreaks <- length(tmyPal)
+  }
+  sc <- cbind(seq(0,1,by=1/( colBreaks) )); rownames(sc) <- sc[,1]
+  sc <- cbind(sc,sc)
+  heatmap(sc, Colv=NA, Rowv=NA, symm=FALSE, scale='none', col=tmyPal, na.rm=TRUE,labRow=rownames(sc),labCol=FALSE,main="consensus matrix legend")
+
+  for (tk in 2:maxK){
+    if(verbose){
+      message(paste("consensus ",tk))
+    }
+    fm <- ml[[tk]]
+    hc <- hclust( as.dist( 1 - fm ), method=finalLinkage)
+    message("clustered")
+    ct <- cutree(hc,tk)
+    names(ct) <- if ( inherits( d, "dist" ) ) labels( d ) else colnames( d )  ## a dist object has no colnames
+    consMat <- fm
+    ##colnames(consMat) = colnames(d)
+    ##rownames(consMat) = colnames(d)
+    ## at tk == 2 res[[1]] does not exist yet; setClusterColors never touches past_ct while colorList is empty
+    colorList <- setClusterColors(res[[tk-1]][[3]],ct,thisPal,colorList)
+
+    pc <- consMat
+    pc <- pc[hc$order,] #***pc is matrix for plotting, same as consMat but is row-ordered and has an extra row of zeros.
+    pc <- rbind(pc,0)
+
+    heatmap(pc, Colv=as.dendrogram(hc), Rowv=NA, symm=FALSE, scale='none', col=tmyPal, na.rm=TRUE,labRow=FALSE,labCol=FALSE,margins=c(5,5),main=paste("consensus matrix k=",tk,sep="") , ColSideColors=colorList[[1]])
+    legend("topright",legend=unique(ct),fill=unique(colorList[[1]]),horiz=FALSE )
+
+    res[[tk]] <- list(consensusMatrix=consMat,consensusTree=hc,consensusClass=ct,ml=ml[[tk]],clrs=colorList)
+    colorM <- rbind(colorM,colorList[[1]]) 
+  }
+  CDF(ml)
+  clusterTrackingPlot(colorM[ , res[[length(res)]]$consensusTree$order, drop=FALSE ])  ## drop=FALSE keeps a matrix when maxK == 2
+  if ( ! is.null( plot ) ) {
+    dev.off()
+  }
+  res[[1]] <- colorM
+  if(writeTable){
+    for(i in 2:length(res)){
+      write.csv(file=paste(title,"/",title,".k=",i,".consensusMatrix.csv",sep=""), res[[i]]$consensusMatrix)
+      write.table(file=paste(title,"/",title,".k=",i,".consensusClass.csv",sep=""), res[[i]]$consensusClass,col.names = FALSE,sep=",")
+    }
+  }
+  return(res)
+}
+
+
+calcICL = function(res,title="untitled_consensus_cluster",plot=NULL,writeTable=FALSE){
+  ## calculates and plots cluster consensus and item consensus for each k of a ConsensusClusterPlus() result;
+  ## returns list( clusterConsensus=<matrix: k, cluster, clusterConsensus>, itemConsensus=<data.frame> )
+  cc <- cci <- NULL
+  sumRes <- list()
+  colorsArr <- c()
+
+  #make results directory
+  if ( ( ! is.null( plot ) || writeTable ) && ! file.exists( paste(title,sep="") ) ) {
+    dir.create(paste(title,sep=""))
+  }
+  if(is.null(plot)){
+    #to screen
+  }else if(plot=="pdf"){
+    pdf(onefile=TRUE, paste(title,"/","icl.pdf",sep=""))
+  }else if(plot=="ps"){
+    postscript(onefile=TRUE, paste(title,"/","icl.ps",sep=""))
+  }else if (plot=="png"){
+    png(paste(title,"/","icl%03d.png",sep=""))
+  }
+
+  par(mfrow=c(3,1),mar=c(4,3,2,0))
+
+  for (k in 2:length(res)){ #each k
+    eiCols <- c()
+    o <- res[[k]]
+    m <- o$consensusMatrix
+    m <- triangle(m,mode=2)
+    for (ci in sort(unique(o$consensusClass))){ #each cluster in k
+      items <- which(o$consensusClass==ci)
+      nk <- length(items)
+      mk <- sum( m[items,items], na.rm=TRUE)/((nk*(nk-1))/2)
+      cc <- rbind(cc,c(k,ci,mk)) #cluster-consensus
+
+      for (ei in rev(res[[2]]$consensusTree$order) ){
+        denom <- if (ei %in% items) { nk - 1} else { nk }
+        mei <- sum( c(m[ei,items],m[items,ei]), na.rm=TRUE)/denom  # mean item consensus to a cluster.
+        cci <- rbind(cci,c(k,ci,ei,mei)) #cluster, cluster index, item index, item-consensus
+      }
+      eiCols <- c(eiCols, rep(ci,length(o$consensusClass)) )
+    }
+
+    cck <- cci[which(cci[,1]==k),] #only plot the new k data.
+
+    #group by item, order by cluster i
+    w <- lapply(split(cck,cck[,3]), function(x) { y=matrix(unlist(x),ncol=4); y[order(y[,2]),4] }) 
+    q <- matrix(as.numeric(unlist(w)),ncol=length(w),byrow=FALSE)
+    q <- q[,res[[2]]$consensusTree$order] #order by leave order of k=2
+    #q is a matrix of k rows and sample columns, values are item consensus of sample to the cluster.
+
+    thisColors <- unique(cbind(res[[k]]$consensusClass,res[[k]]$clrs[[1]]))
+    thisColors <- thisColors[order(as.numeric(thisColors[,1])),2]
+    colorsArr <- c(colorsArr,thisColors)
+    sumRes[[k]] <- rankedBarPlot(q,thisColors,cc=res[[k]]$consensusClass[res[[2]]$consensusTree$order],paste("k=",k,sep="") )
+  }
+
+  ys <- cs <- lab <- c()
+  lastk <- cc[1,1]
+  for(i in 1:length(colorsArr)){
+    if(lastk != cc[i,1]){
+      ys <- c(ys,0,0)  # two empty bars separate successive k in the summary barplot
+      cs <- c(cs,NA,NA)
+      lastk <- cc[i,1]
+      lab <- c(lab,NA,NA)
+    }
+    ys <- c(ys,cc[i,3])
+    cs <- c(cs,colorsArr[i])
+    lab <- c(lab,cc[i,1])
+  }
+  names(ys) <- lab
+  par(mfrow=c(3,1),mar=c(4,3,2,0))
+  barplot(ys,col=cs,border=cs,main="cluster-consensus",ylim=c(0,1),las=1)
+  if ( ! is.null( plot ) ) {
+    dev.off()
+  }
+  colnames(cc) <- c("k","cluster","clusterConsensus")
+  colnames(cci) <- c("k","cluster","item","itemConsensus")
+  cci[,"item"] <- names(res[[2]]$consensusClass)[ cci[,"item"] ]  # turns cci into a character matrix; re-typed just below
+  #type cci
+  cci <- data.frame( k=as.numeric(cci[,"k"]), cluster=as.numeric(cci[,"cluster"]), item=cci[,"item"], itemConsensus=as.numeric(cci[,"itemConsensus"])) 
+
+  #write to file.
+  if(writeTable){
+    write.csv(file=paste(title,"/",title,".summary.cluster.consensus.csv",sep=""),row.names=FALSE, cc)
+    write.csv(file=paste(title,"/",title,".summary.item.consensus.csv",sep=""), row.names=FALSE, cci)  # item table (was writing cc again)
+  }
+  return(list(clusterConsensus=cc,itemConsensus=cci))
+}
+
+
+ccRun <- function( d=d,
+                   maxK=NULL,
+                   repCount=NULL,
+                   diss=inherits( d, "dist" ),
+                   pItem=NULL,
+                   pFeature=NULL,
+                   innerLinkage=NULL,
+                   distance=ifelse( inherits(d,"dist"), attr( d, "method" ), "euclidean" ),
+                   clusterAlg=NULL,
+                   weightsItem=NULL,
+                   weightsFeature=NULL,
+                   verbose=NULL) {
+  ## Draws repCount subsamples of d, clusters each one for k = 2..maxK and returns a list (indexed by k) of
+  ## consensus matrices: per pair of items, (# runs clustered together) / (# runs sampled together).
+  ml <- vector(mode="list",maxK)
+  n <- if ( diss ) ncol( as.matrix( d ) ) else ncol( d )
+  mCount <- mConsist <- matrix(c(0),ncol=n,nrow=n)
+  ml[[1]] <- c(0)
+  if (is.null( distance ) ) distance <- 'euclidean'  ## necessary if d is a dist object and attr( d, "method" ) == NULL
+
+  library( amap )  ## provides Dist(); library() fails loudly where require() only returned FALSE
+  ##  we're going to use the amap Dist function, but they misname their correlation
+  ##  functions, so re-name them correctly
+  amap.distance <- c( "euclidean", "maximum", "manhattan", "canberra", "binary",
+                      "pearson", "abspearson", "correlation", "abscorrelation", "spearman", "kendall" )
+  names( amap.distance ) <- c( "euclidean", "maximum", "manhattan", "canberra", "binary",
+                               "cosine", "abscosine", "pearson", "abspearson", "spearman", "kendall" )
+  main.dist.obj <- NULL
+  ## main.dist.obj, when set, is a full distance object that every subsample simply slices
+  if ( diss ){
+    main.dist.obj <- d
+
+    ## reset the pFeature & weightsFeature params if they've been set (irrelevant if d is a dist matrix)
+    if ( ( !is.null(pFeature) ) &&
+         ( pFeature < 1 ) ) {
+      if ( isTRUE( verbose ) ) warning( "user-supplied data is a distance matrix; ignoring user-specified pFeature parameter\n" )
+      pFeature <- 1 # set it to 1 to avoid problems with sampleCols
+    }
+    if ( ! is.null( weightsFeature ) ) {
+      if ( isTRUE( verbose ) ) warning( "user-supplied data is a distance matrix; ignoring user-specified weightsFeature parameter\n" )
+      weightsFeature <- NULL  # set it to NULL to avoid problems with sampleCols
+    }
+  } else { ## d is a data matrix
+    ## we're not sampling over the features
+    if ( ( clusterAlg != "km" ) &&
+         ( is.null( pFeature ) ||
+           ( ( pFeature == 1 ) && is.null( weightsFeature ) ) ) ) {
+      ## only generate a main.dist.object IFF 1) d is a matrix, 2) we're not sampling the features, and 3) the algorithm isn't 'km'
+      if ( inherits( distance, "character" ) ) {
+        if ( ! distance %in% names( amap.distance ) ) stop("unsupported distance.")
+
+        main.dist.obj <- Dist( t(d), method=as.character( amap.distance[ distance ] ) )
+        ## now fix dumb amap naming convention for distance metrics
+        attr( main.dist.obj, "method" ) <- as.character( amap.distance[ distance ] )
+      } else stop("unsupported distance specified.")
+    } else {
+      ## pFeature < 1 or a weightsFeature != NULL
+      ## since d is a data matrix, the user wants to sample over the gene features, so main.dist.obj is left as NULL
+    }
+  }
+
+
+  for (i in 1:repCount){
+    ## isTRUE(): verbose defaults to NULL, which a bare if() cannot test
+    if ( isTRUE( verbose ) ) {
+      message(paste("random subsample",i));
+    }
+    ## take expression matrix sample, samples and genes
+    sample_x <- sampleCols( d, pItem, pFeature, weightsItem, weightsFeature )
+
+    this_dist <- NA
+    if ( ! is.null( main.dist.obj ) ) {
+      boot.cols <- sample_x$subcols
+      this_dist <- as.matrix( main.dist.obj )[ boot.cols, boot.cols ]
+      if ( clusterAlg != "km" ) {
+        ## if this isn't kmeans, then convert to a distance object
+        this_dist <- as.dist( this_dist )
+        attr( this_dist, "method" ) <- attr( main.dist.obj, "method" )
+      }
+    } else {
+      ## if main.dist.obj is NULL, then d is a data matrix, and either:
+      ##   1) clusterAlg is 'km'
+      ##   2) pFeatures < 1 or weightsFeatures have been specified, or
+      ##   3) both
+      ## so we can't use a main distance object and for every iteration, we will have to re-calculate either
+      ##   1) the distance matrix (because we're also sampling the features as well), or
+      ##   2) the submat (if using km) 
+
+      if ( clusterAlg != "km" )  {
+        if ( ! distance %in% names( amap.distance ) ) stop("unsupported distance.")
+        ## good, we have a supported distance type
+        this_dist <- Dist( t( sample_x$submat ), method=as.character( amap.distance[ distance ] ) )
+        ## now fix dumb amap naming convention for distance metrics
+        attr( this_dist, "method" ) <- as.character( amap.distance[ distance ] )
+      } else {
+        ##clusterAlg == "km": this_dist holds the (sub)data matrix itself rather than distances
+        ##
+        ## if we're not sampling the features, then grab the colslice
+        if ( is.null( pFeature ) ||
+            ( ( pFeature == 1 ) && is.null( weightsFeature ) ) ) {
+          this_dist <- d[, sample_x$subcols ]
+        } else {
+          if ( ! is.matrix( sample_x$submat ) ) {  ## sampleCols leaves submat as a lone NA when it built no sub-matrix
+            save( list=ls(), file="ccrun.submat.eq.na.dbg.rda" )  ## dump the local state for debugging
+            stop( "Houston, we have a problem.  sample_x$submat is NA in ccRun when it should be specified - saving state\n" )
+          }
+
+          this_dist <- sample_x$submat
+        } 
+      }
+    }
+
+    ## cluster samples for HC.
+    this_cluster <- NA
+    if(clusterAlg=="hc"){
+      this_cluster <- hclust( this_dist, method=innerLinkage)
+    }
+    ## sample_x[[3]] is subcols: the column indices drawn for this subsample
+    ##mCount is possible number of times that two sample occur in same random sample, independent of k
+    ##mCount stores number of times a sample pair was sampled together.
+    mCount <- connectivityMatrix( rep( 1,length(sample_x[[3]])),
+                                  mCount,
+                                  sample_x[[3]] ) 
+
+    ##use samples for each k
+    for (k in 2:maxK){
+      if ( isTRUE( verbose ) ) {
+        message(paste("  k =",k))
+      }
+      if (i==1){
+        ml[[k]] <- mConsist #initialize
+      }
+      this_assignment <- NA
+      if(clusterAlg=="hc"){
+        ##prune to k for hc
+        this_assignment <- cutree(this_cluster,k)
+        ##
+      }else if(clusterAlg=="km"){
+        ##this_dist should now be a matrix corresponding to the result from sampleCols
+        this_assignment <- kmeans( t( this_dist ),
+                                   k,
+                                   iter.max = 10,
+                                   nstart = 1,
+                                   algorithm = c("Hartigan-Wong") )$cluster
+      }else if ( clusterAlg == "pam" ) {
+        library( cluster )  ## provides pam()
+        this_assignment <- pam( x=this_dist,
+                                k,
+                                diss=TRUE,
+                                metric=distance, 
+                                cluster.only=TRUE )
+      } else{
+        ##optional clusterAlg hook: any function( dist, k ) found by name
+        this_assignment <- get(clusterAlg)(this_dist, k)
+      }
+      ##add to tally
+      ml[[k]] <- connectivityMatrix( this_assignment,
+                                     ml[[k]],
+                                     sample_x[[3]] )
+    }
+  }
+
+
+  ##consensus fraction
+  res <- vector(mode="list",maxK)
+  for (k in 2:maxK){
+    ##fill in other half of matrix for tally and count.
+    tmp <- triangle(ml[[k]],mode=3)
+    tmpCount <- triangle(mCount,mode=3)
+    res[[k]] <- tmp / tmpCount
+    res[[k]][which(tmpCount==0)] <- 0  ## pairs never sampled together would otherwise be NaN (0/0)
+  }
+  message("end fraction")
+  return(res)
+}
+
+
+connectivityMatrix <- function( clusterAssignments, m, sampleKey){
+  ##input: vector of cluster assignments, matrix to add connectivities to, and for each
+  ##       assigned item its row/column index in m
+  ##output: m with 1 added at [a,b] for every pair of assigned items a,b that share a cluster
+  names( clusterAssignments ) <- sampleKey 
+  cls <- lapply( unique( clusterAssignments ), function(i) as.numeric( names( clusterAssignments[ clusterAssignments %in% i ] ) ) )
+  nelts <- seq_len( ncol( m ) )  ## loop-invariant, so built once
+  ## seq_along: an empty assignment vector leaves m untouched instead of indexing cls[[1]] and cls[[0]]
+  for ( i in seq_along( cls ) ) {
+    cl <- as.numeric( nelts %in% cls[[i]] ) ## produces a binary vector
+    m <- m + outer( cl, cl )
+  }
+  return(m)
+}
+
+## returns a list with the sample columns, as well as the sub-matrix & sample features (if necessary)
+##  if no sampling over the features is performed, the submatrix & sample features are returned as NAs
+##  to reduce memory overhead
+sampleCols <- function( d,
+                        pSamp=NULL,
+                        pRow=NULL,
+                        weightsItem=NULL,
+                        weightsFeature=NULL ){
+  ## Draws a sorted random subset of the item (column) indices of d and, for a matrix d with
+  ## feature sampling requested (pRow < 1, or weightsFeature given), a sub-matrix as well.
+  ## Returns list( submat, subrows, subcols ); submat and subrows stay NA without feature sampling.
+  n.items <- if ( inherits( d, "dist" ) ) ncol( as.matrix( d ) ) else ncol( d )
+  item.idx <- sort( sample( n.items, floor( n.items * pSamp ), replace = FALSE, prob = weightsItem ) )
+
+  picked <- list( submat=NA,
+                  subrows=NA,
+                  subcols=item.idx )
+  ## nothing more to draw unless d is a matrix ...
+  if ( ! inherits( d, "matrix" ) ) {
+    return( picked )
+  }
+  ## ... and the caller asked to sample over the row/gene/features
+  if ( is.null( pRow ) || ( ! ( pRow < 1 ) && is.null( weightsFeature ) ) ) {
+    return( picked )
+  }
+  n.feat <- nrow( d )
+  picked$subrows <- sort( sample( n.feat, floor( n.feat * pRow ), replace = FALSE, prob = weightsFeature ) )
+  picked$submat <- unname( d[ picked$subrows, item.idx ] )
+  return( picked )
+}
+
+CDF=function(ml,breaks=100){
+  ## Plots the empirical CDF of the consensus values for each k (one curve per element of ml
+  ## from index 2 on), then the relative change in the area under the CDF between successive k.
+  ##   ml     - list of consensus matrices indexed by k (element 1 is not used)
+  ##   breaks - number of equal-width histogram bins on [0,1]
+  ## Called for its two plots.
+
+  plot(c(0),xlim=c(0,1),ylim=c(0,1),col="white",bg="white",xlab="consensus index",ylab="CDF",main="consensus CDF", las=2)
+  k <- length(ml)
+  this_colors <- rainbow(k-1)
+  ## one area per k = 2..length(ml), preallocated
+  areaK <- numeric( k - 1 )
+  for (i in 2:k){
+    v <- triangle(ml[[i]],mode=1)
+
+    #empirical CDF distribution. default number of breaks is 100    
+    h <- hist(v, plot=FALSE, breaks=seq(0,1,by=1/breaks))
+    h$counts <- cumsum(h$counts)/sum(h$counts)
+
+    #calculate area under CDF curve, by histogram method: sum of (bar height * bin width)
+    areaK[i-1] <- sum( h$counts * diff( h$breaks ) )
+    lines(h$mids,h$counts,col=this_colors[i-1],lwd=2,type='l')
+  }
+  legend(0.8,0.5,legend=paste(rep("",k-1),seq(2,k,by=1),sep=""),fill=this_colors)
+
+  #plot area under CDF change.
+  ## first entry is the initial auc at k=2; the others are the proportional increase relative
+  ## to the prior K.  With a single k the diff() term is empty, whereas the previous
+  ## 2:length(areaK) loop counted backwards and appended an NA.
+  deltaK <- c( areaK[1], diff( areaK ) / areaK[ -length( areaK ) ] )
+  plot(1+(1:length(deltaK)),y=deltaK,xlab="k",ylab="relative change in area under CDF curve",main="Delta area",type="b")
+}
+
+
+myPal <- function(n=10){
+  #returns n+1 colors running from white to pure blue (red and green fade together, blue stays at 255)
+  ramp <- rev( seq( from=0, to=255, by=255/(n) ) )
+  channels <- cbind( ramp, ramp, 255 )
+  rgb( channels, maxColorValue=255 )
+}
+
+setClusterColors <- function(past_ct,ct,colorU,colorList){
+  #description: sets common color of clusters between different K - a cluster that clearly continues
+  #one from the previous k keeps that color, any other cluster takes the next unused entry of colorU.
+  #returns list( per-item colors, index of the last color taken from colorU, unique colors )
+  if ( length( colorList ) == 0 ) {
+    #k==2: no history yet, index the palette directly by cluster number
+    assigned <- colorU[ ct ]
+    last.used <- 2
+    return( list( assigned, last.used, unique( assigned ) ) )
+  }
+  assigned <- c()
+  last.used <- colorList[[2]]
+  overlap <- table( past_ct, ct )
+  frac <- overlap / apply( overlap, 1, sum )  # share of each previous cluster landing in each new one
+  for ( new.cl in 1:ncol( frac ) ) { # for each cluster
+    col.max <- max( frac[ , new.cl ] )
+    old.cl <- which( frac[ , new.cl ] == col.max )
+    #TRUE if new column maximum is unique, same cell is row maximum and is also unique
+    is.successor <- sum( frac[ , new.cl ] == col.max ) == 1 & max( frac[ old.cl, ] ) == col.max & sum( frac[ old.cl, ] == col.max ) == 1
+    if ( is.successor ) {
+      assigned[ which( ct == new.cl ) ] <- unique( colorList[[1]][ which( past_ct == old.cl ) ] ) # one value
+    } else { #add new color.
+      last.used <- last.used + 1
+      assigned[ which( ct == new.cl ) ] <- colorU[ last.used ]
+    }
+  }
+  return( list( assigned, last.used, unique( assigned ) ) )
+}
+
+clusterTrackingPlot = function(m){
+  #tracking plot: one band per row of m (top band first), one cell per column, filled with the color stored in m
+  #input: m - matrix where rows are k, columns are samples, and values are cluster assignments.
+  nK = nrow(m); nS = ncol(m)
+  plot(NULL,xlim=c(-0.1,1),ylim=c(0,1),axes=FALSE,xlab="samples",ylab="k",main="tracking plot")
+  xl = seq(0,1-1/nS,by=1/nS) #left edge of every sample cell; reused for the hatch marks
+  for(i in 1:nK){
+    rect(  xleft=xl,  ybottom=rep(1-i/nK,nS) , xright=seq(1/nS,1,by=1/nS), ytop=rep(1-(i-1)/nK,nS), col=m[i,],border=NA)
+  }
+  segments(  xl, rep(-0.1,nS) , xl, rep(0,nS), col="black")    #hatch lines to indicate samples
+  ypos = seq(1,0,by=-1/nK)-1/(2*nK) #vertical centre of each band, plus one surplus value dropped below
+  text(x=-0.1,y=ypos[-length(ypos)],labels=seq(2,nK+1,by=1))
+}
+
+triangle = function(m,mode=1){
+  #Extracts one half of a square matrix m in the form selected by mode.
+  #mode==1: vector of the entries of m below the diagonal (used for the CDF)
+  #mode==2: upper triangle of m mirrored below the diagonal; upper half and diagonal are NA
+  #mode==3: full symmetric matrix mirrored from the upper triangle, diagonal taken from m
+  #any other mode yields NULL
+
+  size=dim(m)[1]
+  upperOnly = matrix(0,ncol=size,nrow=size)
+  upperOnly[upper.tri(upperOnly)] = m[upper.tri(m)] #only upper half
+
+  #mirror the upper half below the diagonal and restore the original diagonal
+  mirrored = t(upperOnly)+upperOnly
+  diag(mirrored) = diag(m)
+
+  #same values with the redundant upper half and the diagonal blanked out
+  lowerOnly = mirrored
+  lowerOnly[upper.tri(lowerOnly)] = NA
+  diag(lowerOnly) = NA
+
+  #entries of m itself (not of the mirrored copy) that sit below the diagonal
+  belowDiag = m[lower.tri(lowerOnly)]
+
+  #everything is computed first, exactly as before; only the selection differs per mode
+  if(mode==1) return(belowDiag) #vector
+  if(mode==3) return(mirrored) #return full matrix
+  if(mode==2) return(lowerOnly) #lower triangle and no diagonal. no double counts.
+
+}
+
+
+rankedBarPlot=function(d,myc,cc,title){
+	#Item-consensus bar plot: one bar per column of d; within a column the values are ranked
+	#and overlaid largest stack first, each layer colored via myc by the row it came from.
+	colors = rbind() #each row is a barplot series
+	byRank = cbind()
+	spaceh = 0.1 #space between bars
+	for(i in 1:ncol(d)){
+	  byRank = cbind(byRank,sort(d[,i],na.last=FALSE)) #TRUE/FALSE spelled out: T and F can be reassigned
+	  colors = rbind(colors,order(d[,i],na.last=FALSE))
+	}
+	maxH = max(c(1.5,colSums(byRank)),na.rm=TRUE) #maximum height of graph
+	#barplot largest to smallest so that smallest is in front.
+	barp = barplot( colSums(byRank) ,  col=myc[colors[,1]] ,space=spaceh,ylim=c(0,maxH),main=paste("item-consensus", title),border=NA,las=1  )
+	for(i in 2:nrow(byRank)){
+	  barplot( colSums(byRank[i:nrow(byRank),,drop=FALSE]), space=spaceh,col=myc[colors[,i]],ylim=c(0,maxH), add=TRUE,border=NA,las=1  )
+	}
+	xr=seq(spaceh,ncol(d)+ncol(d)*spaceh,(ncol(d)+ncol(d)*spaceh)/ncol(d)  )
+	#class labels as asterisks: cc picks the color of the star drawn above each bar
+	text("*",x=xr+0.5,y=maxH,col=myc[cc],cex=1.4) #rect(xr,1.4,xr+1,1.5,col=myc[cc] )
+}
+
+
+
+###################################################################3333
+## RESTART MY SCRIPTS HERE
+
+spec <- matrix( c( "data.fname",         "d", 1, "character",
+                   "direction",          "n", 2, "character",
+                   "output.name",        "o", 2, "character",
+                   "cluster.alg",        "a", 2, "character",
+                   "distance.metric",    "m", 2, "character",
+                   "max.k",              "k", 2, "integer",
+                   "reps",               "r", 2, "integer",
+                   "innerLinkage",       "i", 1, "character",
+                   "finalLinkage",       "f", 1, "character",
+                   "out.report.dir",     "p", 2, "character",
+                   "out.report.html",    "h", 2, "character"
+                   ),
+                ncol=4,
+                byrow=TRUE
+               )
+## (spec columns: long option name, short flag, argument mask, value type)
+opt <- getopt( spec=spec )
+
+## default params for non-required params
+if ( is.null( opt$direction ) ) { opt$direction <- "cols"  }
+if ( is.null( opt$cluster.alg ) ) { opt$cluster.alg <- "pam" }
+if ( is.null( opt$output.name ) ) { opt$output.name <- "consensus.cluster.result" }
+if ( is.null( opt$distance.metric ) ) { opt$distance.metric <- "cosine" }
+if ( is.null( opt$max.k ) ) { opt$max.k <- 10 }
+if ( is.null( opt$reps ) ) { opt$reps <- 1000 }
+if ( is.null( opt$innerLinkage ) ) { opt$innerLinkage <- "average" }
+if ( is.null( opt$finalLinkage ) ) { opt$finalLinkage <- "average" }
+
+if ( is.null( opt$out.report.dir ) ) { opt$out.report.dir <- "report" }
+## keep the default HTML index inside whichever report directory is in effect
+if ( is.null( opt$out.report.html ) ) { opt$out.report.html <- file.path( opt$out.report.dir, "index.html" ) }
+## validate params here (make sure set to valid values)
+if ( !opt$cluster.alg %in% c( "hc", "km", "pam" ) ) {
+  stop( "invalid clustering algorithm specified: ", opt$cluster.alg )
+}
+
+
+data <- as.matrix( read.delim( opt$data.fname, header=TRUE, row.names=1 , check.names=FALSE ) )
+## transpose the matrix if we want to cluster the rows (genes)
+if ( opt$direction == "rows" ) {
+  data <- t( data )
+}
+
+title <- paste( opt$cluster.alg, opt$output.name, sep="." )  ## NOTE(review): not used below; the report directory is passed as title
+results <- ConsensusClusterPlus( data,
+                                 maxK=opt$max.k,
+                                 reps=opt$reps,
+                                 pItem=0.8,
+                                 pFeature=1,
+                                 title=opt$out.report.dir,
+                                 clusterAlg=opt$cluster.alg,
+                                 distance=opt$distance.metric,
+                                 innerLinkage=opt$innerLinkage,
+                                 finalLinkage=opt$finalLinkage,
+                                 plot='png',
+                                 writeTable=FALSE,
+                                 seed=100,
+                                 ##weightsFeature=abs( rnorm( nrow( orig.data ) ) ),
+                                 verbose=FALSE )
+
+## one HTML page listing every PNG in the report directory; collapse joins the <div>s so a single <html> element is written
+pngs = list.files(path=opt$out.report.dir, pattern="png")
+html.out <- paste( "<html>", 
+                   paste( paste( "<div><img src=\'", pngs, sep="" ), "\'/></div>", sep="", collapse="" ),
+                   "</html>" )
+cat( html.out, file=opt$out.report.html )
+
+## re-transpose the matrix back if we've clustered the rows (genes)
+if ( opt$direction == "rows" ) {
+  data <- t( data )
+}
+save( file=opt$output.name, data, results)
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/consensus.clustering.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/consensus.clustering.py	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+import os
+import sys
+import subprocess
+
+select_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "consensus.clustering.R")
+
+cmd_args = [ "Rscript", select_script_path ] + sys.argv[1:] 
+
+proc = subprocess.Popen( cmd_args, stderr=subprocess.PIPE, stdout=subprocess.PIPE )
+(stdoutdata, stderrdata) = proc.communicate()
+if proc.returncode:
+	sys.stderr.write(stderrdata)
+sys.stdout.write(stdoutdata)
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/consensus.clustering.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/consensus.clustering.xml	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,149 @@
+<tool id="consensus_clustering" name="Consensus Clustering" force_history_refresh="True">
+    <command interpreter="python">consensus.clustering.py
+-d $dataset 
+-n ${direction} 
+-a ${method.algorithm} 
+#if $method.algorithm == 'hc' # -m ${method.hc_distance_metric}
+-i ${method.innerLinkage} 
+#end if
+#if $method.algorithm == 'pam' # -m ${method.pam_distance_metric}
+#end if
+#if $method.algorithm == 'km' # -m euclidean
+#end if
+-k ${kmax} 
+-r ${reps} 
+-f ${finalLinkage}
+-o ${output} 
+-h $report 
+-p ${report.files_path}
+
+</command>
+    <inputs>
+      <param name="dataset" type="data" format='tabular' label="Data Set" help="Specify the data matrix (tab-delimited) to be clustered"/>
+      <param name="direction" type="select" label="Cluster Samples or Genes?" help="Specify the matrix dimension to cluster (see help below)">
+	<option value="rows">Genes (rows)</option>
+	<option value="cols" selected="true">Samples (columns)</option>
+      </param>
+    	
+      <conditional name='method'>
+	<param name="algorithm" type="select" label="Clustering Algorithm" help="Specify the cluster method to use (see help below)">
+	  <option value="hc">Hierarchical Clustering</option>
+	  <option value="pam" selected='true'>Partitioning Around Medoids</option>
+	  <option value="km">K-Means Clustering</option>
+	</param>
+	<when value='hc'>
+	  <param name="hc_distance_metric" type="select" label="Distance Metric" help="Specify the distance metric to use (see help below)">
+	    <option value="cosine" selected='true'>Cosine</option>
+	    <option value="abscosine">Absolute Cosine</option>
+	    <option value="pearson">Pearson</option>
+	    <option value="abspearson">Absolute Pearson</option>
+	    <option value="spearman">Spearman</option>
+	    <option value="kendall">Kendall</option>
+	    <option value="euclidean">Euclidean</option>
+	    <option value="maximum">Maximum</option>
+	    <option value="manhattan">Manhattan (AKA city block)</option>
+	    <option value="canberra">Canberra</option>
+	    <option value="binary">Binary</option>
+	  </param>
+
+	  <param name="innerLinkage" type="select" label="Linkage for inner HAC " help="Specify the linkage to use during the 'inner' hierarchical clustering (see help below)">
+	    <option value="average">Average</option>
+	    <option value="centroid">Centroid</option>
+	    <option value="complete" selected='true'>Complete</option>
+	    <option value="mcquitty">McQuitty</option>
+	    <option value="median">Median</option>
+	    <option value="single">Single</option>
+	    <option value="ward">Ward</option>
+	  </param>
+    	</when>
+ 	<when value='pam'>
+	  <param name="pam_distance_metric" type="select" label="Distance Metric" help="Specify the distance metric to use (see help below)">
+	    <option value="cosine" selected='true'>Cosine</option>
+	    <option value="abscosine">Absolute Cosine</option>
+	    <option value="pearson">Pearson</option>
+	    <option value="abspearson">Absolute Pearson</option>
+	    <option value="spearman">Spearman</option>
+	    <option value="kendall">Kendall</option>
+	    <option value="euclidean">Euclidean</option>
+	    <option value="maximum">Maximum</option>
+	    <option value="manhattan">Manhattan (AKA city block)</option>
+	    <option value="canberra">Canberra</option>
+	    <option value="binary">Binary</option>
+	  </param>
+    	</when>
+      </conditional>
+      <param name="finalLinkage" type="select" label="Final Linkage" help="Specify the linkage to use when clustering the consensus matrix (see help below)">
+	<option value="average">Average</option>
+	<option value="centroid">Centroid</option>
+	<option value="complete" selected='true'>Complete</option>
+	<option value="mcquitty">McQuitty</option>
+	<option value="median">Median</option>
+	<option value="single">Single</option>
+	<option value="ward">Ward</option>
+      </param>
+      
+      
+      <param name="kmax" type="integer" label="K Max" value="10" help="Maximum number of K to analyze" />
+      <param name="reps" type="integer" label="Repetitions" value="500" help="Number of Sample Permutations to Repeat"/>
+    	
+    </inputs>
+    <outputs>
+      <data format="html" name="report" label="Consensus Clustering Report (HTML)"/>
+      <data format="rdata" name="output" label="Consensus Clustering Data (RData)"/>
+    </outputs>
+<help>
+.. class:: infomark
+     
+**Perform Consensus Clustering (Cluster Samples) on a specified data set**
+
+----
+
+**Parameters**
+
+- **Data Set** - Specify the data matrix to be clustered.  Data must be formatted as follows:
+
+         * Tab-delimited
+         * Use row/column headers
+
+- **Cluster Samples or Genes** - Specify the dimension of the matrix to cluster:
+
+         * Rows (Genes)
+         * Columns (Samples)
+
+- **Clustering Algorithm** Specify the choice of algorithm to use.  Choice of:
+
+         * Hierarchical Clustering
+         * K-Means
+
+- **Distance Metric** Specify the choice of distance metric to use.  Choice of:
+
+	 * Cosine (AKA uncentered pearson)
+	 * Absolute Cosine (AKA uncentered pearson, absolute value)
+         * Pearson (pearson correlation)
+	 * Absolute Pearson (pearson correlation, absolute value)
+         * Spearman (spearman correlation)
+	 * Kendall (Kendall's Tau)
+         * Euclidean (euclidean distance)
+	 * Maximum
+	 * Manhattan (AKA city block)
+	 * Canberra
+	 * Binary
+
+- **Final Linkage** Specify the choice of linkage to use when clustering the Consensus Matrix.  Choice of:
+
+         * Average (see documentation for R's hclust function for explanation of choices)
+         * Single
+         * Complete
+         * Median
+         * Centroid
+         * McQuitty
+         * Ward
+
+- **Inner Linkage** Specify the choice of linkage to use when using HAC as the clustering method.  Same choices as 'Final Linkage'
+
+- **K Max** Specify the number to use for the largest K considered
+
+- **Repetitions**  Specify the number of 'bootstrap' repetitions to perform to generate the consensus matrix
+
+</help>
+</tool>
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/cutree.R
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/cutree.R	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,52 @@
+#!/usr/bin/env Rscript
+argspec <- c("cutree.R re-cuts a previously saved hierarchical clustering result into k clusters
+
+        Usage: 
+                cutree.R -d <clustering_result.RData> -k <number_of_clusters>
+        Optional:
+                            -o <output_file>
+                \n\n")
+args <- commandArgs(TRUE)  ## show the usage text and quit when the only argument is --help
+if ( length( args ) == 1 && args =="--help") { 
+  write(argspec, stderr())
+  q();
+}
+
+lib.load.quiet <- function( package ) {
+   pkg.name <- as.character(substitute(package))   ## the package is passed as a bare, unquoted name
+   suppressPackageStartupMessages( library( pkg.name, character.only=TRUE ) )   ## attach it without startup chatter
+}
+
+lib.load.quiet( getopt )
+lib.load.quiet( ctc )
+## Optional packages: prefer flashClust, fall back to fastcluster, otherwise load neither.
+## Only package names are matched; `%in% installed.packages()` searched every cell of the
+## package matrix, so a name appearing only in another package's metadata could match too.
+pkgs.installed <- rownames( installed.packages() )
+if ( 'flashClust' %in% pkgs.installed ) {
+  lib.load.quiet( flashClust )
+} else if ( 'fastcluster' %in% pkgs.installed ) {
+  lib.load.quiet( fastcluster )
+}
+
+spec <- matrix( c( "dataset",             "d", 1, "character",
+                   "num.k",               "k", 1, "integer",
+                   "output.fname",        "o", 2, "character"
+                   ),
+                ncol=4,
+                byrow=TRUE
+               )
+opt <- getopt( spec=spec )
+## The old default was assembled from opt$output.report.dir and opt$output.format, neither of
+## which is an option of this script; both are always NULL here, so no usable path resulted.
+if ( is.null( opt$output.fname ) ) { opt$output.fname <- "data.RData" }
+
+
+## Re-cut the stored dendrogram into opt$num.k groups and save it together with the data.
+load( opt$dataset )  ## should load the cl, treecl.res (or partcl.res) and data
+if ( ! exists( 'treecl.res' ) ) {
+  stop( "no hierarchical clustering result found!\n" )
+}
+## the assignment vector is kept under both names, as before
+cl <- cutree.res <- cutree( treecl.res, k=opt$num.k )
+save( file=opt$output.fname, treecl.res, cl, data )
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/cutree.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/cutree.py	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+import os
+import sys
+import subprocess
+
+select_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "cutree.R")
+
+cmd_args = [ "Rscript", select_script_path ] + sys.argv[1:] 
+
+proc = subprocess.Popen( cmd_args, stderr=subprocess.PIPE, stdout=subprocess.PIPE )
+(stdoutdata, stderrdata) = proc.communicate()
+if proc.returncode:
+	sys.stderr.write(stderrdata)
+sys.stdout.write(stdoutdata)
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/cutree.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/cutree.xml	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,24 @@
+<tool id="cutree" name="Re-Run Cutree on Previous Hierarchical Clustering Result" force_history_refresh="True">
+    <command interpreter="python">cutree.py
+-d $dataset 
+-k ${numk} 
+-o ${rdata_output}
+
+</command>
+    <inputs>
+    	<param name="dataset" type="data" format='rdata' label="Cluster Result" help="Specify the cluster result to analyze (MUST BE IN rdata format; see help below)"/>
+    	<param name="numk" type="integer" label="Number of Clusters" value="5" help="Specify the number of clusters to use"/>
+    </inputs>
+    <outputs>
+        <data format="rdata" name="rdata_output" label="Cutree from Hierarchical Clustering Result (RData)"/>
+    </outputs>
+<help>
+- **Cluster Result** - Specify the cluster result to analyze (MUST BE IN rdata format). It must contain the same objects that are produced by the 'Partition Clustering,' 'Hierarchical Clustering (HAC),' or 'Consensus Clustering' tools.  Specifically, it must contain the following objects
+
+         * A 'treecl.res' object - the tree (hierarchical) clustering result. The tool stops with an error when it is missing, so a file that only holds a partition result ('partcl.res') cannot be re-cut
+         * A 'data' object that contains the data that was passed into the clustering method.  NOTE, it is better for this to be the actual data passed in, rather than the data prior to the pre-processing that was performed prior to the actual clustering.
+
+- **Number of Clusters** Specify the number of clusters to use
+
+</help>
+</tool>
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/determine.IPL.threshold.R
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/determine.IPL.threshold.R	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,263 @@
+#!/usr/bin/env Rscript
+
+## Usage text shown for --help. It lists -d, -o, -f and -p only; the option spec further down also defines -s, -r and -h.
+argspec <- c("determine.IPL.threshold.R takes an IPL result, and determines a statistically sound threshold to use
+
+        Usage: 
+                determine.IPL.threshold.R -d <IPL_data_file>
+        Optional:
+                -o output.rdata  ## rdata output file (contains variables used for calculation, for those who want to review them
+                -f filter type # must be either modulated, active, or inactive
+                -p percent of samples passing (must be value on [0,1]
+\n\n")
+args <- commandArgs(TRUE)  ## trailing command-line arguments only
+if ( length( args ) == 1 && args =="--help") { ## only when --help is the sole argument
+  write( argspec, stderr() )  ## the usage text goes to stderr
+  q();
+}
+
+lib.load.quiet <- function( package ) {
+   pkg.name <- as.character(substitute(package))   ## the package is passed as a bare, unquoted name
+   suppressPackageStartupMessages( library( pkg.name, character.only=TRUE ) )   ## attach it without startup chatter
+}
+lib.load.quiet(getopt)
+
+spec <- matrix( c( "data.fname",          "d", 1, "character",
+                   "output.rdata",        "o", 2, "character",
+                   "filter.type",         "f", 2, "character",
+                   "perc.pass",           "p", 2, "numeric",
+                   "selection.criteria",  "s", 2, "character",
+                   "output.report.dir",   "r", 2, "character",
+                   "output.report.html",  "h", 2, "character"
+                  ),
+                ncol=4,
+                byrow=TRUE
+               )
+opt <- getopt( spec=spec )
+## default params for non-required params
+if ( is.null( opt$filter.type ) ) { opt$filter.type <- 'modulated' }
+if ( is.null( opt$perc.pass ) ) { opt$perc.pass <- 1/3 }
+if ( is.null( opt$selection.criteria ) ) { opt$selection.criteria <- 'max_diffs' }
+if ( is.null( opt$output.report.dir ) ) { opt$output.report.dir <- "report" }
+if ( is.null( opt$output.report.html ) ) { opt$output.report.html <- file.path( opt$output.report.dir, "index.html" ) }
+if ( is.null( opt$output.rdata ) ) { opt$output.rdata <- "output.rdata" }
+if ( opt$perc.pass < 0  ) {
+  stop( "please specify a positive number for the percentage of samples that pass the filter (if applicable)" )
+}
+## reject an unknown selection criteria up front, before any of the expensive work
+if ( ! opt$selection.criteria %in% c( "max_diffs", "binomial", "chisq" ) ) { stop( "invalid selection criteria specified: ", opt$selection.criteria ) }
+if (!file.exists(opt$output.report.dir)){
+    dir.create(opt$output.report.dir)
+}
+
+                               
+## Load the IPL matrix and drop rows whose name contains abstract, family or complex.
+data <- as.matrix( read.delim( opt$data.fname, row.names=1, check.names=FALSE ) )
+genes <- rownames( data )
+genes <- genes[ !grepl( "abstract|family|complex", genes ) ]
+data <- data[ genes, , drop=FALSE ]
+
+## Columns whose name contains "na_" are the null samples; every other column is real.
+## drop=FALSE keeps single-row/column selections as matrices, so the ncol() checks below
+## (and the row-wise counting later) still work when only one null or real sample is present.
+nulls.mat <- grepl( "na_", colnames( data ) )
+reals <- ! nulls.mat
+nulls.mat <- data[ , nulls.mat, drop=FALSE ]
+reals.mat <- data[ , reals, drop=FALSE ]
+if ( ncol( nulls.mat ) == 0 ) stop( "no nulls were in the file provided!\n" )
+if ( ncol( reals.mat ) == 0 ) stop( "no reals were in the file provided!\n" )
+
+## Orient the values so that "larger" always means "passes": absolute value for modulated,
+## sign flipped for inactive, untouched for anything else.
+if ( opt$filter.type == 'modulated' ) {
+  reals.mat <- abs( reals.mat )
+  nulls.mat <- abs( nulls.mat )
+} else if ( opt$filter.type == "inactive" ) {
+  reals.mat <- -reals.mat
+  nulls.mat <- -nulls.mat
+}
+
+##  we only look at the larger 50% of the possible IPL values
+##  as possible  thresholds to use (since the lower 50% are almost
+##  always uninformative)
+upper.half <- seq( 0.5, 1, by=0.001 )
+thresholds <- c( unique( quantile( reals.mat, upper.half ) ),
+                 quantile( nulls.mat, upper.half ) )
+thresholds <- unique( sort( thresholds ) )
+
+
+get.num.filtered.feats <- function( mat,
+                                    threshold,
+                                    perc.samples.passing=1/3 ) {
+  ## Counts the rows (features) of mat in which enough entries exceed threshold.
+  ## perc.samples.passing >= 1 is read as an absolute count (at least that many entries);
+  ## a smaller value is a fraction of the row length whose floor must be strictly exceeded.
+
+  ## number of entries above the threshold, per row
+  n.above <- rowSums( mat > threshold )
+  if ( perc.samples.passing >= 1 ) {
+    passing <- n.above >= perc.samples.passing
+  } else {
+    passing <- n.above > floor( perc.samples.passing * ncol( mat ) )
+  }
+  return( sum( passing ) )
+}
+
+
+## Scan the candidate thresholds in increasing order, recording how many genes pass the
+## filter in the real and null samples and a -log p-value for the excess of reals.
+real.feats <- null.feats <- length( genes )
+chisq.pvals <- binom.pvals <- numeric()
+for ( i in seq_along( thresholds ) ) {
+  nul.feats.this.thresh <- get.num.filtered.feats( mat=nulls.mat, threshold=thresholds[i], perc.samples.passing=opt$perc.pass )
+  ## stop at the first threshold that leaves fewer than 50 null features, because with
+  ##  fewer nulls than that the probability is heavily skewed
+  if ( nul.feats.this.thresh < 50 ) break
+
+  null.feats[ i ] <- nul.feats.this.thresh
+  real.feats[ i ] <- get.num.filtered.feats( mat=reals.mat, threshold=thresholds[i], perc.samples.passing=opt$perc.pass )
+
+  ## only calculate if there are more real features than nulls, otherwise, give a p-value of 1
+  if ( null.feats[i] < real.feats[i] ) {
+    p <- null.feats[i]/nrow( nulls.mat )
+    null.sd <- ( nrow( nulls.mat ) * p * (1-p ) )^0.5  ## named null.sd to avoid reusing the name of sd()
+
+    ## binomial test (normal approximation centred on the null count)
+    p <- -pnorm( q=real.feats[i],
+                 mean=null.feats[i],
+                 sd=null.sd,
+                 log.p=TRUE,
+                 lower.tail=FALSE )
+
+    ##chisq test
+    ## NOTE(review): the squared difference is divided by null.feats^2; Pearson's statistic
+    ## divides by the expected count itself -- confirm which one is intended
+    chi <- ( real.feats[i] - null.feats[i] )^2
+    chi <- chi/(null.feats[i])^2
+    chi <- -pchisq( chi, 1, log.p=TRUE, lower.tail=FALSE )
+  } else {
+    p <- chi <- 0 ## 0 == -log(1)
+  }
+  binom.pvals <- c( binom.pvals, p )
+  chisq.pvals <- c( chisq.pvals, chi )
+  if ( length( chisq.pvals ) != i ) {
+    stop( "lengths differ\n" )
+  }
+}
+
+
+
+##names( binom.pvals ) <- names( chisq.pvals ) <- thresholds
+diffs <- real.feats - null.feats
+## score each evaluated threshold by the requested criteria; an unknown name is reported
+## here instead of surfacing later as a missing opt.thresh object
+scores <- switch( opt$selection.criteria,
+                  max_diffs=diffs,
+                  binomial=binom.pvals,
+                  chisq=chisq.pvals,
+                  stop( "invalid selection criteria specified: ", opt$selection.criteria ) )
+## index (or indices, on ties) of the best-scoring threshold
+opt.thresh <- which( scores %in% max( scores ) )
+
+## average the winning threshold(s) with the threshold just below each of them
+opt.thresh <- mean( c( thresholds[ opt.thresh ], thresholds[ (opt.thresh-1) ] ) )
+opt.thresh <- signif( opt.thresh, 4 )
+
+  
+##corrected.binom.pvals <- binom.pvals + log( length(thresholds) )
+##binom.pvals <- binom.pvals - log( length(thresholds) )
+##corrected.chisq.pvals <- chisq.pvals + log( length(thresholds) )
+##chisq.pvals <- chisq.pvals - log( length(thresholds) )
+
+
+## thresholds actually evaluated: the scan above can stop early, leaving real.feats shorter than thresholds
+eval.thresh <- thresholds[ 1:length( real.feats ) ]
+## Disabled alternative (kept for reference): all four panels stacked on one split-screen page
+##png.fname <- file.path( opt$output.report.dir, "IPL.threshold.determination.png")
+##plot.dev <- png( png.fname,
+##                 width=11,
+##                 height=8.5,
+##                 units='in',
+##                 res=72 )
+##par( mar=rep(0,4) )
+##screens <- split.screen( c( 4,1 ) )
+
+## Panel 1: number of genes passing the filter at each evaluated threshold, real vs. null samples
+png.fname <- file.path( opt$output.report.dir, "01.num.feats.IPL.threshold.determination.png")
+plot.dev <- png( png.fname, width=11, height=8.5, units='in', res=72 )
+par( mar=c(2.25,3,1.5,0.5) )
+## size the y axis for both curves: plot() alone would scale to the null counts only and
+## clip the real curve wherever it rises above them
+feat.range <- range( c( null.feats, real.feats ), na.rm=TRUE )
+plot( eval.thresh, null.feats, type='l', lwd=2, col='blue', cex.axis=0.75, ylim=feat.range )
+lines( eval.thresh, real.feats, type='l', lwd=2, col='black', cex.axis=0.75 )
+abline( v=opt.thresh )
+legend( "topright", c( "Real", "Null" ), lwd=2, col=c('black', 'blue' ) )
+mtext( "Number of Genes Passing Threshold", font=2 )
+mtext( "IPL Threshold", 1, font=2, line=1.5 )
+mtext( "Number of Genes", 2, font=2, line=1.8 )
+dev.off()
+
+
+## Panel 2: difference between the real and null gene counts at each evaluated threshold
+png.fname <- file.path( opt$output.report.dir, "02.diffs.IPL.threshold.determination.png")
+plot.dev <- png( filename=png.fname,
+                 width=11,
+                 height=8.5,
+                 units='in',
+                 res=72 )
+par( mar=c(2.25,3,1.5,0.5) )
+plot( x=eval.thresh, y=diffs, type='l', lwd=2, col='black', cex.axis=0.75 )
+abline( v=opt.thresh )
+mtext( text="Difference between number of Real & Null genes passing Threshold", font=2 )
+mtext( text="IPL Threshold", side=1, line=1.5, font=2 )
+mtext( text="Number of Genes", side=2, line=1.8, font=2 )
+dev.off()
+
+
+
+## Panel 3: -log p-values from the chi-square calculation
+png.fname <- file.path( opt$output.report.dir, "03.chisq.IPL.threshold.determination.png")
+plot.dev <- png( filename=png.fname,
+                 width=11,
+                 height=8.5,
+                 units='in',
+                 res=72 )
+par( mar=c(2.25,3,1.5,0.5) )
+plot( x=eval.thresh, y=chisq.pvals, type='l', lwd=2, col='red', cex.axis=0.75 )
+abline( v=opt.thresh )
+mtext( text="Chi-sq p-values", font=2 )
+mtext( text="IPL Threshold", side=1, line=1.5, font=2 )
+mtext( text="-Log p-value", side=2, line=1.8, font=2 )
+dev.off()
+
+
+
+## Panel 4: -log p-values from the binomial calculation
+png.fname <- file.path( opt$output.report.dir, "04.binom.IPL.threshold.determination.png")
+plot.dev <- png( filename=png.fname,
+                 width=11,
+                 height=8.5,
+                 units='in',
+                 res=72 )
+par( mar=c(2.25,3,1.5,0.5) )
+plot( x=eval.thresh, y=binom.pvals, type='l', lwd=2, col='green', cex.axis=0.75 )
+abline( v=opt.thresh )
+mtext( text="Binomial p-values", font=2 )
+mtext( text="IPL Threshold", side=1, line=1.5, font=2 )
+mtext( text="-Log p-value", side=2, line=1.8, font=2 )
+dev.off()
+
+##close.screen( all=T ); dev.off()
+
+## Write the HTML report: the chosen threshold followed by every PNG found in the report directory.
+report_str = paste( "The threshold to use for consensus clustering filtering is ", opt.thresh, "\n", sep="" )
+pngs = list.files(path=opt$output.report.dir, pattern="png")
+img.divs <- paste( "<div><img src='", pngs, "'/></div>", sep="", collapse="" )
+html.out <- paste( "<html>", report_str, img.divs, "</html>" )
+cat( html.out, file=opt$output.report.html )
+
+## Save the inputs and intermediate vectors so the choice of threshold can be reviewed later.
+filter.type <- opt$filter.type
+perc.pass <- opt$perc.pass
+save( file=opt$output.rdata, thresholds, diffs, binom.pvals, chisq.pvals, real.feats, null.feats, data, filter.type, perc.pass, opt.thresh )
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/determine.IPL.threshold.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/determine.IPL.threshold.py	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,16 @@
+#!/usr/bin/env python
+import os
+import sys
+import subprocess
+
+select_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "determine.IPL.threshold.R")
+
+cmd_args = [ "Rscript", select_script_path ] + sys.argv[1:] 
+
+print cmd_args
+
+proc = subprocess.Popen( cmd_args, stderr=subprocess.PIPE, stdout=subprocess.PIPE )
+(stdoutdata, stderrdata) = proc.communicate()
+if proc.returncode:
+	sys.stderr.write(stderrdata)
+sys.stdout.write(stdoutdata)
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/determine.IPL.threshold.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/determine.IPL.threshold.xml	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,57 @@
+<tool id="determine_ipl_threshold" name="Determine IPL Threshold for Consensus Clustering" force_history_refresh="True">
+    <command interpreter="python">determine.IPL.threshold.py
+-d ${data_fname}
+-f ${filter_type}
+-p ${percentage_pass}
+-s ${selection_criteria}
+-h $report 
+-r ${report.files_path}
+-o ${output_rdata}
+    </command>
+    <inputs>
+      <param name="data_fname" type="data" format="tabular" label="Paradigm Results File (contains both reals and nulls)"/>
+      <param name="filter_type" type="select" label="Activity Filter" help="Specify the filter type to use (see help below)">
+	<option value="active">Active</option>
+	<option value="inactive">Inactive</option>
+	<option value="modulated" selected='true' >Modulated</option>
+      </param>
+      <param name="percentage_pass" type="float" label="% of Samples Passing (value in 0-1 range; >= 1 to indicate exact number of samples)" value="0.33" help="Specify the percentage of samples that must pass the threshold (see help below)"/>
+      <param name="selection_criteria" type="select" label="Selection Criteria" help="Specify the test statistic to use to select the threshold (see help below)">
+	<option value="binomial">Binomial P-value</option>
+	<option value="chisq">Chi-Squared P-value</option>
+	<option value="max_diffs" selected='true' >Overall Max Number of Differences</option>
+      </param>
+    </inputs>
+    <outputs>
+      <data format="html" name="report" label="Determine IPL Threshold (HTML)"/>
+      <data format="rdata" name="output_rdata"  label="Determine IPL Threshold (RData)"/>
+    </outputs>
+<help>
+
+.. class:: infomark
+     
+**Determines a statistically sound threshold to use for a given IPL result**
+
+**Parameters**
+- **Paradigm Results File** Output from Paradigm (tab-delimited and contains both the 'real' and 'null' samples)
+
+- **Activity Filter** Specify the filter type to use.  Choice of:
+
+         * Activity -  Features must exceed the user-specified threshold
+         * Inactivity -  Features must fall below the user-specified threshold
+         * Modulated - Absolute value of the features must exceed the specified threshold
+
+- **Percentage of Samples Passing** Percent of samples with an IPL that passes the threshold. Choice of:
+
+         * Real Value in [0,1] - indicate the percentage of samples that pass the threshold
+         * Integer Value       - indicate the exact number of samples that pass the threshold
+
+- **Selection Criteria** Specify the test statistic to use to select the threshold.  Choice of:
+
+         * Binomial P-value - Select the threshold with the largest -log p-value (calculated as a binomial)
+         * Chi-Squared P-value - Select the threshold with the largest -log p-value (calculated as a Chi-squared)
+         * Overall Max Number of Differences - Select the threshold with the largest overall number of differences between the real and null distributions
+     
+
+</help>
+</tool>
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/extract.cons.cluster.from.result.R
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/extract.cons.cluster.from.result.R	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,62 @@
+#!/usr/bin/env Rscript
+# Extract k cluster assignment from consensus clustering result Script by Peter Waltman
+# Nov. 12, 2012
+# License under Creative Commons Attribution 3.0 Unported (CC BY 3.0)
+#
+##usage, options and doc goes here
+argspec <- c("extract.cons.cluster.from.result.R extracts a cluster assignment for a specified
+value of K from a specified consensus cluster result file.
+
+        Usage: 
+                extract.cons.cluster.from.result.R -r <results_file> -k <k_select>
+        Optional:
+                -o consensus class output file # tab-delimited file format
+                -d RData output file, written when -o is not given (default: select.RData)
+\n\n")
+args <- commandArgs(TRUE)  ## trailing (user-supplied) arguments only
+if ( length( args ) == 1 && args =="--help") {  ## a lone --help prints the usage text and exits
+  write( argspec, stderr() )
+  q();
+}
+
+library(getopt)
+
+spec <- matrix( c( "results.file",        "r", 1, "character",
+                   "k.select",            "k", 1, "integer",
+                   "cluster.class.out",   "o", 2, "character",
+                   "output.select.rdata", "d", 2, "character" ),
+                ncol=4,
+                byrow=TRUE )
+opt <- getopt( spec=spec )
+if ( is.null( opt$output.select.rdata ) ) { opt$output.select.rdata <- "select.RData" }
+##if ( is.null( opt$cluster.class.out) ) { opt$cluster.class.out <- "select.cls" }
+
+## load() must define 'results' here; 'data' (re-saved below) is presumably in the same file -- TODO confirm
+load( opt$results.file )
+cons.matrices <- lapply( results[ 2:length(results) ], '[[', 'consensusMatrix' )
+cls <- lapply( results[ 2:length(results) ], '[[', 'consensusClass' )
+names( cons.matrices ) <- names( cls ) <- 2:length( results )
+
+ch.k.select <- as.character( opt$k.select )
+if ( ! ch.k.select %in% names( cls ) ) {
+  ## unknown k: leave the message in the tab output (if one was requested), then fail with a non-zero exit
+  out.string <- paste( "choice of k =", ch.k.select, "not available in this result file. Max k = ", max( as.numeric( names(cls) ) ), "\n" )
+  if ( ! is.null( opt$cluster.class.out ) ) cat( out.string, file=opt$cluster.class.out )
+  stop( out.string )
+}
+
+## get the consensusClass vector that's associated with the k.select
+cl <- cls[[ ch.k.select ]]
+if ( ! is.null( opt$cluster.class.out ) ) {
+  cl <- cbind( names( cl ), as.integer(cl) )
+  colnames( cl ) <- c( "ID", "class" )
+  write.table( cl, opt$cluster.class.out, sep="\t", row.names=FALSE, quote=FALSE )
+} else if ( ! is.null( opt$output.select.rdata ) ) {
+  ## RData output: the chosen k's tree, class vector and full result entry, plus 'data'
+  treecl.res <- results[[ opt$k.select ]]$consensusTree
+  select.result <- results[[ opt$k.select ]]
+  save( file=opt$output.select.rdata, treecl.res, cl, select.result, data )
+} else {
+  stop( 'no valid output format specified\n' )
+}
+
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/extract.cons.cluster.from.result.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/extract.cons.cluster.from.result.py	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+import os
+import sys
+import subprocess
+
+select_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "extract.cons.cluster.from.result.R")
+
+cmd_args = [ "Rscript", select_script_path ] + sys.argv[1:] 
+
+proc = subprocess.Popen( cmd_args, stderr=subprocess.PIPE, stdout=subprocess.PIPE )
+(stdoutdata, stderrdata) = proc.communicate()
+if proc.returncode:
+	sys.stderr.write(stderrdata)
+sys.stdout.write(stdoutdata)
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/extract.cons.cluster.from.result.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/extract.cons.cluster.from.result.xml	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,47 @@
+<tool id="extract_cons_cluster_from_result" name="Extract Clustering from ConsensusCluster Results (choice of K)" force_history_refresh="True">
+    <command interpreter="python">extract.cons.cluster.from.result.py
+-r ${results_file}
+-k ${k_select}
+
+#if str($out_format.format) == 'rdata':
+-d ${output_rdata}
+#end if
+#if str($out_format.format) == 'tabular':
+-o ${output_tab}
+#end if
+
+</command>
+    <inputs>
+    	<param name="results_file" type="data" format="rdata" label="Consensus Clustering Results File" help="Specify the result from a consensus clustering run (RData format)"/>
+    	<param name="k_select" type="integer" label="Choice of K" value="4" help="Specify the choice of K to extract from the clustering result"/>
+	<conditional name="out_format" >
+	  <param name="format" type="select" label="Select Output Format" help="Specify the output format (RData file or tab-delimited)" >
+	    <option value="rdata" selected='true' >RData</option>
+	    <option value="tabular">Tab-delimited</option>
+	  </param>
+	</conditional>
+    </inputs>
+    <outputs>
+        <data format="rdata" name="output_rdata" label="CCPlus cluster RData file" >
+	  <filter>(out_format['format']=="rdata")</filter>
+	</data>
+        <data format="tabular" name="output_tab" label="Tabbed File" >
+	  <filter>(out_format['format']=="tabular")</filter>
+	</data>
+    </outputs>
+<help>
+
+.. class:: infomark
+     
+**Retrieves a class assignment for a user-specified choice of K, from a specified Consensus Clustering result**
+
+**Parameters**
+ - **Consensus Clustering Results File** Specify the result from a consensus clustering run (**MUST BE THE RData** output)
+
+ - **Choice of K** Specify the choice of K to extract from the clustering result
+
+ - **Select Output Format**  Specify the output format (Choice of):
+         * RData file 
+         * Tab-delimited
+</help>
+</tool>
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/fix.and.merge.TCGA.sample.IDs.R
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/fix.and.merge.TCGA.sample.IDs.R	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,119 @@
+#!/usr/bin/env Rscript
+argspec <- c("fix.and.merge.TCGA.sample.IDs.R shortens the TCGA barcodes used as sample IDs in a data matrix
+to a given number of components, averaging samples whose shortened IDs coincide
+
+        Usage: 
+                fix.and.merge.TCGA.sample.IDs.R -d <data.file> [-n <num.components>] [-r] [-o <output.file>]
+
+                \n\n")
+args <- commandArgs(TRUE)  ## trailing (user-supplied) arguments only
+if ( length( args ) == 1 && args =="--help") {  ## a lone --help prints the usage text and exits
+  write(argspec, stderr())
+  q();
+}
+
+lib.load.quiet <- function( package ) {  ## attach a package given by its bare name, hiding its startup messages
+   pkg.name <- as.character( substitute( package ) )
+   suppressPackageStartupMessages( library( pkg.name, character.only=TRUE ) )
+}
+lib.load.quiet(getopt)
+
+## command-line options accepted by this script (columns as defined by getopt's spec matrix)
+spec <- matrix( c( "data.fname",      "d", 1, "character",
+                   "num.components",  "n", 2, "integer",
+                   "remove.normals",  "r", 0, "logical",
+                   "output.fname",    "o", 2, "character" ),
+                ncol=4,
+                byrow=TRUE )
+opt <- getopt( spec=spec )
+## the matrix is read first: column 1 supplies the row names and column names are kept verbatim
+data <- as.matrix( read.delim( opt$data.fname, row.names=1, check.names=FALSE ) )
+## defaults for options that were not supplied
+if ( is.null( opt$num.components ) ) opt$num.components <- 3
+if ( is.null( opt$remove.normals ) ) opt$remove.normals <- FALSE
+if ( is.null( opt$output.fname ) ) opt$output.fname <- paste( "sample.IDs.updated", basename( opt$data.fname ), sep="." )
+
+## fewer than 3 components is refused; the message is also written to the output file
+if ( opt$num.components < 3 ) {
+  err.msg <- "Minimum number of barcode components that can be used is 3\n"
+  cat( err.msg, file=opt$output.fname )
+  stop( err.msg )
+}
+
+remove.periods.from.ids <- function( ids ) {  ## swap every literal '.' in the IDs for '-'
+  gsub( ".", "-", ids, fixed=TRUE )
+}
+
+
+reformat.ids <- function( ids, num.components=3 ) {  ## keep the first num.components '-'-separated fields
+  keep.leading <- function( parts ) paste( parts[ 1:num.components ], collapse="-" )
+  sapply( strsplit( ids, "-" ), keep.leading )
+}
+
+
+## Rename the columns of 'mat' to 'samp.ids' (one ID per column, in column order), collapsing
+## columns that share an ID into a single column holding their row-wise mean (NAs ignored).
+## Columns with a unique ID come first, followed by one averaged column per duplicated ID.
+merge.cols <- function( mat,
+                        samp.ids ) {
+
+  is.repeated <- duplicated( samp.ids )
+  if ( ! any( is.repeated ) ) {
+    colnames( mat ) <- samp.ids
+    return( mat )
+  }
+
+  dupes <- unique( samp.ids[ is.repeated ] )
+  is.single <- ! samp.ids %in% dupes
+  uniqs <- samp.ids[ is.single ]
+
+  ## one vector of row means per duplicated ID
+  dup.means <- lapply( dupes,
+                       function( id ) apply( mat[ , ( samp.ids %in% id ), drop=FALSE ], 1, mean, na.rm=TRUE ) )
+
+  ## bind everything in a single cbind() call rather than growing the matrix one column at a time
+  merged <- do.call( cbind, c( list( mat[ , is.single, drop=FALSE ] ), dup.means ) )
+  colnames( merged ) <- c( uniqs, dupes )
+  return( merged )
+}
+
+
+cnames <- colnames( data )
+rnames <- rownames( data )
+## samples are expected in columns; a matrix whose row names are all TCGA IDs is flipped here and flipped back before writing
+transpose.back <- FALSE
+if ( all( grepl( "^TCGA", rnames ) ) ) {
+  data <- t( data )
+  transpose.back <- TRUE
+} else {
+  if ( ! all( grepl( "^TCGA", cnames ) ) ) {
+    err.msg <- "can't find any TCGA samples listed in this matrix.  If columns are samples, all columns must be a TCGA sample ID.  Same if rows are samples.\n"
+    cat( err.msg, file=opt$output.fname )
+    stop( err.msg )
+  }
+}
+
+## nelts: the distinct numbers of '-'-separated components found across the sample IDs (must be a single value)
+cnames <- remove.periods.from.ids( colnames( data ) )
+nelts <- as.numeric( names( table( as.factor( sapply( strsplit( cnames, "-" ), function(x) length(x ) ) ) ) ) )
+if ( length( nelts ) > 1 ) {
+  err.msg <- "Error: Inconsistent TCGA sample barcodes used.  Have found ID with different numbers of components in the barcodes used\n"
+  cat( err.msg, file=opt$output.fname )
+  stop( err.msg )
+}
+
+## IDs whose 4th component starts with '1' are treated as normals.  cnames has to be filtered together
+## with data, otherwise the IDs handed to merge.cols() below no longer line up with the remaining columns.
+if ( opt$remove.normals && nelts > 3 ) {
+  normals <- grepl( "^TCGA-..-....-1", cnames )
+  data <- data[ , (! normals ), drop=FALSE ]
+  cnames <- cnames[ ! normals ]
+}
+
+if ( opt$num.components < nelts ) {
+  cnames <- reformat.ids( ids=cnames, num.components=opt$num.components )
+  data <- merge.cols( data, cnames )
+}
+
+if ( transpose.back ) data <- t( data )
+write.table( data, opt$output.fname, sep="\t", quote=FALSE, col.names=NA )
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/fix.and.merge.TCGA.sample.IDs.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/fix.and.merge.TCGA.sample.IDs.py	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+import os
+import sys
+import subprocess
+
+select_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "fix.and.merge.TCGA.sample.IDs.R")
+
+cmd_args = [ "Rscript", select_script_path ] + sys.argv[1:] 
+
+proc = subprocess.Popen( cmd_args, stderr=subprocess.PIPE, stdout=subprocess.PIPE )
+(stdoutdata, stderrdata) = proc.communicate()
+if proc.returncode:
+	sys.stderr.write(stderrdata)
+sys.stdout.write(stdoutdata)
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/fix.and.merge.TCGA.sample.IDs.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/fix.and.merge.TCGA.sample.IDs.xml	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,33 @@
+<tool id="fix_and_merge_TCGA_samples_IDs" name="Fix and Merge TCGA sample IDs" force_history_refresh="True">
+    <command interpreter="python">fix.and.merge.TCGA.sample.IDs.py
+-d $dataset -n ${num_components} ${remove_normals}
+-o ${output}
+
+</command>
+    <inputs>
+    	<param name="dataset" type="data" format='tabular' label="Matrix with Full TCGA Aliquot Barcodes"/>
+	<param name="num_components" type="integer" label="Number of barcode components to use (min number is 3)" value="3" />
+	<param name="remove_normals" type="boolean" label="Remove Normals from Matrix? (check to exclude)" truevalue="-r" falsevalue="" checked="True" />
+    </inputs>
+    <outputs>
+        <data format="tabular" name="output" label="Matrix with TCGA Patient Barcodes (filtered and merged)"/>
+    </outputs>
+<help>
+.. class:: infomark
+     
+**Update and Merge TCGA Sample IDs** - This will limit the TCGA sample IDs to the specified number of components (min is 3).  If necessary, samples will be merged (by averaging).
+
+**OUTPUT:**  A new matrix whose sample IDs (columns) use the specified number of barcode components
+
+----
+
+**Parameters**
+
+- **Matrix with TCGA barcode sample IDs (e.g. TCGA-AE-####-01-)** Specify a data matrix with TCGA barcodes
+
+- **Number of barcode components to use** Specify the number of barcode components to use in new matrix that is produced **(min number is 3)**
+
+- **Remove Normals from Matrix?** - Remove any normals from the matrix (if necessary)
+
+</help>
+</tool>
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/format.raw.TCGA.clinical.data.R
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/format.raw.TCGA.clinical.data.R	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,184 @@
+#!/usr/bin/env Rscript
+## 
+## formats raw clinical data from TCGA to contain single status & time columns
+##
+## Input (required):
+##    - clinical data
+## Input (optional):
+##    - status & time columns: (NOT USED IN THIS SCRIPT - see comment below)
+##         ideally, a better design would allow a user to specify 1 or more columns
+##         to check for the status & time columns - however, due to the necessities
+##         required to pre-process the TCGA clinical data, the script would not be
+##         generalizable - and for this reason, the TCGA columns are hard-coded.
+##
+## Output: a re-formatted clinical file containing 3 columns: sample-ID, status & time
+##
+## Date: August 21, 2012
+## Author: Peter Waltman
+##
+
+##usage, options and doc goes here
+argspec <- c("format.raw.TCGA.clinical.data.R reduces a raw TCGA clinical data file to one status and one time
+value per sample, in the format expected by the survival analysis tools
+
+        Usage: 
+                format.raw.TCGA.clinical.data.R -d <clinical.file> 
+        Options:
+                -o <output file> (tab-delimited (3 col: sample_id <tab> status <tab> time))
+              ")
+args <- commandArgs(TRUE)  ## trailing (user-supplied) arguments only
+if ( length( args ) == 1 && args =="--help") {  ## a lone --help prints the usage text and exits
+  write(argspec, stderr())
+  q();
+}
+
+lib.load.quiet <- function( package ) {  ## attach a package given by its bare name, hiding its startup messages
+   pkg.name <- as.character( substitute( package ) )
+   suppressPackageStartupMessages( library( pkg.name, character.only=TRUE ) )
+}
+lib.load.quiet(getopt)
+
+spec <- matrix( c( "clinical.fname", "d", 1, "character",  
+                   "output.fname",   "o", 2, "character"
+                  ),
+                ncol=4,
+                byrow=TRUE
+               )
+opt <- getopt( spec=spec )
+
+##set some reasonable defaults for the options that are needed,
+##but were not specified.
+if ( is.null(opt$output.fname ) ) { opt$output.fname <-file.path( getwd(), "formated.TCGA.clinical.data" ) }
+## exact duplicate rows are dropped before column 1 is promoted to the row names
+##orig.clinical.data <- read.delim( opt$clinical.fname, as.is=TRUE, row.names=1 )
+orig.clinical.data <- read.delim( opt$clinical.fname, as.is=TRUE )
+orig.clinical.data <- unique( orig.clinical.data )
+rownames( orig.clinical.data ) <- orig.clinical.data[,1]
+orig.clinical.data <- orig.clinical.data[, -1 ]
+
+##  ugh, some TCGA data sets have all NAs in the "days_to_..." columns
+if ( "days_to_last_known_alive" %in% colnames( orig.clinical.data ) ) {
+  time.cols <- c( "days_to_death", "days_to_last_followup", "days_to_last_known_alive" )
+} else {
+  time.cols <- c( "days_to_death", "days_to_last_followup"  )
+}
+good.samps <- ! apply( orig.clinical.data[, time.cols ], 1, function(x) all( is.na(x) ) | all( x <= 0, na.rm=T ) )
+## drop samples whose time columns are all NA, or whose non-NA times are all <= 0
+orig.clinical.data <- orig.clinical.data[ good.samps, ]
+
+if ( is.null(opt$status.column ) ) {
+  ## the status is read from vital_status when that column exists, otherwise from days_to_death
+  status.colname <- "vital_status"
+  if ( ! status.colname %in% colnames( orig.clinical.data ) ) {
+    status.colname <- "days_to_death"
+    if ( ! status.colname %in% colnames( orig.clinical.data ) ) {
+      stop( "can't find a valid entry with status info - have tried vital_status & days_to_death\n" )
+    }
+  }
+  opt$status.column <- which( colnames( orig.clinical.data ) %in% status.colname )
+  clinical.data <- orig.clinical.data[ , opt$status.column ]
+
+  ## status is 1 unless the entry contains "LIVING" or "Not".  grepl() returns FALSE for NA, which
+  ## used to score a missing days_to_death as deceased; in that fallback NA now gives status 0.
+  ## (vital_status entries are scored exactly as before.)
+  no.death.recorded <- ( status.colname == "days_to_death" ) & is.na( clinical.data )
+  clinical.data <- as.numeric( ! ( grepl( "(LIVING|Not)", clinical.data ) | no.death.recorded ) )
+}
+if ( is.null(opt$time.column ) ) {
+  time.colname <- "CDE.clinical_time"
+  ## a ready-made CDE.clinical_time column is used as is; otherwise the time is pieced together below
+  if ( time.colname %in% colnames( orig.clinical.data ) ) {
+    opt$time.column <- which( colnames( orig.clinical.data ) %in% time.colname )
+    clinical.data <- cbind( clinical.data,
+                           as.numeric( orig.clinical.data[, opt$time.column ] ) )
+  }
+  else {
+    dec.mat <- matrix( NA,
+                       ncol=length( time.cols ),
+                       nrow=nrow( orig.clinical.data ),
+                       dimnames=list( rownames( orig.clinical.data ),
+                                       time.cols )
+                      )
+    for ( cname in colnames( dec.mat ) ) {
+      if ( cname %in% colnames( orig.clinical.data ) ) {
+        dec.mat[, cname ] <- as.numeric( orig.clinical.data[, cname ] )
+      }
+    }
+
+    ## One time per sample.  seq_along() (not 1:length()) so that an input with no usable
+    ## samples yields an empty result instead of failing on index 1 of an empty vector.
+    if ( "days_to_last_known_alive" %in% colnames( orig.clinical.data ) ) {
+
+      opt$time.column <- sapply( seq_along( clinical.data ),
+                                 function(i) {
+                                   if ( clinical.data[i] ) {
+                                     ## this is a deceased sample
+                                     return( ifelse( ( !is.na( dec.mat[ i, "days_to_death" ] ) ),
+                                                     dec.mat[ i, "days_to_death" ],
+                                                     ifelse( ( !is.na( dec.mat[ i, "days_to_last_known_alive" ] ) ),
+                                                             dec.mat[ i, "days_to_last_known_alive" ],
+                                                             dec.mat[ i, "days_to_last_followup" ] ) ) )
+
+                                   }
+                                   else {
+                                     return( max( dec.mat[ i, c( "days_to_last_followup","days_to_last_known_alive") ], na.rm=TRUE ) )
+                                   }
+                                 }
+                                )
+    } else {
+      opt$time.column <- sapply( seq_along( clinical.data ),
+                                 function(i) {
+                                   if ( clinical.data[i] ) {
+                                     ## this is a deceased sample
+                                     return( ifelse( ( !is.na( dec.mat[ i, "days_to_death" ] ) ),
+                                                     dec.mat[ i, "days_to_death" ],
+                                                     dec.mat[ i, "days_to_last_followup" ] ) )
+
+                                   }
+                                   else {
+                                     return( max( dec.mat[ i, c( "days_to_last_followup") ], na.rm=TRUE ) )
+                                   }
+                                 }
+                                )
+    }
+
+    ## NOTE(review): max(..., na.rm=TRUE) gives -Inf (with a warning) for a living sample with no follow-up value
+    clinical.data <- cbind( clinical.data,
+                           as.numeric( opt$time.column ) )
+  }
+}
+
+clinical.data <- as.data.frame( clinical.data )  ## two columns, one row per retained sample
+names( clinical.data ) <- c( "status", "time" )
+row.names( clinical.data ) <- rownames( orig.clinical.data )
+
+
+##  check to make sure that the id's are sync'd correctly
+## the default format is to use hyphens to separate the elt's of the name
+## and to only use the 1st 3 elements of the name
+## so we check to see if they're using something else as separators and/or using more than 3 elts
+reformat.ids <- function( ids ) {
+  ## The separator is guessed from the first ID only: "TCGA." means '.'-separated, "TCGA-" means
+  ## '-'-separated; anything else is returned untouched.  IDs are cut to their first 3 fields.
+  first.id <- ids[1]
+  if ( grepl( "TCGA\\.", first.id ) ) {
+    fields <- strsplit( ids, "\\." )
+    return( sapply( fields, function( parts ) paste( parts[1:3], collapse="-" ) ) )
+  }
+  if ( ! grepl( "TCGA-", first.id ) ) return( ids )
+  ## hyphenated IDs may have fewer than 3 fields, hence the min()
+  sapply( strsplit( ids, "-" ), function( parts ) paste( parts[ 1:min( c( 3, length( parts ) ) ) ], collapse="-" ) )
+}
+
+
+## shorten the sample IDs; when that makes several rows share an ID, only the first such row is kept
+new.samp.ids <- reformat.ids( rownames( clinical.data ) )
+repeated <- duplicated( new.samp.ids )
+if ( any( repeated ) ) {
+  clinical.data <- clinical.data[ ! repeated, ]
+  new.samp.ids <- new.samp.ids[ ! repeated ]
+}
+
+## written with the sample IDs as row names; col.names=NA adds an empty header cell above them
+rownames( clinical.data ) <- new.samp.ids
+write.table( clinical.data, opt$output.fname, sep="\t", quote=FALSE, col.names=NA )
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/format.raw.TCGA.clinical.data.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/format.raw.TCGA.clinical.data.py	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+import os
+import sys
+import subprocess
+
+select_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "format.raw.TCGA.clinical.data.R")
+
+cmd_args = [ "Rscript", select_script_path ] + sys.argv[1:] 
+
+proc = subprocess.Popen( cmd_args, stderr=subprocess.PIPE, stdout=subprocess.PIPE )
+(stdoutdata, stderrdata) = proc.communicate()
+if proc.returncode:
+	sys.stderr.write(stderrdata)
+sys.stdout.write(stdoutdata)
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/format.raw.TCGA.clinical.data.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/format.raw.TCGA.clinical.data.xml	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,21 @@
+<tool id="format_raw_TCGA_clinical_data" name="Format Raw TCGA Clinical Data" force_history_refresh="True">
+    <command interpreter="python">format.raw.TCGA.clinical.data.py
+-d $dataset
+-o ${output}
+
+</command>
+    <inputs>
+    	<param name="dataset" type="data" format='tabular' label="Raw Clinical Data"/>
+    </inputs>
+    <outputs>
+        <data format="tabular" name="output" label="Formatted Clinical Data"/>
+    </outputs>
+<help>
+.. class:: infomark
+     
+**Format Raw TCGA Clinical Data** - Tool to convert a raw clinical TCGA data file into the format expected by the Survival Analysis tools
+
+**OUTPUT:**  A new clinical data file: a tab-delimited file with a sample ID column followed by status and time columns, in the format expected by the Survival Analysis tools
+
+</help>
+</tool>
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/gen.matrix.heatmap.R
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/gen.matrix.heatmap.R	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,160 @@
+#!/usr/bin/env Rscript
+argspec <- c("gen.matrix.heatmap.R draws a clustered heatmap (euclidean distance) for a tab-delimited data matrix
+
+        Usage: 
+                gen.matrix.heatmap.R -d <data.file> [-i png|pdf] [-r yes|no] [-t yes|no]
+        Optional:
+                -h <report file>  -p <report dir>  -o <output_file> (tgz of TreeView files, used with -t yes)
+                \n\n")
+args <- commandArgs(TRUE)  ## trailing (user-supplied) arguments only
+if ( length( args ) == 1 && args =="--help") {  ## a lone --help prints the usage text and exits
+  write(argspec, stderr())
+  q();
+}
+
+
+lib.load.quiet <- function( package ) {  ## attach a package given by its bare name, hiding its startup messages
+   pkg.name <- as.character( substitute( package ) )
+   suppressPackageStartupMessages( library( pkg.name, character.only=TRUE ) )
+}
+lib.load.quiet(getopt)
+lib.load.quiet( gplots )
+lib.load.quiet( amap )
+lib.load.quiet( ctc )
+## Optionally attach flashClust or fastcluster (presumably for their hclust()), preferring flashClust.
+## Only the package names (row names of installed.packages()) are consulted: matching against the
+## whole matrix would also hit other cells, e.g. a package whose dependency field is exactly that name.
+installed.pkgs <- rownames( installed.packages() )
+if ( 'flashClust' %in% installed.pkgs ) {
+  lib.load.quiet( flashClust )
+} else if ( 'fastcluster' %in% installed.pkgs ) {
+  lib.load.quiet( fastcluster )
+}
+
+
+spec <- matrix( c( "dataset",             "d", 1, "character",
+                   "reverse.rows",        "r", 2, "character",
+                   "image.format",        "i", 2, "character",
+                   "output.fname",        "o", 2, "character",
+                   "output.report.html",  "h", 2, "character",
+                   "output.report.dir",   "p", 2, "character",
+                   "output.treeview",     "t", 2, "character"
+                   ),
+                ncol=4,
+                byrow=TRUE
+               )
+
+## yes/no options arrive as strings and are converted to logicals below
+opt <- getopt( spec=spec )
+if ( is.null( opt$image.format ) ){
+  opt$image.format <- "png"
+} else {
+  if ( ! opt$image.format %in% c( "pdf", "png" ) ) stop( 'invalid image format specified\n' )
+}
+if ( is.null( opt$output.report.dir ) ) { opt$output.report.dir <- "report" }
+if ( is.null( opt$output.report.html ) ) {
+  ## the default report lives inside the report dir (it used to be hard-coded to "report/", ignoring -p)
+  report.name <- if ( opt$image.format == "pdf" ) "heatmap.pdf" else "index.html"
+  opt$output.report.html <- file.path( opt$output.report.dir, report.name )
+}
+if ( is.null( opt$output.treeview ) ) {
+  opt$output.treeview <- FALSE
+} else {
+  if ( ! opt$output.treeview %in% c( "no", "yes" ) ) {
+    stop( "invalid input to output.treeview param: ", opt$output.treeview, "\n" )
+  }
+  ##  set to TRUE/FALSE
+  opt$output.treeview <- ( opt$output.treeview == "yes" )
+}
+if ( is.null( opt$reverse.rows ) ) {
+  opt$reverse.rows <- TRUE
+}  else {
+  if ( ! opt$reverse.rows %in% c( "no", "yes" ) ) {
+    stop( "invalid input to reverse.rows param: ", opt$reverse.rows, "\n" )
+  }
+  ##  set to TRUE/FALSE
+  opt$reverse.rows <- ( opt$reverse.rows == "yes" )
+}
+
+## The report dir holds the PNG and the TreeView files.  The report file's own parent directory is
+## created too, so that the default PDF path inside the report dir can actually be opened.
+needed.dirs <- dirname( opt$output.report.html )
+if ( ( opt$image.format == "png" ) || opt$output.treeview ) needed.dirs <- unique( c( opt$output.report.dir, needed.dirs ) )
+for ( d in needed.dirs[ ! file.exists( needed.dirs ) ] ) dir.create( d, recursive=TRUE )
+
+
+data <- as.matrix( read.delim(opt$dataset, row.names=1, check.names=F ) )  ## tab-delimited matrix; first column supplies the row names
+hr <- hclust( Dist( data, "euclidean" ) )
+row.ddr <- as.dendrogram( hr )
+if ( opt$reverse.rows ) row.ddr <- rev( row.ddr )
+hc <- hclust( Dist( t( data ), "euclidean" ) )
+col.ddr <- as.dendrogram( hc )
+hmcols<-colorRampPalette(c("blue","white","red"))(256)
+## heatmap.2 arguments, kept in a list so the call can be made with do.call() below
+param.list <- list( x=data,
+                    Rowv=row.ddr,
+                    Colv=col.ddr,
+                    dendrogram="both",
+                    trace="none",
+                    col=hmcols,
+                    symbreaks=TRUE,
+                    scale="none",
+                    labRow="",
+                    labCol="",
+                    na.color='grey' ) #,
+                    ##key=FALSE )
+
+## open the graphics device: a PNG inside the report dir, or a PDF written to the report path
+if ( opt$image.format == 'png' ) {
+  ## 8.5 x 11 inch page at 72 dpi
+  png.fname <- file.path( opt$output.report.dir, "cluster.heatmap.png")
+  plot.dev <- png( png.fname,
+                   width=8.5,
+                   height=11,
+                   units='in',
+                   res=72 )
+} else {
+  pdf.fname <- opt$output.report.html
+  pdf( opt$output.report.html,
+       paper="letter" )
+}
+## draw on the device opened above, then close it
+heatmap.retval <- do.call( "heatmap.2", param.list )
+dev.off()
+
+## PNG mode: the HTML report is a bare page with one <img> per file in the report dir whose name matches "png"
+if ( opt$image.format == 'png' ) {
+  pngs <- list.files( path=opt$output.report.dir, pattern="png" )
+  img.tags <- paste0( "<div><img src=\'", pngs, "\'/></div>", collapse="" )
+  html.out <- paste( "<html>", img.tags, "</html>" )
+  cat( html.out, file=opt$output.report.html )
+}
+
+
+if ( opt$output.treeview ) {
+  ## -o names the tarball; without it the tar command line used to lose its archive argument
+  if ( is.null( opt$output.fname ) ) stop( "no output file (-o) given for the TreeView tarball\n" )
+  treeview.fname.stem <- file.path( opt$output.report.dir, "cluster.heatmap")
+
+  ## row tree (.gtr), column tree (.atr) and data table (.cdt), written via r2gtr/r2atr/r2cdt
+  fnames <- paste( treeview.fname.stem, c( ".gtr", ".atr", ".cdt", ".jtv" ), sep="" )
+  r2gtr( hr, file=fnames[1] )
+  r2atr( hc, file=fnames[2] )
+  r2cdt( hr, hc, data, file=fnames[3] )
+
+  ## jtv file now
+  jtv.str <- '<DocumentConfig><UrlExtractor/><ArrayUrlExtractor/><Views><View type="Dendrogram" dock="1"><ColorExtractor contrast="2.0"><ColorSet zero="#FFFFFF" down="#0000FF"/></ColorExtractor><ArrayDrawer/><GlobalXMap current="Fill"><FixedMap type="Fixed"/><FillMap type="Fill"/><NullMap type="Null"/></GlobalXMap><GlobalYMap current="Fill"><FixedMap type="Fixed"/><FillMap type="Fill"/><NullMap type="Null"/></GlobalYMap><ZoomXMap><FixedMap type="Fixed"/><FillMap type="Fill"/><NullMap type="Null"/></ZoomXMap><ZoomYMap><FixedMap type="Fixed"/><FillMap type="Fill"/><NullMap type="Null"/></ZoomYMap><TextView><TextView face="Monospaced" size="14"><GeneSummary/></TextView><TextView face="Monospaced" size="14"><GeneSummary/></TextView><TextView face="Monospaced" size="14"><GeneSummary/></TextView><TextView face="Monospaced" size="14"><GeneSummary/></TextView></TextView><ArrayNameView face="Monospaced" size="14"><ArraySummary included="0"/></ArrayNameView><AtrSummary/><GtrSummary/></View></Views></DocumentConfig>'
+  cat( jtv.str, file=fnames[4] )
+
+  ## Pack the four files (relative to the report dir) into the tarball.  Every path is shell-quoted
+  ## so that spaces or shell metacharacters in a path cannot break the command.
+  cmd <- paste( "tar -zcf", shQuote( opt$output.fname ),
+                paste( "--directory=", shQuote( opt$output.report.dir ), sep="" ),
+                paste( shQuote( basename( fnames ) ), collapse=" " ) )
+
+  ## a failing tar used to go unnoticed
+  tar.status <- system( cmd )
+  if ( tar.status != 0 ) stop( "tar failed while packing the TreeView files\n" )
+}
+
+
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/gen.matrix.heatmap.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/gen.matrix.heatmap.py	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+import os
+import sys
+import subprocess
+
+select_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "gen.matrix.heatmap.R")
+
+cmd_args = [ "Rscript", select_script_path ] + sys.argv[1:] 
+
+proc = subprocess.Popen( cmd_args, stderr=subprocess.PIPE, stdout=subprocess.PIPE )
+(stdoutdata, stderrdata) = proc.communicate()
+if proc.returncode:
+	sys.stderr.write(stderrdata)
+sys.stdout.write(stdoutdata)
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/gen.matrix.heatmap.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/gen.matrix.heatmap.xml	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,69 @@
+<tool id="gen_matrix_heatmap" name="Generate Heatmap for Matrix" force_history_refresh="True">
+  <command interpreter="python">gen.matrix.heatmap.py
+-d ${dataset1}
+-i ${image_format_cond.image_format}
+-t ${output_treeview_format}
+-r ${reverse_rows}
+
+#if str($image_format_cond.image_format) == 'png':
+-h $report_html
+-p ${report_html.files_path}
+#end if
+#if str($image_format_cond.image_format) == 'pdf':
+-h $report_pdf
+#end if
+#if str($treeview_output) != 'None':
+-o ${treeview_output}
+#end if
+
+</command>
+<inputs>
+  <param format="tabular" name="dataset1" type="data" label="Data Matrix" help="Provide a tab-delimited data matrix to draw as a heatmap"/>
+  <conditional name="image_format_cond">
+    <param name="image_format" type="select" display='radio' label="Image format">
+      <option value="pdf">PDF</option>
+      <option value="png" selected='true' >PNG (html wrapper)</option>
+    </param>
+  </conditional>
+  <param name="output_treeview_format" type='select' display="radio" label="Output in TreeView format as well?" help="Specify whether or not to produce files for TreeView">
+    <option value="no">No</option>
+    <option value="yes" selected='true' >Yes</option>
+  </param>
+  <param name="reverse_rows"  type='select'  display="radio" label="Reverse Row-order (to make consistent w/ TreeView Display)?" help="Specify whether or not to reverse Row-order (to make the heatmap consistent with output from TreeView; reversed otherwise)">
+    <option value="no" selected='true'>No</option>
+    <option value="yes">Yes</option>
+  </param>
+</inputs>
+<outputs>
+  <data format="html" name="report_html" label="Heatmap from data matrix (HTML)">
+    <filter>(image_format_cond['image_format']=="png")</filter>
+  </data>
+  <data format="pdf" name="report_pdf" label="Heatmap from data matrix (PDF)">
+    <filter>(image_format_cond['image_format']=="pdf")</filter>
+  </data>
+  <data format="tgz" name="treeview_output" label="Heatmap from data matrix (tgz of TreeView files)">
+    <filter>(output_treeview_format)=="yes"</filter>
+  </data>
+</outputs>
+<help>
+.. class:: infomark
+     
+**Generate Heatmap for Matrix** - Tool to Generate a simple heatmap for a data matrix 
+- The tool uses HAC, with euclidean distance.  If the user wants other options, we recommend using either the:
+
+      * Consensus Clustering tool
+      * Hierarchical Clustering tool, OR
+      * Partition Clustering tool
+
+**OUTPUT:**  
+      * **Heatmap** in either PDF or PNG format
+      * **TreeView Files**  gzip tarball file of the relevant files (OPTIONAL)
+
+----
+
+**Parameters**
+
+- **Matrix in tab-delimited format** Tab-delimited file
+
+</help>
+</tool>
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/gen.survival.curves.R
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/gen.survival.curves.R	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,251 @@
+#!/usr/bin/env Rscript
+## 
+## Calculates the log-rank test for a given clustering, in the output format from ConsensusClusterPlus
+##
+## Input (required):
+##    - cluster result (.RData file that defines 'cl', a named vector of cluster assignments)
+##    - survival data
+## Input (optional):
+## Output: a KM plot, with the most significant p-value.  Output to stdout can be captured by re-direction
+##
+## Uses: survival library
+## Date: August 21, 2012
+## Author: Peter Waltman
+##
+
+## Usage text written to stderr for --help; the flags listed here mirror the getopt spec further down
+argspec <- c("gen.survival.curves.R takes a clustering from ConsensusClusterPlus and clinical survival data
+and generates a KM-plot, along with the log-rank p-values
+
+        Usage: gen.survival.curves.R -C <cluster.file> -S <clinical.file>
+        Options:
+                -O <output file>
+                -I <image format> (pdf, png, none)
+                -M <mode>        (all, one, both)
+                                  \"all\" - perform all-vs-all log-rank test
+                                  \"one\" - perform one-vs-others log-rank test (returns min)
+                                  \"both\" - perform both \"all\" and \"one\" tests
+                -T <title>
+                -P               ( only return the p-value for this given mode - no plotting at all (to screen or file))
+                -V               ( verbose )
+              ")
+
+## Attach a package, given as an unquoted name, while hiding its startup messages.
+lib.load.quiet <- function( package ) {
+   suppressPackageStartupMessages( library( as.character( substitute( package ) ), character.only=TRUE ) )
+}
+lib.load.quiet(getopt)
+lib.load.quiet( survival )
+
+
+## --help as the sole argument: show the usage text on stderr and quit
+args <- commandArgs( trailingOnly=TRUE )
+if ( identical( args, "--help" ) ) {
+  write( argspec, stderr() ); q()
+}
+
+## Command-line options in getopt's format, one row per option:
+##   long name, short flag, argument mode (0 = none, 1 = required, 2 = optional), type
+spec <- rbind( c( "cluster.fname",  "C", 1, "character" ),
+               c( "survival.fname", "S", 1, "character" ),
+               c( "mode",           "M", 2, "character" ),
+               c( "title",          "T", 2, "character" ),
+               c( "myplots.rda",    "R", 2, "character" ),
+               c( "image.format",   "I", 2, "character" ),
+               c( "output.fname",   "O", 2, "character" ),
+               c( "pval.only",      "P", 0, "logical" ),
+               c( "verbose",        "V", 0, "logical" ) )
+
+## parse the arguments that were given to the script
+opt <- getopt( spec=spec )
+
+
+## Check the required inputs, then set reasonable defaults for the options
+## that are needed but were not specified.
+if ( is.null( opt$cluster.fname ) || is.null( opt$survival.fname ) ) {
+  ## the spec only says that -C/-S take a value, not that they must be present
+  stop( "a cluster file (-C) and a survival data file (-S) must both be specified\n" )
+}
+if ( is.null( opt$mode ) ) {
+  opt$mode <- "all"
+} else if ( ! opt$mode %in% c( 'all', 'one', 'both' ) ) {
+  stop( "invalid mode specified, '-M ", opt$mode, "'.  must be either {all, one, both}\n" )
+}
+if ( is.null( opt$title ) ) {
+  ## default title: the cluster file's name without its leading directories
+  opt$title <- strsplit( opt$cluster.fname, "\\/" )[[1]]
+  opt$title <- opt$title[ length( opt$title ) ]
+}
+if ( is.null( opt$image.format ) ) {
+  opt$image.format <- "png"
+} else if ( ! opt$image.format %in% c( "pdf", "png", "none" ) ) {
+  stop( 'invalid image format specified\n' )
+}
+if ( is.null( opt$output.fname ) ) { opt$output.fname <- paste( opt$mode, "survival.curve", opt$image.format, sep="." ) }
+if ( is.null( opt$pval.only ) ) { opt$pval.only <- FALSE }
+if ( is.null( opt$verbose ) ) { opt$verbose <- FALSE }
+
+if ( opt$verbose ) { write( "writing...", stderr() ) }  ## progress message, if requested
+
+load( opt$cluster.fname )  ## expected to define cl, a named vector of cluster assignments
+if ( ! exists( "cl" ) || is.null( names( cl ) ) ) stop( "cluster file must define a named vector called 'cl'\n" )
+cluster.data <- cbind( id=names( cl ), group_num=as.numeric( cl ) )  ## character matrix with one row per sample
+rownames( cluster.data ) <- names( cl )
+
+## Read the clinical table; its first column holds the sample ids and becomes the row names.
+survival.data <- read.delim( opt$survival.fname, as.is=TRUE, row.names=1 )
+if ( ! all( c( "time", "status" ) %in% colnames( survival.data ) ) ) {
+  ## the Surv( time, status ) formulas used by the tests and the plot need both columns
+  stop( "survival data must contain columns named 'time' and 'status'\n" )
+}
+## Add the ids as a leading column so that merge() can join on them.  Only that
+## first column is renamed, which keeps this working for tables that carry any
+## number of clinical columns besides time and status.
+survival.data <- cbind( rownames( survival.data ), survival.data )
+colnames( survival.data )[ 1 ] <- "id"
+survival.data$id <- as.character( survival.data$id )
+
+
+## Keep only the samples that appear in both tables.  The candidate ids come
+## from whichever table has more rows (the survival table on a tie) and are
+## retained when the other table knows them as well.
+clust.ids <- rownames( cluster.data )
+surv.ids <- survival.data$id
+if ( length( clust.ids ) > length( surv.ids ) ) {
+  ovp.samples <- clust.ids[ clust.ids %in% surv.ids ]
+} else {
+  ovp.samples <- surv.ids[ surv.ids %in% clust.ids ]
+}
+
+## subset each table to the shared samples, then join them on their common column(s)
+cluster.data <- cluster.data[ ovp.samples, , drop=FALSE ]
+survival.data <- merge( survival.data[ ovp.samples, ], cluster.data )
+
+
+## All-vs-all log-rank test on a data frame with time, status and group_num columns.
+## Returns the p-value of survdiff's chi-squared statistic.
+calc.all.pval <- function( survival.data ) {
+  surv.res <- survdiff( Surv( time, status )~group_num, data=survival.data )
+  ## degrees of freedom: one less than the number of groups that were compared
+  df <- length( surv.res$n ) - 1
+  pchisq( surv.res$chisq, df=df, lower.tail=FALSE )
+}
+
+## One-vs-others log-rank tests.  For each cluster in turn, its members are
+## relabelled as group 1 and all remaining samples as group 2, and the two
+## groups are compared with survdiff().
+##   survival.data: data frame with time, status and group_num columns
+##   returns: numeric vector of p-values, named by as.numeric( group_num )
+calc.one.v.others.pval <- function( survival.data ) {
+  cluster.ids <- sort( unique( as.numeric( survival.data$group_num ) ) )
+
+  test.one.cluster <- function( cluster.id ) {
+    two.group.data <- survival.data
+    is.member <- as.numeric( two.group.data$group_num ) %in% cluster.id
+    two.group.data$group_num <- ifelse( is.member, 1, 2 )
+    fit <- survdiff( Surv( time, status )~group_num, data=two.group.data )
+    pchisq( fit$chisq, df=length( fit$n ) - 1, lower.tail=FALSE )
+  }
+
+  pvals <- vapply( cluster.ids, test.one.cluster, numeric( 1 ) )
+  names( pvals ) <- cluster.ids
+  pvals
+}
+
+
+## Run the log-rank test(s) selected by opt$mode.  Each test appends a line with
+## its p-value to the plot title; pval ends up holding the value that is printed
+## when only the p-value was asked for.
+
+## Most significant result of the one-vs-others tests.
+##   survival.data: merged data frame with time, status and group_num columns
+##   returns: list with the winning cluster's label (cluster) and its p-value (pval)
+best.one.v.others <- function( survival.data ) {
+  pvals <- calc.one.v.others.pval( survival.data )
+  ## min() never returns a zero-length result, so look for a usable value directly
+  if ( ! any( is.finite( pvals ) ) ) {
+    stop( 'no valid p-value returned from the one-v-others test\n' )
+  }
+  ## which.min ignores NA/NaN entries and yields exactly one index even on ties,
+  ## so the title and the reported p-value always describe a single cluster
+  best <- which.min( pvals )
+  list( cluster=names( pvals )[ best ], pval=pvals[ best ] )
+}
+
+## guard against a mode that slipped past the option checks
+if ( ! opt$mode %in% c( "all", "one", "both" ) ) {
+  stop( "invalid mode specified, mode = ", opt$mode, "\n" )
+}
+
+## "both" runs the all-v-all test first and the one-v-others tests second
+if ( opt$mode %in% c( "all", "both" ) ) {
+  ##  add the all-v-all p-value
+  all.pval <- pval <- calc.all.pval( survival.data )
+  log.rank <- paste( "Log Rank p-value:", sprintf( "%1.2e", pval ) )
+  opt$title <- paste( opt$title, log.rank, sep="\n" )
+}
+
+if ( opt$mode %in% c( "one", "both" ) ) {
+  ## now add the best one-v-others p-value
+  best <- best.one.v.others( survival.data )
+  pval <- best$pval
+  log.rank <- paste( "Log Rank p-value for cluster", best$cluster, "is:", sprintf( "%1.2e", pval ) )
+  opt$title <- paste( opt$title, log.rank, sep="\n" )
+}
+
+if ( opt$mode == "both" && opt$pval.only ) {
+  ## both tests were run: report the more significant of the two
+  pval <- min( c( all.pval, pval ), na.rm=TRUE )
+}
+
+if ( opt$pval.only ) {
+  ## p-value-only mode: the number is the script's whole output, nothing is plotted
+  cat( paste( pval, "\n", sep="" ), file=stdout() )
+} else {
+  ## Otherwise draw the Kaplan-Meier curves, one per cluster, into the requested
+  ## image file (or into the current device when the format is 'none').
+  ngrps <- length( unique( survival.data$group_num ) )
+  col.map <- rainbow( ngrps )
+
+  ##postscript( opt$output.fname, horizontal=T, paper='letter' )
+  if ( opt$image.format == 'png' ) {
+    png( opt$output.fname,
+         width=11,
+         height=8.5,
+         units='in',
+         res=72 )
+  } else if ( opt$image.format == 'pdf' ) {
+    pdf( opt$output.fname,
+         paper="letter" )
+  } else if ( opt$image.format == 'none' ) {
+    ## do nothing - this allows other scripts to call this and hopefully plot into them
+    ## NOPE, this doesn't work.  see what I do with the myplots.rda file
+  }
+
+  plot( survfit( Surv( time, status )~group_num, data=survival.data ),
+        main = opt$title,
+        ##lty = 1:ngrps,
+        lty=1,
+        col=col.map,
+        ylab = "Probability",
+        xlab = "Survival Time in Days"
+       )
+
+  ## set the legend.labels if they're still not set yet
+  if( ! exists( "legend.labels" ) ) {
+    grp.counts <- table( as.factor( survival.data[, "group_num" ] ) )
+    ## Name each entry after the cluster it counts; numbering the entries 1..ngrps
+    ## mislabels them when the ids are not 1..ngrps in table order (e.g. "1", "10", "2").
+    ## NOTE(review): assumes survfit() orders its curves the way table() orders the groups
+    legend.labels <- paste( "Cluster", names( grp.counts ), paste( "(n=", as.integer(grp.counts), ")", sep="" ) )
+  }
+
+  legend( "topright",
+          lty=1,
+          col=col.map,
+          bty = "n",
+          legend=legend.labels
+         )
+
+  if( opt$image.format != "none" ) dev.off()
+}
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/gen.survival.curves.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/gen.survival.curves.py	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+import os
+import sys
+import subprocess
+
+select_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "gen.survival.curves.R")
+
+cmd_args = [ "Rscript", select_script_path ] + sys.argv[1:] 
+
+proc = subprocess.Popen( cmd_args, stderr=subprocess.PIPE, stdout=subprocess.PIPE )
+(stdoutdata, stderrdata) = proc.communicate()
+if proc.returncode:
+	sys.stderr.write(stderrdata)
+sys.stdout.write(stdoutdata)
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/gen.survival.curves.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/gen.survival.curves.xml	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,62 @@
+<tool id="gen_survival_curve" name="Generate Kaplan-Meiers Plot for Cluster Result" force_history_refresh="True">
+    <command interpreter="python">gen.survival.curves.py
+-C ${cluster_cls}
+-S ${survial_data}
+-M ${mode}
+-T "${title}"
+-I ${image_format_cond.image_format}
+#if str($image_format_cond.image_format) == 'png':
+-O $report_png
+#end if
+#if str($image_format_cond.image_format) == 'pdf':
+-O $report_pdf
+#end if
+</command>
+<inputs>
+  <param name="cluster_cls" type="data" format="rdata" label="Clustering Classification" help="Specify the clustering classification (must be RData file, see help)"/>
+  <param name="survial_data" type="data" format="tabular" label="Clinical Data" help="Specify the clinical data to use for the Kaplan-Meiers Plot (see help)"/>
+  <param name="mode" type="select" display='radio' label="Report Mode" help="Specify the mode to use when performing Log-Rank tests">
+    <option value="all" selected='true' >All</option>
+    <option value="one">One</option>
+    <option value="both">Both</option>
+  </param> 
+  <conditional name="image_format_cond">
+    <param name="image_format" type="select" display='radio' label="Image format" help="">
+      <option value="pdf">PDF</option>
+      <option value="png" selected='true' >PNG (html wrapper)</option>
+    </param>
+  </conditional>
+
+  <param name="title" type="text" label="Title" value="Report" help="Specify title to use in Kaplan-Meiers Plot"/>
+</inputs>
+<outputs>
+  <data format="png" name="report_png" label="Kaplan-Meier Survival Plot (PNG)">
+    <filter>(image_format_cond['image_format']=="png")</filter>
+  </data>
+  <data format="pdf" name="report_pdf" label="Kaplan-Meier Survival Plot (PDF)">
+    <filter>(image_format_cond['image_format']=="pdf")</filter>
+  </data>
+</outputs>
+<help>
+.. class:: infomark
+     
+**Generate a Kaplan-Meiers Plot for a given cluster result**
+
+----
+
+**Parameters**
+
+- **Clustering Classification** Specify the clustering classification (RData file format - use the "Convert tab-delimited Cluster Assignments to RData" tool to convert assignments in tab-delimited format).
+
+- **Clinical Data** Clinical data to use for the Kaplan-Meiers Plot (must be formatted - see the "Format Raw TCGA sample IDs" tool)
+
+- **Report Mode** Mode to use when performing Log-Rank tests **(MUST SPECIFY A PROPERLY FORMATTED CLINICAL DATA FILE)** .  Choice of:
+
+         * All - All clusters versus each other
+         * One - One cluster versus a meta-cluster composed of the others.  Search performed exhaustively.
+         * Both - Perform both the all-v-all and the one-v-others tests; both p-values are reported in the plot title
+
+- **Title** - Title to use for Kaplan-Meiers Plot
+
+</help>
+</tool>
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/hclust.R
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/hclust.R	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,87 @@
+#!/usr/bin/env Rscript
+
+## --help as the only argument prints the (placeholder) usage text to stderr and quits
+argspec <- c("hclust.R help TBD
+                \n\n")
+args <- commandArgs( trailingOnly=TRUE )
+if ( identical( args, "--help" ) ) {
+  write( argspec, stderr() ); q()
+}
+
+lib.load.quiet <- function( package ) {
+   pkg.name <- as.character( substitute( package ) )  ## the unquoted name, as a string
+   suppressPackageStartupMessages( library( pkg.name, character.only=TRUE ) )  ## attach it without the startup chatter
+}
+lib.load.quiet(getopt)
+lib.load.quiet( amap )
+##  if any of the faster clustering methods are available on this system, load them
+##  (flashClust is preferred; fastcluster is only used when flashClust is missing).
+##  system.file() returns "" for a package that is not installed, which avoids
+##  building the full installed.packages() table several times just to look up
+##  two package names.
+if ( nzchar( system.file( package="flashClust" ) ) ) {
+  lib.load.quiet( flashClust )
+} else if ( nzchar( system.file( package="fastcluster" ) ) ) {
+  lib.load.quiet( fastcluster )
+}
+
+## Option table for getopt, one row per option:
+##   long name, short flag, argument mode, value type
+spec <- rbind( c( "data.fname",         "d", 1, "character" ),
+               c( "num.k",              "k", 1, "integer" ),
+               c( "distance.metric",    "m", 2, "character" ),
+               c( "dist.obj",           "D", 2, "logical" ),
+               c( "direction",          "n", 2, "character" ),
+               c( "linkage",            "l", 2, "character" ),
+               c( "output.name",        "o", 2, "character" ) )
+
+opt <- getopt( spec=spec )
+
+## Fall back to the default for every option that was not given.  The list is
+## walked in order, so missing entries are added to opt in this same order:
+## distance metric, distance-object flag, direction, linkage, K, output file.
+defaults <- list( distance.metric="euclidean", dist.obj=FALSE, direction="cols",
+                  linkage="average", num.k=10, output.name="hclust.result.rda" )
+for ( opt.name in names( defaults ) ) {
+  if ( is.null( opt[[ opt.name ]] ) ) opt[[ opt.name ]] <- defaults[[ opt.name ]]
+}
+
+## Load the matrix; the first column supplies the row names and column names are kept verbatim
+data <- as.matrix( read.delim( opt$data.fname, header=TRUE, row.names=1, check.names=FALSE ) )
+if ( opt$direction == "cols" ) {
+  ## need to transpose b/c the elements that get clustered must be the rows
+  ## this shouldn't have an effect upon a distance matrix
+  data <- t( data )
+}
+if ( opt$num.k < 1 || opt$num.k > nrow( data ) ) {
+  stop( "K must be between 1 and the number of elements (", opt$direction, ") in the data matrix to be clustered\n" )
+}
+
+if ( opt$dist.obj ) {
+  ## the input is already a matrix of pairwise distances; just convert it
+  dist.mat <- as.dist( data )
+} else {
+  ##  amap's Dist uses other names for the correlation-based measures, so
+  ##  translate ours (the names) into the ones Dist expects (the values)
+  amap.distance <- c( euclidean="euclidean", maximum="maximum", manhattan="manhattan",
+                      canberra="canberra", binary="binary", cosine="pearson",
+                      abscosine="abspearson", pearson="correlation",
+                      abspearson="abscorrelation", spearman="spearman", kendall="kendall" )
+  if ( ! opt$distance.metric %in% names( amap.distance ) ) stop("unsupported distance.")
+  dist.mat <- Dist( data, method=unname( amap.distance[ opt$distance.metric ] ) )
+  attr( dist.mat, "method" ) <- opt$distance.metric  ## record the metric under our own name
+}
+
+##  now, do the clustering: build the tree and cut it into num.k groups
+##  (hclust may be a replacement from flashClust or fastcluster, if one of
+##   those packages was attached earlier in the script)
+treecl.res <- hclust( dist.mat, method=opt$linkage )
+cl <- cutree( treecl.res, k=opt$num.k )
+
+## need to re-transpose the data back to its original dimensionality
+if ( opt$direction == "cols" ) data <- t( data )
+
+## save the result under the names that the other cluster tools load:
+##   treecl.res - the hclust tree, cl - the cutree assignments,
+##   data - the input matrix in its original orientation
+save( treecl.res, cl, data, file=opt$output.name )
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/hclust.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/hclust.py	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+import os
+import sys
+import subprocess
+
+select_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "hclust.R")
+
+cmd_args = [ "Rscript", select_script_path ] + sys.argv[1:] 
+
+proc = subprocess.Popen( cmd_args, stderr=subprocess.PIPE, stdout=subprocess.PIPE )
+(stdoutdata, stderrdata) = proc.communicate()
+if proc.returncode:
+	sys.stderr.write(stderrdata)
+sys.stdout.write(stdoutdata)
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/hclust.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/hclust.xml	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,98 @@
+<tool id="hcluster" name="Hierarchical Clustering (HAC)" force_history_refresh="True">
+    <command interpreter="python">hclust.py
+-d $dataset 
+${dist_obj}
+-n ${direction} 
+-m ${distance_metric} 
+-l ${linkage} 
+-k ${numk} 
+-o ${rdata_output}
+
+</command>
+    <inputs>
+    	<param name="dataset" type="data" format='tabular' label="Data Set" help="Specify the data matrix (tab-delimited) to be clustered"/>
+	<param name="dist_obj" type="boolean" label="Distance Object (R dist object)?" truevalue="-D" falsevalue="" checked="False" help="Check if the matrix contains the pairwise distances between a set of objects"/>
+    	<param name="direction" type="select" label="Cluster Samples or Genes?" help="Specify the matrix dimension to cluster (see help below)">
+	  <option value="cols">Columns (Samples)</option>
+	  <option value="rows" selected='true'>Rows (Genes)</option>
+    	</param>
+    	
+    	<param name="distance_metric" type="select" label="Distance Metric" help="Specify the distance metric to use (see help below)">
+	  <option value="cosine" selected='true'>Cosine</option>
+	  <option value="abscosine">Absolute Cosine</option>
+	  <option value="pearson">Pearson</option>
+	  <option value="abspearson">Absolute Pearson</option>
+	  <option value="spearman">Spearman</option>
+	  <option value="kendall">Kendall</option>
+	  <option value="euclidean">Euclidean</option>
+	  <option value="maximum">Maximum</option>
+	  <option value="manhattan">Manhattan (AKA city block)</option>
+	  <option value="canberra">Canberra</option>
+	  <option value="binary">Binary</option>
+    	</param>
+    	
+    	<param name="linkage" type="select" label="Linkage" help="Specify the linkage to use when clustering (see help below)">
+	  <option value="average">Average</option>
+	  <option value="centroid">Centroid</option>
+	  <option value="complete" selected='true'>Complete</option>
+	  <option value="mcquitty">McQuitty</option>
+	  <option value="median">Median</option>
+	  <option value="single">Single</option>
+	  <option value="ward">Ward</option>
+    	</param>
+    	
+    	<param name="numk" type="integer" label="Number of Clusters" value="50" help="Specify the number of clusters to use"/>
+    	
+    </inputs>
+    <outputs>
+        <data format="rdata" name="rdata_output" label="Hierarchical Clustering Result (RData)"/>
+    </outputs>
+<help>
+.. class:: infomark
+     
+**Perform Hierarchical Clustering (Cluster Samples) on a specified data set**
+
+----
+
+**Parameters**
+
+- **Data Set** - Specify the data matrix to be clustered.  Data must be formatted as follows:
+
+         * Tab-delimited
+         * Use row/column headers
+
+- **Cluster Samples or Genes** - Specify the dimension of the matrix to cluster:
+
+         * Rows (Genes)
+         * Columns (Samples)
+
+- **Distance Object** Specify whether or not the data set is a pairwise distance matrix
+
+- **Distance Metric** Specify the distance metric to use.  Choice of:
+
+	 * Cosine (AKA uncentered pearson)
+	 * Absolute Cosine (AKA uncentered pearson, absolute value)
+         * Pearson (pearson correlation)
+	 * Absolute Pearson (pearson correlation, absolute value)
+         * Spearman (spearman correlation)
+	 * Kendall (Kendall's Tau)
+         * Euclidean (euclidean distance)
+	 * Maximum
+	 * Manhattan (AKA city block)
+	 * Canberra
+	 * Binary
+
+- **Linkage** Specify the linkage to use when clustering.  Choice of:
+
+         * Average (see documentation for R's hclust function for explanation of choices)
+         * Single
+         * Complete
+         * Median
+         * Centroid
+         * McQuitty
+         * Ward
+
+- **Number of Clusters** Specify the number of clusters to use
+
+</help>
+</tool>
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/heatmap.from.cluster.result.R
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/heatmap.from.cluster.result.R	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,388 @@
+#!/usr/bin/env Rscript
+argspec <- c("tab.2.cdt.R converts a data matrix to cdt format
+
+        Usage: 
+                tab.2.cdt.R -d <data.file> 
+        Optional:
+                            -o <output_file>
+                \n\n")
+args <- commandArgs(TRUE)
+if ( length( args ) == 1 && args =="--help") { 
+  write(argspec, stderr())
+  q();
+}
+
+
+## Quietly attach the package whose bare (unquoted) name is passed in.
+lib.load.quiet <- function( package ) {
+   suppressPackageStartupMessages( library( as.character( substitute( package ) ), character.only=TRUE ) )
+}
+lib.load.quiet(getopt)
+lib.load.quiet( gplots )
+lib.load.quiet( ctc )
+## Load a faster clustering package when one is installed: flashClust is
+## preferred and fastcluster is the fallback.  system.file() returns "" for a
+## package that is not installed, so there is no need to build the whole
+## installed.packages() table (repeatedly) just to look up two names.
+if ( nzchar( system.file( package="flashClust" ) ) ) {
+  lib.load.quiet( flashClust )
+} else if ( nzchar( system.file( package="fastcluster" ) ) ) {
+  lib.load.quiet( fastcluster )
+}
+
+
+spec <- matrix( c( "dataset",             "d", 1, "character",
+                   "second.dir",          "s", 2, "character",
+                   "dataset2",            "D", 2, "character",
+                   "reverse.rows",        "r", 2, "character",
+                   "image.format",        "i", 2, "character",
+                   "plot.kms",            "k", 2, "character",
+                   "output.fname",        "o", 2, "character",
+                   "output.report.html",  "h", 2, "character",
+                   "output.report.dir",   "p", 2, "character",
+                   "output.treeview",     "t", 2, "character",
+                   "survival.script",     "z", 2, "character",
+                   "cluster.fname",       "C", 2, "character",
+                   "survival.fname",      "S", 2, "character",
+                   "survival.image",      "I", 2, "character",
+                   "survival.mode",       "M", 2, "character",
+                   "title",               "T", 2, "character"
+                   ),
+                nc=4,
+                byrow=TRUE
+               )
+
+
+opt <- getopt( spec=spec )
+if ( is.null( opt$image.format ) ){
+  opt$image.format <- "png"
+} else {
+  if ( ! opt$image.format %in% c( "pdf", "png" ) ) stop( 'invalid image format specified\n' )
+}
+if ( is.null( opt$output.report.dir ) ) { opt$output.report.dir <- "report" }
+if ( is.null( opt$output.report.html ) ) {
+  opt$out.dir <- 'report'
+  if (! file.exists( opt$out.dir ) ) {
+    dir.create( opt$out.dir )
+  } else {
+      if ( ! file.info( 'report' )$isdir ) {
+        opt$out.dir <- 'heatmap.report'
+        dir.create( opt$out.dir )
+      }
+    }
+
+  if ( opt$image.format == "pdf" ) opt$output.report.html <- file.path( opt$out.dir ,"heatmap.pdf" )
+  if ( opt$image.format == "png" ) opt$output.report.html <- file.path( opt$out.dir ,"index.html" )
+}
+## plot.kms (yes/no): when on, check the survival inputs and fill in the survival defaults
+if ( is.null( opt$plot.kms ) ) {
+  opt$plot.kms <- FALSE
+}  else {
+  if ( ! opt$plot.kms %in% c( "no", "yes" ) ) {
+    stop( "invalid input to plot.kms param", opt$plot.kms, "\n" )
+  }
+  opt$plot.kms <- ( opt$plot.kms == "yes" )  ##  set to TRUE/FALSE
+  if ( opt$plot.kms ) {
+    opt$cluster.fname <- opt$dataset
+    if ( is.null( opt$survival.fname ) || ( !file.exists( opt$survival.fname ) ) ) stop( 'must provide a valid file w/clinical data\n' )
+    if ( is.null( opt$survival.script ) || ( !file.exists( opt$survival.script ) ) ) stop( 'must provide a valid path to the gen.survival.curves.R file\n' )
+    ## the option is declared as survival.mode (-M) and is read under that name when
+    ## the survival script is invoked, so that is the field to default and validate
+    if ( is.null( opt$survival.mode ) ) {
+      opt$survival.mode <- "all"
+    } else if ( ! opt$survival.mode %in% c( 'all', 'one', 'both' ) ) {
+      stop( "invalid mode specified, '-M ", opt$survival.mode, "'.  must be either {all, one, both}\n" )
+    }
+    opt$mode <- opt$survival.mode  ## keep the old field populated for anything still reading it
+    if ( is.null( opt$title ) ) {
+      opt$title <- strsplit( opt$cluster.fname, "\\/" )[[1]]
+      opt$title <- opt$title[ length( opt$title ) ]
+    }
+  }
+}
+if ( is.null( opt$output.treeview ) ) {
+  opt$output.treeview <- FALSE
+} else {
+  if ( ! opt$output.treeview %in% c( "no", "yes" ) ) {
+    stop( "invalid input to output.treeview param", opt$output.treeview, "\n" )
+  }
+  ##  set to TRUE/FALSE
+  opt$output.treeview <- ( opt$output.treeview == "yes" )
+}
+if ( is.null( opt$reverse.rows ) ) {
+  opt$reverse.rows <- TRUE
+}  else {
+  if ( ! opt$reverse.rows %in% c( "no", "yes" ) ) {
+    stop( "invalid input to reverse.rows param", opt$reverse.rows, "\n" )
+  }
+
+  ##  set to TRUE/FALSE
+  opt$reverse.rows <- ( opt$reverse.rows == "yes" )
+}
+
+if ( is.null( opt$second.dir ) ) { opt$second.dir <- "no" }  ## default: no clustering of the second direction
+if ( ( opt$second.dir == "prev" ) && is.null( opt$dataset2 ) ) stop( "must specify an rdata file to load if a previous result is to be used to cluster the 2nd direction\n" )
+if ( ( opt$image.format == "png" ) || opt$output.treeview ) {
+  if ( !file.exists( opt$output.report.dir ) ){
+    dir.create(opt$output.report.dir, recursive=T)
+  }
+}
+
+
+
+load( opt$dataset )  ## should load the cl, treecl.res (or partcl.res) and data
+##  pre-set the cluster results for rows & cols to NULL
+hr <- hr.cl <- hc <- hc.cl <- row.ddr <- col.ddr <- NULL
+if ( exists( 'treecl.res' ) ) {
+
+  if ( is.null( treecl.res$dist.method ) ) treecl.res$dist.method <- 'euclidean'  # just set it to some stub so that the ctc fn's don't complain
+  if ( all( names( cl ) %in% rownames( data ) ) ) {
+    hr <- treecl.res
+    hr.cl <- cl
+  } else if ( all( names( cl ) %in% colnames( data ) ) ) {
+    hc <- treecl.res
+    hc.cl <- cl
+  } else {
+    stop( "Specified cluster result does not come from this data set\n" )
+  }
+
+} else {
+  if ( exists( 'partcl.res' ) ) {
+    if ( all( names( cl ) %in% rownames( data ) ) ) {
+      hr <- NA
+      hr.cl <- cl
+      orig.data <- data
+      data <- data[ names( cl ), ]  ## partcl.res should now be sorted in order of cluster
+    } else if ( all( names( cl ) %in% colnames( data ) ) ) {
+      hc <- NA
+      hc.cl <- cl
+      orig.data <- data
+      data <- data[ , names( cl ) ]  ## partcl.res should now be sorted in order of cluster
+    } else {
+      stop( "Specified cluster result does not come from this data set\n" )
+    }
+  }
+  else {
+    stop( 'could not find a valid cluster result to use for primary direction\n' )
+  }
+}
+
+
+if ( opt$second.dir %in% c( "yes", "prev" ) ) {
+
+  if ( opt$second.dir == "yes" ) {
+    if ( is.null( hr ) ) {
+      hr <- hclust( dist( data ) )
+    } else if ( is.null( hc ) ) {
+      hc <- hclust( dist( t( data ) ) )
+    } 
+  } else {  ## opt$second.dir == "prev"
+
+    ## prep for loading new cluster result
+    if ( ! exists( 'orig.data' ) ) orig.data <- data
+    if ( exists( "treecl.res" ) ) {
+      rm( treecl.res )
+    } else if ( exists( "partcl.res" ) ) {
+      rm( partcl.res )
+    } else stop( "no primary clustering found when generating the 2nd\n" )
+    rm( cl, data )
+
+    
+    load( opt$dataset2 ) ## this should bring in the cl obj for the 2nd direction
+
+    ## check the data 1st
+    if ( length( orig.data ) != length( data ) ) stop( "incompatible cluster results in 2nd results file - matrices are diff lengths\n" )
+    if ( nrow( orig.data ) != nrow( data ) ) stop( "incompatible cluster results in 2nd results file - matrices have diff dimensions\n" )
+    ## Compare the two matrices cell by cell (their lengths were checked above):
+    ## NAs must sit in exactly the same cells, and all other cells must be equal.
+    ## Testing the NA pattern first also keeps the value comparison below from
+    ## ever seeing an NA on one side only, and it reports a mismatch as soon as
+    ## any single NA position differs rather than only when all of them do.
+    if ( any( is.na( orig.data ) != is.na( data ) ) ) {
+      stop( "incompatible cluster results in 2nd results file - matrices contain diff contents\n" )
+    }
+    ## cells that are NA in both matrices are skipped here
+    if ( ! all( orig.data == data, na.rm=TRUE ) ) {
+      stop( "incompatible cluster results in 2nd results file - matrices contain diff contents\n" )
+    }
+    ## looks like data is the same, so drop a copy and start chugging
+    rm( orig.data ); gc()
+
+    if ( exists( 'treecl.res' ) ) {
+      if ( is.null( treecl.res$dist.method ) ) treecl.res$dist.method <- 'euclidean'  # just set it to some stub so that the ctc fn's don't complain
+      ## cl is a named vector, so its element names (not rownames) identify what was clustered
+      if ( is.null( hr ) ) {
+        if ( all( names( cl ) %in% rownames( data ) ) ) {
+          hr <- treecl.res
+          hr.cl <- cl
+        } else {
+          stop( "results file for 2nd direction doesn't contain cluster for 2ndary direction (rows in this case)\n" )
+        }
+      } else if ( is.null( hc ) ) {
+        if ( all( names( cl ) %in% colnames( data ) ) ) {
+          hc <- treecl.res
+          hc.cl <- cl
+        } else {
+          stop( "results file for 2nd direction doesn't contain cluster for 2ndary direction (columns in this case)\n" )
+        }
+      } else {
+        stop( "should never get here\n" )
+      }
+    } else if ( exists( 'partcl.res' ) ) {
+      if ( is.null( hr ) ) {
+        if ( all( names( cl ) %in% rownames( data ) ) ) {
+          hr <- NA
+          hr.cl <- cl
+          data <- data[ names( cl ), ]  ## partcl.res should now be sorted in order of cluster
+        } else {
+          stop( "results file for 2nd direction doesn't contain cluster for 2ndary direction (rows in this case)\n" )
+        }
+      } else if ( is.null( hc ) ) {
+        if ( all( names( cl ) %in% colnames( data ) ) ) {
+          hc <- NA
+          hc.cl <- cl
+          data <- data[ , names( cl ) ]  ## partcl.res should now be sorted in order of cluster
+        } else {
+          stop( "results file for 2nd direction doesn't contain cluster for 2ndary direction (columns in this case)\n" )
+        }
+      } else {
+        stop( "should never get here\n" )
+      }
+    }
+  }
+}
+
+## Now, re-set hc & nr to NULL if they were set to NA
+## we used NA to signify that they were set by kmeans/pam, but now, we need to reset them
+## for the following lines (that generate the dendrograms (if there was an hclust result)
+if ( ( !is.null( hr ) ) && is.na( hr ) ) hr <- NULL
+if ( ( !is.null( hc ) ) && is.na( hc ) ) hc <- NULL
+
+if ( ! is.null( hr ) ) {
+  row.ddr <- as.dendrogram( hr )
+  ##  need this to make sure that the heatmap is oriented the same way as it is in TreeView
+  if ( opt$reverse.rows ) row.ddr <- rev( row.ddr )  
+}
+
+if ( ! is.null( hc ) ) {
+  col.ddr <- as.dendrogram( hc )
+}
+  
+
+hmcols<-colorRampPalette(c("blue","white","red"))(256)
+
+if ( ( ! is.null( row.ddr ) ) && ( ! is.null( col.ddr ) ) ) {
+  dend.param <- "both"
+} else {
+  dend.param <- "none"
+  if ( ! is.null( row.ddr ) ) dend.param <- "row"
+  if ( ! is.null( col.ddr ) ) dend.param <- "column"
+}
+
+
+## Arguments for the heatmap.2 call that is issued further down through do.call().
+## Row/column labels are blanked and no scaling or trace lines are requested.
+param.list <- list( x=data,
+                    Rowv=row.ddr,
+                    Colv=col.ddr,
+                    dendrogram=dend.param,
+                    trace="none",
+                    col=hmcols,
+                    symbreaks=TRUE,
+                    scale="none",
+                    labRow="",
+                    labCol="",
+                    na.color='grey' )
+## (a key=FALSE argument was previously commented out here, so the key setting is
+##  left at the plotting function's default)
+
+## Optional side-colour bars: one colour per distinct cluster id.
+## The palette is sized by the number of distinct ids (not by the largest id) so that
+## the palette and its names always have the same length, even when ids are not 1..k
+## (e.g. 0-based or sparse ids).
+if ( ! is.null( hr.cl ) ) {
+  hr.ids <- sort( unique( as.numeric( hr.cl ) ) )
+  hrcols <- rainbow( length( hr.ids ) )
+  names( hrcols ) <- hr.ids
+  rowColLabs <- hrcols[ as.character( as.numeric( hr.cl ) ) ]
+  param.list <- c( param.list, list( RowSideColors=rowColLabs ) )
+}
+if ( ! is.null( hc.cl ) ) {
+  hc.ids <- sort( unique( as.numeric( hc.cl ) ) )
+  hccols <- rainbow( length( hc.ids ) )
+  names( hccols ) <- hc.ids
+  colColLabs <- hccols[ as.character( as.numeric( hc.cl ) ) ]
+  param.list <- c( param.list, list( ColSideColors=colColLabs ) )
+}
+
+
+## Open the output device: a letter-sized PDF written straight to the report file,
+## or (for 'png') an 8.5in x 11in PNG at 72 dpi inside the report directory
+if ( opt$image.format != 'png' ) {
+  pdf.fname <- opt$output.report.html
+  pdf( file=opt$output.report.html, paper="letter" )
+} else {
+  png.fname <- file.path( opt$output.report.dir, "cluster.heatmap.png" )
+  plot.dev <- png( filename=png.fname, width=8.5, height=11, units='in', res=72 )
+}
+
+## draw the heatmap with the arguments collected in param.list
+do.call( "heatmap.2", param.list )
+
+## close the device opened above
+dev.off()
+
+## Optionally run the external survival script to add a Kaplan-Meier plot.
+## The script is only invoked when the image format is png; for any other format the
+## request is ignored (same as before, where the command was built but never run).
+if ( opt$plot.kms && opt$image.format == "png" ) {
+
+  png.fname <- file.path( opt$output.report.dir, "kaplan.meier.survival.png" )
+
+  ## Every argument value is passed through shQuote() so that paths containing
+  ## spaces or shell metacharacters reach the script as single, literal arguments.
+  ## NOTE(review): opt$survival.script itself is left unquoted in case it carries an
+  ## interpreter prefix; if it is always a bare path it should be quoted as well.
+  cmd.string <- paste( opt$survival.script,
+                       "-C", shQuote( opt$dataset ),        ## cluster result to plot
+                       "-S", shQuote( opt$survival.fname ),
+                       "-M", shQuote( opt$survival.mode ),
+                       "-I", "png",
+                       "-O", shQuote( png.fname ) )
+
+  kms.status <- system( cmd.string )
+  if ( kms.status != 0 ) {
+    warning( "survival script exited with status ", kms.status, call.=FALSE )
+  }
+}
+
+
+## For png output, write a minimal HTML page that embeds every PNG found in the
+## report directory (one <div><img/></div> per file, referenced by bare file name).
+if ( opt$image.format == 'png' ) {
+  ## anchored pattern: only names that END in ".png" (the old pattern "png" matched the
+  ## substring anywhere in a name); 'pattern' is spelled out rather than partially matched
+  pngs <- list.files( path=opt$output.report.dir, pattern="\\.png$" )
+  img.tags <- paste0( "<div><img src='", pngs, "'/></div>", collapse="" )
+  html.out <- paste( "<html>", img.tags, "</html>" )
+  cat( html.out, file=opt$output.report.html )
+}
+
+
+## Optionally export the clustering for TreeView: .gtr (row tree) and .atr (column
+## tree) when the corresponding tree exists, plus .cdt (data) and .jtv (view
+## settings); all generated files are then bundled into a gzip'd tarball at
+## opt$output.fname.
+if ( opt$output.treeview ) {
+  treeview.fname.stem <- file.path( opt$output.report.dir, "cluster.heatmap")
+  fnames <- character()
+  if ( ! is.null( hr ) ) {
+    fname <- paste0( treeview.fname.stem, ".gtr" )
+    r2gtr( hr, file=fname )
+    fnames <- c( fnames, fname )
+  } else {
+    ## no row tree: hand r2cdt a stand-in that keeps the rows in their current order
+    ## (seq_len() stays empty for a 0-row matrix, unlike 1:nrow)
+    hr <- list( order=seq_len( nrow( data ) ) )
+  }
+  if ( ! is.null( hc ) ) {
+    fname <- paste0( treeview.fname.stem, ".atr" )
+    r2atr( hc, file=fname )
+    fnames <- c( fnames, fname )
+  } else {
+    ## no column tree: stand-in that keeps the columns in their current order
+    hc <- list( order=seq_len( ncol( data ) ) )
+  }
+
+
+  fname <- paste0( treeview.fname.stem, ".cdt" )
+  r2cdt( hr, hc, data, file=fname )
+  fnames <- c( fnames, fname )
+
+  ## jtv file now
+  jtv.str <- '<DocumentConfig><UrlExtractor/><ArrayUrlExtractor/><Views><View type="Dendrogram" dock="1"><ColorExtractor contrast="2.0"><ColorSet zero="#FFFFFF" down="#0000FF"/></ColorExtractor><ArrayDrawer/><GlobalXMap current="Fill"><FixedMap type="Fixed"/><FillMap type="Fill"/><NullMap type="Null"/></GlobalXMap><GlobalYMap current="Fill"><FixedMap type="Fixed"/><FillMap type="Fill"/><NullMap type="Null"/></GlobalYMap><ZoomXMap><FixedMap type="Fixed"/><FillMap type="Fill"/><NullMap type="Null"/></ZoomXMap><ZoomYMap><FixedMap type="Fixed"/><FillMap type="Fill"/><NullMap type="Null"/></ZoomYMap><TextView><TextView face="Monospaced" size="14"><GeneSummary/></TextView><TextView face="Monospaced" size="14"><GeneSummary/></TextView><TextView face="Monospaced" size="14"><GeneSummary/></TextView><TextView face="Monospaced" size="14"><GeneSummary/></TextView></TextView><ArrayNameView face="Monospaced" size="14"><ArraySummary included="0"/></ArrayNameView><AtrSummary/><GtrSummary/></View></Views></DocumentConfig>'
+  fname <- paste0( treeview.fname.stem, ".jtv" )
+  cat( jtv.str, file=fname )
+  fnames <- c( fnames, fname )
+
+  ## Archive the files by base name, relative to the report directory.
+  ## All paths are shell-quoted so spaces/metacharacters cannot break the command.
+  cmd <- paste( "tar -zcf",
+                shQuote( opt$output.fname ),
+                paste0( "--directory=", shQuote( opt$output.report.dir ) ),
+                paste( shQuote( basename( fnames ) ), collapse=" " ) )
+  tar.status <- system( cmd )
+  if ( tar.status != 0 ) {
+    stop( "failed to create TreeView archive: ", opt$output.fname, " (tar exit status ", tar.status, ")" )
+  }
+}
+
+
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/heatmap.from.cluster.result.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/heatmap.from.cluster.result.py	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,17 @@
+#!/usr/bin/env python
+import os
+import sys
+import subprocess
+
+select_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "heatmap.from.cluster.result.R")
+survival_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "gen.survival.curves.R")
+
+cmd_args = [ "Rscript", select_script_path ] + sys.argv[1:] + [ "-z", survival_script_path ]
+
+print cmd_args
+
+proc = subprocess.Popen( cmd_args, stderr=subprocess.PIPE, stdout=subprocess.PIPE )
+(stdoutdata, stderrdata) = proc.communicate()
+if proc.returncode:
+	sys.stderr.write(stderrdata)
+sys.stdout.write(stdoutdata)
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/heatmap.from.cluster.result.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/heatmap.from.cluster.result.xml	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,125 @@
+<tool id="cluster_heatmap" name="Generate Heatmap for Clustering Result" force_history_refresh="True">
+  <command interpreter="python">heatmap.from.cluster.result.py
+-d ${dataset1}
+-s ${cluster_second_direction_cond.cluster_second_direction}
+
+#if str($cluster_second_direction_cond.cluster_second_direction) == 'prev':
+-D ${cluster_second_direction_cond.dataset2}
+#end if
+-i ${image_format_cond.image_format}
+-t ${output_treeview_format}
+-r ${reverse_rows}
+-k ${plot_kms_cond.plot_kms}
+#if str($plot_kms_cond.plot_kms) == 'yes':
+-C ${dataset1}
+-S ${plot_kms_cond.survial_data}
+-M ${plot_kms_cond.mode}
+-T "${plot_kms_cond.title}"
+-I none
+#end if
+
+#if str($image_format_cond.image_format) == 'png':
+-h $report_html
+-p ${report_html.files_path}
+#end if
+#if str($image_format_cond.image_format) == 'pdf':
+-h $report_pdf
+-p ${report_pdf.files_path}
+#end if
+#if str($treeview_output) != 'None':
+-o ${treeview_output}
+#end if
+
+</command>
+<inputs>
+  <param format="rdata" name="dataset1" type="data" label="Clustering Classification" help="Cluster result file from CCPLUS, HAC, or PAM"/>
+  <conditional name="plot_kms_cond">
+    <param name="plot_kms" type='select' label="Plot Kaplan-Meiers Survival Plot as well (primary clustering ONLY)?" help="NOTE: this only works when the Image Format is PNG.  For a PDF of the KM plot, you can use the 'Generate Kaplan-Meiers Plot for Cluster Result' tool">
+      <option value="no" selected='true'>No</option>
+      <option value="yes" >Yes</option>
+    </param>
+    <when value='yes'>
+      <param name="survial_data" type="data" format="tabular" label="Clinical Data" help="Specify the clinical data to use for the Kaplan-Meiers Plot (see help)"/>
+      <param name="mode" type="select" display='radio' label="Report Mode" help="Specify the mode to use when performing Log-Rank tests (see help below)">
+	<option value="all" selected='true' >All</option>
+	<option value="one">One</option>
+	<option value="both">Both</option>
+      </param> 
+      <param name="title" type="text" label="Title" value="Report"/>
+    </when>
+  </conditional>
+  <conditional name="cluster_second_direction_cond">
+    <param name="cluster_second_direction" type="select" label="Cluster the second dimension? (e.g. rows if this is a sample cluster)"  help="Cluster the 2nd dimension of matrix in the cluster result (see help below)">
+      <option value="no" selected="true">No</option>
+      <option value="yes">Yes</option>
+      <option value="prev">Previous Cluster Result</option>
+    </param>
+    <when value="prev">
+      <param format="rdata" name="dataset2" type="data" label="Previous Cluster result #2 (secondary result, e.g. rows)" help="Cluster result file from CCPLUS, HAC, or PAM"/>
+    </when>
+  </conditional>
+  <conditional name="image_format_cond">
+    <param name="image_format" type="select" display='radio' label="Image format">
+      <option value="pdf">PDF</option>
+      <option value="png" selected='true' >PNG (html wrapper)</option>
+    </param>
+  </conditional>
+  <param name="output_treeview_format" type='select' display="radio" label="Output in TreeView format as well?" help="Specify whether or not to produce files for TreeView" >
+    <option value="no">No</option>
+    <option value="yes" selected='true' >Yes</option>
+  </param>
+  <param name="reverse_rows"  type='select'  display="radio" label="Reverse Row-order (to make consistent w/ TreeView Display)?" help="Specify whether or not to reverse Row-order (to make the heatmap consistent with output from TreeView; reversed otherwise)" >
+    <option value="no" selected='true'>No</option>
+    <option value="yes">Yes</option>
+  </param>
+</inputs>
+<outputs>
+  <data format="html" name="report_html" label="Heatmap from cluster results (HTML)">
+    <filter>(image_format_cond['image_format']=="png")</filter>
+  </data>
+  <data format="pdf" name="report_pdf" label="Heatmap from cluster results (PDF)">
+    <filter>(image_format_cond['image_format']=="pdf")</filter>
+  </data>
+  <data format="tgz" name="treeview_output" label="Heatmap from cluster results (tgz of TreeView files)">
+    <filter>(output_treeview_format)=="yes"</filter>
+  </data>
+</outputs>
+<help>
+.. class:: infomark
+     
+**Generate Heatmap for Clustering Result** - Tool to generate a heatmap and dendrogram for a clustering result
+
+**OUTPUT:**  
+      * **Heatmap** in either PDF or PNG format
+      * **TreeView Files**  gzip tarball file of the relevant files (OPTIONAL)
+
+----
+
+**Parameters**
+
+- **Clustering Classification** Specify the clustering classification (RData file format - use the 'Convert tab-delimited Cluster Assignments to RData' tool to convert assignments in tab-delimited format).
+
+- **Plot Kaplan-Meiers Survival Plot as well (primary clustering ONLY)?** Specify whether or not to also plot a Kaplan-Meiers Survival Plot.  **NOTE:** the cluster results must be a **SAMPLE** cluster.
+ 
+- **Cluster the second dimension?** Specify whether or not to cluster the 2nd dimension of matrix in the cluster result.  Choice of:
+         * No
+         * Yes - Generate a default clustering (HAC with Euclidean distance)
+         * Previous Cluster Result - MUST specify a previous clustering result
+
+- **IF Cluster the second dimension? is a previous clustering result:**
+         * **Previous Cluster result #2** Specify the clustering classification for the 2nd dimension (RData file format - use the 'Convert tab-delimited Cluster Assignments to RData' tool to convert assignments in tab-delimited format).
+
+- **IF Plot Kaplan-Meiers Survival Plot is YES:**
+         * **Clinical Data** Clinical data to use for the Kaplan-Meiers Plot (must be formatted - see "Format Raw TCGA sample IDs")
+
+         * **Report Mode** Mode to use when performing Log-Rank tests **(MUST SPECIFY A PROPERLY FORMATTED CLINICAL DATA FILE)** .  Choice of:
+
+                 * All - All clusters versus each other
+                 * One - One cluster versus a meta-cluster composed of the others.  Search performed exhaustively.
+                 * Both - Perform both the all-v-all and the one-v-others tests; select the choice of K that gives the best result
+
+         * **Title** - Title to use for Kaplan-Meiers Plot
+
+
+</help>
+</tool>
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/impute.knn.R
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/impute.knn.R	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,44 @@
+#!/usr/bin/env Rscript
+## Usage text; written to stderr when the script is called with --help as its only argument
+argspec <- "impute.knn.R replaces missing values, using the impute.knn function from the impute package
+
+        Usage: 
+                impute.knn.R -d <data.file> 
+        Optional:
+                             -o <output_file>
+                \n\n"
+args <- commandArgs( trailingOnly=TRUE )
+if ( identical( args, "--help" ) ) {
+  write( argspec, file=stderr() )
+  q()
+}
+
+## Attach a package, given as a bare (unquoted) name, while suppressing its
+## startup messages.
+lib.load.quiet <- function( package ) {
+  pkg.name <- as.character( substitute( package ) )
+  suppressPackageStartupMessages( library( pkg.name, character.only=TRUE ) )
+}
+lib.load.quiet(getopt)
+lib.load.quiet( impute )
+
+## Command-line options (getopt spec rows: long name, short flag, argument mode, type):
+##   -d  input matrix file
+##   -o  output file (defaults to "impute.knn.<input base name>")
+spec <- matrix( c( "data.fname",      "d", 1, "character",
+                   "output.fname",    "o", 2, "character"
+                   ),
+                ncol=4,
+                byrow=TRUE
+               )
+
+opt <- getopt( spec=spec )
+if ( is.null( opt$data.fname ) ) {
+  stop( "no data file specified (use -d <data.file>)" )
+}
+
+## tab-delimited matrix; first column holds the row names
+data <- as.matrix( read.delim( opt$data.fname, row.names=1, check.names=FALSE ) )
+if ( is.null( opt$output.fname ) ) { opt$output.fname <- paste( "impute.knn", basename( opt$data.fname ), sep="." ) }
+
+## Turn NaN and +/-Inf entries into NA so that they are imputed like any other
+## missing value.  The assignments are unconditional: is.nan()/is.infinite() return
+## a whole logical matrix, which cannot be used as the condition of an if().
+data[ is.nan( data ) ] <- NA
+data[ is.infinite( data ) ] <- NA
+
+data <- impute.knn( data )$data
+write.table( data, opt$output.fname, sep="\t", quote=FALSE, col.names=NA )
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/impute.knn.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/impute.knn.py	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+import os
+import sys
+import subprocess
+
+select_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "impute.knn.R")
+
+cmd_args = [ "Rscript", select_script_path ] + sys.argv[1:] 
+
+proc = subprocess.Popen( cmd_args, stderr=subprocess.PIPE, stdout=subprocess.PIPE )
+(stdoutdata, stderrdata) = proc.communicate()
+if proc.returncode:
+	sys.stderr.write(stderrdata)
+sys.stdout.write(stdoutdata)
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/impute.knn.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/impute.knn.xml	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,27 @@
+<tool id="impute_missing_values" name="Impute Missing Values" force_history_refresh="True">
+    <command interpreter="python">impute.knn.py
+-d $dataset
+-o ${output}
+
+</command>
+    <inputs>
+    	<param name="dataset" type="data" format='tabular' label="Matrix with Missing Values"/>
+    </inputs>
+    <outputs>
+        <data format="tabular" name="output" label="Matrix with Missing Values Imputed"/>
+    </outputs>
+<help>
+.. class:: infomark
+     
+**Impute Missing Values** - Tool to impute missing values (NAs) from a data matrix.
+
+**OUTPUT:**  A new matrix without missing values
+
+----
+
+**Parameters**
+
+- **Matrix with Missing Values** Input matrix that potentially contains missing values
+
+</help>
+</tool>
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/ipl.feature.selection.R
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/ipl.feature.selection.R	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,155 @@
+#!/usr/bin/env Rscript
+## IPL selection script by Peter Waltman
+## August 21, 2011
+## License under Creative Commons Attribution 3.0 Unported (CC BY 3.0)
+##
+#usage, options and doc goes here
+## Usage text; written to stderr when the script is called with --help as its only argument
+argspec <- "ipl.feature.selection.R takes a set of results from Paradigm, and filters for features that are
+active, inactive or modulated above a given IPL threshold over a sufficient percentage of samples.
+
+        Usage: 
+                ipl.feature.selection.R -d <data.file> 
+        Optional:
+                -o <output.name>
+                -g <genes-only>   ## to set if only returning genes (default is all features)
+                -f <filter.type>       ## filter.type must be either 'modulated', 'active'or 'inactive' (default is modulated)
+                -t <threshold>    ## the threshold to use for the filter (default is 0.25)
+                -p <perc.pass>    ## the percentage of samples that must pass the filter (default is 0.33)
+                -v <verbose>      ## to set verbose on
+
+                \n\n"
+args <- commandArgs( trailingOnly=TRUE )
+if ( identical( args, "--help" ) ) {
+  write( argspec, file=stderr() )
+  q()
+}
+
+## Attach a package, given as a bare (unquoted) name, while suppressing its
+## startup messages.
+lib.load.quiet <- function( package ) {
+  pkg.name <- as.character( substitute( package ) )
+  suppressPackageStartupMessages( library( pkg.name, character.only=TRUE ) )
+}
+lib.load.quiet(getopt)
+
+## Option table for getopt; each row is: long name, short flag, argument mode, type
+spec <- matrix( c( "data.fname",         "d", 1, "character",
+                   "output.name",        "o", 2, "character",
+                   "genes.only",         "g", 0, "logical",
+                   "filter.type",        "f", 2, "character", ## must be either 'active', 'inactive' or 'modulated'
+                   "threshold",          "t", 2, "numeric",
+                   "empirical.fname",    "e", 2, "character",
+                   "perc.pass",          "p", 2, "numeric",
+                   "verbose",            "v", 0, "logical",    ## to set verbose on
+                   "help",               "h", 0, "logical"
+                   ),
+                ncol=4,
+                byrow=TRUE
+               )
+
+opt <- getopt( spec=spec )
+
+## the help flag is declared in spec, so honour it: print the usage text and quit
+if ( ! is.null( opt$help ) ) {
+  write( argspec, stderr() )
+  q()
+}
+if ( is.null( opt$data.fname ) ) {
+  stop( "no data file specified (use -d <data.file>)" )
+}
+
+#set some reasonable defaults for the options that are needed,
+#but were not specified.
+if ( is.null( opt$verbose ) ) { opt$verbose <- FALSE }
+if ( is.null( opt$genes.only ) ) { opt$genes.only <- FALSE }
+if ( is.null( opt$filter.type ) ) { opt$filter.type <- 'modulated' }
+if ( is.null( opt$threshold ) ) { opt$threshold <- 0.25 }
+if ( is.null( opt$perc.pass ) ) { opt$perc.pass <- 1/3 }
+
+## now set filter.type, threshold & perc.pass if an empirical result has been passed in
+if ( ! is.null( opt$empirical.fname ) ) {
+
+  if ( ! file.exists( opt$empirical.fname ) ) stop( "can't find empirical result file: ", opt$empirical.fname, "\n" )
+  ## assume this is an RData file.  It is loaded into its own environment so that
+  ## objects stored in it cannot overwrite this script's variables (e.g. 'opt').
+  emp.env <- new.env()
+  emp.fname.contents <- load( opt$empirical.fname, envir=emp.env )
+  if ( ! "opt.thresh" %in% emp.fname.contents ) stop( "no optimal threshold value found in RData file passed in\n" )
+  opt$threshold <- get( "opt.thresh", envir=emp.env )
+
+  if ( ! "filter.type" %in% emp.fname.contents ) stop( "no filter type value found in RData file passed in\n" )
+  opt$filter.type <- get( "filter.type", envir=emp.env )
+
+  if ( ! "perc.pass" %in% emp.fname.contents ) stop( "no percentage passing value found in RData file passed in\n" )
+  opt$perc.pass <- get( "perc.pass", envir=emp.env )
+}
+
+## validate the final values (after any override from the empirical file)
+if ( opt$perc.pass < 0  ) {
+  stop( "please specify a positive number for the percentage of samples that pass the filter (if applicable)" )
+}
+if ( ! opt$filter.type %in% c( 'active', 'inactive', 'modulated' ) ) stop( 'invalid filter.type specified:', opt$filter.type, "\n" )
+
+## default output: "<filter.type>.<input base name>" in the current working directory
+if ( is.null( opt$output.name ) ) {
+  opt$output.name <- file.path( getwd(),
+                                paste( opt$filter.type, basename( opt$data.fname ), sep="." ) )
+}
+
+
+
+## Read the tab-delimited matrix (first column = feature names, header = sample names)
+data <- as.matrix( read.delim( opt$data.fname, header=TRUE, row.names=1, check.names=FALSE ) )
+if ( opt$genes.only ) {
+  ## drop every feature whose name contains "abstract", "complex" or "family"
+  genes <- rownames( data )
+  genes <- genes[ ! grepl( "abstract|complex|family", genes ) ]
+  ## drop=FALSE keeps a matrix even when a single feature remains
+  data <- data[ genes, , drop=FALSE ]
+}
+
+
+## Count, for every row of 'data', how many entries satisfy a threshold comparison.
+##
+## Arguments:
+##   data       - numeric matrix; one count is returned per row
+##   threshold  - numeric cut-off
+##   comparator - one of:
+##                  "lt"    x <  threshold      "lte"   x <= threshold
+##                  "gt"    x >  threshold      "gte"   x >= threshold
+##                  "both"  abs(x) >  threshold "bothe" abs(x) >= threshold
+##
+## Returns an integer vector (named by the row names of 'data') of per-row counts;
+## NA entries are never counted.  An unrecognised comparator is an error (it used to
+## fall through and return NULL silently).
+count.samps.threshold <- function( data,
+                                   threshold,
+                                   comparator
+                                    ) {
+  passes <- switch( comparator,
+                    lt    = data <  threshold,
+                    lte   = data <= threshold,
+                    gt    = data >  threshold,
+                    gte   = data >= threshold,
+                    both  = abs( data ) >  threshold,
+                    bothe = abs( data ) >= threshold,
+                    stop( "invalid comparator specified: ", comparator ) )
+
+  counts <- rowSums( passes, na.rm=TRUE )
+  ## rowSums() yields doubles; store as integer (names are kept) to match sum() on logicals
+  storage.mode( counts ) <- "integer"
+  counts
+}
+
+
+
+
+## Per-feature count of samples passing the selected filter:
+##   active    - value above the threshold (an implementation of the activity filter
+##               that was used in the original PARADIGM paper)
+##   inactive  - value below the negated threshold
+##   modulated - absolute value above the threshold
+filter.vect <- switch( opt$filter.type,
+                       active    = count.samps.threshold( data, opt$threshold, "gt" ),
+                       inactive  = count.samps.threshold( data, -opt$threshold, "lt" ),
+                       modulated = count.samps.threshold( data, opt$threshold, "both" ),
+                       stop( "invalid filter.type specified: ", opt$filter.type ) )
+
+## perc.pass below 1 is a fraction of the samples (the count must exceed
+## floor( n.samples * fraction )); 1 or more is a minimum number of samples
+if ( opt$perc.pass < 1 ) {
+  filter.vect <- filter.vect > floor( ncol( data ) * opt$perc.pass )
+} else {
+  filter.vect <- filter.vect >= opt$perc.pass
+}
+## drop=FALSE: keep a matrix (with its row/column names) even if only one feature passes
+data <- data[ filter.vect, , drop=FALSE ]
+
+write.table( data, opt$output.name, sep="\t", row.names=TRUE, col.names=NA, quote=FALSE )
+
+
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/ipl.feature.selection.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/ipl.feature.selection.py	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+import os
+import sys
+import subprocess
+
+select_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "ipl.feature.selection.R")
+
+cmd_args = [ "Rscript", select_script_path ] + sys.argv[1:] 
+
+proc = subprocess.Popen( cmd_args, stderr=subprocess.PIPE, stdout=subprocess.PIPE )
+(stdoutdata, stderrdata) = proc.communicate()
+if proc.returncode:
+	sys.stderr.write(stderrdata)
+sys.stdout.write(stdoutdata)
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/ipl.feature.selection.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/ipl.feature.selection.xml	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,83 @@
+<tool id="ipl_feature_selection" name="IPL Feature Selection" force_history_refresh="True">
+    <command interpreter="python">ipl.feature.selection.py
+-d $dataset 
+${genes_only}
+-o ${output}
+
+#if str($thresh_format.format) == 'manual':
+-f ${thresh_format.filter} 
+-t ${thresh_format.threshold} 
+-p ${thresh_format.perc_pass} 
+#end if
+#if str($thresh_format.format) == 'empirical':
+-e ${empirical_fname} 
+#end if
+
+</command>
+    <inputs>
+    	<param name="dataset" type="data" format='tabular' label="Data Set"/>
+    	<param name="genes_only" type="boolean" label="Genes Only (check to set yes)" truevalue="-g" falsevalue="" checked="False" />
+    	
+	<conditional name="thresh_format" >
+	  <param name="format" type='select' label="Input Format for Threshold Used" >
+	    <option value="empirical" selected='true'>Output from the Determine Threshold Tool</option>
+	    <option value="manual">Manually Specify the Threshold</option>
+	  </param>
+	  <when value="manual" >
+	    <!-- Filter type passed to the script as -f; Modulated is the default choice -->
+	    <param name="filter" type="select" label="Activity Filter" >
+	      <option value="modulated" selected='true' >Modulated</option>
+	      <option value="active">Active</option>
+	      <option value="inactive">Inactive</option>
+	    </param>
+	    <param name="perc_pass" type="float" label="% of Samples Passing (value in 0-1 range; >= 1 to indicate exact number of samples)" value="0.33"/>
+	    <param name="threshold" type="float" label="Activity Threshold" value="0.25"/>
+	  </when>
+	  <when value="empirical" >
+	    <param name="empirical_fname" type="data" format='rdata' label="Activity Threshold File" />
+	  </when>
+	</conditional>
+    </inputs>
+    <outputs>
+        <data format="tabular" name="output" label="Filtered IPLs"/>
+    </outputs>
+<help>
+.. class:: infomark
+     
+**IPL Feature Selection** - Tool to filter an IPL matrix to contain only those features that exceed a given threshold for a specified percentage of samples
+
+**OUTPUT:**  A new matrix containing only the features that pass the user-specified filter
+
+----
+
+**Parameters**
+
+- **Genes Only** Check to limit the new matrix to only gene features
+
+- **Input Format for Threshold Used:** - Specify the format to specify the Threshold.  Choice of:
+         * Output from the 'Determine Threshold Tool'
+         * Manually Specify the Threshold
+
+- **IF Output from the 'Determine Threshold Tool' IS SELECTED:**
+         * **Activity Threshold File** Specify the result file from the 'Determine IPL Threshold for Consensus Clustering' tool
+
+- **IF Manually Specify the Threshold IS SELECTED:**
+         * **Activity Filter** Specify the filter type to use.  Choice of:
+
+                 * Activity -  Features must exceed the user-specified threshold
+                 * Inactivity -  Features must fall below the user-specified threshold
+                 * Modulated - Absolute value of the features must exceed the specified threshold
+
+         * **Percentage of Samples Passing** Percent of samples with an IPL that passes the threshold. Choice of:
+
+                 * Real Value in [0,1] - indicate the percentage of samples that pass the threshold
+                 * Integer Value       - indicate the exact number of samples that pass the threshold
+
+         * **Selection Criteria** Specify the test statistic to use to select the threshold.  Choice of:
+
+                 * Binomial P-value - Select the threshold with the largest -log p-value (calculated as a binomial)
+                 * Chi-Squared P-value - Select the threshold with the largest -log p-value (calculated as a Chi-squared)
+                 * Overall Max Number of Differences - Select the threshold with the largest overall number of differences between the real and null distributions
+
+
+</help>
+</tool>
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/new.ccplus.R
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/new.ccplus.R	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,773 @@
+##!/usr/bin/env Rscript
+## Consensus Clustering Script by Peter Waltman
+## May 31, 2011
+## License under Creative Commons Attribution 3.0 Unported (CC BY 3.0)
+##
+#usage, options and doc goes here
+argspec <- c("consensus.clustering.R runs consensus clustering (code adapted from ConsensusClusterPlus) on a
+tab-delimited data matrix, and saves the matrix together with the clustering results
+
+        Usage: 
+                consensus.clustering.R -d <data.file> 
+        Optional:
+                -n <direction>       ## 'rows' or 'cols' (default), i.e. what gets clustered
+                -o <output.name>
+                -a <cluster.alg>     ## must be one of 'hc', 'km' or 'pam'
+                -m <distance.metric> ## e.g. euclidean, cosine, pearson, spearman
+                -k <max.k>
+                -r <reps>
+                -i <innerLinkage>  -f <finalLinkage>  -p <out.report.dir>  -h <out.report.html>
+                \n\n")
+args <- commandArgs(TRUE)  ## only the trailing (user-supplied) command-line arguments
+if ( length( args ) == 1 && args =="--help") { 
+  write(argspec, stderr())  ## print the usage text and leave before any option parsing happens
+  q();
+}
+
+require(getopt)
+##require(ConsensusClusterPlus)
+##  if any of the faster clustering methods are available on this system, load them
+require( amap )
+require( cluster )
+if ( any( c( 'flashClust', 'fastcluster' ) %in% installed.packages() ) ) {
+  if ( 'flashClust' %in% installed.packages() ) {
+    require( flashClust )
+  } else {
+    if ( 'fastcluster' %in% installed.packages() ) {
+      require( fastcluster )
+    }
+  }
+}
+
+###################
+## code borrowed/updated from ConsensusClusterPlus
+###################
+
+ConsensusClusterPlus <- function( d=NULL,
+                                  maxK = 3,
+                                  reps=10,
+                                  pItem=0.8,
+                                  pFeature=1,
+                                  clusterAlg="hc",
+                                  title="untitled_consensus_cluster",
+                                  innerLinkage="average",
+                                  finalLinkage="average",
+                                  distance=if ( inherits(d,"dist") ) attr( d, "method" ) else "euclidean",
+                                  ml=NULL,
+                                  tmyPal=NULL,
+                                  seed=NULL,
+                                  plot=NULL,
+                                  writeTable=FALSE,
+                                  weightsItem=NULL,
+                                  weightsFeature=NULL,
+                                  verbose=FALSE ) {
+  ## Runs consensus clustering for k = 2..maxK: resamples via ccRun() (unless a list of consensus matrices is
+  ## supplied as `ml`), clusters every consensus matrix with hclust/cutree and draws the heatmaps, CDF and tracking
+  ## plot.  Returns a list: element 1 is the colour matrix, element k holds the matrix, tree and classes for that k
+  if ( is.null( seed ) ) {
+    seed <- as.numeric( Sys.time() )
+  }
+  set.seed(seed)
+
+  if ( is.null( ml ) ) {
+
+    if ( inherits( distance, "dist" ) ) {
+      stop( "If you want to pass in a pre-calculated distance object, pass it in as the data, rather than the distance parameter\n" )
+    }
+    ## inherits() copes with multi-class objects ( class( <matrix> ) is c( "matrix", "array" ) in R >= 4.0 )
+    if ( ! inherits( d, c( "dist", "matrix", "ExpressionSet" ) ) ) {
+      stop("d must be a matrix, distance object or ExpressionSet (eset object)")
+    }
+
+    if ( inherits( d, "dist" ) ) {
+      ## if d is a distance matrix, fix a few things so that they don't cause problems with the analysis
+      ##  Note, assumption is that if d is a distance matrix, the user doesn't want to sample over the row features
+      if ( is.null( attr( d, "method" ) ) ) {
+        attr( d, "method" ) <- distance <- "unknown - user-specified"
+      }
+      if ( is.null( distance ) || ( distance != attr( d, "method" ) ) ) {
+        distance <- attr( d, "method" )
+      }
+
+      if ( ( ! is.null( pFeature ) ) && ( pFeature < 1 ) ) {
+        if ( verbose ) warning( "Cannot use the pFeatures parameter when specifying a distance matrix as the data object\n" )
+        pFeature <- 1
+      }
+      if ( ! is.null( weightsFeature ) ) {
+        if ( verbose ) warning( "Cannot use the weightsFeature parameter when specifying a distance matrix as the data object\n" )
+        weightsFeature <- NULL
+      }
+      if ( clusterAlg == "km" ) {
+        if ( verbose ) warning( "You are asking CCPLUS to use K-means to cluster a distance matrix (rather than the data itself) - this may produce unintended results. We suggest using PAM if you want to use alternate distance metrics/objects\n" )
+        ##d <- as.matrix( d )  #this is now done w/in ccRun
+      }
+    } else {
+      if ( is.null( distance ) ) {
+        ## we should never get here, but just in case
+        distance <- "pearson"
+      }
+    }
+
+    if ( ( clusterAlg == "km" ) && inherits( distance, "character" ) && ( distance != "euclidean" ) ) {
+      warning( "WARNING: kmeans can only use the euclidean distance metric.  If you would like to use an alternate metric, we suggest using PAM or HC clustering instead. This parameter combinationwill use k-means, but will NOT use the specified distance metric\n" )
+      distance <- 'euclidean'
+    }
+
+
+    if ( inherits( d,"ExpressionSet" ) ) {
+      d <- exprs(d)
+    }
+
+    ml <- ccRun( d=d,
+                 maxK=maxK,
+                 repCount=reps,
+                 diss=inherits(d,"dist"),
+                 pItem=pItem,
+                 pFeature=pFeature,
+                 innerLinkage=innerLinkage,
+                 clusterAlg=clusterAlg,
+                 weightsFeature=weightsFeature,
+                 weightsItem=weightsItem,
+                 distance=distance,
+                 verbose=verbose)
+  }
+  res=list();
+
+  ##make results directory
+  if ( ( ! is.null( plot ) || writeTable ) && ! file.exists( paste(title,sep="") ) ) {
+    dir.create(paste(title,sep=""))
+  }
+
+  ##write log file
+  log <- matrix( ncol=2,
+                 byrow=TRUE,
+                 c("title",title,
+                   "maxK",maxK,
+                   "input matrix rows",ifelse ( inherits( d, "matrix" ), nrow(d), "dist-mat" ), 
+                   "input matric columns",ifelse ( inherits( d, "matrix" ), ncol(d), ncol( as.matrix(d) ) ), 
+                   "number of bootstraps",reps,
+                   "item subsampling proportion",pItem,
+                   "feature subsampling proportion",ifelse( is.null(pFeature), 1, pFeature ),
+                   "cluster algorithm",clusterAlg,
+                   "inner linkage type",innerLinkage,
+                   "final linkage type",finalLinkage,
+                   "correlation method",distance,
+                   "plot",if(is.null(plot)) NA else plot,
+                   "seed",if(is.null(seed)) NA else seed))
+  colnames(log) = c("option","value")
+  if(writeTable){
+    write.csv(file=paste(title,"/",title,".log.csv",sep=""), log,row.names=FALSE)
+  }
+  if(is.null(plot)){
+    ##nothing: plots go to the current graphics device
+  }else if(plot=="png"){
+    png(paste(title,"/","consensus%03d.png",sep=""))
+  }else if (plot=="pdf"){
+    pdf(onefile=TRUE, paste(title,"/","consensus.pdf",sep=""))
+  }else if (plot=="ps"){
+    postscript(onefile=TRUE, paste(title,"/","consensus.ps",sep=""))
+  }
+
+  colorList=list()
+  colorM = rbind() #matrix of colors.
+
+                                        #25 colors for marking different clusters
+  thisPal <- c("#A6CEE3","#1F78B4","#B2DF8A","#33A02C","#FB9A99","#E31A1C","#FDBF6F","#FF7F00","#CAB2D6","#6A3D9A","#FFFF99","#B15928",
+               "#bd18ea", #magenta
+               "#2ef4ca", #aqua
+               "#f4cced", #pink,
+               "#f4cc03", #lightorange
+               "#05188a", #navy,
+               "#e5a25a", #light brown
+               "#06f106", #bright green
+               "#85848f", #med gray
+               "#000000", #black
+               "#076f25", #dark green
+               "#93cd7f",#lime green
+               "#4d0776", #dark purple
+               "#ffffff" #white
+               )
+
+  ##plot scale
+  colBreaks=NA
+  if ( is.null( tmyPal ) ) {
+    colBreaks=10
+    tmyPal = myPal(colBreaks)
+  }else{
+    colBreaks=length(tmyPal)
+  }
+  sc = cbind(seq(0,1,by=1/( colBreaks) )); rownames(sc) = sc[,1]
+  sc = cbind(sc,sc)
+  heatmap(sc, Colv=NA, Rowv=NA, symm=FALSE, scale='none', col=tmyPal, na.rm=TRUE,labRow=rownames(sc),labCol=FALSE,main="consensus matrix legend")
+
+  for (tk in 2:maxK){
+    if(verbose){
+      message(paste("consensus ",tk))
+    }
+    fm = ml[[tk]]
+    hc=hclust( as.dist( 1 - fm ), method=finalLinkage);
+    message("clustered")
+    ct = cutree(hc,tk)
+    names(ct) = if ( inherits( d, "dist" ) ) labels( d ) else colnames(d)  ## a dist object has no colnames
+    c = fm
+    ## res[[tk-1]] does not exist yet when tk == 2; that is only safe because setClusterColors never touches
+    ## its first argument while colorList is still empty (R evaluates arguments lazily)
+
+    colorList = setClusterColors(res[[tk-1]][[3]],ct,thisPal,colorList)
+
+    pc = c
+    pc=pc[hc$order,] #***pc is matrix for plotting, same as c but is row-ordered and has names and extra row of zeros.
+    pc = rbind(pc,0)
+
+    heatmap(pc, Colv=as.dendrogram(hc), Rowv=NA, symm=FALSE, scale='none', col=tmyPal, na.rm=TRUE,labRow=FALSE,labCol=FALSE,margins=c(5,5),main=paste("consensus matrix k=",tk,sep="") , ColSideColors=colorList[[1]])
+    legend("topright",legend=unique(ct),fill=unique(colorList[[1]]),horiz=FALSE )
+
+    res[[tk]] = list(consensusMatrix=c,consensusTree=hc,consensusClass=ct,ml=ml[[tk]],clrs=colorList)
+    colorM = rbind(colorM,colorList[[1]]) 
+  }
+  CDF(ml)
+  clusterTrackingPlot(colorM[,res[[length(res)]]$consensusTree$order,drop=FALSE])  ## keep a matrix even when maxK == 2
+  if ( ! is.null( plot ) ) {
+    dev.off();
+  }
+  res[[1]] = colorM
+  if(writeTable){
+    for(i in 2:length(res)){
+      write.csv(file=paste(title,"/",title,".k=",i,".consensusMatrix.csv",sep=""), res[[i]]$consensusMatrix)
+      write.table(file=paste(title,"/",title,".k=",i,".consensusClass.csv",sep=""), res[[i]]$consensusClass,col.names = FALSE,sep=",")
+    }
+  }
+  return(res)
+}
+
+
+calcICL = function(res,title="untitled_consensus_cluster",plot=NULL,writeTable=FALSE){
+  #calculates and plots cluster consensus and item consensus from a ConsensusClusterPlus result list (res)
+  cc=rbind()        #rows of (k, cluster, cluster consensus)
+  cci = rbind()     #rows of (k, cluster, item index, item consensus)
+  sumRes=list()
+  colorsArr=c()
+
+  #make results directory
+  if ( ( ! is.null( plot ) || writeTable ) && ! file.exists( paste(title,sep="") ) ) {
+    dir.create(paste(title,sep=""))
+  }
+  if(is.null(plot)){
+    #to screen
+  }else if(plot=="pdf"){
+    pdf(onefile=TRUE, paste(title,"/","icl.pdf",sep=""))
+  }else if(plot=="ps"){
+    postscript(onefile=TRUE, paste(title,"/","icl.ps",sep=""))
+  }else if (plot=="png"){
+    png(paste(title,"/","icl%03d.png",sep=""))
+  }
+
+  par(mfrow=c(3,1),mar=c(4,3,2,0))
+
+  for (k in 2:length(res)){ #each k
+    #triangle(mode=2) keeps each pair once (strict lower triangle, rest NA) so that no pair is counted twice
+    o = res[[k]]
+    m = o$consensusMatrix
+    m = triangle(m,mode=2)
+    for (ci in sort(unique(o$consensusClass))){ #each cluster in k
+      items = which(o$consensusClass==ci)
+      nk = length(items)
+      mk = sum( m[items,items], na.rm=TRUE)/((nk*(nk-1))/2)
+      cc=rbind(cc,c(k,ci,mk)) #cluster-consensus
+
+      for (ei in rev(res[[2]]$consensusTree$order) ){
+        denom = if (ei %in% items) { nk - 1} else { nk }
+        mei = sum( c(m[ei,items],m[items,ei]), na.rm=TRUE)/denom  # mean item consensus to a cluster.
+        cci = rbind(cci,c(k,ci,ei,mei)) #cluster, cluster index, item index, item-consensus
+      }
+
+    }
+
+    cck = cci[which(cci[,1]==k),] #only plot the new k data.
+
+    #group by item, order by cluster i
+    w=lapply(split(cck,cck[,3]), function(x) { y=matrix(unlist(x),ncol=4); y[order(y[,2]),4] }) 
+    q = matrix(as.numeric(unlist(w)),ncol=length(w),byrow=FALSE)
+    q = q[,res[[2]]$consensusTree$order] #order by leave order of k=2
+    #q is a matrix of k rows and sample columns, values are item consensus of sample to the cluster.
+
+    thisColors = unique(cbind(res[[k]]$consensusClass,res[[k]]$clrs[[1]]))
+    thisColors=thisColors[order(as.numeric(thisColors[,1])),2]
+    colorsArr=c(colorsArr,thisColors)
+    sumRes[[k]] = rankedBarPlot(q,thisColors,cc=res[[k]]$consensusClass[res[[2]]$consensusTree$order],paste("k=",k,sep="") )
+  }
+
+  ys=cs=lab=c()
+  lastk=cc[1,1]
+  for(i in seq_along(colorsArr)){
+    if(lastk != cc[i,1]){
+      ys=c(ys,0,0)     #two empty bars separate the groups of successive k
+      cs=c(cs,NA,NA)
+      lastk=cc[i,1]
+      lab=c(lab,NA,NA)
+    }
+    ys=c(ys,cc[i,3])
+    cs=c(cs,colorsArr[i])
+    lab=c(lab,cc[i,1])
+  }
+  names(ys) = lab
+  par(mfrow=c(3,1),mar=c(4,3,2,0))
+  barplot(ys,col=cs,border=cs,main="cluster-consensus",ylim=c(0,1),las=1)
+  if ( ! is.null( plot ) ) {
+    dev.off()
+  }
+  colnames(cc) = c("k","cluster","clusterConsensus")
+  colnames(cci) = c("k","cluster","item","itemConsensus")
+  cci[,"item"] = names(res[[2]]$consensusClass)[ cci[,"item"] ]
+  #type cci (the assignment above turned the whole matrix into character)
+  cci = data.frame( k=as.numeric(cci[,"k"]), cluster=as.numeric(cci[,"cluster"]), item=cci[,"item"], itemConsensus=as.numeric(cci[,"itemConsensus"])) 
+
+  #write to file.
+  if(writeTable){
+    write.csv(file=paste(title,"/",title,".summary.cluster.consensus.csv",sep=""),row.names=FALSE, cc)
+    write.csv(file=paste(title,"/",title,".summary.item.consensus.csv",sep=""), row.names=FALSE, cci)  #was cc: wrong table
+  }
+  return(list(clusterConsensus=cc,itemConsensus=cci))
+}
+
+
+ccRun <- function( d=d,
+                   maxK=NULL,
+                   repCount=NULL,
+                   diss=inherits( d, "dist" ),
+                   pItem=NULL,
+                   pFeature=NULL,
+                   innerLinkage=NULL,
+                   distance=if ( inherits(d,"dist") ) attr( d, "method" ) else "euclidean",
+                   clusterAlg=NULL,
+                   weightsItem=NULL,
+                   weightsFeature=NULL,
+                   verbose=NULL) {
+  ## returns a list whose k-th element (k = 2..maxK) is the consensus matrix: co-clustering tally / co-sampling tally
+  ml = vector(mode="list",maxK)
+  n <- ifelse( diss, ncol( as.matrix(d) ), ncol(d) )
+  mCount = mConsist = matrix(c(0),ncol=n,nrow=n)
+  ml[[1]] = c(0);
+
+  if (is.null( distance ) ) distance <- 'euclidean'  ## necessary if d is a dist object and attr( d, "method" ) == NULL
+
+  require( amap )
+  ##  we're going to use the amap Dist function, but they misname their correlation
+  ##  functions, so re-name them correctly
+  amap.distance <- c( "euclidean", "maximum", "manhattan", "canberra", "binary",
+                      "pearson", "abspearson", "correlation", "abscorrelation", "spearman", "kendall" )
+  names( amap.distance ) <- c( "euclidean", "maximum", "manhattan", "canberra", "binary",
+                               "cosine", "abscosine", "pearson", "abspearson", "spearman", "kendall" )
+  main.dist.obj <- NULL
+  if ( is.null( verbose ) ) verbose <- FALSE  ## the default is NULL, which would make every 'if (verbose)' below fail
+  if ( diss ){
+    main.dist.obj <- d
+
+    ## reset the pFeature & weightsFeature params if they've been set (irrelevant if d is a dist matrix)
+    if ( ( !is.null(pFeature) ) &&
+         ( pFeature < 1 ) ) {
+      if (verbose) warning( "user-supplied data is a distance matrix; ignoring user-specified pFeature parameter\n" )
+      pFeature <- 1 # set it to 1 to avoid problems with sampleCols
+    }
+    if ( ! is.null( weightsFeature ) ) {
+      if (verbose) warning( "user-supplied data is a distance matrix; ignoring user-specified weightsFeature parameter\n" )
+      weightsFeature <- NULL  # set it to NULL to avoid problems with sampleCols
+    }
+  } else { ## d is a data matrix
+    ## we're not sampling over the features
+    if ( ( clusterAlg != "km" ) &&
+         ( is.null( pFeature ) ||
+           ( ( pFeature == 1 ) && is.null( weightsFeature ) ) ) ) {
+      ## only generate a main.dist.object IFF 1) d is a matrix, 2) we're not sampling the features, and 3) the algorithm isn't 'km'
+      if ( inherits( distance, "character" ) ) {
+        if ( ! distance %in% names( amap.distance ) ) stop("unsupported distance.")
+
+        main.dist.obj <- Dist( t(d), method=as.character( amap.distance[ distance ] ) )
+        ## now fix dumb amap naming convention for distance metrics
+        attr( main.dist.obj, "method" ) <- as.character( amap.distance[ distance ] )
+      } else stop("unsupported distance specified.")
+    } else {
+      ## pFeature < 1 or a weightsFeature != NULL
+      ## since d is a data matrix, the user wants to sample over the gene features, so main.dist.obj is left as NULL
+    }
+  }
+
+
+  for (i in seq_len( repCount )){
+    ##browser()  
+    if(verbose){
+      message(paste("random subsample",i));
+    }
+    ## take expression matrix sample, samples and genes
+    sample_x = sampleCols( d, pItem, pFeature, weightsItem, weightsFeature )
+
+    this_dist = NA
+    if ( ! is.null( main.dist.obj ) ) {
+      boot.cols <- sample_x$subcols
+      this_dist <- as.matrix( main.dist.obj )[ boot.cols, boot.cols ]
+      if ( clusterAlg != "km" ) {
+        ## if this isn't kmeans, then convert to a distance object
+        this_dist <- as.dist( this_dist )
+        attr( this_dist, "method" ) <- attr( main.dist.obj, "method" )
+      }
+    } else {
+      ## if main.dist.obj is NULL, then d is a data matrix, and either:
+      ##   1) clusterAlg is 'km'
+      ##   2) pFeatures < 1 or weightsFeatures have been specified, or
+      ##   3) both
+      ## so we can't use a main distance object and for every iteration, we will have to re-calculate either
+      ##   1) the distance matrix (because we're also sampling the features as well), or
+      ##   2) the submat (if using km) 
+
+      if ( clusterAlg != "km" )  {
+        if ( ! distance %in% names( amap.distance ) ) stop("unsupported distance.")
+        ## good, we have a supported distance type
+        this_dist <- Dist( t( sample_x$submat ), method=as.character( amap.distance[ distance ] ) )
+        ## now fix dumb amap naming convention for distance metrics
+        attr( this_dist, "method" ) <- as.character( amap.distance[ distance ] )
+      } else {
+        ##browser()
+        ##clusterAlg == "km" 
+        ## if we're not sampling the features, then grab the colslice
+        if ( is.null( pFeature ) ||
+            ( ( pFeature == 1 ) && is.null( weightsFeature ) ) ) {
+          this_dist <- d[, sample_x$subcols ]
+        } else {
+          if ( identical( sample_x$submat, NA ) ) {  ## NA is sampleCols' "no sub-matrix" marker; is.na() on a matrix is not a scalar test
+            save( list=ls(), file="ccrun.submat.eq.na.dbg.rda" )  ## dump the local state for debugging
+            stop( "Houston, we have a problem.  sample_x$submat is NA in ccRun when it should be specified - saving state\n" )
+          }
+
+          this_dist <- sample_x$submat
+        } 
+      }
+    }
+
+    ## cluster samples for HC.
+    this_cluster=NA
+    if(clusterAlg=="hc"){
+      this_cluster = hclust( this_dist, method=innerLinkage)
+    }
+    ##browser()
+    ##mCount is possible number of times that two sample occur in same random sample, independent of k
+    ##mCount stores number of times a sample pair was sampled together.
+    mCount <- connectivityMatrix( rep( 1,length(sample_x[[3]])),
+                                  mCount,
+                                  sample_x[[3]] ) 
+
+    ##use samples for each k		
+    for (k in 2:maxK){
+      if(verbose){
+        message(paste("  k =",k))
+      }
+      if (i==1){
+        ml[[k]] = mConsist #initialize
+      }
+      this_assignment=NA
+      if(clusterAlg=="hc"){
+        ##prune to k for hc
+        this_assignment = cutree(this_cluster,k)
+        ##browser()
+      }else if(clusterAlg=="km"){
+        ##this_dist should now be a matrix corresponding to the result from sampleCols
+        this_assignment <- kmeans( t( this_dist ),
+                                   k,
+                                   iter.max = 10,
+                                   nstart = 1,
+                                   algorithm = c("Hartigan-Wong") )$cluster
+      }else if ( clusterAlg == "pam" ) {
+        require( cluster )
+        this_assignment <- pam( x=this_dist,
+                                k,
+                                diss=TRUE,
+                                metric=distance, 
+                                cluster.only=TRUE )
+      } else{
+        ##optional cluterArg Hook.
+        this_assignment <- get(clusterAlg)(this_dist, k)
+      }
+      ##add to tally				
+      ml[[k]] <- connectivityMatrix( this_assignment,
+                                     ml[[k]],
+                                     sample_x[[3]] )
+    }
+  }
+
+
+  ##consensus fraction
+  res = vector(mode="list",maxK)
+  for (k in 2:maxK){
+    ##fill in other half of matrix for tally and count.
+    tmp = triangle(ml[[k]],mode=3)
+    tmpCount = triangle(mCount,mode=3)
+    res[[k]] = tmp / tmpCount
+    res[[k]][which(tmpCount==0)] = 0
+  }
+  message("end fraction")
+  return(res)
+}
+
+
+connectivityMatrix <- function( clusterAssignments, m, sampleKey){
+  ##input: vector of cluster assignments, the square tally matrix m to add connectivities to, and
+  ##       sampleKey - the row/column indices (into m) of the assigned items, in the same order
+  ##output: m, incremented by 1 for every pair of listed items that share a cluster (diagonal included)
+  names( clusterAssignments ) <- sampleKey 
+  cls <- lapply( unique( clusterAssignments ), function(i) as.numeric( names( clusterAssignments[ clusterAssignments %in% i ] ) ) )
+  nelts <- seq_len( ncol( m ) )
+  for ( i in seq_along( cls ) ) {  ## seq_along: an empty assignment vector leaves m untouched
+    cl <- as.numeric( nelts %in% cls[[i]] ) ## produces a binary vector
+    updt <- outer( cl, cl )
+    m <- m + updt
+  }
+  return(m)
+}
+
+## Draws one random subsample of the items (columns) of d and, if requested, of its features (rows).
+##  Returns list( submat, subrows, subcols ); submat & subrows stay NA when the features are not
+##  sampled (to reduce memory overhead).  pSamp / pRow are the fractions of columns / rows to keep
+sampleCols <- function( d,
+                        pSamp=NULL,
+                        pRow=NULL,
+                        weightsItem=NULL,
+                        weightsFeature=NULL ){
+  space <- if ( inherits( d, "dist" ) ) ncol( as.matrix(d) ) else ncol(d)
+  sampleN <- floor(space*pSamp)
+  sampCols <- sort( sample(space, sampleN, replace = FALSE, prob = weightsItem) )
+
+  this_sample <- sampRows <- NA
+  if ( inherits( d, "matrix" ) ) {
+    if ( (! is.null( pRow ) ) &&
+         ( (pRow < 1 ) || (! is.null( weightsFeature ) ) ) ) {
+      ## only sample the rows and generate a sub-matrix if we're sampling over the row/gene/features
+      space = nrow(d)
+      sampleN = floor(space*pRow)
+      sampRows = sort( sample(space, sampleN, replace = FALSE, prob = weightsFeature) )
+      this_sample <- d[sampRows,sampCols,drop=FALSE]  ## stay a matrix even for a single row/column
+      dimnames(this_sample) <- NULL
+    } else {
+      ## do nothing
+    }
+  }
+  return( list( submat=this_sample,
+                subrows=sampRows,
+                subcols=sampCols ) )
+}
+
+CDF=function(ml,breaks=100){
+  #plots the empirical CDF of every consensus matrix in ml (elements 2..length(ml)) and the change in area under it
+  plot(c(0),xlim=c(0,1),ylim=c(0,1),col="white",bg="white",xlab="consensus index",ylab="CDF",main="consensus CDF", las=2)
+  k=length(ml)
+  this_colors = rainbow(k-1)
+  areaK = c()
+  for (i in 2:length(ml)){
+    v=triangle(ml[[i]],mode=1)
+
+    #empirical CDF distribution. default number of breaks is 100    
+    h = hist(v, plot=FALSE, breaks=seq(0,1,by=1/breaks))
+    h$counts = cumsum(h$counts)/sum(h$counts)
+
+    #calculate area under CDF curve, by histogram method.
+    thisArea=0
+    for (bi in 1:(length(h$breaks)-1)){
+       thisArea = thisArea + h$counts[bi]*(h$breaks[bi+1]-h$breaks[bi]) #increment by height by width
+       #(no manual increment of bi: the for loop advances it)
+    }
+    areaK = c(areaK,thisArea)
+    lines(h$mids,h$counts,col=this_colors[i-1],lwd=2,type='l')
+  }
+  legend(0.8,0.5,legend=paste(rep("",k-1),seq(2,k,by=1),sep=""),fill=this_colors)
+
+  #plot area under CDF change.
+  deltaK=areaK[1] #initial auc at k=2
+  for(i in seq_len( length(areaK) - 1 ) + 1 ){ #2..length(areaK); empty when only k=2 was run
+    #proportional increase relative to prior K.
+    deltaK = c(deltaK,( areaK[i] - areaK[i-1])/areaK[i-1])
+  }
+  plot(1+(1:length(deltaK)),y=deltaK,xlab="k",ylab="relative change in area under CDF curve",main="Delta area",type="b")
+}
+
+
+myPal = function(n=10){
+  #returns n+1 colors running from white to blue (red & green drop from 255 to 0 in n equal steps)
+  ramp <- rev( seq( from=0, to=255, by=255/n ) )
+  rgb.mat <- cbind( ramp, ramp, 255 )
+  rgb( rgb.mat, maxColorValue=255 )
+}
+
+setClusterColors = function(past_ct,ct,colorU,colorList){
+  #description: gives every item a colour so that clusters keep a common colour between different K
+  assigned <- c()
+  if ( length( colorList ) == 0 ) {
+    #k==2: nothing to inherit from, index straight into the palette
+    assigned <- colorU[ct]
+    n.used <- 2
+  } else {
+    assigned <- rep( NULL, length( ct ) )
+    n.used <- colorList[[2]]
+    counts <- table( past_ct, ct )
+    frac <- counts / apply( counts, 1, sum )
+    for ( new.cl in 1:ncol( frac ) ) { # for each cluster
+      col.max <- max( frac[, new.cl] )
+      old.cl <- which( frac[, new.cl] == col.max )
+      if ( sum( frac[, new.cl] == col.max ) == 1 & max( frac[old.cl, ] ) == col.max & sum( frac[old.cl, ] == col.max ) == 1 ) {
+        #if new column maximum is unique, same cell is row maximum and is also unique
+        ##Note: the greatest of the prior clusters' members are the greatest in a current cluster's members.
+        assigned[ which( ct == new.cl ) ] <- unique( colorList[[1]][ which( past_ct == old.cl ) ] ) # one value
+      } else { #add new color.
+        n.used <- n.used + 1
+        assigned[ which( ct == new.cl ) ] <- colorU[n.used]
+      }
+    }
+  }
+  return( list( assigned, n.used, unique( assigned ) ) )
+}
+
+clusterTrackingPlot = function(m){
+  #description: draws the tracking plot - one horizontal band per row of m (first row on top), one cell per column
+  #input: m - matrix where rows are k, columns are samples, and values are the colours used to fill the cells
+  plot(NULL,xlim=c(-0.1,1),ylim=c(0,1),axes=FALSE,xlab="samples",ylab="k",main="tracking plot")
+  n.k <- nrow(m); n.samp <- ncol(m); left <- seq(0,1-1/n.samp,by=1/n.samp)
+  for ( band in 1:n.k ) {
+    rect( xleft=left, ybottom=rep(1-band/n.k,n.samp), xright=seq(1/n.samp,1,by=1/n.samp), ytop=rep(1-(band-1)/n.k,n.samp), col=m[band,], border=NA )
+  }
+  #hatch lines to indicate samples
+  segments( left, rep(-0.1,n.samp), left, rep(0,n.samp), col="black" )    #** alt white and black color?
+  band.mid <- seq(1,0,by=-1/n.k)-1/(2*n.k)
+  text(x=-0.1,y=band.mid[-length(band.mid)],labels=seq(2,n.k+1,by=1))
+}
+
+triangle = function(m,mode=1){
+  #mode==1 (CDF): vector of the strict lower-triangle entries of m
+  #mode==2 (calcICL): m's upper triangle mirrored below the diagonal; diagonal & upper triangle set to NA
+  #mode==3: full symmetric matrix mirrored from m's upper triangle, carrying m's own diagonal
+  #any other mode: falls through and returns NULL invisibly
+  size <- dim(m)[1]
+  upper <- matrix(0,ncol=size,nrow=size)
+  upper[upper.tri(upper)] <- m[upper.tri(m)] #only upper half
+
+  #mirror the upper half across the diagonal
+  full <- t(upper)+upper
+  diag(full) <- diag(m)
+
+  #blank out everything but the strict lower triangle
+  lower <- full
+  lower[upper.tri(lower)] <- NA
+  diag(lower) <- NA
+
+  if ( mode == 1 ) {
+    return( m[lower.tri(lower)] ) #vector
+  }
+  if ( mode == 3 ) {
+    return( full ) #return full matrix
+  }
+  if ( mode == 2 ) {
+    return( lower ) #returns lower triangle and no diagonal. no double counts.
+  }
+
+}
+
+
+rankedBarPlot=function(d,myc,cc,title){
+  #item-consensus bar plot: the values of every column of d are sorted ascending and overlaid as bars of
+  #their partial sums (ranks i..n), coloured via myc; cc selects the colour of each column's asterisk
+  rankOrder <- rbind() #each row is a barplot series
+  ranked <- cbind()
+  gap <- 0.1 #space between bars
+  for ( item in 1:ncol(d) ) {
+    ranked <- cbind( ranked, sort( d[,item], na.last=FALSE ) )
+    rankOrder <- rbind( rankOrder, order( d[,item], na.last=FALSE ) )
+  }
+  top <- max( c( 1.5, apply( ranked, 2, sum ) ), na.rm=TRUE ) #maximum height of graph
+  #barplot largest to smallest so that smallest is in front.
+  barplot( apply( ranked, 2, sum ), col=myc[ rankOrder[,1] ], space=gap, ylim=c(0,top), main=paste("item-consensus", title), border=NA, las=1 )
+  for ( lvl in 2:nrow(ranked) ) {
+    barplot( apply( matrix( ranked[ lvl:nrow(ranked), ], ncol=ncol(ranked) ), 2, sum ), space=gap, col=myc[ rankOrder[,lvl] ], ylim=c(0,top), add=TRUE, border=NA, las=1 )
+  }
+  n.items <- ncol(d); starts <- seq( gap, n.items+n.items*gap, (n.items+n.items*gap)/n.items )
+  #class labels as asterisks
+  text("*",x=starts+0.5,y=top,col=myc[cc],cex=1.4) #rect(starts,1.4,starts+1,1.5,col=myc[cc] )
+}
+
+
+
+###################################################################3333
+## RESTART MY SCRIPTS HERE
+##save.image( '/home/waltman/work.local/tmp/new.ccplus.R.dbg' )
+stop( "phw forced stop\n")  ## NOTE(review): debugging leftover - nothing below this line runs until it is removed
+spec <- matrix( c( "data.fname",         "d", 1, "character",
+                   "direction",          "n", 2, "character",
+                   "output.name",        "o", 2, "character",
+                   "cluster.alg",        "a", 2, "character", ## must be one of 'hc', 'km' or 'pam'
+                   "distance.metric",    "m", 2, "character", ## must be one supported by ConsensusClusterPlus
+                   "max.k",              "k", 2, "integer",
+                   "reps",               "r", 2, "integer",
+                   "innerLinkage",       "i", 1, "character",
+                   "finalLinkage",       "f", 1, "character",
+                   "out.report.dir",     "p", 2, "character",
+                   "out.report.html",    "h", 2, "character"
+                   ),
+                nc=4,
+                byrow=TRUE
+               )
+
+opt <- getopt( spec=spec )
+
+## default params for non-required params
+if ( is.null( opt$direction ) ) { opt$direction <- "cols"  }
+if ( is.null( opt$cluster.alg ) ) { opt$cluster.alg <- "pam" }
+if ( is.null( opt$output.name ) ) { opt$output.name <- "consensus.cluster.result" }
+if ( is.null( opt$distance.metric ) ) { opt$distance.metric <- "cosine" }
+if ( is.null( opt$max.k ) ) { opt$max.k <- 10 }
+if ( is.null( opt$reps ) ) { opt$reps <- 1000 }
+if ( is.null( opt$innerLinkage ) ) { opt$innerLinkage <- "average" }
+if ( is.null( opt$finalLinkage ) ) { opt$finalLinkage <- "average" }
+
+if ( is.null( opt$out.report.dir ) ) { opt$out.report.dir <- "report" }
+if ( is.null( opt$out.report.html ) ) { opt$out.report.html <- file.path( "report", "index.html" ) }
+
+## validate params here (make sure set to valid values)
+if ( !opt$cluster.alg %in% c( "hc", "km", "pam" ) ) {
+  stop( "invalid clustering algorithm specified: ", opt$cluster.alg )
+}
+
+
+data <- as.matrix( read.delim( opt$data.fname, header=T, row.names=1 , check.names=FALSE ) )
+## transpose the matrix if we want to cluster the rows (genes)
+if ( opt$direction == "rows" ) {
+  data <- t( data )
+}
+
+
+title <- paste( opt$cluster.alg, opt$output.name, sep="." )
+##source( '~/bin/galaxy-dist/tools/ucsc.cancer.tools/cluster.tools/new.ccplus.R' )
+results <- ConsensusClusterPlus( data,
+                                 maxK=opt$max.k,
+                                 reps=opt$reps,
+                                 pItem=0.8,
+                                 ##pFeature=NULL,
+                                 pFeature=0.5,
+                                 title=opt$out.report.dir,
+                                 clusterAlg=opt$cluster.alg,
+                                 distance=opt$distance.metric,
+                                 innerLinkage=opt$innerLinkage,
+                                 finalLinkage=opt$finalLinkage,
+                                 plot='pdf',  ## NOTE(review): the report below lists PNG files, but 'pdf' only writes consensus.pdf - confirm
+                                 writeTable=FALSE,
+                                 seed=100,
+                                 weightsFeature=abs( rnorm( nrow( data ) ) ),  ## one weight per feature (row) of the matrix being clustered
+                                 ##verbose=FALSE )
+                                 verbose=TRUE )
+
+pngs = list.files(path=opt$out.report.dir, pattern="png")
+## one <html> document holding one <div> per image (nothing in between when no image was found)
+html.out <- paste( c( "<html>", if ( length( pngs ) > 0 ) paste( "<div><img src=\'", pngs, "\'/></div>", sep="" ), "</html>" ),
+                   collapse="\n" )
+cat( html.out, file=opt$out.report.html )
+
+
+## re-transpose the matrix back if we've clustered the rows (genes)
+if ( opt$direction == "rows" ) {
+  data <- t( data )
+}
+save( file=opt$output.name, data, results)
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/normalize.matrix.R
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/normalize.matrix.R	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,88 @@
+#!/usr/bin/env Rscript
+##  Script by Peter Waltman
+## 
+## License under Creative Commons Attribution 3.0 Unported (CC BY 3.0)
+##
+argspec <- c("TBD
+
+
+        Usage: 
+                normalize.matrix.R -d <data.file> 
+        Optional:
+                -o <output.name>
+                TBD 
+                \n\n")
+## a lone --help argument prints the usage text on stderr and ends the run
+args <- commandArgs(TRUE)
+if ( identical( args, "--help" ) ) {
+  write(argspec, stderr())
+  q()
+}
+
+lib.load.quiet <- function( package ) {  ## attach `package` (given as a bare name) without its startup messages
+   pkg.name <- as.character( substitute( package ) )  ## the unquoted symbol, as a string
+   suppressPackageStartupMessages( library( pkg.name, character.only=TRUE ) )
+}
+lib.load.quiet(getopt)
+
+
+spec <- matrix( c( "data.fname",         "d", 1, "character",
+                   "center.rows",        "r", 2, "character",
+                   "var.adj.rows",       "R", 2, "character",
+                   "center.cols",        "c", 2, "character",
+                   "var.adj.cols",       "C", 2, "character",
+                   "output.fname",       "o", 2, "character"
+                   ),
+                ncol=4,
+                byrow=TRUE
+               )
+
+opt <- getopt( spec=spec )
+
+## build a default output name from the options given explicitly (this must run
+## before the 'none' defaults below are filled in)
+if ( is.null( opt$output.fname ) ) {
+  out.fname <- ""
+  if ( ! is.null( opt$center.rows ) ) out.fname <- paste( "row", opt$center.rows, "centered", out.fname, sep="." )
+  if ( ! is.null( opt$center.cols ) ) out.fname <- paste( "col", opt$center.cols, "centered", out.fname, sep="." )
+  if ( ! is.null( opt$var.adj.rows ) ) out.fname <- paste( "row", opt$var.adj.rows, "var.adjed", out.fname, sep="." )
+  if ( ! is.null( opt$var.adj.cols ) ) out.fname <- paste( "col", opt$var.adj.cols, "var.adjed", out.fname, sep="." )
+
+  opt$output.fname <- paste( out.fname, "centered.matrix", sep="." )
+}
+if ( is.null( opt$center.rows ) ) { opt$center.rows <- 'none' }
+if ( is.null( opt$center.cols ) ) { opt$center.cols <- 'none' }
+if ( is.null( opt$var.adj.rows ) ) { opt$var.adj.rows <- 'none' }
+if ( is.null( opt$var.adj.cols ) ) { opt$var.adj.cols <- 'none' }
+
+data <- as.matrix( read.delim( opt$data.fname, header=TRUE, row.names=1 , check.names=FALSE ) )
+
+## row pass -- scale() centers/scales columns, so it is applied to the transpose
+my.center <- rep( 0, nrow( data ) )
+my.var.adj <- rep( 1, nrow( data ) )
+if ( opt$center.rows != "none" ) {
+  my.center.fn <- match.fun( opt$center.rows )
+  my.center <- apply( data, 1, my.center.fn, na.rm=TRUE )
+}
+if ( opt$var.adj.rows != "none" ) {
+  my.var.adj.fn <- match.fun( opt$var.adj.rows )
+  my.var.adj <- apply( data, 1, my.var.adj.fn, na.rm=TRUE )
+}
+data <- t( scale( t( data ), center=my.center, scale=my.var.adj ) )
+
+## column pass -- the identity center/scale vectors need one entry per COLUMN; sizing
+## them by nrow() makes scale() reject any non-square matrix when centering is 'none'
+my.center <- rep( 0, ncol( data ) )
+my.var.adj <- rep( 1, ncol( data ) )
+if ( opt$center.cols != "none" ) {
+  my.center.fn <- match.fun( opt$center.cols )
+  my.center <- apply( data, 2, my.center.fn, na.rm=TRUE )
+}
+if ( opt$var.adj.cols != "none" ) {
+  my.var.adj.fn <- match.fun( opt$var.adj.cols )
+  my.var.adj <- apply( data, 2, my.var.adj.fn, na.rm=TRUE )
+}
+data <- scale( data, center=my.center, scale=my.var.adj )
+
+write.table( data, opt$output.fname, sep="\t", quote=FALSE, col.names=NA )
+
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/normalize.matrix.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/normalize.matrix.py	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+import os
+import sys
+import subprocess
+
+select_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "normalize.matrix.R")
+
+cmd_args = [ "Rscript", select_script_path ] + sys.argv[1:] 
+
+proc = subprocess.Popen( cmd_args, stderr=subprocess.PIPE, stdout=subprocess.PIPE )
+(stdoutdata, stderrdata) = proc.communicate()
+if proc.returncode:
+	sys.stderr.write(stderrdata)
+sys.stdout.write(stdoutdata)
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/normalize.matrix.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/normalize.matrix.xml	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,74 @@
+<tool id="normalize_matrix" name="Normalize Matrix" force_history_refresh="True">
+    <command interpreter="python">normalize.matrix.py
+-d $dataset 
+-r ${center_rows} 
+-c ${center_cols} 
+-R ${var_adj_rows} 
+-C ${var_adj_cols} 
+-o ${output}
+
+</command>
+    <inputs>
+    	<param name="dataset" type="data" format='tabular' label="Data Set" help="Matrix to be normalized (tab-delimited format)"/>
+    	<param name="center_rows" type="select" label="Center Rows" help="Centering Method for Rows">
+    		<option value="none">No Centering</option>
+    		<option value="mean">Mean</option>
+    		<option value="median" selected="true">Median</option>
+    	</param>
+    	
+    	<param name="center_cols" type="select" label="Center Columns" help="Centering Method for Columns" >
+    		<option value="none" selected="true">No Centering</option>
+    		<option value="mean">Mean</option>
+    		<option value="median">Median</option>
+    	</param>
+    	
+    	<param name="var_adj_rows" type="select" label="Variance Adjustment for Rows" help="Variance Adjustment Method for Rows" >
+    		<option value="none" selected="true">No Adjustment</option>
+    		<option value="mad">Median Absolute Deviation</option>
+    		<option value="sd">Standard Deviation</option>
+    	</param>
+    	<param name="var_adj_cols" type="select" label="Variance Adjustment for Columns" help="Variance Adjustment Method for Columns" >
+    		<option value="none" selected="true">No Adjustment</option>
+    		<option value="mad">Median Absolute Deviation</option>
+    		<option value="sd">Standard Deviation</option>
+    	</param>
+    	
+    </inputs>
+    <outputs>
+        <data format="tabular" name="output" label="Normalized Matrix"/>
+    </outputs>
+<help>
+.. class:: infomark
+     
+**Normalize Matrix - Tool to normalize a matrix**
+
+----
+
+**Parameters**
+
+- **Center Rows** Centering Method for Rows
+
+         * No Centering
+	 * Mean
+	 * Median
+
+- **Center Columns** Centering Method for Columns
+
+         * No Centering
+	 * Mean
+	 * Median
+
+- **Variance Adjustment for Rows** Variance Adjustment Method for Rows
+
+         * No Adjustment
+	 * Median Absolute Deviation
+	 * Standard Deviation
+
+- **Variance Adjustment for Columns** Variance Adjustment Method for Columns
+
+         * No Adjustment
+	 * Median Absolute Deviation
+	 * Standard Deviation
+
+</help>
+</tool>
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/order.by.cl.R
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/order.by.cl.R	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,108 @@
+#!/usr/bin/env Rscript
+argspec <- c("order.by.cl.R re-orders a data matrix by cluster membership
+
+        Usage: 
+                order.by.cl.R -d <data.file> -c <cluster.assignment.file> 
+        Optional:
+                            -g (genes only)  -w (sort within each cluster)  -o <output_file>
+                \n\n")
+args <- commandArgs(TRUE)  ## a lone --help prints the usage text on stderr and quits
+if ( length( args ) == 1 && args =="--help") { 
+  write(argspec, stderr())
+  q();
+}
+
+lib.load.quiet <- function( package ) {  ## attach `package` (given as a bare name) without its startup messages
+   pkg.name <- as.character( substitute( package ) )  ## the unquoted symbol, as a string
+   suppressPackageStartupMessages( library( pkg.name, character.only=TRUE ) )
+}
+lib.load.quiet(getopt)
+lib.load.quiet( gplots )
+if ( any( c( 'flashClust', 'fastcluster' ) %in% installed.packages() ) ) {
+  if ( 'flashClust' %in% installed.packages() ) {
+    lib.load.quiet( flashClust )
+  } else {
+    if ( 'fastcluster' %in% installed.packages() ) {
+      lib.load.quiet( fastcluster )
+    }
+  }
+}
+
+
+spec <- matrix( c( "data.fname",      "d", 1, "character",
+                   "class.select",    "c", 1, "character",
+                   "genes.only",      "g", 0, "logical",
+                   "within.cl.srt",   "w", 0, "logical",
+                   "output.fname",    "o", 2, "character"
+                   ),
+                ncol=4,
+                byrow=TRUE
+               )
+
+## parse the command line and default whatever was left unset
+opt <- getopt( spec=spec )
+if ( is.null( opt$output.fname ) ) opt$output.fname <- sub( "tab$|csv$", "cdt", opt$data.fname )
+if ( is.null( opt$genes.only ) ) opt$genes.only <- FALSE
+if ( is.null( opt$within.cl.srt ) ) opt$within.cl.srt <- FALSE
+
+data <- as.matrix( read.delim( opt$data.fname, row.names=1, check.names=FALSE ) )
+
+## optionally drop features whose names contain "complex", "abstract" or "family"
+if ( opt$genes.only ) {
+  feats <- rownames( data )
+  gene.feats <- feats[ ! grepl( "complex|abstract|family", feats ) ]
+  data <- data[ gene.feats, , drop=FALSE ]
+}
+
+## cluster assignments: row names are the member IDs, column 1 the cluster number
+cls <- as.matrix( read.delim( opt$class.select, row.names=1 ) )
+cls <- cls[ order( cls[,1] ), , drop=FALSE ]
+
+row.cluster <- FALSE
+##  we assume this is a row-wise cluster if any clustered IDs are row names of the data
+##  (data is transposed so that the clustered members are always the columns below)
+if ( any( rownames( cls ) %in% rownames( data ) ) ) {
+  row.cluster <- TRUE
+  data <- t( data )
+}
+
+## keep only the assignments whose IDs are present in the data
+if ( ! all( rownames( cls ) %in% colnames( data ) ) ) {
+  ovp <- rownames( cls )
+  ovp <- ovp[ ovp %in% colnames( data ) ]
+  if ( length( ovp ) > 0 ) {
+    ## drop=FALSE keeps cls a matrix, so the cls[,1] look-ups below still work
+    cls <- cls[ ovp, , drop=FALSE ]
+  } else {
+    stop( "no samples in cluster are found in data file\n" )
+  }
+}
+
+## optionally order the members of each cluster by a hierarchical clustering of
+## just that cluster's columns (dist() and hclust() with their default settings)
+if ( opt$within.cl.srt ) {
+  cls.vect <- cls[ , 1 ]
+  names( cls.vect ) <- rownames( cls )
+  sorted.ids <- unlist( lapply( sort( unique( as.numeric( cls.vect ) ) ),
+                                function(i) {
+                                  elts <- names( cls.vect[ cls.vect %in% i ] )
+                                  ## hclust needs at least 2 objects; a singleton has nothing to sort
+                                  if ( length( elts ) < 2 ) return( elts )
+                                  sub.dist <- dist( t( data[ , elts, drop=FALSE ] ) )
+                                  return( elts[ hclust( sub.dist )$order ] )
+                                }
+                               )
+                       )
+  cls <- cls[ sorted.ids, , drop=FALSE ]
+}
+
+## re-order and update column names
+data <- data[, rownames(cls), drop=FALSE ]
+colnames( data ) <- paste( rownames(cls), paste( "cl", sprintf( "%02d", cls[,1] ), sep=""), sep="-" )
+
+##  now re-transpose
+if ( row.cluster ) {
+  data <- t( data )
+}
+write.table( data, opt$output.fname, sep="\t", col.names=NA, quote=FALSE )
+
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/order.by.cl.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/order.by.cl.py	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+import os
+import sys
+import subprocess
+
+select_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "order.by.cl.R")
+
+cmd_args = [ "Rscript", select_script_path ] + sys.argv[1:] 
+
+proc = subprocess.Popen( cmd_args, stderr=subprocess.PIPE, stdout=subprocess.PIPE )
+(stdoutdata, stderrdata) = proc.communicate()
+if proc.returncode:
+	sys.stderr.write(stderrdata)
+sys.stdout.write(stdoutdata)
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/order.by.cl.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/order.by.cl.xml	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,33 @@
+<tool id="order_by_clustering" name="Re-order Data Matrix by Cluster Analysis" force_history_refresh="True">
+    <command interpreter="python">order.by.cl.py
+-d $dataset 
+-c ${class_select}
+${genes_only}
+${within_cl_srt}
+-o ${output}
+
+</command>
+    <inputs>
+    	<param name="dataset" type="data" format="tabular" label="Matrix"/>
+    	<param name="class_select" type="data" format="rdata" label="Cluster result (rdata file)"/>
+	<param name="genes_only" type="boolean" label="Genes Only?" truevalue="-g" falsevalue="" checked="False"/>
+	<param name="within_cl_srt" type="boolean" label="Sort within-cluster members?" truevalue="-w" falsevalue="" checked="True"/>
+    </inputs>
+    <outputs>
+        <data format="tabular" name="output" label="Matrix Re-ordered by clusters"/>
+    </outputs>
+<help>
+.. class:: infomark
+     
+**Re-order Data Matrix by Cluster Analysis** - Tool to re-order the rows or columns of a data matrix so that members of the same cluster are adjacent
+
+**OUTPUT:**  A new Matrix, ordered by cluster membership
+
+----
+
+**Parameters**
+
+- **Matrix in tab-delimited format** Tab-delimited file
+
+</help>
+</tool>
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/partition.R
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/partition.R	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,168 @@
+#!/usr/bin/env Rscript
+
+argspec <- c("partition.R help TBD
+                \n\n")
+args <- commandArgs(TRUE)
+if ( identical( args, "--help" ) ) {  ## a lone --help prints the usage text on stderr and quits
+  write(argspec, stderr())
+  q()
+}
+
+lib.load.quiet <- function( package ) {  ## attach `package` (given as a bare name) without its startup messages
+   pkg.name <- as.character( substitute( package ) )  ## the unquoted symbol, as a string
+   suppressPackageStartupMessages( library( pkg.name, character.only=TRUE ) )
+}
+lib.load.quiet(getopt)
+lib.load.quiet( amap )
+lib.load.quiet( cluster )
+
+##  we're going to use the amap Dist function: the names() are the metric names we expose,
+##  the values are amap's own (misnamed) method names, e.g. our "cosine" is amap's "pearson"
+amap.distance <- c( euclidean="euclidean", maximum="maximum", manhattan="manhattan",
+                    canberra="canberra", binary="binary",
+                    cosine="pearson", abscosine="abspearson", pearson="correlation",
+                    abspearson="abscorrelation", spearman="spearman", kendall="kendall" )
+
+spec <- matrix( c( "data.fname",         "d", 1, "character",
+                   "algorithm",          "a", 2, "character", ## "km" (default) or "pam"
+                   "distance.metric",    "m", 2, "character", ## must be one of names( amap.distance )
+                   "dist.obj",           "D", 2, "logical",
+                   "direction",          "n", 2, "character",
+                   "num.k",              "k", 2, "integer",
+                   "output.name",        "o", 2, "character"
+                   ),
+                ncol=4,
+                byrow=TRUE
+               )
+
+opt <- getopt( spec=spec )
+## defaults for anything that was not given on the command line
+if ( is.null( opt$distance.metric ) ) { opt$distance.metric <- "euclidean" }
+if ( is.null( opt$algorithm ) ) { opt$algorithm <- "km" }
+if ( is.null( opt$dist.obj ) ) { opt$dist.obj <- FALSE }
+if ( is.null( opt$direction ) ) { opt$direction <- "cols"  }
+if ( is.null( opt$num.k ) ) { opt$num.k <- 10 }
+if ( is.null( opt$output.name ) ) { opt$output.name <- "partition.result" }
+
+data <- as.matrix( read.delim( opt$data.fname, header=TRUE, row.names=1 , check.names=FALSE ) )
+
+if ( opt$direction == "cols" ) {
+  ## need to transpose b/c both kmeans & pam cluster the rows
+  ## this shouldn't have an effect upon a distance matrix
+  data <- t( data )
+}
+if ( opt$num.k > nrow( data ) ) {
+  err.msg <- paste( "K specified is greater than the number of elements (", opt$direction, ") in data matrix to be clustered\n", sep="" )
+  stop( err.msg )
+}
+
+## mat.2.b.clustered is the object handed to pam()/kmeans() further down:
+## a dist object for PAM, a plain matrix for k-means
+mat.2.b.clustered <- data
+if ( opt$dist.obj ) {
+  ## the input matrix already holds pairwise distances
+  if ( opt$algorithm=="km" ) {
+    ## kmeans() has no dist-object interface, so the square matrix is left as it is
+    ## and its rows are clustered as coordinates.  (This branch used to test an
+    ## undefined `verbose` variable, which aborted the script.)
+    warning( "k-means requested on a distance matrix: its rows are clustered as coordinates" )
+  } else {
+    mat.2.b.clustered <- as.dist( data )
+  }
+} else {
+  ## this is a data matrix; only PAM needs a dist object built from it
+  if ( opt$algorithm != "km" ) {
+
+    if ( ! opt$distance.metric %in% names( amap.distance ) ) stop("unsupported distance.")
+    mat.2.b.clustered <- Dist( data, method=as.character( amap.distance[ opt$distance.metric ] ) )
+    attr( mat.2.b.clustered, "method" ) <- opt$distance.metric
+
+  } else {
+    mat.2.b.clustered <- data
+  }
+}
+
+## now run the clustering
+partcl.res <- cl <- NA
+if ( opt$algorithm=="pam" ) {
+  partcl.res <- pam( x=mat.2.b.clustered,
+                     k=opt$num.k,
+                     metric=( ifelse( inherits(data, "dist" ), 
+                                      attr( data, "method" ),  ## this is ok if data is a dist object (b/c pam will ignore it)
+                                      opt$distance.metric) ) ) ##,
+                     ##cluster.only=TRUE )
+  cl <- partcl.res$clustering
+  ## order the clusters themselves by a hierarchical clustering of their medoids
+  if ( is.character( partcl.res$medoids ) ) {
+    medoids <- data[ partcl.res$medoids, ]
+  } else {
+    ##partcl.res$medoids is a matrix -- we shouldn't get this (only if mat.2.b.clustered is a data matrix)
+    medoids <- partcl.res$medoids
+  }
+  med.names <- rownames( medoids )
+  med.hc <- hclust( as.dist( as.matrix( mat.2.b.clustered )[ med.names, med.names ] ) )
+  med.cls <- as.numeric( cl[ med.names[ med.hc$order ] ] )
+  ## member names per cluster, listed in the medoid-dendrogram order of the clusters
+  cl.list <- lapply( med.cls, function(i) names( cl[ cl %in% i ] ) )
+  names( cl.list ) <- med.cls 
+  ## within each cluster, order the members by their own hierarchical clustering
+  cl.list <- lapply( cl.list,
+                     function( elts ) {
+                       if ( length( elts ) == 1 ) {
+                         retval <- 1
+                         names( retval ) <- elts
+                       } else {
+                         subdist <- as.dist( as.matrix( mat.2.b.clustered )[ elts, elts ] )
+                         sub.hc <- hclust( subdist )
+                         ## name slot i after the i-th dendrogram leaf; naming sub.hc$order by
+                         ## label and sorting it (as before) gave the inverse permutation instead
+                         retval <- setNames( seq_along( sub.hc$order ), sub.hc$labels[ sub.hc$order ] )
+                       }
+                       return( retval )
+                     }
+                    )
+
+  fnl.ord <- as.character( unlist( lapply( cl.list, names ) ) )
+  cl <- cl[ fnl.ord ]
+} else {
+  partcl.res <- kmeans( x=mat.2.b.clustered,
+                        centers=opt$num.k )
+  cl <- partcl.res$cluster
+  centroids <- partcl.res$centers
+  cent.hc <- hclust( Dist( centroids, method=as.character( amap.distance[ opt$distance.metric ] ) ) )
+  cent.cls <- as.numeric( cent.hc$labels[ cent.hc$order ] )
+  ## member names per cluster, listed in the centroid-dendrogram order of the clusters
+  cl.list <- lapply( cent.cls, function(i) names( cl[ cl %in% i ] ) )
+  names( cl.list ) <- cent.cls 
+  ## within each cluster, order the members by their own hierarchical clustering
+  cl.list <- lapply( cl.list,
+                     function( elts ) {
+                       if ( length( elts ) == 1 ) {
+                         retval <- 1
+                         names( retval ) <- elts
+                       } else {
+                         if ( all( elts %in% colnames( mat.2.b.clustered ) ) ) {
+                           submat <- mat.2.b.clustered[ elts, elts ]
+                         } else {
+                           submat <- mat.2.b.clustered[ elts, ]
+                         }
+                         subdist <- Dist( submat, method=as.character( amap.distance[ opt$distance.metric ] ) )
+                         sub.hc <- hclust( subdist )
+                         ## name slot i after the i-th dendrogram leaf; naming sub.hc$order by
+                         ## label and sorting it (as before) gave the inverse permutation instead
+                         retval <- setNames( seq_along( sub.hc$order ), sub.hc$labels[ sub.hc$order ] )
+                       }
+                       return( retval )
+                     }
+                    )
+
+  fnl.ord <- as.character( unlist( lapply( cl.list, names ) ) )
+  cl <- cl[ fnl.ord ]
+}
+
+if ( opt$direction == "cols" ) {
+  ## need to transpose back
+  data <- t( data )
+}
+save( file=opt$output.name, partcl.res, cl, data )
+
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/partition.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/partition.py	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+import os
+import sys
+import subprocess
+
+select_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "partition.R")
+
+cmd_args = [ "Rscript", select_script_path ] + sys.argv[1:] 
+
+proc = subprocess.Popen( cmd_args, stderr=subprocess.PIPE, stdout=subprocess.PIPE )
+(stdoutdata, stderrdata) = proc.communicate()
+if proc.returncode:
+	sys.stderr.write(stderrdata)
+sys.stdout.write(stdoutdata)
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/partition.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/partition.xml	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,93 @@
+<tool id="partiton_clust" name="Partition Clustering" force_history_refresh="True">
+    <command interpreter="python">partition.py
+-d $dataset 
+${dist_obj}
+-n ${direction} 
+-a $alg_cond.algorithm
+#if $alg_cond.algorithm == 'pam' # -m ${alg_cond.distance_metric}
+#end if
+-k ${numk} 
+-o ${output}
+
+</command>
+    <inputs>
+    	<param name="dataset" type="data" format='tabular' label="Data Set"  help="Specify the data matrix (tab-delimited) to be clustered"/>
+	<param name="dist_obj" type="boolean" label="Distance Object (R dist object)?" truevalue="-D" falsevalue="" checked="False" help="Check if the matrix contains the pairwise distances between a set of objects"/>
+
+    	<param name="direction" type="select" label="Cluster Columns or Rows?" help="Specify the matrix dimension to cluster (see help below)">
+	  <option value="cols">Columns (Samples)</option>
+	  <option value="rows" selected='true'>Rows (Genes)</option>
+    	</param>
+	
+	<conditional name='alg_cond'>
+	  <param name="algorithm" type="select" label="PAM or K-means?" help="Specify the partition cluster method to use (see help below)">
+	    <option value="km">K-means</option>
+	    <option value="pam" selected='true'>PAM</option>
+	  </param>
+	  <when value='pam'>
+	    <param name="distance_metric" type="select" label="Distance Metric" help="Specify the distance metric to use (see help below)">
+	      <option value="cosine" selected='true'>Cosine</option>
+	      <option value="abscosine">Absolute Cosine</option>
+	      <option value="pearson">Pearson</option>
+	      <option value="abspearson">Absolute Pearson</option>
+	      <option value="spearman">Spearman</option>
+	      <option value="kendall">Kendall</option>
+	      <option value="euclidean">Euclidean</option>
+	      <option value="maximum">Maximum</option>
+	      <option value="manhattan">Manhattan (AKA city block)</option>
+	      <option value="canberra">Canberra</option>
+	      <option value="binary">Binary</option>
+	    </param>
+	  </when>
+	</conditional>
+    	<param name="numk" type="integer" label="Number of Clusters" value="50" help="Specify the number of clusters to use"/>
+    	
+    </inputs>
+    <outputs>
+        <data format="rdata" name="output" label="Partition Clustering Data (RData)"/>
+    </outputs>
+<help>
+.. class:: infomark
+     
+**Perform Partition Clustering (Cluster Samples) on a specified data set**
+
+----
+
+**Parameters**
+
+- **Data Set** - Specify the data matrix to be clustered.  Data must be formatted as follows:
+
+         * Tab-delimited
+         * Use row/column headers
+
+- **Distance Object** Specify whether or not the data set is a pairwise distance matrix
+
+- **Cluster Samples or Genes** - Specify the dimension of the matrix to cluster:
+
+         * Rows (Genes)
+         * Columns (Samples)
+
+- **PAM or K-means?** Specify which partition clustering method to use - users have choice of:
+
+         * PAM (Partition Around Medoids)
+         * K-means
+
+- **Distance Metric** Specify the distance metric to use.  Note, this is ONLY AVAILABLE IF PAM IS THE ALGORITHM BEING USED.  Choice of:
+
+	 * Cosine (AKA uncentered pearson)
+	 * Absolute Cosine (AKA uncentered pearson, absolute value)
+         * Pearson (pearson correlation)
+	 * Absolute Pearson (pearson correlation, absolute value)
+         * Spearman (spearman correlation)
+	 * Kendall (Kendall's Tau)
+         * Euclidean (euclidean distance)
+	 * Maximum
+	 * Manhattan (AKA city block)
+	 * Canberra
+	 * Binary
+
+
+- **Number of Clusters** Specify the number of clusters to use
+
+</help>
+</tool>
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/rdata.2.out.R
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/rdata.2.out.R	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,224 @@
+#!/usr/bin/env Rscript
+argspec <- c("rdata.2.out.R writes a saved cluster result (RData) out in another format
+
+        Usage: 
+                rdata.2.out.R -d <cluster.result.rdata> 
+        Optional:
+                            -D <2nd.cluster.result.rdata> -f <cls-only|newick|cdt|tabular> -p <report.dir> -o <output_file>
+                \n\n")
+args <- commandArgs(TRUE)  ## a lone --help prints the usage text on stderr and quits
+if ( length( args ) == 1 && args =="--help") { 
+  write(argspec, stderr())
+  q();
+}
+
+lib.load.quiet <- function( package ) {  ## attach `package` (given as a bare name) without its startup messages
+   pkg.name <- as.character( substitute( package ) )  ## the unquoted symbol, as a string
+   suppressPackageStartupMessages( library( pkg.name, character.only=TRUE ) )
+}
+
+lib.load.quiet( getopt )
+lib.load.quiet( ctc )
+if ( any( c( 'flashClust', 'fastcluster' ) %in% installed.packages() ) ) {
+  if ( 'flashClust' %in% installed.packages() ) {
+    lib.load.quiet( flashClust )
+  } else {
+    if ( 'fastcluster' %in% installed.packages() ) {
+      lib.load.quiet( fastcluster )
+    }
+  }
+}
+
+spec <- matrix( c( "dataset",             "d", 1, "character",
+                   "dataset2",            "D", 2, "character",
+                   "output.format",       "f", 2, "character",
+                   "output.report.dir",   "p", 2, "character",
+                   "output.fname",        "o", 2, "character"
+                   ),
+                ncol=4,
+                byrow=TRUE
+               )
+
+## output.format has to be defaulted before output.fname: the default file name is built from it
+opt <- getopt( spec=spec )
+if ( is.null( opt$output.report.dir ) ) { opt$output.report.dir <- "report" }
+if ( is.null( opt$output.format ) ) { opt$output.format <- "cdt" }
+if ( is.null( opt$output.fname ) ) { opt$output.fname <- file.path( opt$output.report.dir, paste( "data", opt$output.format, sep="." ) ) }
+
+
+load( opt$dataset )  ## should load the cl, treecl.res (or partcl.res) and data
+
+if ( opt$output.format %in% c( "cls-only", "newick" ) ) {
+  if ( opt$output.format == "cls-only" ) {
+    ## two-column table: member ID, cluster number (written without a header row)
+    cl <- cbind( names( cl ), as.numeric( cl ) )
+    colnames( cl ) <- c( "ID", "Class" )
+
+    opt$output.fname <- gsub( "cls-only$", "tab", opt$output.fname )
+    write.table( cl, opt$output.fname, sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE )
+  } else {
+    ##if ( opt$output.format == "newick" ) {
+
+    if ( ! exists( "treecl.res" ) ) stop( "no HAC result found in results file provided - necessary to generate a Newick formatted file.\n" )
+    write( hc2Newick( treecl.res ), opt$output.fname )
+  }
+} else {
+  ## inherits=FALSE: a plain exists( 'data' ) is always TRUE because it finds the utils::data function
+  if ( ! exists( 'data', inherits=FALSE ) ) stop( "No data object in the rdata file provided for ", opt$output.format, " format!!\n" )
+  if ( inherits( data, "dist" ) ) stop( "data provided is a distance matrix - not a data matrix.  Can't generate TreeView or Tab-delimited files w/distance matrices!\n" )
+
+  ## the rest of this is for the remaining output formats
+  ##  pre-set the cluster results for rows & cols to NULL
+  hr <- hr.cl <- hc <- hc.cl <- NULL
+  if ( exists( 'treecl.res' ) ) {
+    if ( is.null( treecl.res$dist.method ) ) treecl.res$dist.method <- 'euclidean'  # just set it to some stub so that the ctc fn's don't complain
+    if ( all( names( cl ) %in% rownames( data ) ) ) {
+      hr <- treecl.res
+      hr.cl <- cl
+    } else if ( all( names( cl ) %in% colnames( data ) ) ) {
+      hc <- treecl.res
+      hc.cl <- cl
+    } else {
+      stop( "Specified cluster result does not come from this data set\n" )
+    }
+
+  } else {
+    if ( exists( 'partcl.res' ) ) {
+      if ( all( names( cl ) %in% rownames( data ) ) ) {
+        hr <- NA
+        hr.cl <- cl
+        orig.data <- data
+        data <- data[ names( cl ), ]  ## partcl.res should now be sorted in order of cluster
+      } else if ( all( names( cl ) %in% colnames( data ) ) ) {
+        hc <- NA
+        hc.cl <- cl
+        orig.data <- data
+        data <- data[ , names( cl ) ]  ## partcl.res should now be sorted in order of cluster
+      } else {
+        stop( "Specified cluster result does not come from this data set\n" )
+      }
+    } else {
+      stop( 'could not find a valid cluster result to use for primary direction\n' )
+    }
+  }
+
+  if ( ! is.null( opt$dataset2 ) ) {
+    ## prep for loading new cluster result
+    if ( ! exists( 'orig.data' ) ) orig.data <- data
+    if ( exists( "treecl.res" ) ) {
+      rm( treecl.res )
+    } else if ( exists( "partcl.res" ) ) {
+      rm( partcl.res )
+    } else stop( "no primary clustering found when generating the 2nd\n" )
+    rm( cl, data )
+
+    load( opt$dataset2 ) ## this should bring in the cl obj for the 2nd direction
+
+    ## check the data 1st
+    if ( length( orig.data ) != length( data ) ) stop( "incompatible cluster results in 2nd results file - matrices are diff lengths\n" )
+    if ( nrow( orig.data ) != nrow( data ) ) stop( "incompatible cluster results in 2nd results file - matrices have diff dimensions\n" )
+    if ( any( is.na( orig.data ) != is.na( data ) ) || ! all( orig.data == data, na.rm=TRUE ) )  stop( "incompatible cluster results in 2nd results file - matrices contain diff contents\n" )
+    ## looks like data is the same, so drop a copy and start chugging
+    rm( orig.data ); gc()
+    ## the reload replaced `data`, discarding any re-ordering done above for a primary
+    ## partition (kmeans/pam) result, so apply that ordering again
+    if ( identical( hr, NA ) ) data <- data[ names( hr.cl ), ]
+    if ( identical( hc, NA ) ) data <- data[ , names( hc.cl ) ]
+
+    if ( exists( 'treecl.res' ) ) {
+      if ( is.null( treecl.res$dist.method ) ) treecl.res$dist.method <- 'euclidean'  # just set it to some stub so that the ctc fn's don't complain
+
+      if ( is.null( hr ) ) {
+        ## cl is a named vector, so membership has to be tested on names(), not rownames()
+        if ( all( names( cl ) %in% rownames( data ) ) ) {
+          hr <- treecl.res
+          hr.cl <- cl
+        } else {
+          stop( "results file for 2nd direction doesn't contain cluster for 2ndary direction (rows in this case)\n" )
+        }
+      } else if ( is.null( hc ) ) {
+        if ( all( names( cl ) %in% colnames( data ) ) ) {
+          hc <- treecl.res
+          hc.cl <- cl
+        } else {
+          stop( "results file for 2nd direction doesn't contain cluster for 2ndary direction (genes in this case)\n" )
+        }
+      } else stop( "should never get here\n" )
+    } else if ( exists( 'partcl.res' ) ) {
+      if ( is.null( hr ) ) {
+        if ( all( names( cl ) %in% rownames( data ) ) ) {
+          hr <- NA
+          hr.cl <- cl
+          data <- data[ names( cl ), ]  ## partcl.res should now be sorted in order of cluster
+        } else {
+          stop( "results file for 2nd direction doesn't contain cluster for 2ndary direction (rows in this case)\n" )
+        }
+      } else if ( is.null( hc ) ) {
+        if ( all( names( cl ) %in% colnames( data ) ) ) {
+          hc <- NA
+          hc.cl <- cl
+          data <- data[ , names( cl ) ]  ## partcl.res should now be sorted in order of cluster
+        } else {
+          stop( "results file for 2nd direction doesn't contain cluster for 2ndary direction (genes in this case)\n" )
+        }
+      } else {
+        stop( "should never get here\n" )
+      }
+    }
+  }
+
+  ## Now, re-set hc & hr to NULL if they were set to NA
+  ## we used NA to signify that they were set by kmeans/pam, but now, we need to reset them
+  ## for the following lines (that generate the dendrograms (if there was an hclust result)
+  ## identical() is used b/c is.na() on an hclust object is not a single TRUE/FALSE
+  if ( identical( hr, NA ) ) hr <- NULL
+  if ( identical( hc, NA ) ) hc <- NULL
+
+  if ( ! exists( 'data', inherits=FALSE ) ) stop( "No data object in the rdata file provided!!\n" )
+
+  if ( is.null( hc ) ) hc <- list( order=seq_len( ncol( data ) ) )
+  if ( is.null( hr ) ) hr <- list( order=seq_len( nrow( data ) ) )
+
+  if ( opt$output.format == "tabular" ) {
+    write.table( data[ hr$order, hc$order ], opt$output.fname, quote=FALSE, sep="\t", col.names=NA )
+  } else if ( opt$output.format == "cdt" ) {
+    if ( !file.exists( opt$output.report.dir ) ){
+      dir.create(opt$output.report.dir, recursive=TRUE)
+    }
+
+    treeview.fname.stem <- file.path( opt$output.report.dir, "cluster.heatmap")
+    fnames <- character()
+    if ( inherits( hr, "hclust" ) ) {
+      fname <- paste( treeview.fname.stem, ".gtr", sep="" )
+      ## we manually specify a 'stub' distance b/c o/w it'll try using the attr(hr,"method")
+      ##  and the r2gtr fn's get grumpy if the distance was anything starting with a 'p'
+      r2gtr( hr, file=fname, distance='stub' )  
+      fnames <- c( fnames, fname )
+    }
+    if ( inherits( hc, "hclust" ) ) {
+      fname <- paste( treeview.fname.stem, ".atr", sep="" )
+      r2atr( hc, file=fname, distance='stub' )
+      fnames <- c( fnames, fname )
+    }
+
+    fname <- paste( treeview.fname.stem, ".cdt", sep="" )
+    r2cdt( hr, hc, data, file=fname )
+    fnames <- c( fnames, fname )
+
+    ## jtv file now
+    jtv.str <- '<DocumentConfig><UrlExtractor/><ArrayUrlExtractor/><Views><View type="Dendrogram" dock="1"><ColorExtractor contrast="2.0"><ColorSet zero="#FFFFFF" down="#0000FF"/></ColorExtractor><ArrayDrawer/><GlobalXMap current="Fill"><FixedMap type="Fixed"/><FillMap type="Fill"/><NullMap type="Null"/></GlobalXMap><GlobalYMap current="Fill"><FixedMap type="Fixed"/><FillMap type="Fill"/><NullMap type="Null"/></GlobalYMap><ZoomXMap><FixedMap type="Fixed"/><FillMap type="Fill"/><NullMap type="Null"/></ZoomXMap><ZoomYMap><FixedMap type="Fixed"/><FillMap type="Fill"/><NullMap type="Null"/></ZoomYMap><TextView><TextView face="Monospaced" size="14"><GeneSummary/></TextView><TextView face="Monospaced" size="14"><GeneSummary/></TextView><TextView face="Monospaced" size="14"><GeneSummary/></TextView><TextView face="Monospaced" size="14"><GeneSummary/></TextView></TextView><ArrayNameView face="Monospaced" size="14"><ArraySummary included="0"/></ArrayNameView><AtrSummary/><GtrSummary/></View></Views></DocumentConfig>'
+    fname <- paste( treeview.fname.stem, ".jtv", sep="" )
+    cat( jtv.str, file=fname )
+    fnames <- c( fnames, fname )
+    ## bundle the TreeView files; paths are shell-quoted so spaces etc. cannot split the command
+    cmd <- paste( "tar -zcf",
+                  shQuote( opt$output.fname ),
+                  paste( "--directory=", shQuote( opt$output.report.dir ), sep="" ),
+                  paste( shQuote( basename( fnames ) ), collapse=" " ) )
+    system( cmd )
+  }
+}
+
+
+
+
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/rdata.2.out.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/rdata.2.out.py	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+import os
+import sys
+import subprocess
+
+select_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "rdata.2.out.R")
+# Run the R script that lives next to this wrapper, forwarding every command-line argument.
+cmd_args = [ "Rscript", select_script_path ] + sys.argv[1:]
+proc = subprocess.Popen( cmd_args, stderr=subprocess.PIPE, stdout=subprocess.PIPE )
+(stdoutdata, stderrdata) = proc.communicate()
+sys.stdout.write(stdoutdata)
+if proc.returncode:
+	sys.stderr.write(stderrdata)  # the child's stderr is relayed only when it failed
+	sys.exit(proc.returncode)  # propagate the failure instead of always exiting 0
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/rdata.2.out.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/rdata.2.out.xml	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,86 @@
+<tool id="rdata_2_out" name="RData cluster result to Text output" force_history_refresh="True">
+    <command interpreter="python">rdata.2.out.py
+-d $dataset 
+-f ${out_format.format}
+
+#if str($out_format.format) == 'cls-only':
+-o ${output_cls}
+#end if
+
+#if str($out_format.format) == 'newick':
+-o ${output_newick}
+#end if
+
+#if str($out_format.format) == 'cdt':
+#if str($out_format.cdt_sec_dir_data) != "None":
+-D ${out_format.cdt_sec_dir_data}
+#end if
+-o ${output_cdt}
+-p ${output_cdt.files_path}
+#end if
+
+#if str($out_format.format) == 'tabular':
+#if str($out_format.tab_sec_dir_data) != "None":
+-D ${out_format.tab_sec_dir_data}
+#end if
+-o ${output_tab}
+-p ${output_tab.files_path}
+#end if
+
+</command>
+    <inputs>
+    	<param name="dataset" type="data" format='rdata' label="Clustering Classification"/>
+	<conditional name="out_format" >
+	  <param name="format" type="select" label="Select Output Format" >
+	    <option value="cls-only" selected='true'>Cluster Assignments Only</option>
+	    <option value="newick">Newick</option>
+	    <option value="cdt">TreeView (CDT)</option>
+	    <option value="tabular">Tab-delimited (all data, ordered by clusters)</option>
+	  </param>
+	  <when value='cdt'>
+	    <param name="cdt_sec_dir_data" type="data" format='rdata' label="Previous Cluster result #2 (secondary result, e.g. rows)" optional="true"/>
+	  </when>
+	  <when value='tabular'>
+	    <param name="tab_sec_dir_data" type="data" format='rdata' label="Previous Cluster result #2 (secondary result, e.g. rows)" optional="true"/>
+	  </when>
+	</conditional>
+    </inputs>
+    <outputs>
+        <data format="tabular" name="output_cls" label="Cluster Assignments Only (tab-delimited)" >
+	  <filter>(out_format['format']=="cls-only")</filter>
+	</data>
+        <data format="tgz" name="output_cdt" label="CDT File (tgz with CDT and ATR or GTR)" >
+	  <filter>(out_format['format']=="cdt")</filter>
+	</data>
+        <data format="tabular" name="output_newick" label="Newick File" >
+	  <filter>(out_format['format']=="newick")</filter>
+	</data>
+        <data format="tabular" name="output_tab" label="Tab-delimited File" >
+	  <filter>(out_format['format']=="tabular")</filter>
+	</data>
+    </outputs>
+<help>
+.. class:: infomark
+     
+**Convert RData file with cluster assignments and data to text (see below for text file options)** - Tool to convert a clustering result stored in RData format (as used by the other tools in the Cluster-Tools Suite) into one of the text-based output formats listed below
+
+
+**OUTPUT:**  A file in the selected output format (cluster assignments, Newick, TreeView/CDT tarball, or tab-delimited matrix)
+
+----
+
+**Parameters**
+
+- **Clustering Classification** Specify the clustering classification (RData file format - use the 'Convert tab-delimited Cluster Assignments to RData' tool to convert assignments in tab-delimited format).
+- **Select Output Format** Choice of the following
+         * Cluster Assignments Only (2-column tab-delimited (ID-Cluster pairs))
+         * Newick - dendrogram in Newick format
+         * TreeView (CDT) - tarball (tgz file) with requisite files (e.g. cdt, atr, gtr, jtv files)
+         * Tab-delimited (all data, ordered by clusters)
+
+- **IF Output Format is either TreeView or Tab-delimited (all data, ordered by clusters)**
+         * **Previous Cluster result #2** Specify the clustering classification for the 2nd dimension (RData file format - use the 'Convert tab-delimited Cluster Assignments to RData' tool to convert assignments in tab-delimited format).
+
+
+</help>
+</tool>
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/remove.degenerate.values.R
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/remove.degenerate.values.R	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,41 @@
+#!/usr/bin/env Rscript
+argspec <- c("remove.degenerate.values.R replaces degenerate values (NaNs & Infs) with a user-specified value
+
+        Usage: 
+                remove.degenerate.values.R -d <data.file> 
+        Optional:
+                                           -r <replacement_value> (default is NA)
+                                           -o <output_file>
+                \n\n")
+args <- commandArgs( trailingOnly=TRUE )
+if ( identical( args, "--help" ) ) {  ## a lone --help prints the usage text and quits
+  write( argspec, file=stderr() )
+  q()
+}
+
+## Attach a package, given as a bare (unquoted) name, without its start-up messages
+lib.load.quiet <- function( package ) {
+   suppressPackageStartupMessages( library( as.character( substitute( package ) ), character.only=TRUE ) )
+}
+lib.load.quiet( getopt )
+
+## One row per option: long name, short flag, getopt argument mask (1=required, 2=optional), value type
+spec <- matrix( c( "data.fname",      "d", 1, "character",
+                   "replacement.val", "r", 2, "character",
+                   "output.fname",    "o", 2, "character" ),
+                ncol=4,
+                byrow=TRUE
+               )
+
+opt <- getopt( spec=spec )
+
+data <- as.matrix( read.delim( opt$data.fname, row.names=1, check.names=FALSE ) )
+if ( is.null( opt$replacement.val ) ) { opt$replacement.val <- NA }  ## no -r given: replace with NA
+opt$replacement.val <- as.numeric( opt$replacement.val )  ## numeric, not integer, so e.g. 0.5 is not truncated to 0
+if ( is.null( opt$output.fname ) ) { opt$output.fname <- paste( "degenerate.vals.replaced", basename( opt$data.fname ), sep="." ) }
+
+## Replace NaN and +/-Inf entries with the replacement value (ordinary NAs are left untouched)
+if ( any(is.nan(data)) ) data[ is.nan( data ) ] <- opt$replacement.val
+if ( any(is.infinite(data)) ) data[ is.infinite( data ) ] <- opt$replacement.val
+
+write.table( data, opt$output.fname, sep="\t", quote=FALSE, col.names=NA )
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/remove.degenerate.values.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/remove.degenerate.values.py	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+import os
+import sys
+import subprocess
+
+select_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "remove.degenerate.values.R")
+# Run the R script that lives next to this wrapper, forwarding every command-line argument.
+cmd_args = [ "Rscript", select_script_path ] + sys.argv[1:]
+proc = subprocess.Popen( cmd_args, stderr=subprocess.PIPE, stdout=subprocess.PIPE )
+(stdoutdata, stderrdata) = proc.communicate()
+sys.stdout.write(stdoutdata)
+if proc.returncode:
+	sys.stderr.write(stderrdata)  # the child's stderr is relayed only when it failed
+	sys.exit(proc.returncode)  # propagate the failure instead of always exiting 0
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/remove.degenerate.values.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/remove.degenerate.values.xml	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,30 @@
+<tool id="remove_degenerate_values" name="Remove Degenerate Values" force_history_refresh="True">
+    <command interpreter="python">remove.degenerate.values.py
+-d $dataset -r ${replacement_value}
+-o ${output}
+
+</command>
+    <inputs>
+    	<param name="dataset" type="data" format='tabular' label="Matrix with Degenerate Values"/>
+	<param name="replacement_value" type="text" label="Value used to replace NaNs and Infs in matrix" value="NA" />
+    </inputs>
+    <outputs>
+        <data format="tabular" name="output" label="Matrix with degenerate values (NaNs and Infs) replaced"/>
+    </outputs>
+<help>
+.. class:: infomark
+     
+**Remove Degenerate Values** - Tool to replace 'degenerate' values (such as NaNs and Infs) from a data matrix with a user-specified value.  **Note**, NAs are not degenerate values.
+
+**OUTPUT:**  A new matrix without degenerate values
+
+----
+
+**Parameters**
+
+- **Matrix with Degenerate Values** Input matrix that potentially contains degenerate values
+
+- **Value used to replace NaNs and Infs in matrix** Single value that is used to replace the degenerate values
+
+</help>
+</tool>
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/remove.nulls.R
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/remove.nulls.R	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,40 @@
+#!/usr/bin/env Rscript
+argspec <- c("remove.nulls.R removes the 'null' samples (columns whose name starts with 'na') from a data matrix
+
+        Usage: 
+                remove.nulls.R -d <data.file> 
+        Optional:
+                               -o <output_file>
+                \n\n")
+args <- commandArgs( trailingOnly=TRUE )
+if ( identical( args, "--help" ) ) {  ## a lone --help prints the usage text and quits
+  write( argspec, file=stderr() )
+  q()
+}
+
+## Attach a package, given as a bare (unquoted) name, without its start-up messages
+lib.load.quiet <- function( package ) {
+   suppressPackageStartupMessages( library( as.character( substitute( package ) ), character.only=TRUE ) )
+}
+lib.load.quiet( getopt )
+
+## One row per option: long name, short flag, getopt argument mask (1=required, 2=optional), value type
+spec <- matrix( c( "data.fname",      "d", 1, "character",
+                   "output.fname",    "o", 2, "character" ),
+                ncol=4,
+                byrow=TRUE
+               )
+
+opt <- getopt( spec=spec )
+if ( is.null( opt$output.fname ) ) { opt$output.fname <- 'merge_merge.reals.tab' }
+
+mat <- as.matrix( read.delim( opt$data.fname, row.names=1, check.names=FALSE ) )
+## keep only the first whitespace-delimited token of every column name
+cnames <- vapply( strsplit( colnames( mat ), "\\s+" ), function(x) x[1], character(1) )
+colnames( mat ) <- cnames
+## columns whose name starts with "na" are dropped; drop=FALSE keeps a matrix even when a single column survives
+reals <- mat[ , !grepl( "^na", cnames ), drop=FALSE ]
+
+## emit an explicit "ID" header row, followed by the ID column and the remaining data
+reals <- rbind( c( "ID", colnames( reals ) ), cbind( rownames( reals ), reals ) )
+write.table( reals, opt$output.fname, quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE )
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/remove.nulls.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/remove.nulls.py	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+import os
+import sys
+import subprocess
+
+select_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "remove.nulls.R")
+# Run the R script that lives next to this wrapper, forwarding every command-line argument.
+cmd_args = [ "Rscript", select_script_path ] + sys.argv[1:]
+proc = subprocess.Popen( cmd_args, stderr=subprocess.PIPE, stdout=subprocess.PIPE )
+(stdoutdata, stderrdata) = proc.communicate()
+sys.stdout.write(stdoutdata)
+if proc.returncode:
+	sys.stderr.write(stderrdata)  # the child's stderr is relayed only when it failed
+	sys.exit(proc.returncode)  # propagate the failure instead of always exiting 0
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/remove.nulls.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/remove.nulls.xml	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,27 @@
+<tool id="remove_null_samples" name="Remove Null Samples" force_history_refresh="True">
+    <command interpreter="python">remove.nulls.py
+-d $dataset
+-o ${output}
+
+</command>
+    <inputs>
+    	<param name="dataset" type="data" format='tabular' label="IPL Matrix from Paradigm containing Null and Real Samples"/>
+    </inputs>
+    <outputs>
+        <data format="tabular" name="output" label="Real Sample-only IPL Matrix"/>
+    </outputs>
+<help>
+.. class:: infomark
+     
+**Remove Null Samples** - Tool to remove the 'null' samples from an IPL matrix result from Paradigm
+
+**OUTPUT:**  A new matrix without null samples
+
+----
+
+**Parameters**
+
+- **IPL Matrix from Paradigm that contains null samples** Input matrix that potentially contains null samples
+
+</help>
+</tool>
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/remove.tcga.normals.R
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/remove.tcga.normals.R	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,41 @@
+#!/usr/bin/env Rscript
+argspec <- c("remove.tcga.normals.R removes TCGA normal samples from a given matrix
+
+        Usage: 
+                remove.tcga.normals.R -d <data.file> 
+        Optional:
+                                           -o <output_file>
+                \n\n")
+args <- commandArgs( trailingOnly=TRUE )
+if ( identical( args, "--help" ) ) {  ## a lone --help prints the usage text and quits
+  write( argspec, file=stderr() )
+  q()
+}
+
+## Attach a package, given as a bare (unquoted) name, without its start-up messages
+lib.load.quiet <- function( package ) {
+   suppressPackageStartupMessages( library( as.character( substitute( package ) ), character.only=TRUE ) )
+}
+lib.load.quiet( getopt )
+
+## One row per option: long name, short flag, getopt argument mask (1=required, 2=optional), value type
+spec <- matrix( c( "data.fname",      "d", 1, "character",
+                   "output.fname",    "o", 2, "character" ),
+                ncol=4,
+                byrow=TRUE
+               )
+
+opt <- getopt( spec=spec )
+if ( is.null( opt$output.fname ) ) { opt$output.fname <- 'merge_merge.tumors.tab' }
+
+mat <- as.matrix( read.delim( opt$data.fname, row.names=1, check.names=FALSE ) )
+## Filter only when the first column name splits into exactly 4 dash-separated fields
+## (the ncol guard avoids a subscript error on a matrix without columns)
+if ( ncol( mat ) > 0 && length( strsplit( colnames( mat )[1], "-" )[[1]] ) == 4 ) {
+  ## columns whose 4th field starts with "1" are treated as normal samples
+  sample.code <- vapply( strsplit( colnames( mat ), "-" ), function(x) x[4], character(1) )
+  norms <- grepl( "^1", sample.code )
+  ## drop=FALSE keeps the matrix shape (and the column name) when a single tumor column remains
+  if ( any( norms ) ) mat <- mat[ , ! norms, drop=FALSE ]
+}
+write.table( mat, opt$output.fname, quote=FALSE, sep="\t", col.names=NA )
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/remove.tcga.normals.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/remove.tcga.normals.py	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+import os
+import sys
+import subprocess
+
+select_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "remove.tcga.normals.R")
+# Run the R script that lives next to this wrapper, forwarding every command-line argument.
+cmd_args = [ "Rscript", select_script_path ] + sys.argv[1:]
+proc = subprocess.Popen( cmd_args, stderr=subprocess.PIPE, stdout=subprocess.PIPE )
+(stdoutdata, stderrdata) = proc.communicate()
+sys.stdout.write(stdoutdata)
+if proc.returncode:
+	sys.stderr.write(stderrdata)  # the child's stderr is relayed only when it failed
+	sys.exit(proc.returncode)  # propagate the failure instead of always exiting 0
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/remove.tcga.normals.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/remove.tcga.normals.xml	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,28 @@
+<tool id="remove_tcga_normal_samples" name="Remove TCGA Normal Samples" force_history_refresh="True">
+    <command interpreter="python">remove.tcga.normals.py
+-d $dataset
+-o ${output}
+
+</command>
+    <inputs>
+    	<param name="dataset" type="data" format='tabular' label="Matrix containing TCGA Normal Samples"/>
+    </inputs>
+    <outputs>
+        <data format="tabular" name="output" label="Tumor Sample Matrix"/>
+    </outputs>
+<help>
+.. class:: infomark
+     
+**Remove Normal Samples** - Tool to remove the normal samples from a matrix **(ASSUMES samples are in the columns of matrix)**
+
+**OUTPUT:**  A new matrix without normal samples
+
+----
+
+**INPUT**
+
+- **Matrix that contains normal samples** Input matrix that potentially contains normal samples 
+
+
+</help>
+</tool>
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/select.k.from.consensus.cluster.R
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/select.k.from.consensus.cluster.R	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,338 @@
+#!/usr/bin/env Rscript
+# Consensus Clustering Script by Peter Waltman
+# June 1, 2012
+# License under Creative Commons Attribution 3.0 Unported (CC BY 3.0)
+#
+##usage, options and doc goes here
+argspec <- c("select.k.from.consensus.cluster.R takes a clustering from ConsensusClusterPlus
+and clinical survival data and determines the right k to use.
+
+        Usage: 
+                select.k.from.consensus.cluster.R -r <results_file>
+        Optional:
+                -o cluster.class.out (RData output; default is select.cls.rda)
+                -c change.min (default is 0.075)
+                -m metric (must be either:
+                                        difference (default),
+                                        angle,
+                                        silhouette (needs a data matrix in the results file)
+                                        survival (must specify survival data; uses minimal cumulative log-rank p-value)
+                -p output.report.dir (default is report); -h output.report.html (default is index.html in the report dir)
+                -s survival.data.fname (plotted, but not used unless specified)
+                -e survival.comp (can be either all, one or both - see the mode param for gen.survival.curves for explanation)
+                -z survival analysis script to be called (defaults to galaxy.gen.survival.curves.R)
+\n\n")
+args <- commandArgs( trailingOnly=TRUE )
+if ( identical( args, "--help" ) ) {  ## a lone --help prints the usage text and quits
+  write( argspec, file=stderr() )
+  q()
+}
+
+## Attach a package, given as a bare (unquoted) name, without its start-up messages
+lib.load.quiet <- function( package ) {
+   suppressPackageStartupMessages( library( as.character( substitute( package ) ), character.only=TRUE ) )
+}
+lib.load.quiet( getopt )
+lib.load.quiet( amap )
+lib.load.quiet( cluster )
+
+## One row per option: long name, short flag, getopt argument mask (1=required, 2=optional), value type
+spec <- matrix( c( "results.file",        "r", 1, "character",
+                   "change.min",          "c", 2, "double",
+                   "metric",              "m", 2, "character",
+                   "survival.data",       "s", 2, "character",
+                   "survival.comp",       "e", 2, "character",
+                   "survival.script",     "z", 2, "character",
+                   "output.format",       "f", 2, "character",
+                   "cluster.class.out",   "o", 2, "character",
+                   "output.report.dir",   "p", 2, "character",
+                   "output.report.html",  "h", 2, "character" ),
+                ncol=4,
+                byrow=TRUE
+               )
+opt <- getopt( spec=spec )
+
+## default params for non-required params
+if ( is.null( opt$output.report.dir ) ) { opt$output.report.dir <- "report" }
+## the HTML report defaults to index.html inside the report dir, so that -p without -h stays consistent
+if ( is.null( opt$output.report.html ) ) { opt$output.report.html <- file.path( opt$output.report.dir, "index.html" ) }
+if ( is.null( opt$change.min ) ) { opt$change.min <- 0.075 }
+if ( is.null( opt$metric ) ) { opt$metric <- "difference" } ## alternates: angle, silhouette, survival
+if ( is.null( opt$survival.comp ) ) { opt$survival.comp <- "all" } ## alternate is one or both
+if ( is.null( opt$survival.script ) ) { opt$survival.script <- "galaxy.gen.survival.curves.R" }
+if ( is.null( opt$cluster.class.out) ) { opt$cluster.class.out <- "select.cls.rda" }
+
+if ( !file.exists( opt$output.report.dir ) ){
+    dir.create(opt$output.report.dir)
+}
+
+if ( ! opt$metric %in% c( "difference", "angle", "silhouette", "survival" ) ) {
+  stop( "invalid metric specified ", opt$metric, "\n" )
+}
+
+## change.min is validated for every metric; for the angle metric it is forced to be negative
+opt$change.min <- as.numeric( opt$change.min )
+if ( abs( opt$change.min ) > 1 ) {
+  stop( "invalid change.min specified: ", opt$change.min, ". Please specify a value in the range [-1,1]\n" )
+}
+if ( opt$metric=="angle" && opt$change.min > 0 ) {
+  opt$change.min <- -opt$change.min
+  cat( "Using", opt$change.min, "for minimum angle\n" )
+}
+
+if ( opt$metric == "survival" &&
+     ( is.null( opt$survival.data ) ||
+       (! file.exists( opt$survival.data ) ) )
+    ) {
+  stop( "Must provide valid survival file in order to use survival as metric\n" )
+}
+
+
+## From the ConsensusClusterPlus package - modified by phw
+## Plots (optionally) the empirical CDF of the consensus indices for every k and returns
+##   areaK  - area under each CDF curve, one entry per k (first entry is k=2)
+##   deltaK - areaK[1] followed by the relative change in area versus the previous k
+## ml: list of consensus matrices; if its first element is the matrix named "2", a
+##     placeholder is prepended so that list position i holds the matrix for k = i
+CDF <- function( ml,
+                 breaks=1000,
+                 plot.it=TRUE ){
+  ## is.matrix/identical instead of class()== / ==: class() of a matrix has length 2
+  ## since R 4.0 and names() may be NULL, either of which breaks the scalar && test
+  if ( is.matrix( ml[[1]] ) && identical( names( ml )[1], "2" ) ) {
+    ml <- c( c(0), ml )
+  }
+  ##plot CDF distribution
+  if ( plot.it ) {
+    plot( c(0),
+          xlim=c(0,1),
+          ylim=c(0,1),
+          col="white",
+          bg="white",
+          xlab="consensus index",
+          ylab="CDF",
+          main="consensus CDF",
+          las=2 )
+  }
+
+  k <- length( ml )
+  this_colors <- rainbow( k-1 )
+  areaK <- numeric( k-1 )
+  for ( i in seq_len( k )[-1] ) {
+    v <- ml[[i]]
+    v <- v[ lower.tri(v) ]
+
+    ## empirical CDF: cumulative histogram over `breaks` equal-width bins on [0,1]
+    h <- hist( v, plot=FALSE, breaks=seq( 0, 1, by=1/breaks ) )
+    h$counts <- cumsum( h$counts ) / sum( h$counts )
+
+    ## area under the CDF curve by the histogram method: sum of height * bin width
+    areaK[i-1] <- sum( h$counts * diff( h$breaks ) )
+    if ( plot.it ) lines( h$mids, h$counts, col=this_colors[i-1], lwd=2, type='l' )
+  }
+  if ( plot.it ) legend( 0.8, 0.5, legend=as.character( seq( 2, k, by=1 ) ), fill=this_colors )
+
+  ## first entry is the AUC at k=2; the rest are proportional increases relative to the
+  ## prior k.  diff() yields nothing for a single k, where 2:length(areaK) ran backwards
+  deltaK <- c( areaK[1], diff( areaK ) / areaK[ -length( areaK ) ] )
+  return ( list( areaK=areaK, deltaK=deltaK ) )
+}
+
+
+load( opt$results.file )  ## supplies `results` and, when it was saved alongside, the matrix `data`
+## bare exists('data') is always TRUE (it finds the utils function data()), hence the is.matrix test
+if ( opt$metric == "silhouette" ) {
+  if ( ! ( exists( 'data' ) && is.matrix( data ) ) ) {
+    stop( "Must provide valid data matrix in order to use silhouette as metric\n" )
+  }
+}
+## results[[1]] is skipped; position i of results is treated as the result for k = i
+cons.matrices <- lapply( results[ 2:length(results) ], '[[', 'consensusMatrix' )
+cls <- lapply( results[ 2:length(results) ], function( res ) res$consensusClass[ res$consensusTree$order ] )
+names( cons.matrices ) <- names( cls ) <- 2:length( results )
+png.fname <- file.path( opt$output.report.dir, "consensus.sel.criteria.CDF.png")
+plot.dev <- png( png.fname,
+                 width=11,
+                 height=8.5,
+                 units='in',
+                 res=72 )
+##  this will calculate the CDF, plus plot them
+rel.delta <- CDF( cons.matrices, breaks=1000, plot.it=TRUE )$deltaK
+dev.off()
+names( rel.delta ) <- seq( from=2, by=1, length.out=length( rel.delta ) )
+vector.of.metric.changes <- rel.delta
+
+## plot the per-k change in area under the CDF curve
+## (the k=2 entry is the value CDF() puts first in deltaK, i.e. the AUC at k=2 itself)
+main.txt <- "Relative Change in Area, per Size K"
+ylab.txt <- "relative change in area under CDF curve"
+
+png.fname <- file.path( opt$output.report.dir, "consensus.sel.criteria.diff.png")
+
+plot.dev <- png( png.fname,
+                 width=11,
+                 height=8.5,
+                 units='in',
+                 res=72 )
+plot( as.numeric( names( vector.of.metric.changes ) ),
+      vector.of.metric.changes,
+      main=main.txt,
+      ylab=ylab.txt,
+      xlab="Cluster size (K)",
+      type='b' )
+dev.off()
+
+## pick the first k whose change drops below change.min;
+## when no k does, fall back to the largest k
+below.min <- which( vector.of.metric.changes < opt$change.min )
+if ( length( below.min ) > 0 ) {
+  k.select <- vector.of.metric.changes[ below.min[1] ]
+} else {
+  k.select <- vector.of.metric.changes[ length( vector.of.metric.changes ) ]
+}
+k.select <- as.numeric( names( k.select ) )
+
+## find the search range: k.select +/- 2, restricted to the k's that were evaluated
+## and whose change lies strictly between 0.025 and 0.25; the range is kept as a
+## vector of names so that it can index the per-k result vectors
+k.search.range <- (k.select-2):(k.select+2)
+k.search.range <- k.search.range[ k.search.range %in% as.numeric( names( vector.of.metric.changes ) ) ]
+k.search.range <- vector.of.metric.changes[ as.character( k.search.range ) ]
+k.search.range <- k.search.range[ k.search.range < 0.25 & k.search.range > 0.025 ]
+k.search.range <- names( k.search.range )
+
+
+if ( exists("data") && ! is.function( data ) ) {  ## bare exists("data") is always TRUE: it finds utils::data()
+  ## what direction is the clustering in? rows or cols?
+  elts <- unique( names( results[[2]]$consensusClass ) )
+  if ( all( elts %in% colnames( data ) ) ) {
+    ## sample clusters
+    data.dist <- dist( t( data ) )
+    cls <- lapply( cls, function( x ) x[ colnames( data ) ] )
+  } else if ( all( elts  %in% rownames( data ) ) ) {
+    data.dist <- dist( data )
+    cls <- lapply( cls, function( x ) x[ rownames( data ) ] )
+  } else {
+    stop( "incompatible cluster results and data matrix\n" )
+  }
+
+  ## NOTE(review): dist() is called with its default method, yet the plot label below
+  ## says "correlation distance" - confirm which distance is intended
+  ## average silhouette width for every k (vapply guarantees one number per k)
+  sils <- lapply( cls,
+                  silhouette,
+                  dist=data.dist )
+  sils <- vapply( sils,
+                  function(x) summary( x )$avg.width,
+                  numeric(1) )
+
+  png.fname <- file.path( opt$output.report.dir, "consensus.sel.silhouette.png")
+
+  plot.dev <- png( png.fname,
+                   width=11,
+                   height=8.5,
+                   units='in',
+                   res=72 )
+  plot( as.numeric( names( sils ) ),
+        sils,
+        main="Average Silhouette Widths, per Cluster Size K",
+        ylab="average silhouette width (correlation distance)",
+        xlab="Cluster size (K)",
+        type='b' )
+  dev.off()
+
+  ## if the metric is silhouette, use that (but only over the k's that are on the rel-change "elbow")
+  if ( opt$metric == "silhouette" ) {
+    names( sils ) <- names( cls )
+    sils <- sils[ k.search.range ]
+    ## which.max picks a single k even with ties/NAs; an empty range keeps the k chosen above
+    if ( any( ! is.na( sils ) ) ) k.select <- as.numeric( names( sils )[ which.max( sils ) ] )
+  }
+}
+
+## Survival handling: when the metric is "survival", k is re-selected by log-rank
+## p-value; in every case the class assignments for the selected k are written out,
+## and with survival data the survival script is run once more for the final k.
+if ( ! is.null( opt$survival.data ) ) {
+  if ( ! file.exists( opt$survival.data ) ) {
+    stop( 'specified clinical/survival file can not be found:', opt$survival.data, "\n" )
+  }
+
+  if ( opt$metric == "survival" ) {
+    ## one p-value per k, obtained by running the survival script on each class vector
+    pvals <- numeric()
+    for ( cl in cls ) {
+      cons.class.file <- tempfile( "tmp.class.rdata" )
+      save( file=cons.class.file, cl )
+
+      ## file names are shell-quoted so that paths containing blanks survive system()
+      ## NOTE(review): this call uses upper-case flags (-C -S -M -P) while the final call
+      ## below uses lower-case ones (-c -s -m -o) - confirm against the survival script
+      cmd.string <- paste( opt$survival.script,
+                           "-C", shQuote( cons.class.file ),
+                           "-S", shQuote( opt$survival.data ),
+                           "-M", opt$survival.comp,
+                           "-P" )
+      pvals <- c( pvals, as.numeric( system( cmd.string, intern=TRUE ) ) )
+      unlink( cons.class.file )  ## the temporary class file is no longer needed
+    }
+    names( pvals ) <- names( cls )
+
+    png.fname <- file.path( opt$output.report.dir, "consensus.sel.criteria.survival.png" )
+
+    plot.dev <- png( png.fname,
+                     width=11,
+                     height=8.5,
+                     units='in',
+                     res=72 )
+    plot( as.numeric( names( pvals ) ),
+          -log( pvals ),
+          main="Average Log-rank p-values (-log), per Cluster Size K",
+          ylab="Average log-rank p-values (-log)",
+          xlab="Cluster size (K)",
+          type='b' )
+    dev.off()
+
+    ## choose the k with the smallest p-value inside the search range; which.min yields
+    ## a single k even with ties/NAs, and an empty range keeps the k selected earlier
+    pvals <- pvals[ k.search.range ]
+    if ( any( ! is.na( pvals ) ) ) k.select <- as.numeric( names( pvals )[ which.min( pvals ) ] )
+  }
+}
+
+## write the class assignments for the selected k as a 2-column table (ID, class)
+cl <- cls[[ as.character( k.select ) ]]
+cl <- cbind( names( cl ), as.integer(cl) )
+colnames( cl ) <- c( "ID", "class" )
+write.table( cl, opt$cluster.class.out, sep="\t", row.names=FALSE, quote=FALSE )
+
+if ( ! is.null( opt$survival.data ) ) {
+  ## run the survival script on the selected clustering; its plot goes to the report dir
+  survival.out.file <- file.path( opt$output.report.dir, "survival.png" )
+  cmd.string <- paste( opt$survival.script,
+                       "-c", shQuote( opt$cluster.class.out ),
+                       "-s", shQuote( opt$survival.data ),
+                       "-m", opt$survival.comp,
+                       "-o", shQuote( survival.out.file ) )
+  ## echo whatever the survival script printed
+  output <- system( cmd.string, intern=TRUE )
+  cat( output, sep="\n" )
+}
+
+treecl.res <- results[[ k.select ]]$consensusTree
+## cl should already exist, but re-create it just in case
+cl <- cls[[ as.character( k.select ) ]]
+select.result <- results[[ k.select ]]
+
+## over-write the tabular version of the opt$cluster.class.out with an RData file;
+## `data` is added only when a real object was loaded (utils::data() is a function)
+save.objs <- c( "treecl.res", "cl", "select.result" )
+if ( exists( "data" ) && ! is.function( data ) ) save.objs <- c( save.objs, "data" )
+save( list=save.objs, file=opt$cluster.class.out )
+
+## minimal HTML report: the selected k followed by every PNG found in the report directory
+report_str <- paste( "k selected by consensus clustering and user-specified metric, ", opt$metric, ", is ", k.select, "\n", sep="" )
+pngs <- list.files( path=opt$output.report.dir, pattern="\\.png$" )
+html.out <- paste( "<html>", report_str, paste( "<div><img src=\'", pngs, "\'/></div>", sep="", collapse="" ), "</html>" )
+cat( html.out, file=opt$output.report.html )
+
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/select.k.from.consensus.cluster.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/select.k.from.consensus.cluster.py	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,17 @@
+#!/usr/bin/env python
+import os
+import sys
+import subprocess
+
+select_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "select.k.from.consensus.cluster.R")
+survival_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "gen.survival.curves.R")
+
+# Run the R script next to this wrapper; the survival script's path is appended as -z.
+cmd_args = [ "Rscript", select_script_path ] + sys.argv[1:] + [ "-z", survival_script_path ]
+print cmd_args
+proc = subprocess.Popen( cmd_args, stderr=subprocess.PIPE, stdout=subprocess.PIPE )
+(stdoutdata, stderrdata) = proc.communicate()
+sys.stdout.write(stdoutdata)
+if proc.returncode:
+	sys.stderr.write(stderrdata)  # the child's stderr is relayed only when it failed
+	sys.exit(proc.returncode)  # propagate the failure instead of always exiting 0
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/select.k.from.consensus.cluster.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/select.k.from.consensus.cluster.xml	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,70 @@
+<tool id="select_consensus_k" name="Select Consensus Clustering K" force_history_refresh="True">
+    <command interpreter="python">select.k.from.consensus.cluster.py
+-r ${cc_results}
+-c ${change_min}
+-m $metric_cond.metric
+-h $report 
+-p ${report.files_path}
+#if str($metric_cond.metric) == 'survival':
+-s $metric_cond.survival
+-e $metric_cond.mode
+#end if
+-o ${cluster_class}
+    </command>
+    <inputs>
+        <param name="cc_results" type="data" format="rdata" label="Consensus Clustering Results File" help="Specify the result from a consensus clustering run (see help below)"/>
+	<conditional name="metric_cond">
+	  <param name="metric" type="select" label="Change Metric" help="Specify the choice of 'change' metric to use">
+            <option value="difference">Difference</option>       
+            <option value="silhouette">silhouette</option>
+            <option value="survival">survival</option>
+	  </param>
+	  <when value="survival">
+	    <param name="survival" type="data" label="Clinical Data" optional="true" help="Specify Clinical data to use"/>
+	    <param name="mode" type="select" label="Survival Report Mode" help="Mode to use when performing Log-Rank tests">
+	      <option value="all">All</option>
+	      <option value="one">One</option>
+	      <option value="both">Both</option>
+	    </param>    
+	  </when>
+	</conditional>
+	<param name="change_min" type="float" label="Minimum relative change (AUC; pivot point for search)" value="0.1" help="Specify threshold to determine the minimum relative change"/>
+
+    </inputs>
+    <outputs>
+        <data format="rdata" name="cluster_class" label="Select K from Consensus Clustering Report (RData)"/>
+        <data format="html" name="report" label="Select K from Consensus Clustering Report (HTML)"/>        
+    </outputs>
+<help>
+.. class:: infomark
+     
+**Select a choice of K from a consensus clustering result**
+
+----
+
+**Parameters**
+
+- **Consensus Clustering Results File** Specify the result from a consensus clustering run (**MUST BE THE RDATA** output)
+
+- **Change Metric** Specify the choice of 'change' metric to use.  Choice of:
+
+         * Difference (relative difference in AUC between different choices of K)
+         * Silhouette (Max average silhouette width - search anchored by 'Change Min')
+         * Survival (Min p-value from Log-rank tests - search anchored by 'Change Min') **NOTE: WE DISCOURAGE USING THIS METRIC.**
+
+- **Change Min** Threshold to determine the minimum relative change that's used to decide the choice of K
+
+
+- **IF 'SURVIVAL' IS THE CHANGE METRIC, the following become available:**
+
+- **Clinical Data** Specify Clinical data to use for performing the Log-Rank tests if 'Survival' is the metric 
+        *  **(MUST SPECIFY A PROPERLY FORMATTED CLINICAL DATA FILE** - See the "Format Raw TCGA sample IDs")
+
+- **Survival Report Mode** Mode to use when performing Log-Rank tests.  Choice of:
+
+         * All - All clusters versus each other
+         * One - One cluster versus a meta-cluster composed of the others.  Search performed exhaustively.
+         * Both - Perform both the all-v-all and the one-v-others tests; select the choice of K that gives the best (lowest) p-value
+
+</help>
+</tool>
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/tab.2.cdt.R
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/tab.2.cdt.R	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,75 @@
+#!/usr/bin/env Rscript
+## tab.2.cdt.R: convert a tab-delimited data matrix into a simplified CDT table.
+##
+## Reads the file given by -d (its first column is taken as the row names) and
+## writes a tab-separated table made of:
+##   - a header row:   UNIQID, NAME, GWEIGHT, then the data column names
+##   - an EWEIGHT row: a weight of 1 for every data column
+##   - one row per input row: identifier (twice), a weight of 1, the values
+## No clustering is performed; this is a pure format conversion.
+argspec <- c("tab.2.cdt.R converts a data matrix to cdt format
+
+        Usage: 
+                tab.2.cdt.R -d <data.file> 
+        Optional:
+                            -o <output_file>
+                \n\n")
+args <- commandArgs(TRUE)
+## Print the usage text and quit when --help is the sole argument.
+if ( length( args ) == 1 && args =="--help") { 
+  write(argspec, stderr())
+  q();
+}
+
+## Attach a package without echoing its startup messages.
+##   package: package name, given unquoted (e.g. lib.load.quiet( getopt ))
+lib.load.quiet <- function( package ) {
+   package <- as.character(substitute(package))
+   suppressPackageStartupMessages( do.call( "library", list( package=package ) ) )
+}
+lib.load.quiet(getopt)
+
+## getopt spec columns: long name, short flag, argument mask, value type
+spec <- matrix( c( "data.fname",      "d", 1, "character",
+                   "output.fname",    "o", 2, "character"
+                   ),
+                ncol=4,
+                byrow=TRUE
+               )
+
+opt <- getopt( spec=spec )
+if ( is.null( opt$data.fname ) ) {
+  write( argspec, stderr() )
+  stop( "no input file given; -d <data.file> is required", call.=FALSE )
+}
+if ( is.null( opt$output.fname ) ) {
+  opt$output.fname <- sub( "tab$|csv$", "cdt", opt$data.fname )
+  ## No tab/csv suffix to swap: append one instead of overwriting the input.
+  if ( opt$output.fname == opt$data.fname ) {
+    opt$output.fname <- paste( opt$data.fname, "cdt", sep="." )
+  }
+}
+
+data <- as.matrix( read.delim( opt$data.fname, row.names=1, check.names=FALSE ) )
+if ( ncol( data ) < 1 ) stop( "input matrix has no data columns", call.=FALSE )
+
+if ( is.numeric( data ) ) {
+  ## Plain numeric matrix: the row names are the only identifiers, and every
+  ## column is data.  (Previously the first data column was consumed as the
+  ## ID, which dropped it from the output and lost the real row names.)
+  ids <- rownames( data )
+  values <- data
+} else {
+  ## Non-numeric matrix: keep the historical layout, where the first remaining
+  ## column supplies UNIQID/NAME and only the columns after it are data.
+  ids <- data[, 1]
+  values <- data[, -1, drop=FALSE]
+}
+
+cdt <- cbind( ids, ids, rep( 1, nrow( values ) ), values )
+cdt <- rbind( c( "UNIQID", "NAME", "GWEIGHT", colnames( values ) ),
+              c( "EWEIGHT", "", NA, rep( 1, ncol( values ) ) ),
+              cdt )
+
+## The header is already the first row, so dimnames are not written.
+write.table( cdt, opt$output.fname, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE )
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/tab.2.cdt.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/tab.2.cdt.py	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+import os
+import sys
+import subprocess
+
+select_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "tab.2.cdt.R")
+
+cmd_args = [ "Rscript", select_script_path ] + sys.argv[1:] 
+
+proc = subprocess.Popen( cmd_args, stderr=subprocess.PIPE, stdout=subprocess.PIPE )
+(stdoutdata, stderrdata) = proc.communicate()
+if proc.returncode:
+	sys.stderr.write(stderrdata)
+sys.stdout.write(stdoutdata)
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/tab.2.cdt.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/tab.2.cdt.xml	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,28 @@
+<tool id="tab_2_cdt" name="Convert tab-delimitted to CDT" force_history_refresh="True">
+    <command interpreter="python">tab.2.cdt.py
+-d $dataset 
+-o ${output}
+
+</command>
+    <inputs>
+    	<param name="dataset" type="data" label="Tab-delimited Data Matrix" help="Specify a data matrix in tab-delimited format to be converted into CDT"/>
+    </inputs>
+    <outputs>
+        <data format="tabular" name="output" label="CDT File"/>
+    </outputs>
+<help>
+.. class:: infomark
+     
+**Convert tab-delimited to CDT** - Tool to convert a data matrix into a simplified CDT format that can be read by TreeView.
+**NOTE:** NO CLUSTERING is performed on the data matrix.  The tool is a simple data conversion utility.
+
+**OUTPUT:**  A new CDT file
+
+----
+
+**Parameters**
+
+- **Matrix in tab-delimited format** Tab-delimited file
+
+</help>
+</tool>