Mercurial > repos > peter-waltman > ucsc_cluster_tools2
changeset 8:a58527c632b7 draft
Uploaded
author | peter-waltman |
---|---|
date | Mon, 11 Mar 2013 16:31:29 -0400 |
parents | 2efa1a284546 |
children | a3c03541fe6f |
files | cluster.tools/cluster.tab.2.rdata.R cluster.tools/cluster.tab.2.rdata.py cluster.tools/cluster.tab.2.rdata.xml cluster.tools/consensus.clustering.xml cluster.tools/format.raw.TCGA.RNASeq.data.xml cluster.tools/hclust.R cluster.tools/hclust.xml cluster.tools/heatmap.from.cluster.result.xml cluster.tools/new.ccplus.R cluster.tools/normalize.matrix.by.other.R cluster.tools/normalize.matrix.by.other.py cluster.tools/normalize.matrix.xml cluster.tools/partition.R cluster.tools/partition.xml cluster.tools/rdata.2.out.R cluster.tools/rnaseq.feature.selection.xml cluster.tools/select.k.from.consensus.cluster.R cluster.tools/tab.2.cdt.xml |
diffstat | 16 files changed, 187 insertions(+), 802 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env Rscript
## cluster.tools/cluster.tab.2.rdata.R
##
## Convert a tab-delimited table of cluster assignments into an RData file.
##
## The input is read with read.delim(), so it must have a header line.  Its
## first column supplies the item names and the next column supplies the
## cluster assignments.  The assignments are saved as a named vector called
## `cl`.
argspec <- c("cluster.tab.2.rdata.R converts tab-delimited cluster assignments to an RData file

        Usage:
                cluster.tab.2.rdata.R -d <data.file>
        Optional:
                -o <output_file>   ## default: <data.file> with a .rdata extension
        \n\n")
args <- commandArgs(TRUE)
if (length(args) == 1 && args == "--help") {
  write(argspec, stderr())
  q(save = "no", status = 0)
}


## Load a package (given as a bare name) without its startup messages.
lib.load.quiet <- function(package) {
  package <- as.character(substitute(package))
  suppressPackageStartupMessages(do.call("library", list(package = package)))
}
lib.load.quiet(getopt)


## getopt spec columns: long name, short flag, argument mode, argument type
spec <- matrix(c("data.fname",   "d", 1, "character",
                 "output.fname", "o", 2, "character"),
               ncol = 4,
               byrow = TRUE)

opt <- getopt(spec = spec)

## -d is the only required option; fail with the usage text instead of an
## obscure read.delim() error further down
if (is.null(opt$data.fname)) {
  write(argspec, stderr())
  stop("no input file given (-d <data.file>)", call. = FALSE)
}

if (is.null(opt$output.fname)) {
  ## Derive the output name from the input name.  The result must never be
  ## identical to the input name, otherwise save() would overwrite the input.
  known.ext <- "\\.(tab|csv)$"
  if (grepl(known.ext, opt$data.fname)) {
    opt$output.fname <- sub(known.ext, ".rdata", opt$data.fname)
  } else {
    opt$output.fname <- paste0(opt$data.fname, ".rdata")
  }
}

assignments <- as.matrix(read.delim(opt$data.fname,
                                    row.names = 1,
                                    check.names = FALSE))
if (ncol(assignments) < 1) {
  stop("'", opt$data.fname,
       "' has no cluster assignment column after the item names",
       call. = FALSE)
}

## keep the item names explicitly so they survive even for a one-row table
cl <- assignments[, 1]
names(cl) <- rownames(assignments)

## the object must be called `cl`: that is the name stored in the RData file
save(cl, file = opt$output.fname)
#!/usr/bin/env python
# cluster.tools/cluster.tab.2.rdata.py
"""Wrapper that runs cluster.tab.2.rdata.R with the arguments it was given.

The R script is looked up in the same directory as this file.  Its stdout is
always forwarded; its stderr is forwarded only when it exits with a non-zero
status.  The wrapper exits with the R script's exit status.
"""
import os
import subprocess
import sys


def main(argv=None):
    """Run the companion R script and return its exit status.

    argv -- arguments to pass through to the R script; defaults to
            sys.argv[1:].
    """
    if argv is None:
        argv = sys.argv[1:]

    script_dir = os.path.dirname(os.path.realpath(__file__))
    select_script_path = os.path.join(script_dir, "cluster.tab.2.rdata.R")
    cmd_args = ["Rscript", select_script_path] + list(argv)

    # universal_newlines=True makes communicate() return text instead of
    # bytes, so the writes below work on both Python 2 and Python 3.
    proc = subprocess.Popen(cmd_args,
                            stderr=subprocess.PIPE,
                            stdout=subprocess.PIPE,
                            universal_newlines=True)
    stdoutdata, stderrdata = proc.communicate()

    # stderr is only reported for a failed run
    # NOTE(review): presumably because the caller treats any stderr output as
    # a failure -- confirm before changing.
    if proc.returncode:
        sys.stderr.write(stderrdata)
    sys.stdout.write(stdoutdata)
    return proc.returncode


if __name__ == "__main__":
    sys.exit(main())
<!-- cluster.tools/cluster.tab.2.rdata.xml

     Galaxy tool definition for cluster.tab.2.rdata.py.  Takes one
     tab-delimited dataset (passed as -d) and produces one RData dataset
     (passed as -o). -->
<tool id="cluster_tab_2_rdata" name="Convert tab-delimited Cluster Assignments to RData" force_history_refresh="True">
  <command interpreter="python">cluster.tab.2.rdata.py
-d $dataset
-o ${output}

</command>
  <inputs>
    <param name="dataset" type="data" label="Tab-delimited List of Cluster Assignments"/>
  </inputs>
  <outputs>
    <data format="rdata" name="output" label="Cluster RData File"/>
  </outputs>
<help>
.. class:: infomark

**Convert tab-delimited cluster assignments to RData file** - Tool to convert a cluster assignment in tab-delimited format into an RData file format that can be read by the other tools in the Cluster-Tools Suite, e.g. the Kaplan-Meier Survival Plotting Tools

**OUTPUT:** A new RData file, with the cluster assignments stored as a named numeric vector

----

**Parameters**

- **Tab-delimited List of Cluster Assignments** Tab-delimited file; it MUST have a header line!

</help>
</tool>
--- a/cluster.tools/consensus.clustering.xml Mon Mar 04 04:11:28 2013 -0500 +++ b/cluster.tools/consensus.clustering.xml Mon Mar 11 16:31:29 2013 -0400 @@ -13,7 +13,13 @@ -k ${kmax} -r ${reps} -f ${finalLinkage} --o ${output} +#if str($direction) == "rows": +-o ${rdata_output_rows} +#end if + +#if str($direction) == "cols": +-o ${rdata_output_cols} +#end if -h $report -p ${report.files_path} @@ -89,7 +95,12 @@ </inputs> <outputs> <data format="html" name="report" label="Consensus Clustering Report (HTML)"/> - <data format="rdata" name="output" label="Consensus Clustering Data (RData)"/> + <data format="rdata" name="rdata_output_rows" label="Consensus Clustering Results; Gene Clusters (RData)"> + <filter>(direction)=="rows"</filter> + </data> + <data format="rdata" name="rdata_output_cols" label="Consensus Clustering Results; Sample Clusters (RData)"> + <filter>(direction)=="cols"</filter> + </data> </outputs> <help> .. class:: infomark
--- a/cluster.tools/format.raw.TCGA.RNASeq.data.xml Mon Mar 04 04:11:28 2013 -0500 +++ b/cluster.tools/format.raw.TCGA.RNASeq.data.xml Mon Mar 11 16:31:29 2013 -0400 @@ -26,7 +26,7 @@ **Format Raw TCGA RNASeq Data** - Tool to convert a raw RNASeq TCGA data file (a file from either Synapse or Firehose) into a the format expected by the Survival Analysis tools -**Log-transform data?** -Specify whether or not to log-transform the data matrix. To avoid numeric underflow, this will use log(x+1), where x is the value of the RNASeq data +**Log-transform data?** - Specify whether or not to log-transform the data matrix. To avoid numeric underflow, this will use log(x+1), where x is the value of the RNASeq data **OUTPUT:** A new tab-delimited file containing the log-transformed RNASeq data
--- a/cluster.tools/hclust.R Mon Mar 04 04:11:28 2013 -0500 +++ b/cluster.tools/hclust.R Mon Mar 11 16:31:29 2013 -0400 @@ -39,14 +39,25 @@ opt <- getopt( spec=spec ) +data <- as.matrix( read.delim( opt$data.fname, header=T, row.names=1 , check.names=FALSE ) ) if ( is.null( opt$distance.metric ) ) { opt$distance.metric <- "euclidean" } if ( is.null( opt$dist.obj ) ) { opt$dist.obj <- FALSE } if ( is.null( opt$direction ) ) { opt$direction <- "cols" } if ( is.null( opt$linkage ) ) { opt$linkage <- "average" } -if ( is.null( opt$num.k ) ) { opt$num.k <- 10 } if ( is.null( opt$output.name ) ) { opt$output.name <- "hclust.result.rda" } +if ( is.null( opt$num.k ) || ( opt$num.k == -1 )) { + if ( opt$direction == 'cols' ) { + opt$num.k <- 5 + } else if ( opt$direction == 'rows' ) { + opt$num.k <- nrow( data ) / 30 ## we use an estimated average size of gene clusters to be 30 + if ( opt$num.k > 1000 ) { + opt$num.k <- ( opt$num.k %/% 10 ) * 10 + } else { + opt$num.k <- ( opt$num.k %/% 5 ) * 5 + } + } +} -data <- as.matrix( read.delim( opt$data.fname, header=T, row.names=1 , check.names=FALSE ) ) if ( opt$direction == "cols" ) { ## need to transpose b/c both kmeans & pam cluster the rows ## this shouldn't have an effect upon a distance matrix
--- a/cluster.tools/hclust.xml Mon Mar 04 04:11:28 2013 -0500 +++ b/cluster.tools/hclust.xml Mon Mar 11 16:31:29 2013 -0400 @@ -5,8 +5,18 @@ -n ${direction} -m ${distance_metric} -l ${linkage} + +#if str($numk) != "-1": -k ${numk} --o ${rdata_output} +#end if + +#if str($direction) == "rows": +-o ${rdata_output_rows} +#end if + +#if str($direction) == "cols": +-o ${rdata_output_cols} +#end if </command> <inputs> @@ -41,11 +51,16 @@ <option value="ward">Ward</option> </param> - <param name="numk" type="integer" label="Number of Clusters" value="50" help="Specify the number of clusters to use"/> + <param name="numk" type="integer" label="Number of Clusters" value="-1" help="Specify the number of clusters to use (-1 to use default. See help below)."/> </inputs> <outputs> - <data format="rdata" name="rdata_output" label="Hierarchical Clustering Result (RData)"/> + <data format="rdata" name="rdata_output_rows" label="Hierarchical Clustering Results; Gene Clusters (RData)"> + <filter>(direction)=="rows"</filter> + </data> + <data format="rdata" name="rdata_output_cols" label="Hierarchical Clustering Results; Sample Clusters (RData)"> + <filter>(direction)=="cols"</filter> + </data> </outputs> <help> .. class:: infomark @@ -92,7 +107,9 @@ * McQuity * Ward -- **Number of Clusters** Specify the number of clusters to use +- **Number of Clusters** Specify the number of clusters to use. If set to -1, default values will be used, with the default set as follows: + * if samples/columns are being clustered, the **default** is 5. + * if genes/rows are being clustered, the **default** is set to num_rows/30, e.g. if there are 600 row/genes in the matrix, the default will be 20 clusters. </help> </tool>
--- a/cluster.tools/heatmap.from.cluster.result.xml Mon Mar 04 04:11:28 2013 -0500 +++ b/cluster.tools/heatmap.from.cluster.result.xml Mon Mar 11 16:31:29 2013 -0400 @@ -99,7 +99,7 @@ - **Clustering Classification** Specify the clustering classification (RData file format - use the 'Convert tab-delimited Cluster Assignments to RData" tool to convert assignments in tab-delimited format). -- **Plot Kaplan-Meiers Survival Plot as well (primary clustering ONLY)?** Specify whether or not to also plot a Kaplan-Meiers Surivial Plot. **NOTE*, the cluster results must be a **SAMPLE** cluster. +- **Plot Kaplan-Meiers Survival Plot as well (primary clustering ONLY)?** Specify whether or not to also plot a Kaplan-Meiers Surivial Plot. **NOTE**, the cluster results must be a **SAMPLE** cluster. - **Cluster the second dimension?** Specify whether or not to cluster the 2nd dimension of matrix in the cluster result. Choice of: * No
--- a/cluster.tools/new.ccplus.R Mon Mar 04 04:11:28 2013 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,773 +0,0 @@ -##!/usr/bin/env Rscript -## Consensus Clustering Script by Peter Waltman -## May 31, 2011 -## License under Creative Commons Attribution 3.0 Unported (CC BY 3.0) -## -#usage, options and doc goes here -argspec <- c("consensus.clustering.R takes a clustering from ConsensusClusterPlus and clinical survival data -and generates a KM-plot, along with the log-rank p-values - - Usage: - consensus.clustering.R -d <data.file> - Optional: - -o <output.name> - -a <cluster.alg> ## must be either 'hc' or 'km' - -m <distance.metric> ## must be one supported by ConsensusClusterPlus - -k <max.k> - -r <reps> - -f <filter> ## filter, o/w no filtering - - \n\n") -args <- commandArgs(TRUE) -if ( length( args ) == 1 && args =="--help") { - write(argspec, stderr()) - q(); -} - -require(getopt) -##require(ConsensusClusterPlus) -## if any of the faster clustering methods are available on this system, load them -require( amap ) -require( cluster ) -if ( any( c( 'flashClust', 'fastcluster' ) %in% installed.packages() ) ) { - if ( 'flashClust' %in% installed.packages() ) { - require( flashClust ) - } else { - if ( 'fastcluster' %in% installed.packages() ) { - require( fastcluster ) - } - } -} - -################### -## code borrowed/updated from ConsensusClusterPlus -################### - -ConsensusClusterPlus <- function( d=NULL, - maxK = 3, - reps=10, - pItem=0.8, - pFeature=1, - clusterAlg="hc", - title="untitled_consensus_cluster", - innerLinkage="average", - finalLinkage="average", - distance=ifelse( inherits(d,"dist"), attr( d, "method" ), "euclidean" ), - ml=NULL, - tmyPal=NULL, - seed=NULL, - plot=NULL, - writeTable=FALSE, - weightsItem=NULL, - weightsFeature=NULL, - verbose=F ) { - ##description: runs consensus subsamples - - - if(is.null(seed)==TRUE){ - seed=timeSeed = as.numeric(Sys.time()) - } - set.seed(seed) - - if(is.null(ml)==TRUE){ - - if ( 
inherits( distance, "dist" ) ) { - stop( "If you want to pass in a pre-calculated distance object, pass it in as the data, rather than the distance parameter\n" ) - } - - if ( ! class( d ) %in% c( "dist", "matrix", "ExpressionSet" ) ) { - stop("d must be a matrix, distance object or ExpressionSet (eset object)") - } - - if ( inherits( d, "dist" ) ) { - ## if d is a distance matrix, fix a few things so that they don't cause problems with the analysis - ## Note, assumption is that if d is a distance matrix, the user doesn't want to sample over the row features - if ( is.null( attr( d, "method" ) ) ) { - attr( d, "method" ) <- distance <- "unknown - user-specified" - } - if ( is.null( distance ) || ( distance != attr( d, "method" ) ) ) { - distance <- attr( d, "method" ) - } - - if ( ( ! is.null( pFeature ) ) && ( pFeature < 1 ) ) { - if ( verbose ) warning( "Cannot use the pFeatures parameter when specifying a distance matrix as the data object\n" ) - pFeature <- 1 - } - if ( ! is.null( weightsFeature ) ) { - if ( verbose ) warning( "Cannot use the weightsFeature parameter when specifying a distance matrix as the data object\n" ) - weightsFeature <- NULL - } - if ( clusterAlg == "km" ) { - if ( verbose ) warning( "You are asking CCPLUS to use K-means to cluster a distance matrix (rather than the data itself) - this may produce unintended results. We suggest using PAM if you want to use alternate distance metrics/objects\n" ) - ##d <- as.matrix( d ) #this is now done w/in ccRun - } - } else { - if ( is.null( distance ) ) { - ## we should never get here, but just in case - distance <- "pearson" - } - } - - if ( ( clusterAlg == "km" ) && inherits( distance, "character" ) && ( distance != "euclidean" ) ) { - warning( "WARNING: kmeans can only use the euclidean distance metric. If you would like to use an alternate metric, we suggest using PAM or HC clustering instead. 
This parameter combinationwill use k-means, but will NOT use the specified distance metric\n" ) - distance <- 'euclidean' - } - - - if ( inherits( d,"ExpressionSet" ) ) { - d <- exprs(d) - } - - ml <- ccRun( d=d, - maxK=maxK, - repCount=reps, - diss=inherits(d,"dist"), - pItem=pItem, - pFeature=pFeature, - innerLinkage=innerLinkage, - clusterAlg=clusterAlg, - weightsFeature=weightsFeature, - weightsItem=weightsItem, - distance=distance, - verbose=verbose) - } - res=list(); - - ##make results directory - if((is.null(plot)==FALSE | writeTable) & !file.exists(paste(title,sep=""))){ - dir.create(paste(title,sep="")) - } - - ##write log file - log <- matrix( ncol=2, - byrow=T, - c("title",title, - "maxK",maxK, - "input matrix rows",ifelse ( inherits( d, "matrix" ), nrow(d), "dist-mat" ), - "input matric columns",ifelse ( inherits( d, "matrix" ), ncol(d), ncol( as.matrix(d) ) ), - "number of bootstraps",reps, - "item subsampling proportion",pItem, - "feature subsampling proportion",ifelse( is.null(pFeature), 1, pFeature ), - "cluster algorithm",clusterAlg, - "inner linkage type",innerLinkage, - "final linkage type",finalLinkage, - "correlation method",distance, - "plot",if(is.null(plot)) NA else plot, - "seed",if(is.null(seed)) NA else seed)) - colnames(log) = c("option","value") - if(writeTable){ - write.csv(file=paste(title,"/",title,".log.csv",sep=""), log,row.names=F) - } - if(is.null(plot)){ - ##nothing - }else if(plot=="png"){ - png(paste(title,"/","consensus%03d.png",sep="")) - }else if (plot=="pdf"){ - pdf(onefile=TRUE, paste(title,"/","consensus.pdf",sep="")) - }else if (plot=="ps"){ - postscript(onefile=TRUE, paste(title,"/","consensus.ps",sep="")) - } - - colorList=list() - colorM = rbind() #matrix of colors. 
- - #18 colors for marking different clusters - thisPal <- c("#A6CEE3","#1F78B4","#B2DF8A","#33A02C","#FB9A99","#E31A1C","#FDBF6F","#FF7F00","#CAB2D6","#6A3D9A","#FFFF99","#B15928", - "#bd18ea", #magenta - "#2ef4ca", #aqua - "#f4cced", #pink, - "#f4cc03", #lightorange - "#05188a", #navy, - "#e5a25a", #light brown - "#06f106", #bright green - "#85848f", #med gray - "#000000", #black - "#076f25", #dark green - "#93cd7f",#lime green - "#4d0776", #dark purple - "#ffffff" #white - ) - - ##plot scale - colBreaks=NA - if(is.null(tmyPal)==TRUE){ - colBreaks=10 - tmyPal = myPal(colBreaks) - }else{ - colBreaks=length(tmyPal) - } - sc = cbind(seq(0,1,by=1/( colBreaks) )); rownames(sc) = sc[,1] - sc = cbind(sc,sc) - heatmap(sc, Colv=NA, Rowv=NA, symm=FALSE, scale='none', col=tmyPal, na.rm=TRUE,labRow=rownames(sc),labCol=F,main="consensus matrix legend") - - for (tk in 2:maxK){ - if(verbose){ - message(paste("consensus ",tk)) - } - fm = ml[[tk]] - hc=hclust( as.dist( 1 - fm ), method=finalLinkage); - message("clustered") - ct = cutree(hc,tk) - names(ct) = colnames(d) - c = fm - ##colnames(c) = colnames(d) - ##rownames(c) = colnames(d) - - colorList = setClusterColors(res[[tk-1]][[3]],ct,thisPal,colorList) - - pc = c - pc=pc[hc$order,] #***pc is matrix for plotting, same as c but is row-ordered and has names and extra row of zeros. 
- pc = rbind(pc,0) - - heatmap(pc, Colv=as.dendrogram(hc), Rowv=NA, symm=FALSE, scale='none', col=tmyPal, na.rm=TRUE,labRow=F,labCol=F,mar=c(5,5),main=paste("consensus matrix k=",tk,sep="") , ColSideCol=colorList[[1]]) - legend("topright",legend=unique(ct),fill=unique(colorList[[1]]),horiz=FALSE ) - - res[[tk]] = list(consensusMatrix=c,consensusTree=hc,consensusClass=ct,ml=ml[[tk]],clrs=colorList) - colorM = rbind(colorM,colorList[[1]]) - } - CDF(ml) - clusterTrackingPlot(colorM[,res[[length(res)]]$consensusTree$order]) - if(is.null(plot)==FALSE){ - dev.off(); - } - res[[1]] = colorM - if(writeTable){ - for(i in 2:length(res)){ - write.csv(file=paste(title,"/",title,".k=",i,".consensusMatrix.csv",sep=""), res[[i]]$consensusMatrix) - write.table(file=paste(title,"/",title,".k=",i,".consensusClass.csv",sep=""), res[[i]]$consensusClass,col.names = F,sep=",") - } - } - return(res) -} - - -calcICL = function(res,title="untitled_consensus_cluster",plot=NULL,writeTable=FALSE){ - #calculates and plots cluster consensus and item consensus - cc=rbind() - cci = rbind() - sumRes=list() - colorsArr=c() - - #make results directory - if((is.null(plot)==FALSE | writeTable) & !file.exists(paste(title,sep=""))){ - dir.create(paste(title,sep="")) - } - if(is.null(plot)){ - #to screen - }else if(plot=="pdf"){ - pdf(onefile=TRUE, paste(title,"/","icl.pdf",sep="")) - }else if(plot=="ps"){ - postscript(onefile=TRUE, paste(title,"/","icl.ps",sep="")) - }else if (plot=="png"){ - png(paste(title,"/","icl%03d.png",sep="")) - } - - par(mfrow=c(3,1),mar=c(4,3,2,0)) - - for (k in 2:length(res)){ #each k - eiCols = c(); - o = res[[k]] - m = o$consensusMatrix - m = triangle(m,mode=2) - for (ci in sort(unique(o$consensusClass))){ #each cluster in k - items = which(o$consensusClass==ci) - nk = length(items) - mk = sum( m[items,items], na.rm=T)/((nk*(nk-1))/2) - cc=rbind(cc,c(k,ci,mk)) #cluster-consensus - - for (ei in rev(res[[2]]$consensusTree$order) ){ - denom = if (ei %in% items) { nk - 1} else 
{ nk } - mei = sum( c(m[ei,items],m[items,ei]), na.rm=T)/denom # mean item consensus to a cluster. - cci = rbind(cci,c(k,ci,ei,mei)) #cluster, cluster index, item index, item-consensus - } - eiCols = c(eiCols, rep(ci,length(o$consensusClass)) ) - } - - cck = cci[which(cci[,1]==k),] #only plot the new k data. - - #group by item, order by cluster i - w=lapply(split(cck,cck[,3]), function(x) { y=matrix(unlist(x),ncol=4); y[order(y[,2]),4] }) - q = matrix(as.numeric(unlist(w)),ncol=length(w),byrow=F) - q = q[,res[[2]]$consensusTree$order] #order by leave order of k=2 - #q is a matrix of k rows and sample columns, values are item consensus of sample to the cluster. - - thisColors = unique(cbind(res[[k]]$consensusClass,res[[k]]$clrs[[1]])) - thisColors=thisColors[order(as.numeric(thisColors[,1])),2] - colorsArr=c(colorsArr,thisColors) - sumRes[[k]] = rankedBarPlot(q,thisColors,cc=res[[k]]$consensusClass[res[[2]]$consensusTree$order],paste("k=",k,sep="") ) - } - - ys=cs=lab=c() - lastk=cc[1,1] - for(i in 1:length(colorsArr)){ - if(lastk != cc[i,1]){ - ys=c(ys,0,0) - cs=c(cs,NA,NA) - lastk=cc[i,1] - lab=c(lab,NA,NA) - } - ys=c(ys,cc[i,3]) - cs=c(cs,colorsArr[i]) - lab=c(lab,cc[i,1]) - } - names(ys) = lab - par(mfrow=c(3,1),mar=c(4,3,2,0)) - barplot(ys,col=cs,border=cs,main="cluster-consensus",ylim=c(0,1),las=1) - if(is.null(plot)==FALSE){ - dev.off() - } - colnames(cc) = c("k","cluster","clusterConsensus") - colnames(cci) = c("k","cluster","item","itemConsensus") - cci[,"item"] = names(res[[2]]$consensusClass)[ cci[,"item"] ] - #type cci - cci = data.frame( k=as.numeric(cci[,"k"]), cluster=as.numeric(cci[,"cluster"]), item=cci[,"item"], itemConsensus=as.numeric(cci[,"itemConsensus"])) - - #write to file. 
- if(writeTable){ - write.csv(file=paste(title,"/",title,".summary.cluster.consensus.csv",sep=""),row.names=F, cc) - write.csv(file=paste(title,"/",title,".summary.item.consensus.csv",sep=""), row.names=F, cc) - } - return(list(clusterConsensus=cc,itemConsensus=cci)) -} - - -ccRun <- function( d=d, - maxK=NULL, - repCount=NULL, - diss=inherits( d, "dist" ), - pItem=NULL, - pFeature=NULL, - innerLinkage=NULL, - distance=ifelse( inherits(d,"dist"), attr( d, "method" ), "euclidean" ), - clusterAlg=NULL, - weightsItem=NULL, - weightsFeature=NULL, - verbose=NULL) { - m = vector(mode='list', repCount) - ml = vector(mode="list",maxK) - n <- ifelse( diss, ncol( as.matrix(d) ), ncol(d) ) - mCount = mConsist = matrix(c(0),ncol=n,nrow=n) - ml[[1]] = c(0); - - if (is.null( distance ) ) distance <- 'euclidean' ## necessary if d is a dist object and attr( d, "method" ) == NULLa - - require( amap ) - ## we're going to use the amap Dist function, but they misname their correlation - ## functions, so re-name them correctly - amap.distance <- c( "euclidean", "maximum", "manhattan", "canberra", "binary", - "pearson", "abspearson", "correlation", "abscorrelation", "spearman", "kendall" ) - names( amap.distance ) <- c( "euclidean", "maximum", "manhattan", "canberra", "binary", - "cosine", "abscosine", "pearson", "abspearson", "spearman", "kendall" ) - main.dist.obj <- NULL - ##browser() - if ( diss ){ - main.dist.obj <- d - - ## reset the pFeature & weightsFeature params if they've been set (irrelevant if d is a dist matrix) - if ( ( !is.null(pFeature) ) && - ( pFeature < 1 ) ) { - if (verbose) warning( "user-supplied data is a distance matrix; ignoring user-specified pFeature parameter\n" ) - pFeature <- 1 # set it to 1 to avoid problems with sampleCols - } - if ( ! 
is.null( weightsFeature ) ) { - if (verbose) warning( "user-supplied data is a distance matrix; ignoring user-specified weightsFeature parameter\n" ) - weightsFeature <- NULL # set it to NULL to avoid problems with sampleCols - } - } else { ## d is a data matrix - ## we're not sampling over the features - if ( ( clusterAlg != "km" ) && - ( is.null( pFeature ) || - ( ( pFeature == 1 ) && is.null( weightsFeature ) ) ) ) { - ## only generate a main.dist.object IFF 1) d is a matrix, 2) we're not sampling the features, and 3) the algorithm isn't 'km' - if ( inherits( distance, "character" ) ) { - if ( ! distance %in% names( amap.distance ) ) stop("unsupported distance.") - - main.dist.obj <- Dist( t(d), method=as.character( amap.distance[ distance ] ) ) - ## now fix dumb amap naming convention for distance metrics - attr( main.dist.obj, "method" ) <- as.character( amap.distance[ distance ] ) - } else stop("unsupported distance specified.") - } else { - ## pFeature < 1 or a weightsFeature != NULL - ## since d is a data matrix, the user wants to sample over the gene features, so main.dist.obj is left as NULL - } - } - - - for (i in 1:repCount){ - ##browser() - if(verbose){ - message(paste("random subsample",i)); - } - ## take expression matrix sample, samples and genes - sample_x = sampleCols( d, pItem, pFeature, weightsItem, weightsFeature ) - - this_dist = NA - if ( ! 
is.null( main.dist.obj ) ) { - boot.cols <- sample_x$subcols - this_dist <- as.matrix( main.dist.obj )[ boot.cols, boot.cols ] - if ( clusterAlg != "km" ) { - ## if this isn't kmeans, then convert to a distance object - this_dist <- as.dist( this_dist ) - attr( this_dist, "method" ) <- attr( main.dist.obj, "method" ) - } - } else { - ## if main.dist.obj is NULL, then d is a data matrix, and either: - ## 1) clusterAlg is 'km' - ## 2) pFeatures < 1 or weightsFeatures have been specified, or - ## 3) both - ## so we can't use a main distance object and for every iteration, we will have to re-calculate either - ## 1) the distance matrix (because we're also sampling the features as well), or - ## 2) the submat (if using km) - - if ( clusterAlg != "km" ) { - if ( ! distance %in% names( amap.distance ) ) stop("unsupported distance.") - ## good, we have a supported distance type - this_dist <- Dist( t( sample_x$submat ), method=as.character( amap.distance[ distance ] ) ) - ## now fix dumb amap naming convention for distance metrics - attr( this_dist, "method" ) <- as.character( amap.distance[ distance ] ) - } else { - ##browser() - ##clusterAlg == "km" - ## if we're not sampling the features, then grab the colslice - if ( is.null( pFeature ) || - ( ( pFeature == 1 ) && is.null( weightsFeature ) ) ) { - this_dist <- d[, sample_x$subcols ] - } else { - if ( is.na( sample_x$submat ) ) { - save( "ccrun.submat.eq.na.dbg.rda" ) - stop( "Houston, we have a problem. sample_x$submat is NA in ccRun when it should be specified - saving state\n" ) - } - - this_dist <- sample_x$submat - } - } - } - - ## cluster samples for HC. - this_cluster=NA - if(clusterAlg=="hc"){ - this_cluster = hclust( this_dist, method=innerLinkage) - } - ##browser() - ##mCount is possible number of times that two sample occur in same random sample, independent of k - ##mCount stores number of times a sample pair was sampled together. 
- mCount <- connectivityMatrix( rep( 1,length(sample_x[[3]])), - mCount, - sample_x[[3]] ) - - ##use samples for each k - for (k in 2:maxK){ - if(verbose){ - message(paste(" k =",k)) - } - if (i==1){ - ml[[k]] = mConsist #initialize - } - this_assignment=NA - if(clusterAlg=="hc"){ - ##prune to k for hc - this_assignment = cutree(this_cluster,k) - ##browser() - }else if(clusterAlg=="km"){ - ##this_dist should now be a matrix corresponding to the result from sampleCols - this_assignment <- kmeans( t( this_dist ), - k, - iter.max = 10, - nstart = 1, - algorithm = c("Hartigan-Wong") )$cluster - }else if ( clusterAlg == "pam" ) { - require( cluster ) - this_assignment <- pam( x=this_dist, - k, - diss=TRUE, - metric=distance, - cluster.only=TRUE ) - } else{ - ##optional cluterArg Hook. - this_assignment <- get(clusterAlg)(this_dist, k) - } - ##add to tally - ml[[k]] <- connectivityMatrix( this_assignment, - ml[[k]], - sample_x[[3]] ) - } - } - - - ##consensus fraction - res = vector(mode="list",maxK) - for (k in 2:maxK){ - ##fill in other half of matrix for tally and count. 
- tmp = triangle(ml[[k]],mode=3) - tmpCount = triangle(mCount,mode=3) - res[[k]] = tmp / tmpCount - res[[k]][which(tmpCount==0)] = 0 - } - message("end fraction") - return(res) -} - - -connectivityMatrix <- function( clusterAssignments, m, sampleKey){ - ##input: named vector of cluster assignments, matrix to add connectivities - ##output: connectivity matrix - names( clusterAssignments ) <- sampleKey - cls <- lapply( unique( clusterAssignments ), function(i) as.numeric( names( clusterAssignments[ clusterAssignments %in% i ] ) ) ) - - for ( i in 1:length( cls ) ) { - nelts <- 1:ncol( m ) - cl <- as.numeric( nelts %in% cls[[i]] ) ## produces a binary vector - updt <- outer( cl, cl ) - m <- m + updt - } - return(m) -} - -## returns a list with the sample columns, as well as the sub-matrix & sample features (if necessary) -## if no sampling over the features is performed, the submatrix & sample features are returned as NAs -## to reduce memory overhead -sampleCols <- function( d, - pSamp=NULL, - pRow=NULL, - weightsItem=NULL, - weightsFeature=NULL ){ - space <- ifelse( inherits( d, "dist" ), ncol( as.matrix(d) ), ncol(d) ) - sampleN <- floor(space*pSamp) - sampCols <- sort( sample(space, sampleN, replace = FALSE, prob = weightsItem) ) - - this_sample <- sampRows <- NA - if ( inherits( d, "matrix" ) ) { - if ( (! is.null( pRow ) ) && - ( (pRow < 1 ) || (! 
is.null( weightsFeature ) ) ) ) { - ## only sample the rows and generate a sub-matrix if we're sampling over the row/gene/features - space = nrow(d) - sampleN = floor(space*pRow) - sampRows = sort( sample(space, sampleN, replace = FALSE, prob = weightsFeature) ) - this_sample <- d[sampRows,sampCols] - dimnames(this_sample) <- NULL - } else { - ## do nothing - } - } - return( list( submat=this_sample, - subrows=sampRows, - subcols=sampCols ) ) -} - -CDF=function(ml,breaks=100){ - #plot CDF distribution - plot(c(0),xlim=c(0,1),ylim=c(0,1),col="white",bg="white",xlab="consensus index",ylab="CDF",main="consensus CDF", las=2) - k=length(ml) - this_colors = rainbow(k-1) - areaK = c() - for (i in 2:length(ml)){ - v=triangle(ml[[i]],mode=1) - - #empirical CDF distribution. default number of breaks is 100 - h = hist(v, plot=FALSE, breaks=seq(0,1,by=1/breaks)) - h$counts = cumsum(h$counts)/sum(h$counts) - - #calculate area under CDF curve, by histogram method. - thisArea=0 - for (bi in 1:(length(h$breaks)-1)){ - thisArea = thisArea + h$counts[bi]*(h$breaks[bi+1]-h$breaks[bi]) #increment by height by width - bi = bi + 1 - } - areaK = c(areaK,thisArea) - lines(h$mids,h$counts,col=this_colors[i-1],lwd=2,type='l') - } - legend(0.8,0.5,legend=paste(rep("",k-1),seq(2,k,by=1),sep=""),fill=this_colors) - - #plot area under CDF change. - deltaK=areaK[1] #initial auc at k=2 - for(i in 2:(length(areaK))){ - #proportional increase relative to prior K. 
- deltaK = c(deltaK,( areaK[i] - areaK[i-1])/areaK[i-1]) - } - plot(1+(1:length(deltaK)),y=deltaK,xlab="k",ylab="relative change in area under CDF curve",main="Delta area",type="b") -} - - -myPal = function(n=10){ - #returns n colors - seq = rev(seq(0,255,by=255/(n))) - palRGB = cbind(seq,seq,255) - rgb(palRGB,maxColorValue=255) -} - -setClusterColors = function(past_ct,ct,colorU,colorList){ - #description: sets common color of clusters between different K - newColors = c() - if(length(colorList)==0){ - #k==2 - newColors = colorU[ct] - colori=2 - }else{ - newColors = rep(NULL,length(ct)) - colori = colorList[[2]] - mo=table(past_ct,ct) - m=mo/apply(mo,1,sum) - for(tci in 1:ncol(m)){ # for each cluster - maxC = max(m[,tci]) - pci = which(m[,tci] == maxC) - if( sum(m[,tci]==maxC)==1 & max(m[pci,])==maxC & sum(m[pci,]==maxC)==1 ) { - #if new column maximum is unique, same cell is row maximum and is also unique - ##Note: the greatest of the prior clusters' members are the greatest in a current cluster's members. - newColors[which(ct==tci)] = unique(colorList[[1]][which(past_ct==pci)]) # one value - }else{ #add new color. - colori=colori+1 - newColors[which(ct==tci)] = colorU[colori] - } - } - } - return(list(newColors,colori,unique(newColors) )) -} - -clusterTrackingPlot = function(m){ - #description: plots cluster tracking plot - #input: m - matrix where rows are k, columns are samples, and values are cluster assignments. - plot(NULL,xlim=c(-0.1,1),ylim=c(0,1),axes=FALSE,xlab="samples",ylab="k",main="tracking plot") - for(i in 1:nrow(m)){ - rect( xleft=seq(0,1-1/ncol(m),by=1/ncol(m)), ybottom=rep(1-i/nrow(m),ncol(m)) , xright=seq(1/ncol(m),1,by=1/ncol(m)), ytop=rep(1-(i-1)/nrow(m),ncol(m)), col=m[i,],border=NA) - } - #hatch lines to indicate samples - xl = seq(0,1-1/ncol(m),by=1/ncol(m)) - segments( xl, rep(-0.1,ncol(m)) , xl, rep(0,ncol(m)), col="black") #** alt white and black color? 
- ypos = seq(1,0,by=-1/nrow(m))-1/(2*nrow(m)) - text(x=-0.1,y=ypos[-length(ypos)],labels=seq(2,nrow(m)+1,by=1)) -} - -triangle = function(m,mode=1){ - #mode=1 for CDF, vector of lower triangle. - #mode==3 for full matrix. - #mode==2 for calcICL; nonredundant half matrix coun - #mode!=1 for summary - n=dim(m)[1] - nm = matrix(0,ncol=n,nrow=n) - fm = m - - - nm[upper.tri(nm)] = m[upper.tri(m)] #only upper half - - fm = t(nm)+nm - diag(fm) = diag(m) - - nm=fm - nm[upper.tri(nm)] = NA - diag(nm) = NA - vm = m[lower.tri(nm)] - - if(mode==1){ - return(vm) #vector - }else if(mode==3){ - return(fm) #return full matrix - }else if(mode == 2){ - return(nm) #returns lower triangle and no diagonal. no double counts. - } - -} - - -rankedBarPlot=function(d,myc,cc,title){ - colors = rbind() #each row is a barplot series - byRank = cbind() - - spaceh = 0.1 #space between bars - for(i in 1:ncol(d)){ - byRank = cbind(byRank,sort(d[,i],na.last=F)) - colors = rbind(colors,order(d[,i],na.last=F)) - } - maxH = max(c(1.5,apply(byRank,2,sum)),na.rm=T) #maximum height of graph - - #barplot largest to smallest so that smallest is in front. 
- barp = barplot( apply(byRank,2,sum) , col=myc[colors[,1]] ,space=spaceh,ylim=c(0,maxH),main=paste("item-consensus", title),border=NA,las=1 ) - for(i in 2:nrow(byRank)){ - barplot( apply(matrix(byRank[i:nrow(byRank),],ncol=ncol(byRank)) ,2,sum), space=spaceh,col=myc[colors[,i]],ylim=c(0,maxH), add=T,border=NA,las=1 ) - } - xr=seq(spaceh,ncol(d)+ncol(d)*spaceh,(ncol(d)+ncol(d)*spaceh)/ncol(d) ) - #class labels as asterisks - text("*",x=xr+0.5,y=maxH,col=myc[cc],cex=1.4) #rect(xr,1.4,xr+1,1.5,col=myc[cc] ) -} - - - -###################################################################3333 -## RESTART MY SCRIPTS HERE -##save.image( '/home/waltman/work.local/tmp/new.ccplus.R.dbg' ) -stop( "phw forced stop\n") -spec <- matrix( c( "data.fname", "d", 1, "character", - "direction", "n", 2, "character", - "output.name", "o", 2, "character", - "cluster.alg", "a", 2, "character", ## must be either 'hc' or 'km' - "distance.metric", "m", 2, "character", ## must be one supported by ConsensusClusterPlus - "max.k", "k", 2, "integer", - "reps", "r", 2, "integer", - "innerLinkage", "i", 1, "character", - "finalLinkage", "f", 1, "character", - "out.report.dir", "p", 2, "character", - "out.report.html", "h", 2, "character" - ), - nc=4, - byrow=TRUE - ) - -opt <- getopt( spec=spec ) - -## default params for non-required params -if ( is.null( opt$direction ) ) { opt$direction <- "cols" } -if ( is.null( opt$cluster.alg ) ) { opt$cluster.alg <- "pam" } -if ( is.null( opt$output.name ) ) { opt$output.name <- "consensus.cluster.result" } -if ( is.null( opt$distance.metric ) ) { opt$distance.metric <- "cosine" } -if ( is.null( opt$max.k ) ) { opt$max.k <- 10 } -if ( is.null( opt$reps ) ) { opt$reps <- 1000 } -if ( is.null( opt$innerLinkage ) ) { opt$innerLinkage <- "average" } -if ( is.null( opt$finalLinkage ) ) { opt$finalLinkage <- "average" } - -if ( is.null( opt$out.report.dir ) ) { opt$out.report.dir <- "report" } -if ( is.null( opt$out.report.html ) ) { opt$out.report.html <- file.path( 
"report", "index.html" ) } - -## validate params here (make sure set to valid values) -if ( !opt$cluster.alg %in% c( "hc", "km", "pam" ) ) { - stop( "invalid clustering algorithm specified", cluster.alg ) -} - - -data <- as.matrix( read.delim( opt$data.fname, header=T, row.names=1 , check.names=FALSE ) ) -## transpose the matrix if we want to cluster the rows (genes) -if ( opt$direction == "rows" ) { - data <- t( data ) -} - - -title <- paste( opt$cluster.alg, opt$output.name, sep="." ) -##source( '~/bin/galaxy-dist/tools/ucsc.cancer.tools/cluster.tools/new.ccplus.R' ) -results <- ConsensusClusterPlus( data, - maxK=opt$max.k, - reps=opt$reps, - pItem=0.8, - ##pFeature=NULL, - pFeature=0.5, - title=opt$out.report.dir, - clusterAlg=opt$cluster.alg, - distance=opt$distance.metric, - innerLinkage=opt$innerLinkage, - finalLinkage=opt$finalLinkage, - plot='pdf', - writeTable=FALSE, - seed=100, - weightsFeature=abs( rnorm( nrow( orig.data ) ) ), - ##verbose=FALSE ) - verbose=TRUE ) - -pngs = list.files(path=opt$out.report.dir, patt="png") -html.out <- paste( "<html>", - paste( paste( "<div><img src=\'", pngs, sep="" ), "\'/></div>", sep="" ), - "</html>" ) -cat( html.out, file=opt$out.report.html ) - - -## re-transpose the matrix back if we've clustered the rows (genes) -if ( opt$direction == "rows" ) { - data <- t( data ) -} -save( file=opt$output.name, data, results)
--- a/cluster.tools/normalize.matrix.xml Mon Mar 04 04:11:28 2013 -0500 +++ b/cluster.tools/normalize.matrix.xml Mon Mar 11 16:31:29 2013 -0400 @@ -64,7 +64,7 @@ * Median Absolute Deviation * Standard Deviation -- **Variance Adjustment for Rows** Variance Adjustment Method for Columns +- **Variance Adjustment for Cols** Variance Adjustment Method for Columns * No Adjustment * Median Absolute Deviation
--- a/cluster.tools/partition.R Mon Mar 04 04:11:28 2013 -0500 +++ b/cluster.tools/partition.R Mon Mar 11 16:31:29 2013 -0400 @@ -36,15 +36,26 @@ ) opt <- getopt( spec=spec ) +data <- as.matrix( read.delim( opt$data.fname, header=T, row.names=1 , check.names=FALSE ) ) if ( is.null( opt$distance.metric ) ) { opt$distance.metric <- "euclidean" } if ( is.null( opt$algorithm ) ) { opt$algorithm <- "km" } if ( is.null( opt$dist.obj ) ) { opt$dist.obj <- FALSE } if ( is.null( opt$direction ) ) { opt$direction <- "cols" } -if ( is.null( opt$num.k ) ) { opt$num.k <- 10 } if ( is.null( opt$output.name ) ) { opt$output.name <- "partition.result" } +if ( is.null( opt$num.k ) || ( opt$num.k == -1 )) { + if ( opt$direction == 'cols' ) { + opt$num.k <- 5 + } else if ( opt$direction == 'rows' ) { + opt$num.k <- nrow( data ) / 30 ## we use an estimated average size of gene clusters to be 30 + if ( opt$num.k > 1000 ) { + opt$num.k <- ( opt$num.k %/% 10 ) * 10 + } else { + opt$num.k <- ( opt$num.k %/% 5 ) * 5 + } + } +} -data <- as.matrix( read.delim( opt$data.fname, header=T, row.names=1 , check.names=FALSE ) ) if ( opt$direction == "cols" ) { ## need to transpose b/c both kmeans & pam cluster the rows
--- a/cluster.tools/partition.xml Mon Mar 04 04:11:28 2013 -0500 +++ b/cluster.tools/partition.xml Mon Mar 11 16:31:29 2013 -0400 @@ -6,8 +6,19 @@ -a $alg_cond.algorithm #if $alg_cond.algorithm == 'pam' # -m ${alg_cond.distance_metric} #end if + +#if str($numk) != "-1": -k ${numk} --o ${output} +#end if + +#if str($direction) == "rows": +-o ${rdata_output_rows} +#end if + +#if str($direction) == "cols": +-o ${rdata_output_cols} +#end if + </command> <inputs> @@ -40,11 +51,16 @@ </param> </when> </conditional> - <param name="numk" type="integer" label="Number of Clusters" value="50" help="Specify the number of clusters to use"/> + <param name="numk" type="integer" label="Number of Clusters" value="-1" help="Specify the number of clusters to use (-1 to use default. See help below)."/> </inputs> <outputs> - <data format="rdata" name="output" label="Partition Clustering Data (RData)"/> + <data format="rdata" name="rdata_output_rows" label="Partition Clustering Results; Gene Clusters (RData)"> + <filter>(direction)=="rows"</filter> + </data> + <data format="rdata" name="rdata_output_cols" label="Partition Clustering Results; Sample Clusters (RData)"> + <filter>(direction)=="cols"</filter> + </data> </outputs> <help> .. class:: infomark @@ -87,7 +103,9 @@ * Binary -- **Number of Clusters** Specify the number of clusters to use +- **Number of Clusters** Specify the number of clusters to use. If set to -1, default values will be used, with the default set as follows: + * if samples/columns are being clustered, the **default** is 5. + * if genes/rows are being clustered, the **default** is set to num_rows/30, e.g. if there are 600 row/genes in the matrix, the default will be 20 clusters. </help> </tool>
--- a/cluster.tools/rdata.2.out.R Mon Mar 04 04:11:28 2013 -0500 +++ b/cluster.tools/rdata.2.out.R Mon Mar 11 16:31:29 2013 -0400 @@ -12,6 +12,15 @@ q(); } +## some helper fn's +write.2.tab <- function( mat, + fname ) { + mat <- rbind( colnames( mat ), mat ) + mat <- cbind( c( "ID", rownames( mat )[-1] ), + mat ) + write.table( mat, fname, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE ) +} + lib.load.quiet <- function( package ) { package <- as.character(substitute(package)) suppressPackageStartupMessages( do.call( "library", list( package=package ) ) ) @@ -51,11 +60,10 @@ if ( opt$output.format %in% c( "cls-only", "newick" ) ) { if ( opt$output.format == "cls-only" ) { - cl <- cbind( names( cl ), as.numeric( cl ) ) - colnames( cl ) <- c( "ID", "Class" ) + cl <- matrix( as.numeric( cl ), nc=1, dimnames=list( names(cl), "Class" ) ) + opt$output.fname <- gsub( "cls-only$", "tab", opt$output.fname ) - opt$output.fname <- gsub( "cls-only$", "tab", opt$output.fname ) - write.table( cl, opt$output.fname, sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE ) + write.2.tab( cl, opt$output.fname ) } else { ##if ( opt$output.format == "newick" ) {
--- a/cluster.tools/rnaseq.feature.selection.xml Mon Mar 04 04:11:28 2013 -0500 +++ b/cluster.tools/rnaseq.feature.selection.xml Mon Mar 11 16:31:29 2013 -0400 @@ -35,18 +35,17 @@ **Parameters** - **Z-transform data?** - Specify whether or not to Z-transform the rows (mean=0, sd=1) -- **Variance Metric for Genes** - Specify Metric to use for calculating Gene Variance. Choice of: +- **Variance Metric for Genes** - Specify Metric to use for calculating Gene Variance. Choice of * Median Absolute Deviation (MAD) * Maximum Absolute Deviation - similar to MAD, but uses the _Maximum_, instead of the Median Absolute Deviatioin * Standard Deviation - - **Percentage of Samples Passing** Percent of samples with an IPL that passes the threshold. Choice of: + + - **Percentage of Samples Passing** Percent of samples with an IPL that passes the threshold. Choice of * Integer Value - indicate the exact number of genes that are to be kept * Real Value in [0,1] - indicate the percentage of genes that are to be kept - - </help> </tool>
--- a/cluster.tools/select.k.from.consensus.cluster.R Mon Mar 04 04:11:28 2013 -0500 +++ b/cluster.tools/select.k.from.consensus.cluster.R Mon Mar 11 16:31:29 2013 -0400 @@ -319,12 +319,19 @@ write.table( cl, opt$cluster.class.out, sep="\t", row.names=FALSE, quote=FALSE ) } -treecl.res <- results[[ k.select ]]$consensusTree ## cl should already exist, but re-create it just in case cl <- cls[[ as.character( k.select ) ]] - +treecl.res <- results[[ k.select ]]$consensusTree +select.result <- results[[ k.select ]] -select.result <- results[[ k.select ]] +if ( length(cl) == ncol(data) ) { + names( cl ) <- treecl.res$labels <- select.result$consensusTree$labels <- colnames(data) +} else if ( length(cl) == nrow(data) ) { + names( cl ) <- treecl.res$labels <- select.result$consensusTree$labels <- rownames(data) +} else { + stop( "Number of clustered elements not equal to either number of rows or columns of data matrix\n" ) +} + ## over-write the tabular version of the opt$cluster.class.out with an RData file save( file=opt$cluster.class.out, treecl.res, cl, select.result, data )
--- a/cluster.tools/tab.2.cdt.xml Mon Mar 04 04:11:28 2013 -0500 +++ b/cluster.tools/tab.2.cdt.xml Mon Mar 11 16:31:29 2013 -0400 @@ -14,7 +14,7 @@ .. class:: infomark **Convert tab-delimitted to CDT** - Tool to convert a data matrix into a simplified CDT format that can be read by TreeView --**NOTE** NO CLUSTERING performed on data matrix. Tool is a simple data conversion utility. +- **NOTE** NO CLUSTERING performed on data matrix. Tool is a simple data conversion utility. **OUTPUT:** A new CDT file