# HG changeset patch
# User peter-waltman
# Date 1362033939 18000
# Node ID 0decf3fd54bc4570a98078388015db0c88eb6747
Uploaded
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/cluster.2.centroid.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/cluster.2.centroid.R Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,137 @@
+#!/usr/bin/env Rscript
+argspec <- c("tab.2.cdt.R converts a data matrix to cdt format
+
+ Usage:
+
+ Optional:
+
+ \n\n")
+args <- commandArgs(TRUE)
+if ( length( args ) == 1 && args =="--help") {
+ write(argspec, stderr())
+ q();
+}
+
+lib.load.quiet <- function( package ) {
+ package <- as.character(substitute(package))
+ suppressPackageStartupMessages( do.call( "library", list( package=package ) ) )
+}
+
+lib.load.quiet( getopt )
+lib.load.quiet( amap )
+
+if ( any( c( 'flashClust', 'fastcluster' ) %in% installed.packages() ) ) {
+ if ( 'flashClust' %in% installed.packages() ) {
+ lib.load.quiet( flashClust )
+ } else {
+ if ( 'fastcluster' %in% installed.packages() ) {
+ lib.load.quiet( fastcluster )
+ }
+ }
+}
+
+spec <- matrix( c( "dataset", "d", 1, "character",
+ "gen.new.dgram", "g", 2, "character",
+ "output.fname", "o", 2, "character"
+ ),
+ nc=4,
+ byrow=TRUE
+ )
+
+
+opt <- getopt( spec=spec )
+if ( is.null( opt$output.report.dir ) ) {
+ opt$output.report.dir <- "report"
+
+ if (! file.exists( opt$output.report.dir ) ) {
+ dir.create( opt$output.report.dir )
+ } else {
+ if ( ! file.info( 'report' )$isdir ) {
+ opt$output.report.dir <- 'heatmap.report'
+ dir.create( opt$output.report.dir )
+ }
+ }
+}
+if ( is.null( opt$output.fname ) ) { opt$output.fname <- file.path( opt$output.report.dir, paste( "data.RData", sep="." ) ) }
+if ( is.null( opt$gen.new.dgram ) ) {
+ opt$gen.new.dgram <- FALSE
+} else {
+ if ( ! opt$gen.new.dgram %in% c( "no", "yes" ) ) {
+ stop( "invalid input to gen.new.dgram param", opt$gen.new.dgram, "\n" )
+ }
+ ## set to TRUE/FALSE
+ opt$gen.new.dgram <- ( opt$gen.new.dgram == "yes" )
+}
+
+
+load( opt$dataset ) ## should load the cl, treecl.res (or partcl.res) and data
+
+if ( ! exists( 'data' ) ) stop( "No data object in the rdata file provided for", opt$output.format, "format!!\n" )
+if ( inherits( data, "dist" ) ) stop( "data provided is a distance matrix - not a data matrix. Can't generate TreeView or Tab-delimited files w/distance matrices!\n" )
+
+## the rest of this is for the remaining output formats
+## pre-set the cluster results for rows & cols to NULL
+direction <- NULL
+if ( exists( 'treecl.res' ) ) {
+ cl.res <- treecl.res
+ if ( is.null( treecl.res$dist.method ) ) treecl.res$dist.method <- 'euclidean' # just set it to some stub so that the ctc fn's don't complain
+} else {
+ if ( exists( 'partcl.res' ) ) {
+ cl.res <- partcl.res
+ }
+ else {
+ stop( 'could not find a valid cluster result to use for primary direction\n' )
+ }
+}
+
+if ( all( names( cl ) %in% rownames( data ) ) ) {
+ direction <- "rows"
+} else if ( all( names( cl ) %in% colnames( data ) ) ) {
+ direction <- "cols"
+ data <- t( data )
+} else {
+ stop( "Specified cluster result does not come from this data set\n" )
+}
+
+
+centroids <- NULL
+cl <- sort( cl )
+if ( inherits( cl.res, "kmeans" ) ) {
+ ## already comes pre-calculated for us!!
+ centroids <- cl.res$centers
+} else {
+ data <- data[ names( cl ), ]
+ cl.list <- unique( cl )
+ cl.list <- lapply( cl.list, function(i) cl[ cl %in% i ] )
+ centroids <- sapply( cl.list,
+ function(x) {
+ return( apply( data[ names(x), , drop=F ], 2, mean, na.rm=T ) )
+ }
+ )
+ centroids <- t( centroids ) ## get them back to the same number of columns that data has now
+}
+
+data <- centroids
+rownames( data ) <- sapply( 1:max( cl ), function(i) sprintf( "cluster-%02d", i ) )
+
+if ( opt$gen.new.dgram ) {
+ distance <- 'euclidean'
+ if ( inherits( cl.res, 'hclust' ) ) {
+ distance <- cl.res$dist.method
+ }
+ amap.distance <- c( "euclidean", "maximum", "manhattan", "canberra", "binary",
+ "pearson", "abspearson", "correlation", "abscorrelation", "spearman", "kendall" )
+ names( amap.distance ) <- c( "euclidean", "maximum", "manhattan", "canberra", "binary",
+ "cosine", "abscosine", "pearson", "abspearson", "spearman", "kendall" )
+
+ if ( ! distance %in% names( amap.distance ) ) stop("unsupported distance.")
+ dist.mat <- Dist( data, method=as.character( amap.distance[ distance ] ) )
+ treecl.res <- hclust( dist.mat )
+ cl <- cutree( treecl.res, nrow(data) )
+}
+
+if ( direction == "cols" ) {
+ data <- t( data )
+}
+
+save( file=opt$output.fname, treecl.res, cl, data )
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/cluster.2.centroid.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/cluster.2.centroid.py Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+import os
+import sys
+import subprocess
+
+select_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "cluster.2.centroid.R")
+
+cmd_args = [ "Rscript", select_script_path ] + sys.argv[1:]
+
+proc = subprocess.Popen( cmd_args, stderr=subprocess.PIPE, stdout=subprocess.PIPE )
+(stdoutdata, stderrdata) = proc.communicate()
+if proc.returncode:
+ sys.stderr.write(stderrdata)
+sys.stdout.write(stdoutdata)
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/cluster.2.centroid.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/cluster.2.centroid.xml Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,41 @@
+
+ cluster.2.centroid.py
+-d $dataset
+-g ${gen_new_dgram}
+-o ${output_fname}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**Calculate Matrix of Cluster Centroids** - Tool to calculate a new matrix containing the cluster centroids, NOTE, this tool will automatically detect whether the dimensionality of the clusters (rows or columns).
+
+**OUTPUT:** A new CDT file
+
+----
+
+**Parameters**
+
+- **Cluster Result** - Specify the cluster result to analayze (MUST BE IN rdata format), and must contain the same objects that are produced by the 'Partition Clustering,' 'Hierarchical Clustering (HAC),' or 'Consensus Clustering' tools. Specifically, it must contain the following objects
+
+ * A 'treecl.res' or 'partcl.res' object - corresponding to whether the cluster results is from a partition or tree clustering method
+ * A 'data' object that contains the data that was passed into the clustering method. NOTE, it is better for this to be the actual data passed in, rather than the data prior to the pre-processing that was performed prior to the actual clustering.
+
+- **Re-calculate cluster tree for new matrix?** - Specify whether or not to re-calculate a dendrogram for the cluster centroid matrix.
+ * IF the cluster result was a tree cluster, the new dendrogram will use the distance method from the original clustering
+ * IF the cluster result was a partition cluster, the dendrogram will use 'Euclidean' distance
+
+
+
+
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/consensus.clustering.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/consensus.clustering.R Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,774 @@
+#!/usr/bin/env Rscript
+## Consensus Clustering Script by Peter Waltman
+## May 31, 2011
+## License under Creative Commons Attribution 3.0 Unported (CC BY 3.0)
+##
+#usage, options and doc goes here
+argspec <- c("consensus.clustering.R takes a clustering from ConsensusClusterPlus and clinical survival data
+and generates a KM-plot, along with the log-rank p-values
+
+ Usage:
+ consensus.clustering.R -d
+ Optional:
+ -o
+ -a ## must be either 'hc' or 'km'
+ -m ## must be one supported by ConsensusClusterPlus
+ -k
+ -r
+ -f ## filter, o/w no filtering
+
+ \n\n")
+args <- commandArgs(TRUE)
+if ( length( args ) == 1 && args =="--help") {
+ write(argspec, stderr())
+ q();
+}
+
+lib.load.quiet <- function( package ) {
+ package <- as.character(substitute(package))
+ suppressPackageStartupMessages( do.call( "library", list( package=package ) ) )
+}
+lib.load.quiet(getopt)
+lib.load.quiet( gplots )
+lib.load.quiet( amap )
+## if any of the faster clustering methods are available on this system, load them
+if ( any( c( 'flashClust', 'fastcluster' ) %in% installed.packages() ) ) {
+ if ( 'flashClust' %in% installed.packages() ) {
+ lib.load.quiet( flashClust )
+ } else {
+ if ( 'fastcluster' %in% installed.packages() ) {
+ lib.load.quiet( fastcluster )
+ }
+ }
+}
+##lib.load.quiet(ConsensusClusterPlus)
+lib.load.quiet( amap )
+lib.load.quiet( cluster )
+
+###################
+## code borrowed/updated from ConsensusClusterPlus
+###################
+
+ConsensusClusterPlus <- function( d=NULL,
+ maxK = 3,
+ reps=10,
+ pItem=0.8,
+ pFeature=1,
+ clusterAlg="hc",
+ title="untitled_consensus_cluster",
+ innerLinkage="average",
+ finalLinkage="average",
+ distance=ifelse( inherits(d,"dist"), attr( d, "method" ), "euclidean" ),
+ ml=NULL,
+ tmyPal=NULL,
+ seed=NULL,
+ plot=NULL,
+ writeTable=FALSE,
+ weightsItem=NULL,
+ weightsFeature=NULL,
+ verbose=F ) {
+ ##description: runs consensus subsamples
+
+
+ if(is.null(seed)==TRUE){
+ seed=timeSeed = as.numeric(Sys.time())
+ }
+ set.seed(seed)
+
+ if(is.null(ml)==TRUE){
+
+ if ( inherits( distance, "dist" ) ) {
+ stop( "If you want to pass in a pre-calculated distance object, pass it in as the data, rather than the distance parameter\n" )
+ }
+
+ if ( ! class( d ) %in% c( "dist", "matrix", "ExpressionSet" ) ) {
+ stop("d must be a matrix, distance object or ExpressionSet (eset object)")
+ }
+
+ if ( inherits( d, "dist" ) ) {
+ ## if d is a distance matrix, fix a few things so that they don't cause problems with the analysis
+ ## Note, assumption is that if d is a distance matrix, the user doesn't want to sample over the row features
+ if ( is.null( attr( d, "method" ) ) ) {
+ attr( d, "method" ) <- distance <- "unknown - user-specified"
+ }
+ if ( is.null( distance ) || ( distance != attr( d, "method" ) ) ) {
+ distance <- attr( d, "method" )
+ }
+
+ if ( ( ! is.null( pFeature ) ) && ( pFeature < 1 ) ) {
+ if ( verbose ) warning( "Cannot use the pFeatures parameter when specifying a distance matrix as the data object\n" )
+ pFeature <- 1
+ }
+ if ( ! is.null( weightsFeature ) ) {
+ if ( verbose ) warning( "Cannot use the weightsFeature parameter when specifying a distance matrix as the data object\n" )
+ weightsFeature <- NULL
+ }
+ if ( clusterAlg == "km" ) {
+ if ( verbose ) warning( "You are asking CCPLUS to use K-means to cluster a distance matrix (rather than the data itself) - this may produce unintended results. We suggest using PAM if you want to use alternate distance metrics/objects\n" )
+ ##d <- as.matrix( d ) #this is now done w/in ccRun
+ }
+ } else {
+ if ( is.null( distance ) ) {
+ ## we should never get here, but just in case
+ distance <- "pearson"
+ }
+ }
+
+ if ( ( clusterAlg == "km" ) && inherits( distance, "character" ) && ( distance != "euclidean" ) ) {
+ warning( "WARNING: kmeans can only use the euclidean distance metric. If you would like to use an alternate metric, we suggest using PAM or HC clustering instead. This parameter combinationwill use k-means, but will NOT use the specified distance metric\n" )
+ distance <- 'euclidean'
+ }
+
+
+ if ( inherits( d,"ExpressionSet" ) ) {
+ d <- exprs(d)
+ }
+
+ ml <- ccRun( d=d,
+ maxK=maxK,
+ repCount=reps,
+ diss=inherits(d,"dist"),
+ pItem=pItem,
+ pFeature=pFeature,
+ innerLinkage=innerLinkage,
+ clusterAlg=clusterAlg,
+ weightsFeature=weightsFeature,
+ weightsItem=weightsItem,
+ distance=distance,
+ verbose=verbose)
+ }
+ res=list();
+
+ ##make results directory
+ if((is.null(plot)==FALSE | writeTable) & !file.exists(paste(title,sep=""))){
+ dir.create(paste(title,sep=""))
+ }
+
+ ##write log file
+ log <- matrix( ncol=2,
+ byrow=T,
+ c("title",title,
+ "maxK",maxK,
+ "input matrix rows",ifelse ( inherits( d, "matrix" ), nrow(d), "dist-mat" ),
+ "input matric columns",ifelse ( inherits( d, "matrix" ), ncol(d), ncol( as.matrix(d) ) ),
+ "number of bootstraps",reps,
+ "item subsampling proportion",pItem,
+ "feature subsampling proportion",ifelse( is.null(pFeature), 1, pFeature ),
+ "cluster algorithm",clusterAlg,
+ "inner linkage type",innerLinkage,
+ "final linkage type",finalLinkage,
+ "correlation method",distance,
+ "plot",if(is.null(plot)) NA else plot,
+ "seed",if(is.null(seed)) NA else seed))
+ colnames(log) = c("option","value")
+ if(writeTable){
+ write.csv(file=paste(title,"/",title,".log.csv",sep=""), log,row.names=F)
+ }
+ if(is.null(plot)){
+ ##nothing
+ }else if(plot=="png"){
+ png(paste(title,"/","consensus%03d.png",sep=""))
+ }else if (plot=="pdf"){
+ pdf(onefile=TRUE, paste(title,"/","consensus.pdf",sep=""))
+ }else if (plot=="ps"){
+ postscript(onefile=TRUE, paste(title,"/","consensus.ps",sep=""))
+ }
+
+ colorList=list()
+ colorM = rbind() #matrix of colors.
+
+ #18 colors for marking different clusters
+ thisPal <- c("#A6CEE3","#1F78B4","#B2DF8A","#33A02C","#FB9A99","#E31A1C","#FDBF6F","#FF7F00","#CAB2D6","#6A3D9A","#FFFF99","#B15928",
+ "#bd18ea", #magenta
+ "#2ef4ca", #aqua
+ "#f4cced", #pink,
+ "#f4cc03", #lightorange
+ "#05188a", #navy,
+ "#e5a25a", #light brown
+ "#06f106", #bright green
+ "#85848f", #med gray
+ "#000000", #black
+ "#076f25", #dark green
+ "#93cd7f",#lime green
+ "#4d0776", #dark purple
+ "#ffffff" #white
+ )
+
+ ##plot scale
+ colBreaks=NA
+ if(is.null(tmyPal)==TRUE){
+ colBreaks=10
+ tmyPal = myPal(colBreaks)
+ }else{
+ colBreaks=length(tmyPal)
+ }
+ sc = cbind(seq(0,1,by=1/( colBreaks) )); rownames(sc) = sc[,1]
+ sc = cbind(sc,sc)
+ heatmap(sc, Colv=NA, Rowv=NA, symm=FALSE, scale='none', col=tmyPal, na.rm=TRUE,labRow=rownames(sc),labCol=F,main="consensus matrix legend")
+
+ for (tk in 2:maxK){
+ if(verbose){
+ message(paste("consensus ",tk))
+ }
+ fm = ml[[tk]]
+ hc=hclust( as.dist( 1 - fm ), method=finalLinkage);
+ message("clustered")
+ ct = cutree(hc,tk)
+ names(ct) = colnames(d)
+ c = fm
+ ##colnames(c) = colnames(d)
+ ##rownames(c) = colnames(d)
+
+ colorList = setClusterColors(res[[tk-1]][[3]],ct,thisPal,colorList)
+
+ pc = c
+ pc=pc[hc$order,] #***pc is matrix for plotting, same as c but is row-ordered and has names and extra row of zeros.
+ pc = rbind(pc,0)
+
+ heatmap(pc, Colv=as.dendrogram(hc), Rowv=NA, symm=FALSE, scale='none', col=tmyPal, na.rm=TRUE,labRow=F,labCol=F,mar=c(5,5),main=paste("consensus matrix k=",tk,sep="") , ColSideCol=colorList[[1]])
+ legend("topright",legend=unique(ct),fill=unique(colorList[[1]]),horiz=FALSE )
+
+ res[[tk]] = list(consensusMatrix=c,consensusTree=hc,consensusClass=ct,ml=ml[[tk]],clrs=colorList)
+ colorM = rbind(colorM,colorList[[1]])
+ }
+ CDF(ml)
+ clusterTrackingPlot(colorM[,res[[length(res)]]$consensusTree$order])
+ if(is.null(plot)==FALSE){
+ dev.off();
+ }
+ res[[1]] = colorM
+ if(writeTable){
+ for(i in 2:length(res)){
+ write.csv(file=paste(title,"/",title,".k=",i,".consensusMatrix.csv",sep=""), res[[i]]$consensusMatrix)
+ write.table(file=paste(title,"/",title,".k=",i,".consensusClass.csv",sep=""), res[[i]]$consensusClass,col.names = F,sep=",")
+ }
+ }
+ return(res)
+}
+
+
+calcICL = function(res,title="untitled_consensus_cluster",plot=NULL,writeTable=FALSE){
+ #calculates and plots cluster consensus and item consensus
+ cc=rbind()
+ cci = rbind()
+ sumRes=list()
+ colorsArr=c()
+
+ #make results directory
+ if((is.null(plot)==FALSE | writeTable) & !file.exists(paste(title,sep=""))){
+ dir.create(paste(title,sep=""))
+ }
+ if(is.null(plot)){
+ #to screen
+ }else if(plot=="pdf"){
+ pdf(onefile=TRUE, paste(title,"/","icl.pdf",sep=""))
+ }else if(plot=="ps"){
+ postscript(onefile=TRUE, paste(title,"/","icl.ps",sep=""))
+ }else if (plot=="png"){
+ png(paste(title,"/","icl%03d.png",sep=""))
+ }
+
+ par(mfrow=c(3,1),mar=c(4,3,2,0))
+
+ for (k in 2:length(res)){ #each k
+ eiCols = c();
+ o = res[[k]]
+ m = o$consensusMatrix
+ m = triangle(m,mode=2)
+ for (ci in sort(unique(o$consensusClass))){ #each cluster in k
+ items = which(o$consensusClass==ci)
+ nk = length(items)
+ mk = sum( m[items,items], na.rm=T)/((nk*(nk-1))/2)
+ cc=rbind(cc,c(k,ci,mk)) #cluster-consensus
+
+ for (ei in rev(res[[2]]$consensusTree$order) ){
+ denom = if (ei %in% items) { nk - 1} else { nk }
+ mei = sum( c(m[ei,items],m[items,ei]), na.rm=T)/denom # mean item consensus to a cluster.
+ cci = rbind(cci,c(k,ci,ei,mei)) #cluster, cluster index, item index, item-consensus
+ }
+ eiCols = c(eiCols, rep(ci,length(o$consensusClass)) )
+ }
+
+ cck = cci[which(cci[,1]==k),] #only plot the new k data.
+
+ #group by item, order by cluster i
+ w=lapply(split(cck,cck[,3]), function(x) { y=matrix(unlist(x),ncol=4); y[order(y[,2]),4] })
+ q = matrix(as.numeric(unlist(w)),ncol=length(w),byrow=F)
+ q = q[,res[[2]]$consensusTree$order] #order by leave order of k=2
+ #q is a matrix of k rows and sample columns, values are item consensus of sample to the cluster.
+
+ thisColors = unique(cbind(res[[k]]$consensusClass,res[[k]]$clrs[[1]]))
+ thisColors=thisColors[order(as.numeric(thisColors[,1])),2]
+ colorsArr=c(colorsArr,thisColors)
+ sumRes[[k]] = rankedBarPlot(q,thisColors,cc=res[[k]]$consensusClass[res[[2]]$consensusTree$order],paste("k=",k,sep="") )
+ }
+
+ ys=cs=lab=c()
+ lastk=cc[1,1]
+ for(i in 1:length(colorsArr)){
+ if(lastk != cc[i,1]){
+ ys=c(ys,0,0)
+ cs=c(cs,NA,NA)
+ lastk=cc[i,1]
+ lab=c(lab,NA,NA)
+ }
+ ys=c(ys,cc[i,3])
+ cs=c(cs,colorsArr[i])
+ lab=c(lab,cc[i,1])
+ }
+ names(ys) = lab
+ par(mfrow=c(3,1),mar=c(4,3,2,0))
+ barplot(ys,col=cs,border=cs,main="cluster-consensus",ylim=c(0,1),las=1)
+ if(is.null(plot)==FALSE){
+ dev.off()
+ }
+ colnames(cc) = c("k","cluster","clusterConsensus")
+ colnames(cci) = c("k","cluster","item","itemConsensus")
+ cci[,"item"] = names(res[[2]]$consensusClass)[ cci[,"item"] ]
+ #type cci
+ cci = data.frame( k=as.numeric(cci[,"k"]), cluster=as.numeric(cci[,"cluster"]), item=cci[,"item"], itemConsensus=as.numeric(cci[,"itemConsensus"]))
+
+ #write to file.
+ if(writeTable){
+ write.csv(file=paste(title,"/",title,".summary.cluster.consensus.csv",sep=""),row.names=F, cc)
+ write.csv(file=paste(title,"/",title,".summary.item.consensus.csv",sep=""), row.names=F, cc)
+ }
+ return(list(clusterConsensus=cc,itemConsensus=cci))
+}
+
+
+ccRun <- function( d=d,
+ maxK=NULL,
+ repCount=NULL,
+ diss=inherits( d, "dist" ),
+ pItem=NULL,
+ pFeature=NULL,
+ innerLinkage=NULL,
+ distance=ifelse( inherits(d,"dist"), attr( d, "method" ), "euclidean" ),
+ clusterAlg=NULL,
+ weightsItem=NULL,
+ weightsFeature=NULL,
+ verbose=NULL) {
+ m = vector(mode='list', repCount)
+ ml = vector(mode="list",maxK)
+ n <- ifelse( diss, ncol( as.matrix(d) ), ncol(d) )
+ mCount = mConsist = matrix(c(0),ncol=n,nrow=n)
+ ml[[1]] = c(0);
+
+ if (is.null( distance ) ) distance <- 'euclidean' ## necessary if d is a dist object and attr( d, "method" ) == NULLa
+
+ require( amap )
+ ## we're going to use the amap Dist function, but they misname their correlation
+ ## functions, so re-name them correctly
+ amap.distance <- c( "euclidean", "maximum", "manhattan", "canberra", "binary",
+ "pearson", "abspearson", "correlation", "abscorrelation", "spearman", "kendall" )
+ names( amap.distance ) <- c( "euclidean", "maximum", "manhattan", "canberra", "binary",
+ "cosine", "abscosine", "pearson", "abspearson", "spearman", "kendall" )
+ main.dist.obj <- NULL
+ ##browser()
+ if ( diss ){
+ main.dist.obj <- d
+
+ ## reset the pFeature & weightsFeature params if they've been set (irrelevant if d is a dist matrix)
+ if ( ( !is.null(pFeature) ) &&
+ ( pFeature < 1 ) ) {
+ if (verbose) warning( "user-supplied data is a distance matrix; ignoring user-specified pFeature parameter\n" )
+ pFeature <- 1 # set it to 1 to avoid problems with sampleCols
+ }
+ if ( ! is.null( weightsFeature ) ) {
+ if (verbose) warning( "user-supplied data is a distance matrix; ignoring user-specified weightsFeature parameter\n" )
+ weightsFeature <- NULL # set it to NULL to avoid problems with sampleCols
+ }
+ } else { ## d is a data matrix
+ ## we're not sampling over the features
+ if ( ( clusterAlg != "km" ) &&
+ ( is.null( pFeature ) ||
+ ( ( pFeature == 1 ) && is.null( weightsFeature ) ) ) ) {
+ ## only generate a main.dist.object IFF 1) d is a matrix, 2) we're not sampling the features, and 3) the algorithm isn't 'km'
+ if ( inherits( distance, "character" ) ) {
+ if ( ! distance %in% names( amap.distance ) ) stop("unsupported distance.")
+
+ main.dist.obj <- Dist( t(d), method=as.character( amap.distance[ distance ] ) )
+ ## now fix dumb amap naming convention for distance metrics
+ attr( main.dist.obj, "method" ) <- as.character( amap.distance[ distance ] )
+ } else stop("unsupported distance specified.")
+ } else {
+ ## pFeature < 1 or a weightsFeature != NULL
+ ## since d is a data matrix, the user wants to sample over the gene features, so main.dist.obj is left as NULL
+ }
+ }
+
+
+ for (i in 1:repCount){
+ ##browser()
+ if(verbose){
+ message(paste("random subsample",i));
+ }
+ ## take expression matrix sample, samples and genes
+ sample_x = sampleCols( d, pItem, pFeature, weightsItem, weightsFeature )
+
+ this_dist = NA
+ if ( ! is.null( main.dist.obj ) ) {
+ boot.cols <- sample_x$subcols
+ this_dist <- as.matrix( main.dist.obj )[ boot.cols, boot.cols ]
+ if ( clusterAlg != "km" ) {
+ ## if this isn't kmeans, then convert to a distance object
+ this_dist <- as.dist( this_dist )
+ attr( this_dist, "method" ) <- attr( main.dist.obj, "method" )
+ }
+ } else {
+ ## if main.dist.obj is NULL, then d is a data matrix, and either:
+ ## 1) clusterAlg is 'km'
+ ## 2) pFeatures < 1 or weightsFeatures have been specified, or
+ ## 3) both
+ ## so we can't use a main distance object and for every iteration, we will have to re-calculate either
+ ## 1) the distance matrix (because we're also sampling the features as well), or
+ ## 2) the submat (if using km)
+
+ if ( clusterAlg != "km" ) {
+ if ( ! distance %in% names( amap.distance ) ) stop("unsupported distance.")
+ ## good, we have a supported distance type
+ this_dist <- Dist( t( sample_x$submat ), method=as.character( amap.distance[ distance ] ) )
+ ## now fix dumb amap naming convention for distance metrics
+ attr( this_dist, "method" ) <- as.character( amap.distance[ distance ] )
+ } else {
+ ##browser()
+ ##clusterAlg == "km"
+ ## if we're not sampling the features, then grab the colslice
+ if ( is.null( pFeature ) ||
+ ( ( pFeature == 1 ) && is.null( weightsFeature ) ) ) {
+ this_dist <- d[, sample_x$subcols ]
+ } else {
+ if ( is.na( sample_x$submat ) ) {
+ save( "ccrun.submat.eq.na.dbg.rda" )
+ stop( "Houston, we have a problem. sample_x$submat is NA in ccRun when it should be specified - saving state\n" )
+ }
+
+ this_dist <- sample_x$submat
+ }
+ }
+ }
+
+ ## cluster samples for HC.
+ this_cluster=NA
+ if(clusterAlg=="hc"){
+ this_cluster = hclust( this_dist, method=innerLinkage)
+ }
+ ##browser()
+ ##mCount is possible number of times that two sample occur in same random sample, independent of k
+ ##mCount stores number of times a sample pair was sampled together.
+ mCount <- connectivityMatrix( rep( 1,length(sample_x[[3]])),
+ mCount,
+ sample_x[[3]] )
+
+ ##use samples for each k
+ for (k in 2:maxK){
+ if(verbose){
+ message(paste(" k =",k))
+ }
+ if (i==1){
+ ml[[k]] = mConsist #initialize
+ }
+ this_assignment=NA
+ if(clusterAlg=="hc"){
+ ##prune to k for hc
+ this_assignment = cutree(this_cluster,k)
+ ##browser()
+ }else if(clusterAlg=="km"){
+ ##this_dist should now be a matrix corresponding to the result from sampleCols
+ this_assignment <- kmeans( t( this_dist ),
+ k,
+ iter.max = 10,
+ nstart = 1,
+ algorithm = c("Hartigan-Wong") )$cluster
+ }else if ( clusterAlg == "pam" ) {
+ require( cluster )
+ this_assignment <- pam( x=this_dist,
+ k,
+ diss=TRUE,
+ metric=distance,
+ cluster.only=TRUE )
+ } else{
+ ##optional cluterArg Hook.
+ this_assignment <- get(clusterAlg)(this_dist, k)
+ }
+ ##add to tally
+ ml[[k]] <- connectivityMatrix( this_assignment,
+ ml[[k]],
+ sample_x[[3]] )
+ }
+ }
+
+
+ ##consensus fraction
+ res = vector(mode="list",maxK)
+ for (k in 2:maxK){
+ ##fill in other half of matrix for tally and count.
+ tmp = triangle(ml[[k]],mode=3)
+ tmpCount = triangle(mCount,mode=3)
+ res[[k]] = tmp / tmpCount
+ res[[k]][which(tmpCount==0)] = 0
+ }
+ message("end fraction")
+ return(res)
+}
+
+
+connectivityMatrix <- function( clusterAssignments, m, sampleKey){
+ ##input: named vector of cluster assignments, matrix to add connectivities
+ ##output: connectivity matrix
+ names( clusterAssignments ) <- sampleKey
+ cls <- lapply( unique( clusterAssignments ), function(i) as.numeric( names( clusterAssignments[ clusterAssignments %in% i ] ) ) )
+
+ for ( i in 1:length( cls ) ) {
+ nelts <- 1:ncol( m )
+ cl <- as.numeric( nelts %in% cls[[i]] ) ## produces a binary vector
+ updt <- outer( cl, cl )
+ m <- m + updt
+ }
+ return(m)
+}
+
+## returns a list with the sample columns, as well as the sub-matrix & sample features (if necessary)
+## if no sampling over the features is performed, the submatrix & sample features are returned as NAs
+## to reduce memory overhead
+sampleCols <- function( d,
+ pSamp=NULL,
+ pRow=NULL,
+ weightsItem=NULL,
+ weightsFeature=NULL ){
+ space <- ifelse( inherits( d, "dist" ), ncol( as.matrix(d) ), ncol(d) )
+ sampleN <- floor(space*pSamp)
+ sampCols <- sort( sample(space, sampleN, replace = FALSE, prob = weightsItem) )
+
+ this_sample <- sampRows <- NA
+ if ( inherits( d, "matrix" ) ) {
+ if ( (! is.null( pRow ) ) &&
+ ( (pRow < 1 ) || (! is.null( weightsFeature ) ) ) ) {
+ ## only sample the rows and generate a sub-matrix if we're sampling over the row/gene/features
+ space = nrow(d)
+ sampleN = floor(space*pRow)
+ sampRows = sort( sample(space, sampleN, replace = FALSE, prob = weightsFeature) )
+ this_sample <- d[sampRows,sampCols]
+ dimnames(this_sample) <- NULL
+ } else {
+ ## do nothing
+ }
+ }
+ return( list( submat=this_sample,
+ subrows=sampRows,
+ subcols=sampCols ) )
+}
+
+CDF=function(ml,breaks=100){
+ #plot CDF distribution
+ plot(c(0),xlim=c(0,1),ylim=c(0,1),col="white",bg="white",xlab="consensus index",ylab="CDF",main="consensus CDF", las=2)
+ k=length(ml)
+ this_colors = rainbow(k-1)
+ areaK = c()
+ for (i in 2:length(ml)){
+ v=triangle(ml[[i]],mode=1)
+
+ #empirical CDF distribution. default number of breaks is 100
+ h = hist(v, plot=FALSE, breaks=seq(0,1,by=1/breaks))
+ h$counts = cumsum(h$counts)/sum(h$counts)
+
+ #calculate area under CDF curve, by histogram method.
+ thisArea=0
+ for (bi in 1:(length(h$breaks)-1)){
+ thisArea = thisArea + h$counts[bi]*(h$breaks[bi+1]-h$breaks[bi]) #increment by height by width
+ bi = bi + 1
+ }
+ areaK = c(areaK,thisArea)
+ lines(h$mids,h$counts,col=this_colors[i-1],lwd=2,type='l')
+ }
+ legend(0.8,0.5,legend=paste(rep("",k-1),seq(2,k,by=1),sep=""),fill=this_colors)
+
+ #plot area under CDF change.
+ deltaK=areaK[1] #initial auc at k=2
+ for(i in 2:(length(areaK))){
+ #proportional increase relative to prior K.
+ deltaK = c(deltaK,( areaK[i] - areaK[i-1])/areaK[i-1])
+ }
+ plot(1+(1:length(deltaK)),y=deltaK,xlab="k",ylab="relative change in area under CDF curve",main="Delta area",type="b")
+}
+
+
+myPal = function(n=10){
+ #returns n colors
+ seq = rev(seq(0,255,by=255/(n)))
+ palRGB = cbind(seq,seq,255)
+ rgb(palRGB,maxColorValue=255)
+}
+
+setClusterColors = function(past_ct,ct,colorU,colorList){
+ #description: sets common color of clusters between different K
+ newColors = c()
+ if(length(colorList)==0){
+ #k==2
+ newColors = colorU[ct]
+ colori=2
+ }else{
+ newColors = rep(NULL,length(ct))
+ colori = colorList[[2]]
+ mo=table(past_ct,ct)
+ m=mo/apply(mo,1,sum)
+ for(tci in 1:ncol(m)){ # for each cluster
+ maxC = max(m[,tci])
+ pci = which(m[,tci] == maxC)
+ if( sum(m[,tci]==maxC)==1 & max(m[pci,])==maxC & sum(m[pci,]==maxC)==1 ) {
+ #if new column maximum is unique, same cell is row maximum and is also unique
+ ##Note: the greatest of the prior clusters' members are the greatest in a current cluster's members.
+ newColors[which(ct==tci)] = unique(colorList[[1]][which(past_ct==pci)]) # one value
+ }else{ #add new color.
+ colori=colori+1
+ newColors[which(ct==tci)] = colorU[colori]
+ }
+ }
+ }
+ return(list(newColors,colori,unique(newColors) ))
+}
+
+clusterTrackingPlot = function(m){
+ #description: plots cluster tracking plot
+ #input: m - matrix where rows are k, columns are samples, and values are cluster assignments.
+ plot(NULL,xlim=c(-0.1,1),ylim=c(0,1),axes=FALSE,xlab="samples",ylab="k",main="tracking plot")
+ for(i in 1:nrow(m)){
+ rect( xleft=seq(0,1-1/ncol(m),by=1/ncol(m)), ybottom=rep(1-i/nrow(m),ncol(m)) , xright=seq(1/ncol(m),1,by=1/ncol(m)), ytop=rep(1-(i-1)/nrow(m),ncol(m)), col=m[i,],border=NA)
+ }
+ #hatch lines to indicate samples
+ xl = seq(0,1-1/ncol(m),by=1/ncol(m))
+ segments( xl, rep(-0.1,ncol(m)) , xl, rep(0,ncol(m)), col="black") #** alt white and black color?
+ ypos = seq(1,0,by=-1/nrow(m))-1/(2*nrow(m))
+ text(x=-0.1,y=ypos[-length(ypos)],labels=seq(2,nrow(m)+1,by=1))
+}
+
+triangle = function(m,mode=1){
+ #mode=1 for CDF, vector of lower triangle.
+ #mode==3 for full matrix.
+ #mode==2 for calcICL; nonredundant half matrix coun
+ #mode!=1 for summary
+ n=dim(m)[1]
+ nm = matrix(0,ncol=n,nrow=n)
+ fm = m
+
+
+ nm[upper.tri(nm)] = m[upper.tri(m)] #only upper half
+
+ fm = t(nm)+nm
+ diag(fm) = diag(m)
+
+ nm=fm
+ nm[upper.tri(nm)] = NA
+ diag(nm) = NA
+ vm = m[lower.tri(nm)]
+
+ if(mode==1){
+ return(vm) #vector
+ }else if(mode==3){
+ return(fm) #return full matrix
+ }else if(mode == 2){
+ return(nm) #returns lower triangle and no diagonal. no double counts.
+ }
+
+}
+
+
+rankedBarPlot=function(d,myc,cc,title){
+ colors = rbind() #each row is a barplot series
+ byRank = cbind()
+
+ spaceh = 0.1 #space between bars
+ for(i in 1:ncol(d)){
+ byRank = cbind(byRank,sort(d[,i],na.last=F))
+ colors = rbind(colors,order(d[,i],na.last=F))
+ }
+ maxH = max(c(1.5,apply(byRank,2,sum)),na.rm=T) #maximum height of graph
+
+ #barplot largest to smallest so that smallest is in front.
+ barp = barplot( apply(byRank,2,sum) , col=myc[colors[,1]] ,space=spaceh,ylim=c(0,maxH),main=paste("item-consensus", title),border=NA,las=1 )
+ for(i in 2:nrow(byRank)){
+ barplot( apply(matrix(byRank[i:nrow(byRank),],ncol=ncol(byRank)) ,2,sum), space=spaceh,col=myc[colors[,i]],ylim=c(0,maxH), add=T,border=NA,las=1 )
+ }
+ xr=seq(spaceh,ncol(d)+ncol(d)*spaceh,(ncol(d)+ncol(d)*spaceh)/ncol(d) )
+ #class labels as asterisks
+ text("*",x=xr+0.5,y=maxH,col=myc[cc],cex=1.4) #rect(xr,1.4,xr+1,1.5,col=myc[cc] )
+}
+
+
+
+###################################################################3333
+## RESTART MY SCRIPTS HERE
+
+spec <- matrix( c( "data.fname", "d", 1, "character",
+ "direction", "n", 2, "character",
+ "output.name", "o", 2, "character",
+ "cluster.alg", "a", 2, "character",
+ "distance.metric", "m", 2, "character",
+ "max.k", "k", 2, "integer",
+ "reps", "r", 2, "integer",
+ "innerLinkage", "i", 1, "character",
+ "finalLinkage", "f", 1, "character",
+ "out.report.dir", "p", 2, "character",
+ "out.report.html", "h", 2, "character"
+ ),
+ nc=4,
+ byrow=TRUE
+ )
+
+opt <- getopt( spec=spec )
+
+## default params for non-required params
+if ( is.null( opt$direction ) ) { opt$direction <- "cols" }
+if ( is.null( opt$cluster.alg ) ) { opt$cluster.alg <- "pam" }
+if ( is.null( opt$output.name ) ) { opt$output.name <- "consensus.cluster.result" }
+if ( is.null( opt$distance.metric ) ) { opt$distance.metric <- "cosine" }
+if ( is.null( opt$max.k ) ) { opt$max.k <- 10 }
+if ( is.null( opt$reps ) ) { opt$reps <- 1000 }
+if ( is.null( opt$innerLinkage ) ) { opt$innerLinkage <- "average" }
+if ( is.null( opt$finalLinkage ) ) { opt$finalLinkage <- "average" }
+
+if ( is.null( opt$out.report.dir ) ) { opt$out.report.dir <- "report" }
+if ( is.null( opt$out.report.html ) ) { opt$out.report.html <- file.path( "report", "index.html" ) }
+
+## validate params here (make sure set to valid values)
+if ( !opt$cluster.alg %in% c( "hc", "km", "pam" ) ) {
+ stop( "invalid clustering algorithm specified", cluster.alg )
+}
+
+
+data <- as.matrix( read.delim( opt$data.fname, header=T, row.names=1 , check.names=FALSE ) )
+## transpose the matrix if we want to cluster the rows (genes)
+if ( opt$direction == "rows" ) {
+ data <- t( data )
+}
+
+title <- paste( opt$cluster.alg, opt$output.name, sep="." )
+results <- ConsensusClusterPlus( data,
+ maxK=opt$max.k,
+ reps=opt$reps,
+ pItem=0.8,
+ pFeature=1,
+ title=opt$out.report.dir,
+ clusterAlg=opt$cluster.alg,
+ distance=opt$distance.metric,
+ innerLinkage=opt$innerLinkage,
+ finalLinkage=opt$finalLinkage,
+ plot='png',
+ writeTable=FALSE,
+ seed=100,
+ ##weightsFeature=abs( rnorm( nrow( orig.data ) ) ),
+ verbose=FALSE )
+
+pngs = list.files(path=opt$out.report.dir, patt="png")
+html.out <- paste( "",
+ paste( paste( "
", sep="" ),
+ "" )
+cat( html.out, file=opt$out.report.html )
+
+
+## re-transpose the matrix back if we've clustered the rows (genes)
+if ( opt$direction == "rows" ) {
+ data <- t( data )
+}
+save( file=opt$output.name, data, results)
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/consensus.clustering.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/consensus.clustering.py Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+import os
+import sys
+import subprocess
+
+select_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "consensus.clustering.R")
+
+cmd_args = [ "Rscript", select_script_path ] + sys.argv[1:]
+
+proc = subprocess.Popen( cmd_args, stderr=subprocess.PIPE, stdout=subprocess.PIPE )
+(stdoutdata, stderrdata) = proc.communicate()
+if proc.returncode:
+ sys.stderr.write(stderrdata)
+sys.stdout.write(stdoutdata)
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/consensus.clustering.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/consensus.clustering.xml Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,149 @@
+
+ consensus.clustering.py
+-d $dataset
+-n ${direction}
+-a ${method.algorithm}
+#if $method.algorithm == 'hc' # -m ${method.hc_distance_metric}
+-i ${method.innerLinkage}
+#end if
+#if $method.algorithm == 'pam' # -m ${method.pam_distance_metric}
+#end if
+#if $method.algorithm == 'km' # -m euclidean
+#end if
+-k ${kmax}
+-r ${reps}
+-f ${finalLinkage}
+-o ${output}
+-h $report
+-p ${report.files_path}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**Perform Consensus Clustering (Cluster Samples) on a specified data set**
+
+----
+
+**Parameters**
+
+- **Data Set** - Specify the data matrix to be clustered. Data must be formated as follows:
+
+ * Tab-delimited
+ * Use row/column headers
+
+- **Cluster Samples or Genes** - Specify the dimension of the matrix to cluster:
+
+ * Rows (Genes)
+ * Columns (Samples)
+
+- **Clustering Algorithm** Specify the choice of algorithm to use. Choice of:
+
+ * Hierarchical Clustering
+ * K-Means
+
+- **Distance Metric** Specify the choice of distance metric to use. Choice of:
+
+ * Cosine (AKA uncentered pearson)
+ * Absolute Cosine (AKA uncentered pearson, absolute value)
+ * Pearson (pearson correlation)
+ * Absolute Pearson (pearson correlation, absolute value)
+ * Spearman (spearman correlation)
+ * Kendall (Kendall's Tau)
+ * Euclidean (euclidean distance)
+ * Maximum
+ * Manhattan (AKA city block)
+ * Canberra
+ * Binary
+
+- **Final Linkage** Specify the choice linkage to use when clustering Consensus Matrix. Choice of:
+
+ * Average (see documentation for R's hclust function for explanation of choices)
+ * Single
+ * Complete
+ * Median
+ * Centroid
+ * McQuity
+ * Ward
+
+- **Inner Linkage** Specify the choice linkage to use when using HAC as clustering method. Same choices as 'Final Linkage'
+
+- **K Max** Specify the number to use for the largest K considered
+
+- **Repititions** Specify the number of 'bootstrap' repitions to perform to generate the consensus matrix
+
+
+
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/cutree.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/cutree.R Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,52 @@
+#!/usr/bin/env Rscript
+argspec <- c("tab.2.cdt.R converts a data matrix to cdt format
+
+ Usage:
+ tab.2.cdt.R -d
+ Optional:
+ -o
+ \n\n")
+args <- commandArgs(TRUE)
+if ( length( args ) == 1 && args =="--help") {
+ write(argspec, stderr())
+ q();
+}
+
+lib.load.quiet <- function( package ) {
+ package <- as.character(substitute(package))
+ suppressPackageStartupMessages( do.call( "library", list( package=package ) ) )
+}
+
+lib.load.quiet( getopt )
+lib.load.quiet( ctc )
+if ( any( c( 'flashClust', 'fastcluster' ) %in% installed.packages() ) ) {
+ if ( 'flashClust' %in% installed.packages() ) {
+ lib.load.quiet( flashClust )
+ } else {
+ if ( 'fastcluster' %in% installed.packages() ) {
+ lib.load.quiet( fastcluster )
+ }
+ }
+}
+
+spec <- matrix( c( "dataset", "d", 1, "character",
+ "num.k", "k", 1, "integer",
+ "output.fname", "o", 2, "character"
+ ),
+ nc=4,
+ byrow=TRUE
+ )
+
+
+opt <- getopt( spec=spec )
+if ( is.null( opt$output.fname ) ) { opt$output.fname <- file.path( opt$output.report.dir, paste( "data", opt$output.format, sep="." ) ) }
+
+
+load( opt$dataset ) ## should load the cl, treecl.res (or partcl.res) and data
+if ( exists( 'treecl.res' ) ) {
+ cutree.res <- cutree( treecl.res, k=opt$num.k )
+ cl <- cutree.res
+ save( file=opt$output.fname, treecl.res, cl, data )
+} else {
+ stop( "no hierarchical clustering result found!\n" )
+}
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/cutree.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/cutree.py Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+import os
+import sys
+import subprocess
+
+select_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "cutree.R")
+
+cmd_args = [ "Rscript", select_script_path ] + sys.argv[1:]
+
+proc = subprocess.Popen( cmd_args, stderr=subprocess.PIPE, stdout=subprocess.PIPE )
+(stdoutdata, stderrdata) = proc.communicate()
+if proc.returncode:
+ sys.stderr.write(stderrdata)
+sys.stdout.write(stdoutdata)
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/cutree.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/cutree.xml Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,24 @@
+
+ cutree.py
+-d $dataset
+-k ${numk}
+-o ${rdata_output}
+
+
+
+
+
+
+
+
+
+
+- **Cluster Result** - Specify the cluster result to analayze (MUST BE IN rdata format), and must contain the same objects that are produced by the 'Partition Clustering,' 'Hierarchical Clustering (HAC),' or 'Consensus Clustering' tools. Specifically, it must contain the following objects
+
+ * A 'treecl.res' or 'partcl.res' object - corresponding to whether the cluster results is from a partition or tree clustering method
+ * A 'data' object that contains the data that was passed into the clustering method. NOTE, it is better for this to be the actual data passed in, rather than the data prior to the pre-processing that was performed prior to the actual clustering.
+
+- **Number of Clusters** Specify the number of clusters to use
+
+
+
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/determine.IPL.threshold.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/determine.IPL.threshold.R Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,263 @@
+#!/usr/bin/env Rscript
+
+##usage, options and doc goes here
+argspec <- c("determine.IPL.threshold.R takes an IPL result, and determines a statistically sound threshold to use
+
+ Usage:
+ determine.IPL.threshold.R -d
+ Optional:
+ -o output.rdata ## rdata output file (contains variables used for calculation, for those who want to review them
+ -f filter type # must be either modulated, active, or inactive
+ -p percent of samples passing (must be value on [0,1]
+\n\n")
+args <- commandArgs(TRUE)
+if ( length( args ) == 1 && args =="--help") {
+ write( argspec, stderr() )
+ q();
+}
+
+lib.load.quiet <- function( package ) {
+ package <- as.character(substitute(package))
+ suppressPackageStartupMessages( do.call( "library", list( package=package ) ) )
+}
+lib.load.quiet(getopt)
+
+spec <- matrix( c( "data.fname", "d", 1, "character",
+ "output.rdata", "o", 2, "character",
+ "filter.type", "f", 2, "character",
+ "perc.pass", "p", 2, "numeric",
+ "selection.criteria", "s", 2, "character",
+ "output.report.dir", "r", 2, "character",
+ "output.report.html", "h", 2, "character"
+ ),
+ nc=4,
+ byrow=T
+ )
+opt <- getopt( spec=spec )
+
+## default params for non-required params
+if ( is.null( opt$filter.type ) ) { opt$filter.type <- 'modulated' }
+if ( is.null( opt$perc.pass ) ) { opt$perc.pass <- 1/3 }
+if ( is.null( opt$selection.criteria ) ) { opt$selection.criteria <- 'max_diffs' }
+if ( is.null( opt$output.report.dir ) ) { opt$output.report.dir <- "report" }
+if ( is.null( opt$output.report.html ) ) { opt$output.report.html <- "report/index.html" }
+if ( is.null( opt$output.rdata ) ) { opt$output.rdata <- "output.rdata" }
+if ( opt$perc.pass < 0 ) {
+ stop( "please specify a positive number for the percentage of samples that pass the filter (if applicable)" )
+}
+
+if (!file.exists(opt$output.report.dir)){
+ dir.create(opt$output.report.dir)
+}
+
+
+data <- as.matrix( read.delim( opt$data.fname, row.names=1, check.names=FALSE ) )
+genes <- rownames( data )
+genes <- genes[ !grepl( "abstract|family|complex", genes ) ]
+data <- data[ genes, ]
+
+nulls.mat <- grepl( "na_", colnames( data ) )
+reals <- ! nulls.mat
+nulls.mat <- data[ , nulls.mat ]
+reals.mat <- data[, reals ]
+if ( ncol( nulls.mat ) == 0 ) stop( "no nulls were in the file provided!\n" )
+if ( ncol( reals.mat ) == 0 ) stop( "no reals were in the file provided!\n" )
+
+
+if ( opt$filter.type == 'modulated' ) {
+ reals.mat <- abs( reals.mat )
+ nulls.mat <- abs( nulls.mat )
+} else {
+ if ( opt$filter.type == "inactive" ) {
+ reals.mat <- -reals.mat
+ nulls.mat <- -nulls.mat
+ }
+}
+
+
+## we only look at the larger 50% of the possible IPL values
+## as possible thresholds to use (since the lower 50% are almost
+## always uninformative)
+thresholds <- unique( quantile( reals.mat,
+ seq( 0.5, 1, by=0.001 ) ) )
+thresholds <- c( thresholds,
+ quantile( nulls.mat,
+ seq( 0.5, 1, by=0.001 ) ) )
+thresholds <- unique( sort( thresholds ) )
+
+
+get.num.filtered.feats <- function( mat,
+ threshold,
+ perc.samples.passing=1/3 ) {
+ feat.vect <- apply( mat,
+ 1,
+ function(x) {
+ tmp <- sum( x > threshold )
+ if ( perc.samples.passing >=1 ) {
+ return( tmp >= perc.samples.passing )
+ } else {
+ return( tmp > floor( perc.samples.passing * length(x) ) )
+ }
+ }
+ )
+ return( sum( feat.vect ) )
+}
+
+
+real.feats <- null.feats <- length( genes )
+chisq.pvals <- binom.pvals <- numeric()
+
+for ( i in 1:length( thresholds ) ) {
+
+ nul.feats.this.thresh <- get.num.filtered.feats( mat=nulls.mat, threshold=thresholds[i], perc.samples.passing=opt$perc.pass )
+ ## limit the maximum threshold to one where there are at least 75 valid points
+ ## because if there are fewer nulls than that, it heavily skews the probability
+ if ( nul.feats.this.thresh < 50 ) break
+
+ null.feats[ i ] <- nul.feats.this.thresh
+ real.feats[ i ] <- get.num.filtered.feats( mat=reals.mat, threshold=thresholds[i], perc.samples.passing=opt$perc.pass )
+
+ ## only calculate if there are more real features than nulls, otherwise, give a p-value of 1
+ if ( null.feats[i] < real.feats[i] ) {
+ p <- null.feats[i]/nrow( nulls.mat )
+ sd <- ( nrow( nulls.mat ) * p * (1-p ) )^0.5
+
+ ## binomial test
+ p <- -pnorm( q=real.feats[i],
+ mean=null.feats[i],
+ sd=sd,
+ log.p=TRUE,
+ lower.tail=FALSE )
+
+ ##chisq test
+ chi <- ( real.feats[i] - null.feats[i] )^2
+ chi <- chi/(null.feats[i])^2
+ chi <- -pchisq( chi, 1, log.p=TRUE, lower=FALSE )
+ } else {
+ p <- chi <- 0 ## 0 == -log(1)
+ }
+
+ binom.pvals <- c( binom.pvals, p )
+ chisq.pvals <- c( chisq.pvals, chi )
+
+ if ( length( chisq.pvals ) != i ) {
+ stop( "lengths differ\n" )
+ }
+}
+
+
+
+##names( binom.pvals ) <- names( chisq.pvals ) <- thresholds
+diffs <- real.feats - null.feats
+if ( opt$selection.criteria == "max_diffs" ) {
+ max.diff <- max( diffs )
+ opt.thresh <- which( diffs %in% max.diff )
+} else if ( opt$selection.criteria == "binomial" ) {
+ max.bin <- max( binom.pvals )
+ opt.thresh <- which( binom.pvals %in% max.bin )
+} else if ( opt$selection.criteria == "chisq" ) {
+ max.chi <- max( chisq.pvals )
+ opt.thresh <- which( chisq.pvals %in% max.chi )
+}
+
+opt.thresh <- mean( c( thresholds[ opt.thresh ], thresholds[ (opt.thresh-1) ] ) )
+opt.thresh <- signif( opt.thresh, 4 )
+
+
+##corrected.binom.pvals <- binom.pvals + log( length(thresholds) )
+##binom.pvals <- binom.pvals - log( length(thresholds) )
+##corrected.chisq.pvals <- chisq.pvals + log( length(thresholds) )
+##chisq.pvals <- chisq.pvals - log( length(thresholds) )
+
+
+eval.thresh <- thresholds[ 1:length( real.feats ) ]
+##plot.new(); screens <- split.screen( c( 4,1 ) )
+##postscript( "threshold.comparison.ps", paper='letter', horizontal=F )
+##png.fname <- file.path( opt$output.report.dir, "IPL.threshold.determination.png")
+##plot.dev <- png( png.fname,
+## width=11,
+## height=8.5,
+## units='in',
+## res=72 )
+##par( mar=rep(0,4) )
+##screens <- split.screen( c( 4,1 ) )
+
+png.fname <- file.path( opt$output.report.dir, "01.num.feats.IPL.threshold.determination.png")
+plot.dev <- png( png.fname,
+ width=11,
+ height=8.5,
+ units='in',
+ res=72 )
+par( mar=c(2.25,3,1.5,0.5) )
+plot( eval.thresh, null.feats, type='l', lwd=2, col='blue', cex.axis=0.75 )
+lines( eval.thresh, real.feats, type='l', lwd=2, col='black', cex.axis=0.75 )
+abline( v=opt.thresh )
+legend( "topright", c( "Real", "Null" ), lwd=2, col=c('black', 'blue' ) )
+mtext( "Number of Genes Passing Threshold", font=2 )
+mtext( "IPL Threshold", 1, font=2, line=1.5 )
+mtext( "Number of Genes", 2, font=2, line=1.8 )
+dev.off()
+
+
+png.fname <- file.path( opt$output.report.dir, "02.diffs.IPL.threshold.determination.png")
+plot.dev <- png( png.fname,
+ width=11,
+ height=8.5,
+ units='in',
+ res=72 )
+##screen( screen()+1 )
+par( mar=c(2.25,3,1.5,0.5) )
+plot( eval.thresh, diffs, type='l', lwd=2, col='black', cex.axis=0.75 )
+abline( v=opt.thresh )
+mtext( "Difference between number of Real & Null genes passing Threshold", font=2 )
+mtext( "IPL Threshold", 1, font=2, line=1.5 )
+mtext( "Number of Genes", 2, font=2, line=1.8 )
+dev.off()
+
+
+
+png.fname <- file.path( opt$output.report.dir, "03.chisq.IPL.threshold.determination.png")
+plot.dev <- png( png.fname,
+ width=11,
+ height=8.5,
+ units='in',
+ res=72 )
+##screen( screen()+1 )
+par( mar=c(2.25,3,1.5,0.5) )
+plot( eval.thresh, chisq.pvals, type='l', lwd=2, col='red', cex.axis=0.75 )
+abline( v=opt.thresh )
+mtext( "Chi-sq p-values", font=2 )
+mtext( "IPL Threshold", 1, font=2, line=1.5 )
+mtext( "-Log p-value", 2, font=2, line=1.8 )
+dev.off()
+
+
+
+png.fname <- file.path( opt$output.report.dir, "04.binom.IPL.threshold.determination.png")
+plot.dev <- png( png.fname,
+ width=11,
+ height=8.5,
+ units='in',
+ res=72 )
+##screen( screen()+1 )
+par( mar=c(2.25,3,1.5,0.5) )
+plot( eval.thresh, binom.pvals, type='l', lwd=2, col='green', cex.axis=0.75 )
+abline( v=opt.thresh )
+mtext( "Binomial p-values", font=2 )
+mtext( "IPL Threshold", 1, font=2, line=1.5 )
+mtext( "-Log p-value", 2, font=2, line=1.8 )
+dev.off()
+
+##close.screen( all=T ); dev.off()
+
+report_str = paste( "The threshold to use for consensus clustering filtering is ", opt.thresh, "\n", sep="" )
+
+pngs = list.files(path=opt$output.report.dir, patt="png")
+html.out <- paste( "", report_str,
+ paste( paste( paste( "
", sep="" ), collapse=""),
+ "" )
+cat( html.out, file=opt$output.report.html )
+
+filter.type <- opt$filter.type
+perc.pass <- opt$perc.pass
+save( file=opt$output.rdata, thresholds, diffs, binom.pvals, chisq.pvals, real.feats, null.feats, data, filter.type, perc.pass, opt.thresh )
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/determine.IPL.threshold.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/determine.IPL.threshold.py Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,16 @@
+#!/usr/bin/env python
+import os
+import sys
+import subprocess
+
+select_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "determine.IPL.threshold.R")
+
+cmd_args = [ "Rscript", select_script_path ] + sys.argv[1:]
+
+print cmd_args
+
+proc = subprocess.Popen( cmd_args, stderr=subprocess.PIPE, stdout=subprocess.PIPE )
+(stdoutdata, stderrdata) = proc.communicate()
+if proc.returncode:
+ sys.stderr.write(stderrdata)
+sys.stdout.write(stdoutdata)
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/determine.IPL.threshold.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/determine.IPL.threshold.xml Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,57 @@
+
+ determine.IPL.threshold.py
+-d ${data_fname}
+-f ${filter_type}
+-p ${percentage_pass}
+-s ${selection_criteria}
+-h $report
+-r ${report.files_path}
+-o ${output_rdata}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**Determines a statistically sound threshold to use for a given IPL result**
+
+**Parameters**
+- **Paradigm Results File** Output from Paradigm (tab-delimited and contains both the 'real' and 'null' samples)
+
+- **Activity Filter** Specify the filter type to use. Choice of:
+
+ * Activity - Features must exceed the user-specified threshold
+ * Inactivity - Features must fall below the user-specified threshold
+ * Modulated - Absolute value of the features must exceed the specified threshold
+
+- **Percentage of Samples Passing** Percent of samples with an IPL that passes the threshold. Choice of:
+
+ * Real Value in [0,1] - indicate the percentage of samples that pass the threshold
+ * Integer Value - indicate the exact number of samples that pass the threshold
+
+- **Selection Criteria** Specify the test statistic to use to select the threshold. Choice of:
+
+ * Binomial P-value - Select the threshold with the largest -log p-value (calculated as a binomial)
+ * Chi-Squared P-value - Select the threshold with the largest -log p-value (calculated as a Chi-squared)
+ * Overall Max Number of Differences - Select the threshold with the largest overall number of differences between the real and null distributions
+
+
+
+
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/extract.cons.cluster.from.result.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/extract.cons.cluster.from.result.R Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,62 @@
+#!/usr/bin/env Rscript
+# Extract k cluster assignment from consensus clustering result Script by Peter Waltman
+# Nov. 12, 2012
+# License under Creative Commons Attribution 3.0 Unported (CC BY 3.0)
+#
+##usage, options and doc goes here
+argspec <- c("galaxy.extract.cons.clustering.from.result.R extracts a cluster assignment for a specified
+value of K from a specified consensus cluster result file.
+
+
+ Usage:
+ galaxy.extract.cons.cluster.from.result.R -r -k
+ Optional:
+ -o consensus class output file # tab-delimitted file format
+\n\n")
+args <- commandArgs(TRUE)
+if ( length( args ) == 1 && args =="--help") {
+ write( argspec, stderr() )
+ q();
+}
+
+library(getopt)
+
+spec <- matrix( c( "results.file", "r", 1, "character",
+ "k.select", "k", 1, "integer",
+ "cluster.class.out", "o", 2, "character",
+ "output.select.rdata", "d", 2, "character"
+ ),
+ nc=4,
+ byrow=T
+ )
+opt <- getopt( spec=spec )
+if ( is.null( opt$output.select.rdata ) ) { opt$output.select.rdata <- "select.RData" }
+##if ( is.null( opt$cluster.class.out) ) { opt$cluster.class.out <- "select.cls" }
+
+load( opt$results.file )
+cons.matrices <- lapply( results[ 2:length(results) ], '[[', 'consensusMatrix' )
+cls <- lapply( results[ 2:length(results) ], '[[', 'consensusClass' )
+names( cons.matrices ) <- names( cls ) <- 2:length( results )
+
+ch.k.select <- as.character( opt$k.select )
+if ( ch.k.select %in% names( cls ) ) {
+ ## get the consensusClass file that's associated with the k.select
+ cl <- cls[[ ch.k.select ]]
+
+ if ( ! is.null( opt$cluster.class.out ) ) {
+ cl <- cbind( names( cl ), as.integer(cl) )
+ colnames( cl ) <- c( "ID", "class" )
+ write.table( cl, opt$cluster.class.out, sep="\t", row.names=FALSE, quote=FALSE )
+ } else if ( ! is.null( opt$output.select.rdata ) ) {
+ ## re-order the samples to follow the cluster assignment
+ treecl.res <- results[[ opt$k.select ]]$consensusTree
+ select.result <- results[[ opt$k.select ]]
+ save( file=opt$output.select.rdata, treecl.res, cl, select.result, data )
+ } else {
+ stop( 'no valid output format specified\n' )
+ }
+} else {
+ out.string <- paste( "choice of k =", ch.k.select, "not available in this result file. Max k = ", max( as.numeric( names(cls) ) ), "\n" )
+ cat( out.string, file=opt$cluster.class.out )
+}
+
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/extract.cons.cluster.from.result.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/extract.cons.cluster.from.result.py Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+import os
+import sys
+import subprocess
+
+select_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "extract.cons.cluster.from.result.R")
+
+cmd_args = [ "Rscript", select_script_path ] + sys.argv[1:]
+
+proc = subprocess.Popen( cmd_args, stderr=subprocess.PIPE, stdout=subprocess.PIPE )
+(stdoutdata, stderrdata) = proc.communicate()
+if proc.returncode:
+ sys.stderr.write(stderrdata)
+sys.stdout.write(stdoutdata)
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/extract.cons.cluster.from.result.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/extract.cons.cluster.from.result.xml Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,47 @@
+
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/fix.and.merge.TCGA.sample.IDs.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/fix.and.merge.TCGA.sample.IDs.R Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,119 @@
+#!/usr/bin/env Rscript
+argspec <- c("fix.and.merge.TCGA.samples.IDs.R takes a clustering from ConsensusClusterPlus and clinical survival data
+and generates a KM-plot, along with the log-rank p-values
+
+ Usage:
+ fix.and.merge.TCGA.samples.IDs.R -d
+
+ \n\n")
+args <- commandArgs(TRUE)
+if ( length( args ) == 1 && args =="--help") {
+ write(argspec, stderr())
+ q();
+}
+
+lib.load.quiet <- function( package ) {
+ package <- as.character(substitute(package))
+ suppressPackageStartupMessages( do.call( "library", list( package=package ) ) )
+}
+lib.load.quiet(getopt)
+
+spec <- matrix( c( "data.fname", "d", 1, "character",
+ "num.components", "n", 2, "integer",
+ "remove.normals", "r", 0, "logical",
+ "output.fname", "o", 2, "character"
+ ),
+ nc=4,
+ byrow=TRUE
+ )
+
+opt <- getopt( spec=spec )
+
+data <- as.matrix( read.delim( opt$data.fname, row.names=1, check.names=FALSE ) )
+if ( is.null( opt$num.components ) ) { opt$num.components <- 3 }
+if ( is.null( opt$remove.normals ) ) { opt$remove.normals <- FALSE }
+if ( is.null( opt$output.fname ) ) { opt$output.fname <- paste( "sample.IDs.updated", basename( opt$data.fname ), sep="." ) }
+
+if ( opt$num.components < 3 ) {
+ err.msg <- "Minimum number of barcode components that can be used is 3\n"
+ cat( err.msg, file=opt$output.fname )
+ stop( err.msg )
+}
+
+remove.periods.from.ids <- function( ids ) {
+ return( gsub( "\\.", "-", ids ) )
+}
+
+
+reformat.ids <- function( ids,
+ num.components=3 ) {
+ return( sapply( strsplit( ids, "-" ), function(x) paste( x[1:num.components], collapse="-" ) ) )
+}
+
+
+merge.cols <- function( mat,
+ samp.ids ) {
+
+ if ( ! any( duplicated( samp.ids ) ) ) {
+ colnames( mat ) <- samp.ids
+ return( mat )
+ }
+
+ dupes <- unique( samp.ids[ duplicated( samp.ids ) ] )
+ uniqs <- samp.ids[ ! samp.ids %in% dupes ]
+
+ uniq.mat <- mat[ , ( samp.ids %in% uniqs ), drop=FALSE ]
+ colnames( uniq.mat ) <- uniqs
+
+ for ( dup in dupes ) {
+ dup.mat <- apply( mat[, ( samp.ids %in% dup ), drop=FALSE],
+ 1,
+ mean,
+ na.rm=TRUE )
+
+ uniq.mat <- cbind( uniq.mat, dup.mat )
+ }
+ colnames( uniq.mat ) <- c( uniqs, dupes )
+ return( uniq.mat )
+}
+
+
+cnames <- colnames( data )
+rnames <- rownames( data )
+
+transpose.back <- FALSE
+
+if ( all( grepl( "^TCGA", rnames ) ) ) {
+ data <- t( data )
+ transpose.back <- TRUE
+} else {
+ if ( ! all( grepl( "^TCGA", cnames ) ) ) {
+ err.msg <- "can't find any TCGA samples listed in this matrix. If columns are samples, all columns must be a TCGA sample ID. Same if rows are samples.\n"
+ cat( err.msg, file=opt$output.fname )
+ stop( err.msg )
+ }
+}
+
+cnames <- remove.periods.from.ids( colnames( data ) )
+nelts <- as.numeric( names( table( as.factor( sapply( strsplit( cnames, "-" ), function(x) length(x ) ) ) ) ) )
+if ( length( nelts ) > 1 ) {
+ err.msg <- "Error: Inconsistent TCGA sample barcodes used. Have found ID with different numbers of components in the barcodes used\n"
+ cat( err.msg, file=opt$output.fname )
+ stop( err.msg )
+}
+
+if ( opt$remove.normals ) {
+ if ( nelts > 3 ) {
+ normals <- grepl( "^TCGA-..-....-1", cnames )
+ data <- data[ , (! normals ), drop=FALSE ]
+ }
+}
+
+if ( opt$num.components < nelts ) {
+ cnames <- reformat.ids( ids=cnames, num.components=opt$num.components )
+ data <- merge.cols( data, cnames )
+}
+
+if ( transpose.back ) data <- t( data )
+
+write.table( data, opt$output.fname, sep="\t", quote=FALSE, col.names=NA )
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/fix.and.merge.TCGA.sample.IDs.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/fix.and.merge.TCGA.sample.IDs.py Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+import os
+import sys
+import subprocess
+
+select_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "fix.and.merge.TCGA.sample.IDs.R")
+
+cmd_args = [ "Rscript", select_script_path ] + sys.argv[1:]
+
+proc = subprocess.Popen( cmd_args, stderr=subprocess.PIPE, stdout=subprocess.PIPE )
+(stdoutdata, stderrdata) = proc.communicate()
+if proc.returncode:
+ sys.stderr.write(stderrdata)
+sys.stdout.write(stdoutdata)
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/fix.and.merge.TCGA.sample.IDs.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/fix.and.merge.TCGA.sample.IDs.xml Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,33 @@
+
+ fix.and.merge.TCGA.sample.IDs.py
+-d $dataset -n ${num_components} ${remove_normals}
+-o ${output}
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**Update and Merge TCGA Sample IDs** - This will limit the TCGA sample IDs to the specified numnber of elements (min is 3). If necessary, samples will be merged (by averaging)
+
+**OUTPUT:** A new matrix using sample (columns) that use the specified number of components
+
+----
+
+**Parameters**
+
+- **Matrix with TCGA barcode sample IDs (e.g. TCGA-AE-####-01-)** Specify a data matrix with TCGA barcodes
+
+- **Number of barcode components to use** Specify the number of barcode components to use in new matrix that is produced **(min number is 3)**
+
+- **Remove Normals from Matrix?** - Remove any normals from the matrix (if necessary)
+
+
+
diff -r 000000000000 -r 0decf3fd54bc cluster.tools/format.raw.TCGA.clinical.data.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/format.raw.TCGA.clinical.data.R Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,184 @@
+#!/usr/bin/env Rscript
+##
+## formats raw clinical data from TCGA to contain a single status & time colums
+##
+## Input (required):
+## - clinical data
+## Input (optional):
+## - status & time columns: (NOT USED IN THIS SCRIPT - see comment below)
+## ideally, a better design would allow a user to specify 1 or more columns
+## to check for the status & time columns - however, due to the necessities
+## required to pre-process the TCGA clinical data, the script would not be
+## generalizeable - and for this reason, the TCGA columns are hard-coded.
+##
+## Output: a re-formatted clinical file containing 3 columns: sample-ID, status & time
+##
+## Date: August 21, 2012
+## Author: Peter Waltman
+##
+
+##usage, options and doc goes here
+argspec <- c("format.raw.TCGA.clinical.data.R takes a clustering from ConsensusClusterPlus and clinical survival data
+and generates a KM-plot, along with the log-rank p-values
+
+ Usage:
+ format.raw.TCGA.clinical.data.R -c
+ Options:
+ -o