Mercurial > repos > peter-waltman > ucsc_cluster_tools

diff cluster.tools/gen.survival.curves.R @ 2:b442996b66ae draft
Uploaded
author: peter-waltman
date: Wed, 27 Feb 2013 20:17:04 -0500
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/gen.survival.curves.R	Wed Feb 27 20:17:04 2013 -0500
@@ -0,0 +1,249 @@
+#!/usr/bin/env Rscript
+## 
+## Calculates the log-rank test for a given clustering, in the output format from ConsensusClusterPlus
+##
+## Input (required):
+##    - consensus cluster file (consensusClass.csv file)
+##    - survival data
+## Input (optional):
+## Output: a KM plot, with the most significant p-value.  Output to stdout can be captured by re-direction
+##
+## Uses: survival library
+## Date: August 21, 2012
+## Author: Peter Waltman
+##
+
+##usage, options and doc goes here
+argspec <- c("gen.survival.curves.R takes a clustering from ConsensusClusterPlus and clinical survival data
+and generates a KM-plot, along with the log-rank p-values
+
+        Usage: 
+                gen.survival.curves.R -c <cluster.file> -s <clinical.file> 
+        Options:
+                -o <output file> (postscript)
+                -m <mode>        (all, one, both)
+                                  \"all\" - perform all-vs-all log-rank test
+                                  \"one\" - perform one-vs-others log-rank test (returns min)
+                                  \"both\" - perform both \"all\" and \"one\" tests
+                -t <title>
+                -p <pval.only>  ( only return the p-value for this given mode - no plotting at all (to screen or postscript))
+                -v <verbose>
+              ")
+
+lib.load.quiet <- function( package ) {
+   package <- as.character(substitute(package))
+   suppressPackageStartupMessages( do.call( "library", list( package=package ) ) )
+}
+lib.load.quiet(getopt)
+lib.load.quiet( survival )
+
+
+args <- commandArgs(TRUE)
+if ( length( args ) == 1 && args =="--help") { 
+  write(argspec, stderr())
+  q();
+}
+
+spec <- matrix( c( "cluster.fname",  "C", 1, "character",
+                   "survival.fname", "S", 1, "character",  
+                   "mode",           "M", 2, "character",
+                   "title",          "T", 2, "character",
+                   "image.format",   "I", 2, "character",
+                   "output.fname",   "O", 2, "character",
+                   "pval.only",      "P", 0, "logical",
+                   "verbose",        "V", 0, "logical"
+                  ),
+                ncol=4,
+                byrow=TRUE
+               )
+opt <- getopt( spec=spec )
+
+
+#set some reasonable defaults for the options that are needed,
+#but were not specified.
+if ( is.null(opt$mode ) ) {
+  opt$mode <- "all"
+} else {
+  if ( ! opt$mode %in% c( 'all', 'one', 'both' ) ) {
+    stop( "invalid mode specified,' -m", opt$mode, "'.  must be either {all, one, both}\n" )
+  }
+}
+if ( is.null( opt$title ) ) {
+  opt$title <- opt$cluster.fname
+  opt$title <- strsplit( opt$title, "\\/" )[[1]]
+  opt$title <- opt$title[ length( opt$title ) ]
+}
+if ( is.null( opt$image.format ) ){
+  opt$image.format <- "png"
+} else {
+  if ( ! opt$image.format %in% c( "pdf", "png", "none" ) ) stop( 'invalid image format specified\n' )
+}
+if ( is.null(opt$output.fname ) ) { opt$output.fname <- paste( opt$mode, "survival.curve", opt$image.format, sep="." ) }
+if ( is.null(opt$cluster.header ) ) { opt$cluster.header = FALSE }
+if ( is.null(opt$pval.only ) ) { opt$pval.only = FALSE }
+if ( is.null(opt$verbose ) ) { opt$verbose = FALSE }
+
+##print some progress messages to stderr, if requested.
+if ( opt$verbose ) { write("writing...",stderr()); }
+
+load( opt$cluster.fname )
+cluster.data <- cbind( names( cl ), as.numeric( cl ) )
+colnames( cluster.data ) <- c( "id", "group_num" )
+rownames( cluster.data ) <- names( cl )
+
+survival.data <- read.delim( opt$survival.fname, as.is=TRUE, row.names=1 )
+survival.data <- cbind( rownames( survival.data ), survival.data ) ## add in the ids, so we can merge on them
+if ( length( colnames( survival.data ) ) == 3 ) {
+  ## we have to left-shift the current colanmes to drop the 1st one
+  ##  b/c cbind will add one for the column we just  added
+  colnames( survival.data ) <- c( "id", colnames( survival.data )[-1] )  
+}
+if ( length( colnames( survival.data ) ) == 2 ) {
+  ## added just in case there's a change to cbind as R is prone to doing
+  colnames( survival.data ) <- c( "id", colnames( survival.data ) )  
+}
+survival.data$id <- as.character( survival.data$id )
+
+
+## Now, filter so we only contain the same samples
+n.clust.data.samps <- nrow( cluster.data )
+n.surv.data.samps <- nrow( survival.data )
+if ( n.clust.data.samps > n.surv.data.samps ) {
+  ovp.samples <- rownames( cluster.data )
+  ovp.samples <- ovp.samples[ ovp.samples %in% survival.data$id ]
+} else {
+  ovp.samples <- survival.data$id
+  ovp.samples <- ovp.samples[ ovp.samples %in% rownames( cluster.data ) ]
+}
+
+cluster.data <- cluster.data[ ovp.samples, , drop=FALSE]
+survival.data <- survival.data[ ovp.samples, ]
+survival.data <- merge( survival.data, cluster.data )
+
+
+calc.all.pval <- function( survival.data ) {
+  survdiff( Surv( time, status )~group_num, data=survival.data )
+  surv.res <- survdiff( Surv( time, status )~group_num, data=survival.data )
+  pval <- surv.res$chisq
+  df <- length( surv.res$n ) - 1
+  pval <- pchisq( pval, df=df, lower=F )
+  return( pval )
+}
+
+calc.one.v.others.pval <- function( survival.data ) {
+  grps <- sort( unique( as.numeric( survival.data$group_num ) ) )
+
+  retval <- numeric()
+  for ( g in grps ) {
+    one.v.all.survival.data <- survival.data
+    tmp <- as.numeric( one.v.all.survival.data$group_num )
+    tmp[ ! tmp %in% g ] <- -1
+    tmp[ tmp %in% g ] <- 1
+    tmp[ tmp %in% -1 ] <- 2
+    one.v.all.survival.data$group_num <- tmp
+    surv.res <- survdiff( Surv( time, status )~group_num, data=one.v.all.survival.data )
+    pval <- surv.res$chisq
+    df <- length( surv.res$n ) - 1
+    pval <- pchisq( pval, df=df, lower=F )
+    retval <- c( retval, pval )
+  }
+  names( retval ) <- grps
+  return( retval )
+}
+
+
+if ( opt$mode == "all" ) {
+
+  pval <- calc.all.pval( survival.data )
+  log.rank <- paste( "Log Rank p-value:", sprintf( "%1.2e",pval ) )
+  opt$title <- paste( opt$title, log.rank, sep="\n" )
+} else {
+  if ( opt$mode == "one" ) {
+
+    pvals <- calc.one.v.others.pval( survival.data )
+    min.p <- min( pvals, na.rm=T )
+    if ( length( min.p ) == 0 ) {
+      stop( 'no valid p-value returned from the one-v-others test\n' )
+    }
+    cluster.num <- names( pvals )[ pvals == min.p ]
+    pval <- pvals[ cluster.num ]
+    log.rank <- paste( "Log Rank p-value for cluster", cluster.num,"is:", sprintf( "%1.2e",pval ) )
+    opt$title <- paste( opt$title, log.rank, sep="\n" )
+  } else {
+    if ( opt$mode== "both" ) {
+      ##  add the all-v-all p-value
+      bak <- pval <- calc.all.pval( survival.data )
+      log.rank <- paste( "Log Rank p-value:", sprintf( "%1.2e",pval ) )
+      opt$title <- paste( opt$title, log.rank, sep="\n" )
+
+      ## now add the one-v-all p-value
+      pvals <- calc.one.v.others.pval( survival.data )
+      min.p <- min( pvals, na.rm=T )
+      if ( length( min.p ) == 0 ) {
+        stop( 'no valid p-value returned from the one-v-others test\n' )
+      }
+      cluster.num <- names( pvals )[ pvals == min.p ]
+      pval <- pvals[ cluster.num ]
+      log.rank <- paste( "Log Rank p-value for cluster", cluster.num,"is:", sprintf( "%1.2e",pval ) )
+      opt$title <- paste( opt$title, log.rank, sep="\n" )
+
+      if ( opt$pval.only ) {
+        pval <- min( c( bak, pval ), na.rm=T )
+      }
+    }
+    else {
+      stop( "invalid mode specified, mode = ", opt$mode, "\n" )
+    }
+  }
+}
+
+if ( opt$pval.only ) {
+  cat( paste(pval, "\n", sep="" ), file=stdout() )
+}
+
+
+if ( ! opt$pval.only ) {
+  ngrps <- length( unique( survival.data$group_num ) )
+  col.map <- rainbow( ngrps )
+
+
+
+  ##postscript( opt$output.fname, horizontal=T, paper='letter' )
+  if ( opt$image.format == 'png' ) {
+    plot.dev <- png( opt$output.fname,
+                     width=11,
+                     height=8.5,
+                     units='in',
+                     res=72 )
+  } else if ( opt$image.format == 'pdf' ) {
+    pdf( opt$output.fname,
+         paper="letter" )
+  } else if ( opt$image.format == 'none' ) {
+    ## do nothing - this allows other scripts to call this and hopefully plot into them
+    
+  }
+  
+  plot( survfit( Surv( time, status )~group_num, data=survival.data ),
+        main = opt$title,
+        ##lty = 1:ngrps,
+        lty=1,
+        col=col.map,      
+        ylab = "Probability",
+        xlab = "Survival Time in Days",
+       )
+
+  ## set the legend.labels if they're still not set yet
+  if( ! exists( "legend.labels" ) ) {
+    grp.counts <- table( as.factor( survival.data[, "group_num" ] ) )
+    legend.labels <- paste( "Cluster", 1:ngrps, paste( "(n=", as.integer(grp.counts), ")", sep="" ) )
+  }
+
+  legend( "topright",
+          lty=1,
+          col=col.map,
+          bty = "n",
+          legend=legend.labels
+         )
+  
+  if( opt$image.format != "none" ) dev.off()
+}
author	peter-waltman
date	Wed, 27 Feb 2013 20:17:04 -0500
parents
children