diff cluster.tools/format.raw.TCGA.clinical.data.R @ 2:b442996b66ae draft

Uploaded
author peter-waltman
date Wed, 27 Feb 2013 20:17:04 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/format.raw.TCGA.clinical.data.R	Wed Feb 27 20:17:04 2013 -0500
@@ -0,0 +1,185 @@
+#!/usr/bin/env Rscript
+## 
+## formats raw clinical data from TCGA to contain a single status & time colums
+##
+## Input (required):
+##    - clinical data
+## Input (optional):
+##    - status & time columns: (NOT USED IN THIS SCRIPT - see comment below)
+##         ideally, a better design would allow a user to specify 1 or more columns
+##         to check for the status & time columns - however, due to the necessities
+##         required to pre-process the TCGA clinical data, the script would not be
+##         generalizeable - and for this reason, the TCGA columns are hard-coded.
+##
+## Output: a re-formatted clinical file containing 3 columns: sample-ID, status & time
+##
+## Date: August 21, 2012
+## Author: Peter Waltman
+##
+
+##usage, options and doc goes here
+argspec <- c("format.raw.TCGA.clinical.data.R takes a clustering from ConsensusClusterPlus and clinical survival data
+and generates a KM-plot, along with the log-rank p-values
+
+        Usage: 
+                format.raw.TCGA.clinical.data.R -c <clinical.file> 
+        Options:
+                -o <output file> (tab-delimited (3 col: sample_id <tab> status <tab> time))
+              ")
+args <- commandArgs(TRUE)
+if ( length( args ) == 1 && args =="--help") { 
+  write(argspec, stderr())
+  q();
+}
+
+lib.load.quiet <- function( package ) {
+   package <- as.character(substitute(package))
+   suppressPackageStartupMessages( do.call( "library", list( package=package ) ) )
+}
+lib.load.quiet(getopt)
+
+spec <- matrix( c( "clinical.fname", "d", 1, "character",  
+                   "output.fname",   "o", 2, "character"
+                  ),
+                ncol=4,
+                byrow=TRUE
+               )
+opt <- getopt( spec=spec )
+save.image( "/tmp/format.dbg.rda")
+
+##set some reasonable defaults for the options that are needed,
+##but were not specified.
+if ( is.null(opt$output.fname ) ) { opt$output.fname <-file.path( getwd(), "formated.TCGA.clinical.data" ) }
+
+##orig.clinical.data <- read.delim( opt$clinical.fname, as.is=TRUE, row.names=1 )
+orig.clinical.data <- read.delim( opt$clinical.fname, as.is=TRUE )
+orig.clinical.data <- unique( orig.clinical.data )
+rownames( orig.clinical.data ) <- orig.clinical.data[,1]
+orig.clinical.data <- orig.clinical.data[, -1 ]
+
+##  ugh, some TCGA data sets have all NAs in the "days_to_..." columns
+if ( "days_to_last_known_alive" %in% colnames( orig.clinical.data ) ) {
+  time.cols <- c( "days_to_death", "days_to_last_followup", "days_to_last_known_alive" )
+} else {
+  time.cols <- c( "days_to_death", "days_to_last_followup"  )
+}
+good.samps <- ! apply( orig.clinical.data[, time.cols ], 1, function(x) all( is.na(x) ) | all( x <= 0, na.rm=T ) )
+
+orig.clinical.data <- orig.clinical.data[ good.samps, ]
+
+if ( is.null(opt$status.column ) ) {
+  status.colname <- "vital_status"
+  if ( status.colname %in% colnames( orig.clinical.data ) ) {
+    opt$status.column <- which( colnames( orig.clinical.data ) %in% status.colname )
+    clinical.data <- orig.clinical.data[ , opt$status.column ]
+  }
+  else {
+    status.colname <- "days_to_death"
+    if ( status.colname %in% colnames( orig.clinical.data ) ) {
+      opt$status.column <- which( colnames( orig.clinical.data ) %in% status.colname )
+      clinical.data <- orig.clinical.data[ , opt$status.column ]
+    }
+    else {
+      stop( "can't find a valid entry with status info - have tried vital_status & days_to_death\n" )
+    }
+  }
+  clinical.data <- as.numeric( ! grepl( "(LIVING|Not)", clinical.data ) )
+}
+if ( is.null(opt$time.column ) ) {
+  time.colname <- "CDE.clinical_time"
+  
+  if ( time.colname %in% colnames( orig.clinical.data ) ) {
+    opt$time.column <- which( colnames( orig.clinical.data ) %in% time.colname )
+    clinical.data <- cbind( clinical.data,
+                           as.numeric( orig.clinical.data[, opt$time.column ] ) )
+  }
+  else {
+    dec.mat <- matrix( NA,
+                       nc=length( time.cols ),
+                       nr=nrow( orig.clinical.data ),
+                       dimnames=list( rownames( orig.clinical.data ),
+                                       time.cols )
+                      )
+    for ( cname in colnames( dec.mat ) ) {
+      if ( cname %in% colnames( orig.clinical.data ) ) {
+        dec.mat[, cname ] <- as.numeric( orig.clinical.data[, cname ] )
+      }
+    }
+                         
+    
+
+    if ( "days_to_last_known_alive" %in% colnames( orig.clinical.data ) ) {
+
+      opt$time.column <- sapply( 1:length( clinical.data ),
+                                 function(i) {
+                                   if ( clinical.data[i] ) {
+                                     ## this is a deceased sample
+                                     return( ifelse( ( !is.na( dec.mat[ i, "days_to_death" ] ) ),
+                                                     dec.mat[ i, "days_to_death" ],
+                                                     ifelse( ( !is.na( dec.mat[ i, "days_to_last_known_alive" ] ) ),
+                                                             dec.mat[ i, "days_to_last_known_alive" ],
+                                                             dec.mat[ i, "days_to_last_followup" ] ) ) )
+                                                   
+                                   }
+                                   else {
+                                     return( max( dec.mat[ i, c( "days_to_last_followup","days_to_last_known_alive") ], na.rm=T ) )
+                                   }
+                                 }
+                                )
+    } else {
+      opt$time.column <- sapply( 1:length( clinical.data ),
+                                 function(i) {
+                                   if ( clinical.data[i] ) {
+                                     ## this is a deceased sample
+                                     return( ifelse( ( !is.na( dec.mat[ i, "days_to_death" ] ) ),
+                                                     dec.mat[ i, "days_to_death" ],
+                                                     dec.mat[ i, "days_to_last_followup" ] ) )
+                                                   
+                                   }
+                                   else {
+                                     return( max( dec.mat[ i, c( "days_to_last_followup") ], na.rm=T ) )
+                                   }
+                                 }
+                                )
+    }
+                                   
+    
+    clinical.data <- cbind( clinical.data,
+                           as.numeric( opt$time.column ) )
+  }
+}
+
+clinical.data <- as.data.frame( clinical.data )
+colnames( clinical.data ) <- c( "status", "time" )
+rownames( clinical.data ) <- rownames( orig.clinical.data )
+
+
+##  check to make sure that the id's are sync'd correctly
+## the default format is to use hyphens to separate the elt's of the name
+## and to only use the 1st 3 elements of the name
+## so we check to see if they're using something else as separators and/or using more than 3 elts
+reformat.ids <- function( ids ) {
+
+  if ( grepl( "TCGA\\.", ids[1] ) ) {
+    ids <- sapply( strsplit( ids, "\\." ), function(x) paste( x[1:3], collapse="-" ) )
+  } else {
+    ## do this just in case there's more than 3 elements to the names
+    if ( grepl( "TCGA-", ids[1] ) ) {
+      ids <- sapply( strsplit( ids, "-" ), function(x) paste( x[1:min( c(3,length(x) ) )], collapse="-" ) )
+    }
+  }
+  return( ids )
+}
+
+
+new.samp.ids <- reformat.ids( rownames( clinical.data ) )
+if ( any( duplicated( new.samp.ids ) ) ) {
+  ## in some cases, we have duplicate sample ids in the raw data after we truncate to
+  ##   the 1st 3 elts in the barcode, so just simplify the data
+  uniqs <- ! duplicated( new.samp.ids )
+  clinical.data <- clinical.data[ uniqs, ]
+  new.samp.ids <- new.samp.ids[ uniqs ]
+}
+  
+rownames( clinical.data ) <- new.samp.ids
+write.table( clinical.data, opt$output.fname, sep="\t", quote=FALSE, col.names=NA )