Mercurial > repos > peter-waltman > ucsc_cluster_tools2
diff cluster.tools/extract.TCGA.survival.data.R @ 1:dddfeedb85af draft
Uploaded
author | peter-waltman |
---|---|
date | Fri, 01 Mar 2013 10:16:53 -0500 |
parents | |
children | 563832f48c08 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cluster.tools/extract.TCGA.survival.data.R Fri Mar 01 10:16:53 2013 -0500 @@ -0,0 +1,195 @@ +#!/usr/bin/env Rscript +## +## formats raw clinical data from TCGA to contain a single status & time colums +## +## Input (required): +## - clinical data +## Input (optional): +## - status & time columns: (NOT USED IN THIS SCRIPT - see comment below) +## ideally, a better design would allow a user to specify 1 or more columns +## to check for the status & time columns - however, due to the necessities +## required to pre-process the TCGA clinical data, the script would not be +## generalizeable - and for this reason, the TCGA columns are hard-coded. +## +## Output: a re-formatted clinical file containing 3 columns: sample-ID, status & time +## +## Date: August 21, 2012 +## Author: Peter Waltman +## + +##usage, options and doc goes here +argspec <- c("format.raw.TCGA.clinical.data.R takes a clustering from ConsensusClusterPlus and clinical survival data +and generates a KM-plot, along with the log-rank p-values + + Usage: + format.raw.TCGA.clinical.data.R -c <clinical.file> + Options: + -o <output file> (tab-delimited (3 col: sample_id <tab> status <tab> time)) + ") +args <- commandArgs(TRUE) +if ( length( args ) == 1 && args =="--help") { + write(argspec, stderr()) + q(); +} + +## some helper fn's +write.2.tab <- function( mat, + fname ) { + mat <- rbind( colnames( mat ), mat ) + mat <- cbind( c( "ID", rownames( mat )[-1] ), + mat ) + write.table( mat, fname, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE ) +} + +lib.load.quiet <- function( package ) { + package <- as.character(substitute(package)) + suppressPackageStartupMessages( do.call( "library", list( package=package ) ) ) +} +lib.load.quiet(getopt) + +spec <- matrix( c( "clinical.fname", "d", 1, "character", + "output.fname", "o", 2, "character" + ), + ncol=4, + byrow=TRUE + ) +opt <- getopt( spec=spec ) + +##set some reasonable defaults for the options that are needed, +##but were not specified. +if ( is.null(opt$output.fname ) ) { opt$output.fname <-file.path( getwd(), "formated.TCGA.clinical.data" ) } + +##orig.clinical.data <- read.delim( opt$clinical.fname, as.is=TRUE, row.names=1 ) +orig.clinical.data <- read.delim( opt$clinical.fname, as.is=TRUE ) +orig.clinical.data <- unique( orig.clinical.data ) +rownames( orig.clinical.data ) <- orig.clinical.data[,1] +orig.clinical.data <- orig.clinical.data[, -1 ] + +## ugh, some TCGA data sets have all NAs in the "days_to_..." columns +if ( "days_to_last_known_alive" %in% colnames( orig.clinical.data ) ) { + time.cols <- c( "days_to_death", "days_to_last_followup", "days_to_last_known_alive" ) +} else { + time.cols <- c( "days_to_death", "days_to_last_followup" ) +} +good.samps <- ! apply( orig.clinical.data[, time.cols ], 1, function(x) all( is.na(x) ) | all( x <= 0, na.rm=T ) ) + +orig.clinical.data <- orig.clinical.data[ good.samps, ] + +if ( is.null(opt$status.column ) ) { + status.colname <- "vital_status" + if ( status.colname %in% colnames( orig.clinical.data ) ) { + opt$status.column <- which( colnames( orig.clinical.data ) %in% status.colname ) + clinical.data <- orig.clinical.data[ , opt$status.column ] + } + else { + status.colname <- "days_to_death" + if ( status.colname %in% colnames( orig.clinical.data ) ) { + opt$status.column <- which( colnames( orig.clinical.data ) %in% status.colname ) + clinical.data <- orig.clinical.data[ , opt$status.column ] + } + else { + stop( "can't find a valid entry with status info - have tried vital_status & days_to_death\n" ) + } + } + clinical.data <- as.numeric( ! grepl( "(LIVING|Not)", clinical.data ) ) +} +if ( is.null(opt$time.column ) ) { + time.colname <- "CDE.clinical_time" + + if ( time.colname %in% colnames( orig.clinical.data ) ) { + opt$time.column <- which( colnames( orig.clinical.data ) %in% time.colname ) + clinical.data <- cbind( clinical.data, + as.numeric( orig.clinical.data[, opt$time.column ] ) ) + } + else { + dec.mat <- matrix( NA, + nc=length( time.cols ), + nr=nrow( orig.clinical.data ), + dimnames=list( rownames( orig.clinical.data ), + time.cols ) + ) + for ( cname in colnames( dec.mat ) ) { + if ( cname %in% colnames( orig.clinical.data ) ) { + dec.mat[, cname ] <- as.numeric( orig.clinical.data[, cname ] ) + } + } + + + + if ( "days_to_last_known_alive" %in% colnames( orig.clinical.data ) ) { + + opt$time.column <- sapply( 1:length( clinical.data ), + function(i) { + if ( clinical.data[i] ) { + ## this is a deceased sample + return( ifelse( ( !is.na( dec.mat[ i, "days_to_death" ] ) ), + dec.mat[ i, "days_to_death" ], + ifelse( ( !is.na( dec.mat[ i, "days_to_last_known_alive" ] ) ), + dec.mat[ i, "days_to_last_known_alive" ], + dec.mat[ i, "days_to_last_followup" ] ) ) ) + + } + else { + return( max( dec.mat[ i, c( "days_to_last_followup","days_to_last_known_alive") ], na.rm=T ) ) + } + } + ) + } else { + opt$time.column <- sapply( 1:length( clinical.data ), + function(i) { + if ( clinical.data[i] ) { + ## this is a deceased sample + return( ifelse( ( !is.na( dec.mat[ i, "days_to_death" ] ) ), + dec.mat[ i, "days_to_death" ], + dec.mat[ i, "days_to_last_followup" ] ) ) + + } + else { + return( max( dec.mat[ i, c( "days_to_last_followup") ], na.rm=T ) ) + } + } + ) + } + + + clinical.data <- cbind( clinical.data, + as.numeric( opt$time.column ) ) + } +} + +clinical.data <- as.data.frame( clinical.data ) +colnames( clinical.data ) <- c( "status", "time" ) +rownames( clinical.data ) <- rownames( orig.clinical.data ) + + +## check to make sure that the id's are sync'd correctly +## the default format is to use hyphens to separate the elt's of the name +## and to only use the 1st 3 elements of the name +## so we check to see if they're using something else as separators and/or using more than 3 elts +reformat.ids <- function( ids ) { + + if ( grepl( "TCGA\\.", ids[1] ) ) { + ids <- sapply( strsplit( ids, "\\." ), function(x) paste( x[1:3], collapse="-" ) ) + } else { + ## do this just in case there's more than 3 elements to the names + if ( grepl( "TCGA-", ids[1] ) ) { + ids <- sapply( strsplit( ids, "-" ), function(x) paste( x[1:min( c(3,length(x) ) )], collapse="-" ) ) + } + } + return( ids ) +} + + +new.samp.ids <- reformat.ids( rownames( clinical.data ) ) +if ( any( duplicated( new.samp.ids ) ) ) { + ## in some cases, we have duplicate sample ids in the raw data after we truncate to + ## the 1st 3 elts in the barcode, so just simplify the data + uniqs <- ! duplicated( new.samp.ids ) + clinical.data <- clinical.data[ uniqs, ] + new.samp.ids <- new.samp.ids[ uniqs ] +} + +rownames( clinical.data ) <- new.samp.ids + +write.2.tab( clinical.data, opt$output.fname ) +##write.table( clinical.data, opt$output.fname, sep="\t", quote=FALSE, col.names=NA )