Mercurial > repos > peter-waltman > ucsc_cluster_tools
comparison cluster.tools/format.raw.TCGA.clinical.data.R @ 2:b442996b66ae draft
Uploaded
| author | peter-waltman |
|---|---|
| date | Wed, 27 Feb 2013 20:17:04 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:e25d2bece0a2 | 2:b442996b66ae |
|---|---|
| 1 #!/usr/bin/env Rscript | |
| 2 ## | |
| 3 ## formats raw clinical data from TCGA to contain a single status & time colums | |
| 4 ## | |
| 5 ## Input (required): | |
| 6 ## - clinical data | |
| 7 ## Input (optional): | |
| 8 ## - status & time columns: (NOT USED IN THIS SCRIPT - see comment below) | |
| 9 ## ideally, a better design would allow a user to specify 1 or more columns | |
| 10 ## to check for the status & time columns - however, due to the necessities | |
| 11 ## required to pre-process the TCGA clinical data, the script would not be | |
| 12 ## generalizeable - and for this reason, the TCGA columns are hard-coded. | |
| 13 ## | |
| 14 ## Output: a re-formatted clinical file containing 3 columns: sample-ID, status & time | |
| 15 ## | |
| 16 ## Date: August 21, 2012 | |
| 17 ## Author: Peter Waltman | |
| 18 ## | |
| 19 | |
| 20 ##usage, options and doc goes here | |
| 21 argspec <- c("format.raw.TCGA.clinical.data.R takes a clustering from ConsensusClusterPlus and clinical survival data | |
| 22 and generates a KM-plot, along with the log-rank p-values | |
| 23 | |
| 24 Usage: | |
| 25 format.raw.TCGA.clinical.data.R -c <clinical.file> | |
| 26 Options: | |
| 27 -o <output file> (tab-delimited (3 col: sample_id <tab> status <tab> time)) | |
| 28 ") | |
| 29 args <- commandArgs(TRUE) | |
| 30 if ( length( args ) == 1 && args =="--help") { | |
| 31 write(argspec, stderr()) | |
| 32 q(); | |
| 33 } | |
| 34 | |
| 35 lib.load.quiet <- function( package ) { | |
| 36 package <- as.character(substitute(package)) | |
| 37 suppressPackageStartupMessages( do.call( "library", list( package=package ) ) ) | |
| 38 } | |
| 39 lib.load.quiet(getopt) | |
| 40 | |
| 41 spec <- matrix( c( "clinical.fname", "d", 1, "character", | |
| 42 "output.fname", "o", 2, "character" | |
| 43 ), | |
| 44 ncol=4, | |
| 45 byrow=TRUE | |
| 46 ) | |
| 47 opt <- getopt( spec=spec ) | |
| 48 save.image( "/tmp/format.dbg.rda") | |
| 49 | |
| 50 ##set some reasonable defaults for the options that are needed, | |
| 51 ##but were not specified. | |
| 52 if ( is.null(opt$output.fname ) ) { opt$output.fname <-file.path( getwd(), "formated.TCGA.clinical.data" ) } | |
| 53 | |
| 54 ##orig.clinical.data <- read.delim( opt$clinical.fname, as.is=TRUE, row.names=1 ) | |
| 55 orig.clinical.data <- read.delim( opt$clinical.fname, as.is=TRUE ) | |
| 56 orig.clinical.data <- unique( orig.clinical.data ) | |
| 57 rownames( orig.clinical.data ) <- orig.clinical.data[,1] | |
| 58 orig.clinical.data <- orig.clinical.data[, -1 ] | |
| 59 | |
| 60 ## ugh, some TCGA data sets have all NAs in the "days_to_..." columns | |
| 61 if ( "days_to_last_known_alive" %in% colnames( orig.clinical.data ) ) { | |
| 62 time.cols <- c( "days_to_death", "days_to_last_followup", "days_to_last_known_alive" ) | |
| 63 } else { | |
| 64 time.cols <- c( "days_to_death", "days_to_last_followup" ) | |
| 65 } | |
| 66 good.samps <- ! apply( orig.clinical.data[, time.cols ], 1, function(x) all( is.na(x) ) | all( x <= 0, na.rm=T ) ) | |
| 67 | |
| 68 orig.clinical.data <- orig.clinical.data[ good.samps, ] | |
| 69 | |
| 70 if ( is.null(opt$status.column ) ) { | |
| 71 status.colname <- "vital_status" | |
| 72 if ( status.colname %in% colnames( orig.clinical.data ) ) { | |
| 73 opt$status.column <- which( colnames( orig.clinical.data ) %in% status.colname ) | |
| 74 clinical.data <- orig.clinical.data[ , opt$status.column ] | |
| 75 } | |
| 76 else { | |
| 77 status.colname <- "days_to_death" | |
| 78 if ( status.colname %in% colnames( orig.clinical.data ) ) { | |
| 79 opt$status.column <- which( colnames( orig.clinical.data ) %in% status.colname ) | |
| 80 clinical.data <- orig.clinical.data[ , opt$status.column ] | |
| 81 } | |
| 82 else { | |
| 83 stop( "can't find a valid entry with status info - have tried vital_status & days_to_death\n" ) | |
| 84 } | |
| 85 } | |
| 86 clinical.data <- as.numeric( ! grepl( "(LIVING|Not)", clinical.data ) ) | |
| 87 } | |
| 88 if ( is.null(opt$time.column ) ) { | |
| 89 time.colname <- "CDE.clinical_time" | |
| 90 | |
| 91 if ( time.colname %in% colnames( orig.clinical.data ) ) { | |
| 92 opt$time.column <- which( colnames( orig.clinical.data ) %in% time.colname ) | |
| 93 clinical.data <- cbind( clinical.data, | |
| 94 as.numeric( orig.clinical.data[, opt$time.column ] ) ) | |
| 95 } | |
| 96 else { | |
| 97 dec.mat <- matrix( NA, | |
| 98 nc=length( time.cols ), | |
| 99 nr=nrow( orig.clinical.data ), | |
| 100 dimnames=list( rownames( orig.clinical.data ), | |
| 101 time.cols ) | |
| 102 ) | |
| 103 for ( cname in colnames( dec.mat ) ) { | |
| 104 if ( cname %in% colnames( orig.clinical.data ) ) { | |
| 105 dec.mat[, cname ] <- as.numeric( orig.clinical.data[, cname ] ) | |
| 106 } | |
| 107 } | |
| 108 | |
| 109 | |
| 110 | |
| 111 if ( "days_to_last_known_alive" %in% colnames( orig.clinical.data ) ) { | |
| 112 | |
| 113 opt$time.column <- sapply( 1:length( clinical.data ), | |
| 114 function(i) { | |
| 115 if ( clinical.data[i] ) { | |
| 116 ## this is a deceased sample | |
| 117 return( ifelse( ( !is.na( dec.mat[ i, "days_to_death" ] ) ), | |
| 118 dec.mat[ i, "days_to_death" ], | |
| 119 ifelse( ( !is.na( dec.mat[ i, "days_to_last_known_alive" ] ) ), | |
| 120 dec.mat[ i, "days_to_last_known_alive" ], | |
| 121 dec.mat[ i, "days_to_last_followup" ] ) ) ) | |
| 122 | |
| 123 } | |
| 124 else { | |
| 125 return( max( dec.mat[ i, c( "days_to_last_followup","days_to_last_known_alive") ], na.rm=T ) ) | |
| 126 } | |
| 127 } | |
| 128 ) | |
| 129 } else { | |
| 130 opt$time.column <- sapply( 1:length( clinical.data ), | |
| 131 function(i) { | |
| 132 if ( clinical.data[i] ) { | |
| 133 ## this is a deceased sample | |
| 134 return( ifelse( ( !is.na( dec.mat[ i, "days_to_death" ] ) ), | |
| 135 dec.mat[ i, "days_to_death" ], | |
| 136 dec.mat[ i, "days_to_last_followup" ] ) ) | |
| 137 | |
| 138 } | |
| 139 else { | |
| 140 return( max( dec.mat[ i, c( "days_to_last_followup") ], na.rm=T ) ) | |
| 141 } | |
| 142 } | |
| 143 ) | |
| 144 } | |
| 145 | |
| 146 | |
| 147 clinical.data <- cbind( clinical.data, | |
| 148 as.numeric( opt$time.column ) ) | |
| 149 } | |
| 150 } | |
| 151 | |
| 152 clinical.data <- as.data.frame( clinical.data ) | |
| 153 colnames( clinical.data ) <- c( "status", "time" ) | |
| 154 rownames( clinical.data ) <- rownames( orig.clinical.data ) | |
| 155 | |
| 156 | |
| 157 ## check to make sure that the id's are sync'd correctly | |
| 158 ## the default format is to use hyphens to separate the elt's of the name | |
| 159 ## and to only use the 1st 3 elements of the name | |
| 160 ## so we check to see if they're using something else as separators and/or using more than 3 elts | |
| 161 reformat.ids <- function( ids ) { | |
| 162 | |
| 163 if ( grepl( "TCGA\\.", ids[1] ) ) { | |
| 164 ids <- sapply( strsplit( ids, "\\." ), function(x) paste( x[1:3], collapse="-" ) ) | |
| 165 } else { | |
| 166 ## do this just in case there's more than 3 elements to the names | |
| 167 if ( grepl( "TCGA-", ids[1] ) ) { | |
| 168 ids <- sapply( strsplit( ids, "-" ), function(x) paste( x[1:min( c(3,length(x) ) )], collapse="-" ) ) | |
| 169 } | |
| 170 } | |
| 171 return( ids ) | |
| 172 } | |
| 173 | |
| 174 | |
| 175 new.samp.ids <- reformat.ids( rownames( clinical.data ) ) | |
| 176 if ( any( duplicated( new.samp.ids ) ) ) { | |
| 177 ## in some cases, we have duplicate sample ids in the raw data after we truncate to | |
| 178 ## the 1st 3 elts in the barcode, so just simplify the data | |
| 179 uniqs <- ! duplicated( new.samp.ids ) | |
| 180 clinical.data <- clinical.data[ uniqs, ] | |
| 181 new.samp.ids <- new.samp.ids[ uniqs ] | |
| 182 } | |
| 183 | |
| 184 rownames( clinical.data ) <- new.samp.ids | |
| 185 write.table( clinical.data, opt$output.fname, sep="\t", quote=FALSE, col.names=NA ) |
