annotate cluster.tools/format.raw.TCGA.clinical.data.R @ 2:b442996b66ae draft

Uploaded
author peter-waltman
date Wed, 27 Feb 2013 20:17:04 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
b442996b66ae Uploaded
peter-waltman
parents:
diff changeset
#!/usr/bin/env Rscript
##
## formats raw clinical data from TCGA to contain single status & time columns
##
## Input (required):
##    - clinical data
## Input (optional):
##    - status & time columns: (NOT USED IN THIS SCRIPT - see comment below)
##        ideally, a better design would allow a user to specify 1 or more columns
##        to check for the status & time columns - however, due to the necessities
##        required to pre-process the TCGA clinical data, the script would not be
##        generalizable - and for this reason, the TCGA columns are hard-coded.
##
## Output: a re-formatted clinical file containing 3 columns: sample-ID, status & time
##
## Date: August 21, 2012
## Author: Peter Waltman
##

## usage, options and doc go here
## NOTE: the short flag for the clinical file is -d, matching the getopt spec
## used by this script (long form: --clinical.fname)
argspec <- c("format.raw.TCGA.clinical.data.R takes raw TCGA clinical data and re-formats it
into a table that holds a single status column and a single time column per sample

        Usage:
                format.raw.TCGA.clinical.data.R -d <clinical.file>
        Options:
                -o <output file> (tab-delimited (3 col: sample_id <tab> status <tab> time))
              ")

## print the usage text to stderr and quit when --help is the only argument
args <- commandArgs(TRUE)
if ( length( args ) == 1 && args == "--help" ) {
  write( argspec, stderr() )
  q()
}

## Attach a package without printing its start-up messages.
##
## package: the package to attach, given either as a bare (unquoted) name,
##          e.g. lib.load.quiet( getopt ), or as a single character string
##
## The result of library() is passed through; library() signals an error when
## the package is not installed.
lib.load.quiet <- function( package ) {
  ## capture the argument unevaluated, so a bare name becomes its string form
  package <- as.character( substitute( package ) )
  suppressPackageStartupMessages( library( package, character.only=TRUE ) )
}

lib.load.quiet( getopt )

## command-line specification, one row per option:
##   long name, short flag, argument mask, data type
## (getopt argument mask: 1 = argument required, 2 = argument optional)
spec <- matrix( c( "clinical.fname", "d", 1, "character",
                   "output.fname",   "o", 2, "character"
                  ),
                ncol=4,
                byrow=TRUE
               )
opt <- getopt( spec=spec )

## the clinical file is the only required input, so fail early with a readable
## message instead of letting read.delim() choke on a NULL file name
if ( is.null( opt$clinical.fname ) ) {
  stop( "no clinical data file was specified (use -d <clinical.file>); run with --help for usage",
        call.=FALSE )
}

## set some reasonable defaults for the options that are needed,
## but were not specified.
if ( is.null( opt$output.fname ) ) {
  opt$output.fname <- file.path( getwd(), "formated.TCGA.clinical.data" )
}

## read the raw clinical table; as.is=TRUE keeps text columns as character
## vectors instead of converting them to factors
orig.clinical.data <- read.delim( opt$clinical.fname, as.is=TRUE )

## drop rows that are exact copies of an earlier row
orig.clinical.data <- unique( orig.clinical.data )

## the first column holds the sample ids and becomes the row names.  Row names
## must be unique, so when the same id is listed more than once (with differing
## data) only its first row is kept.
samp.ids <- as.character( orig.clinical.data[[ 1 ]] )
dup.ids <- duplicated( samp.ids )
if ( any( dup.ids ) ) {
  warning( "dropping ", sum( dup.ids ),
           " row(s) whose sample id was already seen: ",
           paste( unique( samp.ids[ dup.ids ] ), collapse=", " ),
           call.=FALSE )
  orig.clinical.data <- orig.clinical.data[ !dup.ids, , drop=FALSE ]
  samp.ids <- samp.ids[ !dup.ids ]
}
rownames( orig.clinical.data ) <- samp.ids

## remove the id column; drop=FALSE keeps a data frame even if only one
## column is left
orig.clinical.data <- orig.clinical.data[, -1, drop=FALSE ]

## ugh, some TCGA data sets have all NAs in the "days_to_..." columns
if ( "days_to_last_known_alive" %in% colnames( orig.clinical.data ) ) {
  time.cols <- c( "days_to_death", "days_to_last_followup", "days_to_last_known_alive" )
} else {
  time.cols <- c( "days_to_death", "days_to_last_followup" )
}

## only the time columns that actually exist in this data set can be inspected
present.time.cols <- intersect( time.cols, colnames( orig.clinical.data ) )
if ( length( present.time.cols ) == 0 ) {
  stop( "can't find any of the time columns: ", paste( time.cols, collapse=", " ),
        call.=FALSE )
}

## a sample is usable when at least one of its time columns holds a positive
## number.  Each column is converted to numeric first, because a column that
## contains text placeholders is read as character and would otherwise be
## compared to 0 as a string; such placeholders simply become NA here.
good.samps <- rep( FALSE, nrow( orig.clinical.data ) )
for ( cname in present.time.cols ) {
  days <- suppressWarnings( as.numeric( orig.clinical.data[[ cname ]] ) )
  good.samps <- good.samps | ( !is.na( days ) & days > 0 )
}

orig.clinical.data <- orig.clinical.data[ good.samps, , drop=FALSE ]

## Derive the status (event) indicator: 1 = deceased, 0 = not deceased.
## The status is read from vital_status when that column exists, and from
## days_to_death otherwise.  After this block clinical.data is a numeric 0/1
## vector with one entry per row of orig.clinical.data.
if ( is.null( opt$status.column ) ) {
  if ( "vital_status" %in% colnames( orig.clinical.data ) ) {
    status.colname <- "vital_status"
  } else if ( "days_to_death" %in% colnames( orig.clinical.data ) ) {
    status.colname <- "days_to_death"
  } else {
    stop( "can't find a valid entry with status info - have tried vital_status & days_to_death\n" )
  }
  opt$status.column <- which( colnames( orig.clinical.data ) %in% status.colname )
  raw.status <- orig.clinical.data[[ status.colname ]]

  ## entries that say the patient is living, or that are "[Not ...]" style
  ## placeholders, are not deaths.  grepl() returns FALSE for NA, so missing
  ## and blank entries are tested explicitly and handled like the placeholders
  ## rather than being counted as deaths.
  is.missing <- is.na( raw.status ) | !nzchar( trimws( as.character( raw.status ) ) )
  not.deceased <- is.missing | grepl( "(LIVING|ALIVE|Not)", raw.status, ignore.case=TRUE )
  clinical.data <- as.numeric( !not.deceased )
}
## Derive the time column and bind it to the status vector, so that
## clinical.data becomes a 2-column matrix (status, time).
##   - when a CDE.clinical_time column exists it is used as is
##   - otherwise the time is assembled from the "days_to_..." columns:
##       deceased samples: days_to_death, else days_to_last_known_alive,
##                         else days_to_last_followup
##       other samples:    the larger of days_to_last_followup and
##                         days_to_last_known_alive
if ( is.null( opt$time.column ) ) {
  time.colname <- "CDE.clinical_time"

  if ( time.colname %in% colnames( orig.clinical.data ) ) {
    opt$time.column <- which( colnames( orig.clinical.data ) %in% time.colname )
    clinical.data <- cbind( clinical.data,
                            as.numeric( orig.clinical.data[, opt$time.column ] ) )
  } else {
    n.samps <- nrow( orig.clinical.data )

    ## numeric values of one "days_to_..." column; all NA when the column is
    ## absent.  Text placeholders turn into NA, which is the intent here, so
    ## the coercion warning is silenced.
    days.for <- function( cname ) {
      if ( cname %in% colnames( orig.clinical.data ) ) {
        suppressWarnings( as.numeric( orig.clinical.data[[ cname ]] ) )
      } else {
        rep( NA_real_, n.samps )
      }
    }
    death.days <- days.for( "days_to_death" )
    followup.days <- days.for( "days_to_last_followup" )
    known.alive.days <- days.for( "days_to_last_known_alive" )

    ## deceased samples: first non-missing of death / last known alive / follow-up
    deceased.time <- ifelse( !is.na( death.days ),
                             death.days,
                             ifelse( !is.na( known.alive.days ),
                                     known.alive.days,
                                     followup.days ) )

    ## other samples: latest contact; pmax() yields NA (not -Inf) when both
    ## values are missing
    censored.time <- pmax( followup.days, known.alive.days, na.rm=TRUE )

    opt$time.column <- ifelse( as.logical( clinical.data ), deceased.time, censored.time )

    clinical.data <- cbind( clinical.data,
                            as.numeric( opt$time.column ) )
  }
}

## turn the (status, time) matrix into a data frame that is keyed by the
## sample ids of the filtered input table
clinical.data <- as.data.frame( clinical.data )
colnames( clinical.data ) <- c( "status", "time" )
rownames( clinical.data ) <- rownames( orig.clinical.data )


## Bring TCGA sample ids into the default format, which separates the elements
## of the id with hyphens and keeps only the first 3 elements.
##
## ids: character vector of sample ids
##
## Returns a character vector of the same length as ids.  The separator in use
## is detected from the first id only:
##   - "TCGA." prefix: every id is split on "." and re-joined with "-"
##   - "TCGA-" prefix: every id is split on "-" and truncated
##   - anything else:  the ids are returned unchanged
## Ids with fewer than 3 elements keep the elements they have.
reformat.ids <- function( ids ) {

  if ( grepl( "TCGA\\.", ids[1] ) ) {
    sep.pattern <- "\\."
  } else if ( grepl( "TCGA-", ids[1] ) ) {
    sep.pattern <- "-"
  } else {
    return( ids )
  }

  vapply( strsplit( ids, sep.pattern ),
          function( x ) paste( x[ seq_len( min( 3, length( x ) ) ) ], collapse="-" ),
          character( 1 ) )
}


new.samp.ids <- reformat.ids( rownames( clinical.data ) )

## in some cases, we have duplicate sample ids in the raw data after we truncate to
## the 1st 3 elts in the barcode, so just simplify the data: only the first row
## of each id is kept
uniqs <- ! duplicated( new.samp.ids )
clinical.data <- clinical.data[ uniqs, , drop=FALSE ]
new.samp.ids <- new.samp.ids[ uniqs ]

rownames( clinical.data ) <- new.samp.ids

## tab-delimited output; col.names=NA writes an empty header cell above the
## row-name (sample id) column
write.table( clinical.data, opt$output.fname, sep="\t", quote=FALSE, col.names=NA )