1
|
1 #!/usr/bin/env Rscript
|
|
2 ##
|
|
3 ## formats raw clinical data from TCGA to contain single status & time columns
|
|
4 ##
|
|
5 ## Input (required):
|
|
6 ## - clinical data
|
|
7 ## Input (optional):
|
|
8 ## - status & time columns: (NOT USED IN THIS SCRIPT - see comment below)
|
|
9 ## ideally, a better design would allow a user to specify 1 or more columns
|
|
10 ## to check for the status & time columns - however, due to the necessities
|
|
11 ## required to pre-process the TCGA clinical data, the script would not be
|
|
12 ## generalizable - and for this reason, the TCGA columns are hard-coded.
|
|
13 ##
|
|
14 ## Output: a re-formatted clinical file containing 3 columns: sample-ID, status & time
|
|
15 ##
|
|
16 ## Date: August 21, 2012
|
|
17 ## Author: Peter Waltman
|
|
18 ##
|
|
19
|
|
20 ##usage, options and doc goes here
|
|
## Usage text printed when --help is requested.
## NOTE: the short flags listed here must agree with the getopt spec defined
## further down in this script (clinical file = -d, output file = -o).
argspec <- c("format.raw.TCGA.clinical.data.R takes raw TCGA clinical data and re-formats it
into a 3-column table (sample ID, status & time) for use in survival analyses

    Usage:
        format.raw.TCGA.clinical.data.R -d <clinical.file>
    Options:
        -o <output file> (tab-delimited (3 col: sample_id <tab> status <tab> time))
              ")
args <- commandArgs(TRUE)
## Honour --help wherever it appears on the command line; getopt's spec does
## not declare it, so it has to be intercepted before the options are parsed.
if ("--help" %in% args) {
  write(argspec, stderr())
  quit(save = "no", status = 0)
}
|
|
34
|
|
35 ## some helper fn's
|
|
## Write a matrix / data frame as a tab-delimited file whose first column
## holds the row names under the header "ID".
##
## mat:   matrix or data frame with both row and column names
## fname: path of the file to write
write.2.tab <- function(mat, fname) {
  ## push the column names into the table body as its first row
  with.header <- rbind(colnames(mat), mat)
  ## the first row name belongs to the header row just added, so swap it for "ID"
  id.column <- c("ID", rownames(with.header)[-1])
  full.table <- cbind(id.column, with.header)
  ## the header and IDs are now part of the data, so suppress write.table's own
  write.table(full.table,
              file = fname,
              quote = FALSE,
              sep = "\t",
              row.names = FALSE,
              col.names = FALSE)
}
|
|
43
|
|
## Attach a package without printing its start-up messages.
##
## package: the package name, given unquoted (e.g. lib.load.quiet(getopt))
lib.load.quiet <- function(package) {
  ## recover the name the caller typed rather than evaluating it
  pkg.name <- as.character(substitute(package))
  suppressPackageStartupMessages(library(pkg.name, character.only = TRUE))
}
|
|
lib.load.quiet(getopt)

## getopt spec, one row per option:
##   long name, short flag, argument mask, argument type
spec <- matrix(c("clinical.fname", "d", 1, "character",
                 "output.fname",   "o", 2, "character"),
               ncol = 4,
               byrow = TRUE)
opt <- getopt(spec = spec)

## the clinical file has no sensible default, so fail early with a clear
## message instead of letting read.delim() choke on a NULL / missing path
if (is.null(opt$clinical.fname)) {
  stop("no clinical data file was specified (use -d <clinical.file>)\n")
}
if (!file.exists(opt$clinical.fname)) {
  stop("can't find the clinical data file: ", opt$clinical.fname, "\n")
}

## set some reasonable defaults for the options that are needed,
## but were not specified.
if (is.null(opt$output.fname)) {
  opt$output.fname <- file.path(getwd(), "formated.TCGA.clinical.data")
}

## the sample IDs are not read in via row.names=1, since the raw file may
## contain repeated rows; exact duplicates are collapsed first
orig.clinical.data <- read.delim(opt$clinical.fname, as.is = TRUE)
orig.clinical.data <- unique(orig.clinical.data)

## IDs that are still repeated (same sample, differing annotations) cannot be
## used as row names, so keep only the first record for each
repeated.ids <- duplicated(orig.clinical.data[, 1])
if (any(repeated.ids)) {
  warning(sum(repeated.ids),
          " record(s) with a repeated sample ID were dropped (first occurrence kept)")
  orig.clinical.data <- orig.clinical.data[!repeated.ids, , drop = FALSE]
}

rownames(orig.clinical.data) <- orig.clinical.data[, 1]
## drop=FALSE keeps a data frame even if only one data column remains
orig.clinical.data <- orig.clinical.data[, -1, drop = FALSE]
|
|
67
|
|
68 ## ugh, some TCGA data sets have all NAs in the "days_to_..." columns
|
|
## the columns that may hold a follow-up / survival time; the
## "days_to_last_known_alive" column is only present in some TCGA data sets
if ("days_to_last_known_alive" %in% colnames(orig.clinical.data)) {
  time.cols <- c("days_to_death", "days_to_last_followup", "days_to_last_known_alive")
} else {
  time.cols <- c("days_to_death", "days_to_last_followup")
}

## only the time columns that actually exist in this file can be checked
present.time.cols <- intersect(time.cols, colnames(orig.clinical.data))
if (length(present.time.cols) == 0) {
  stop("can't find any of the time columns - have tried ",
       paste(time.cols, collapse = ", "), "\n")
}

## convert each time column to numeric explicitly: placeholder strings become
## NA, and the comparison below is never done on character data
time.vals <- matrix(NA_real_,
                    nrow = nrow(orig.clinical.data),
                    ncol = length(present.time.cols))
for (j in seq_along(present.time.cols)) {
  time.vals[, j] <-
    suppressWarnings(as.numeric(orig.clinical.data[[present.time.cols[j]]]))
}

## a sample is usable only if at least one of its time values is a positive
## number, i.e. it is neither all-NA nor all <= 0
good.samps <- rowSums(time.vals > 0, na.rm = TRUE) > 0

orig.clinical.data <- orig.clinical.data[good.samps, , drop = FALSE]
|
|
77
|
|
## Derive the 0/1 status vector (1 = deceased, 0 = alive/censored).
## opt$status.column is not part of the getopt spec, so it is always NULL here
## and the TCGA columns below are always the ones consulted.
if (is.null(opt$status.column)) {
  available.cols <- colnames(orig.clinical.data)

  if ("vital_status" %in% available.cols) {
    status.colname <- "vital_status"
    opt$status.column <- match(status.colname, available.cols)
    vital.status <- orig.clinical.data[[opt$status.column]]
    ## alive/censored when the value says so ("LIVING", "Alive") or is a
    ## "[Not ...]" placeholder; matched case-insensitively.
    ## NOTE(review): NA / blank vital_status values are still scored as
    ## deceased, exactly as before - confirm that this is intended.
    clinical.data <- as.numeric(!grepl("(LIVING|ALIVE|NOT)", vital.status,
                                       ignore.case = TRUE))
  } else if ("days_to_death" %in% available.cols) {
    status.colname <- "days_to_death"
    opt$status.column <- match(status.colname, available.cols)
    ## without vital_status, a sample counts as deceased only when a numeric
    ## days_to_death was recorded; NA, blanks and placeholders mean no event
    death.days <- suppressWarnings(as.numeric(orig.clinical.data[[opt$status.column]]))
    clinical.data <- as.numeric(!is.na(death.days))
  } else {
    stop("can't find a valid entry with status info - have tried vital_status & days_to_death\n")
  }
}
|
3
|
96
|
1
|
## Append the time column to the status vector held in clinical.data.
## opt$time.column is not part of the getopt spec, so it is always NULL here.
if (is.null(opt$time.column)) {
  time.colname <- "CDE.clinical_time"

  if (time.colname %in% colnames(orig.clinical.data)) {
    ## a ready-made time column exists, so use it directly
    opt$time.column <- which(colnames(orig.clinical.data) %in% time.colname)
    clinical.data <- cbind(clinical.data,
                           as.numeric(orig.clinical.data[, opt$time.column]))
  } else {
    ## numeric copy of the time columns; columns absent from the file stay NA
    dec.mat <- matrix(NA_real_,
                      nrow = nrow(orig.clinical.data),
                      ncol = length(time.cols),
                      dimnames = list(rownames(orig.clinical.data),
                                      time.cols))
    for (cname in colnames(dec.mat)) {
      if (cname %in% colnames(orig.clinical.data)) {
        ## placeholder strings are expected here and simply become NA
        dec.mat[, cname] <- suppressWarnings(as.numeric(orig.clinical.data[, cname]))
      }
    }

    ## deceased samples: take the first non-NA value, in this order of preference
    deceased.cols <- intersect(c("days_to_death",
                                 "days_to_last_known_alive",
                                 "days_to_last_followup"),
                               colnames(dec.mat))
    deceased.time <- rep(NA_real_, nrow(dec.mat))
    for (cname in deceased.cols) {
      unfilled <- is.na(deceased.time)
      deceased.time[unfilled] <- dec.mat[unfilled, cname]
    }

    ## living samples: take the latest of the follow-up style columns;
    ## pmax(na.rm=TRUE) yields NA (not -Inf) when every value is missing
    alive.cols <- intersect(c("days_to_last_followup",
                              "days_to_last_known_alive"),
                            colnames(dec.mat))
    alive.time <- rep(NA_real_, nrow(dec.mat))
    for (cname in alive.cols) {
      alive.time <- pmax(alive.time, dec.mat[, cname], na.rm = TRUE)
    }

    ## clinical.data currently holds the 0/1 status (1 = deceased)
    opt$time.column <- ifelse(as.logical(clinical.data), deceased.time, alive.time)

    clinical.data <- cbind(clinical.data,
                           as.numeric(opt$time.column))
  }
}
|
|
160
|
|
## turn the status/time matrix into a labelled data frame, keyed by the
## sample IDs of the (filtered) original clinical data
clinical.data <- setNames(as.data.frame(clinical.data), c("status", "time"))
row.names(clinical.data) <- rownames(orig.clinical.data)
|
|
164
|
|
165
|
|
166 ## check to make sure that the id's are sync'd correctly
|
|
167 ## the default format is to use hyphens to separate the elt's of the name
|
|
168 ## and to only use the 1st 3 elements of the name
|
|
169 ## so we check to see if they're using something else as separators and/or using more than 3 elts
|
|
## Normalise TCGA sample IDs to the hyphen-separated form, truncated to (at
## most) the first 3 elements of the barcode.
##
## ids: character vector of sample IDs; the separator in use is detected from
##      the first ID only ("TCGA." -> dots, "TCGA-" -> hyphens)
##
## Returns a character vector of the same length. IDs that do not look like
## TCGA barcodes are returned untouched, and IDs with fewer than 3 elements
## are kept as they are instead of being padded with "NA".
reformat.ids <- function(ids) {
  if (length(ids) == 0) {
    return(ids)
  }

  if (grepl("TCGA\\.", ids[1])) {
    separator <- "\\."
  } else if (grepl("TCGA-", ids[1])) {
    ## already hyphenated, but there may be more than 3 elements to the names
    separator <- "-"
  } else {
    return(ids)
  }

  vapply(strsplit(ids, separator),
         function(x) paste(x[seq_len(min(3, length(x)))], collapse = "-"),
         character(1))
}
|
|
182
|
|
183
|
|
## bring the sample IDs into the default (hyphenated, 3-element) format
new.samp.ids <- reformat.ids(rownames(clinical.data))

## truncating the barcodes can make several records collapse onto the same ID;
## only the first record for each ID is kept (a no-op when all IDs are unique)
first.occurrence <- !duplicated(new.samp.ids)
clinical.data <- clinical.data[first.occurrence, ]
new.samp.ids <- new.samp.ids[first.occurrence]

rownames(clinical.data) <- new.samp.ids

write.2.tab(clinical.data, opt$output.fname)
|
|
196 ##write.table( clinical.data, opt$output.fname, sep="\t", quote=FALSE, col.names=NA )
|