Mercurial > repos > peter-waltman > ucsc_cluster_tools2
comparison cluster.tools/extract.TCGA.survival.data.R @ 1:dddfeedb85af draft
Uploaded
author | peter-waltman |
---|---|
date | Fri, 01 Mar 2013 10:16:53 -0500 |
parents | |
children | 563832f48c08 |
comparison
equal
deleted
inserted
replaced
0:0decf3fd54bc | 1:dddfeedb85af |
---|---|
1 #!/usr/bin/env Rscript | |
2 ## | |
3 ## formats raw clinical data from TCGA to contain a single status & time colums | |
4 ## | |
5 ## Input (required): | |
6 ## - clinical data | |
7 ## Input (optional): | |
8 ## - status & time columns: (NOT USED IN THIS SCRIPT - see comment below) | |
9 ## ideally, a better design would allow a user to specify 1 or more columns | |
10 ## to check for the status & time columns - however, due to the necessities | |
11 ## required to pre-process the TCGA clinical data, the script would not be | |
12 ## generalizeable - and for this reason, the TCGA columns are hard-coded. | |
13 ## | |
14 ## Output: a re-formatted clinical file containing 3 columns: sample-ID, status & time | |
15 ## | |
16 ## Date: August 21, 2012 | |
17 ## Author: Peter Waltman | |
18 ## | |
19 | |
20 ##usage, options and doc goes here | |
21 argspec <- c("format.raw.TCGA.clinical.data.R takes a clustering from ConsensusClusterPlus and clinical survival data | |
22 and generates a KM-plot, along with the log-rank p-values | |
23 | |
24 Usage: | |
25 format.raw.TCGA.clinical.data.R -c <clinical.file> | |
26 Options: | |
27 -o <output file> (tab-delimited (3 col: sample_id <tab> status <tab> time)) | |
28 ") | |
29 args <- commandArgs(TRUE) | |
30 if ( length( args ) == 1 && args =="--help") { | |
31 write(argspec, stderr()) | |
32 q(); | |
33 } | |
34 | |
35 ## some helper fn's | |
36 write.2.tab <- function( mat, | |
37 fname ) { | |
38 mat <- rbind( colnames( mat ), mat ) | |
39 mat <- cbind( c( "ID", rownames( mat )[-1] ), | |
40 mat ) | |
41 write.table( mat, fname, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE ) | |
42 } | |
43 | |
44 lib.load.quiet <- function( package ) { | |
45 package <- as.character(substitute(package)) | |
46 suppressPackageStartupMessages( do.call( "library", list( package=package ) ) ) | |
47 } | |
48 lib.load.quiet(getopt) | |
49 | |
50 spec <- matrix( c( "clinical.fname", "d", 1, "character", | |
51 "output.fname", "o", 2, "character" | |
52 ), | |
53 ncol=4, | |
54 byrow=TRUE | |
55 ) | |
56 opt <- getopt( spec=spec ) | |
57 | |
58 ##set some reasonable defaults for the options that are needed, | |
59 ##but were not specified. | |
60 if ( is.null(opt$output.fname ) ) { opt$output.fname <-file.path( getwd(), "formated.TCGA.clinical.data" ) } | |
61 | |
62 ##orig.clinical.data <- read.delim( opt$clinical.fname, as.is=TRUE, row.names=1 ) | |
63 orig.clinical.data <- read.delim( opt$clinical.fname, as.is=TRUE ) | |
64 orig.clinical.data <- unique( orig.clinical.data ) | |
65 rownames( orig.clinical.data ) <- orig.clinical.data[,1] | |
66 orig.clinical.data <- orig.clinical.data[, -1 ] | |
67 | |
68 ## ugh, some TCGA data sets have all NAs in the "days_to_..." columns | |
69 if ( "days_to_last_known_alive" %in% colnames( orig.clinical.data ) ) { | |
70 time.cols <- c( "days_to_death", "days_to_last_followup", "days_to_last_known_alive" ) | |
71 } else { | |
72 time.cols <- c( "days_to_death", "days_to_last_followup" ) | |
73 } | |
74 good.samps <- ! apply( orig.clinical.data[, time.cols ], 1, function(x) all( is.na(x) ) | all( x <= 0, na.rm=T ) ) | |
75 | |
76 orig.clinical.data <- orig.clinical.data[ good.samps, ] | |
77 | |
78 if ( is.null(opt$status.column ) ) { | |
79 status.colname <- "vital_status" | |
80 if ( status.colname %in% colnames( orig.clinical.data ) ) { | |
81 opt$status.column <- which( colnames( orig.clinical.data ) %in% status.colname ) | |
82 clinical.data <- orig.clinical.data[ , opt$status.column ] | |
83 } | |
84 else { | |
85 status.colname <- "days_to_death" | |
86 if ( status.colname %in% colnames( orig.clinical.data ) ) { | |
87 opt$status.column <- which( colnames( orig.clinical.data ) %in% status.colname ) | |
88 clinical.data <- orig.clinical.data[ , opt$status.column ] | |
89 } | |
90 else { | |
91 stop( "can't find a valid entry with status info - have tried vital_status & days_to_death\n" ) | |
92 } | |
93 } | |
94 clinical.data <- as.numeric( ! grepl( "(LIVING|Not)", clinical.data ) ) | |
95 } | |
96 if ( is.null(opt$time.column ) ) { | |
97 time.colname <- "CDE.clinical_time" | |
98 | |
99 if ( time.colname %in% colnames( orig.clinical.data ) ) { | |
100 opt$time.column <- which( colnames( orig.clinical.data ) %in% time.colname ) | |
101 clinical.data <- cbind( clinical.data, | |
102 as.numeric( orig.clinical.data[, opt$time.column ] ) ) | |
103 } | |
104 else { | |
105 dec.mat <- matrix( NA, | |
106 nc=length( time.cols ), | |
107 nr=nrow( orig.clinical.data ), | |
108 dimnames=list( rownames( orig.clinical.data ), | |
109 time.cols ) | |
110 ) | |
111 for ( cname in colnames( dec.mat ) ) { | |
112 if ( cname %in% colnames( orig.clinical.data ) ) { | |
113 dec.mat[, cname ] <- as.numeric( orig.clinical.data[, cname ] ) | |
114 } | |
115 } | |
116 | |
117 | |
118 | |
119 if ( "days_to_last_known_alive" %in% colnames( orig.clinical.data ) ) { | |
120 | |
121 opt$time.column <- sapply( 1:length( clinical.data ), | |
122 function(i) { | |
123 if ( clinical.data[i] ) { | |
124 ## this is a deceased sample | |
125 return( ifelse( ( !is.na( dec.mat[ i, "days_to_death" ] ) ), | |
126 dec.mat[ i, "days_to_death" ], | |
127 ifelse( ( !is.na( dec.mat[ i, "days_to_last_known_alive" ] ) ), | |
128 dec.mat[ i, "days_to_last_known_alive" ], | |
129 dec.mat[ i, "days_to_last_followup" ] ) ) ) | |
130 | |
131 } | |
132 else { | |
133 return( max( dec.mat[ i, c( "days_to_last_followup","days_to_last_known_alive") ], na.rm=T ) ) | |
134 } | |
135 } | |
136 ) | |
137 } else { | |
138 opt$time.column <- sapply( 1:length( clinical.data ), | |
139 function(i) { | |
140 if ( clinical.data[i] ) { | |
141 ## this is a deceased sample | |
142 return( ifelse( ( !is.na( dec.mat[ i, "days_to_death" ] ) ), | |
143 dec.mat[ i, "days_to_death" ], | |
144 dec.mat[ i, "days_to_last_followup" ] ) ) | |
145 | |
146 } | |
147 else { | |
148 return( max( dec.mat[ i, c( "days_to_last_followup") ], na.rm=T ) ) | |
149 } | |
150 } | |
151 ) | |
152 } | |
153 | |
154 | |
155 clinical.data <- cbind( clinical.data, | |
156 as.numeric( opt$time.column ) ) | |
157 } | |
158 } | |
159 | |
160 clinical.data <- as.data.frame( clinical.data ) | |
161 colnames( clinical.data ) <- c( "status", "time" ) | |
162 rownames( clinical.data ) <- rownames( orig.clinical.data ) | |
163 | |
164 | |
165 ## check to make sure that the id's are sync'd correctly | |
166 ## the default format is to use hyphens to separate the elt's of the name | |
167 ## and to only use the 1st 3 elements of the name | |
168 ## so we check to see if they're using something else as separators and/or using more than 3 elts | |
169 reformat.ids <- function( ids ) { | |
170 | |
171 if ( grepl( "TCGA\\.", ids[1] ) ) { | |
172 ids <- sapply( strsplit( ids, "\\." ), function(x) paste( x[1:3], collapse="-" ) ) | |
173 } else { | |
174 ## do this just in case there's more than 3 elements to the names | |
175 if ( grepl( "TCGA-", ids[1] ) ) { | |
176 ids <- sapply( strsplit( ids, "-" ), function(x) paste( x[1:min( c(3,length(x) ) )], collapse="-" ) ) | |
177 } | |
178 } | |
179 return( ids ) | |
180 } | |
181 | |
182 | |
183 new.samp.ids <- reformat.ids( rownames( clinical.data ) ) | |
184 if ( any( duplicated( new.samp.ids ) ) ) { | |
185 ## in some cases, we have duplicate sample ids in the raw data after we truncate to | |
186 ## the 1st 3 elts in the barcode, so just simplify the data | |
187 uniqs <- ! duplicated( new.samp.ids ) | |
188 clinical.data <- clinical.data[ uniqs, ] | |
189 new.samp.ids <- new.samp.ids[ uniqs ] | |
190 } | |
191 | |
192 rownames( clinical.data ) <- new.samp.ids | |
193 | |
194 write.2.tab( clinical.data, opt$output.fname ) | |
195 ##write.table( clinical.data, opt$output.fname, sep="\t", quote=FALSE, col.names=NA ) |