1
|
1 #!/usr/bin/env Rscript
|
|
2 ##
|
|
3 ## formats raw clinical data from TCGA to contain a single status & time colums
|
|
4 ##
|
|
5 ## Input (required):
|
|
6 ## - clinical data
|
|
7 ## Input (optional):
|
|
8 ## - status & time columns: (NOT USED IN THIS SCRIPT - see comment below)
|
|
9 ## ideally, a better design would allow a user to specify 1 or more columns
|
|
10 ## to check for the status & time columns - however, due to the necessities
|
|
11 ## required to pre-process the TCGA clinical data, the script would not be
|
|
12 ## generalizeable - and for this reason, the TCGA columns are hard-coded.
|
|
13 ##
|
|
14 ## Output: a re-formatted clinical file containing 3 columns: sample-ID, status & time
|
|
15 ##
|
|
16 ## Date: August 21, 2012
|
|
17 ## Author: Peter Waltman
|
|
18 ##
|
|
19
|
|
20 ##usage, options and doc goes here
|
|
21 argspec <- c("format.raw.TCGA.clinical.data.R takes a clustering from ConsensusClusterPlus and clinical survival data
|
|
22 and generates a KM-plot, along with the log-rank p-values
|
|
23
|
|
24 Usage:
|
|
25 format.raw.TCGA.clinical.data.R -c <clinical.file>
|
|
26 Options:
|
|
27 -o <output file> (tab-delimited (3 col: sample_id <tab> status <tab> time))
|
|
28 ")
|
|
29 args <- commandArgs(TRUE)
|
|
30 if ( length( args ) == 1 && args =="--help") {
|
|
31 write(argspec, stderr())
|
|
32 q();
|
|
33 }
|
|
34
|
|
35 ## some helper fn's
|
|
36 write.2.tab <- function( mat,
|
|
37 fname ) {
|
|
38 mat <- rbind( colnames( mat ), mat )
|
|
39 mat <- cbind( c( "ID", rownames( mat )[-1] ),
|
|
40 mat )
|
|
41 write.table( mat, fname, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE )
|
|
42 }
|
|
43
|
|
44 fix.genes <- function( mat ) {
|
|
45 ## filter out the unknowns
|
|
46 mat <- mat[ ( ! grepl( "^\\?", rownames( mat ) ) ), ]
|
|
47
|
|
48 genes <- rownames( mat )
|
|
49 ## select the HUGO name portion of the "gene-names" that are in the TCGA matrices
|
|
50 genes <- sapply( strsplit( genes, "\\|" ), function(x) x[1] )
|
|
51
|
|
52 if ( any( duplicated( genes ) ) ) {
|
|
53 dupes <- unique( genes[ duplicated( genes ) ] )
|
|
54 no.duped.mat <- which( ! genes %in% dupes )
|
|
55 no.duped.mat <- mat[ no.duped.mat, , drop=FALSE ]
|
|
56
|
|
57 ## now merge in the duplicates
|
|
58 for ( dup in dupes ) {
|
|
59 duped.mat <- mat[ grepl( paste( "^", dup, sep="" ), rownames( mat ) ), ]
|
|
60 duped.mat <- apply( duped.mat, 2, mean, na.rm=TRUE )
|
|
61 no.duped.mat <- rbind( no.duped.mat, duped.mat )
|
|
62 ## weird bit of follow up to make sure the new row/gene is named correctly
|
|
63 rownames( no.duped.mat ) <- c( rownames( no.duped.mat )[ -nrow(no.duped.mat) ],
|
|
64 paste( dup, "|0000", sep="" ) )
|
|
65 }
|
|
66 mat <- no.duped.mat; rm( no.duped.mat ); gc()
|
|
67 }
|
|
68
|
|
69 rownames( mat ) <- sapply( strsplit( rownames( mat ), "\\|" ), function(x) x[1] )
|
|
70 return( mat )
|
|
71 }
|
|
72
|
|
73 fix.sample.ids <- function( mat ) {
|
|
74 colnames( mat ) <- gsub( "\\.", "-", colnames( mat ) )
|
|
75 return( mat )
|
|
76 }
|
|
77
|
|
78 lib.load.quiet <- function( package ) {
|
|
79 package <- as.character(substitute(package))
|
|
80 suppressPackageStartupMessages( do.call( "library", list( package=package ) ) )
|
|
81 }
|
|
82 lib.load.quiet(getopt)
|
|
83
|
|
84 spec <- matrix( c( "dataset", "d", 1, "character",
|
|
85 "log.tranform", "l", 2, "character",
|
|
86 "filter.low.variant", "f", 2, "integer",
|
|
87 "output.fname", "o", 2, "character"
|
|
88 ),
|
|
89 ncol=4,
|
|
90 byrow=TRUE
|
|
91 )
|
|
92 opt <- getopt( spec=spec )
|
|
93
|
|
94 ##set some reasonable defaults for the options that are needed,
|
|
95 ##but were not specified.
|
|
96 if ( is.null(opt$output.fname ) ) { opt$output.fname <-file.path( getwd(), "formated.TCGA.RNASeq.data" ) }
|
|
97 if ( is.null(opt$log.transform ) ) {
|
|
98 opt$log.transform <- TRUE
|
|
99 } else {
|
|
100 opt$log.transform <- ( tolower( opt$log.transform ) %in% "yes" )
|
|
101 }
|
|
102
|
|
103 if ( is.null(opt$filter.low.variant ) ) { opt$log.transform <- 10 }
|
|
104
|
|
105
|
|
106 data <- as.matrix( read.delim( opt$dataset, row.names=1, check.names=F ) )
|
|
107 data <- fix.genes( data )
|
|
108 data <- fix.sample.ids( data )
|
|
109
|
|
110 if ( opt$filter.low.variant > 0 ) {
|
|
111 rpkm.meds <- apply( data, 1, median, na.rm=TRUE )
|
|
112 rpkm.meds <- rpkm.meds >= opt$filter.low.variant
|
|
113 data <- data[ rpkm.meds, ]
|
|
114 }
|
|
115
|
|
116 if ( opt$log.transform ) {
|
|
117 data <- log( data + 1 )
|
|
118 }
|
|
119
|
|
120 write.2.tab( data, opt$output.fname )
|