| 1 | 1 #!/usr/bin/env Rscript | 
|  | 2 ## | 
|  | 3 ## formats raw clinical data from TCGA to contain a single status & time colums | 
|  | 4 ## | 
|  | 5 ## Input (required): | 
|  | 6 ##    - clinical data | 
|  | 7 ## Input (optional): | 
|  | 8 ##    - status & time columns: (NOT USED IN THIS SCRIPT - see comment below) | 
|  | 9 ##         ideally, a better design would allow a user to specify 1 or more columns | 
|  | 10 ##         to check for the status & time columns - however, due to the necessities | 
|  | 11 ##         required to pre-process the TCGA clinical data, the script would not be | 
|  | 12 ##         generalizeable - and for this reason, the TCGA columns are hard-coded. | 
|  | 13 ## | 
|  | 14 ## Output: a re-formatted clinical file containing 3 columns: sample-ID, status & time | 
|  | 15 ## | 
|  | 16 ## Date: August 21, 2012 | 
|  | 17 ## Author: Peter Waltman | 
|  | 18 ## | 
|  | 19 | 
|  | 20 ##usage, options and doc goes here | 
|  | 21 argspec <- c("format.raw.TCGA.clinical.data.R takes a clustering from ConsensusClusterPlus and clinical survival data | 
|  | 22 and generates a KM-plot, along with the log-rank p-values | 
|  | 23 | 
|  | 24         Usage: | 
|  | 25                 format.raw.TCGA.clinical.data.R -c <clinical.file> | 
|  | 26         Options: | 
|  | 27                 -o <output file> (tab-delimited (3 col: sample_id <tab> status <tab> time)) | 
|  | 28               ") | 
|  | 29 args <- commandArgs(TRUE) | 
|  | 30 if ( length( args ) == 1 && args =="--help") { | 
|  | 31   write(argspec, stderr()) | 
|  | 32   q(); | 
|  | 33 } | 
|  | 34 | 
|  | 35 ## some helper fn's | 
|  | 36 write.2.tab <- function( mat, | 
|  | 37                          fname ) { | 
|  | 38   mat <- rbind( colnames( mat ), mat ) | 
|  | 39   mat <- cbind( c( "ID", rownames( mat )[-1] ), | 
|  | 40                       mat ) | 
|  | 41   write.table( mat, fname, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE ) | 
|  | 42 } | 
|  | 43 | 
|  | 44 fix.genes <- function( mat ) { | 
|  | 45   ## filter out the unknowns | 
|  | 46   mat <- mat[ ( !  grepl( "^\\?", rownames( mat ) ) ), ] | 
|  | 47 | 
|  | 48   genes <-  rownames( mat ) | 
|  | 49   ## select the HUGO name portion of the "gene-names" that are in the TCGA matrices | 
|  | 50   genes <- sapply( strsplit( genes, "\\|" ), function(x) x[1] ) | 
|  | 51 | 
|  | 52   if ( any( duplicated( genes ) ) ) { | 
|  | 53     dupes <- unique( genes[ duplicated( genes ) ] ) | 
|  | 54     no.duped.mat <- which( ! genes %in% dupes ) | 
|  | 55     no.duped.mat <- mat[ no.duped.mat, , drop=FALSE ] | 
|  | 56 | 
|  | 57     ## now merge in the duplicates | 
|  | 58     for ( dup in dupes ) { | 
|  | 59       duped.mat <- mat[ grepl( paste( "^", dup, sep="" ), rownames( mat ) ), ] | 
|  | 60       duped.mat <- apply( duped.mat, 2, mean, na.rm=TRUE ) | 
|  | 61       no.duped.mat <- rbind( no.duped.mat, duped.mat ) | 
|  | 62       ## weird bit of follow up to make sure the new row/gene is named correctly | 
|  | 63       rownames( no.duped.mat ) <- c( rownames( no.duped.mat )[ -nrow(no.duped.mat) ], | 
|  | 64                                      paste( dup, "|0000", sep="" ) ) | 
|  | 65     } | 
|  | 66     mat <- no.duped.mat; rm( no.duped.mat ); gc() | 
|  | 67   } | 
|  | 68 | 
|  | 69   rownames( mat ) <- sapply( strsplit( rownames( mat ), "\\|" ), function(x) x[1] ) | 
|  | 70   return( mat ) | 
|  | 71 } | 
|  | 72 | 
|  | 73 fix.sample.ids <- function( mat ) { | 
|  | 74   colnames( mat ) <- gsub( "\\.", "-", colnames( mat ) ) | 
|  | 75   return( mat ) | 
|  | 76 } | 
|  | 77 | 
|  | 78 lib.load.quiet <- function( package ) { | 
|  | 79    package <- as.character(substitute(package)) | 
|  | 80    suppressPackageStartupMessages( do.call( "library", list( package=package ) ) ) | 
|  | 81 } | 
|  | 82 lib.load.quiet(getopt) | 
|  | 83 | 
|  | 84 spec <- matrix( c( "dataset",             "d", 1, "character", | 
|  | 85                    "log.tranform",        "l", 2, "character", | 
|  | 86                    "filter.low.variant",  "f", 2, "integer", | 
|  | 87                    "output.fname",        "o", 2, "character" | 
|  | 88                   ), | 
|  | 89                 ncol=4, | 
|  | 90                 byrow=TRUE | 
|  | 91                ) | 
|  | 92 opt <- getopt( spec=spec ) | 
|  | 93 | 
|  | 94 ##set some reasonable defaults for the options that are needed, | 
|  | 95 ##but were not specified. | 
|  | 96 if ( is.null(opt$output.fname ) ) { opt$output.fname <-file.path( getwd(), "formated.TCGA.RNASeq.data" ) } | 
|  | 97 if ( is.null(opt$log.transform ) ) { | 
|  | 98   opt$log.transform <- TRUE | 
|  | 99 } else { | 
|  | 100   opt$log.transform <- ( tolower( opt$log.transform ) %in% "yes" ) | 
|  | 101 } | 
|  | 102 | 
|  | 103 if ( is.null(opt$filter.low.variant ) ) { opt$log.transform <- 10 } | 
|  | 104 | 
|  | 105 | 
|  | 106 data <- as.matrix( read.delim( opt$dataset, row.names=1, check.names=F ) ) | 
|  | 107 data <- fix.genes( data ) | 
|  | 108 data <- fix.sample.ids( data ) | 
|  | 109 | 
|  | 110 if ( opt$filter.low.variant > 0 ) { | 
|  | 111   rpkm.meds <- apply( data, 1, median, na.rm=TRUE ) | 
|  | 112   rpkm.meds <- rpkm.meds >= opt$filter.low.variant | 
|  | 113   data <- data[ rpkm.meds, ] | 
|  | 114 } | 
|  | 115 | 
|  | 116 if ( opt$log.transform ) { | 
|  | 117   data <- log( data + 1 ) | 
|  | 118 } | 
|  | 119 | 
|  | 120 write.2.tab( data, opt$output.fname ) |