diff cluster.tools/format.raw.TCGA.RNASeq.data.R @ 1:dddfeedb85af draft

Uploaded
author peter-waltman
date Fri, 01 Mar 2013 10:16:53 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/format.raw.TCGA.RNASeq.data.R	Fri Mar 01 10:16:53 2013 -0500
@@ -0,0 +1,120 @@
+#!/usr/bin/env Rscript
+## 
+## formats raw clinical data from TCGA to contain a single status & time colums
+##
+## Input (required):
+##    - clinical data
+## Input (optional):
+##    - status & time columns: (NOT USED IN THIS SCRIPT - see comment below)
+##         ideally, a better design would allow a user to specify 1 or more columns
+##         to check for the status & time columns - however, due to the necessities
+##         required to pre-process the TCGA clinical data, the script would not be
+##         generalizeable - and for this reason, the TCGA columns are hard-coded.
+##
+## Output: a re-formatted clinical file containing 3 columns: sample-ID, status & time
+##
+## Date: August 21, 2012
+## Author: Peter Waltman
+##
+
+##usage, options and doc goes here
+argspec <- c("format.raw.TCGA.clinical.data.R takes a clustering from ConsensusClusterPlus and clinical survival data
+and generates a KM-plot, along with the log-rank p-values
+
+        Usage: 
+                format.raw.TCGA.clinical.data.R -c <clinical.file> 
+        Options:
+                -o <output file> (tab-delimited (3 col: sample_id <tab> status <tab> time))
+              ")
+args <- commandArgs(TRUE)
+if ( length( args ) == 1 && args =="--help") { 
+  write(argspec, stderr())
+  q();
+}
+
+## some helper fn's
+write.2.tab <- function( mat,
+                         fname ) {
+  mat <- rbind( colnames( mat ), mat )
+  mat <- cbind( c( "ID", rownames( mat )[-1] ),
+                      mat )
+  write.table( mat, fname, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE )
+}
+
+fix.genes <- function( mat ) {
+  ## filter out the unknowns
+  mat <- mat[ ( !  grepl( "^\\?", rownames( mat ) ) ), ]  
+
+  genes <-  rownames( mat )
+  ## select the HUGO name portion of the "gene-names" that are in the TCGA matrices
+  genes <- sapply( strsplit( genes, "\\|" ), function(x) x[1] )
+
+  if ( any( duplicated( genes ) ) ) {
+    dupes <- unique( genes[ duplicated( genes ) ] )
+    no.duped.mat <- which( ! genes %in% dupes )
+    no.duped.mat <- mat[ no.duped.mat, , drop=FALSE ]
+
+    ## now merge in the duplicates
+    for ( dup in dupes ) {
+      duped.mat <- mat[ grepl( paste( "^", dup, sep="" ), rownames( mat ) ), ]
+      duped.mat <- apply( duped.mat, 2, mean, na.rm=TRUE )
+      no.duped.mat <- rbind( no.duped.mat, duped.mat )
+      ## weird bit of follow up to make sure the new row/gene is named correctly
+      rownames( no.duped.mat ) <- c( rownames( no.duped.mat )[ -nrow(no.duped.mat) ],
+                                     paste( dup, "|0000", sep="" ) )
+    }
+    mat <- no.duped.mat; rm( no.duped.mat ); gc()
+  }
+
+  rownames( mat ) <- sapply( strsplit( rownames( mat ), "\\|" ), function(x) x[1] )
+  return( mat )
+}
+
+fix.sample.ids <- function( mat ) {
+  colnames( mat ) <- gsub( "\\.", "-", colnames( mat ) )
+  return( mat )
+}
+
+lib.load.quiet <- function( package ) {
+   package <- as.character(substitute(package))
+   suppressPackageStartupMessages( do.call( "library", list( package=package ) ) )
+}
+lib.load.quiet(getopt)
+
+spec <- matrix( c( "dataset",             "d", 1, "character",
+                   "log.tranform",        "l", 2, "character",
+                   "filter.low.variant",  "f", 2, "integer",
+                   "output.fname",        "o", 2, "character"
+                  ),
+                ncol=4,
+                byrow=TRUE
+               )
+opt <- getopt( spec=spec )
+
+##set some reasonable defaults for the options that are needed,
+##but were not specified.
+if ( is.null(opt$output.fname ) ) { opt$output.fname <-file.path( getwd(), "formated.TCGA.RNASeq.data" ) }
+if ( is.null(opt$log.transform ) ) {
+  opt$log.transform <- TRUE
+} else {
+  opt$log.transform <- ( tolower( opt$log.transform ) %in% "yes" )
+}
+
+if ( is.null(opt$filter.low.variant ) ) { opt$log.transform <- 10 } 
+
+
+data <- as.matrix( read.delim( opt$dataset, row.names=1, check.names=F ) )
+data <- fix.genes( data )
+data <- fix.sample.ids( data )
+
+if ( opt$filter.low.variant > 0 ) {
+  rpkm.meds <- apply( data, 1, median, na.rm=TRUE )
+  rpkm.meds <- rpkm.meds >= opt$filter.low.variant
+  data <- data[ rpkm.meds, ]
+}
+
+if ( opt$log.transform ) {
+  data <- log( data + 1 )
+}
+
+write.2.tab( data, opt$output.fname )