Mercurial > repos > peter-waltman > ucsc_cluster_tools2
diff cluster.tools/format.raw.TCGA.RNASeq.data.R @ 1:dddfeedb85af draft
Uploaded
author | peter-waltman |
---|---|
date | Fri, 01 Mar 2013 10:16:53 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cluster.tools/format.raw.TCGA.RNASeq.data.R Fri Mar 01 10:16:53 2013 -0500 @@ -0,0 +1,120 @@ +#!/usr/bin/env Rscript +## +## formats raw clinical data from TCGA to contain a single status & time colums +## +## Input (required): +## - clinical data +## Input (optional): +## - status & time columns: (NOT USED IN THIS SCRIPT - see comment below) +## ideally, a better design would allow a user to specify 1 or more columns +## to check for the status & time columns - however, due to the necessities +## required to pre-process the TCGA clinical data, the script would not be +## generalizeable - and for this reason, the TCGA columns are hard-coded. +## +## Output: a re-formatted clinical file containing 3 columns: sample-ID, status & time +## +## Date: August 21, 2012 +## Author: Peter Waltman +## + +##usage, options and doc goes here +argspec <- c("format.raw.TCGA.clinical.data.R takes a clustering from ConsensusClusterPlus and clinical survival data +and generates a KM-plot, along with the log-rank p-values + + Usage: + format.raw.TCGA.clinical.data.R -c <clinical.file> + Options: + -o <output file> (tab-delimited (3 col: sample_id <tab> status <tab> time)) + ") +args <- commandArgs(TRUE) +if ( length( args ) == 1 && args =="--help") { + write(argspec, stderr()) + q(); +} + +## some helper fn's +write.2.tab <- function( mat, + fname ) { + mat <- rbind( colnames( mat ), mat ) + mat <- cbind( c( "ID", rownames( mat )[-1] ), + mat ) + write.table( mat, fname, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE ) +} + +fix.genes <- function( mat ) { + ## filter out the unknowns + mat <- mat[ ( ! grepl( "^\\?", rownames( mat ) ) ), ] + + genes <- rownames( mat ) + ## select the HUGO name portion of the "gene-names" that are in the TCGA matrices + genes <- sapply( strsplit( genes, "\\|" ), function(x) x[1] ) + + if ( any( duplicated( genes ) ) ) { + dupes <- unique( genes[ duplicated( genes ) ] ) + no.duped.mat <- which( ! genes %in% dupes ) + no.duped.mat <- mat[ no.duped.mat, , drop=FALSE ] + + ## now merge in the duplicates + for ( dup in dupes ) { + duped.mat <- mat[ grepl( paste( "^", dup, sep="" ), rownames( mat ) ), ] + duped.mat <- apply( duped.mat, 2, mean, na.rm=TRUE ) + no.duped.mat <- rbind( no.duped.mat, duped.mat ) + ## weird bit of follow up to make sure the new row/gene is named correctly + rownames( no.duped.mat ) <- c( rownames( no.duped.mat )[ -nrow(no.duped.mat) ], + paste( dup, "|0000", sep="" ) ) + } + mat <- no.duped.mat; rm( no.duped.mat ); gc() + } + + rownames( mat ) <- sapply( strsplit( rownames( mat ), "\\|" ), function(x) x[1] ) + return( mat ) +} + +fix.sample.ids <- function( mat ) { + colnames( mat ) <- gsub( "\\.", "-", colnames( mat ) ) + return( mat ) +} + +lib.load.quiet <- function( package ) { + package <- as.character(substitute(package)) + suppressPackageStartupMessages( do.call( "library", list( package=package ) ) ) +} +lib.load.quiet(getopt) + +spec <- matrix( c( "dataset", "d", 1, "character", + "log.tranform", "l", 2, "character", + "filter.low.variant", "f", 2, "integer", + "output.fname", "o", 2, "character" + ), + ncol=4, + byrow=TRUE + ) +opt <- getopt( spec=spec ) + +##set some reasonable defaults for the options that are needed, +##but were not specified. +if ( is.null(opt$output.fname ) ) { opt$output.fname <-file.path( getwd(), "formated.TCGA.RNASeq.data" ) } +if ( is.null(opt$log.transform ) ) { + opt$log.transform <- TRUE +} else { + opt$log.transform <- ( tolower( opt$log.transform ) %in% "yes" ) +} + +if ( is.null(opt$filter.low.variant ) ) { opt$log.transform <- 10 } + + +data <- as.matrix( read.delim( opt$dataset, row.names=1, check.names=F ) ) +data <- fix.genes( data ) +data <- fix.sample.ids( data ) + +if ( opt$filter.low.variant > 0 ) { + rpkm.meds <- apply( data, 1, median, na.rm=TRUE ) + rpkm.meds <- rpkm.meds >= opt$filter.low.variant + data <- data[ rpkm.meds, ] +} + +if ( opt$log.transform ) { + data <- log( data + 1 ) +} + +write.2.tab( data, opt$output.fname )