Mercurial > repos > peter-waltman > ucsc_cluster_tools2
view cluster.tools/format.raw.TCGA.RNASeq.data.R @ 9:a3c03541fe6f draft default tip
Uploaded
author | peter-waltman |
---|---|
date | Mon, 11 Mar 2013 17:30:48 -0400 |
parents | dddfeedb85af |
children |
line wrap: on
line source
#!/usr/bin/env Rscript ## ## formats raw clinical data from TCGA to contain a single status & time colums ## ## Input (required): ## - clinical data ## Input (optional): ## - status & time columns: (NOT USED IN THIS SCRIPT - see comment below) ## ideally, a better design would allow a user to specify 1 or more columns ## to check for the status & time columns - however, due to the necessities ## required to pre-process the TCGA clinical data, the script would not be ## generalizeable - and for this reason, the TCGA columns are hard-coded. ## ## Output: a re-formatted clinical file containing 3 columns: sample-ID, status & time ## ## Date: August 21, 2012 ## Author: Peter Waltman ## ##usage, options and doc goes here argspec <- c("format.raw.TCGA.clinical.data.R takes a clustering from ConsensusClusterPlus and clinical survival data and generates a KM-plot, along with the log-rank p-values Usage: format.raw.TCGA.clinical.data.R -c <clinical.file> Options: -o <output file> (tab-delimited (3 col: sample_id <tab> status <tab> time)) ") args <- commandArgs(TRUE) if ( length( args ) == 1 && args =="--help") { write(argspec, stderr()) q(); } ## some helper fn's write.2.tab <- function( mat, fname ) { mat <- rbind( colnames( mat ), mat ) mat <- cbind( c( "ID", rownames( mat )[-1] ), mat ) write.table( mat, fname, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE ) } fix.genes <- function( mat ) { ## filter out the unknowns mat <- mat[ ( ! grepl( "^\\?", rownames( mat ) ) ), ] genes <- rownames( mat ) ## select the HUGO name portion of the "gene-names" that are in the TCGA matrices genes <- sapply( strsplit( genes, "\\|" ), function(x) x[1] ) if ( any( duplicated( genes ) ) ) { dupes <- unique( genes[ duplicated( genes ) ] ) no.duped.mat <- which( ! genes %in% dupes ) no.duped.mat <- mat[ no.duped.mat, , drop=FALSE ] ## now merge in the duplicates for ( dup in dupes ) { duped.mat <- mat[ grepl( paste( "^", dup, sep="" ), rownames( mat ) ), ] duped.mat <- apply( duped.mat, 2, mean, na.rm=TRUE ) no.duped.mat <- rbind( no.duped.mat, duped.mat ) ## weird bit of follow up to make sure the new row/gene is named correctly rownames( no.duped.mat ) <- c( rownames( no.duped.mat )[ -nrow(no.duped.mat) ], paste( dup, "|0000", sep="" ) ) } mat <- no.duped.mat; rm( no.duped.mat ); gc() } rownames( mat ) <- sapply( strsplit( rownames( mat ), "\\|" ), function(x) x[1] ) return( mat ) } fix.sample.ids <- function( mat ) { colnames( mat ) <- gsub( "\\.", "-", colnames( mat ) ) return( mat ) } lib.load.quiet <- function( package ) { package <- as.character(substitute(package)) suppressPackageStartupMessages( do.call( "library", list( package=package ) ) ) } lib.load.quiet(getopt) spec <- matrix( c( "dataset", "d", 1, "character", "log.tranform", "l", 2, "character", "filter.low.variant", "f", 2, "integer", "output.fname", "o", 2, "character" ), ncol=4, byrow=TRUE ) opt <- getopt( spec=spec ) ##set some reasonable defaults for the options that are needed, ##but were not specified. if ( is.null(opt$output.fname ) ) { opt$output.fname <-file.path( getwd(), "formated.TCGA.RNASeq.data" ) } if ( is.null(opt$log.transform ) ) { opt$log.transform <- TRUE } else { opt$log.transform <- ( tolower( opt$log.transform ) %in% "yes" ) } if ( is.null(opt$filter.low.variant ) ) { opt$log.transform <- 10 } data <- as.matrix( read.delim( opt$dataset, row.names=1, check.names=F ) ) data <- fix.genes( data ) data <- fix.sample.ids( data ) if ( opt$filter.low.variant > 0 ) { rpkm.meds <- apply( data, 1, median, na.rm=TRUE ) rpkm.meds <- rpkm.meds >= opt$filter.low.variant data <- data[ rpkm.meds, ] } if ( opt$log.transform ) { data <- log( data + 1 ) } write.2.tab( data, opt$output.fname )