Mercurial > repos > peter-waltman > ucsc_cluster_tools2
view cluster.tools/fix.and.merge.TCGA.sample.IDs.R @ 3:563832f48c08 draft
Uploaded
author | peter-waltman |
---|---|
date | Fri, 01 Mar 2013 19:51:25 -0500 |
parents | dddfeedb85af |
children |
line wrap: on
line source
#!/usr/bin/env Rscript argspec <- c("fix.and.merge.TCGA.samples.IDs.R takes a clustering from ConsensusClusterPlus and clinical survival data and generates a KM-plot, along with the log-rank p-values Usage: fix.and.merge.TCGA.samples.IDs.R -d <data.file> \n\n") args <- commandArgs(TRUE) if ( length( args ) == 1 && args =="--help") { write(argspec, stderr()) q(); } ## some helper fn's write.2.tab <- function( mat, fname ) { mat <- rbind( colnames( mat ), mat ) mat <- cbind( c( "ID", rownames( mat )[-1] ), mat ) write.table( mat, fname, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE ) } lib.load.quiet <- function( package ) { package <- as.character(substitute(package)) suppressPackageStartupMessages( do.call( "library", list( package=package ) ) ) } lib.load.quiet(getopt) spec <- matrix( c( "data.fname", "d", 1, "character", "num.components", "n", 2, "integer", "remove.normals", "r", 0, "logical", "output.fname", "o", 2, "character" ), nc=4, byrow=TRUE ) opt <- getopt( spec=spec ) data <- as.matrix( read.delim( opt$data.fname, row.names=1, check.names=FALSE ) ) if ( is.null( opt$num.components ) ) { opt$num.components <- 3 } if ( is.null( opt$remove.normals ) ) { opt$remove.normals <- FALSE } if ( is.null( opt$output.fname ) ) { opt$output.fname <- paste( "sample.IDs.updated", basename( opt$data.fname ), sep="." ) } if ( opt$num.components < 3 ) { err.msg <- "Minimum number of barcode components that can be used is 3\n" cat( err.msg, file=opt$output.fname ) stop( err.msg ) } remove.periods.from.ids <- function( ids ) { return( gsub( "\\.", "-", ids ) ) } reformat.ids <- function( ids, num.components=3 ) { return( sapply( strsplit( ids, "-" ), function(x) paste( x[1:num.components], collapse="-" ) ) ) } merge.cols <- function( mat, samp.ids ) { if ( ! any( duplicated( samp.ids ) ) ) { colnames( mat ) <- samp.ids return( mat ) } dupes <- unique( samp.ids[ duplicated( samp.ids ) ] ) uniqs <- samp.ids[ ! samp.ids %in% dupes ] uniq.mat <- mat[ , ( samp.ids %in% uniqs ), drop=FALSE ] colnames( uniq.mat ) <- uniqs for ( dup in dupes ) { dup.mat <- apply( mat[, ( samp.ids %in% dup ), drop=FALSE], 1, mean, na.rm=TRUE ) uniq.mat <- cbind( uniq.mat, dup.mat ) } colnames( uniq.mat ) <- c( uniqs, dupes ) return( uniq.mat ) } cnames <- colnames( data ) rnames <- rownames( data ) transpose.back <- FALSE if ( all( grepl( "^TCGA", rnames ) ) ) { data <- t( data ) transpose.back <- TRUE } else { if ( ! all( grepl( "^TCGA", cnames ) ) ) { err.msg <- "can't find any TCGA samples listed in this matrix. If columns are samples, all columns must be a TCGA sample ID. Same if rows are samples.\n" cat( err.msg, file=opt$output.fname ) stop( err.msg ) } } cnames <- remove.periods.from.ids( colnames( data ) ) nelts <- as.numeric( names( table( as.factor( sapply( strsplit( cnames, "-" ), function(x) length(x ) ) ) ) ) ) if ( length( nelts ) > 1 ) { err.msg <- "Error: Inconsistent TCGA sample barcodes used. Have found ID with different numbers of components in the barcodes used\n" cat( err.msg, file=opt$output.fname ) stop( err.msg ) } if ( opt$remove.normals ) { if ( nelts > 3 ) { normals <- grepl( "^TCGA-..-....-1", cnames ) data <- data[ , (! normals ), drop=FALSE ] cnames <- cnames[ ! normals ] } } if ( opt$num.components < nelts ) { cnames <- reformat.ids( ids=cnames, num.components=opt$num.components ) data <- merge.cols( data, cnames ) } if ( transpose.back ) data <- t( data ) write.2.tab( data, opt$output.fname )