Mercurial > repos > peter-waltman > ucsc_cluster_tools2
diff cluster.tools/fix.and.merge.TCGA.sample.IDs.R @ 0:0decf3fd54bc draft
Uploaded
author | peter-waltman |
---|---|
date | Thu, 28 Feb 2013 01:45:39 -0500 |
parents | |
children | dddfeedb85af |
line wrap: on
line diff
#!/usr/bin/env Rscript
# cluster.tools/fix.and.merge.TCGA.sample.IDs.R
# (file added by this changeset: --- /dev/null, +++ b/cluster.tools/fix.and.merge.TCGA.sample.IDs.R, @@ -0,0 +1,119 @@)
#
# Cleans up the TCGA sample barcodes of a tab-delimited data matrix:
#   1. periods in the barcodes are replaced with dashes,
#   2. optionally, samples matching the "normal" barcode pattern are dropped,
#   3. barcodes are truncated to the first N dash-separated components, and
#      columns that end up sharing an ID are averaged into one column.
# Samples may be on rows or on columns; the orientation of the input is kept.

argspec <- c("fix.and.merge.TCGA.sample.IDs.R takes a tab-delimited data matrix whose rows or columns
are TCGA sample barcodes, replaces periods in the barcodes with dashes, optionally removes
normal samples, truncates the barcodes to a given number of components, and averages
samples that end up with the same ID

        Usage:
                fix.and.merge.TCGA.sample.IDs.R -d <data.file> [-n <num.components>] [-r] [-o <output.file>]
        Options:
                -d, --data.fname       tab-delimited matrix, first column holds the row names (required)
                -n, --num.components   number of barcode components to keep (default: 3, minimum: 3)
                -r, --remove.normals   drop samples whose barcode matches TCGA-xx-xxxx-1x (default: keep them)
                -o, --output.fname     output file (default: sample.IDs.updated.<data.file>)

        \n\n")
args <- commandArgs(TRUE)
if ( "--help" %in% args ) {
  write( argspec, stderr() )
  quit( save="no", status=0 )
}

# Attach a package (given as a bare name) without its startup chatter.
lib.load.quiet <- function( package ) {
  package <- as.character( substitute( package ) )
  suppressPackageStartupMessages( do.call( "library", list( package=package ) ) )
}
lib.load.quiet( getopt )

# getopt spec columns: long flag, short flag, argument mask
# (0 = no argument, 1 = required, 2 = optional), data type.
spec <- matrix( c( "data.fname",     "d", 1, "character",
                   "num.components", "n", 2, "integer",
                   "remove.normals", "r", 0, "logical",
                   "output.fname",   "o", 2, "character"
                   ),
                ncol=4,
                byrow=TRUE
               )

opt <- getopt( spec=spec )

# Without a data file there is nothing to read and no default output name.
if ( is.null( opt$data.fname ) ) {
  write( argspec, stderr() )
  stop( "A data file must be supplied with -d/--data.fname\n", call.=FALSE )
}

if ( is.null( opt$num.components ) ) { opt$num.components <- 3 }
if ( is.null( opt$remove.normals ) ) { opt$remove.normals <- FALSE }
if ( is.null( opt$output.fname ) ) { opt$output.fname <- paste( "sample.IDs.updated", basename( opt$data.fname ), sep="." ) }


# Abort the script with `err.msg`.  The message is also written to `out.fname`
# so that the output file itself records why no result was produced.
fail <- function( err.msg, out.fname ) {
  cat( err.msg, file=out.fname )
  stop( err.msg, call.=FALSE )
}


# Replace every period in `ids` with a dash.
remove.periods.from.ids <- function( ids ) {
  gsub( "\\.", "-", ids )
}


# Truncate each dash-separated ID in `ids` to its first `num.components`
# components.  Returns a character vector the same length as `ids`.
reformat.ids <- function( ids,
                          num.components=3 ) {
  vapply( strsplit( ids, "-" ),
          function( x ) paste( x[ seq_len( num.components ) ], collapse="-" ),
          character( 1 ) )
}


# Rename the columns of `mat` to `samp.ids`, collapsing columns that share an
# ID into a single column holding their row-wise mean (NAs ignored).
#
# mat       matrix with one column per sample
# samp.ids  character vector, one ID per column of `mat`
#
# Returns a matrix whose columns are the non-duplicated IDs (in their original
# order) followed by one averaged column per duplicated ID.
merge.cols <- function( mat,
                        samp.ids ) {

  if ( length( samp.ids ) != ncol( mat ) ) {
    stop( "merge.cols: need exactly one sample ID per column of the matrix", call.=FALSE )
  }

  if ( ! any( duplicated( samp.ids ) ) ) {
    colnames( mat ) <- samp.ids
    return( mat )
  }

  dupes <- unique( samp.ids[ duplicated( samp.ids ) ] )
  is.uniq <- ! samp.ids %in% dupes

  uniq.mat <- mat[ , is.uniq, drop=FALSE ]

  # Preallocate the averaged columns rather than growing the matrix per ID.
  dup.mat <- matrix( NA_real_,
                     nrow=nrow( mat ),
                     ncol=length( dupes ),
                     dimnames=list( rownames( mat ), dupes ) )
  for ( i in seq_along( dupes ) ) {
    dup.mat[ , i ] <- apply( mat[ , samp.ids %in% dupes[ i ], drop=FALSE ],
                             1,
                             mean,
                             na.rm=TRUE )
  }

  merged <- cbind( uniq.mat, dup.mat )
  colnames( merged ) <- c( samp.ids[ is.uniq ], dupes )
  merged
}


if ( is.na( opt$num.components ) || opt$num.components < 3 ) {
  fail( "Minimum number of barcode components that can be used is 3\n", opt$output.fname )
}

data <- as.matrix( read.delim( opt$data.fname, row.names=1, check.names=FALSE ) )

# Work with samples on columns; remember whether the input had them on rows so
# the original orientation can be restored before writing.
transpose.back <- FALSE

if ( all( grepl( "^TCGA", rownames( data ) ) ) ) {
  data <- t( data )
  transpose.back <- TRUE
} else if ( ! all( grepl( "^TCGA", colnames( data ) ) ) ) {
  fail( "can't find any TCGA samples listed in this matrix.  If columns are samples, all columns must be a TCGA sample ID.  Same if rows are samples.\n",
        opt$output.fname )
}

# Store the cleaned IDs on the matrix right away so they are written out even
# when no truncation/merging is needed further down.
cnames <- remove.periods.from.ids( colnames( data ) )
colnames( data ) <- cnames

# Distinct numbers of dash-separated components found across all barcodes.
nelts <- unique( vapply( strsplit( cnames, "-" ), length, integer( 1 ) ) )
if ( length( nelts ) == 0 ) {
  fail( "Error: no samples found in this matrix\n", opt$output.fname )
}
if ( length( nelts ) > 1 ) {
  fail( "Error: Inconsistent TCGA sample barcodes used.  Have found ID with different numbers of components in the barcodes used\n",
        opt$output.fname )
}

# Normals can only be recognised when the 4th barcode component is present
# (4th component starting with "1"); with shorter barcodes nothing is removed.
if ( opt$remove.normals && nelts > 3 ) {
  normals <- grepl( "^TCGA-..-....-1", cnames )
  data <- data[ , ( ! normals ), drop=FALSE ]
  # Keep the ID vector in step with the filtered columns.
  cnames <- cnames[ ! normals ]
}

if ( opt$num.components < nelts ) {
  cnames <- reformat.ids( ids=cnames, num.components=opt$num.components )
  data <- merge.cols( data, cnames )
}

if ( transpose.back ) data <- t( data )

write.table( data, opt$output.fname, sep="\t", quote=FALSE, col.names=NA )