diff cluster.tools/fix.and.merge.TCGA.sample.IDs.R @ 2:b442996b66ae draft

Uploaded
author peter-waltman
date Wed, 27 Feb 2013 20:17:04 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/fix.and.merge.TCGA.sample.IDs.R	Wed Feb 27 20:17:04 2013 -0500
@@ -0,0 +1,119 @@
+#!/usr/bin/env Rscript
+argspec <- c("fix.and.merge.TCGA.samples.IDs.R takes a clustering from ConsensusClusterPlus and clinical survival data
+and generates a KM-plot, along with the log-rank p-values
+
+        Usage: 
+                fix.and.merge.TCGA.samples.IDs.R -d <data.file> 
+
+                \n\n")
+args <- commandArgs(TRUE)
+if ( length( args ) == 1 && args =="--help") { 
+  write(argspec, stderr())
+  q();
+}
+
+lib.load.quiet <- function( package ) {
+   package <- as.character(substitute(package))
+   suppressPackageStartupMessages( do.call( "library", list( package=package ) ) )
+}
+lib.load.quiet(getopt)
+
+spec <- matrix( c( "data.fname",      "d", 1, "character",
+                   "num.components",  "n", 2, "integer",
+                   "remove.normals",  "r", 0, "logical",
+                   "output.fname",    "o", 2, "character"
+                   ),
+                nc=4,
+                byrow=TRUE
+               )
+
+opt <- getopt( spec=spec )
+
+data <- as.matrix( read.delim( opt$data.fname, row.names=1, check.names=FALSE ) )
+if ( is.null( opt$num.components ) ) { opt$num.components <- 3 }
+if ( is.null( opt$remove.normals ) ) { opt$remove.normals <- FALSE }
+if ( is.null( opt$output.fname ) ) { opt$output.fname <- paste( "sample.IDs.updated", basename( opt$data.fname ), sep="." ) }
+
+if ( opt$num.components < 3 ) {
+  err.msg <- "Minimum number of barcode components that can be used is 3\n"
+  cat( err.msg, file=opt$output.fname )
+  stop( err.msg )
+}
+
+remove.periods.from.ids <- function( ids ) {
+  return( gsub( "\\.", "-", ids ) )
+}
+
+
+reformat.ids <- function( ids,
+                          num.components=3 ) {
+  return( sapply( strsplit( ids, "-" ), function(x) paste( x[1:num.components], collapse="-" ) ) )
+}
+
+
+merge.cols <- function( mat,
+                        samp.ids ) {
+
+  if ( ! any( duplicated( samp.ids ) ) ) {
+    colnames( mat ) <- samp.ids
+    return( mat )
+  }
+
+  dupes <- unique( samp.ids[ duplicated( samp.ids ) ] )
+  uniqs <- samp.ids[ ! samp.ids %in% dupes ]
+
+  uniq.mat <- mat[ , ( samp.ids %in% uniqs ), drop=FALSE ]
+  colnames( uniq.mat ) <- uniqs
+
+  for ( dup in dupes ) {
+    dup.mat <- apply( mat[, ( samp.ids %in% dup ), drop=FALSE],
+                      1,
+                      mean,
+                      na.rm=TRUE )
+    
+    uniq.mat <- cbind( uniq.mat, dup.mat )
+  }
+  colnames( uniq.mat ) <- c( uniqs, dupes )
+  return( uniq.mat )
+}
+
+
+cnames <- colnames( data )
+rnames <- rownames( data )
+
+transpose.back <- FALSE
+
+if ( all( grepl( "^TCGA", rnames ) ) ) {
+  data <- t( data )
+  transpose.back <- TRUE
+} else {
+  if ( ! all( grepl( "^TCGA", cnames ) ) ) {
+    err.msg <- "can't find any TCGA samples listed in this matrix.  If columns are samples, all columns must be a TCGA sample ID.  Same if rows are samples.\n"
+    cat( err.msg, file=opt$output.fname )
+    stop( err.msg )
+  }
+}
+
+cnames <- remove.periods.from.ids( colnames( data ) )
+nelts <- as.numeric( names( table( as.factor( sapply( strsplit( cnames, "-" ), function(x) length(x ) ) ) ) ) )
+if ( length( nelts ) > 1 ) {
+  err.msg <- "Error: Inconsistent TCGA sample barcodes used.  Have found ID with different numbers of components in the barcodes used\n" 
+    cat( err.msg, file=opt$output.fname )
+    stop( err.msg )
+}
+
+if ( opt$remove.normals ) {
+  if ( nelts > 3 ) {
+    normals <- grepl( "^TCGA-..-....-1", cnames )
+    data <- data[ , (! normals ), drop=FALSE ]
+  }
+}
+
+if ( opt$num.components < nelts ) {
+  cnames <- reformat.ids( ids=cnames, num.components=opt$num.components )
+  data <- merge.cols( data, cnames )
+}
+
+if ( transpose.back ) data <- t( data )
+
+write.table( data, opt$output.fname, sep="\t", quote=FALSE, col.names=NA )