Mercurial > repos > peter-waltman > ucsc_cluster_tools2
comparison cluster.tools/fix.and.merge.TCGA.sample.IDs.R @ 0:0decf3fd54bc draft
Uploaded
| author | peter-waltman |
|---|---|
| date | Thu, 28 Feb 2013 01:45:39 -0500 |
| parents | |
| children | dddfeedb85af |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:0decf3fd54bc |
|---|---|
| 1 #!/usr/bin/env Rscript | |
| 2 argspec <- c("fix.and.merge.TCGA.samples.IDs.R takes a clustering from ConsensusClusterPlus and clinical survival data | |
| 3 and generates a KM-plot, along with the log-rank p-values | |
| 4 | |
| 5 Usage: | |
| 6 fix.and.merge.TCGA.samples.IDs.R -d <data.file> | |
| 7 | |
| 8 \n\n") | |
| 9 args <- commandArgs(TRUE) | |
| 10 if ( length( args ) == 1 && args =="--help") { | |
| 11 write(argspec, stderr()) | |
| 12 q(); | |
| 13 } | |
| 14 | |
| 15 lib.load.quiet <- function( package ) { | |
| 16 package <- as.character(substitute(package)) | |
| 17 suppressPackageStartupMessages( do.call( "library", list( package=package ) ) ) | |
| 18 } | |
| 19 lib.load.quiet(getopt) | |
| 20 | |
| 21 spec <- matrix( c( "data.fname", "d", 1, "character", | |
| 22 "num.components", "n", 2, "integer", | |
| 23 "remove.normals", "r", 0, "logical", | |
| 24 "output.fname", "o", 2, "character" | |
| 25 ), | |
| 26 nc=4, | |
| 27 byrow=TRUE | |
| 28 ) | |
| 29 | |
| 30 opt <- getopt( spec=spec ) | |
| 31 | |
| 32 data <- as.matrix( read.delim( opt$data.fname, row.names=1, check.names=FALSE ) ) | |
| 33 if ( is.null( opt$num.components ) ) { opt$num.components <- 3 } | |
| 34 if ( is.null( opt$remove.normals ) ) { opt$remove.normals <- FALSE } | |
| 35 if ( is.null( opt$output.fname ) ) { opt$output.fname <- paste( "sample.IDs.updated", basename( opt$data.fname ), sep="." ) } | |
| 36 | |
| 37 if ( opt$num.components < 3 ) { | |
| 38 err.msg <- "Minimum number of barcode components that can be used is 3\n" | |
| 39 cat( err.msg, file=opt$output.fname ) | |
| 40 stop( err.msg ) | |
| 41 } | |
| 42 | |
| 43 remove.periods.from.ids <- function( ids ) { | |
| 44 return( gsub( "\\.", "-", ids ) ) | |
| 45 } | |
| 46 | |
| 47 | |
| 48 reformat.ids <- function( ids, | |
| 49 num.components=3 ) { | |
| 50 return( sapply( strsplit( ids, "-" ), function(x) paste( x[1:num.components], collapse="-" ) ) ) | |
| 51 } | |
| 52 | |
| 53 | |
| 54 merge.cols <- function( mat, | |
| 55 samp.ids ) { | |
| 56 | |
| 57 if ( ! any( duplicated( samp.ids ) ) ) { | |
| 58 colnames( mat ) <- samp.ids | |
| 59 return( mat ) | |
| 60 } | |
| 61 | |
| 62 dupes <- unique( samp.ids[ duplicated( samp.ids ) ] ) | |
| 63 uniqs <- samp.ids[ ! samp.ids %in% dupes ] | |
| 64 | |
| 65 uniq.mat <- mat[ , ( samp.ids %in% uniqs ), drop=FALSE ] | |
| 66 colnames( uniq.mat ) <- uniqs | |
| 67 | |
| 68 for ( dup in dupes ) { | |
| 69 dup.mat <- apply( mat[, ( samp.ids %in% dup ), drop=FALSE], | |
| 70 1, | |
| 71 mean, | |
| 72 na.rm=TRUE ) | |
| 73 | |
| 74 uniq.mat <- cbind( uniq.mat, dup.mat ) | |
| 75 } | |
| 76 colnames( uniq.mat ) <- c( uniqs, dupes ) | |
| 77 return( uniq.mat ) | |
| 78 } | |
| 79 | |
| 80 | |
| 81 cnames <- colnames( data ) | |
| 82 rnames <- rownames( data ) | |
| 83 | |
| 84 transpose.back <- FALSE | |
| 85 | |
| 86 if ( all( grepl( "^TCGA", rnames ) ) ) { | |
| 87 data <- t( data ) | |
| 88 transpose.back <- TRUE | |
| 89 } else { | |
| 90 if ( ! all( grepl( "^TCGA", cnames ) ) ) { | |
| 91 err.msg <- "can't find any TCGA samples listed in this matrix. If columns are samples, all columns must be a TCGA sample ID. Same if rows are samples.\n" | |
| 92 cat( err.msg, file=opt$output.fname ) | |
| 93 stop( err.msg ) | |
| 94 } | |
| 95 } | |
| 96 | |
| 97 cnames <- remove.periods.from.ids( colnames( data ) ) | |
| 98 nelts <- as.numeric( names( table( as.factor( sapply( strsplit( cnames, "-" ), function(x) length(x ) ) ) ) ) ) | |
| 99 if ( length( nelts ) > 1 ) { | |
| 100 err.msg <- "Error: Inconsistent TCGA sample barcodes used. Have found ID with different numbers of components in the barcodes used\n" | |
| 101 cat( err.msg, file=opt$output.fname ) | |
| 102 stop( err.msg ) | |
| 103 } | |
| 104 | |
| 105 if ( opt$remove.normals ) { | |
| 106 if ( nelts > 3 ) { | |
| 107 normals <- grepl( "^TCGA-..-....-1", cnames ) | |
| 108 data <- data[ , (! normals ), drop=FALSE ] | |
| 109 } | |
| 110 } | |
| 111 | |
| 112 if ( opt$num.components < nelts ) { | |
| 113 cnames <- reformat.ids( ids=cnames, num.components=opt$num.components ) | |
| 114 data <- merge.cols( data, cnames ) | |
| 115 } | |
| 116 | |
| 117 if ( transpose.back ) data <- t( data ) | |
| 118 | |
| 119 write.table( data, opt$output.fname, sep="\t", quote=FALSE, col.names=NA ) |
