comparison cluster.tools/fix.and.merge.TCGA.sample.IDs.R @ 2:b442996b66ae draft

Uploaded
author peter-waltman
date Wed, 27 Feb 2013 20:17:04 -0500
parents
children
comparison
equal deleted inserted replaced
1:e25d2bece0a2 2:b442996b66ae
1 #!/usr/bin/env Rscript
2 argspec <- c("fix.and.merge.TCGA.samples.IDs.R takes a clustering from ConsensusClusterPlus and clinical survival data
3 and generates a KM-plot, along with the log-rank p-values
4
5 Usage:
6 fix.and.merge.TCGA.samples.IDs.R -d <data.file>
7
8 \n\n")
9 args <- commandArgs(TRUE)
10 if ( length( args ) == 1 && args =="--help") {
11 write(argspec, stderr())
12 q();
13 }
14
15 lib.load.quiet <- function( package ) {
16 package <- as.character(substitute(package))
17 suppressPackageStartupMessages( do.call( "library", list( package=package ) ) )
18 }
19 lib.load.quiet(getopt)
20
21 spec <- matrix( c( "data.fname", "d", 1, "character",
22 "num.components", "n", 2, "integer",
23 "remove.normals", "r", 0, "logical",
24 "output.fname", "o", 2, "character"
25 ),
26 nc=4,
27 byrow=TRUE
28 )
29
30 opt <- getopt( spec=spec )
31
32 data <- as.matrix( read.delim( opt$data.fname, row.names=1, check.names=FALSE ) )
33 if ( is.null( opt$num.components ) ) { opt$num.components <- 3 }
34 if ( is.null( opt$remove.normals ) ) { opt$remove.normals <- FALSE }
35 if ( is.null( opt$output.fname ) ) { opt$output.fname <- paste( "sample.IDs.updated", basename( opt$data.fname ), sep="." ) }
36
37 if ( opt$num.components < 3 ) {
38 err.msg <- "Minimum number of barcode components that can be used is 3\n"
39 cat( err.msg, file=opt$output.fname )
40 stop( err.msg )
41 }
42
43 remove.periods.from.ids <- function( ids ) {
44 return( gsub( "\\.", "-", ids ) )
45 }
46
47
48 reformat.ids <- function( ids,
49 num.components=3 ) {
50 return( sapply( strsplit( ids, "-" ), function(x) paste( x[1:num.components], collapse="-" ) ) )
51 }
52
53
54 merge.cols <- function( mat,
55 samp.ids ) {
56
57 if ( ! any( duplicated( samp.ids ) ) ) {
58 colnames( mat ) <- samp.ids
59 return( mat )
60 }
61
62 dupes <- unique( samp.ids[ duplicated( samp.ids ) ] )
63 uniqs <- samp.ids[ ! samp.ids %in% dupes ]
64
65 uniq.mat <- mat[ , ( samp.ids %in% uniqs ), drop=FALSE ]
66 colnames( uniq.mat ) <- uniqs
67
68 for ( dup in dupes ) {
69 dup.mat <- apply( mat[, ( samp.ids %in% dup ), drop=FALSE],
70 1,
71 mean,
72 na.rm=TRUE )
73
74 uniq.mat <- cbind( uniq.mat, dup.mat )
75 }
76 colnames( uniq.mat ) <- c( uniqs, dupes )
77 return( uniq.mat )
78 }
79
80
81 cnames <- colnames( data )
82 rnames <- rownames( data )
83
84 transpose.back <- FALSE
85
86 if ( all( grepl( "^TCGA", rnames ) ) ) {
87 data <- t( data )
88 transpose.back <- TRUE
89 } else {
90 if ( ! all( grepl( "^TCGA", cnames ) ) ) {
91 err.msg <- "can't find any TCGA samples listed in this matrix. If columns are samples, all columns must be a TCGA sample ID. Same if rows are samples.\n"
92 cat( err.msg, file=opt$output.fname )
93 stop( err.msg )
94 }
95 }
96
97 cnames <- remove.periods.from.ids( colnames( data ) )
98 nelts <- as.numeric( names( table( as.factor( sapply( strsplit( cnames, "-" ), function(x) length(x ) ) ) ) ) )
99 if ( length( nelts ) > 1 ) {
100 err.msg <- "Error: Inconsistent TCGA sample barcodes used. Have found ID with different numbers of components in the barcodes used\n"
101 cat( err.msg, file=opt$output.fname )
102 stop( err.msg )
103 }
104
105 if ( opt$remove.normals ) {
106 if ( nelts > 3 ) {
107 normals <- grepl( "^TCGA-..-....-1", cnames )
108 data <- data[ , (! normals ), drop=FALSE ]
109 }
110 }
111
112 if ( opt$num.components < nelts ) {
113 cnames <- reformat.ids( ids=cnames, num.components=opt$num.components )
114 data <- merge.cols( data, cnames )
115 }
116
117 if ( transpose.back ) data <- t( data )
118
119 write.table( data, opt$output.fname, sep="\t", quote=FALSE, col.names=NA )