annotate cluster.tools/fix.and.merge.TCGA.sample.IDs.R @ 1:dddfeedb85af draft

Uploaded
author peter-waltman
date Fri, 01 Mar 2013 10:16:53 -0500
parents 0decf3fd54bc
children 563832f48c08
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
1 #!/usr/bin/env Rscript
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
2 argspec <- c("fix.and.merge.TCGA.samples.IDs.R takes a clustering from ConsensusClusterPlus and clinical survival data
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
3 and generates a KM-plot, along with the log-rank p-values
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
4
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
5 Usage:
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
6 fix.and.merge.TCGA.samples.IDs.R -d <data.file>
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
7
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
8 \n\n")
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
9 args <- commandArgs(TRUE)
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
10 if ( length( args ) == 1 && args =="--help") {
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
11 write(argspec, stderr())
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
12 q();
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
13 }
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
14
1
dddfeedb85af Uploaded
peter-waltman
parents: 0
diff changeset
15 ## some helper fn's
dddfeedb85af Uploaded
peter-waltman
parents: 0
diff changeset
16 write.2.tab <- function( mat,
dddfeedb85af Uploaded
peter-waltman
parents: 0
diff changeset
17 fname ) {
dddfeedb85af Uploaded
peter-waltman
parents: 0
diff changeset
18 mat <- rbind( colnames( mat ), mat )
dddfeedb85af Uploaded
peter-waltman
parents: 0
diff changeset
19 mat <- cbind( c( "ID", rownames( mat )[-1] ),
dddfeedb85af Uploaded
peter-waltman
parents: 0
diff changeset
20 mat )
dddfeedb85af Uploaded
peter-waltman
parents: 0
diff changeset
21 write.table( mat, fname, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE )
dddfeedb85af Uploaded
peter-waltman
parents: 0
diff changeset
22 }
dddfeedb85af Uploaded
peter-waltman
parents: 0
diff changeset
23
0
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
24 lib.load.quiet <- function( package ) {
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
25 package <- as.character(substitute(package))
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
26 suppressPackageStartupMessages( do.call( "library", list( package=package ) ) )
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
27 }
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
28 lib.load.quiet(getopt)
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
29
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
30 spec <- matrix( c( "data.fname", "d", 1, "character",
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
31 "num.components", "n", 2, "integer",
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
32 "remove.normals", "r", 0, "logical",
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
33 "output.fname", "o", 2, "character"
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
34 ),
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
35 nc=4,
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
36 byrow=TRUE
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
37 )
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
38
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
39 opt <- getopt( spec=spec )
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
40
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
41 data <- as.matrix( read.delim( opt$data.fname, row.names=1, check.names=FALSE ) )
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
42 if ( is.null( opt$num.components ) ) { opt$num.components <- 3 }
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
43 if ( is.null( opt$remove.normals ) ) { opt$remove.normals <- FALSE }
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
44 if ( is.null( opt$output.fname ) ) { opt$output.fname <- paste( "sample.IDs.updated", basename( opt$data.fname ), sep="." ) }
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
45
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
46 if ( opt$num.components < 3 ) {
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
47 err.msg <- "Minimum number of barcode components that can be used is 3\n"
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
48 cat( err.msg, file=opt$output.fname )
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
49 stop( err.msg )
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
50 }
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
51
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
52 remove.periods.from.ids <- function( ids ) {
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
53 return( gsub( "\\.", "-", ids ) )
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
54 }
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
55
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
56
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
57 reformat.ids <- function( ids,
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
58 num.components=3 ) {
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
59 return( sapply( strsplit( ids, "-" ), function(x) paste( x[1:num.components], collapse="-" ) ) )
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
60 }
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
61
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
62
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
63 merge.cols <- function( mat,
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
64 samp.ids ) {
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
65
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
66 if ( ! any( duplicated( samp.ids ) ) ) {
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
67 colnames( mat ) <- samp.ids
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
68 return( mat )
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
69 }
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
70
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
71 dupes <- unique( samp.ids[ duplicated( samp.ids ) ] )
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
72 uniqs <- samp.ids[ ! samp.ids %in% dupes ]
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
73
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
74 uniq.mat <- mat[ , ( samp.ids %in% uniqs ), drop=FALSE ]
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
75 colnames( uniq.mat ) <- uniqs
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
76
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
77 for ( dup in dupes ) {
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
78 dup.mat <- apply( mat[, ( samp.ids %in% dup ), drop=FALSE],
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
79 1,
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
80 mean,
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
81 na.rm=TRUE )
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
82
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
83 uniq.mat <- cbind( uniq.mat, dup.mat )
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
84 }
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
85 colnames( uniq.mat ) <- c( uniqs, dupes )
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
86 return( uniq.mat )
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
87 }
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
88
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
89
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
90 cnames <- colnames( data )
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
91 rnames <- rownames( data )
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
92
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
93 transpose.back <- FALSE
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
94
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
95 if ( all( grepl( "^TCGA", rnames ) ) ) {
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
96 data <- t( data )
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
97 transpose.back <- TRUE
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
98 } else {
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
99 if ( ! all( grepl( "^TCGA", cnames ) ) ) {
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
100 err.msg <- "can't find any TCGA samples listed in this matrix. If columns are samples, all columns must be a TCGA sample ID. Same if rows are samples.\n"
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
101 cat( err.msg, file=opt$output.fname )
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
102 stop( err.msg )
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
103 }
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
104 }
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
105
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
106 cnames <- remove.periods.from.ids( colnames( data ) )
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
107 nelts <- as.numeric( names( table( as.factor( sapply( strsplit( cnames, "-" ), function(x) length(x ) ) ) ) ) )
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
108 if ( length( nelts ) > 1 ) {
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
109 err.msg <- "Error: Inconsistent TCGA sample barcodes used. Have found ID with different numbers of components in the barcodes used\n"
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
110 cat( err.msg, file=opt$output.fname )
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
111 stop( err.msg )
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
112 }
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
113
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
114 if ( opt$remove.normals ) {
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
115 if ( nelts > 3 ) {
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
116 normals <- grepl( "^TCGA-..-....-1", cnames )
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
117 data <- data[ , (! normals ), drop=FALSE ]
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
118 }
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
119 }
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
120
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
121 if ( opt$num.components < nelts ) {
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
122 cnames <- reformat.ids( ids=cnames, num.components=opt$num.components )
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
123 data <- merge.cols( data, cnames )
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
124 }
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
125
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
126 if ( transpose.back ) data <- t( data )
0decf3fd54bc Uploaded
peter-waltman
parents:
diff changeset
127
1
dddfeedb85af Uploaded
peter-waltman
parents: 0
diff changeset
128 write.2.tab( data, opt$output.fname )