0
|
1 #!/usr/bin/env Rscript
|
|
2 argspec <- c("fix.and.merge.TCGA.samples.IDs.R takes a clustering from ConsensusClusterPlus and clinical survival data
|
|
3 and generates a KM-plot, along with the log-rank p-values
|
|
4
|
|
5 Usage:
|
|
6 fix.and.merge.TCGA.samples.IDs.R -d <data.file>
|
|
7
|
|
8 \n\n")
|
|
9 args <- commandArgs(TRUE)
|
|
10 if ( length( args ) == 1 && args =="--help") {
|
|
11 write(argspec, stderr())
|
|
12 q();
|
|
13 }
|
|
14
|
1
|
15 ## some helper fn's
|
|
16 write.2.tab <- function( mat,
|
|
17 fname ) {
|
|
18 mat <- rbind( colnames( mat ), mat )
|
|
19 mat <- cbind( c( "ID", rownames( mat )[-1] ),
|
|
20 mat )
|
|
21 write.table( mat, fname, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE )
|
|
22 }
|
|
23
|
0
|
24 lib.load.quiet <- function( package ) {
|
|
25 package <- as.character(substitute(package))
|
|
26 suppressPackageStartupMessages( do.call( "library", list( package=package ) ) )
|
|
27 }
|
|
28 lib.load.quiet(getopt)
|
|
29
|
|
30 spec <- matrix( c( "data.fname", "d", 1, "character",
|
|
31 "num.components", "n", 2, "integer",
|
|
32 "remove.normals", "r", 0, "logical",
|
|
33 "output.fname", "o", 2, "character"
|
|
34 ),
|
|
35 nc=4,
|
|
36 byrow=TRUE
|
|
37 )
|
|
38
|
|
39 opt <- getopt( spec=spec )
|
|
40
|
|
41 data <- as.matrix( read.delim( opt$data.fname, row.names=1, check.names=FALSE ) )
|
|
42 if ( is.null( opt$num.components ) ) { opt$num.components <- 3 }
|
|
43 if ( is.null( opt$remove.normals ) ) { opt$remove.normals <- FALSE }
|
|
44 if ( is.null( opt$output.fname ) ) { opt$output.fname <- paste( "sample.IDs.updated", basename( opt$data.fname ), sep="." ) }
|
|
45
|
|
46 if ( opt$num.components < 3 ) {
|
|
47 err.msg <- "Minimum number of barcode components that can be used is 3\n"
|
|
48 cat( err.msg, file=opt$output.fname )
|
|
49 stop( err.msg )
|
|
50 }
|
|
51 remove.periods.from.ids <- function( ids ) {
|
|
52 return( gsub( "\\.", "-", ids ) )
|
|
53 }
|
|
54
|
|
55
|
|
56 reformat.ids <- function( ids,
|
|
57 num.components=3 ) {
|
|
58 return( sapply( strsplit( ids, "-" ), function(x) paste( x[1:num.components], collapse="-" ) ) )
|
|
59 }
|
|
60
|
|
61
|
|
62 merge.cols <- function( mat,
|
|
63 samp.ids ) {
|
|
64
|
|
65 if ( ! any( duplicated( samp.ids ) ) ) {
|
|
66 colnames( mat ) <- samp.ids
|
|
67 return( mat )
|
|
68 }
|
|
69
|
|
70 dupes <- unique( samp.ids[ duplicated( samp.ids ) ] )
|
|
71 uniqs <- samp.ids[ ! samp.ids %in% dupes ]
|
|
72
|
|
73 uniq.mat <- mat[ , ( samp.ids %in% uniqs ), drop=FALSE ]
|
|
74 colnames( uniq.mat ) <- uniqs
|
|
75
|
|
76 for ( dup in dupes ) {
|
|
77 dup.mat <- apply( mat[, ( samp.ids %in% dup ), drop=FALSE],
|
|
78 1,
|
|
79 mean,
|
|
80 na.rm=TRUE )
|
|
81
|
|
82 uniq.mat <- cbind( uniq.mat, dup.mat )
|
|
83 }
|
|
84 colnames( uniq.mat ) <- c( uniqs, dupes )
|
|
85 return( uniq.mat )
|
|
86 }
|
|
87
|
|
88
|
|
89 cnames <- colnames( data )
|
|
90 rnames <- rownames( data )
|
|
91
|
|
92 transpose.back <- FALSE
|
|
93
|
|
94 if ( all( grepl( "^TCGA", rnames ) ) ) {
|
|
95 data <- t( data )
|
|
96 transpose.back <- TRUE
|
|
97 } else {
|
|
98 if ( ! all( grepl( "^TCGA", cnames ) ) ) {
|
|
99 err.msg <- "can't find any TCGA samples listed in this matrix. If columns are samples, all columns must be a TCGA sample ID. Same if rows are samples.\n"
|
|
100 cat( err.msg, file=opt$output.fname )
|
|
101 stop( err.msg )
|
|
102 }
|
|
103 }
|
|
104
|
|
105 cnames <- remove.periods.from.ids( colnames( data ) )
|
|
106 nelts <- as.numeric( names( table( as.factor( sapply( strsplit( cnames, "-" ), function(x) length(x ) ) ) ) ) )
|
|
107 if ( length( nelts ) > 1 ) {
|
|
108 err.msg <- "Error: Inconsistent TCGA sample barcodes used. Have found ID with different numbers of components in the barcodes used\n"
|
|
109 cat( err.msg, file=opt$output.fname )
|
|
110 stop( err.msg )
|
|
111 }
|
|
112
|
|
113 if ( opt$remove.normals ) {
|
|
114 if ( nelts > 3 ) {
|
|
115 normals <- grepl( "^TCGA-..-....-1", cnames )
|
|
116 data <- data[ , (! normals ), drop=FALSE ]
|
3
|
117 cnames <- cnames[ ! normals ]
|
0
|
118 }
|
|
119 }
|
|
120
|
|
121 if ( opt$num.components < nelts ) {
|
|
122 cnames <- reformat.ids( ids=cnames, num.components=opt$num.components )
|
|
123 data <- merge.cols( data, cnames )
|
|
124 }
|
|
125
|
|
126 if ( transpose.back ) data <- t( data )
|
|
127
|
1
|
128 write.2.tab( data, opt$output.fname )
|