Mercurial > repos > peter-waltman > ucsc_cluster_tools2
comparison cluster.tools/fix.and.merge.TCGA.sample.IDs.R @ 0:0decf3fd54bc draft
Uploaded
author | peter-waltman |
---|---|
date | Thu, 28 Feb 2013 01:45:39 -0500 |
parents | |
children | dddfeedb85af |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:0decf3fd54bc |
---|---|
1 #!/usr/bin/env Rscript | |
2 argspec <- c("fix.and.merge.TCGA.samples.IDs.R takes a clustering from ConsensusClusterPlus and clinical survival data | |
3 and generates a KM-plot, along with the log-rank p-values | |
4 | |
5 Usage: | |
6 fix.and.merge.TCGA.samples.IDs.R -d <data.file> | |
7 | |
8 \n\n") | |
9 args <- commandArgs(TRUE) | |
10 if ( length( args ) == 1 && args =="--help") { | |
11 write(argspec, stderr()) | |
12 q(); | |
13 } | |
14 | |
15 lib.load.quiet <- function( package ) { | |
16 package <- as.character(substitute(package)) | |
17 suppressPackageStartupMessages( do.call( "library", list( package=package ) ) ) | |
18 } | |
19 lib.load.quiet(getopt) | |
20 | |
21 spec <- matrix( c( "data.fname", "d", 1, "character", | |
22 "num.components", "n", 2, "integer", | |
23 "remove.normals", "r", 0, "logical", | |
24 "output.fname", "o", 2, "character" | |
25 ), | |
26 nc=4, | |
27 byrow=TRUE | |
28 ) | |
29 | |
30 opt <- getopt( spec=spec ) | |
31 | |
32 data <- as.matrix( read.delim( opt$data.fname, row.names=1, check.names=FALSE ) ) | |
33 if ( is.null( opt$num.components ) ) { opt$num.components <- 3 } | |
34 if ( is.null( opt$remove.normals ) ) { opt$remove.normals <- FALSE } | |
35 if ( is.null( opt$output.fname ) ) { opt$output.fname <- paste( "sample.IDs.updated", basename( opt$data.fname ), sep="." ) } | |
36 | |
37 if ( opt$num.components < 3 ) { | |
38 err.msg <- "Minimum number of barcode components that can be used is 3\n" | |
39 cat( err.msg, file=opt$output.fname ) | |
40 stop( err.msg ) | |
41 } | |
42 | |
43 remove.periods.from.ids <- function( ids ) { | |
44 return( gsub( "\\.", "-", ids ) ) | |
45 } | |
46 | |
47 | |
48 reformat.ids <- function( ids, | |
49 num.components=3 ) { | |
50 return( sapply( strsplit( ids, "-" ), function(x) paste( x[1:num.components], collapse="-" ) ) ) | |
51 } | |
52 | |
53 | |
54 merge.cols <- function( mat, | |
55 samp.ids ) { | |
56 | |
57 if ( ! any( duplicated( samp.ids ) ) ) { | |
58 colnames( mat ) <- samp.ids | |
59 return( mat ) | |
60 } | |
61 | |
62 dupes <- unique( samp.ids[ duplicated( samp.ids ) ] ) | |
63 uniqs <- samp.ids[ ! samp.ids %in% dupes ] | |
64 | |
65 uniq.mat <- mat[ , ( samp.ids %in% uniqs ), drop=FALSE ] | |
66 colnames( uniq.mat ) <- uniqs | |
67 | |
68 for ( dup in dupes ) { | |
69 dup.mat <- apply( mat[, ( samp.ids %in% dup ), drop=FALSE], | |
70 1, | |
71 mean, | |
72 na.rm=TRUE ) | |
73 | |
74 uniq.mat <- cbind( uniq.mat, dup.mat ) | |
75 } | |
76 colnames( uniq.mat ) <- c( uniqs, dupes ) | |
77 return( uniq.mat ) | |
78 } | |
79 | |
80 | |
81 cnames <- colnames( data ) | |
82 rnames <- rownames( data ) | |
83 | |
84 transpose.back <- FALSE | |
85 | |
86 if ( all( grepl( "^TCGA", rnames ) ) ) { | |
87 data <- t( data ) | |
88 transpose.back <- TRUE | |
89 } else { | |
90 if ( ! all( grepl( "^TCGA", cnames ) ) ) { | |
91 err.msg <- "can't find any TCGA samples listed in this matrix. If columns are samples, all columns must be a TCGA sample ID. Same if rows are samples.\n" | |
92 cat( err.msg, file=opt$output.fname ) | |
93 stop( err.msg ) | |
94 } | |
95 } | |
96 | |
97 cnames <- remove.periods.from.ids( colnames( data ) ) | |
98 nelts <- as.numeric( names( table( as.factor( sapply( strsplit( cnames, "-" ), function(x) length(x ) ) ) ) ) ) | |
99 if ( length( nelts ) > 1 ) { | |
100 err.msg <- "Error: Inconsistent TCGA sample barcodes used. Have found ID with different numbers of components in the barcodes used\n" | |
101 cat( err.msg, file=opt$output.fname ) | |
102 stop( err.msg ) | |
103 } | |
104 | |
105 if ( opt$remove.normals ) { | |
106 if ( nelts > 3 ) { | |
107 normals <- grepl( "^TCGA-..-....-1", cnames ) | |
108 data <- data[ , (! normals ), drop=FALSE ] | |
109 } | |
110 } | |
111 | |
112 if ( opt$num.components < nelts ) { | |
113 cnames <- reformat.ids( ids=cnames, num.components=opt$num.components ) | |
114 data <- merge.cols( data, cnames ) | |
115 } | |
116 | |
117 if ( transpose.back ) data <- t( data ) | |
118 | |
119 write.table( data, opt$output.fname, sep="\t", quote=FALSE, col.names=NA ) |