0
|
1 #!/usr/bin/env Rscript
|
|
2
|
|
3 argspec <- c("partition.R help TBD
|
|
4 \n\n")
|
|
5 args <- commandArgs(TRUE)
|
|
6 if ( length( args ) == 1 && args =="--help") {
|
|
7 write(argspec, stderr())
|
|
8 q();
|
|
9 }
|
|
10
|
|
11 lib.load.quiet <- function( package ) {
|
|
12 package <- as.character(substitute(package))
|
|
13 suppressPackageStartupMessages( do.call( "library", list( package=package ) ) )
|
|
14 }
|
|
15 lib.load.quiet(getopt)
|
|
16 lib.load.quiet( amap )
|
|
17 lib.load.quiet( cluster )
|
|
18
|
|
19 ## we're going to use the amap Dist function, but they misname their correlation
|
|
20 ## functions, so re-name them correctly
|
|
21 amap.distance <- c( "euclidean", "maximum", "manhattan", "canberra", "binary",
|
|
22 "pearson", "abspearson", "correlation", "abscorrelation", "spearman", "kendall" )
|
|
23 names( amap.distance ) <- c( "euclidean", "maximum", "manhattan", "canberra", "binary",
|
|
24 "cosine", "abscosine", "pearson", "abspearson", "spearman", "kendall" )
|
|
25
|
|
26 spec <- matrix( c( "data.fname", "d", 1, "character",
|
|
27 "algorithm", "a", 2, "character",
|
|
28 "distance.metric", "m", 2, "character", ## must be one supported by R's dist function
|
|
29 "dist.obj", "D", 2, "logical",
|
|
30 "direction", "n", 2, "character",
|
|
31 "num.k", "k", 2, "integer",
|
|
32 "output.name", "o", 2, "character"
|
|
33 ),
|
|
34 nc=4,
|
|
35 byrow=TRUE
|
|
36 )
|
|
37
|
|
38 opt <- getopt( spec=spec )
|
8
|
39 data <- as.matrix( read.delim( opt$data.fname, header=T, row.names=1 , check.names=FALSE ) )
|
0
|
40
|
|
41 if ( is.null( opt$distance.metric ) ) { opt$distance.metric <- "euclidean" }
|
|
42 if ( is.null( opt$algorithm ) ) { opt$algorithm <- "km" }
|
|
43 if ( is.null( opt$dist.obj ) ) { opt$dist.obj <- FALSE }
|
|
44 if ( is.null( opt$direction ) ) { opt$direction <- "cols" }
|
|
45 if ( is.null( opt$output.name ) ) { opt$output.name <- "partition.result" }
|
8
|
46 if ( is.null( opt$num.k ) || ( opt$num.k == -1 )) {
|
|
47 if ( opt$direction == 'cols' ) {
|
|
48 opt$num.k <- 5
|
|
49 } else if ( opt$direction == 'rows' ) {
|
|
50 opt$num.k <- nrow( data ) / 30 ## we use an estimated average size of gene clusters to be 30
|
|
51 if ( opt$num.k > 1000 ) {
|
|
52 opt$num.k <- ( opt$num.k %/% 10 ) * 10
|
|
53 } else {
|
|
54 opt$num.k <- ( opt$num.k %/% 5 ) * 5
|
|
55 }
|
|
56 }
|
|
57 }
|
0
|
58
|
|
59
|
|
60 if ( opt$direction == "cols" ) {
|
|
61 ## need to transpose b/c both kmeans & pam cluster the rows
|
|
62 ## this shouldn't have an effect upon a distance matrix
|
|
63 data <- t( data )
|
|
64 }
|
|
65 if ( opt$num.k > nrow( data ) ) {
|
|
66 err.msg <- paste( "K specified is greater than the number of elements (", opt$direction, ") in data matrix to be clustereed\n", sep="" )
|
|
67 stop( err.msg )
|
|
68 }
|
|
69
|
|
70 mat.2.b.clustered <- data
|
|
71 if ( opt$dist.obj ) {
|
|
72 ## To be updated
|
|
73
|
|
74 mat.2.b.clustered <- as.dist( data )
|
|
75
|
|
76 if ( opt$algorithm=="km" ) {
|
|
77 ##clusterAlg is kmeans
|
|
78 if (verbose) warning()
|
|
79 }
|
|
80 } else {
|
|
81 ## this is a data matrix -- we always generate a dist.mat object (b/c we need it
|
|
82 ## in case this result is used with a heatmap
|
|
83
|
|
84 ## PAM clustering
|
|
85 if ( opt$algorithm != "km" ) {
|
|
86
|
|
87 if ( ! opt$distance.metric %in% names( amap.distance ) ) stop("unsupported distance.")
|
|
88 mat.2.b.clustered <- Dist( data, method=as.character( amap.distance[ opt$distance.metric ] ) )
|
|
89 attr( mat.2.b.clustered, "method" ) <- opt$distance.metric
|
|
90
|
|
91 } else {
|
|
92 mat.2.b.clustered <- data
|
|
93 }
|
|
94 }
|
|
95
|
|
96 ## now run the clustering
|
|
97 partcl.res <- cl <- NA
|
|
98 if ( opt$algorithm=="pam" ) {
|
|
99 partcl.res <- pam( x=mat.2.b.clustered,
|
|
100 k=opt$num.k,
|
|
101 metric=( ifelse( inherits(data, "dist" ),
|
|
102 attr( data, "method" ), ## this is ok if data is a dist object (b/c pam will ignore it)
|
|
103 opt$distance.metric) ) ) ##,
|
|
104 ##cluster.only=TRUE )
|
|
105 cl <- partcl.res$clustering
|
|
106
|
|
107 if ( is.character( partcl.res$medoids ) ) {
|
|
108 medoids <- data[ partcl.res$medoids, ]
|
|
109 } else {
|
|
110 ##partcl.res$medoids is a matrix -- we shouldn't get this (only if mat.2.b.clustered is a data matrix)
|
|
111 medoids <- partcl.res$medoids
|
|
112 }
|
|
113 med.names <- rownames( medoids )
|
|
114 med.hc <- hclust( as.dist( as.matrix( mat.2.b.clustered )[ med.names, med.names ] ) )
|
|
115 med.cls <- as.numeric( cl[ med.names[ med.hc$order ] ] )
|
|
116
|
|
117 cl.list <- lapply( med.cls, function(i) names( cl[ cl %in% i ] ) )
|
|
118 names( cl.list ) <- med.cls
|
|
119
|
|
120 cl.list <- lapply( cl.list,
|
|
121 function( elts ) {
|
|
122 if ( length( elts ) == 1 ) {
|
|
123 retval <- 1
|
|
124 names( retval ) <- elts
|
|
125 } else {
|
|
126 subdist <- as.dist( as.matrix( mat.2.b.clustered )[ elts, elts ] )
|
|
127 sub.hc <- hclust( subdist )
|
|
128 retval <- sub.hc$order
|
|
129 names( retval ) <- sub.hc$labels
|
|
130 retval <- sort( retval )
|
|
131 }
|
|
132 return( retval )
|
|
133 }
|
|
134 )
|
|
135
|
|
136 fnl.ord <- as.character( unlist( lapply( cl.list, names ) ) )
|
|
137 cl <- cl[ fnl.ord ]
|
|
138 } else {
|
|
139 partcl.res <- kmeans( x=mat.2.b.clustered,
|
|
140 centers=opt$num.k )
|
|
141 cl <- partcl.res$cluster
|
|
142 centroids <- partcl.res$centers
|
|
143 cent.hc <- hclust( Dist( centroids, method=as.character( amap.distance[ opt$distance.metric ] ) ) )
|
|
144 cent.cls <- as.numeric( cent.hc$labels[ cent.hc$order ] )
|
|
145
|
|
146 cl.list <- lapply( cent.cls, function(i) names( cl[ cl %in% i ] ) )
|
|
147 names( cl.list ) <- cent.cls
|
|
148
|
|
149 cl.list <- lapply( cl.list,
|
|
150 function( elts ) {
|
|
151 if ( length( elts ) == 1 ) {
|
|
152 retval <- 1
|
|
153 names( retval ) <- elts
|
|
154 } else {
|
|
155 if ( all( elts %in% colnames( mat.2.b.clustered ) ) ) {
|
|
156 submat <- mat.2.b.clustered[ elts, elts ]
|
|
157 } else {
|
|
158 submat <- mat.2.b.clustered[ elts, ]
|
|
159 }
|
|
160 subdist <- Dist( submat, method=as.character( amap.distance[ opt$distance.metric ] ) )
|
|
161 sub.hc <- hclust( subdist )
|
|
162 retval <- sub.hc$order
|
|
163 names( retval ) <- sub.hc$labels
|
|
164 retval <- sort( retval )
|
|
165 }
|
|
166 return( retval )
|
|
167 }
|
|
168 )
|
|
169
|
|
170 fnl.ord <- as.character( unlist( lapply( cl.list, names ) ) )
|
|
171 cl <- cl[ fnl.ord ]
|
|
172 }
|
|
173
|
|
174 if ( opt$direction == "cols" ) {
|
|
175 ## need to transpose back
|
|
176 data <- t( data )
|
|
177 }
|
|
178 save( file=opt$output.name, partcl.res, cl, data )
|
|
179
|