0
|
1 #!/usr/bin/env Rscript
|
|
2
|
|
3 argspec <- c("partition.R help TBD
|
|
4 \n\n")
|
|
5 args <- commandArgs(TRUE)
|
|
6 if ( length( args ) == 1 && args =="--help") {
|
|
7 write(argspec, stderr())
|
|
8 q();
|
|
9 }
|
|
10
|
|
11 lib.load.quiet <- function( package ) {
|
|
12 package <- as.character(substitute(package))
|
|
13 suppressPackageStartupMessages( do.call( "library", list( package=package ) ) )
|
|
14 }
|
|
15 lib.load.quiet(getopt)
|
|
16 lib.load.quiet( amap )
|
|
17 lib.load.quiet( cluster )
|
|
18
|
|
19 ## we're going to use the amap Dist function, but they misname their correlation
|
|
20 ## functions, so re-name them correctly
|
|
21 amap.distance <- c( "euclidean", "maximum", "manhattan", "canberra", "binary",
|
|
22 "pearson", "abspearson", "correlation", "abscorrelation", "spearman", "kendall" )
|
|
23 names( amap.distance ) <- c( "euclidean", "maximum", "manhattan", "canberra", "binary",
|
|
24 "cosine", "abscosine", "pearson", "abspearson", "spearman", "kendall" )
|
|
25
|
|
26 spec <- matrix( c( "data.fname", "d", 1, "character",
|
|
27 "algorithm", "a", 2, "character",
|
|
28 "distance.metric", "m", 2, "character", ## must be one supported by R's dist function
|
|
29 "dist.obj", "D", 2, "logical",
|
|
30 "direction", "n", 2, "character",
|
|
31 "num.k", "k", 2, "integer",
|
|
32 "output.name", "o", 2, "character"
|
|
33 ),
|
|
34 nc=4,
|
|
35 byrow=TRUE
|
|
36 )
|
|
37
|
|
38 opt <- getopt( spec=spec )
|
|
39
|
|
40 if ( is.null( opt$distance.metric ) ) { opt$distance.metric <- "euclidean" }
|
|
41 if ( is.null( opt$algorithm ) ) { opt$algorithm <- "km" }
|
|
42 if ( is.null( opt$dist.obj ) ) { opt$dist.obj <- FALSE }
|
|
43 if ( is.null( opt$direction ) ) { opt$direction <- "cols" }
|
|
44 if ( is.null( opt$num.k ) ) { opt$num.k <- 10 }
|
|
45 if ( is.null( opt$output.name ) ) { opt$output.name <- "partition.result" }
|
|
46
|
|
47 data <- as.matrix( read.delim( opt$data.fname, header=T, row.names=1 , check.names=FALSE ) )
|
|
48
|
|
49 if ( opt$direction == "cols" ) {
|
|
50 ## need to transpose b/c both kmeans & pam cluster the rows
|
|
51 ## this shouldn't have an effect upon a distance matrix
|
|
52 data <- t( data )
|
|
53 }
|
|
54 if ( opt$num.k > nrow( data ) ) {
|
|
55 err.msg <- paste( "K specified is greater than the number of elements (", opt$direction, ") in data matrix to be clustereed\n", sep="" )
|
|
56 stop( err.msg )
|
|
57 }
|
|
58
|
|
59 mat.2.b.clustered <- data
|
|
60 if ( opt$dist.obj ) {
|
|
61 ## To be updated
|
|
62
|
|
63 mat.2.b.clustered <- as.dist( data )
|
|
64
|
|
65 if ( opt$algorithm=="km" ) {
|
|
66 ##clusterAlg is kmeans
|
|
67 if (verbose) warning()
|
|
68 }
|
|
69 } else {
|
|
70 ## this is a data matrix -- we always generate a dist.mat object (b/c we need it
|
|
71 ## in case this result is used with a heatmap
|
|
72
|
|
73 ## PAM clustering
|
|
74 if ( opt$algorithm != "km" ) {
|
|
75
|
|
76 if ( ! opt$distance.metric %in% names( amap.distance ) ) stop("unsupported distance.")
|
|
77 mat.2.b.clustered <- Dist( data, method=as.character( amap.distance[ opt$distance.metric ] ) )
|
|
78 attr( mat.2.b.clustered, "method" ) <- opt$distance.metric
|
|
79
|
|
80 } else {
|
|
81 mat.2.b.clustered <- data
|
|
82 }
|
|
83 }
|
|
84
|
|
85 ## now run the clustering
|
|
86 partcl.res <- cl <- NA
|
|
87 if ( opt$algorithm=="pam" ) {
|
|
88 partcl.res <- pam( x=mat.2.b.clustered,
|
|
89 k=opt$num.k,
|
|
90 metric=( ifelse( inherits(data, "dist" ),
|
|
91 attr( data, "method" ), ## this is ok if data is a dist object (b/c pam will ignore it)
|
|
92 opt$distance.metric) ) ) ##,
|
|
93 ##cluster.only=TRUE )
|
|
94 cl <- partcl.res$clustering
|
|
95
|
|
96 if ( is.character( partcl.res$medoids ) ) {
|
|
97 medoids <- data[ partcl.res$medoids, ]
|
|
98 } else {
|
|
99 ##partcl.res$medoids is a matrix -- we shouldn't get this (only if mat.2.b.clustered is a data matrix)
|
|
100 medoids <- partcl.res$medoids
|
|
101 }
|
|
102 med.names <- rownames( medoids )
|
|
103 med.hc <- hclust( as.dist( as.matrix( mat.2.b.clustered )[ med.names, med.names ] ) )
|
|
104 med.cls <- as.numeric( cl[ med.names[ med.hc$order ] ] )
|
|
105
|
|
106 cl.list <- lapply( med.cls, function(i) names( cl[ cl %in% i ] ) )
|
|
107 names( cl.list ) <- med.cls
|
|
108
|
|
109 cl.list <- lapply( cl.list,
|
|
110 function( elts ) {
|
|
111 if ( length( elts ) == 1 ) {
|
|
112 retval <- 1
|
|
113 names( retval ) <- elts
|
|
114 } else {
|
|
115 subdist <- as.dist( as.matrix( mat.2.b.clustered )[ elts, elts ] )
|
|
116 sub.hc <- hclust( subdist )
|
|
117 retval <- sub.hc$order
|
|
118 names( retval ) <- sub.hc$labels
|
|
119 retval <- sort( retval )
|
|
120 }
|
|
121 return( retval )
|
|
122 }
|
|
123 )
|
|
124
|
|
125 fnl.ord <- as.character( unlist( lapply( cl.list, names ) ) )
|
|
126 cl <- cl[ fnl.ord ]
|
|
127 } else {
|
|
128 partcl.res <- kmeans( x=mat.2.b.clustered,
|
|
129 centers=opt$num.k )
|
|
130 cl <- partcl.res$cluster
|
|
131 centroids <- partcl.res$centers
|
|
132 cent.hc <- hclust( Dist( centroids, method=as.character( amap.distance[ opt$distance.metric ] ) ) )
|
|
133 cent.cls <- as.numeric( cent.hc$labels[ cent.hc$order ] )
|
|
134
|
|
135 cl.list <- lapply( cent.cls, function(i) names( cl[ cl %in% i ] ) )
|
|
136 names( cl.list ) <- cent.cls
|
|
137
|
|
138 cl.list <- lapply( cl.list,
|
|
139 function( elts ) {
|
|
140 if ( length( elts ) == 1 ) {
|
|
141 retval <- 1
|
|
142 names( retval ) <- elts
|
|
143 } else {
|
|
144 if ( all( elts %in% colnames( mat.2.b.clustered ) ) ) {
|
|
145 submat <- mat.2.b.clustered[ elts, elts ]
|
|
146 } else {
|
|
147 submat <- mat.2.b.clustered[ elts, ]
|
|
148 }
|
|
149 subdist <- Dist( submat, method=as.character( amap.distance[ opt$distance.metric ] ) )
|
|
150 sub.hc <- hclust( subdist )
|
|
151 retval <- sub.hc$order
|
|
152 names( retval ) <- sub.hc$labels
|
|
153 retval <- sort( retval )
|
|
154 }
|
|
155 return( retval )
|
|
156 }
|
|
157 )
|
|
158
|
|
159 fnl.ord <- as.character( unlist( lapply( cl.list, names ) ) )
|
|
160 cl <- cl[ fnl.ord ]
|
|
161 }
|
|
162
|
|
163 if ( opt$direction == "cols" ) {
|
|
164 ## need to transpose back
|
|
165 data <- t( data )
|
|
166 }
|
|
167 save( file=opt$output.name, partcl.res, cl, data )
|
|
168
|