comparison cluster.tools/partition.R @ 2:b442996b66ae draft

Uploaded
author peter-waltman
date Wed, 27 Feb 2013 20:17:04 -0500
parents
children
comparison
equal deleted inserted replaced
1:e25d2bece0a2 2:b442996b66ae
1 #!/usr/bin/env Rscript
2
3 argspec <- c("partition.R help TBD
4 \n\n")
5 args <- commandArgs(TRUE)
6 if ( length( args ) == 1 && args =="--help") {
7 write(argspec, stderr())
8 q();
9 }
10
11 lib.load.quiet <- function( package ) {
12 package <- as.character(substitute(package))
13 suppressPackageStartupMessages( do.call( "library", list( package=package ) ) )
14 }
15 lib.load.quiet(getopt)
16 lib.load.quiet( amap )
17 lib.load.quiet( cluster )
18
19 ## we're going to use the amap Dist function, but they misname their correlation
20 ## functions, so re-name them correctly
21 amap.distance <- c( "euclidean", "maximum", "manhattan", "canberra", "binary",
22 "pearson", "abspearson", "correlation", "abscorrelation", "spearman", "kendall" )
23 names( amap.distance ) <- c( "euclidean", "maximum", "manhattan", "canberra", "binary",
24 "cosine", "abscosine", "pearson", "abspearson", "spearman", "kendall" )
25
26 spec <- matrix( c( "data.fname", "d", 1, "character",
27 "algorithm", "a", 2, "character",
28 "distance.metric", "m", 2, "character", ## must be one supported by R's dist function
29 "dist.obj", "D", 2, "logical",
30 "direction", "n", 2, "character",
31 "num.k", "k", 2, "integer",
32 "output.name", "o", 2, "character"
33 ),
34 nc=4,
35 byrow=TRUE
36 )
37
38 opt <- getopt( spec=spec )
39
40 if ( is.null( opt$distance.metric ) ) { opt$distance.metric <- "euclidean" }
41 if ( is.null( opt$algorithm ) ) { opt$algorithm <- "km" }
42 if ( is.null( opt$dist.obj ) ) { opt$dist.obj <- FALSE }
43 if ( is.null( opt$direction ) ) { opt$direction <- "cols" }
44 if ( is.null( opt$num.k ) ) { opt$num.k <- 10 }
45 if ( is.null( opt$output.name ) ) { opt$output.name <- "partition.result" }
46
47 data <- as.matrix( read.delim( opt$data.fname, header=T, row.names=1 , check.names=FALSE ) )
48
49 if ( opt$direction == "cols" ) {
50 ## need to transpose b/c both kmeans & pam cluster the rows
51 ## this shouldn't have an effect upon a distance matrix
52 data <- t( data )
53 }
54 if ( opt$num.k > nrow( data ) ) {
55 err.msg <- paste( "K specified is greater than the number of elements (", opt$direction, ") in data matrix to be clustereed\n", sep="" )
56 stop( err.msg )
57 }
58
59 mat.2.b.clustered <- data
60 if ( opt$dist.obj ) {
61 ## To be updated
62
63 mat.2.b.clustered <- as.dist( data )
64
65 if ( opt$algorithm=="km" ) {
66 ##clusterAlg is kmeans
67 if (verbose) warning()
68 }
69 } else {
70 ## this is a data matrix -- we always generate a dist.mat object (b/c we need it
71 ## in case this result is used with a heatmap
72
73 ## PAM clustering
74 if ( opt$algorithm != "km" ) {
75
76 if ( ! opt$distance.metric %in% names( amap.distance ) ) stop("unsupported distance.")
77 mat.2.b.clustered <- Dist( data, method=as.character( amap.distance[ opt$distance.metric ] ) )
78 attr( mat.2.b.clustered, "method" ) <- opt$distance.metric
79
80 } else {
81 mat.2.b.clustered <- data
82 }
83 }
84
85 ## now run the clustering
86 partcl.res <- cl <- NA
87 if ( opt$algorithm=="pam" ) {
88 partcl.res <- pam( x=mat.2.b.clustered,
89 k=opt$num.k,
90 metric=( ifelse( inherits(data, "dist" ),
91 attr( data, "method" ), ## this is ok if data is a dist object (b/c pam will ignore it)
92 opt$distance.metric) ) ) ##,
93 ##cluster.only=TRUE )
94 cl <- partcl.res$clustering
95
96 if ( is.character( partcl.res$medoids ) ) {
97 medoids <- data[ partcl.res$medoids, ]
98 } else {
99 ##partcl.res$medoids is a matrix -- we shouldn't get this (only if mat.2.b.clustered is a data matrix)
100 medoids <- partcl.res$medoids
101 }
102 med.names <- rownames( medoids )
103 med.hc <- hclust( as.dist( as.matrix( mat.2.b.clustered )[ med.names, med.names ] ) )
104 med.cls <- as.numeric( cl[ med.names[ med.hc$order ] ] )
105
106 cl.list <- lapply( med.cls, function(i) names( cl[ cl %in% i ] ) )
107 names( cl.list ) <- med.cls
108
109 cl.list <- lapply( cl.list,
110 function( elts ) {
111 if ( length( elts ) == 1 ) {
112 retval <- 1
113 names( retval ) <- elts
114 } else {
115 subdist <- as.dist( as.matrix( mat.2.b.clustered )[ elts, elts ] )
116 sub.hc <- hclust( subdist )
117 retval <- sub.hc$order
118 names( retval ) <- sub.hc$labels
119 retval <- sort( retval )
120 }
121 return( retval )
122 }
123 )
124
125 fnl.ord <- as.character( unlist( lapply( cl.list, names ) ) )
126 cl <- cl[ fnl.ord ]
127 } else {
128 partcl.res <- kmeans( x=mat.2.b.clustered,
129 centers=opt$num.k )
130 cl <- partcl.res$cluster
131 centroids <- partcl.res$centers
132 cent.hc <- hclust( Dist( centroids, method=as.character( amap.distance[ opt$distance.metric ] ) ) )
133 cent.cls <- as.numeric( cent.hc$labels[ cent.hc$order ] )
134
135 cl.list <- lapply( cent.cls, function(i) names( cl[ cl %in% i ] ) )
136 names( cl.list ) <- cent.cls
137
138 cl.list <- lapply( cl.list,
139 function( elts ) {
140 if ( length( elts ) == 1 ) {
141 retval <- 1
142 names( retval ) <- elts
143 } else {
144 if ( all( elts %in% colnames( mat.2.b.clustered ) ) ) {
145 submat <- mat.2.b.clustered[ elts, elts ]
146 } else {
147 submat <- mat.2.b.clustered[ elts, ]
148 }
149 subdist <- Dist( submat, method=as.character( amap.distance[ opt$distance.metric ] ) )
150 sub.hc <- hclust( subdist )
151 retval <- sub.hc$order
152 names( retval ) <- sub.hc$labels
153 retval <- sort( retval )
154 }
155 return( retval )
156 }
157 )
158
159 fnl.ord <- as.character( unlist( lapply( cl.list, names ) ) )
160 cl <- cl[ fnl.ord ]
161 }
162
163 if ( opt$direction == "cols" ) {
164 ## need to transpose back
165 data <- t( data )
166 }
167 save( file=opt$output.name, partcl.res, cl, data )
168