Mercurial > repos > peter-waltman > ucsc_cluster_tools2
comparison cluster.tools/partition.R @ 0:0decf3fd54bc draft
Uploaded
author | peter-waltman |
---|---|
date | Thu, 28 Feb 2013 01:45:39 -0500 |
parents | |
children | a58527c632b7 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:0decf3fd54bc |
---|---|
1 #!/usr/bin/env Rscript | |
2 | |
3 argspec <- c("partition.R help TBD | |
4 \n\n") | |
5 args <- commandArgs(TRUE) | |
6 if ( length( args ) == 1 && args =="--help") { | |
7 write(argspec, stderr()) | |
8 q(); | |
9 } | |
10 | |
11 lib.load.quiet <- function( package ) { | |
12 package <- as.character(substitute(package)) | |
13 suppressPackageStartupMessages( do.call( "library", list( package=package ) ) ) | |
14 } | |
15 lib.load.quiet(getopt) | |
16 lib.load.quiet( amap ) | |
17 lib.load.quiet( cluster ) | |
18 | |
19 ## we're going to use the amap Dist function, but they misname their correlation | |
20 ## functions, so re-name them correctly | |
21 amap.distance <- c( "euclidean", "maximum", "manhattan", "canberra", "binary", | |
22 "pearson", "abspearson", "correlation", "abscorrelation", "spearman", "kendall" ) | |
23 names( amap.distance ) <- c( "euclidean", "maximum", "manhattan", "canberra", "binary", | |
24 "cosine", "abscosine", "pearson", "abspearson", "spearman", "kendall" ) | |
25 | |
26 spec <- matrix( c( "data.fname", "d", 1, "character", | |
27 "algorithm", "a", 2, "character", | |
28 "distance.metric", "m", 2, "character", ## must be one supported by R's dist function | |
29 "dist.obj", "D", 2, "logical", | |
30 "direction", "n", 2, "character", | |
31 "num.k", "k", 2, "integer", | |
32 "output.name", "o", 2, "character" | |
33 ), | |
34 nc=4, | |
35 byrow=TRUE | |
36 ) | |
37 | |
38 opt <- getopt( spec=spec ) | |
39 | |
40 if ( is.null( opt$distance.metric ) ) { opt$distance.metric <- "euclidean" } | |
41 if ( is.null( opt$algorithm ) ) { opt$algorithm <- "km" } | |
42 if ( is.null( opt$dist.obj ) ) { opt$dist.obj <- FALSE } | |
43 if ( is.null( opt$direction ) ) { opt$direction <- "cols" } | |
44 if ( is.null( opt$num.k ) ) { opt$num.k <- 10 } | |
45 if ( is.null( opt$output.name ) ) { opt$output.name <- "partition.result" } | |
46 | |
47 data <- as.matrix( read.delim( opt$data.fname, header=T, row.names=1 , check.names=FALSE ) ) | |
48 | |
49 if ( opt$direction == "cols" ) { | |
50 ## need to transpose b/c both kmeans & pam cluster the rows | |
51 ## this shouldn't have an effect upon a distance matrix | |
52 data <- t( data ) | |
53 } | |
54 if ( opt$num.k > nrow( data ) ) { | |
55 err.msg <- paste( "K specified is greater than the number of elements (", opt$direction, ") in data matrix to be clustereed\n", sep="" ) | |
56 stop( err.msg ) | |
57 } | |
58 | |
59 mat.2.b.clustered <- data | |
60 if ( opt$dist.obj ) { | |
61 ## To be updated | |
62 | |
63 mat.2.b.clustered <- as.dist( data ) | |
64 | |
65 if ( opt$algorithm=="km" ) { | |
66 ##clusterAlg is kmeans | |
67 if (verbose) warning() | |
68 } | |
69 } else { | |
70 ## this is a data matrix -- we always generate a dist.mat object (b/c we need it | |
71 ## in case this result is used with a heatmap | |
72 | |
73 ## PAM clustering | |
74 if ( opt$algorithm != "km" ) { | |
75 | |
76 if ( ! opt$distance.metric %in% names( amap.distance ) ) stop("unsupported distance.") | |
77 mat.2.b.clustered <- Dist( data, method=as.character( amap.distance[ opt$distance.metric ] ) ) | |
78 attr( mat.2.b.clustered, "method" ) <- opt$distance.metric | |
79 | |
80 } else { | |
81 mat.2.b.clustered <- data | |
82 } | |
83 } | |
84 | |
85 ## now run the clustering | |
86 partcl.res <- cl <- NA | |
87 if ( opt$algorithm=="pam" ) { | |
88 partcl.res <- pam( x=mat.2.b.clustered, | |
89 k=opt$num.k, | |
90 metric=( ifelse( inherits(data, "dist" ), | |
91 attr( data, "method" ), ## this is ok if data is a dist object (b/c pam will ignore it) | |
92 opt$distance.metric) ) ) ##, | |
93 ##cluster.only=TRUE ) | |
94 cl <- partcl.res$clustering | |
95 | |
96 if ( is.character( partcl.res$medoids ) ) { | |
97 medoids <- data[ partcl.res$medoids, ] | |
98 } else { | |
99 ##partcl.res$medoids is a matrix -- we shouldn't get this (only if mat.2.b.clustered is a data matrix) | |
100 medoids <- partcl.res$medoids | |
101 } | |
102 med.names <- rownames( medoids ) | |
103 med.hc <- hclust( as.dist( as.matrix( mat.2.b.clustered )[ med.names, med.names ] ) ) | |
104 med.cls <- as.numeric( cl[ med.names[ med.hc$order ] ] ) | |
105 | |
106 cl.list <- lapply( med.cls, function(i) names( cl[ cl %in% i ] ) ) | |
107 names( cl.list ) <- med.cls | |
108 | |
109 cl.list <- lapply( cl.list, | |
110 function( elts ) { | |
111 if ( length( elts ) == 1 ) { | |
112 retval <- 1 | |
113 names( retval ) <- elts | |
114 } else { | |
115 subdist <- as.dist( as.matrix( mat.2.b.clustered )[ elts, elts ] ) | |
116 sub.hc <- hclust( subdist ) | |
117 retval <- sub.hc$order | |
118 names( retval ) <- sub.hc$labels | |
119 retval <- sort( retval ) | |
120 } | |
121 return( retval ) | |
122 } | |
123 ) | |
124 | |
125 fnl.ord <- as.character( unlist( lapply( cl.list, names ) ) ) | |
126 cl <- cl[ fnl.ord ] | |
127 } else { | |
128 partcl.res <- kmeans( x=mat.2.b.clustered, | |
129 centers=opt$num.k ) | |
130 cl <- partcl.res$cluster | |
131 centroids <- partcl.res$centers | |
132 cent.hc <- hclust( Dist( centroids, method=as.character( amap.distance[ opt$distance.metric ] ) ) ) | |
133 cent.cls <- as.numeric( cent.hc$labels[ cent.hc$order ] ) | |
134 | |
135 cl.list <- lapply( cent.cls, function(i) names( cl[ cl %in% i ] ) ) | |
136 names( cl.list ) <- cent.cls | |
137 | |
138 cl.list <- lapply( cl.list, | |
139 function( elts ) { | |
140 if ( length( elts ) == 1 ) { | |
141 retval <- 1 | |
142 names( retval ) <- elts | |
143 } else { | |
144 if ( all( elts %in% colnames( mat.2.b.clustered ) ) ) { | |
145 submat <- mat.2.b.clustered[ elts, elts ] | |
146 } else { | |
147 submat <- mat.2.b.clustered[ elts, ] | |
148 } | |
149 subdist <- Dist( submat, method=as.character( amap.distance[ opt$distance.metric ] ) ) | |
150 sub.hc <- hclust( subdist ) | |
151 retval <- sub.hc$order | |
152 names( retval ) <- sub.hc$labels | |
153 retval <- sort( retval ) | |
154 } | |
155 return( retval ) | |
156 } | |
157 ) | |
158 | |
159 fnl.ord <- as.character( unlist( lapply( cl.list, names ) ) ) | |
160 cl <- cl[ fnl.ord ] | |
161 } | |
162 | |
163 if ( opt$direction == "cols" ) { | |
164 ## need to transpose back | |
165 data <- t( data ) | |
166 } | |
167 save( file=opt$output.name, partcl.res, cl, data ) | |
168 |