comparison numeric_clustering.xml @ 19:8a7b460ab534 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 5d71c93a3dd804b1469852240a86021ab9130364
author bgruening
date Mon, 09 Jul 2018 14:27:04 -0400
parents 4edccd1eaaf0
children 60d1b396cea2
comparison
equal deleted inserted replaced
18:06d67d77907c 19:8a7b460ab534
20 import sklearn.cluster 20 import sklearn.cluster
21 import pandas 21 import pandas
22 from sklearn import metrics 22 from sklearn import metrics
23 from scipy.io import mmread 23 from scipy.io import mmread
24 24
25 @COLUMNS_FUNCTION@
26
25 input_json_path = sys.argv[1] 27 input_json_path = sys.argv[1]
26 params = json.load(open(input_json_path, "r")) 28 params = json.load(open(input_json_path, "r"))
27 29
28 selected_algorithm = params["input_types"]["algorithm_options"]["selected_algorithm"] 30 selected_algorithm = params["input_types"]["algorithm_options"]["selected_algorithm"]
29 31
35 37
36 #if $input_types.selected_input_type == "sparse": 38 #if $input_types.selected_input_type == "sparse":
37 data_matrix = mmread(open("$infile", 'r')) 39 data_matrix = mmread(open("$infile", 'r'))
38 #else: 40 #else:
39 data = pandas.read_csv("$infile", sep='\t', header=0, index_col=None, parse_dates=True, encoding=None, tupleize_cols=False ) 41 data = pandas.read_csv("$infile", sep='\t', header=0, index_col=None, parse_dates=True, encoding=None, tupleize_cols=False )
40 42 header = 'infer' if params["input_types"]["header"] else None
41 start_column = $input_types.start_column 43 column_option = params["input_types"]["column_selector_options"]["selected_column_selector_option"]
42 end_column = $input_types.end_column 44 if column_option in ["by_index_number", "all_but_by_index_number", "by_header_name", "all_but_by_header_name"]:
43 45 c = params["input_types"]["column_selector_options"]["col"]
44 if end_column and start_column:
45 if end_column >= start_column:
46 data_matrix = data.values[:, start_column-1:end_column]
47 else:
48 data_matrix = data.values
49 else: 46 else:
50 data_matrix = data.values 47 c = None
48 data_matrix = read_columns(
49 "$infile",
50 c = c,
51 c_option = column_option,
52 sep='\t',
53 header=header,
54 parse_dates=True,
55 encoding=None,
56 tupleize_cols=False
57 )
51 #end if 58 #end if
52 59
53 prediction = cluster_object.fit_predict( data_matrix ) 60 prediction = cluster_object.fit_predict( data_matrix )
54 61
55 if len(np.unique(prediction)) > 1: 62 if len(np.unique(prediction)) > 1:
80 <param name="infile" type="data" format="txt" label="Sparse vector (scipy.sparse.csr_matrix) file:" help="The following clustering algorithms support sparse matrix operations: ''Birch'', ''DBSCAN'', ''KMeans'', ''Mini BatchK Means'', and ''Spectral Clustering''. If your data is in tabular format, please use other clustering algorithms."/> 87 <param name="infile" type="data" format="txt" label="Sparse vector (scipy.sparse.csr_matrix) file:" help="The following clustering algorithms support sparse matrix operations: ''Birch'', ''DBSCAN'', ''KMeans'', ''Mini BatchK Means'', and ''Spectral Clustering''. If your data is in tabular format, please use other clustering algorithms."/>
81 <expand macro="clustering_algorithms_options"/> 88 <expand macro="clustering_algorithms_options"/>
82 </when> 89 </when>
83 <when value="tabular"> 90 <when value="tabular">
84 <param name="infile" type="data" format="tabular" label="Data file with numeric values"/> 91 <param name="infile" type="data" format="tabular" label="Data file with numeric values"/>
85 <param name="start_column" type="data_column" data_ref="infile" optional="True" label="Select a subset of data. Start column:" /> 92 <param name="header" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="True" label="Does the dataset contain header:" />
86 <param name="end_column" type="data_column" data_ref="infile" optional="True" label="End column:" /> 93 <conditional name="column_selector_options">
94 <expand macro="samples_column_selector_options" col_name="col" multiple="true" infile="infile"/>
95 </conditional>
87 <!--expand macro="clustering_algorithms_options"--> 96 <!--expand macro="clustering_algorithms_options"-->
88 <conditional name="algorithm_options"> 97 <conditional name="algorithm_options">
89 <param name="selected_algorithm" type="select" label="Clustering Algorithm"> 98 <param name="selected_algorithm" type="select" label="Clustering Algorithm">
90 <option value="AgglomerativeClustering">Hierarchical Agglomerative Clustering</option> 99 <option value="AgglomerativeClustering">Hierarchical Agglomerative Clustering</option>
91 <option value="AffinityPropagation">Affinity Propagation</option> 100 <option value="AffinityPropagation">Affinity Propagation</option>
166 <tests> 175 <tests>
167 <test> 176 <test>
168 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> 177 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
169 <param name="selected_input_type" value="tabular"/> 178 <param name="selected_input_type" value="tabular"/>
170 <param name="selected_algorithm" value="KMeans"/> 179 <param name="selected_algorithm" value="KMeans"/>
171 <param name="start_column" value="2" /> 180 <param name="col" value="2,3,4" />
172 <param name="end_column" value="4" />
173 <param name="n_clusters" value="4" /> 181 <param name="n_clusters" value="4" />
174 <param name="init" value="k-means++" /> 182 <param name="init" value="k-means++" />
175 <param name="random_state" value="100"/> 183 <param name="random_state" value="100"/>
176 <output name="outfile" file="cluster_result01.txt"/> 184 <output name="outfile" file="cluster_result01.txt"/>
177 </test> 185 </test>
178 <test> 186 <test>
179 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> 187 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
180 <param name="selected_algorithm" value="KMeans"/> 188 <param name="selected_algorithm" value="KMeans"/>
181 <param name="selected_input_type" value="tabular"/> 189 <param name="selected_input_type" value="tabular"/>
182 <param name="start_column" value="2" /> 190 <param name="col" value="2,3,4" />
183 <param name="end_column" value="4" />
184 <param name="n_clusters" value="4" /> 191 <param name="n_clusters" value="4" />
185 <param name="init" value="random" /> 192 <param name="init" value="random" />
186 <param name="random_state" value="100"/> 193 <param name="random_state" value="100"/>
187 <output name="outfile" file="cluster_result02.txt"/> 194 <output name="outfile" file="cluster_result02.txt"/>
188 </test> 195 </test>
189 <test> 196 <test>
190 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> 197 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
191 <param name="selected_algorithm" value="DBSCAN"/> 198 <param name="selected_algorithm" value="DBSCAN"/>
192 <param name="selected_input_type" value="tabular"/> 199 <param name="selected_input_type" value="tabular"/>
193 <param name="start_column" value="2" /> 200 <param name="col" value="2,3,4" />
194 <param name="end_column" value="4" />
195 <param name="algorithm" value="kd_tree"/> 201 <param name="algorithm" value="kd_tree"/>
196 <param name="leaf_size" value="10"/> 202 <param name="leaf_size" value="10"/>
197 <param name="eps" value="1.0"/> 203 <param name="eps" value="1.0"/>
198 <output name="outfile" file="cluster_result03.txt"/> 204 <output name="outfile" file="cluster_result03.txt"/>
199 </test> 205 </test>
200 <test> 206 <test>
201 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> 207 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
202 <param name="selected_algorithm" value="Birch"/> 208 <param name="selected_algorithm" value="Birch"/>
203 <param name="selected_input_type" value="tabular"/> 209 <param name="selected_input_type" value="tabular"/>
204 <param name="start_column" value="2" /> 210 <param name="col" value="2,3,4" />
205 <param name="end_column" value="4" />
206 <param name="n_clusters" value="4"/> 211 <param name="n_clusters" value="4"/>
207 <param name="threshold" value="0.008"/> 212 <param name="threshold" value="0.008"/>
208 <output name="outfile" file="cluster_result04.txt"/> 213 <output name="outfile" file="cluster_result04.txt"/>
209 </test> 214 </test>
210 <test> 215 <test>
211 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> 216 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
212 <param name="selected_algorithm" value="Birch"/> 217 <param name="selected_algorithm" value="Birch"/>
213 <param name="selected_input_type" value="tabular"/> 218 <param name="selected_input_type" value="tabular"/>
214 <param name="start_column" value="2" /> 219 <param name="col" value="2,3,4" />
215 <param name="end_column" value="4" />
216 <param name="branching_factor" value="20"/> 220 <param name="branching_factor" value="20"/>
217 <output name="outfile" file="cluster_result05.txt"/> 221 <output name="outfile" file="cluster_result05.txt"/>
218 </test> 222 </test>
219 <test> 223 <test>
220 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> 224 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
221 <param name="selected_algorithm" value="AffinityPropagation"/> 225 <param name="selected_algorithm" value="AffinityPropagation"/>
222 <param name="selected_input_type" value="tabular"/> 226 <param name="selected_input_type" value="tabular"/>
223 <param name="start_column" value="2" /> 227 <param name="col" value="2,3,4" />
224 <param name="end_column" value="4" />
225 <param name="affinity" value="euclidean"/> 228 <param name="affinity" value="euclidean"/>
226 <param name="copy" value="false"/> 229 <param name="copy" value="false"/>
227 <output name="outfile" file="cluster_result06.txt"/> 230 <output name="outfile" file="cluster_result06.txt"/>
228 </test> 231 </test>
229 <test> 232 <test>
230 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> 233 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
231 <param name="selected_algorithm" value="AffinityPropagation"/> 234 <param name="selected_algorithm" value="AffinityPropagation"/>
232 <param name="selected_input_type" value="tabular"/> 235 <param name="selected_input_type" value="tabular"/>
233 <param name="start_column" value="2" /> 236 <param name="col" value="2,3,4" />
234 <param name="end_column" value="4" />
235 <param name="damping" value="0.8"/> 237 <param name="damping" value="0.8"/>
236 <output name="outfile" file="cluster_result07.txt"/> 238 <output name="outfile" file="cluster_result07.txt"/>
237 </test> 239 </test>
238 <test> 240 <test>
239 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> 241 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
240 <param name="selected_algorithm" value="MeanShift"/> 242 <param name="selected_algorithm" value="MeanShift"/>
241 <param name="selected_input_type" value="tabular"/> 243 <param name="selected_input_type" value="tabular"/>
242 <param name="start_column" value="2" /> 244 <param name="col" value="2,3,4" />
243 <param name="end_column" value="4" />
244 <param name="min_bin_freq" value="3"/> 245 <param name="min_bin_freq" value="3"/>
245 <output name="outfile" file="cluster_result08.txt"/> 246 <output name="outfile" file="cluster_result08.txt"/>
246 </test> 247 </test>
247 <test> 248 <test>
248 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> 249 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
249 <param name="selected_algorithm" value="MeanShift"/> 250 <param name="selected_algorithm" value="MeanShift"/>
250 <param name="selected_input_type" value="tabular"/> 251 <param name="selected_input_type" value="tabular"/>
251 <param name="start_column" value="2" /> 252 <param name="col" value="2,3,4" />
252 <param name="end_column" value="4" />
253 <param name="cluster_all" value="False"/> 253 <param name="cluster_all" value="False"/>
254 <output name="outfile" file="cluster_result09.txt"/> 254 <output name="outfile" file="cluster_result09.txt"/>
255 </test> 255 </test>
256 <test> 256 <test>
257 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> 257 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
258 <param name="selected_algorithm" value="AgglomerativeClustering"/> 258 <param name="selected_algorithm" value="AgglomerativeClustering"/>
259 <param name="selected_input_type" value="tabular"/> 259 <param name="selected_input_type" value="tabular"/>
260 <param name="start_column" value="2" /> 260 <param name="col" value="2,3,4" />
261 <param name="end_column" value="4" />
262 <param name="affinity" value="euclidean"/> 261 <param name="affinity" value="euclidean"/>
263 <param name="linkage" value="average"/> 262 <param name="linkage" value="average"/>
264 <param name="n_clusters" value="4"/> 263 <param name="n_clusters" value="4"/>
265 <output name="outfile" file="cluster_result10.txt"/> 264 <output name="outfile" file="cluster_result10.txt"/>
266 </test> 265 </test>
267 <test> 266 <test>
268 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> 267 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
269 <param name="selected_algorithm" value="AgglomerativeClustering"/> 268 <param name="selected_algorithm" value="AgglomerativeClustering"/>
270 <param name="selected_input_type" value="tabular"/> 269 <param name="selected_input_type" value="tabular"/>
271 <param name="start_column" value="2" /> 270 <param name="col" value="2,3,4" />
272 <param name="end_column" value="4" />
273 <param name="linkage" value="complete"/> 271 <param name="linkage" value="complete"/>
274 <param name="n_clusters" value="4"/> 272 <param name="n_clusters" value="4"/>
275 <output name="outfile" file="cluster_result11.txt"/> 273 <output name="outfile" file="cluster_result11.txt"/>
276 </test> 274 </test>
277 <test> 275 <test>
278 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> 276 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
279 <param name="selected_algorithm" value="SpectralClustering"/> 277 <param name="selected_algorithm" value="SpectralClustering"/>
280 <param name="selected_input_type" value="tabular"/> 278 <param name="selected_input_type" value="tabular"/>
281 <param name="start_column" value="2" /> 279 <param name="col" value="2,3,4" />
282 <param name="end_column" value="4" />
283 <param name="eigen_solver" value="arpack"/> 280 <param name="eigen_solver" value="arpack"/>
284 <param name="n_neighbors" value="12"/> 281 <param name="n_neighbors" value="12"/>
285 <param name="n_clusters" value="4"/> 282 <param name="n_clusters" value="4"/>
286 <param name="assign_labels" value="discretize"/> 283 <param name="assign_labels" value="discretize"/>
287 <param name="random_state" value="100"/> 284 <param name="random_state" value="100"/>
289 </test> 286 </test>
290 <test> 287 <test>
291 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> 288 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
292 <param name="selected_algorithm" value="SpectralClustering"/> 289 <param name="selected_algorithm" value="SpectralClustering"/>
293 <param name="selected_input_type" value="tabular"/> 290 <param name="selected_input_type" value="tabular"/>
294 <param name="start_column" value="2" /> 291 <param name="col" value="2,3,4" />
295 <param name="end_column" value="4" />
296 <param name="assign_labels" value="discretize"/> 292 <param name="assign_labels" value="discretize"/>
297 <param name="random_state" value="100"/> 293 <param name="random_state" value="100"/>
298 <param name="degree" value="2"/> 294 <param name="degree" value="2"/>
299 <output name="outfile" file="cluster_result13.txt" compare="sim_size" /> 295 <output name="outfile" file="cluster_result13.txt" compare="sim_size" />
300 </test> 296 </test>
301 <test> 297 <test>
302 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> 298 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
303 <param name="selected_algorithm" value="MiniBatchKMeans"/> 299 <param name="selected_algorithm" value="MiniBatchKMeans"/>
304 <param name="selected_input_type" value="tabular"/> 300 <param name="selected_input_type" value="tabular"/>
305 <param name="start_column" value="2" /> 301 <param name="col" value="2,3,4" />
306 <param name="end_column" value="4" />
307 <param name="tol" value="0.5"/> 302 <param name="tol" value="0.5"/>
308 <param name="random_state" value="100"/> 303 <param name="random_state" value="100"/>
309 <output name="outfile" file="cluster_result14.txt"/> 304 <output name="outfile" file="cluster_result14.txt"/>
310 </test> 305 </test>
311 <test> 306 <test>
312 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> 307 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
313 <param name="selected_algorithm" value="MiniBatchKMeans"/> 308 <param name="selected_algorithm" value="MiniBatchKMeans"/>
314 <param name="selected_input_type" value="tabular"/> 309 <param name="selected_input_type" value="tabular"/>
315 <param name="n_init" value="5"/> 310 <param name="n_init" value="5"/>
316 <param name="start_column" value="2" /> 311 <param name="col" value="2,3,4" />
317 <param name="end_column" value="4" />
318 <param name="batch_size" value="10"/> 312 <param name="batch_size" value="10"/>
319 <param name="n_clusters" value="4"/> 313 <param name="n_clusters" value="4"/>
320 <param name="random_state" value="100"/> 314 <param name="random_state" value="100"/>
321 <param name="reassignment_ratio" value="1.0"/> 315 <param name="reassignment_ratio" value="1.0"/>
322 <output name="outfile" file="cluster_result15.txt"/> 316 <output name="outfile" file="cluster_result15.txt"/>
323 </test> 317 </test>
324 <test> 318 <test>
325 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> 319 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
326 <param name="selected_algorithm" value="KMeans"/> 320 <param name="selected_algorithm" value="KMeans"/>
327 <param name="selected_input_type" value="tabular"/> 321 <param name="selected_input_type" value="tabular"/>
328 <param name="start_column" value="1" /> 322 <param name="col" value="1" />
329 <param name="end_column" value="1" />
330 <param name="n_clusters" value="4" /> 323 <param name="n_clusters" value="4" />
331 <param name="random_state" value="100"/> 324 <param name="random_state" value="100"/>
332 <output name="outfile" file="cluster_result16.txt"/> 325 <output name="outfile" file="cluster_result16.txt"/>
333 </test> 326 </test>
334 <test> 327 <test>