comparison flexynesis_utils.xml @ 2:e5ecfffcfe45 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/flexynesis commit 1afbaf45449e25238935e222f983da62392c067a
author bgruening
date Fri, 04 Jul 2025 14:57:15 +0000
parents 433a5f3f68a1
children f413f828ef30
comparison
equal deleted inserted replaced
1:8fa64b6a9544 2:e5ecfffcfe45
18 ln -s '$utils_conditional.labels' 'inputs/$utils_conditional.labels.element_identifier.$utils_conditional.labels.ext' && 18 ln -s '$utils_conditional.labels' 'inputs/$utils_conditional.labels.element_identifier.$utils_conditional.labels.ext' &&
19 cat '$flexynesis_utils_config' && 19 cat '$flexynesis_utils_config' &&
20 python '$flexynesis_utils_config' 20 python '$flexynesis_utils_config'
21 #end if 21 #end if
22 #if $utils_conditional.util == "split_data": 22 #if $utils_conditional.util == "split_data":
23 ln -s '$utils_conditional.clin' inputs/clin.csv && 23 ln -s '$utils_conditional.clin' inputs/clin.tabular &&
24 #set $omics_names = [] 24 #set $omics_names = []
25 #for $omics_file in $utils_conditional.omics: 25 #for $omics_file in $utils_conditional.omics:
26 ln -s '$omics_file' 'inputs/${omics_file.element_identifier}.${omics_file.ext}' && 26 ln -s '$omics_file' 'inputs/${omics_file.element_identifier}.${omics_file.ext}' &&
27 #silent $omics_names.append('inputs/' + str($omics_file.element_identifier) + '.' + str($omics_file.ext)) 27 #silent $omics_names.append('inputs/' + str($omics_file.element_identifier) + '.' + str($omics_file.ext))
28 #end for 28 #end for
29 29
30 python '$__tool_directory__/flexynesis_utils.py' 30 python '$__tool_directory__/flexynesis_utils.py'
31 --util split 31 --util split
32 --clin inputs/clin.csv 32 --clin inputs/clin.tabular
33 --omics '$(",".join($omics_names))' 33 --omics '$(",".join($omics_names))'
34 --split $utils_conditional.split 34 --split $utils_conditional.split
35 --out output 35 --out output
36 #end if 36 #end if
37 #if $utils_conditional.util == "binarize": 37 #if $utils_conditional.util == "binarize":
75 #end if 75 #end if
76 X=X) 76 X=X)
77 cluster_df = pd.DataFrame(data=cluster_labels, index=X.index, columns=['louvain_cluster']) 77 cluster_df = pd.DataFrame(data=cluster_labels, index=X.index, columns=['louvain_cluster'])
78 label_data = label_data.merge(cluster_df[['louvain_cluster']], left_index=True, right_index=True, how='left') 78 label_data = label_data.merge(cluster_df[['louvain_cluster']], left_index=True, right_index=True, how='left')
79 79
80 output_path = f"output/clustered_labels.csv" 80 output_path = f"output/clustered_labels.tabular"
81 label_data.to_csv(output_path, index=True) 81 label_data.to_csv(output_path, sep="\t", index=True)
82 82
83 #else if $utils_conditional.util == "get_optimal_clusters": 83 #else if $utils_conditional.util == "get_optimal_clusters":
84 label_data = load_omics('inputs/$utils_conditional.labels.element_identifier.$utils_conditional.labels.ext') 84 label_data = load_omics('inputs/$utils_conditional.labels.element_identifier.$utils_conditional.labels.ext')
85 X = load_omics('inputs/$utils_conditional.X.element_identifier.$utils_conditional.X.ext') 85 X = load_omics('inputs/$utils_conditional.X.element_identifier.$utils_conditional.X.ext')
86 86
93 print(f"Silhouette scores: \n{silhouette_scores}") 93 print(f"Silhouette scores: \n{silhouette_scores}")
94 94
95 cluster_df = pd.DataFrame(data=kmeans_cluster_labels, index=X.index, columns=['optimal_kmeans_cluster']) 95 cluster_df = pd.DataFrame(data=kmeans_cluster_labels, index=X.index, columns=['optimal_kmeans_cluster'])
96 label_data = label_data.merge(cluster_df[['optimal_kmeans_cluster']], left_index=True, right_index=True, how='left') 96 label_data = label_data.merge(cluster_df[['optimal_kmeans_cluster']], left_index=True, right_index=True, how='left')
97 97
98 output_path = f"output/optimal_clusters_labels.csv" 98 output_path = f"output/optimal_clusters_labels.tabular"
99 label_data.to_csv(output_path, index=True) 99 label_data.to_csv(output_path, sep="\t", index=True)
100 100
101 #else if $utils_conditional.util == "k_means_clustering": 101 #else if $utils_conditional.util == "k_means_clustering":
102 label_data = load_omics('inputs/$utils_conditional.labels.element_identifier.$utils_conditional.labels.ext') 102 label_data = load_omics('inputs/$utils_conditional.labels.element_identifier.$utils_conditional.labels.ext')
103 X = load_omics('inputs/$utils_conditional.X.element_identifier.$utils_conditional.X.ext') 103 X = load_omics('inputs/$utils_conditional.X.element_identifier.$utils_conditional.X.ext')
104 104
108 108
109 print(f"{kmeans}") 109 print(f"{kmeans}")
110 cluster_df = pd.DataFrame(data=cluster_labels, index=X.index, columns=['kmeans_cluster']) 110 cluster_df = pd.DataFrame(data=cluster_labels, index=X.index, columns=['kmeans_cluster'])
111 label_data = label_data.merge(cluster_df[['kmeans_cluster']], left_index=True, right_index=True, how='left') 111 label_data = label_data.merge(cluster_df[['kmeans_cluster']], left_index=True, right_index=True, how='left')
112 112
113 output_path = f"output/kmeans_labels.csv" 113 output_path = f"output/kmeans_labels.tabular"
114 label_data.to_csv(output_path, index=True) 114 label_data.to_csv(output_path, sep="\t", index=True)
115 115
116 #else if $utils_conditional.util == "compute_ami_ari": 116 #else if $utils_conditional.util == "compute_ami_ari":
117 label_data = load_omics('inputs/$utils_conditional.labels.element_identifier.$utils_conditional.labels.ext') 117 label_data = load_omics('inputs/$utils_conditional.labels.element_identifier.$utils_conditional.labels.ext')
118 118
119 true_label = label_data.columns[$utils_conditional.true_label-2] 119 true_label = label_data.columns[$utils_conditional.true_label-2]
145 <option value="compute_ami_ari">Compute AMI and ARI</option> 145 <option value="compute_ami_ari">Compute AMI and ARI</option>
146 <option value="split_data">Split data to train and test</option> 146 <option value="split_data">Split data to train and test</option>
147 <option value="binarize">Binarize mutation data</option> 147 <option value="binarize">Binarize mutation data</option>
148 </param> 148 </param>
149 <when value="louvain_clustering"> 149 <when value="louvain_clustering">
150 <param argument="--X" type="data" format="tabular,csv" label="Matrix" help="Input matrix, (samples, features)"/> 150 <param argument="--X" type="data" format="tabular" label="Matrix" help="Input matrix, (samples, features)"/>
151 <expand macro="plots_common_input"/> 151 <expand macro="plots_common_input"/>
152 <param argument="--threshold" type="float" min="0" optional="true" label="Distance threshold to create an edge between two nodes"/> 152 <param argument="--threshold" type="float" min="0" optional="true" label="Distance threshold to create an edge between two nodes"/>
153 <param argument="--k" type="integer" min="0" optional="true" label="Number of nearest neighbors to connect for each node"/> 153 <param argument="--k" type="integer" min="0" optional="true" label="Number of nearest neighbors to connect for each node"/>
154 </when> 154 </when>
155 <when value="get_optimal_clusters"> 155 <when value="get_optimal_clusters">
156 <param argument="--X" type="data" format="tabular,csv" label="Matrix" help="Input matrix, (samples, features)"/> 156 <param argument="--X" type="data" format="tabular" label="Matrix" help="Input matrix, (samples, features)"/>
157 <expand macro="plots_common_input"/> 157 <expand macro="plots_common_input"/>
158 <param argument="--min_k" type="integer" min="0" value="2" optional="false" label="Minimum number of clusters to try"/> 158 <param argument="--min_k" type="integer" min="0" value="2" optional="false" label="Minimum number of clusters to try"/>
159 <param argument="--max_k" type="integer" min="0" value="10" optional="false" label="Maximum number of clusters to try"/> 159 <param argument="--max_k" type="integer" min="0" value="10" optional="false" label="Maximum number of clusters to try"/>
160 </when> 160 </when>
161 <when value="k_means_clustering"> 161 <when value="k_means_clustering">
162 <param argument="--X" type="data" format="tabular,csv" label="Matrix" help="Input matrix, (samples, features)"/> 162 <param argument="--X" type="data" format="tabular" label="Matrix" help="Input matrix, (samples, features)"/>
163 <expand macro="plots_common_input"/> 163 <expand macro="plots_common_input"/>
164 <param argument="--k" type="integer" min="0" optional="true" label="The number of clusters to form"/> 164 <param argument="--k" type="integer" min="0" optional="true" label="The number of clusters to form"/>
165 </when> 165 </when>
166 <when value="compute_ami_ari"> 166 <when value="compute_ami_ari">
167 <expand macro="plots_common_input"/> 167 <expand macro="plots_common_input"/>
168 <param name="true_label" type="data_column" data_ref="labels" label="Column name in the labels file to use for the true labels"/> 168 <param name="true_label" type="data_column" data_ref="labels" label="Column name in the labels file to use for the true labels"/>
169 <param name="predicted_label" type="data_column" data_ref="labels" label="Column name in the labels file to use for the predicted labels"/> 169 <param name="predicted_label" type="data_column" data_ref="labels" label="Column name in the labels file to use for the predicted labels"/>
170 </when> 170 </when>
171 <when value="split_data"> 171 <when value="split_data">
172 <param argument="--clin" type="data" format="csv" optional="false" label="Clinical data" help="Samples in rows"/> 172 <param argument="--clin" type="data" format="tabular" optional="false" label="Clinical data" help="Samples in rows"/>
173 <param argument="--omics" type="data" format="tabular,csv" optional="false" multiple="true" label="Omics data" help="samples in columns"/> 173 <param argument="--omics" type="data" format="tabular" optional="false" multiple="true" label="Omics data" help="samples in columns"/>
174 <param argument="--split" type="float" min="0" max="1" value="0.7" label="Training/Test split ratio" help="Proportion of data to use for training (e.g., 0.7 means 70% train, 30% test)"/> 174 <param argument="--split" type="float" min="0" max="1" value="0.7" label="Training/Test split ratio" help="Proportion of data to use for training (e.g., 0.7 means 70% train, 30% test)"/>
175 </when> 175 </when>
176 <when value="binarize"> 176 <when value="binarize">
177 <param argument="--mutation" type="data" format="tabular,csv" label="Mutation data" help="Mutation data with both genes and samples in rows"/> 177 <param argument="--mutation" type="data" format="tabular" label="Mutation data" help="Mutation data with both genes and samples in rows"/>
178 <param argument="--gene_idx" type="data_column" data_ref="mutation" label="Column in the mutation file with genes"/> 178 <param argument="--gene_idx" type="data_column" data_ref="mutation" label="Column in the mutation file with genes"/>
179 <param argument="--sample_idx" type="data_column" data_ref="mutation" label="Column in the mutation file with samples"/> 179 <param argument="--sample_idx" type="data_column" data_ref="mutation" label="Column in the mutation file with samples"/>
180 </when> 180 </when>
181 </conditional> 181 </conditional>
182 </inputs> 182 </inputs>
183 <outputs> 183 <outputs>
184 <data name="util_out" auto_format="true" from_work_dir="output/*" label="${tool.name} on ${on_string}: ${utils_conditional.util}"> 184 <data name="util_out" auto_format="true" from_work_dir="output/*" label="${tool.name} on ${on_string}: ${utils_conditional.util}">
185 <filter>utils_conditional['util'] != "split_data"</filter> 185 <filter>utils_conditional['util'] != "split_data"</filter>
186 </data> 186 </data>
187 <collection name="train_out" type="list" label="${tool.name} on ${on_string}: train datasets"> 187 <collection name="train_out" type="list" label="${tool.name} on ${on_string}: train datasets">
188 <discover_datasets pattern="__name_and_ext__" format="csv" directory="output/train"/> 188 <discover_datasets pattern="__name_and_ext__" format="tabular" directory="output/train"/>
189 <filter>utils_conditional['util'] == "split_data"</filter> 189 <filter>utils_conditional['util'] == "split_data"</filter>
190 </collection> 190 </collection>
191 <collection name="test_out" type="list" label="${tool.name} on ${on_string}: test datasets"> 191 <collection name="test_out" type="list" label="${tool.name} on ${on_string}: test datasets">
192 <discover_datasets pattern="__name_and_ext__" format="csv" directory="output/test"/> 192 <discover_datasets pattern="__name_and_ext__" format="tabular" directory="output/test"/>
193 <filter>utils_conditional['util'] == "split_data"</filter> 193 <filter>utils_conditional['util'] == "split_data"</filter>
194 </collection> 194 </collection>
195 </outputs> 195 </outputs>
196 <tests> 196 <tests>
197 <!-- test 1: Louvain clustering --> 197 <!-- test 1: Louvain clustering -->
198 <test expect_num_outputs="1"> 198 <test expect_num_outputs="1">
199 <param name="non_commercial_use" value="True"/> 199 <param name="non_commercial_use" value="True"/>
200 <conditional name="utils_conditional"> 200 <conditional name="utils_conditional">
201 <param name="util" value="louvain_clustering"/> 201 <param name="util" value="louvain_clustering"/>
202 <param name="X" value="embeddings.csv"/> 202 <param name="X" value="embeddings.tabular"/>
203 <param name="labels" value="labels_pr.csv"/> 203 <param name="labels" value="labels_pr.tabular"/>
204 <param name="k" value="15"/> 204 <param name="k" value="15"/>
205 </conditional> 205 </conditional>
206 <output name="util_out"> 206 <output name="util_out">
207 <assert_contents> 207 <assert_contents>
208 <has_text text="sample_id,variable,class_label,probability,known_label,predicted_label,split,louvain_cluster"/> 208 <has_text_matching expression="sample_id\tvariable\tclass_label\tprobability\tknown_label\tpredicted_label\tsplit\tlouvain_cluster"/>
209 <has_text text="MB-4818,CLAUDIN_SUBTYPE,LumA,0.8582904,LumB,LumA,test,3.0"/> 209 <has_text_matching expression="MB-4818\tCLAUDIN_SUBTYPE\tLumA\t0.8582904\tLumB\tLumA\ttest\t3.0"/>
210 </assert_contents> 210 </assert_contents>
211 </output> 211 </output>
212 </test> 212 </test>
213 <!-- test 2: Get optimal clusters --> 213 <!-- test 2: Get optimal clusters -->
214 <test expect_num_outputs="1"> 214 <test expect_num_outputs="1">
215 <param name="non_commercial_use" value="True"/> 215 <param name="non_commercial_use" value="True"/>
216 <conditional name="utils_conditional"> 216 <conditional name="utils_conditional">
217 <param name="util" value="get_optimal_clusters"/> 217 <param name="util" value="get_optimal_clusters"/>
218 <param name="X" value="embeddings.csv"/> 218 <param name="X" value="embeddings.tabular"/>
219 <param name="labels" value="labels_pr.csv"/> 219 <param name="labels" value="labels_pr.tabular"/>
220 <param name="min_k" value="2"/> 220 <param name="min_k" value="2"/>
221 <param name="max_k" value="10"/> 221 <param name="max_k" value="10"/>
222 </conditional> 222 </conditional>
223 <assert_stdout> 223 <assert_stdout>
224 <has_text text="Optimal number of clusters: 2"/> 224 <has_text text="Optimal number of clusters: 2"/>
225 <has_text text="Silhouette scores: "/> 225 <has_text text="Silhouette scores: "/>
226 </assert_stdout> 226 </assert_stdout>
227 <output name="util_out"> 227 <output name="util_out">
228 <assert_contents> 228 <assert_contents>
229 <has_text text="sample_id,variable,class_label,probability,known_label,predicted_label,split,optimal_kmeans_cluster"/> 229 <has_text_matching expression="sample_id\tvariable\tclass_label\tprobability\tknown_label\tpredicted_label\tsplit\toptimal_kmeans_cluster"/>
230 <has_text text="MB-4818,CLAUDIN_SUBTYPE,LumA,0.8582904,LumB,LumA,test,0.0"/> 230 <has_text_matching expression="MB-4818\tCLAUDIN_SUBTYPE\tLumA\t0.8582904\tLumB\tLumA\ttest\t0.0"/>
231 </assert_contents> 231 </assert_contents>
232 </output> 232 </output>
233 </test> 233 </test>
234 <!-- test 3: K-Means clustering --> 234 <!-- test 3: K-Means clustering -->
235 <test expect_num_outputs="1"> 235 <test expect_num_outputs="1">
236 <param name="non_commercial_use" value="True"/> 236 <param name="non_commercial_use" value="True"/>
237 <conditional name="utils_conditional"> 237 <conditional name="utils_conditional">
238 <param name="util" value="k_means_clustering"/> 238 <param name="util" value="k_means_clustering"/>
239 <param name="X" value="embeddings.csv"/> 239 <param name="X" value="embeddings.tabular"/>
240 <param name="labels" value="labels_pr.csv"/> 240 <param name="labels" value="labels_pr.tabular"/>
241 <param name="k" value="2"/> 241 <param name="k" value="2"/>
242 </conditional> 242 </conditional>
243 <assert_stdout> 243 <assert_stdout>
244 <has_text text="KMeans(n_clusters=2, random_state=42)"/> 244 <has_text text="KMeans(n_clusters=2, random_state=42)"/>
245 </assert_stdout> 245 </assert_stdout>
246 <output name="util_out"> 246 <output name="util_out">
247 <assert_contents> 247 <assert_contents>
248 <has_text text="sample_id,variable,class_label,probability,known_label,predicted_label,split,kmeans_cluster"/> 248 <has_text_matching expression="sample_id\tvariable\tclass_label\tprobability\tknown_label\tpredicted_label\tsplit\tkmeans_cluster"/>
249 <has_text text="MB-4818,CLAUDIN_SUBTYPE,LumA,0.8582904,LumB,LumA,test,0.0"/> 249 <has_text_matching expression="MB-4818\tCLAUDIN_SUBTYPE\tLumA\t0.8582904\tLumB\tLumA\ttest\t0.0"/>
250 </assert_contents> 250 </assert_contents>
251 </output> 251 </output>
252 </test> 252 </test>
253 <!-- test 4: Compute AMI and ARI --> 253 <!-- test 4: Compute AMI and ARI -->
254 <test expect_num_outputs="1"> 254 <test expect_num_outputs="1">
255 <param name="non_commercial_use" value="True"/> 255 <param name="non_commercial_use" value="True"/>
256 <conditional name="utils_conditional"> 256 <conditional name="utils_conditional">
257 <param name="util" value="compute_ami_ari"/> 257 <param name="util" value="compute_ami_ari"/>
258 <param name="labels" value="labels.csv"/> 258 <param name="labels" value="labels.tabular"/>
259 <param name="true_label" value="5"/> 259 <param name="true_label" value="5"/>
260 <param name="predicted_label" value="6"/> 260 <param name="predicted_label" value="6"/>
261 </conditional> 261 </conditional>
262 <assert_stdout> 262 <assert_stdout>
263 <has_text_matching expression="AMI: 0.5108[0-9]+"/> 263 <has_text_matching expression="AMI: 0.5108[0-9]+"/>
331 </conditional> 331 </conditional>
332 <output name="util_out"> 332 <output name="util_out">
333 <assert_contents> 333 <assert_contents>
334 <has_n_lines n="1611"/> 334 <has_n_lines n="1611"/>
335 <has_text text="Hugo_Symbol"/> 335 <has_text text="Hugo_Symbol"/>
336 <has_text text="AADACL2,0.0,0.0"/> 336 <has_text_matching expression="AADACL2\t0.0\t0.0"/>
337 <has_text text="ABCB1,0.0,0.0,0.0,1.0"/> 337 <has_text_matching expression="ABCB1\t0.0\t0.0\t0.0\t1.0"/>
338 </assert_contents> 338 </assert_contents>
339 </output> 339 </output>
340 </test> 340 </test>
341 </tests> 341 </tests>
342 <help><![CDATA[ 342 <help><![CDATA[