Mercurial > repos > bgruening > flexynesis_utils
comparison flexynesis_utils.xml @ 2:e5ecfffcfe45 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/flexynesis commit 1afbaf45449e25238935e222f983da62392c067a
| author | bgruening |
|---|---|
| date | Fri, 04 Jul 2025 14:57:15 +0000 |
| parents | 433a5f3f68a1 |
| children | f413f828ef30 |
comparison
equal
deleted
inserted
replaced
| 1:8fa64b6a9544 | 2:e5ecfffcfe45 |
|---|---|
| 18 ln -s '$utils_conditional.labels' 'inputs/$utils_conditional.labels.element_identifier.$utils_conditional.labels.ext' && | 18 ln -s '$utils_conditional.labels' 'inputs/$utils_conditional.labels.element_identifier.$utils_conditional.labels.ext' && |
| 19 cat '$flexynesis_utils_config' && | 19 cat '$flexynesis_utils_config' && |
| 20 python '$flexynesis_utils_config' | 20 python '$flexynesis_utils_config' |
| 21 #end if | 21 #end if |
| 22 #if $utils_conditional.util == "split_data": | 22 #if $utils_conditional.util == "split_data": |
| 23 ln -s '$utils_conditional.clin' inputs/clin.csv && | 23 ln -s '$utils_conditional.clin' inputs/clin.tabular && |
| 24 #set $omics_names = [] | 24 #set $omics_names = [] |
| 25 #for $omics_file in $utils_conditional.omics: | 25 #for $omics_file in $utils_conditional.omics: |
| 26 ln -s '$omics_file' 'inputs/${omics_file.element_identifier}.${omics_file.ext}' && | 26 ln -s '$omics_file' 'inputs/${omics_file.element_identifier}.${omics_file.ext}' && |
| 27 #silent $omics_names.append('inputs/' + str($omics_file.element_identifier) + '.' + str($omics_file.ext)) | 27 #silent $omics_names.append('inputs/' + str($omics_file.element_identifier) + '.' + str($omics_file.ext)) |
| 28 #end for | 28 #end for |
| 29 | 29 |
| 30 python '$__tool_directory__/flexynesis_utils.py' | 30 python '$__tool_directory__/flexynesis_utils.py' |
| 31 --util split | 31 --util split |
| 32 --clin inputs/clin.csv | 32 --clin inputs/clin.tabular |
| 33 --omics '$(",".join($omics_names))' | 33 --omics '$(",".join($omics_names))' |
| 34 --split $utils_conditional.split | 34 --split $utils_conditional.split |
| 35 --out output | 35 --out output |
| 36 #end if | 36 #end if |
| 37 #if $utils_conditional.util == "binarize": | 37 #if $utils_conditional.util == "binarize": |
| 75 #end if | 75 #end if |
| 76 X=X) | 76 X=X) |
| 77 cluster_df = pd.DataFrame(data=cluster_labels, index=X.index, columns=['louvain_cluster']) | 77 cluster_df = pd.DataFrame(data=cluster_labels, index=X.index, columns=['louvain_cluster']) |
| 78 label_data = label_data.merge(cluster_df[['louvain_cluster']], left_index=True, right_index=True, how='left') | 78 label_data = label_data.merge(cluster_df[['louvain_cluster']], left_index=True, right_index=True, how='left') |
| 79 | 79 |
| 80 output_path = f"output/clustered_labels.csv" | 80 output_path = f"output/clustered_labels.tabular" |
| 81 label_data.to_csv(output_path, index=True) | 81 label_data.to_csv(output_path, sep="\t", index=True) |
| 82 | 82 |
| 83 #else if $utils_conditional.util == "get_optimal_clusters": | 83 #else if $utils_conditional.util == "get_optimal_clusters": |
| 84 label_data = load_omics('inputs/$utils_conditional.labels.element_identifier.$utils_conditional.labels.ext') | 84 label_data = load_omics('inputs/$utils_conditional.labels.element_identifier.$utils_conditional.labels.ext') |
| 85 X = load_omics('inputs/$utils_conditional.X.element_identifier.$utils_conditional.X.ext') | 85 X = load_omics('inputs/$utils_conditional.X.element_identifier.$utils_conditional.X.ext') |
| 86 | 86 |
| 93 print(f"Silhouette scores: \n{silhouette_scores}") | 93 print(f"Silhouette scores: \n{silhouette_scores}") |
| 94 | 94 |
| 95 cluster_df = pd.DataFrame(data=kmeans_cluster_labels, index=X.index, columns=['optimal_kmeans_cluster']) | 95 cluster_df = pd.DataFrame(data=kmeans_cluster_labels, index=X.index, columns=['optimal_kmeans_cluster']) |
| 96 label_data = label_data.merge(cluster_df[['optimal_kmeans_cluster']], left_index=True, right_index=True, how='left') | 96 label_data = label_data.merge(cluster_df[['optimal_kmeans_cluster']], left_index=True, right_index=True, how='left') |
| 97 | 97 |
| 98 output_path = f"output/optimal_clusters_labels.csv" | 98 output_path = f"output/optimal_clusters_labels.tabular" |
| 99 label_data.to_csv(output_path, index=True) | 99 label_data.to_csv(output_path, sep="\t", index=True) |
| 100 | 100 |
| 101 #else if $utils_conditional.util == "k_means_clustering": | 101 #else if $utils_conditional.util == "k_means_clustering": |
| 102 label_data = load_omics('inputs/$utils_conditional.labels.element_identifier.$utils_conditional.labels.ext') | 102 label_data = load_omics('inputs/$utils_conditional.labels.element_identifier.$utils_conditional.labels.ext') |
| 103 X = load_omics('inputs/$utils_conditional.X.element_identifier.$utils_conditional.X.ext') | 103 X = load_omics('inputs/$utils_conditional.X.element_identifier.$utils_conditional.X.ext') |
| 104 | 104 |
| 108 | 108 |
| 109 print(f"{kmeans}") | 109 print(f"{kmeans}") |
| 110 cluster_df = pd.DataFrame(data=cluster_labels, index=X.index, columns=['kmeans_cluster']) | 110 cluster_df = pd.DataFrame(data=cluster_labels, index=X.index, columns=['kmeans_cluster']) |
| 111 label_data = label_data.merge(cluster_df[['kmeans_cluster']], left_index=True, right_index=True, how='left') | 111 label_data = label_data.merge(cluster_df[['kmeans_cluster']], left_index=True, right_index=True, how='left') |
| 112 | 112 |
| 113 output_path = f"output/kmeans_labels.csv" | 113 output_path = f"output/kmeans_labels.tabular" |
| 114 label_data.to_csv(output_path, index=True) | 114 label_data.to_csv(output_path, sep="\t", index=True) |
| 115 | 115 |
| 116 #else if $utils_conditional.util == "compute_ami_ari": | 116 #else if $utils_conditional.util == "compute_ami_ari": |
| 117 label_data = load_omics('inputs/$utils_conditional.labels.element_identifier.$utils_conditional.labels.ext') | 117 label_data = load_omics('inputs/$utils_conditional.labels.element_identifier.$utils_conditional.labels.ext') |
| 118 | 118 |
| 119 true_label = label_data.columns[$utils_conditional.true_label-2] | 119 true_label = label_data.columns[$utils_conditional.true_label-2] |
| 145 <option value="compute_ami_ari">Compute AMI and ARI</option> | 145 <option value="compute_ami_ari">Compute AMI and ARI</option> |
| 146 <option value="split_data">Split data to train and test</option> | 146 <option value="split_data">Split data to train and test</option> |
| 147 <option value="binarize">Binarize mutation data</option> | 147 <option value="binarize">Binarize mutation data</option> |
| 148 </param> | 148 </param> |
| 149 <when value="louvain_clustering"> | 149 <when value="louvain_clustering"> |
| 150 <param argument="--X" type="data" format="tabular,csv" label="Matrix" help="Input matrix, (samples, features)"/> | 150 <param argument="--X" type="data" format="tabular" label="Matrix" help="Input matrix, (samples, features)"/> |
| 151 <expand macro="plots_common_input"/> | 151 <expand macro="plots_common_input"/> |
| 152 <param argument="--threshold" type="float" min="0" optional="true" label="Distance threshold to create an edge between two nodes"/> | 152 <param argument="--threshold" type="float" min="0" optional="true" label="Distance threshold to create an edge between two nodes"/> |
| 153 <param argument="--k" type="integer" min="0" optional="true" label="Number of nearest neighbors to connect for each node"/> | 153 <param argument="--k" type="integer" min="0" optional="true" label="Number of nearest neighbors to connect for each node"/> |
| 154 </when> | 154 </when> |
| 155 <when value="get_optimal_clusters"> | 155 <when value="get_optimal_clusters"> |
| 156 <param argument="--X" type="data" format="tabular,csv" label="Matrix" help="Input matrix, (samples, features)"/> | 156 <param argument="--X" type="data" format="tabular" label="Matrix" help="Input matrix, (samples, features)"/> |
| 157 <expand macro="plots_common_input"/> | 157 <expand macro="plots_common_input"/> |
| 158 <param argument="--min_k" type="integer" min="0" value="2" optional="false" label="Minimum number of clusters to try"/> | 158 <param argument="--min_k" type="integer" min="0" value="2" optional="false" label="Minimum number of clusters to try"/> |
| 159 <param argument="--max_k" type="integer" min="0" value="10" optional="false" label="Maximum number of clusters to try"/> | 159 <param argument="--max_k" type="integer" min="0" value="10" optional="false" label="Maximum number of clusters to try"/> |
| 160 </when> | 160 </when> |
| 161 <when value="k_means_clustering"> | 161 <when value="k_means_clustering"> |
| 162 <param argument="--X" type="data" format="tabular,csv" label="Matrix" help="Input matrix, (samples, features)"/> | 162 <param argument="--X" type="data" format="tabular" label="Matrix" help="Input matrix, (samples, features)"/> |
| 163 <expand macro="plots_common_input"/> | 163 <expand macro="plots_common_input"/> |
| 164 <param argument="--k" type="integer" min="0" optional="true" label="The number of clusters to form"/> | 164 <param argument="--k" type="integer" min="0" optional="true" label="The number of clusters to form"/> |
| 165 </when> | 165 </when> |
| 166 <when value="compute_ami_ari"> | 166 <when value="compute_ami_ari"> |
| 167 <expand macro="plots_common_input"/> | 167 <expand macro="plots_common_input"/> |
| 168 <param name="true_label" type="data_column" data_ref="labels" label="Column name in the labels file to use for the true labels"/> | 168 <param name="true_label" type="data_column" data_ref="labels" label="Column name in the labels file to use for the true labels"/> |
| 169 <param name="predicted_label" type="data_column" data_ref="labels" label="Column name in the labels file to use for the predicted labels"/> | 169 <param name="predicted_label" type="data_column" data_ref="labels" label="Column name in the labels file to use for the predicted labels"/> |
| 170 </when> | 170 </when> |
| 171 <when value="split_data"> | 171 <when value="split_data"> |
| 172 <param argument="--clin" type="data" format="csv" optional="false" label="Clinical data" help="Samples in rows"/> | 172 <param argument="--clin" type="data" format="tabular" optional="false" label="Clinical data" help="Samples in rows"/> |
| 173 <param argument="--omics" type="data" format="tabular,csv" optional="false" multiple="true" label="Omics data" help="samples in columns"/> | 173 <param argument="--omics" type="data" format="tabular" optional="false" multiple="true" label="Omics data" help="samples in columns"/> |
| 174 <param argument="--split" type="float" min="0" max="1" value="0.7" label="Training/Test split ratio" help="Proportion of data to use for training (e.g., 0.7 means 70% train, 30% test)"/> | 174 <param argument="--split" type="float" min="0" max="1" value="0.7" label="Training/Test split ratio" help="Proportion of data to use for training (e.g., 0.7 means 70% train, 30% test)"/> |
| 175 </when> | 175 </when> |
| 176 <when value="binarize"> | 176 <when value="binarize"> |
| 177 <param argument="--mutation" type="data" format="tabular,csv" label="Mutation data" help="Mutation data with both genes and samples in rows"/> | 177 <param argument="--mutation" type="data" format="tabular" label="Mutation data" help="Mutation data with both genes and samples in rows"/> |
| 178 <param argument="--gene_idx" type="data_column" data_ref="mutation" label="Column in the mutation file with genes"/> | 178 <param argument="--gene_idx" type="data_column" data_ref="mutation" label="Column in the mutation file with genes"/> |
| 179 <param argument="--sample_idx" type="data_column" data_ref="mutation" label="Column in the mutation file with samples"/> | 179 <param argument="--sample_idx" type="data_column" data_ref="mutation" label="Column in the mutation file with samples"/> |
| 180 </when> | 180 </when> |
| 181 </conditional> | 181 </conditional> |
| 182 </inputs> | 182 </inputs> |
| 183 <outputs> | 183 <outputs> |
| 184 <data name="util_out" auto_format="true" from_work_dir="output/*" label="${tool.name} on ${on_string}: ${utils_conditional.util}"> | 184 <data name="util_out" auto_format="true" from_work_dir="output/*" label="${tool.name} on ${on_string}: ${utils_conditional.util}"> |
| 185 <filter>utils_conditional['util'] != "split_data"</filter> | 185 <filter>utils_conditional['util'] != "split_data"</filter> |
| 186 </data> | 186 </data> |
| 187 <collection name="train_out" type="list" label="${tool.name} on ${on_string}: train datasets"> | 187 <collection name="train_out" type="list" label="${tool.name} on ${on_string}: train datasets"> |
| 188 <discover_datasets pattern="__name_and_ext__" format="csv" directory="output/train"/> | 188 <discover_datasets pattern="__name_and_ext__" format="tabular" directory="output/train"/> |
| 189 <filter>utils_conditional['util'] == "split_data"</filter> | 189 <filter>utils_conditional['util'] == "split_data"</filter> |
| 190 </collection> | 190 </collection> |
| 191 <collection name="test_out" type="list" label="${tool.name} on ${on_string}: test datasets"> | 191 <collection name="test_out" type="list" label="${tool.name} on ${on_string}: test datasets"> |
| 192 <discover_datasets pattern="__name_and_ext__" format="csv" directory="output/test"/> | 192 <discover_datasets pattern="__name_and_ext__" format="tabular" directory="output/test"/> |
| 193 <filter>utils_conditional['util'] == "split_data"</filter> | 193 <filter>utils_conditional['util'] == "split_data"</filter> |
| 194 </collection> | 194 </collection> |
| 195 </outputs> | 195 </outputs> |
| 196 <tests> | 196 <tests> |
| 197 <!-- test 1: Louvain clustering --> | 197 <!-- test 1: Louvain clustering --> |
| 198 <test expect_num_outputs="1"> | 198 <test expect_num_outputs="1"> |
| 199 <param name="non_commercial_use" value="True"/> | 199 <param name="non_commercial_use" value="True"/> |
| 200 <conditional name="utils_conditional"> | 200 <conditional name="utils_conditional"> |
| 201 <param name="util" value="louvain_clustering"/> | 201 <param name="util" value="louvain_clustering"/> |
| 202 <param name="X" value="embeddings.csv"/> | 202 <param name="X" value="embeddings.tabular"/> |
| 203 <param name="labels" value="labels_pr.csv"/> | 203 <param name="labels" value="labels_pr.tabular"/> |
| 204 <param name="k" value="15"/> | 204 <param name="k" value="15"/> |
| 205 </conditional> | 205 </conditional> |
| 206 <output name="util_out"> | 206 <output name="util_out"> |
| 207 <assert_contents> | 207 <assert_contents> |
| 208 <has_text text="sample_id,variable,class_label,probability,known_label,predicted_label,split,louvain_cluster"/> | 208 <has_text_matching expression="sample_id\tvariable\tclass_label\tprobability\tknown_label\tpredicted_label\tsplit\tlouvain_cluster"/> |
| 209 <has_text text="MB-4818,CLAUDIN_SUBTYPE,LumA,0.8582904,LumB,LumA,test,3.0"/> | 209 <has_text_matching expression="MB-4818\tCLAUDIN_SUBTYPE\tLumA\t0.8582904\tLumB\tLumA\ttest\t3.0"/> |
| 210 </assert_contents> | 210 </assert_contents> |
| 211 </output> | 211 </output> |
| 212 </test> | 212 </test> |
| 213 <!-- test 2: Get optimal clusters --> | 213 <!-- test 2: Get optimal clusters --> |
| 214 <test expect_num_outputs="1"> | 214 <test expect_num_outputs="1"> |
| 215 <param name="non_commercial_use" value="True"/> | 215 <param name="non_commercial_use" value="True"/> |
| 216 <conditional name="utils_conditional"> | 216 <conditional name="utils_conditional"> |
| 217 <param name="util" value="get_optimal_clusters"/> | 217 <param name="util" value="get_optimal_clusters"/> |
| 218 <param name="X" value="embeddings.csv"/> | 218 <param name="X" value="embeddings.tabular"/> |
| 219 <param name="labels" value="labels_pr.csv"/> | 219 <param name="labels" value="labels_pr.tabular"/> |
| 220 <param name="min_k" value="2"/> | 220 <param name="min_k" value="2"/> |
| 221 <param name="max_k" value="10"/> | 221 <param name="max_k" value="10"/> |
| 222 </conditional> | 222 </conditional> |
| 223 <assert_stdout> | 223 <assert_stdout> |
| 224 <has_text text="Optimal number of clusters: 2"/> | 224 <has_text text="Optimal number of clusters: 2"/> |
| 225 <has_text text="Silhouette scores: "/> | 225 <has_text text="Silhouette scores: "/> |
| 226 </assert_stdout> | 226 </assert_stdout> |
| 227 <output name="util_out"> | 227 <output name="util_out"> |
| 228 <assert_contents> | 228 <assert_contents> |
| 229 <has_text text="sample_id,variable,class_label,probability,known_label,predicted_label,split,optimal_kmeans_cluster"/> | 229 <has_text_matching expression="sample_id\tvariable\tclass_label\tprobability\tknown_label\tpredicted_label\tsplit\toptimal_kmeans_cluster"/> |
| 230 <has_text text="MB-4818,CLAUDIN_SUBTYPE,LumA,0.8582904,LumB,LumA,test,0.0"/> | 230 <has_text_matching expression="MB-4818\tCLAUDIN_SUBTYPE\tLumA\t0.8582904\tLumB\tLumA\ttest\t0.0"/> |
| 231 </assert_contents> | 231 </assert_contents> |
| 232 </output> | 232 </output> |
| 233 </test> | 233 </test> |
| 234 <!-- test 3: K-Means clustering --> | 234 <!-- test 3: K-Means clustering --> |
| 235 <test expect_num_outputs="1"> | 235 <test expect_num_outputs="1"> |
| 236 <param name="non_commercial_use" value="True"/> | 236 <param name="non_commercial_use" value="True"/> |
| 237 <conditional name="utils_conditional"> | 237 <conditional name="utils_conditional"> |
| 238 <param name="util" value="k_means_clustering"/> | 238 <param name="util" value="k_means_clustering"/> |
| 239 <param name="X" value="embeddings.csv"/> | 239 <param name="X" value="embeddings.tabular"/> |
| 240 <param name="labels" value="labels_pr.csv"/> | 240 <param name="labels" value="labels_pr.tabular"/> |
| 241 <param name="k" value="2"/> | 241 <param name="k" value="2"/> |
| 242 </conditional> | 242 </conditional> |
| 243 <assert_stdout> | 243 <assert_stdout> |
| 244 <has_text text="KMeans(n_clusters=2, random_state=42)"/> | 244 <has_text text="KMeans(n_clusters=2, random_state=42)"/> |
| 245 </assert_stdout> | 245 </assert_stdout> |
| 246 <output name="util_out"> | 246 <output name="util_out"> |
| 247 <assert_contents> | 247 <assert_contents> |
| 248 <has_text text="sample_id,variable,class_label,probability,known_label,predicted_label,split,kmeans_cluster"/> | 248 <has_text_matching expression="sample_id\tvariable\tclass_label\tprobability\tknown_label\tpredicted_label\tsplit\tkmeans_cluster"/> |
| 249 <has_text text="MB-4818,CLAUDIN_SUBTYPE,LumA,0.8582904,LumB,LumA,test,0.0"/> | 249 <has_text_matching expression="MB-4818\tCLAUDIN_SUBTYPE\tLumA\t0.8582904\tLumB\tLumA\ttest\t0.0"/> |
| 250 </assert_contents> | 250 </assert_contents> |
| 251 </output> | 251 </output> |
| 252 </test> | 252 </test> |
| 253 <!-- test 4: Compute AMI and ARI --> | 253 <!-- test 4: Compute AMI and ARI --> |
| 254 <test expect_num_outputs="1"> | 254 <test expect_num_outputs="1"> |
| 255 <param name="non_commercial_use" value="True"/> | 255 <param name="non_commercial_use" value="True"/> |
| 256 <conditional name="utils_conditional"> | 256 <conditional name="utils_conditional"> |
| 257 <param name="util" value="compute_ami_ari"/> | 257 <param name="util" value="compute_ami_ari"/> |
| 258 <param name="labels" value="labels.csv"/> | 258 <param name="labels" value="labels.tabular"/> |
| 259 <param name="true_label" value="5"/> | 259 <param name="true_label" value="5"/> |
| 260 <param name="predicted_label" value="6"/> | 260 <param name="predicted_label" value="6"/> |
| 261 </conditional> | 261 </conditional> |
| 262 <assert_stdout> | 262 <assert_stdout> |
| 263 <has_text_matching expression="AMI: 0.5108[0-9]+"/> | 263 <has_text_matching expression="AMI: 0.5108[0-9]+"/> |
| 331 </conditional> | 331 </conditional> |
| 332 <output name="util_out"> | 332 <output name="util_out"> |
| 333 <assert_contents> | 333 <assert_contents> |
| 334 <has_n_lines n="1611"/> | 334 <has_n_lines n="1611"/> |
| 335 <has_text text="Hugo_Symbol"/> | 335 <has_text text="Hugo_Symbol"/> |
| 336 <has_text text="AADACL2,0.0,0.0"/> | 336 <has_text_matching expression="AADACL2\t0.0\t0.0"/> |
| 337 <has_text text="ABCB1,0.0,0.0,0.0,1.0"/> | 337 <has_text_matching expression="ABCB1\t0.0\t0.0\t0.0\t1.0"/> |
| 338 </assert_contents> | 338 </assert_contents> |
| 339 </output> | 339 </output> |
| 340 </test> | 340 </test> |
| 341 </tests> | 341 </tests> |
| 342 <help><