comparison cleanlab_issue_handler.xml @ 0:ecc18228c32e draft default tip

planemo upload for repository https://github.com/cleanlab/cleanlab commit ac4753a61ee908bc2a5953b6c6d38d2bbbacc6c0
author bgruening
date Wed, 28 May 2025 11:30:39 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:ecc18228c32e
1 <tool id="cleanlab_issue_handler" name="Cleanlab Issue Handler" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="23.0">
2 <description>Detect and optionally clean data issues using Cleanlab</description>
3 <macros>
4 <token name="@TOOL_VERSION@">2.7.1</token>
5 <token name="@VERSION_SUFFIX@">1.0</token>
6 </macros>
7
8 <requirements> <!-- cleanlab is pinned through the @TOOL_VERSION@ token so the tool version and the package version cannot drift apart -->
9 <requirement type="package" version="@TOOL_VERSION@">cleanlab</requirement>
10 <requirement type="package" version="3.6.0">datasets</requirement>
11 <requirement type="package" version="3.0.0">xgboost</requirement>
12 </requirements>
13
14 <command detect_errors="exit_code"><![CDATA[
15 python '${__tool_directory__}/cleanlab_issue_handler.py'
16 --input_file '$input_file' '$input_file.ext'
17 --target_column '$target_column'
18 --task '$task_block.task'
19 --method '$task_block.method'
20 #if $summary_only:
21 --summary
22 #end if
23 #if str($task_block.task) == "classification":
24 $task_block.label_issues
25 $task_block.outliers
26 $task_block.near_duplicates
27 $task_block.non_iid
28 #elif str($task_block.task) == "regression":
29 --quality-threshold '$task_block.quality_threshold'
30 #end if
31 ]]></command> <!-- Every substituted value is single-quoted for consistency; the classification toggles expand to an empty string or a single no-* flag, so they are intentionally left unquoted. -->
32
33 <inputs>
34 <param name="input_file" type="data" format="csv,tsv,tabular" label="Input data file"/> <!-- the command passes both the dataset path and its datatype extension to the script -->
35 <param name="target_column" type="text" optional="false" label="Target column name" value="target" help="Name of the target column in the input data file. Default is 'target'."/>
36 <param name="summary_only" type="boolean" label="Only generate summary report?" checked="false"/> <!-- when checked, the command adds the summary flag and the cleaned dataset output is filtered out -->
37
38 <conditional name="task_block"> <!-- the selected task decides which parameters are shown and which options the command emits -->
39 <param name="task" type="select" label="Task type">
40 <option value="classification">Classification</option>
41 <option value="regression">Regression</option>
42 </param>
43
44 <when value="classification">
45 <param name="method" type="select" label="Cleaning method">
46 <option value="remove">Remove problematic rows</option>
47 <option value="replace">Replace problematic labels (classification only)</option>
48 </param>
49 <param name="label_issues" type="boolean" truevalue="" falsevalue="--no-label-issues" label="Remove/Replace label issues" checked="true"/> <!-- this and the three toggles below emit nothing when checked (empty truevalue) and their falsevalue flag when unchecked -->
50 <param name="outliers" type="boolean" truevalue="" falsevalue="--no-outliers" label="Remove/Replace outlier issues" checked="true"/>
51 <param name="near_duplicates" type="boolean" truevalue="" falsevalue="--no-near-duplicates" label="Remove/Replace near-duplicate issues" checked="true"/>
52 <param name="non_iid" type="boolean" truevalue="" falsevalue="--no-non-iid" label="Remove/Replace non-IID issues" checked="true"/>
53 </when>
54
55 <when value="regression">
56 <param name="method" type="select" label="Cleaning method">
57 <option value="remove">Remove problematic rows</option>
58 <!-- No "replace" option for regression -->
59 </param>
60 <param name="quality_threshold" type="float" label="Quality threshold" value="0.2" min="0.0" max="1.0" help="Threshold for low-quality labels. Default is 0.2."/>
61 <!-- No issue type parameters shown -->
62 </when>
63 </conditional>
64 </inputs>
65
66 <outputs>
67 <data name="report_file" from_work_dir="summary.txt" format="txt" label="Issue Report"/> <!-- always declared; collected from summary.txt in the job working directory -->
68
69 <data name="output_file" from_work_dir="cleaned_data" format_source="input_file" label="cleaned_${input_file.name}"> <!-- takes its datatype from the input dataset via format_source -->
70 <filter>not summary_only</filter> <!-- output is omitted when "Only generate summary report?" is checked -->
71 </data>
72 </outputs>
73
74 <tests>
75 <!-- Test1: only summary -->
76 <test expect_num_outputs="1">
77 <param name="input_file" value="breast_cancer.csv" />
78 <param name="target_column" value="target" />
79 <param name="summary_only" value="true" />
80
81 <conditional name="task_block">
82 <param name="task" value="classification" />
83 <param name="method" value="remove" />
84 <param name="label_issues" value="true" />
85 <param name="outliers" value="true" />
86 <param name="near_duplicates" value="true" />
87 <param name="non_iid" value="true" />
88 </conditional>
89
90 <output name="report_file">
91 <assert_contents>
92 <has_text_matching expression="issue_type\s+score\s+num_issues"/>
93 <has_text_matching expression="label\s+(0(\.\d+)?|1(\.0+)?)\s+\d+"/>
94 <has_text_matching expression="outlier\s+(0(\.\d+)?|1(\.0+)?)\s+\d+"/>
95 <has_text_matching expression="non_iid\s+(0(\.\d+)?|1(\.0+)?)\s+\d+"/>
96 </assert_contents>
97 </output>
98 </test>
99
100 <!-- Test2: summary and cleaned all -->
101 <test expect_num_outputs="2">
102 <param name="input_file" value="breast_cancer.csv" />
103 <param name="target_column" value="target" />
104 <param name="summary_only" value="false" />
105
106 <conditional name="task_block">
107 <param name="task" value="classification" />
108 <param name="method" value="remove" />
109 <param name="label_issues" value="true" />
110 <param name="outliers" value="true" />
111 <param name="near_duplicates" value="true" />
112 <param name="non_iid" value="true" />
113 </conditional>
114
115 <output name="report_file">
116 <assert_contents>
117 <has_text_matching expression="issue_type\s+score\s+num_issues"/>
118 <has_text_matching expression="label\s+(0(\.\d+)?|1(\.0+)?)\s+\d+"/>
119 <has_text_matching expression="outlier\s+(0(\.\d+)?|1(\.0+)?)\s+\d+"/>
120 <has_text_matching expression="non_iid\s+(0(\.\d+)?|1(\.0+)?)\s+\d+"/>
121 </assert_contents>
122 </output>
123
124 <output name="output_file">
125 <assert_contents>
126 <has_text_matching expression=".*target.*"/>
127 <has_text_matching expression="^.*,.+,.+"/>
128 </assert_contents>
129 </output>
130 </test>
131
132 <!-- Test3: summary and cleaned label issues only -->
133 <test expect_num_outputs="2">
134 <param name="input_file" value="breast_cancer.csv" />
135 <param name="target_column" value="target" />
136 <param name="summary_only" value="false" />
137
138 <conditional name="task_block">
139 <param name="task" value="classification" />
140 <param name="method" value="remove" />
141 <param name="label_issues" value="true" />
142 <param name="outliers" value="false" />
143 <param name="near_duplicates" value="false" />
144 <param name="non_iid" value="false" />
145 </conditional>
146
147 <output name="report_file">
148 <assert_contents>
149 <has_text_matching expression="label\s+(0(\.\d+)?|1(\.0+)?)\s+\d+"/>
150 </assert_contents>
151 </output>
152
153 <output name="output_file">
154 <assert_contents>
155 <has_text_matching expression=".*target.*"/>
156 <has_text_matching expression="^.*,.+,.+"/>
157 </assert_contents>
158 </output>
159 </test>
160
161 <!-- Test4: summary and cleaned outliers only -->
162 <test expect_num_outputs="2">
163 <param name="input_file" value="breast_cancer.csv" />
164 <param name="target_column" value="target" />
165 <param name="summary_only" value="false" />
166
167 <conditional name="task_block">
168 <param name="task" value="classification" />
169 <param name="method" value="remove" />
170 <param name="label_issues" value="false" />
171 <param name="outliers" value="true" />
172 <param name="near_duplicates" value="false" />
173 <param name="non_iid" value="false" />
174 </conditional>
175
176 <output name="report_file">
177 <assert_contents>
178 <has_text_matching expression="outlier\s+(0(\.\d+)?|1(\.0+)?)\s+\d+"/>
179 </assert_contents>
180 </output>
181
182 <output name="output_file">
183 <assert_contents>
184 <has_text_matching expression=".*target.*"/>
185 <has_text_matching expression="^.*,.+,.+"/>
186 </assert_contents>
187 </output>
188 </test>
189
190 <!-- Test5: summary and clean all for tsv -->
191 <test expect_num_outputs="2">
192 <param name="input_file" value="breast_cancer.tsv" />
193 <param name="target_column" value="target" />
194 <param name="summary_only" value="false" />
195
196 <conditional name="task_block">
197 <param name="task" value="classification" />
198 <param name="method" value="remove" />
199 <param name="label_issues" value="true" />
200 <param name="outliers" value="true" />
201 <param name="near_duplicates" value="true" />
202 <param name="non_iid" value="true" />
203 </conditional>
204
205 <output name="report_file">
206 <assert_contents>
207 <has_text_matching expression="issue_type\s+score\s+num_issues"/>
208 <has_text_matching expression="label\s+(0(\.\d+)?|1(\.0+)?)\s+\d+"/>
209 <has_text_matching expression="outlier\s+(0(\.\d+)?|1(\.0+)?)\s+\d+"/>
210 <has_text_matching expression="non_iid\s+(0(\.\d+)?|1(\.0+)?)\s+\d+"/>
211 </assert_contents>
212 </output>
213
214 <output name="output_file">
215 <assert_contents>
216 <has_text_matching expression=".*target.*"/>
217 <has_text_matching expression="^.*\t.+\t.+"/>
218 </assert_contents>
219 </output>
220 </test>
221
222 <!-- Test6: regression with summary and cleaned output -->
223 <test expect_num_outputs="2">
224 <param name="input_file" value="reg_1027_ESL.csv" />
225 <param name="target_column" value="target" />
226 <param name="summary_only" value="false" />
227
228 <conditional name="task_block">
229 <param name="task" value="regression" />
230 <param name="method" value="remove" />
231 </conditional>
232
233 <output name="report_file">
234 <assert_contents>
235 <has_text text="Regression Issue Summary:"/>
236 <has_text_matching expression="Num low quality:"/>
237 <has_text_matching expression="Mean label quality:"/>
238 </assert_contents>
239 </output>
240
241 <output name="output_file">
242 <assert_contents>
243 <has_text_matching expression=".*target.*"/>
244 <has_text_matching expression="^.*,.+,.+"/>
245 </assert_contents>
246 </output>
247 </test>
248 </tests>
249
250 <help><![CDATA[
251 **Cleanlab Issue Handler**
252
253 This Galaxy tool identifies and optionally removes or corrects data issues in supervised learning datasets using the `Cleanlab <https://cleanlab.io/>`_ Python library. It supports **classification** and **regression** tasks and helps improve dataset quality by detecting label errors, outliers, near-duplicate entries, and non-IID samples.
254
255 The tool internally fits a cross-validated model (e.g., via XGBoost) to estimate label quality and identify problematic samples. These issues can be summarized in a report, and optionally addressed via removal or correction (depending on task and selected method).
256
257 --------------------
258
259 **Detected Issue Types (with technical examples)**
260
261 - **Label Issues**
262 These are samples whose label in the dataset is likely incorrect.
263 **Example:** In a medical classification dataset, a patient's record is labeled as "benign," but its feature pattern is highly similar to correctly labeled "malignant" cases.
264
265
266 - **Outliers**
267 Points that are statistically distant from the rest of the dataset.
268 **Example**: An entry with unusually high or low feature values (e.g., several standard deviations away from the mean).
269
270 - **Near-Duplicates**
271 Highly similar or repeated samples.
272 **Example**: Two rows with nearly identical features and labels — possibly a duplication or copy artifact.
273
274 - **Non-IID Samples** *(classification only)*
275 Samples that violate the assumption of independent and identically distributed data.
276 **Example**: A subset from a different population source (e.g., a different hospital or device) introducing distributional shift.
277
278 --------------------
279
280 **Parameters**
281
282 - **Input file**: Tabular file (CSV/TSV) containing the column named in **Target column name** (default: ``target``).
283 - **Task type**: ``classification`` or ``regression``.
284 - **Method**: ``remove`` (delete problematic rows) or ``replace`` (correct labels — classification only).
285 - **Only generate summary report?**: If checked, the input data is left unchanged and only a summary report is produced.
286 - **Issue types** *(classification only)*: Choose which issues to detect and handle.
287 - **Quality threshold** *(regression only)*: A float between 0.0 and 1.0 that determines how aggressively the tool flags low-quality labels in regression. Labels with quality scores below this threshold will be removed.
288
289 --------------------
290
291 **Outputs**
292
293 - **summary.txt**: Report listing each issue type, confidence score, and number of affected rows.
294 - **cleaned_data**: Cleaned dataset (CSV/TSV), only produced if "Only generate summary report?" is unchecked.
295
296 ]]></help>
297
298 <citations> <!-- Confident learning paper behind Cleanlab; a journal article, so it uses the article entry type with a journal field -->
299 <citation type="bibtex">
300 @article{northcutt2021confident,
301 title={Confident learning: Estimating uncertainty in dataset labels},
302 author={Northcutt, Curtis G and Jiang, Lu and Chuang, Isaac L},
303 journal={Journal of Artificial Intelligence Research},
304 year={2021},
305 volume={70},
306 pages={1373--1411}
307 }
308 </citation>
309 </citations>
310 </tool>