Mercurial > repos > bgruening > cleanlab
comparison cleanlab_issue_handler.xml @ 0:ecc18228c32e draft default tip
planemo upload for repository https://github.com/cleanlab/cleanlab commit ac4753a61ee908bc2a5953b6c6d38d2bbbacc6c0
| author | bgruening |
|---|---|
| date | Wed, 28 May 2025 11:30:39 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:ecc18228c32e |
|---|---|
| 1 <tool id="cleanlab_issue_handler" name="Cleanlab Issue Handler" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="23.0"> | |
| 2 <description>Detect and optionally clean data issues using Cleanlab</description> | |
| 3 <macros> <!-- Tokens substituted into the tool's version attribute (@TOOL_VERSION@+galaxy@VERSION_SUFFIX@). --> | |
| 4 <token name="@TOOL_VERSION@">2.7.1</token> <!-- Same value as the pinned cleanlab package version. --> | |
| 5 <token name="@VERSION_SUFFIX@">1.0</token> <!-- Appended after "+galaxy" in the tool version. --> | |
| 6 </macros> | |
| 7 | |
| 8 <requirements> <!-- Packages needed by the job; cleanlab follows the TOOL_VERSION token so the tool version and the installed package cannot drift apart. --> | |
| 9 <requirement type="package" version="@TOOL_VERSION@">cleanlab</requirement> | |
| 10 <requirement type="package" version="3.6.0">datasets</requirement> | |
| 11 <requirement type="package" version="3.0.0">xgboost</requirement> | |
| 12 </requirements> | |
| 13 | |
| 14 <!-- Builds the cleanlab_issue_handler.py call. Option values are single-quoted so the shell never expands them; the boolean flags stay bare so that an empty truevalue adds no argument at all. --> <command detect_errors="exit_code"><![CDATA[ | |
| 15 python '${__tool_directory__}/cleanlab_issue_handler.py' | |
| 16 --input_file '$input_file' '$input_file.ext' | |
| 17 --target_column '$target_column' | |
| 18 --task '$task_block.task' | |
| 19 --method '$task_block.method' | |
| 20 #if $summary_only: | |
| 21 --summary | |
| 22 #end if | |
| 23 #if str($task_block.task) == "classification": | |
| 24 $task_block.label_issues | |
| 25 $task_block.outliers | |
| 26 $task_block.near_duplicates | |
| 27 $task_block.non_iid | |
| 28 #elif str($task_block.task) == "regression": | |
| 29 --quality-threshold '$task_block.quality_threshold' | |
| 30 #end if | |
| 31 ]]></command> | |
| 32 | |
| 33 <inputs> | |
| 34 <param name="input_file" type="data" format="csv,tsv,tabular" label="Input data file"/> | |
| 35 <param name="target_column" type="text" optional="false" label="Target column name" value="target" help="Name of the target column in the input data file. Default is 'target'."/> | |
| 36 <param name="summary_only" type="boolean" label="Only generate summary report?" checked="false"/> | |
| 37 <!-- The selected task decides which cleaning methods and extra options are offered below. --> | |
| 38 <conditional name="task_block"> | |
| 39 <param name="task" type="select" label="Task type"> | |
| 40 <option value="classification">Classification</option> | |
| 41 <option value="regression">Regression</option> | |
| 42 </param> | |
| 43 <!-- Classification: a checked box contributes nothing to the command line (empty truevalue); an unchecked box contributes its falsevalue flag. --> | |
| 44 <when value="classification"> | |
| 45 <param name="method" type="select" label="Cleaning method"> | |
| 46 <option value="remove">Remove problematic rows</option> | |
| 47 <option value="replace">Replace problematic labels (classification only)</option> | |
| 48 </param> | |
| 49 <param name="label_issues" type="boolean" truevalue="" falsevalue="--no-label-issues" label="Remove/Replace label issues" checked="true"/> | |
| 50 <param name="outliers" type="boolean" truevalue="" falsevalue="--no-outliers" label="Remove/Replace outlier issues" checked="true"/> | |
| 51 <param name="near_duplicates" type="boolean" truevalue="" falsevalue="--no-near-duplicates" label="Remove/Replace near-duplicate issues" checked="true"/> | |
| 52 <param name="non_iid" type="boolean" truevalue="" falsevalue="--no-non-iid" label="Remove/Replace non-IID issues" checked="true"/> | |
| 53 </when> | |
| 54 <!-- Regression: "remove" is the only method, and a quality threshold (0.0 to 1.0, default 0.2) is passed to the script. --> | |
| 55 <when value="regression"> | |
| 56 <param name="method" type="select" label="Cleaning method"> | |
| 57 <option value="remove">Remove problematic rows</option> | |
| 58 <!-- No "replace" option for regression --> | |
| 59 </param> | |
| 60 <param name="quality_threshold" type="float" label="Quality threshold" value="0.2" min="0.0" max="1.0" help="Threshold for low-quality labels. Default is 0.2."/> | |
| 61 <!-- No issue type parameters shown --> | |
| 62 </when> | |
| 63 </conditional> | |
| 64 </inputs> | |
| 65 | |
| 66 <outputs> <!-- Both outputs are collected from the job working directory via from_work_dir. --> | |
| 67 <data name="report_file" from_work_dir="summary.txt" format="txt" label="Issue Report"/> | |
| 68 <!-- The cleaned dataset inherits the datatype of input_file and is produced only when summary_only is false. --> | |
| 69 <data name="output_file" from_work_dir="cleaned_data" format_source="input_file" label="cleaned_${input_file.name}"> | |
| 70 <filter>not summary_only</filter> | |
| 71 </data> | |
| 72 </outputs> | |
| 73 | |
| 74 <tests> | |
| 75 <!-- Test 1: classification with summary_only enabled. Expects exactly one output (the report) containing the table header plus label, outlier and non_iid rows, each with a score between 0 and 1 and an integer issue count. --> | |
| 76 <test expect_num_outputs="1"> | |
| 77 <param name="input_file" value="breast_cancer.csv" /> | |
| 78 <param name="target_column" value="target" /> | |
| 79 <param name="summary_only" value="true" /> | |
| 80 | |
| 81 <conditional name="task_block"> | |
| 82 <param name="task" value="classification" /> | |
| 83 <param name="method" value="remove" /> | |
| 84 <param name="label_issues" value="true" /> | |
| 85 <param name="outliers" value="true" /> | |
| 86 <param name="near_duplicates" value="true" /> | |
| 87 <param name="non_iid" value="true" /> | |
| 88 </conditional> | |
| 89 | |
| 90 <output name="report_file"> | |
| 91 <assert_contents> | |
| 92 <has_text_matching expression="issue_type\s+score\s+num_issues"/> | |
| 93 <has_text_matching expression="label\s+(0(\.\d+)?|1(\.0+)?)\s+\d+"/> | |
| 94 <has_text_matching expression="outlier\s+(0(\.\d+)?|1(\.0+)?)\s+\d+"/> | |
| 95 <has_text_matching expression="non_iid\s+(0(\.\d+)?|1(\.0+)?)\s+\d+"/> | |
| 96 </assert_contents> | |
| 97 </output> | |
| 98 </test> | |
| 99 | |
| 100 <!-- Test 2: classification, "remove" method with all four issue types enabled. Expects two outputs: the same report rows as Test 1, and a cleaned file that contains the text "target" and comma-separated fields. --> | |
| 101 <test expect_num_outputs="2"> | |
| 102 <param name="input_file" value="breast_cancer.csv" /> | |
| 103 <param name="target_column" value="target" /> | |
| 104 <param name="summary_only" value="false" /> | |
| 105 | |
| 106 <conditional name="task_block"> | |
| 107 <param name="task" value="classification" /> | |
| 108 <param name="method" value="remove" /> | |
| 109 <param name="label_issues" value="true" /> | |
| 110 <param name="outliers" value="true" /> | |
| 111 <param name="near_duplicates" value="true" /> | |
| 112 <param name="non_iid" value="true" /> | |
| 113 </conditional> | |
| 114 | |
| 115 <output name="report_file"> | |
| 116 <assert_contents> | |
| 117 <has_text_matching expression="issue_type\s+score\s+num_issues"/> | |
| 118 <has_text_matching expression="label\s+(0(\.\d+)?|1(\.0+)?)\s+\d+"/> | |
| 119 <has_text_matching expression="outlier\s+(0(\.\d+)?|1(\.0+)?)\s+\d+"/> | |
| 120 <has_text_matching expression="non_iid\s+(0(\.\d+)?|1(\.0+)?)\s+\d+"/> | |
| 121 </assert_contents> | |
| 122 </output> | |
| 123 | |
| 124 <output name="output_file"> | |
| 125 <assert_contents> | |
| 126 <has_text_matching expression=".*target.*"/> | |
| 127 <has_text_matching expression="^.*,.+,.+"/> | |
| 128 </assert_contents> | |
| 129 </output> | |
| 130 </test> | |
| 131 | |
| 132 <!-- Test 3: classification, "remove" method with only label issues enabled (outliers, near-duplicates and non-IID unchecked). Expects a label row in the report and a comma-separated cleaned file containing the text "target". --> | |
| 133 <test expect_num_outputs="2"> | |
| 134 <param name="input_file" value="breast_cancer.csv" /> | |
| 135 <param name="target_column" value="target" /> | |
| 136 <param name="summary_only" value="false" /> | |
| 137 | |
| 138 <conditional name="task_block"> | |
| 139 <param name="task" value="classification" /> | |
| 140 <param name="method" value="remove" /> | |
| 141 <param name="label_issues" value="true" /> | |
| 142 <param name="outliers" value="false" /> | |
| 143 <param name="near_duplicates" value="false" /> | |
| 144 <param name="non_iid" value="false" /> | |
| 145 </conditional> | |
| 146 | |
| 147 <output name="report_file"> | |
| 148 <assert_contents> | |
| 149 <has_text_matching expression="label\s+(0(\.\d+)?|1(\.0+)?)\s+\d+"/> | |
| 150 </assert_contents> | |
| 151 </output> | |
| 152 | |
| 153 <output name="output_file"> | |
| 154 <assert_contents> | |
| 155 <has_text_matching expression=".*target.*"/> | |
| 156 <has_text_matching expression="^.*,.+,.+"/> | |
| 157 </assert_contents> | |
| 158 </output> | |
| 159 </test> | |
| 160 | |
| 161 <!-- Test 4: classification, "remove" method with only outliers enabled (label issues, near-duplicates and non-IID unchecked). Expects an outlier row in the report and a comma-separated cleaned file containing the text "target". --> | |
| 162 <test expect_num_outputs="2"> | |
| 163 <param name="input_file" value="breast_cancer.csv" /> | |
| 164 <param name="target_column" value="target" /> | |
| 165 <param name="summary_only" value="false" /> | |
| 166 | |
| 167 <conditional name="task_block"> | |
| 168 <param name="task" value="classification" /> | |
| 169 <param name="method" value="remove" /> | |
| 170 <param name="label_issues" value="false" /> | |
| 171 <param name="outliers" value="true" /> | |
| 172 <param name="near_duplicates" value="false" /> | |
| 173 <param name="non_iid" value="false" /> | |
| 174 </conditional> | |
| 175 | |
| 176 <output name="report_file"> | |
| 177 <assert_contents> | |
| 178 <has_text_matching expression="outlier\s+(0(\.\d+)?|1(\.0+)?)\s+\d+"/> | |
| 179 </assert_contents> | |
| 180 </output> | |
| 181 | |
| 182 <output name="output_file"> | |
| 183 <assert_contents> | |
| 184 <has_text_matching expression=".*target.*"/> | |
| 185 <has_text_matching expression="^.*,.+,.+"/> | |
| 186 </assert_contents> | |
| 187 </output> | |
| 188 </test> | |
| 189 | |
| 190 <!-- Test 5: same settings as Test 2 but with a TSV input. Expects the full report table and a cleaned file that contains the text "target" and tab-separated fields. --> | |
| 191 <test expect_num_outputs="2"> | |
| 192 <param name="input_file" value="breast_cancer.tsv" /> | |
| 193 <param name="target_column" value="target" /> | |
| 194 <param name="summary_only" value="false" /> | |
| 195 | |
| 196 <conditional name="task_block"> | |
| 197 <param name="task" value="classification" /> | |
| 198 <param name="method" value="remove" /> | |
| 199 <param name="label_issues" value="true" /> | |
| 200 <param name="outliers" value="true" /> | |
| 201 <param name="near_duplicates" value="true" /> | |
| 202 <param name="non_iid" value="true" /> | |
| 203 </conditional> | |
| 204 | |
| 205 <output name="report_file"> | |
| 206 <assert_contents> | |
| 207 <has_text_matching expression="issue_type\s+score\s+num_issues"/> | |
| 208 <has_text_matching expression="label\s+(0(\.\d+)?|1(\.0+)?)\s+\d+"/> | |
| 209 <has_text_matching expression="outlier\s+(0(\.\d+)?|1(\.0+)?)\s+\d+"/> | |
| 210 <has_text_matching expression="non_iid\s+(0(\.\d+)?|1(\.0+)?)\s+\d+"/> | |
| 211 </assert_contents> | |
| 212 </output> | |
| 213 | |
| 214 <output name="output_file"> | |
| 215 <assert_contents> | |
| 216 <has_text_matching expression=".*target.*"/> | |
| 217 <has_text_matching expression="^.*\t.+\t.+"/> | |
| 218 </assert_contents> | |
| 219 </output> | |
| 220 </test> | |
| 221 | |
| 222 <!-- Test 6: regression, "remove" method; quality_threshold is not set here, so the parameter's default value applies. Expects the regression summary lines in the report and a comma-separated cleaned file containing the text "target". --> | |
| 223 <test expect_num_outputs="2"> | |
| 224 <param name="input_file" value="reg_1027_ESL.csv" /> | |
| 225 <param name="target_column" value="target" /> | |
| 226 <param name="summary_only" value="false" /> | |
| 227 | |
| 228 <conditional name="task_block"> | |
| 229 <param name="task" value="regression" /> | |
| 230 <param name="method" value="remove" /> | |
| 231 </conditional> | |
| 232 | |
| 233 <output name="report_file"> | |
| 234 <assert_contents> | |
| 235 <has_text text="Regression Issue Summary:"/> | |
| 236 <has_text_matching expression="Num low quality:"/> | |
| 237 <has_text_matching expression="Mean label quality:"/> | |
| 238 </assert_contents> | |
| 239 </output> | |
| 240 | |
| 241 <output name="output_file"> | |
| 242 <assert_contents> | |
| 243 <has_text_matching expression=".*target.*"/> | |
| 244 <has_text_matching expression="^.*,.+,.+"/> | |
| 245 </assert_contents> | |
| 246 </output> | |
| 247 </test> | |
| 248 </tests> | |
| 249 | |
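| 250 <help><![CDATA[ | |
| 251 **What it does** | |
| 252 | |
| 253 This tool detects and optionally cleans data issues using the `Cleanlab <https://github.com/cleanlab/cleanlab>`_ Python library. It supports **classification** and **regression** tasks and helps improve dataset quality by detecting label errors, outliers, near-duplicate entries, and non-IID samples. | |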
| 254 | |
| 255 The tool internally fits a cross-validated model (e.g., via XGBoost) to estimate label quality and identify problematic samples. These issues can be summarized in a report, and optionally addressed via removal or correction (depending on task and selected method). | |
| 256 | |
| 257 -------------------- | |
| 258 | |
| 259 **Detected Issue Types (with technical examples)** | |
| 260 | |
| 261 - **Label Issues** | |
| 262 These are samples whose label in the dataset is likely incorrect. | |
| 263 **Example**: In a medical classification dataset, a patient's record is labeled as "benign," but its feature pattern is highly similar to correctly labeled "malignant" cases. | |
| 264 | |
| 265 | |
| 266 - **Outliers** | |
| 267 Points that are statistically distant from the rest of the dataset. | |
| 268 **Example**: An entry with unusually high or low feature values (e.g., several standard deviations away from the mean). | |
| 269 | |
| 270 - **Near-Duplicates** | |
| 271 Highly similar or repeated samples. | |
| 272 **Example**: Two rows with nearly identical features and labels — possibly a duplication or copy artifact. | |
| 273 | |
| 274 - **Non-IID Samples** *(classification only)* | |
| 275 Samples that violate the assumption of independent and identically distributed data. | |
| 276 **Example**: A subset from a different population source (e.g., a different hospital or device) introducing distributional shift. | |
| 277 | |
| 278 -------------------- | |
| 279 | |
| 280 **Parameters** | |
| 281 | |
| 282 - **Input file**: Tabular file (CSV/TSV) containing the target column named in **Target column name** (default `target`). | |
| 283 - **Task type**: `classification` or `regression`. | |
| 284 - **Method**: `remove` (delete problematic rows) or `replace` (correct labels — classification only). | |
| 285 - **Only generate summary report?**: If checked, no cleaned dataset is produced; only the summary report is written. | |
| 286 - **Issue types** *(classification only)*: Choose which issues to detect and handle. | |
| 287 - **Quality threshold** *(regression only)*: A float between 0.0 and 1.0 that determines how aggressively the tool flags low-quality labels in regression. Labels with quality scores below this threshold will be removed. | |
| 288 | |
| 289 -------------------- | |
| 290 | |
| 291 **Outputs** | |
| 292 | |
| 293 - **summary.txt**: Report listing each issue type, confidence score, and number of affected rows. | |
| 294 - **cleaned_data**: Cleaned dataset (CSV/TSV), only produced if "Only generate summary report?" is unchecked. | |
| 295 | |
| 296 ]]></help> | |
| 297 | |
| 298 <citations> <!-- JAIR is a journal, so the entry is an @article with a journal field rather than @inproceedings with a booktitle. --> | |
| 299 <citation type="bibtex"> | |
| 300 @article{northcutt2021confident, | |
| 301 title={Confident learning: Estimating uncertainty in dataset labels}, | |
| 302 author={Northcutt, Curtis G and Jiang, Lu and Chuang, Isaac L}, | |
| 303 journal={Journal of Artificial Intelligence Research}, | |
| 304 year={2021}, | |
| 305 volume={70}, | |
| 306 pages={1373--1411} | |
| 307 } | |
| 308 </citation> | |
| 309 </citations> | |
| 310 </tool> |
