Mercurial > repos > bgruening > cleanlab
diff cleanlab_issue_handler.xml @ 0:ecc18228c32e draft default tip
planemo upload for repository https://github.com/cleanlab/cleanlab commit ac4753a61ee908bc2a5953b6c6d38d2bbbacc6c0
| author | bgruening |
|---|---|
| date | Wed, 28 May 2025 11:30:39 +0000 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cleanlab_issue_handler.xml Wed May 28 11:30:39 2025 +0000 @@ -0,0 +1,310 @@ +<tool id="cleanlab_issue_handler" name="Cleanlab Issue Handler" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="23.0"> + <description>Detect and optionally clean data issues using Cleanlab</description> + <macros> + <token name="@TOOL_VERSION@">2.7.1</token> + <token name="@VERSION_SUFFIX@">1.0</token> + </macros> + + <requirements> + <requirement type="package" version="2.7.1">cleanlab</requirement> + <requirement type="package" version="3.6.0">datasets</requirement> + <requirement type="package" version="3.0.0">xgboost</requirement> + </requirements> + + <command detect_errors="exit_code"><![CDATA[ + python '${__tool_directory__}/cleanlab_issue_handler.py' + --input_file '$input_file' '$input_file.ext' + --target_column '$target_column' + --task "$task_block.task" + --method "$task_block.method" + #if $summary_only: + --summary + #end if + #if str($task_block.task) == "classification": + $task_block.label_issues + $task_block.outliers + $task_block.near_duplicates + $task_block.non_iid + #elif str($task_block.task) == "regression": + --quality-threshold "$task_block.quality_threshold" + #end if + ]]></command> + + <inputs> + <param name="input_file" type="data" format="csv,tsv,tabular" label="Input data file"/> + <param name="target_column" type="text" optional="false" label="Target column name" value="target" help="Name of the target column in the input data file. Default is 'target'."/> + <param name="summary_only" type="boolean" label="Only generate summary report?" checked="false"/> + + <conditional name="task_block"> + <param name="task" type="select" label="Task type"> + <option value="classification">Classification</option> + <option value="regression">Regression</option> + </param> + + <when value="classification"> + <param name="method" type="select" label="Cleaning method"> + <option value="remove">Remove problematic rows</option> + <option value="replace">Replace problematic labels (classification only)</option> + </param> + <param name="label_issues" type="boolean" truevalue="" falsevalue="--no-label-issues" label="Remove/Replace label issues" checked="true"/> + <param name="outliers" type="boolean" truevalue="" falsevalue="--no-outliers" label="Remove/Replace outlier issues" checked="true"/> + <param name="near_duplicates" type="boolean" truevalue="" falsevalue="--no-near-duplicates" label="Remove/Replace near-duplicate issues" checked="true"/> + <param name="non_iid" type="boolean" truevalue="" falsevalue="--no-non-iid" label="Remove/Replace non-IID issues" checked="true"/> + </when> + + <when value="regression"> + <param name="method" type="select" label="Cleaning method"> + <option value="remove">Remove problematic rows</option> + <!-- No "replace" option for regression --> + </param> + <param name="quality_threshold" type="float" label="Quality threshold" value="0.2" min="0.0" max="1.0" help="Threshold for low-quality labels. Default is 0.2."/> + <!-- No issue type parameters shown --> + </when> + </conditional> + </inputs> + + <outputs> + <data name="report_file" from_work_dir="summary.txt" format="txt" label="Issue Report"/> + + <data name="output_file" from_work_dir="cleaned_data" format_source="input_file" label="cleaned_${input_file.name}"> + <filter>not summary_only</filter> + </data> + </outputs> + + <tests> + <!-- Test1: only summary --> + <test expect_num_outputs="1"> + <param name="input_file" value="breast_cancer.csv" /> + <param name="target_column" value="target" /> + <param name="summary_only" value="true" /> + + <conditional name="task_block"> + <param name="task" value="classification" /> + <param name="method" value="remove" /> + <param name="label_issues" value="true" /> + <param name="outliers" value="true" /> + <param name="near_duplicates" value="true" /> + <param name="non_iid" value="true" /> + </conditional> + + <output name="report_file"> + <assert_contents> + <has_text_matching expression="issue_type\s+score\s+num_issues"/> + <has_text_matching expression="label\s+(0(\.\d+)?|1(\.0+)?)\s+\d+"/> + <has_text_matching expression="outlier\s+(0(\.\d+)?|1(\.0+)?)\s+\d+"/> + <has_text_matching expression="non_iid\s+(0(\.\d+)?|1(\.0+)?)\s+\d+"/> + </assert_contents> + </output> + </test> + + <!-- Test2: summary and cleaned all --> + <test expect_num_outputs="2"> + <param name="input_file" value="breast_cancer.csv" /> + <param name="target_column" value="target" /> + <param name="summary_only" value="false" /> + + <conditional name="task_block"> + <param name="task" value="classification" /> + <param name="method" value="remove" /> + <param name="label_issues" value="true" /> + <param name="outliers" value="true" /> + <param name="near_duplicates" value="true" /> + <param name="non_iid" value="true" /> + </conditional> + + <output name="report_file"> + <assert_contents> + <has_text_matching expression="issue_type\s+score\s+num_issues"/> + <has_text_matching expression="label\s+(0(\.\d+)?|1(\.0+)?)\s+\d+"/> + <has_text_matching expression="outlier\s+(0(\.\d+)?|1(\.0+)?)\s+\d+"/> + <has_text_matching expression="non_iid\s+(0(\.\d+)?|1(\.0+)?)\s+\d+"/> + </assert_contents> + </output> + + <output name="output_file"> + <assert_contents> + <has_text_matching expression=".*target.*"/> + <has_text_matching expression="^.*,.+,.+"/> + </assert_contents> + </output> + </test> + + <!-- Test3: summary and cleaned label issues only --> + <test expect_num_outputs="2"> + <param name="input_file" value="breast_cancer.csv" /> + <param name="target_column" value="target" /> + <param name="summary_only" value="false" /> + + <conditional name="task_block"> + <param name="task" value="classification" /> + <param name="method" value="remove" /> + <param name="label_issues" value="true" /> + <param name="outliers" value="false" /> + <param name="near_duplicates" value="false" /> + <param name="non_iid" value="false" /> + </conditional> + + <output name="report_file"> + <assert_contents> + <has_text_matching expression="label\s+(0(\.\d+)?|1(\.0+)?)\s+\d+"/> + </assert_contents> + </output> + + <output name="output_file"> + <assert_contents> + <has_text_matching expression=".*target.*"/> + <has_text_matching expression="^.*,.+,.+"/> + </assert_contents> + </output> + </test> + + <!-- Test4: summary and cleaned outliers only --> + <test expect_num_outputs="2"> + <param name="input_file" value="breast_cancer.csv" /> + <param name="target_column" value="target" /> + <param name="summary_only" value="false" /> + + <conditional name="task_block"> + <param name="task" value="classification" /> + <param name="method" value="remove" /> + <param name="label_issues" value="false" /> + <param name="outliers" value="true" /> + <param name="near_duplicates" value="false" /> + <param name="non_iid" value="false" /> + </conditional> + + <output name="report_file"> + <assert_contents> + <has_text_matching expression="outlier\s+(0(\.\d+)?|1(\.0+)?)\s+\d+"/> + </assert_contents> + </output> + + <output name="output_file"> + <assert_contents> + <has_text_matching expression=".*target.*"/> + <has_text_matching expression="^.*,.+,.+"/> + </assert_contents> + </output> + </test> + + <!-- Test5: summary and clean all for tsv --> + <test expect_num_outputs="2"> + <param name="input_file" value="breast_cancer.tsv" /> + <param name="target_column" value="target" /> + <param name="summary_only" value="false" /> + + <conditional name="task_block"> + <param name="task" value="classification" /> + <param name="method" value="remove" /> + <param name="label_issues" value="true" /> + <param name="outliers" value="true" /> + <param name="near_duplicates" value="true" /> + <param name="non_iid" value="true" /> + </conditional> + + <output name="report_file"> + <assert_contents> + <has_text_matching expression="issue_type\s+score\s+num_issues"/> + <has_text_matching expression="label\s+(0(\.\d+)?|1(\.0+)?)\s+\d+"/> + <has_text_matching expression="outlier\s+(0(\.\d+)?|1(\.0+)?)\s+\d+"/> + <has_text_matching expression="non_iid\s+(0(\.\d+)?|1(\.0+)?)\s+\d+"/> + </assert_contents> + </output> + + <output name="output_file"> + <assert_contents> + <has_text_matching expression=".*target.*"/> + <has_text_matching expression="^.*\t.+\t.+"/> + </assert_contents> + </output> + </test> + + <!-- Test6: regression with summary and cleaned output --> + <test expect_num_outputs="2"> + <param name="input_file" value="reg_1027_ESL.csv" /> + <param name="target_column" value="target" /> + <param name="summary_only" value="false" /> + + <conditional name="task_block"> + <param name="task" value="regression" /> + <param name="method" value="remove" /> + </conditional> + + <output name="report_file"> + <assert_contents> + <has_text text="Regression Issue Summary:"/> + <has_text_matching expression="Num low quality:"/> + <has_text_matching expression="Mean label quality:"/> + </assert_contents> + </output> + + <output name="output_file"> + <assert_contents> + <has_text_matching expression=".*target.*"/> + <has_text_matching expression="^.*,.+,.+"/> + </assert_contents> + </output> + </test> +</tests> + + <help>< Python library. It supports **classification** and **regression** tasks and helps improve dataset quality by detecting label errors, outliers, near-duplicate entries, and non-IID samples. + +The tool internally fits a cross-validated model (e.g., via XGBoost) to estimate label quality and identify problematic samples. These issues can be summarized in a report, and optionally addressed via removal or correction (depending on task and selected method). + +-------------------- + +**Detected Issue Types (with technical examples)** + +- **Label Issues** + These are samples whose label in the dataset is likely incorrect. + **Example:** In a medical classification dataset, a patient's record is labeled as "benign," but its feature pattern is highly similar to correctly labeled "malignant" cases. + + +- **Outliers** + Points that are statistically distant from the rest of the dataset. + **Example**: An entry with unusually high or low feature values (e.g., several standard deviations away from the mean). + +- **Near-Duplicates** + Highly similar or repeated samples. + **Example**: Two rows with nearly identical features and labels — possibly a duplication or copy artifact. + +- **Non-IID Samples** *(classification only)* + Samples that violate the assumption of independent and identically distributed data. + **Example**: A subset from a different population source (e.g., a different hospital or device) introducing distributional shift. + +-------------------- + +**Parameters** + +- **Input file**: Tabular file (CSV/TSV) with a `target` column. +- **Task type**: `classification` or `regression`. +- **Method**: `remove` (delete problematic rows) or `replace` (correct labels — classification only). +- **Only report issues**: If checked, input data is unchanged; only a summary report is produced. +- **Issue types**: Choose which issues to detect and handle. +- **Quality threshold** *(regression only)*: A float between 0.0 and 1.0 that determines how aggressively the tool flags low-quality labels in regression. Labels with quality scores below this threshold will be removed. + +-------------------- + +**Outputs** + +- **summary.txt**: Report listing each issue type, confidence score, and number of affected rows. +- **cleaned_data**: Cleaned dataset (CSV/TSV), only produced if "Only report issues" is unchecked. + + ]]></help> + + <citations> + <citation type="bibtex"> +@inproceedings{northcutt2021confident, + title={Confident learning: Estimating uncertainty in dataset labels}, + author={Northcutt, Curtis G and Jiang, Lu and Chuang, Alex}, + booktitle={Journal of Artificial Intelligence Research}, + year={2021}, + volume={70}, + pages={1373--1411} +} + </citation> + </citations> +</tool>
