Mercurial > repos > bgruening > sklearn_build_pipeline

diff pipeline.xml @ 15:a7a047cf36d8 draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 5b2ac730ec6d3b762faa9034eddd19ad1b347476"
author: bgruening
date: Mon, 16 Dec 2019 09:59:56 +0000
parents: 0c2586a48d0f
children: 449bd57f70f4
--- a/pipeline.xml	Thu Nov 07 05:09:07 2019 -0500
+++ b/pipeline.xml	Mon Dec 16 09:59:56 2019 +0000
@@ -1,5 +1,5 @@
 <tool id="sklearn_build_pipeline" name="Pipeline Builder" version="@VERSION@">
-    <description>constructs a list of transforms and a final estimator</description>
+    <description>an all-in-one platform to build pipeline, single estimator, preprocessor and custom wrappers</description>
     <macros>
         <import>main_macros.xml</import>
     </macros>
@@ -31,7 +31,10 @@
 from sklearn.pipeline import make_pipeline
 from imblearn.pipeline import make_pipeline as imb_make_pipeline
 from galaxy_ml.utils import (SafeEval, feature_selector, get_estimator,
-                             try_get_attr, get_search_params)
+                             try_get_attr, get_search_params, load_model)
+
+## TODO remove following imports after scikit-learn v0.22
+from sklearn.experimental import enable_hist_gradient_boosting
 
 
 N_JOBS = int(__import__('os').environ.get('GALAXY_SLOTS', 1))
@@ -174,27 +177,33 @@
     if len(pipeline_steps) == 0:
         sys.exit("No pipeline steps specified!")
     ## else:  turn the last pre-process component to final estimator
+elif estimator_json['selected_module'] == 'sklearn.compose':
+    #if $final_estimator.estimator_selector.selected_module == 'sklearn.compose':
+    regressor_path = '$final_estimator.estimator_selector.regressor'
+    transformer_path = '$final_estimator.estimator_selector.transformer'
+    #end if
+    with open(regressor_path, 'rb') as f:
+        regressor = load_model(f)
+    with open(transformer_path, 'rb') as f:
+        transformer = load_model(f)
+    estimator = compose.TransformedTargetRegressor(regressor=regressor, transformer=transformer)
+    pipeline_steps.append( estimator )
 else:
     estimator = get_estimator(estimator_json)
     pipeline_steps.append( estimator )
 
-#if $output_type == 'Final_Estimator_Builder':
-with open('$outfile', 'wb') as out_handler:
-    final_est = pipeline_steps[-1]
-    print(final_est)
-    pickle.dump(final_est, out_handler, pickle.HIGHEST_PROTOCOL)
-out_obj = final_est
-#else:
-if has_imblearn:
-    pipeline = imb_make_pipeline(*pipeline_steps)
+if len(pipeline_steps) == 1:
+    out_obj = pipeline_steps[-1]
+    print(out_obj)
 else:
-    pipeline = make_pipeline(*pipeline_steps)
-pprint.pprint(pipeline.named_steps)
+    if has_imblearn:
+        out_obj = imb_make_pipeline(*pipeline_steps)
+    else:
+        out_obj = make_pipeline(*pipeline_steps)
+    pprint.pprint(out_obj.named_steps)
 
 with open('$outfile', 'wb') as out_handler:
-    pickle.dump(pipeline, out_handler, pickle.HIGHEST_PROTOCOL)
-out_obj = pipeline
-#end if
+    pickle.dump(out_obj, out_handler, pickle.HIGHEST_PROTOCOL)
 
 #if $get_params
 results = get_search_params(out_obj)
@@ -262,12 +271,20 @@
             <conditional name="estimator_selector">
                 <param name="selected_module" type="select" label="Choose the module that contains target estimator:" >
                     <expand macro="estimator_module_options">
+                        <option value="sklearn.compose">sklearn.compose</option>
                         <option value="binarize_target">Binarize Target Classifier or Regressor</option>
                         <option value="custom_estimator">Load a custom estimator</option>
                         <option value="none">none -- The last component of pre-processing step will turn to a final estimator</option>
                     </expand>
                 </param>
                 <expand macro="estimator_suboptions">
+                    <when value="sklearn.compose">
+                        <param name="selected_estimator" type="select" label="Choose estimator class:">
+                            <option value="TransformedTargetRegressor" selected="true">TransformedTargetRegressor</option>
+                        </param>
+                        <param name="regressor" type="data" format="zip" label="Choose the dataset containing the wrapped regressor"/>
+                        <param name="transformer" type="data" format="zip" label="Choose the dataset containing transformer"/>
+                    </when>
                     <when value="binarize_target">
                         <param name="clf_or_regr" type="select" label="Classifier or Regressor:">
                             <option value="BinarizeTargetClassifier">BinarizeTargetClassifier</option>
@@ -285,21 +302,55 @@
                 </expand>
             </conditional>
         </section>
-        <param name="output_type" type="select" label="Output the final estimator instead?">
+        <!--param name="output_type" type="select" label="Output the final estimator instead?">
             <option value="Pipeline_Builder" selected="true">Pipeline</option>
             <option value="Final_Estimator_Builder">Final Estimator</option>
-        </param>
+        </param>-->
         <param name="get_params" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="false" label="Output parameters for searchCV?"
                 help="Optional. Tunable parameters could be obtained through `estimator_attributes` tool."/>
     </inputs>
     <outputs>
-        <data format="zip" name="outfile" label="${output_type}"/>
-        <data format="tabular" name="outfile_params" label="get_params for ${output_type}">
+        <data format="zip" name="outfile" label="New Pipeline/Estimator"/>
+        <data format="tabular" name="outfile_params" label="get_params for Pipeline/Estimator">
             <filter>get_params</filter>
         </data>
     </outputs>
     <tests>
         <test>
+            <conditional name="component_selector">
+                <param name="component_type" value="pre_processor"/>
+                <conditional name="pre_processors">
+                    <param name="selected_pre_processor" value="QuantileTransformer"/>
+                    <section name="options">
+                        <param name="random_state" value="10"/>
+                    </section>
+                </conditional>
+            </conditional>
+            <section name="final_estimator">
+                <conditional name="estimator_selector">
+                    <param name="selected_module" value="none"/>
+                </conditional>
+            </section>
+            <output name="outfile" file="pipeline17" compare="sim_size" delta="5"/>
+        </test>
+        <test>
+            <conditional name="component_selector">
+                <param name="component_type" value="pre_processor"/>
+                <conditional name="pre_processors">
+                    <param name="selected_pre_processor" value="PowerTransformer"/>
+                </conditional>
+            </conditional>
+            <section name="final_estimator">
+                <conditional name="estimator_selector">
+                    <param name="selected_module" value="sklearn.compose"/>
+                    <param name="regressor" value="RandomForestRegressor01.zip" ftype="zip"/>
+                    <param name="transformer" value="pipeline17" ftype="zip"/>
+                </conditional>
+            </section>
+            <param name="get_params" value="true"/>
+            <output name="outfile_params" file="pipeline_params18" ftype="tabular"/>
+        </test>
+        <test>
             <repeat name="pipeline_component">
                 <conditional name="component_selector">
                     <param name="component_type" value="pre_processor"/>
@@ -370,7 +421,9 @@
             <param name="selected_module" value="ensemble"/>
             <param name="selected_estimator" value="RandomForestRegressor"/>
             <param name="text_params" value="n_estimators=100, random_state=42"/>
+            <param name="get_params" value="true"/>
             <output name="outfile" file="pipeline05" compare="sim_size" delta="5"/>
+            <output name="outfile_params" file="pipeline_params05.tabular" ftype="tabular"/>
         </test>
         <test>
             <conditional name="component_selector">
@@ -421,14 +474,6 @@
         </test>
         <test>
             <conditional name="component_selector">
-                <param name="component_type" value="None"/>
-            </conditional>
-            <param name="selected_module" value="ensemble"/>
-            <param name="selected_estimator" value="AdaBoostRegressor"/>
-            <output name="outfile" file="pipeline10" compare="sim_size" delta="5"/>
-        </test>
-        <test>
-            <conditional name="component_selector">
                 <param name="component_type" value="imblearn"/>
                 <conditional name="imblearn_selector">
                     <param name="select_algorithm" value="under_sampling.EditedNearestNeighbours"/>
@@ -471,7 +516,6 @@
             </conditional>
             <param name="selected_module" value="ensemble"/>
             <param name="selected_estimator" value="RandomForestClassifier"/>
-            <param name="output_type" value="Final_Estimator_Builder"/>
             <output name="outfile" file="RandomForestClassifier.zip" compare="sim_size" delta="5"/>
         </test>
         <test>
@@ -483,7 +527,6 @@
                     <param name="selected_module" value="none"/>
                 </conditional>
             </section>
-            <param name="output_type" value="Final_Estimator_Builder"/>
             <output name="outfile" file="pipeline14" compare="sim_size" delta="5"/>
         </test>
         <test>
@@ -497,7 +540,6 @@
                     <param name="wrapped_estimator" value="RandomForestClassifier.zip" ftype="zip"/>
                 </conditional>
             </section>
-            <param name="output_type" value="Final_Estimator_Builder"/>
             <output name="outfile" file="pipeline15" compare="sim_size" delta="5"/>
         </test>
         <test>
@@ -521,13 +563,32 @@
     <help>
         <![CDATA[
 **What it does**
-Constructs a pipeline that contains a list of transfroms and a final estimator. Pipeline assembles several steps
-that can be cross-validated together while setting different parameters.
-please refer to `Scikit-learn pipeline Pipeline`_.
+This tool builds not only a scikit-learn pipeline object, but also a single main estimator or a single preprocessing component. The output object type depends on the number of pipeline steps. When there is only one step (choose `None` for the others), either a main estimator or a preprocessor, that component is output directly instead of being wrapped in a pipeline object.
+
+A typical pipeline chains one or more preprocessing steps with a final main estimator, for example [VarianceThreshold, StandardScaler, SGDClassifier], which is composed of a feature selector, a preprocessing scaler and a main estimator.
+For more information, please refer to `Scikit-learn pipeline Pipeline`_.
+
+**Pre-processing components** come from `sklearn.preprocessing`_, `feature_selection`_, `decomposition`_, `kernel_approximation`_, `cluster.FeatureAgglomeration`_, `skrebate`_ and more.
+
+**Final Estimator** supports estimators from `xgboost`_ and many scikit-learn modules, including `svm`_, `linear_model`_, `ensemble`_, `naive_bayes`_, `tree`_, `neighbors`_ and so on.
+
+**Custom estimators**
+
+- `GenomeOneHotEncoder`_
 
-**Pre-processing components** allow None, one or a combination of up to 5 transformations from `sklearn.preprocessing`_, `feature_selection`_, `decomposition`_, `kernel_approximation`_, `cluster.FeatureAgglomeration`_ and/or `skrebate`_.
+- `ProteinOnehotEncoder`_
+
+- `IRAPSClassifier`_
+
+- `BinarizeTargetClassifier`_
 
-**Estimator** selector supports estimators from `xgboost`_ and many scikit-learn modules, including `svm`_, `linear_model`_, `ensemble`_, `naive_bayes`_, `tree`_ and `neighbors`_.
+- `BinarizeTargetRegressor`_
+
+**Output**
+
+- Pickled pipeline/estimator object
+
+- Hyperparameters of the object (optional)
 
 
 .. _`Scikit-learn pipeline Pipeline`: http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
@@ -546,6 +607,12 @@
 .. _`cluster.FeatureAgglomeration`: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.FeatureAgglomeration.html
 .. _`skrebate`: https://epistasislab.github.io/scikit-rebate/using/
 
+.. _`GenomeOneHotEncoder`: https://goeckslab.github.io/Galaxy-ML/APIs/preprocessors/#genomeonehotencoder
+.. _`ProteinOnehotEncoder`: https://goeckslab.github.io/Galaxy-ML/APIs/preprocessors/#proteinonehotencoder
+.. _`IRAPSClassifier`: https://goeckslab.github.io/Galaxy-ML/APIs/iraps-classifier/#irapsclassifier
+.. _`BinarizeTargetClassifier`: https://goeckslab.github.io/Galaxy-ML/APIs/binarize-target/#binarizetargetclassifier
+.. _`BinarizeTargetRegressor`: https://goeckslab.github.io/Galaxy-ML/APIs/binarize-target/#binarizetargetregressor
+
         ]]>
     </help>
     <expand macro="sklearn_citation">
author	bgruening
date	Mon, 16 Dec 2019 09:59:56 +0000
parents	0c2586a48d0f
children	449bd57f70f4