Mercurial > repos > bgruening > sklearn_build_pipeline

diff pipeline.xml @ 0:f8b431b981fa draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 76583c1fcd9d06a4679cc46ffaee44117b9e22cd
author: bgruening
date: Sat, 04 Aug 2018 12:14:28 -0400
children: ddd8c51b7302
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pipeline.xml	Sat Aug 04 12:14:28 2018 -0400
@@ -0,0 +1,281 @@
+<tool id="sklearn_build_pipeline" name="Pipeline Builder" version="@VERSION@">
+    <description>constructs a list of transforms and a final estimator</description>
+    <macros>
+        <import>main_macros.xml</import>
+    </macros>
+    <expand macro="python_requirements"/>
+    <expand macro="macro_stdio"/>
+    <version_command>echo "@VERSION@"</version_command>
+    <command>
+        <![CDATA[
+        python "$sklearn_pipeline_script" '$inputs'
+        ]]>
+    </command>
+    <configfiles>
+        <inputs name="inputs" />
+        <configfile name="sklearn_pipeline_script">
+            <![CDATA[
+import sys
+import json
+import pickle
+import pprint
+import xgboost
+import ast
+import sklearn.feature_selection
+from sklearn import ( preprocessing, svm, linear_model, ensemble, naive_bayes,
+                    tree, neighbors, decomposition, kernel_approximation, cluster)
+from sklearn.pipeline import Pipeline
+
+@GET_ESTIMATOR_FUNCTION@
+@FEATURE_SELECTOR_FUNCTION@
+
+input_json_path = sys.argv[1]
+with open(input_json_path, "r") as param_handler:
+    params = json.load(param_handler)
+
+pipeline_steps = []
+
+def get_component(input_json, check_none=False):
+    if input_json['component_type'] == 'None':
+        if not check_none:
+            return
+        else:
+            sys.exit("The pre-processing component type can't be None when the number of components is greater than 1.")
+    if input_json['component_type'] == 'pre_processor':
+        preprocessor = input_json["pre_processors"]["selected_pre_processor"]
+        pre_processor_options = input_json["pre_processors"]["options"]
+        my_class = getattr(preprocessing, preprocessor)
+        return my_class(**pre_processor_options)
+    if input_json['component_type'] == 'feature_selection':
+        fs_obj = feature_selector(input_json['fs_algorithm_selector'])
+        return fs_obj
+    if input_json['component_type'] == 'decomposition':
+        algorithm = input_json['matrix_decomposition_selector']['select_algorithm']
+        obj = getattr(decomposition, algorithm)()
+        options = input_json['matrix_decomposition_selector']['text_params'].strip()
+        if options != "":
+            options = ast.literal_eval('{' + options + '}')
+            obj.set_params(**options)
+        return obj
+    if input_json['component_type'] == 'kernel_approximation':
+        algorithm = input_json['kernel_approximation_selector']['select_algorithm']
+        obj = getattr(kernel_approximation, algorithm)()
+        options = input_json['kernel_approximation_selector']['text_params'].strip()
+        if options != "":
+            options = ast.literal_eval('{' + options + '}')
+            obj.set_params(**options)
+        return obj
+    if input_json['component_type'] == 'FeatureAgglomeration':
+        algorithm = input_json['FeatureAgglomeration_selector']['select_algorithm']
+        obj = getattr(cluster, algorithm)()
+        options = input_json['FeatureAgglomeration_selector']['text_params'].strip()
+        if options != "":
+            options = ast.literal_eval('{' + options + '}')
+            obj.set_params(**options)
+        return obj
+if len(params['pipeline_component']) == 1:
+    step_obj = get_component( params['pipeline_component'][0]['component_selector'])
+    if step_obj:
+        pipeline_steps.append( ('preprocessing_1', step_obj) )
+else:
+    for i, c in enumerate(params['pipeline_component']):
+        step_obj = get_component( c['component_selector'], check_none=True )
+        pipeline_steps.append( ('preprocessing_' + str(i+1), step_obj) )
+
+# Set up final estimator and add to pipeline.
+estimator_json = params["final_estimator"]['estimator_selector']
+estimator = get_estimator(estimator_json)
+
+pipeline_steps.append( ('estimator', estimator) )
+
+pipeline = Pipeline(pipeline_steps)
+pprint.pprint(pipeline.named_steps)
+
+with open("$outfile", 'wb') as out_handler:
+    pickle.dump(pipeline, out_handler, pickle.HIGHEST_PROTOCOL)
+
+            ]]>
+        </configfile>
+    </configfiles>
+    <inputs>
+        <repeat name="pipeline_component" min="1" max="5" title="Pre-processing step">
+            <conditional name="component_selector">
+                <param name="component_type" type="select" label="Choose the type of transformation:">
+                    <option value="none" selected="true">None</option>
+                    <option value="pre_processor">Sklearn Preprocessor</option>
+                    <option value="feature_selection">Feature Selection</option>
+                    <option value="decomposition">Matrix Decomposition</option>
+                    <option value="kernel_approximation">Kernel Approximation</option>
+                    <option value="FeatureAgglomeration">Agglomerate Features</option>
+                </param>
+                <when value="none"/>
+                <when value="pre_processor">
+                    <conditional name="pre_processors">
+                        <expand macro="sparse_preprocessors_ext" />
+                        <expand macro="sparse_preprocessor_options_ext" />
+                    </conditional>
+                </when>
+                <when value="feature_selection">
+                    <expand macro="feature_selection_all">
+                        <expand macro="fs_selectfrommodel_no_prefitted"/>
+                    </expand>
+                </when>
+                <when value="decomposition">
+                    <expand macro="matrix_decomposition_all"/>
+                </when>
+                <when value="kernel_approximation">
+                    <expand macro="kernel_approximation_all"/>
+                </when>
+                <when value="FeatureAgglomeration">
+                    <expand macro="FeatureAgglomeration"/>
+                </when>
+            </conditional>
+        </repeat>
+        <section name="final_estimator" title="Final Estimator" expanded="true">
+            <expand macro="estimator_selector_all" />
+        </section>
+    </inputs>
+    <outputs>
+        <data format="zip" name="outfile"/>
+    </outputs>
+    <tests>
+        <test>
+            <repeat name="pipeline_component">
+                <conditional name="component_selector">
+                    <param name="component_type" value="pre_processor"/>
+                    <conditional name="pre_processors">
+                        <param name="selected_pre_processor" value="RobustScaler"/>
+                    </conditional>
+                </conditional>
+            </repeat>
+            <repeat name="pipeline_component">
+                <conditional name="component_selector">
+                    <param name="component_type" value="feature_selection"/>
+                    <conditional name="fs_algorithm_selector">
+                        <param name="selected_algorithm" value="SelectKBest"/>
+                        <param name="score_func" value="f_classif"/>
+                    </conditional>
+                </conditional>
+            </repeat>
+            <param name="selected_module" value="svm"/>
+            <param name="selected_estimator" value="SVR"/>
+            <param name="text_params" value="'kernel': 'linear'"/>
+            <output name="outfile" file="pipeline01" compare="sim_size" delta="1"/>
+        </test>
+        <test>
+            <conditional name="component_selector">
+                <param name="component_type" value="pre_processor"/>
+                <conditional name="pre_processors">
+                    <param name="selected_pre_processor" value="RobustScaler"/>
+                </conditional>
+            </conditional>
+            <param name="selected_module" value="linear_model"/>
+            <param name="selected_estimator" value="LassoCV"/>
+            <output name="outfile" file="pipeline02" compare="sim_size" delta="1"/>
+        </test>
+        <test>
+            <conditional name="component_selector">
+                <param name="component_type" value="pre_processor"/>
+                <conditional name="pre_processors">
+                    <param name="selected_pre_processor" value="RobustScaler"/>
+                </conditional>
+            </conditional>
+            <param name="selected_module" value="xgboost"/>
+            <param name="selected_estimator" value="XGBClassifier"/>
+            <output name="outfile" file="pipeline03" compare="sim_size" delta="1"/>
+        </test>
+        <test>
+            <conditional name="component_selector">
+                <param name="component_type" value="feature_selection"/>
+                <conditional name="fs_algorithm_selector">
+                    <param name="selected_algorithm" value="SelectFromModel"/>
+                    <conditional name="model_inputter">
+                        <conditional name="estimator_selector">
+                            <param name="selected_module" value="ensemble"/>
+                            <param name="selected_estimator" value="AdaBoostClassifier"/>
+                        </conditional>
+                    </conditional>
+                </conditional>
+            </conditional>
+            <section name="final_estimator">
+                <param name="selected_module" value="svm"/>
+                <param name="selected_estimator" value="LinearSVC"/>
+            </section>
+            <output name="outfile" file="pipeline04" compare="sim_size" delta="1"/>
+        </test>
+        <test>
+            <conditional name="component_selector">
+                <param name="component_type" value="None"/>
+            </conditional>
+            <param name="selected_module" value="ensemble"/>
+            <param name="selected_estimator" value="RandomForestRegressor"/>
+            <param name="text_params" value="'n_estimators': 100, 'random_state': 42"/>
+            <output name="outfile" file="pipeline05" compare="sim_size" delta="1"/>
+        </test>
+        <test>
+            <conditional name="component_selector">
+                <param name="component_type" value="decomposition"/>
+                    <conditional name="matrix_decomposition_selector">
+                        <param name="select_algorithm" value="PCA"/>
+                    </conditional>
+            </conditional>
+            <param name="selected_module" value="ensemble"/>
+            <param name="selected_estimator" value="AdaBoostRegressor"/>
+            <output name="outfile" file="pipeline06" compare="sim_size" delta="1"/>
+        </test>
+        <test>
+            <conditional name="component_selector">
+                <param name="component_type" value="kernel_approximation"/>
+                    <conditional name="kernel_approximation_selector">
+                        <param name="select_algorithm" value="RBFSampler"/>
+                        <param name="text_params" value="'n_components': 10, 'gamma': 2.0"/>
+                    </conditional>
+            </conditional>
+            <param name="selected_module" value="ensemble"/>
+            <param name="selected_estimator" value="AdaBoostClassifier"/>
+            <output name="outfile" file="pipeline07" compare="sim_size" delta="1"/>
+        </test>
+        <test>
+            <conditional name="component_selector">
+                <param name="component_type" value="FeatureAgglomeration"/>
+                    <conditional name="FeatureAgglomeration_selector">
+                        <param name="select_algorithm" value="FeatureAgglomeration"/>
+                        <param name="text_params" value="'n_clusters': 3, 'affinity': 'euclidean'"/>
+                    </conditional>
+            </conditional>
+            <param name="selected_module" value="ensemble"/>
+            <param name="selected_estimator" value="AdaBoostClassifier"/>
+            <output name="outfile" file="pipeline08" compare="sim_size" delta="1"/>
+        </test>
+    </tests>
+    <help>
+        <![CDATA[
+**What it does**
+Constructs a pipeline that contains a list of transfroms and a final estimator. Pipeline assembles several steps
+that can be cross-validated together while setting different parameters.
+please refer to `Scikit-learn pipeline Pipeline`_.
+
+**Pre-processing components** allow None, one or a combination of up to 5 transformations from `sklearn.preprocessing`_, `feature_selection`_, `decomposition`_, `kernel_approximation`_ and/or `cluster.FeatureAgglomeration`_.
+
+**Estimator** selector supports estimators from `xgboost`_ and many scikit-learn modules, including `svm`_, `linear_model`_, `ensemble`_, `naive_bayes`_, `tree`_ and `neighbors`_.
+
+
+.. _`Scikit-learn pipeline Pipeline`: http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
+.. _`svm`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.svm
+.. _`linear_model`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.linear_model
+.. _`ensemble`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.ensemble
+.. _`naive_bayes`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.naive_bayes
+.. _`tree`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.tree
+.. _`neighbors`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.neighbors
+.. _`xgboost`: https://xgboost.readthedocs.io/en/latest/python/python_api.html
+
+.. _`sklearn.preprocessing`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing
+.. _`feature_selection`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_selection
+.. _`decomposition`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.decomposition
+.. _`kernel_approximation`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.kernel_approximation
+.. _`cluster.FeatureAgglomeration`: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.FeatureAgglomeration.html
+
+        ]]>
+    </help>
+    <expand macro="sklearn_citation"/>
+</tool>
author	bgruening
date	Sat, 04 Aug 2018 12:14:28 -0400
parents
children	ddd8c51b7302