Mercurial > repos > bgruening > sklearn_feature_selection
view feature_selection.xml @ 34:b9d86fc6359d draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 9981e25b00de29ed881b2229a173a8c812ded9bb
author | bgruening |
---|---|
date | Wed, 09 Aug 2023 11:58:39 +0000 |
parents | 2ac77e0aec82 |
children |
line wrap: on
line source
<tool id="sklearn_feature_selection" name="Feature Selection" version="@VERSION@" profile="@PROFILE@">
    <description>module, including univariate filter selection methods and recursive feature elimination algorithm</description>
    <macros>
        <import>main_macros.xml</import>
    </macros>
    <expand macro="python_requirements" />
    <!--TODO: Add imblearn package support-->
    <expand macro="macro_stdio" />
    <version_command>echo "@VERSION@"</version_command>
    <!-- Runs the generated Python script below; the tool parameters are passed as a JSON file. -->
    <command>
        <![CDATA[
        python "$feature_selection_script" '$inputs'
        ]]>
    </command>
    <configfiles>
        <inputs name="inputs" />
        <!--
            Cheetah-templated Python script. Lines starting with "#if", "#elif" and
            "#end if" are resolved by Galaxy before Python runs, and lines starting
            with "##" are template comments that never reach the generated script.
            Steps: load the JSON parameters, read features (X) and labels (y),
            build the selector, fit it unless a prefitted SelectFromModel estimator
            was supplied, write the selected columns, and optionally save the
            fitted selector.
        -->
        <configfile name="feature_selection_script">
            <![CDATA[
import json
import numpy as np
import sklearn.feature_selection
import skrebate
import pandas
import sys
import warnings
import xgboost
from scipy.io import mmread
from sklearn import (
    cluster, compose, decomposition, ensemble, feature_extraction,
    feature_selection, gaussian_process, kernel_approximation, metrics,
    model_selection, naive_bayes, neighbors, pipeline, preprocessing,
    svm, linear_model, tree, discriminant_analysis)
from imblearn.pipeline import Pipeline as imbPipeline
from sklearn.pipeline import Pipeline

from galaxy_ml.model_persist import dump_model_to_h5
from galaxy_ml.utils import (SafeEval, feature_selector,
                             read_columns, get_module)

warnings.simplefilter('ignore')

safe_eval = SafeEval()

input_json_path = sys.argv[1]
with open(input_json_path, 'r') as param_handler:
    params = json.load(param_handler)

## handle cheetah
## The template values below overwrite the matching entries loaded from the
## JSON parameter file.
#if $fs_algorithm_selector.selected_algorithm == 'SelectFromModel'\
        and $fs_algorithm_selector.model_inputter.input_mode == 'prefitted':
params['fs_algorithm_selector']['model_inputter']['fitted_estimator'] =\
        '$fs_algorithm_selector.model_inputter.fitted_estimator'
#end if

#if $fs_algorithm_selector.selected_algorithm == 'SelectFromModel'\
        and $fs_algorithm_selector.model_inputter.input_mode == 'new'\
        and $fs_algorithm_selector.model_inputter.estimator_selector.selected_module == 'custom_estimator':
params['fs_algorithm_selector']['model_inputter']['estimator_selector']['c_estimator'] =\
        '$fs_algorithm_selector.model_inputter.estimator_selector.c_estimator'
#end if

#if $fs_algorithm_selector.selected_algorithm in ['RFE', 'RFECV', 'DyRFECV']\
        and $fs_algorithm_selector.estimator_selector.selected_module == 'custom_estimator':
params['fs_algorithm_selector']['estimator_selector']['c_estimator'] =\
        '$fs_algorithm_selector.estimator_selector.c_estimator'
#end if

#if $fs_algorithm_selector.selected_algorithm in ['RFECV', 'DyRFECV']\
        and $fs_algorithm_selector.options.cv_selector.selected_cv\
        in ['GroupKFold', 'GroupShuffleSplit', 'LeaveOneGroupOut', 'LeavePGroupsOut']:
params['fs_algorithm_selector']['options']['cv_selector']['groups_selector']['infile_g'] =\
        '$fs_algorithm_selector.options.cv_selector.groups_selector.infile_g'
#end if

## Read features
features_has_header = params['input_options']['header1']
#if $input_options.selected_input == 'tabular'
header = 'infer' if features_has_header else None
column_option = params['input_options']['column_selector_options_1']['selected_column_selector_option']
if column_option in ['by_index_number', 'all_but_by_index_number', 'by_header_name', 'all_but_by_header_name']:
    c = params['input_options']['column_selector_options_1']['col1']
else:
    c = None
X, input_df = read_columns(
    '$input_options.infile1',
    c=c,
    c_option=column_option,
    return_df=True,
    sep='\t',
    header=header,
    parse_dates=True,
)
X = X.astype(float)
#elif $input_options.selected_input == 'seq_fasta'
fasta_file = '$input_options.fasta_file'
pyfaidx = get_module('pyfaidx')
sequences = pyfaidx.Fasta(fasta_file)
n_seqs = len(sequences.keys())
## X holds one row index per sequence in the fasta file.
X = np.arange(n_seqs)[:, np.newaxis]
## NOTE(review): estimator_params and estimator are not defined anywhere in
## this script, so this branch cannot succeed as written. Confirm how the
## fasta path is meant to reach the estimator wrapped by the selector.
for param in estimator_params.keys():
    if param.endswith('fasta_path'):
        estimator.set_params(
            **{param: fasta_file})
        ## Stop at the first match so the for/else error below only fires
        ## when no parameter accepts a fasta path.
        break
else:
    raise ValueError(
        "The selected estimator doesn't support "
        "fasta file input! Please consider using "
        "KerasGBatchClassifier with "
        "FastaDNABatchGenerator/FastaProteinBatchGenerator "
        "or having GenomeOneHotEncoder/ProteinOneHotEncoder "
        "in pipeline!")
#elif $input_options.selected_input == 'sparse'
X = mmread('$input_options.infile1')
#end if

## Read labels
header = 'infer' if params['input_options']['header2'] else None
column_option = params['input_options']['column_selector_options_2']['selected_column_selector_option2']
if column_option in ['by_index_number', 'all_but_by_index_number', 'by_header_name', 'all_but_by_header_name']:
    c = params['input_options']['column_selector_options_2']['col2']
else:
    c = None
y = read_columns(
    '$input_options.infile2',
    c=c,
    c_option=column_option,
    sep='\t',
    header=header,
    parse_dates=True,
)
y = y.ravel()

## Create feature selector
new_selector = feature_selector(params['fs_algorithm_selector'], X=X, y=y)
## Fit in every case except SelectFromModel with a prefitted estimator.
if params['fs_algorithm_selector']['selected_algorithm'] != 'SelectFromModel'\
        or params['fs_algorithm_selector']['model_inputter']['input_mode'] != 'prefitted':
    new_selector.fit(X, y)

## Transform to select features
selected_names = None
res = new_selector.transform(X)
if features_has_header:
    ## Keep the original column names of the features that were selected.
    selected_names = input_df.columns[new_selector.get_support(indices=True)]
res = pandas.DataFrame(res, columns=selected_names)
res.to_csv(path_or_buf='$outfile', sep='\t', index=False)

#if $save:
dump_model_to_h5(new_selector, '$outfile_selector')
#end if
            ]]>
        </configfile>
    </configfiles>
    <inputs>
        <expand macro="feature_selection_fs" />
        <param name="save" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="false" label="Save the fitted selector?" />
        <expand macro="sl_mixed_input_plus_sequence" />
    </inputs>
    <outputs>
        <data format="tabular" name="outfile" />
        <!-- Only produced when the "save" parameter is checked. -->
        <data format="h5mlm" name="outfile_selector" label="${fs_algorithm_selector.selected_algorithm}">
            <filter>save</filter>
        </data>
    </outputs>
    <tests>
        <test>
            <param name="selected_algorithm" value="SelectFromModel" />
            <param name="input_mode" value="new" />
            <param name="selected_module" value="ensemble" />
            <param name="selected_estimator" value="RandomForestRegressor" />
            <param name="text_params" value="n_estimators=10, random_state=10" />
            <param name="infile1" value="regression_train.tabular" ftype="tabular" />
            <param name="header1" value="false" />
            <param name="col1" value="1,2,3,4,5" />
            <param name="infile2" value="regression_train.tabular" ftype="tabular" />
            <param name="col2" value="6" />
            <param name="header2" value="false" />
            <output name="outfile" file="feature_selection_result01" />
        </test>
        <test>
            <param name="selected_algorithm" value="GenericUnivariateSelect" />
            <param name="param" value="20" />
            <param name="infile1" value="regression_X.tabular" ftype="tabular" />
            <param name="header1" value="True" />
            <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17" />
            <param name="infile2" value="regression_y.tabular" ftype="tabular" />
            <param name="col2" value="1" />
            <param name="header2" value="True" />
            <output name="outfile" file="feature_selection_result02" />
        </test>
        <test>
            <param name="selected_algorithm" value="SelectPercentile" />
            <param name="infile1" value="regression_X.tabular" ftype="tabular" />
            <param name="header1" value="True" />
            <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17" />
            <param name="infile2" value="regression_y.tabular" ftype="tabular" />
            <param name="col2" value="1" />
            <param name="header2" value="True" />
            <output name="outfile" file="feature_selection_result03" />
        </test>
        <test>
            <param name="selected_algorithm" value="SelectKBest" />
            <param name="infile1" value="regression_X.tabular" ftype="tabular" />
            <param name="header1" value="True" />
            <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17" />
            <param name="infile2" value="regression_y.tabular" ftype="tabular" />
            <param name="col2" value="1" />
            <param name="header2" value="True" />
            <output name="outfile" file="feature_selection_result04" />
        </test>
        <test>
            <param name="selected_algorithm" value="SelectFpr" />
            <param name="alpha" value="0.05" />
            <param name="infile1" value="regression_X.tabular" ftype="tabular" />
            <param name="header1" value="True" />
            <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17" />
            <param name="infile2" value="regression_y.tabular" ftype="tabular" />
            <param name="col2" value="1" />
            <param name="header2" value="True" />
            <output name="outfile" file="feature_selection_result05" />
        </test>
        <test>
            <param name="selected_algorithm" value="SelectFdr" />
            <param name="alpha" value="0.05" />
            <param name="infile1" value="regression_X.tabular" ftype="tabular" />
            <param name="header1" value="True" />
            <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17" />
            <param name="infile2" value="regression_y.tabular" ftype="tabular" />
            <param name="col2" value="1" />
            <param name="header2" value="True" />
            <output name="outfile" file="feature_selection_result06" />
        </test>
        <test>
            <param name="selected_algorithm" value="SelectFwe" />
            <param name="alpha" value="0.05" />
            <param name="infile1" value="regression_X.tabular" ftype="tabular" />
            <param name="header1" value="True" />
            <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17" />
            <param name="infile2" value="regression_y.tabular" ftype="tabular" />
            <param name="col2" value="1" />
            <param name="header2" value="True" />
            <output name="outfile" file="feature_selection_result07" />
        </test>
        <test>
            <param name="selected_algorithm" value="RFE" />
            <param name="input_mode" value="new" />
            <param name="selected_module" value="ensemble" />
            <param name="selected_estimator" value="RandomForestRegressor" />
            <param name="text_params" value="n_estimators=10, random_state=10" />
            <param name="infile1" value="regression_train.tabular" ftype="tabular" />
            <param name="header1" value="false" />
            <param name="col1" value="1,2,3,4,5" />
            <param name="infile2" value="regression_train.tabular" ftype="tabular" />
            <param name="col2" value="6" />
            <param name="header2" value="false" />
            <output name="outfile" file="feature_selection_result08" />
        </test>
        <test>
            <param name="selected_algorithm" value="RFECV" />
            <param name="input_mode" value="new" />
            <param name="selected_module" value="ensemble" />
            <param name="selected_estimator" value="RandomForestRegressor" />
            <param name="text_params" value="n_estimators=10, random_state=10" />
            <param name="infile1" value="regression_train.tabular" ftype="tabular" />
            <param name="header1" value="false" />
            <param name="col1" value="1,2,3,4,5" />
            <param name="infile2" value="regression_train.tabular" ftype="tabular" />
            <param name="col2" value="6" />
            <param name="header2" value="false" />
            <output name="outfile" file="feature_selection_result09" />
        </test>
        <test>
            <param name="selected_algorithm" value="VarianceThreshold" />
            <param name="threshold" value="0.1" />
            <param name="infile1" value="regression_X.tabular" ftype="tabular" />
            <param name="header1" value="True" />
            <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17" />
            <param name="infile2" value="regression_y.tabular" ftype="tabular" />
            <param name="col2" value="1" />
            <param name="header2" value="True" />
            <output name="outfile" file="feature_selection_result10" />
        </test>
        <test>
            <param name="selected_algorithm" value="SelectKBest" />
            <param name="k" value="3" />
            <param name="infile1" value="test3.tabular" ftype="tabular" />
            <param name="header1" value="True" />
            <param name="selected_column_selector_option" value="all_but_by_header_name" />
            <param name="col1" value="target" />
            <param name="infile2" value="test3.tabular" ftype="tabular" />
            <param name="header2" value="True" />
            <param name="selected_column_selector_option2" value="by_header_name" />
            <param name="col2" value="target" />
            <output name="outfile" file="feature_selection_result11" />
        </test>
        <test>
            <param name="selected_algorithm" value="SelectFromModel" />
            <param name="input_mode" value="prefitted" />
            <param name="fitted_estimator" value="searchCV03" ftype="h5mlm" />
            <param name="infile1" value="regression_X.tabular" ftype="tabular" />
            <param name="header1" value="true" />
            <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17" />
            <param name="infile2" value="regression_y.tabular" ftype="tabular" />
            <param name="col2" value="1" />
            <param name="header2" value="true" />
            <output name="outfile" file="feature_selection_result12" />
        </test>
        <test>
            <param name="selected_algorithm" value="RFECV" />
            <param name="input_mode" value="new" />
            <param name="selected_module" value="ensemble" />
            <param name="selected_estimator" value="RandomForestRegressor" />
            <param name="text_params" value="n_estimators=10, random_state=10" />
            <section name="groups_selector">
                <param name="infile_groups" value="regression_y.tabular" ftype="tabular" />
                <param name="header_g" value="true" />
                <param name="selected_column_selector_option_g" value="by_index_number" />
                <param name="col_g" value="1" />
            </section>
            <param name="selected_cv" value="GroupShuffleSplit" />
            <param name="random_state" value="0" />
            <param name="infile1" value="regression_X.tabular" ftype="tabular" />
            <param name="header1" value="true" />
            <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17" />
            <param name="infile2" value="regression_y.tabular" ftype="tabular" />
            <param name="col2" value="1" />
            <param name="header2" value="true" />
            <output name="outfile" file="feature_selection_result13" />
        </test>
    </tests>
    <help>
        <![CDATA[
**What it does**
This tool selects a subset of the input features using the feature selection
methods of scikit-learn. It offers univariate filter methods
(GenericUnivariateSelect, SelectPercentile, SelectKBest, SelectFpr, SelectFdr,
SelectFwe), variance-based filtering (VarianceThreshold), model-based selection
(SelectFromModel, with either a new or a prefitted estimator) and recursive
feature elimination (RFE, RFECV, DyRFECV).

The selected feature columns are written to a tabular dataset. Optionally, the
fitted selector can be saved for later reuse.

This tool is based on the sklearn.feature_selection package.
For information about feature selection methods and their parameter settings please refer to `Scikit-learn feature selection`_.

.. _`Scikit-learn feature selection`: http://scikit-learn.org/stable/modules/feature_selection.html
        ]]>
    </help>
    <expand macro="sklearn_citation">
        <expand macro="skrebate_citation" />
        <expand macro="xgboost_citation" />
    </expand>
</tool>