Mercurial > repos > bgruening > eden_vectorizer
changeset 2:4bf9238ad734 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tools/sklearn commit 6c002ea2995c85f5f16adb2ef1c6be82dfbc5417
author | bgruening |
---|---|
date | Tue, 31 May 2016 16:51:53 -0400 |
parents | 72e5aaebe37f |
children | 49857bc1b594 |
files | main_macros.xml test-data/regression_test.tabular test-data/regression_train.tabular test-data/rfc_model01 test-data/rfc_result01 test-data/rfc_result02 test-data/rfr_model01 test-data/rfr_result01 |
diffstat | 8 files changed, 279 insertions(+), 54 deletions(-) [+] |
line wrap: on
line diff
--- a/main_macros.xml Wed May 04 13:09:29 2016 -0400 +++ b/main_macros.xml Tue May 31 16:51:53 2016 -0400 @@ -14,11 +14,13 @@ </stdio> </xml> + + <!--Generic interface--> <xml name="train_loadConditional" token_train="tabular" token_data="tabular" token_model="txt"> <conditional name="selected_tasks"> <param name="selected_task" type="select" label="Select a Classification Task"> + <option value="train" selected="true">Train a model</option> <option value="load">Load a model and predict</option> - <option value="train" selected="true">Train a model</option> </param> <when value="load"> <param name="infile_model" type="data" format="@MODEL@" label="Models" help="Select a model file." /> @@ -43,18 +45,92 @@ </conditional> </xml> + <xml name="sl_Conditional" token_train="tabular" token_data="tabular" token_model="txt"> + <conditional name="selected_tasks"> + <param name="selected_task" type="select" label="Select a Classification Task"> + <option value="train" selected="true">Train a model</option> + <option value="load">Load a model and predict</option> + </param> + <when value="load"> + <param name="infile_model" type="data" format="@MODEL@" label="Models" help="Select a model file." /> + <param name="infile_data" type="data" format="@DATA@" label="Data (tabular)" help="Select the dataset you want to classify."/> + <conditional name="prediction_options"> + <param name="prediction_option" type="select" label="Select the type of prediction"> + <option value="predict">Predict class labels</option> + <option value="advanced">Include advanced options</option> + </param> + <when value="predict"> + </when> + <when value="advanced"> + </when> + </conditional> + </when> + <when value="train"> + <conditional name="selected_algorithms"> + <yield /> + </conditional> + </when> + </conditional> + </xml> + <xml name="advanced_section"> <section name="options" title="Advanced Options" expanded="False"> <yield /> </section> </xml> - <xml name="tabular_input"> - <param name="infile" type="data" format="tabular" label="Data file with numeric values"/> - <param name="start_column" type="data_column" data_ref="infile" optional="True" label="Select a subset of data. Start column:" /> - <param name="end_column" type="data_column" data_ref="infile" optional="True" label="End column:" /> + + <!--Ensemble methods--> + <xml name="n_estimators" token_default_value="10" token_help=" "> + <param argument="n_estimators" type="integer" optional="true" value="@DEFAULT_VALUE@" label="Number of trees in the forest" help="@HELP@"/> + </xml> + + <xml name="max_depth" token_default_value="" token_help=" "> + <param argument="max_depth" type="integer" optional="true" value="@DEFAULT_VALUE@" label="Maximum depth of the tree" help="@HELP@"/> + </xml> + + <xml name="min_samples_split" token_default_value="2" token_help=" "> + <param argument="min_samples_split" type="integer" optional="true" value="@DEFAULT_VALUE@" label="Maximum depth of the tree" help="@HELP@"/> + </xml> + + <xml name="min_samples_leaf" token_default_value="1" token_help=" "> + <param argument="min_samples_leaf" type="integer" optional="true" value="@DEFAULT_VALUE@" label="Minimum number of samples in newly created leaves" help="@HELP@"/> + </xml> + + <xml name="min_weight_fraction_leaf" token_default_value="0.0" token_help=" "> + <param argument="min_weight_fraction_leaf" type="float" optional="true" value="@DEFAULT_VALUE@" label="Minimum weighted fraction of the input samples required to be at a leaf node" help="@HELP@"/> </xml> + <xml name="max_leaf_nodes" token_default_value="" token_help=" "> + <param argument="max_leaf_nodes" type="integer" optional="true" value="@DEFAULT_VALUE@" label="Maximum number of leaf nodes in best-first method" help="@HELP@"/> + </xml> + + <xml name="bootstrap" token_checked="true" token_help=" "> + <param argument="bootstrap" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="@CHECKED@" label="Use bootstrap samples for building trees." help="@HELP@"/> + </xml> + + <xml name="criterion" token_help=" "> + <param argument="criterion" type="select" label="Function to measure the quality of a split" help=" "> + <option value="gini" selected="true">Gini impurity</option> + <option value="entropy">Information gain</option> + <yield/> + </param> + </xml> + + <xml name="oob_score" token_checked="flase" token_help=" "> + <param argument="oob_score" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="@CHECKED@" label="Use out-of-bag samples to estimate the generalization error" help="@HELP@"/> + </xml> + + <xml name="max_features" token_default_value="auto" token_help="This could be an integer, float, string, or None. For more information please refer to help. "> + <param argument="max_features" type="text" optional="true" value="@DEFAULT_VALUE@" label="Number of features for finding the best split" help="@HELP@"/> + </xml> + + <xml name="learning_rate" token_default_value="1.0" token_help=" "> + <param argument="learning_rate" type="float" optional="true" value="@DEFAULT_VALUE@" label="Learning rate" help="@HELP@"/> + </xml> + + + <!--Parameters--> <xml name="tol" token_default_value="0.0" token_help_text="Early stopping heuristics based on the relative center changes. Set to default (0.0) to disable this convergence detection."> <param argument="tol" type="float" optional="true" value="@DEFAULT_VALUE@" label="Tolerance" help="@HELP_TEXT@"/> </xml> @@ -136,66 +212,71 @@ <xml name="coef0" token_default_value="1" token_label="Zero coefficient" token_help_text=" "> <param argument="coef0" type="integer" optional="true" value="@DEFAULT_VALUE@" label="@LABEL@" help="@HELP_TEXT@"/> - </xml> + </xml> + + <xml name="pos_label" token_default_value=""> + <param argument="pos_label" type="integer" optional="true" value="@DEFAULT_VALUE@" label="Label of the positive class" help=" "/> + </xml> + + <xml name="average"> + <param argument="average" type="select" optional="True" label="Averaging type" help=" "> + <option value="binary" selected="true" help="Only report results for the class specified by pos_label. Applicable only on binary classification.">binary</option> + <option value="micro" help="Calculate metrics globally by counting the total true positives, false negatives and false positives.">micro</option> + <option value="samples" help="Calculate metrics for each instance, and find their average (only meaningful for multilabel).">samples</option> + <!--option value="macro" help=""></option--> + <!--option value="weighted" help=""></option--> + </param> + </xml> + + <xml name="beta"> + <param argument="beta" type="float" value="1.0" label="The strength of recall versus precision in the F-score" help=" "/> + </xml> + + + <!--Data interface--> + <xml name="tabular_input"> + <param name="infile" type="data" format="tabular" label="Data file with numeric values"/> + <param name="start_column" type="data_column" data_ref="infile" optional="True" label="Select a subset of data. Start column:" /> + <param name="end_column" type="data_column" data_ref="infile" optional="True" label="End column:" /> + </xml> + + <xml name="sample_cols" token_label1="File containing true class labels:" token_label2="File containing predicted class labels:" token_multiple1="False" token_multiple2="False" token_format1="tabular" token_format2="tabular" token_help1="" token_help2=""> + <param name="infile1" type="data" format="@FORMAT1@" label="@LABEL1@" help="@HELP1@"/> + <param name="col1" multiple="@MULTIPLE1@" type="data_column" data_ref="infile1" label="Select target column(s):"/> + <param name="infile2" type="data" format="@FORMAT2@" label="@LABEL2@" help="@HELP2@"/> + <param name="col2" multiple="@MULTIPLE2@" type="data_column" data_ref="infile2" label="Select target column(s):"/> + <yield/> + </xml> <xml name="multiple_input" token_name="input_files" token_max_num="10" token_format="txt" token_label="Sparse matrix file (.mtx, .txt)" token_help_text="Specify a sparse matrix file in .txt format."> <repeat name="@NAME@" min="1" max="@MAX_NUM@" title="Select input file(s):"> <param name="input" type="data" format="@FORMAT@" label="@LABEL@" help="@HELP_TEXT@"/> </repeat> - </xml> + </xml> - <xml name="eden_citation"> - <citations> - <citation type="bibtex"> - @misc{fabrizio_costa_2015_15094, - author = {Fabrizio Costa and - Björn Grüning and - gigolo}, - title = {EDeN: EDeN - Graph Vectorizer}, - month = feb, - year = 2015, - doi = {10.5281/zenodo.15094}, - url = {http://dx.doi.org/10.5281/zenodo.15094} - } - } - </citation> - </citations> + <xml name="sparse_target" token_label1="Select a sparse matrix:" token_label2="Select the tabular containing true labels:" token_multiple="False" token_format1="txt" token_format2="tabular" token_help1="" token_help2=""> + <param name="infile1" type="data" format="@FORMAT1@" label="@LABEL1@" help="@HELP1@"/> + <param name="infile2" type="data" format="@FORMAT2@" label="@LABEL2@" help="@HELP2@"/> + <param name="col2" multiple="@MULTIPLE@" type="data_column" data_ref="infile2" label="Select target column(s):"/> </xml> - <xml name="sklearn_citation"> - <citations> - <citation type="bibtex"> - @article{scikit-learn, - title={Scikit-learn: Machine Learning in {P}ython}, - author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. - and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P. - and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and - Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.}, - journal={Journal of Machine Learning Research}, - volume={12}, - pages={2825--2830}, - year={2011} - url = {https://github.com/scikit-learn/scikit-learn} - } - </citation> - </citations> - </xml> - - <xml name="scipy_citation"> - <citations> - <citation type="bibtex"> - @Misc{, - author = {Eric Jones and Travis Oliphant and Pearu Peterson and others}, - title = {{SciPy}: Open source scientific tools for {Python}}, - year = {2001--}, - url = "http://www.scipy.org/", - note = {[Online; accessed 2016-04-09]} - } - </citation> - </citations> + <xml name="sl_mixed_input"> + <conditional name="input_options"> + <param name="selected_input" type="select" label="Select input type:"> + <option value="tabular" selected="true">tabular data</option> + <option value="sparse">sparse matrix</option> + </param> + <when value="tabular"> + <expand macro="sample_cols" multiple1="true"/> + </when> + <when value="sparse"> + <expand macro="sparse_target"/> + </when> + </conditional> </xml> + <!--Advanced options--> <xml name="nn_advanced_options"> <section name="options" title="Advanced Options" expanded="False"> <yield/> @@ -442,4 +523,115 @@ </when> </xml> + <xml name="sparse_preprocessors"> + <param name="selected_pre_processor" type="select" label="Select a preprocessor:"> + <option value="StandardScaler" selected="true">Standardize features by removing the mean and scaling to unit variance</option> + <option value="Binarizer">Binarize data</option> + <option value="Imputer">Complete missing values</option> + <option value="MaxAbsScaler">Scale features by their maximum absolute value</option> + <option value="Normalizer">Normalize samples individually to unit norm</option> + <yield/> + </param> + </xml> + + <xml name="sparse_preprocessor_options"> + <when value="Binarizer"> + <section name="options" title="Advanced Options" expanded="False"> + <param argument="copy" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" label="Use a copy of data for precomputing binarization" help=" "/> + <param argument="threshold" type="float" optional="true" value="0.0" label="Threshold" help="Feature values below or equal to this are replaced by 0, above it by 1. Threshold may not be less than 0 for operations on sparse matrices. "/> + </section> + </when> + <when value="Imputer"> + <section name="options" title="Advanced Options" expanded="False"> + <param argument="copy" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" label="Use a copy of data for precomputing imputation" help=" "/> + <param argument="strategy" type="select" optional="true" label="Imputation strategy" help=" "> + <option value="mean" selected="true">Replace missing values using the mean along the axis</option> + <option value="median">Replace missing values using the median along the axis</option> + <option value="most_frequent">Replace missing using the most frequent value along the axis</option> + </param> + <param argument="missing_values" type="text" optional="true" value="NaN" label="Placeholder for missing values" help="For missing values encoded as numpy.nan, use the string value “NaN”"/> + <param argument="axis" type="select" optional="true" label="The axis along which to impute" help=" "> + <option value="0" selected="true">Impute along columns</option> + <option value="1">Impute along rows</option> + </param> + </section> + </when> + <when value="StandardScaler"> + <section name="options" title="Advanced Options" expanded="False"> + <param argument="copy" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" label="Use a copy of data for performing inplace scaling" help=" "/> + <param argument="with_mean" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" label="Center the data before scaling" help=" "/> + <param argument="with_std" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" label="Scale the data to unit variance (or unit standard deviation)" help=" "/> + </section> + </when> + <when value="MaxAbsScaler"> + <section name="options" title="Advanced Options" expanded="False"> + <param argument="copy" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" label="Use a copy of data for precomputing scaling" help=" "/> + </section> + </when> + <when value="Normalizer"> + <section name="options" title="Advanced Options" expanded="False"> + <param argument="norm" type="select" optional="true" label="The norm to use to normalize non zero samples" help=" "> + <option value="l1" selected="true">l1</option> + <option value="l2">l2</option> + <option value="max">max</option> + <param argument="copy" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" label="Use a copy of data for precomputing row normalization" help=" "/> + </param> + </section> + </when> + <yield/> + </xml> + + + <!--Citations--> + <xml name="eden_citation"> + <citations> + <citation type="bibtex"> + @misc{fabrizio_costa_2015_15094, + author = {Fabrizio Costa and + Björn Grüning and + gigolo}, + title = {EDeN: EDeN - Graph Vectorizer}, + month = feb, + year = 2015, + doi = {10.5281/zenodo.15094}, + url = {http://dx.doi.org/10.5281/zenodo.15094} + } + } + </citation> + </citations> + </xml> + + <xml name="sklearn_citation"> + <citations> + <citation type="bibtex"> + @article{scikit-learn, + title={Scikit-learn: Machine Learning in {P}ython}, + author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. + and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P. + and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and + Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.}, + journal={Journal of Machine Learning Research}, + volume={12}, + pages={2825--2830}, + year={2011} + url = {https://github.com/scikit-learn/scikit-learn} + } + </citation> + </citations> + </xml> + + <xml name="scipy_citation"> + <citations> + <citation type="bibtex"> + @Misc{, + author = {Eric Jones and Travis Oliphant and Pearu Peterson and others}, + title = {{SciPy}: Open source scientific tools for {Python}}, + year = {2001--}, + url = "http://www.scipy.org/", + note = {[Online; accessed 2016-04-09]} + } + </citation> + </citations> + </xml> + </macros> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/regression_test.tabular Tue May 31 16:51:53 2016 -0400 @@ -0,0 +1,5 @@ +86.9702122735 1.00532111569 -1.01739601979 -0.613139481654 0.641846874331 +91.2021798817 -0.621522971207 1.11914889596 0.390012184498 1.28956938152 +-47.4101632272 -0.638416457964 -0.732777468453 -0.864026104978 -1.06109770116 +61.7128046302 -1.09994800577 -0.739679672932 0.585657963012 1.48906827536 +-206.998295124 0.130238853011 0.70574123041 1.33206565264 -1.33220923738
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/regression_train.tabular Tue May 31 16:51:53 2016 -0400 @@ -0,0 +1,10 @@ +143.762620712 -0.330941870584 -1.17964571928 0.47944415578 -0.0486946279099 1.57951239219 +-88.5787166225 1.08055532812 -2.57109184022 -0.92512305494 0.317511276982 -1.202358944 +-82.8452345578 0.272541389247 -0.168636324107 0.923988150154 -0.467750945768 -0.719169535969 +72.4951388149 -0.268686605278 0.991068834926 0.731619322189 1.17038734294 0.323842059244 +11.805182128 1.03604670966 -0.709685560786 -1.54916691211 -0.614757954242 0.24176665894 +-63.9354970901 -0.101485840571 0.984112210822 -2.01704822953 0.282058758309 -0.776448499847 +126.325840796 -0.359998340179 0.353534448839 -1.23256828198 0.563632964937 1.15031170568 +23.0341392692 0.518540465136 1.03188231893 -2.53173026594 -0.0419267228327 0.193734455015 +67.6714937696 -0.115688051547 -0.821437865172 -0.368962397052 -0.526743874023 0.94315222831 +47.3927584881 -0.785096541368 -0.0942409319417 0.224267378731 -1.63317786831 1.26458811586
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/rfc_result01 Tue May 31 16:51:53 2016 -0400 @@ -0,0 +1,6 @@ +0 1 2 3 0 +3.68258022948 2.82110345641 -3.990140724 -1.9523364774 1 +0.015942057224 -0.711958594347 0.125502976978 -0.972218263337 0 +2.08690768825 0.929399321468 -2.12924084484 -1.99714022188 1 +1.41321052084 0.523750660422 -1.4210539291 -1.49298569451 1 +0.76831404394 1.38267855169 -0.989045048734 0.649504257894 1
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/rfc_result02 Tue May 31 16:51:53 2016 -0400 @@ -0,0 +1,6 @@ +0 1 2 3 0 +3.68258022948 2.82110345641 -3.990140724 -1.9523364774 1 +0.015942057224 -0.711958594347 0.125502976978 -0.972218263337 0 +2.08690768825 0.929399321468 -2.12924084484 -1.99714022188 1 +1.41321052084 0.523750660422 -1.4210539291 -1.49298569451 1 +0.76831404394 1.38267855169 -0.989045048734 0.649504257894 1
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/rfr_result01 Tue May 31 16:51:53 2016 -0400 @@ -0,0 +1,6 @@ +0 1 2 3 4 0 +86.9702122735 1.00532111569 -1.01739601979 -0.613139481654 0.641846874331 0.867517611177 +91.2021798817 -0.621522971207 1.11914889596 0.390012184498 1.28956938152 0.851121328511 +-47.4101632272 -0.638416457964 -0.732777468453 -0.864026104978 -1.06109770116 0.0534409530407 +61.7128046302 -1.09994800577 -0.739679672932 0.585657963012 1.48906827536 1.18927597457 +-206.998295124 0.130238853011 0.70574123041 1.33206565264 -1.33220923738 -0.350236265367