Mercurial > repos > bgruening > sklearn_sample_generator

--- a/main_macros.xml	Wed May 04 13:10:15 2016 -0400
+++ b/main_macros.xml	Tue May 31 16:52:37 2016 -0400
@@ -14,11 +14,13 @@
     </stdio>
   </xml>

+
+  <!--Generic interface-->
   <xml name="train_loadConditional" token_train="tabular" token_data="tabular" token_model="txt">
     <conditional name="selected_tasks">
         <param name="selected_task" type="select" label="Select a Classification Task">
+            <option value="train" selected="true">Train a model</option>
             <option value="load">Load a model and predict</option>
-            <option value="train" selected="true">Train a model</option>
         </param>
         <when value="load">
             <param name="infile_model" type="data" format="@MODEL@" label="Models" help="Select a model file." />
@@ -43,18 +45,92 @@
     </conditional>
   </xml>

+  <xml name="sl_Conditional" token_train="tabular" token_data="tabular" token_model="txt"> <!-- Train-or-load selector. token_model and token_data set the accepted formats of the two inputs in the load branch; token_train is declared but not referenced inside this macro. -->
+    <conditional name="selected_tasks">
+        <param name="selected_task" type="select" label="Select a Classification Task">
+            <option value="train" selected="true">Train a model</option>
+            <option value="load">Load a model and predict</option>
+        </param>
+        <when value="load">
+            <param name="infile_model" type="data" format="@MODEL@" label="Models" help="Select a model file." />
+            <param name="infile_data" type="data" format="@DATA@" label="Data (tabular)" help="Select the dataset you want to classify."/>
+            <conditional name="prediction_options">
+                <param name="prediction_option" type="select" label="Select the type of prediction">
+                    <option value="predict">Predict class labels</option>
+                    <option value="advanced">Include advanced options</option>
+                </param>
+                <when value="predict"> <!-- this choice adds no further inputs -->
+                </when>
+                <when value="advanced"> <!-- no further inputs are defined for this choice in this macro -->
+                </when>
+            </conditional>
+        </when>
+        <when value="train">
+            <conditional name="selected_algorithms"> <!-- content of this conditional is supplied by the expanding tool through the yield below -->
+                <yield />
+            </conditional>
+        </when>
+    </conditional>
+  </xml>
+
   <xml name="advanced_section">
     <section name="options" title="Advanced Options" expanded="False">
       <yield />
     </section>
   </xml>

-  <xml name="tabular_input">
-    <param name="infile" type="data" format="tabular" label="Data file with numeric values"/>
-    <param name="start_column" type="data_column" data_ref="infile" optional="True" label="Select a subset of data. Start column:" />
-    <param name="end_column" type="data_column" data_ref="infile" optional="True" label="End column:" />
+
+  <!--Ensemble methods-->
+  <xml name="n_estimators" token_default_value="10" token_help=" "> <!-- Optional integer input for n_estimators; its initial value and help text come from token_default_value and token_help. -->
+    <param argument="n_estimators" type="integer" optional="true" value="@DEFAULT_VALUE@" label="Number of trees in the forest" help="@HELP@"/>
+  </xml>
+
+  <xml name="max_depth" token_default_value="" token_help=" "> <!-- Optional integer input for max_depth; token_default_value is empty by default, so the field starts without a value. -->
+    <param argument="max_depth" type="integer" optional="true" value="@DEFAULT_VALUE@" label="Maximum depth of the tree" help="@HELP@"/>
+  </xml>
+
+  <xml name="min_samples_split" token_default_value="2" token_help=" "> <!-- Optional integer input for min_samples_split; initial value and help text come from token_default_value and token_help. -->
+    <param argument="min_samples_split" type="integer" optional="true" value="@DEFAULT_VALUE@" label="Minimum number of samples required to split an internal node" help="@HELP@"/>
+  </xml>
+
+  <xml name="min_samples_leaf" token_default_value="1" token_help=" "> <!-- Optional integer input for min_samples_leaf; initial value and help text come from token_default_value and token_help. -->
+    <param argument="min_samples_leaf" type="integer" optional="true" value="@DEFAULT_VALUE@" label="Minimum number of samples in newly created leaves" help="@HELP@"/>
+  </xml>
+
+  <xml name="min_weight_fraction_leaf" token_default_value="0.0" token_help=" "> <!-- Optional float input for min_weight_fraction_leaf; initial value and help text come from token_default_value and token_help. -->
+    <param argument="min_weight_fraction_leaf" type="float" optional="true" value="@DEFAULT_VALUE@" label="Minimum weighted fraction of the input samples required to be at a leaf node" help="@HELP@"/>
   </xml>

+  <xml name="max_leaf_nodes" token_default_value="" token_help=" "> <!-- Optional integer input for max_leaf_nodes; token_default_value is empty by default, so the field starts without a value. -->
+    <param argument="max_leaf_nodes" type="integer" optional="true" value="@DEFAULT_VALUE@" label="Maximum number of leaf nodes in best-first method" help="@HELP@"/>
+  </xml>
+
+  <xml name="bootstrap" token_checked="true" token_help=" "> <!-- Boolean input for bootstrap; token_checked sets the initial state. NOTE(review): falsevalue is spelled 'boolflase' here and in the other boolean params of this file; confirm what the consuming scripts expect before changing it. -->
+    <param argument="bootstrap" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="@CHECKED@" label="Use bootstrap samples for building trees." help="@HELP@"/>
+  </xml>
+
+  <xml name="criterion" token_help=" "> <!-- Select input for the split quality criterion; help text comes from token_help, and the expanding tool may append further options through the yield. -->
+    <param argument="criterion" type="select" label="Function to measure the quality of a split" help="@HELP@">
+        <option value="gini" selected="true">Gini impurity</option>
+        <option value="entropy">Information gain</option>
+        <yield/>
+    </param>
+  </xml>
+
+  <xml name="oob_score" token_checked="false" token_help=" "> <!-- Boolean input for oob_score, unchecked unless the expanding tool overrides token_checked. NOTE(review): falsevalue keeps the file-wide spelling 'boolflase'; confirm what the consuming scripts expect before changing it. -->
+    <param argument="oob_score" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="@CHECKED@" label="Use out-of-bag samples to estimate the generalization error" help="@HELP@"/>
+  </xml>
+
+  <xml name="max_features" token_default_value="auto" token_help="This could be an integer, float, string, or None. For more information please refer to help. "> <!-- Free-text input for max_features; it is typed as text because, per the help token, the value may be an integer, float, string, or None. -->
+    <param argument="max_features" type="text" optional="true" value="@DEFAULT_VALUE@" label="Number of features for finding the best split" help="@HELP@"/>
+  </xml>
+
+  <xml name="learning_rate" token_default_value="1.0" token_help=" "> <!-- Optional float input for learning_rate; initial value and help text come from token_default_value and token_help. -->
+    <param argument="learning_rate" type="float" optional="true" value="@DEFAULT_VALUE@" label="Learning rate" help="@HELP@"/>
+  </xml>
+
+
+  <!--Parameters-->
   <xml name="tol" token_default_value="0.0" token_help_text="Early stopping heuristics based on the relative center changes. Set to default (0.0) to disable this convergence detection.">
         <param argument="tol" type="float" optional="true" value="@DEFAULT_VALUE@" label="Tolerance" help="@HELP_TEXT@"/>
   </xml>
@@ -136,66 +212,71 @@

   <xml name="coef0" token_default_value="1" token_label="Zero coefficient" token_help_text=" ">
     <param argument="coef0" type="integer" optional="true" value="@DEFAULT_VALUE@" label="@LABEL@" help="@HELP_TEXT@"/>
-  </xml>
+  </xml>
+
+  <xml name="pos_label" token_default_value=""> <!-- Optional integer input for pos_label; token_default_value is empty by default, so the field starts without a value. -->
+    <param argument="pos_label" type="integer" optional="true" value="@DEFAULT_VALUE@" label="Label of the positive class" help=" "/>
+  </xml>
+
+  <xml name="average"> <!-- Optional select input for the averaging type; binary is preselected. The macro and weighted choices are present only as commented-out options. -->
+    <param argument="average" type="select" optional="True" label="Averaging type" help=" ">
+      <option value="binary" selected="true" help="Only report results for the class specified by pos_label. Applicable only on binary classification.">binary</option>
+      <option value="micro" help="Calculate metrics globally by counting the total true positives, false negatives and false positives.">micro</option>
+      <option value="samples" help="Calculate metrics for each instance, and find their average (only meaningful for multilabel).">samples</option>
+      <!--option value="macro" help=""></option-->
+      <!--option value="weighted" help=""></option-->
+    </param>
+  </xml>
+
+  <xml name="beta"> <!-- Float input for beta with a fixed initial value of 1.0; unlike the neighbouring macros it takes no tokens and is not marked optional. -->
+    <param argument="beta" type="float" value="1.0" label="The strength of recall versus precision in the F-score" help=" "/>
+  </xml>
+
+
+  <!--Data interface-->
+  <xml name="tabular_input"> <!-- One tabular dataset plus two optional column selectors, both bound to that dataset through data_ref, for picking a start and end column. -->
+    <param name="infile" type="data" format="tabular" label="Data file with numeric values"/>
+    <param name="start_column" type="data_column" data_ref="infile" optional="True" label="Select a subset of data. Start column:" />
+    <param name="end_column" type="data_column" data_ref="infile" optional="True" label="End column:" />
+  </xml>
+
+  <xml name="sample_cols" token_label1="File containing true class labels:" token_label2="File containing predicted class labels:" token_multiple1="False" token_multiple2="False" token_format1="tabular" token_format2="tabular" token_help1="" token_help2=""> <!-- Two dataset inputs, each followed by a column selector bound to it through data_ref. Tokens suffixed 1 and 2 set the label, format, help and multi-column flag of the first and second pair; extra inputs can be appended through the yield. -->
+    <param name="infile1" type="data" format="@FORMAT1@" label="@LABEL1@" help="@HELP1@"/>
+    <param name="col1" multiple="@MULTIPLE1@" type="data_column" data_ref="infile1" label="Select target column(s):"/>
+    <param name="infile2" type="data" format="@FORMAT2@" label="@LABEL2@" help="@HELP2@"/>
+    <param name="col2" multiple="@MULTIPLE2@" type="data_column" data_ref="infile2" label="Select target column(s):"/>
+    <yield/>
+  </xml>

   <xml name="multiple_input" token_name="input_files" token_max_num="10" token_format="txt" token_label="Sparse matrix file (.mtx, .txt)" token_help_text="Specify a sparse matrix file in .txt format.">
     <repeat name="@NAME@" min="1" max="@MAX_NUM@" title="Select input file(s):">
         <param name="input" type="data" format="@FORMAT@" label="@LABEL@" help="@HELP_TEXT@"/>
     </repeat>
-  </xml>
+  </xml>

-  <xml name="eden_citation">
-    <citations>
-        <citation type="bibtex">
-            @misc{fabrizio_costa_2015_15094,
-              author       = {Fabrizio Costa and
-                              Björn Grüning and
-                              gigolo},
-              title        = {EDeN: EDeN - Graph Vectorizer},
-              month        = feb,
-              year         = 2015,
-              doi          = {10.5281/zenodo.15094},
-              url          = {http://dx.doi.org/10.5281/zenodo.15094}
-                    }
-                        }
-        </citation>
-    </citations>
+  <xml name="sparse_target" token_label1="Select a sparse matrix:" token_label2="Select the tabular containing true labels:" token_multiple="False" token_format1="txt" token_format2="tabular" token_help1="" token_help2=""> <!-- Two dataset inputs (infile1, infile2) and one column selector bound to infile2 only; token_multiple controls whether several columns may be chosen. -->
+    <param name="infile1" type="data" format="@FORMAT1@" label="@LABEL1@" help="@HELP1@"/>
+    <param name="infile2" type="data" format="@FORMAT2@" label="@LABEL2@" help="@HELP2@"/>
+    <param name="col2" multiple="@MULTIPLE@" type="data_column" data_ref="infile2" label="Select target column(s):"/>
   </xml>

-  <xml name="sklearn_citation">
-    <citations>
-        <citation type="bibtex">
-            @article{scikit-learn,
-             title={Scikit-learn: Machine Learning in {P}ython},
-             author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
-                     and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
-                     and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
-                     Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
-             journal={Journal of Machine Learning Research},
-             volume={12},
-             pages={2825--2830},
-             year={2011}
-             url = {https://github.com/scikit-learn/scikit-learn}
-            }
-        </citation>
-    </citations>
-  </xml>
-
-  <xml name="scipy_citation">
-    <citations>
-        <citation type="bibtex">
-          @Misc{,
-          author =    {Eric Jones and Travis Oliphant and Pearu Peterson and others},
-          title =     {{SciPy}: Open source scientific tools for {Python}},
-          year =      {2001--},
-          url = "http://www.scipy.org/",
-          note = {[Online; accessed 2016-04-09]}
-        }
-        </citation>
-    </citations>
+  <xml name="sl_mixed_input"> <!-- Input-type switch: the tabular branch expands sample_cols, the sparse branch expands sparse_target. Both expanded macros define params named infile1, infile2 and col2. -->
+    <conditional name="input_options">
+      <param name="selected_input" type="select" label="Select input type:">
+          <option value="tabular" selected="true">tabular data</option>
+          <option value="sparse">sparse matrix</option>
+      </param>
+      <when value="tabular">
+          <expand macro="sample_cols" multiple1="true"/> <!-- overrides token_multiple1 so several columns of the first file can be selected -->
+      </when>
+      <when value="sparse">
+          <expand macro="sparse_target"/>
+      </when>
+    </conditional>
   </xml>


+  <!--Advanced options-->
   <xml name="nn_advanced_options">
     <section name="options" title="Advanced Options" expanded="False">
       <yield/>
@@ -442,4 +523,115 @@
     </when>
   </xml>

+  <xml name="sparse_preprocessors"> <!-- Select input listing the available preprocessors, StandardScaler preselected; the expanding tool may append further options through the yield. The option values match the when branches of the sparse_preprocessor_options macro. -->
+    <param name="selected_pre_processor" type="select" label="Select a preprocessor:">
+      <option value="StandardScaler" selected="true">Standardize features by removing the mean and scaling to unit variance</option>
+      <option value="Binarizer">Binarize data</option>
+      <option value="Imputer">Complete missing values</option>
+      <option value="MaxAbsScaler">Scale features by their maximum absolute value</option>
+      <option value="Normalizer">Normalize samples individually to unit norm</option>
+      <yield/>
+    </param>
+  </xml>
+
+  <xml name="sparse_preprocessor_options"> <!-- One when branch per preprocessor value, each holding an "Advanced Options" section with that preprocessor's inputs; further branches can be appended through the trailing yield. NOTE(review): falsevalue keeps the file-wide spelling 'boolflase'; confirm what the consuming scripts expect before changing it. -->
+    <when value="Binarizer">
+        <section name="options" title="Advanced Options" expanded="False">
+            <param argument="copy" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" label="Use a copy of data for precomputing binarization" help=" "/>
+            <param argument="threshold" type="float" optional="true" value="0.0" label="Threshold" help="Feature values below or equal to this are replaced by 0, above it by 1. Threshold may not be less than 0 for operations on sparse matrices. "/>
+        </section>
+    </when>
+    <when value="Imputer">
+        <section name="options" title="Advanced Options" expanded="False">
+            <param argument="copy" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" label="Use a copy of data for precomputing imputation" help=" "/>
+            <param argument="strategy" type="select" optional="true" label="Imputation strategy" help=" ">
+                <option value="mean" selected="true">Replace missing values using the mean along the axis</option>
+                <option value="median">Replace missing values using the median along the axis</option>
+                <option value="most_frequent">Replace missing using the most frequent value along the axis</option>
+            </param>
+            <param argument="missing_values" type="text" optional="true" value="NaN" label="Placeholder for missing values" help="For missing values encoded as numpy.nan, use the string value “NaN”"/>
+            <param argument="axis" type="select" optional="true" label="The axis along which to impute" help=" ">
+                <option value="0" selected="true">Impute along columns</option>
+                <option value="1">Impute along rows</option>
+            </param>
+        </section>
+    </when>
+    <when value="StandardScaler">
+        <section name="options" title="Advanced Options" expanded="False">
+            <param argument="copy" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" label="Use a copy of data for performing inplace scaling" help=" "/>
+            <param argument="with_mean" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" label="Center the data before scaling" help=" "/> <!-- NOTE(review): centering is presumably unsupported by scikit-learn on sparse input; confirm this default if the macro is used with sparse matrices -->
+            <param argument="with_std" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" label="Scale the data to unit variance (or unit standard deviation)" help=" "/>
+        </section>
+    </when>
+    <when value="MaxAbsScaler">
+        <section name="options" title="Advanced Options" expanded="False">
+            <param argument="copy" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" label="Use a copy of data for precomputing scaling" help=" "/>
+        </section>
+    </when>
+    <when value="Normalizer">
+        <section name="options" title="Advanced Options" expanded="False">
+            <param argument="norm" type="select" optional="true" label="The norm to use to normalize non zero samples" help=" ">
+                <option value="l1" selected="true">l1</option>
+                <option value="l2">l2</option>
+                <option value="max">max</option>
+            </param> <!-- the select is closed before the copy flag so that copy is a sibling input of the section, as in the other branches -->
+            <param argument="copy" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" label="Use a copy of data for precomputing row normalization" help=" "/>
+        </section>
+    </when>
+    <yield/>
+  </xml>
+
+
+  <!--Citations-->
+  <xml name="eden_citation"> <!-- Citations block holding a single BibTeX misc entry for EDeN; the entry is closed by exactly one brace. -->
+    <citations>
+        <citation type="bibtex">
+            @misc{fabrizio_costa_2015_15094,
+              author       = {Fabrizio Costa and
+                              Björn Grüning and
+                              gigolo},
+              title        = {EDeN: EDeN - Graph Vectorizer},
+              month        = feb,
+              year         = 2015,
+              doi          = {10.5281/zenodo.15094},
+              url          = {http://dx.doi.org/10.5281/zenodo.15094}
+            }
+
+        </citation>
+    </citations>
+  </xml>
+
+  <xml name="sklearn_citation"> <!-- Citations block holding a single BibTeX article entry for scikit-learn; every field except the last is terminated by a comma. -->
+    <citations>
+        <citation type="bibtex">
+            @article{scikit-learn,
+             title={Scikit-learn: Machine Learning in {P}ython},
+             author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
+                     and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
+                     and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
+                     Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
+             journal={Journal of Machine Learning Research},
+             volume={12},
+             pages={2825--2830},
+             year={2011},
+             url = {https://github.com/scikit-learn/scikit-learn}
+            }
+        </citation>
+    </citations>
+  </xml>
+
+  <xml name="scipy_citation"> <!-- Citations block holding a single BibTeX Misc entry for SciPy. NOTE(review): the entry has an empty citation key; confirm the BibTeX consumer accepts that. -->
+    <citations>
+        <citation type="bibtex">
+          @Misc{,
+          author =    {Eric Jones and Travis Oliphant and Pearu Peterson and others},
+          title =     {{SciPy}: Open source scientific tools for {Python}},
+          year =      {2001--},
+          url = "http://www.scipy.org/",
+          note = {[Online; accessed 2016-04-09]}
+        }
+        </citation>
+    </citations>
+  </xml>
+
 </macros>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/regression_test.tabular	Tue May 31 16:52:37 2016 -0400
@@ -0,0 +1,5 @@
+86.9702122735	1.00532111569	-1.01739601979	-0.613139481654	0.641846874331
+91.2021798817	-0.621522971207	1.11914889596	0.390012184498	1.28956938152
+-47.4101632272	-0.638416457964	-0.732777468453	-0.864026104978	-1.06109770116
+61.7128046302	-1.09994800577	-0.739679672932	0.585657963012	1.48906827536
+-206.998295124	0.130238853011	0.70574123041	1.33206565264	-1.33220923738
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/regression_train.tabular	Tue May 31 16:52:37 2016 -0400
@@ -0,0 +1,10 @@
+143.762620712	-0.330941870584	-1.17964571928	0.47944415578	-0.0486946279099	1.57951239219
+-88.5787166225	1.08055532812	-2.57109184022	-0.92512305494	0.317511276982	-1.202358944
+-82.8452345578	0.272541389247	-0.168636324107	0.923988150154	-0.467750945768	-0.719169535969
+72.4951388149	-0.268686605278	0.991068834926	0.731619322189	1.17038734294	0.323842059244
+11.805182128	1.03604670966	-0.709685560786	-1.54916691211	-0.614757954242	0.24176665894
+-63.9354970901	-0.101485840571	0.984112210822	-2.01704822953	0.282058758309	-0.776448499847
+126.325840796	-0.359998340179	0.353534448839	-1.23256828198	0.563632964937	1.15031170568
+23.0341392692	0.518540465136	1.03188231893	-2.53173026594	-0.0419267228327	0.193734455015
+67.6714937696	-0.115688051547	-0.821437865172	-0.368962397052	-0.526743874023	0.94315222831
+47.3927584881	-0.785096541368	-0.0942409319417	0.224267378731	-1.63317786831	1.26458811586
Binary file test-data/rfc_model01 has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/rfc_result01	Tue May 31 16:52:37 2016 -0400
@@ -0,0 +1,6 @@
+0	1	2	3	0
+3.68258022948	2.82110345641	-3.990140724	-1.9523364774	1
+0.015942057224	-0.711958594347	0.125502976978	-0.972218263337	0
+2.08690768825	0.929399321468	-2.12924084484	-1.99714022188	1
+1.41321052084	0.523750660422	-1.4210539291	-1.49298569451	1
+0.76831404394	1.38267855169	-0.989045048734	0.649504257894	1
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/rfc_result02	Tue May 31 16:52:37 2016 -0400
@@ -0,0 +1,6 @@
+0	1	2	3	0
+3.68258022948	2.82110345641	-3.990140724	-1.9523364774	1
+0.015942057224	-0.711958594347	0.125502976978	-0.972218263337	0
+2.08690768825	0.929399321468	-2.12924084484	-1.99714022188	1
+1.41321052084	0.523750660422	-1.4210539291	-1.49298569451	1
+0.76831404394	1.38267855169	-0.989045048734	0.649504257894	1
Binary file test-data/rfr_model01 has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/rfr_result01	Tue May 31 16:52:37 2016 -0400
@@ -0,0 +1,6 @@
+0	1	2	3	4	0
+86.9702122735	1.00532111569	-1.01739601979	-0.613139481654	0.641846874331	0.867517611177
+91.2021798817	-0.621522971207	1.11914889596	0.390012184498	1.28956938152	0.851121328511
+-47.4101632272	-0.638416457964	-0.732777468453	-0.864026104978	-1.06109770116	0.0534409530407
+61.7128046302	-1.09994800577	-0.739679672932	0.585657963012	1.48906827536	1.18927597457
+-206.998295124	0.130238853011	0.70574123041	1.33206565264	-1.33220923738	-0.350236265367