Mercurial > repos > bgruening > sklearn_data_preprocess

--- a/main_macros.xml	Tue Jul 10 03:07:38 2018 -0400
+++ b/main_macros.xml	Fri Jul 13 03:51:32 2018 -0400
@@ -35,7 +35,8 @@
     if not options['threshold'] or options['threshold'] == 'None':
       options['threshold'] = None
       if 'extra_estimator' in inputs and inputs['extra_estimator']['has_estimator'] == 'no_load':
-        fitted_estimator = pickle.load(open("inputs['extra_estimator']['fitted_estimator']", 'r'))
+        with open(inputs['extra_estimator']['fitted_estimator'], 'rb') as model_handler:
+          fitted_estimator = pickle.load(model_handler)  # NOTE(review): unpickling runs arbitrary code; load trusted models only
         new_selector = selector(fitted_estimator, prefit=True, **options)
       else:
         estimator=inputs["estimator"]
@@ -83,7 +84,7 @@
       parse_dates=True
     )
   else:
-    X = mmread(open(file1, 'r'))
+    X = mmread(file1)

   header = 'infer' if params["selected_tasks"]["selected_algorithms"]["input_options"]["header2"] else None
   column_option = params["selected_tasks"]["selected_algorithms"]["input_options"]["column_selector_options_2"]["selected_column_selector_option2"]
@@ -432,19 +433,6 @@


   <!--Data interface-->
-  <xml name="tabular_input">
-    <param name="infile" type="data" format="tabular" label="Data file with numeric values"/>
-    <param name="start_column" type="data_column" data_ref="infile" optional="True" label="Select a subset of data. Start column:" />
-    <param name="end_column" type="data_column" data_ref="infile" optional="True" label="End column:" />
-  </xml>
-
-  <xml name="sample_cols" token_label1="File containing true class labels:" token_label2="File containing predicted class labels:" token_multiple1="False" token_multiple2="False" token_format1="tabular" token_format2="tabular" token_help1="" token_help2="">
-    <param name="infile1" type="data" format="@FORMAT1@" label="@LABEL1@" help="@HELP1@"/>
-    <param name="col1" multiple="@MULTIPLE1@" type="data_column" data_ref="infile1" label="Select target column(s):"/>
-    <param name="infile2" type="data" format="@FORMAT2@" label="@LABEL2@" help="@HELP2@"/>
-    <param name="col2" multiple="@MULTIPLE2@" type="data_column" data_ref="infile2" label="Select target column(s):"/>
-    <yield/>
-  </xml>

   <xml name="samples_tabular" token_multiple1="false" token_multiple2="false">
     <param name="infile1" type="data" format="tabular" label="Training samples dataset:"/>
@@ -472,13 +460,13 @@
       <param name="@COL_NAME@" multiple="@MULTIPLE@" type="data_column" data_ref="@INFILE@" label="Select target column(s):"/>
     </when>
     <when value="by_header_name">
-      <param name="@COL_NAME@" type="text" value="" label="Type header name(s):" help="String seperate by colon. For example: target1,target2"/>
+      <param name="@COL_NAME@" type="text" value="" label="Type header name(s):" help="Comma-separated string. For example: target1,target2"/>
     </when>
     <when value="all_but_by_index_number">
       <param name="@COL_NAME@" multiple="@MULTIPLE@" type="data_column" data_ref="@INFILE@" label="Select target column(s):"/>
     </when>
     <when value="all_but_by_header_name">
-      <param name="@COL_NAME@" type="text" value="" label="Type header name(s):" help="String seperate by colon. For example: target1,target2"/>
+      <param name="@COL_NAME@" type="text" value="" label="Type header name(s):" help="Comma-separated string. For example: target1,target2"/>
     </when>
     <when value="all_columns">
     </when>
@@ -553,11 +541,6 @@
     </conditional>
   </xml>

-  <xml name="multitype_input" token_format="tabular" token_help="All datasets with tabular format are supporetd.">
-    <param name="infile_transform" type="data" format="@FORMAT@" label="Select a dataset to transform:" help="@HELP@"/>
-  </xml>
-
-
   <!--Advanced options-->
   <xml name="nn_advanced_options">
     <section name="options" title="Advanced Options" expanded="False">
@@ -822,9 +805,17 @@
     </param>
   </xml>

+  <xml name="sparse_preprocessors_ext">
+    <expand macro="sparse_preprocessors">
+      <option value="KernelCenterer">Kernel Centerer (Centers a kernel matrix)</option>
+      <option value="MinMaxScaler">Minmax Scaler (Scales features to a range)</option>
+      <option value="PolynomialFeatures">Polynomial Features (Generates polynomial and interaction features)</option>
+      <option value="RobustScaler">Robust Scaler (Scales features using outlier-invariance statistics)</option>
+    </expand>
+  </xml>
+
   <xml name="sparse_preprocessor_options">
     <when value="Binarizer">
-        <expand macro="multitype_input" format="tabular,txt" help="Tabular and sparse datasets are supporetd."/>
         <section name="options" title="Advanced Options" expanded="False">
             <param argument="copy" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="true"
                 label="Use a copy of data for precomputing binarization" help=" "/>
@@ -834,7 +825,6 @@
         </section>
     </when>
     <when value="Imputer">
-      <expand macro="multitype_input" format="tabular,txt" help="Tabular and sparse datasets are supporetd."/>
       <section name="options" title="Advanced Options" expanded="False">
           <param argument="copy" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="true"
             label="Use a copy of data for precomputing imputation" help=" "/>
@@ -854,7 +844,6 @@
       </section>
     </when>
     <when value="StandardScaler">
-      <expand macro="multitype_input"/>
       <section name="options" title="Advanced Options" expanded="False">
         <param argument="copy" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="true"
             label="Use a copy of data for performing inplace scaling" help=" "/>
@@ -865,14 +854,12 @@
       </section>
     </when>
     <when value="MaxAbsScaler">
-      <expand macro="multitype_input" format="tabular,txt" help="Tabular and sparse datasets are supporetd."/>
       <section name="options" title="Advanced Options" expanded="False">
         <param argument="copy" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="true"
             label="Use a copy of data for precomputing scaling" help=" "/>
       </section>
     </when>
     <when value="Normalizer">
-      <expand macro="multitype_input" format="tabular,txt" help="Tabular and sparse datasets are supporetd."/>
       <section name="options" title="Advanced Options" expanded="False">
         <param argument="norm" type="select" optional="true" label="The norm to use to normalize non zero samples" help=" ">
           <option value="l1" selected="true">l1</option>
@@ -885,6 +872,41 @@
     </when>
     <yield/>
   </xml>
+
+  <xml name="sparse_preprocessor_options_ext">
+    <expand macro="sparse_preprocessor_options">
+      <when value="KernelCenterer">
+        <section name="options" title="Advanced Options" expanded="False">
+        </section>
+      </when>
+      <when value="MinMaxScaler">
+          <section name="options" title="Advanced Options" expanded="False">
+              <!--feature_range is not exposed here, so the estimator's own default applies-->
+              <param argument="copy" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="true"
+                  label="Use a copy of data for precomputing normalization" help=" "/>
+          </section>
+      </when>
+      <when value="PolynomialFeatures">
+          <section name="options" title="Advanced Options" expanded="False">
+              <param argument="degree" type="integer" optional="true" value="2" label="The degree of the polynomial features " help=""/>
+              <param argument="interaction_only" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="false" label="Produce interaction features only" help="(Features that are products of at most degree distinct input features) "/>
+              <param argument="include_bias" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="Include a bias column" help="Feature in which all polynomial powers are zero "/>
+          </section>
+      </when>
+      <when value="RobustScaler">
+          <section name="options" title="Advanced Options" expanded="False">
+              <!--defaults: with_centering=True, with_scaling=True, copy=True-->
+              <param argument="with_centering" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="true"
+                  label="Center the data before scaling" help=" "/>
+              <param argument="with_scaling" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="true"
+                  label="Scale the data to interquartile range" help=" "/>
+              <param argument="copy" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="true"
+                  label="Use a copy of data for inplace scaling" help=" "/>
+          </section>
+      </when>
+    </expand>
+  </xml>
+
   <xml name="estimator_input_no_fit">
     <expand macro="feature_selection_estimator" />
     <conditional name="extra_estimator">
@@ -892,6 +914,7 @@
       <expand macro="feature_selection_estimator_choices" />
     </conditional>
   </xml>
+
   <xml name="feature_selection_all">
     <conditional name="feature_selection_algorithms">
       <param name="selected_algorithm" type="select" label="Select a feature selection algorithm">
@@ -1014,6 +1037,7 @@
       </when-->
     </conditional>
   </xml>
+
   <xml name="feature_selection_score_function">
     <param argument="score_func" type="select" label="Select a score function">
       <option value="chi2">chi2 - Compute chi-squared stats between each non-negative feature and class</option>
@@ -1023,6 +1047,7 @@
       <option value="mutual_info_regression">mutual_info_regression - Estimate mutual information for a continuous target variable</option>
     </param>
   </xml>
+
   <xml name="feature_selection_estimator">
     <param argument="estimator" type="select" label="Select an estimator" help="The base estimator from which the transformer is built.">
       <option value="svm.SVR(kernel=&quot;linear&quot;)">svm.SVR(kernel=&quot;linear&quot;)</option>
@@ -1032,6 +1057,7 @@
       <option value="ensemble.RandomForestRegressor(n_estimators = 1000, random_state = 42)">ensemble.RandomForestRegressor(n_estimators = 1000, random_state = 42)</option>
     </param>
   </xml>
+
   <xml name="feature_selection_extra_estimator">
       <param name="has_estimator" type="select" label="Does your estimator on the list above?">
         <option value="yes">Yes, my estimator is on the list</option>
@@ -1039,6 +1065,7 @@
         <yield/>
       </param>
   </xml>
+
   <xml name="feature_selection_estimator_choices">
     <when value="yes">
     </when>
@@ -1047,6 +1074,7 @@
     </when>
     <yield/>
   </xml>
+
   <xml name="feature_selection_methods">
     <conditional name="select_methods">
       <param name="selected_method" type="select" label="Select an operation">
--- a/pre_process.xml	Tue Jul 10 03:07:38 2018 -0400
+++ b/pre_process.xml	Fri Jul 13 03:51:32 2018 -0400
@@ -24,19 +24,32 @@
 from scipy.io import mmwrite
 from sklearn import preprocessing

+@COLUMNS_FUNCTION@
+
 input_json_path = sys.argv[1]
-params = json.load(open(input_json_path, "r"))
+with open(input_json_path, "r") as param_handler:
+    params = json.load(param_handler)

 #if $input_type.selected_input_type == "sparse":
-X = mmread(open("$infile", 'r'))
+X = mmread("$infile")
 #else:
-X = pandas.read_csv("$infile", sep='\t', header=None, index_col=None, parse_dates=True, encoding=None, tupleize_cols=False )
-#end if
-
-#if $input_type.pre_processors.infile_transform.ext == 'txt':
-y = mmread(open("$infile", 'r'))
-#else:
-y = pandas.read_csv("$infile", sep='\t', header=None, index_col=None, parse_dates=True, encoding=None, tupleize_cols=False )
+header = 'infer' if params["input_type"]["header1"] else None
+column_option = params["input_type"]["column_selector_options_1"]["selected_column_selector_option"]
+if column_option in ["by_index_number", "all_but_by_index_number", "by_header_name", "all_but_by_header_name"]:
+    c = params["input_type"]["column_selector_options_1"]["col1"]
+else:
+    c = None
+X = read_columns(
+        "$input_type.infile",
+        c = c,
+        c_option = column_option,
+        sep='\t',
+        header=header,
+        parse_dates=True,
+        encoding=None,
+        index_col=None,
+        tupleize_cols=False
+)
 #end if

 preprocessor = params["input_type"]["pre_processors"]["selected_pre_processor"]
@@ -45,17 +58,19 @@
 my_class = getattr(preprocessing, preprocessor)
 estimator = my_class(**options)
 estimator.fit(X)
-result = estimator.transform(y)
+result = estimator.transform(X)

-#if $input_type.pre_processors.infile_transform.ext == 'txt':
-mmwrite(open("$outfile_transform" , 'w+'), result)
+#if $input_type.selected_input_type == "sparse":
+with open("$outfile_transform", "wb") as transform_handler:
+    mmwrite(transform_handler, result)  # binary handle: mmwrite writes bytes to a stream; a path would get ".mtx" appended
 #else:
 res = pandas.DataFrame(result)
 res.to_csv(path_or_buf = "$outfile_transform", sep="\t", index=False, header=None)
 #end if

 #if $save:
-pickle.dump(estimator,open("$outfile_fit", 'w+'), pickle.HIGHEST_PROTOCOL)
+with open("$outfile_fit", 'wb') as out_handler:
+    pickle.dump(estimator, out_handler, pickle.HIGHEST_PROTOCOL)
 #end if
         ]]>
         </configfile>
@@ -67,49 +82,14 @@
                 <option value="sparse">Sparse</option>
             </param>
             <when value="tabular">
-                <param name="infile" type="data" format="tabular" label="Select a tabular file you want to train your preprocessor on its data:"/>
+                <param name="infile" type="data" format="tabular" label="Select a tabular file you want to train your preprocessor on its data:" />
+                <param name="header1" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="false" label="Does the dataset contain header:" />
+                <conditional name="column_selector_options_1">
+                    <expand macro="samples_column_selector_options" multiple="true" column_option="selected_column_selector_option" col_name="col1" infile="infile"/>
+                </conditional>
                 <conditional name="pre_processors">
-                    <expand macro="sparse_preprocessors">
-                        <option value="KernelCenterer">Kernel Centerer (Centers a kernel matrix)</option>
-                        <option value="MinMaxScaler">Minmax Scaler (Scales features to a range)</option>
-                        <option value="PolynomialFeatures">Polynomial Features (Generates polynomial and interaction features)</option>
-                        <option value="RobustScaler">Robust Scaler (Scales features using outlier-invariance statistics)</option>
-                    </expand>
-                    <expand macro="sparse_preprocessor_options">
-                        <when value="KernelCenterer">
-                            <expand macro="multitype_input"/>
-                            <section name="options" title="Advanced Options" expanded="False">
-                            </section>
-                        </when>
-                        <when value="MinMaxScaler">
-                            <expand macro="multitype_input"/>
-                            <section name="options" title="Advanced Options" expanded="False">
-                                <!--feature_range-->
-                                <param argument="copy" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true"
-                                    label="Use a copy of data for precomputing normalization" help=" "/>
-                            </section>
-                        </when>
-                        <when value="PolynomialFeatures">
-                            <expand macro="multitype_input"/>
-                            <section name="options" title="Advanced Options" expanded="False">
-                                <param argument="degree" type="integer" optional="true" value="2" label="The degree of the polynomial features " help=""/>
-                                <param argument="interaction_only" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="false" label="Produce interaction features only" help="(Features that are products of at most degree distinct input features) "/>
-                                <param argument="include_bias" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" label="Include a bias column" help="Feature in which all polynomial powers are zero "/>
-                            </section>
-                        </when>
-                        <when value="RobustScaler">
-                            <expand macro="multitype_input"/>
-                            <section name="options" title="Advanced Options" expanded="False">
-                                <!--=True, =True, copy=True-->
-                                <param argument="with_centering" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true"
-                                    label="Center the data before scaling" help=" "/>
-                                <param argument="with_scaling" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true"
-                                    label="Scale the data to interquartile range" help=" "/>
-                                <param argument="copy" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true"
-                                    label="Use a copy of data for inplace scaling" help=" "/>
-                            </section>
-                        </when>
-                    </expand>
+                    <expand macro="sparse_preprocessors_ext" />
+                    <expand macro="sparse_preprocessor_options_ext" />
                 </conditional>
             </when>
             <when value="sparse">
@@ -133,7 +113,7 @@
     <tests>
         <test>
             <param name="infile" value="train.tabular" ftype="tabular"/>
-            <param name="infile_transform" value="train.tabular" ftype="tabular"/>
+            <param name="selected_column_selector_option" value="all_columns"/>
             <param name="selected_input_type" value="tabular"/>
             <param name="selected_pre_processor" value="KernelCenterer"/>
             <param name="save" value="true"/>
@@ -142,7 +122,7 @@
         </test>
         <test>
             <param name="infile" value="train.tabular" ftype="tabular"/>
-            <param name="infile_transform" value="train.tabular" ftype="tabular"/>
+            <param name="selected_column_selector_option" value="all_columns"/>
             <param name="selected_input_type" value="tabular"/>
             <param name="selected_pre_processor" value="MinMaxScaler"/>
             <param name="save" value="true"/>
@@ -151,7 +131,7 @@
         </test>
         <test>
             <param name="infile" value="train.tabular" ftype="tabular"/>
-            <param name="infile_transform" value="train.tabular" ftype="tabular"/>
+            <param name="selected_column_selector_option" value="all_columns"/>
             <param name="selected_input_type" value="tabular"/>
             <param name="selected_pre_processor" value="PolynomialFeatures"/>
             <param name="save" value="true"/>
@@ -160,7 +140,7 @@
         </test>
         <test>
             <param name="infile" value="train.tabular" ftype="tabular"/>
-            <param name="infile_transform" value="train.tabular" ftype="tabular"/>
+            <param name="selected_column_selector_option" value="all_columns"/>
             <param name="selected_input_type" value="tabular"/>
             <param name="selected_pre_processor" value="RobustScaler"/>
             <param name="save" value="true"/>
@@ -169,7 +149,6 @@
         </test>
         <test>
             <param name="infile" value="csr_sparse2.mtx" ftype="txt"/>
-            <param name="infile_transform" value="csr_sparse2.mtx" ftype="txt"/>
             <param name="selected_input_type" value="sparse"/>
             <param name="selected_pre_processor" value="Binarizer"/>
             <param name="save" value="true"/>
@@ -178,7 +157,6 @@
         </test>
         <test>
             <param name="infile" value="csr_sparse2.mtx" ftype="txt"/>
-            <param name="infile_transform" value="csr_sparse2.mtx" ftype="txt"/>
             <param name="selected_input_type" value="sparse"/>
             <param name="selected_pre_processor" value="Imputer"/>
             <param name="save" value="true"/>
@@ -188,8 +166,8 @@
         </test>
         <test>
             <param name="infile" value="train.tabular" ftype="tabular"/>
-            <param name="infile_transform" value="train.tabular" ftype="tabular"/>
             <param name="selected_input_type" value="tabular"/>
+            <param name="selected_column_selector_option" value="all_columns"/>
             <param name="selected_pre_processor" value="StandardScaler"/>
             <param name="save" value="true"/>
             <output name="outfile_transform" file="prp_result07" ftype="tabular"/>
@@ -197,7 +175,6 @@
         </test>
         <test>
             <param name="infile" value="csr_sparse2.mtx" ftype="txt"/>
-            <param name="infile_transform" value="csr_sparse2.mtx" ftype="txt"/>
             <param name="selected_input_type" value="sparse"/>
             <param name="selected_pre_processor" value="MaxAbsScaler"/>
             <param name="save" value="true"/>
@@ -206,7 +183,6 @@
         </test>
         <test>
             <param name="infile" value="csr_sparse2.mtx" ftype="txt"/>
-            <param name="infile_transform" value="csr_sparse2.mtx" ftype="txt"/>
             <param name="selected_input_type" value="sparse"/>
             <param name="selected_pre_processor" value="Normalizer"/>
             <param name="save" value="true"/>
--- a/test-data/mv_result07.tabular	Tue Jul 10 03:07:38 2018 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,1 +0,0 @@
-0.7824428015300172