Mercurial > repos > bgruening > sklearn_data_preprocess
comparison pre_process.xml @ 28:a12d5eae322e draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 60f0fbc0eafd7c11bc60fb6c77f2937782efd8a9-dirty
| author | bgruening |
|---|---|
| date | Fri, 09 Aug 2019 06:25:31 -0400 |
| parents | 55b36adb2dc7 |
| children | e270c53b5df6 |
comparison
equal
deleted
inserted
replaced
| 27:f7e85579f2e6 | 28:a12d5eae322e |
|---|---|
| 17 <![CDATA[ | 17 <![CDATA[ |
| 18 import sys | 18 import sys |
| 19 import json | 19 import json |
| 20 import pandas | 20 import pandas |
| 21 import pickle | 21 import pickle |
| | 22 |
| 22 from scipy.io import mmread | 23 from scipy.io import mmread |
| 23 from scipy.io import mmwrite | 24 from scipy.io import mmwrite |
| 24 from sklearn import preprocessing | 25 from sklearn import preprocessing |
| 25 | 26 from galaxy_ml.utils import read_columns, SafeEval |
| 26 sys.path.insert(0, '$__tool_directory__') | 27 |
| 27 from utils import read_columns | 28 |
| | 29 safe_eval = SafeEval() |
| 28 | 30 |
| 29 input_json_path = sys.argv[1] | 31 input_json_path = sys.argv[1] |
| 30 with open(input_json_path, "r") as param_handler: | 32 with open(input_json_path, "r") as param_handler: |
| 31 params = json.load(param_handler) | 33 params = json.load(param_handler) |
| 32 | 34 |
| 37 column_option = params["input_type"]["column_selector_options_1"]["selected_column_selector_option"] | 39 column_option = params["input_type"]["column_selector_options_1"]["selected_column_selector_option"] |
| 38 if column_option in ["by_index_number", "all_but_by_index_number", "by_header_name", "all_but_by_header_name"]: | 40 if column_option in ["by_index_number", "all_but_by_index_number", "by_header_name", "all_but_by_header_name"]: |
| 39 c = params["input_type"]["column_selector_options_1"]["col1"] | 41 c = params["input_type"]["column_selector_options_1"]["col1"] |
| 40 else: | 42 else: |
| 41 c = None | 43 c = None |
| 42 X = read_columns( | 44 X, input_df = read_columns( |
| 43 "$input_type.infile", | 45 "$input_type.infile", |
| 44 c = c, | 46 c=c, |
| 45 c_option = column_option, | 47 c_option=column_option, |
| | 48 return_df=True, |
| 46 sep='\t', | 49 sep='\t', |
| 47 header=header, | 50 header=header, |
| 48 parse_dates=True, | 51 parse_dates=True, |
| 49 encoding=None, | 52 encoding=None, |
| 50 index_col=None, | 53 index_col=None, |
| 51 tupleize_cols=False).astype(float) | 54 tupleize_cols=False) |
| | 55 X = X.astype(float) |
| 52 #end if | 56 #end if |
| 53 | 57 |
| 54 preprocessor = params["input_type"]["pre_processors"]["selected_pre_processor"] | 58 preprocessor = params["input_type"]["pre_processors"]["selected_pre_processor"] |
| 55 options = params["input_type"]["pre_processors"]["options"] | 59 options = params["input_type"]["pre_processors"]["options"] |
| | 60 if 'feature_range' in options: |
| | 61 feature_range = safe_eval(options['feature_range'].strip()) |
| | 62 if not feature_range: |
| | 63 feature_range = (0, 1) |
| | 64 options['feature_range'] = feature_range |
| 56 | 65 |
| 57 my_class = getattr(preprocessing, preprocessor) | 66 my_class = getattr(preprocessing, preprocessor) |
| 58 estimator = my_class(**options) | 67 estimator = my_class(**options) |
| 59 estimator.fit(X) | 68 estimator.fit(X) |
| 60 result = estimator.transform(X) | 69 result = estimator.transform(X) |
| 61 | 70 |
| 62 #if $input_type.selected_input_type == "sparse": | 71 #if $input_type.selected_input_type == "sparse": |
| 63 with open("$outfile_transform", "wb") as transform_handler: | 72 with open("$outfile_transform", "wb") as transform_handler: |
| 64 mmwrite(transform_handler, result) | 73 mmwrite(transform_handler, result) |
| 65 #else: | 74 #else: |
| 66 res = pandas.DataFrame(result) | 75 columns = input_df.columns |
| 67 res.to_csv(path_or_buf = "$outfile_transform", sep="\t", index=False, header=None) | 76 if preprocessor == 'PolynomialFeatures': |
| | 77 columns = None |
| | 78 header = False |
| | 79 res = pandas.DataFrame(result, columns=columns) |
| | 80 res.to_csv(path_or_buf = "$outfile_transform", sep="\t", |
| | 81 index=False, header=True if header else False) |
| 68 #end if | 82 #end if |
| 69 | 83 |
| 70 #if $save: | 84 #if $save: |
| 71 with open("$outfile_fit", 'wb') as out_handler: | 85 with open("$outfile_fit", 'wb') as out_handler: |
| 72 pickle.dump(estimator, out_handler, pickle.HIGHEST_PROTOCOL) | 86 pickle.dump(estimator, out_handler, pickle.HIGHEST_PROTOCOL) |
| 153 <param name="save" value="true"/> | 167 <param name="save" value="true"/> |
| 154 <output name="outfile_transform" file="prp_result05" ftype="tabular"/> | 168 <output name="outfile_transform" file="prp_result05" ftype="tabular"/> |
| 155 <output name="outfile_fit" file="prp_model05" ftype="zip" compare="sim_size" delta="5"/> | 169 <output name="outfile_fit" file="prp_model05" ftype="zip" compare="sim_size" delta="5"/> |
| 156 </test> | 170 </test> |
| 157 <test> | 171 <test> |
| 158 <param name="infile" value="csr_sparse2.mtx" ftype="txt"/> | |
| 159 <param name="selected_input_type" value="sparse"/> | |
| 160 <param name="selected_pre_processor" value="Imputer"/> | |
| 161 <param name="save" value="true"/> | |
| 162 <param name="axis" value="true"/> | |
| 163 <output name="outfile_transform" file="prp_result06" ftype="tabular"/> | |
| 164 <output name="outfile_fit" file="prp_model06" ftype="zip" compare="sim_size" delta="50"/> | |
| 165 </test> | |
| 166 <test> | |
| 167 <param name="infile" value="train.tabular" ftype="tabular"/> | 172 <param name="infile" value="train.tabular" ftype="tabular"/> |
| 168 <param name="selected_input_type" value="tabular"/> | 173 <param name="selected_input_type" value="tabular"/> |
| 169 <param name="selected_column_selector_option" value="all_columns"/> | 174 <param name="selected_column_selector_option" value="all_columns"/> |
| 170 <param name="selected_pre_processor" value="StandardScaler"/> | 175 <param name="selected_pre_processor" value="StandardScaler"/> |
| 171 <param name="save" value="true"/> | 176 <param name="save" value="true"/> |
| 185 <param name="selected_input_type" value="sparse"/> | 190 <param name="selected_input_type" value="sparse"/> |
| 186 <param name="selected_pre_processor" value="Normalizer"/> | 191 <param name="selected_pre_processor" value="Normalizer"/> |
| 187 <param name="save" value="true"/> | 192 <param name="save" value="true"/> |
| 188 <output name="outfile_transform" file="prp_result09" ftype="tabular"/> | 193 <output name="outfile_transform" file="prp_result09" ftype="tabular"/> |
| 189 <output name="outfile_fit" file="prp_model09" ftype="zip" compare="sim_size" delta="5"/> | 194 <output name="outfile_fit" file="prp_model09" ftype="zip" compare="sim_size" delta="5"/> |
| | 195 </test> |
| | 196 <test> |
| | 197 <param name="infile" value="regression_X.tabular" ftype="tabular"/> |
| | 198 <param name="header1" value="true"/> |
| | 199 <param name="selected_column_selector_option" value="all_columns"/> |
| | 200 <param name="selected_input_type" value="tabular"/> |
| | 201 <param name="selected_pre_processor" value="MinMaxScaler"/> |
| | 202 <param name="feature_range" value="(-1, 1)"/> |
| | 203 <param name="save" value="false"/> |
| | 204 <output name="outfile_transform" file="prp_result10" ftype="tabular"/> |
| 190 </test> | 205 </test> |
| 191 </tests> | 206 </tests> |
| 192 <help> | 207 <help> |
| 193 <