changeset 40:b92b5fe51845 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit ea12f973df4b97a2691d9e4ce6bf6fae59d57717"
author bgruening
date Fri, 30 Apr 2021 23:31:41 +0000
parents 4ee54a853377
children 58f4081dabf4
files association_rules.py fitted_model_eval.py keras_deep_learning.py keras_train_and_eval.py label_encoder.py ml_visualization_ex.py model_prediction.py search_model_validation.py simple_model_fit.py stacking_ensembles.py test-data/le_input_w_header.tabular test-data/le_input_wo_header.tabular test-data/le_output.tabular test-data/mba_input_int_w.tabular test-data/mba_input_int_wo.tabular test-data/mba_input_str_w.tabular test-data/mba_input_str_wo.tabular test-data/mba_out_str.tabular test-data/mba_output_int.tabular test-data/mba_output_str.tabular to_categorical.py train_test_eval.py train_test_split.py
diffstat 23 files changed, 649 insertions(+), 266 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/association_rules.py	Fri Apr 30 23:31:41 2021 +0000
@@ -0,0 +1,116 @@
+import argparse
+import json
+import warnings
+
+import pandas as pd
+from mlxtend.frequent_patterns import association_rules, fpgrowth
+from mlxtend.preprocessing import TransactionEncoder
+
+
+def main(inputs, infile, outfile, min_support=0.5, min_confidence=0.5, min_lift=1.0, min_conviction=1.0, max_length=None):
+    """
+    Parameters
+    ----------
+    inputs : str
+        File path to the JSON file holding the Galaxy tool parameters
+
+    infile : str
+        File path of the tab-separated input transactions, one per line
+
+    outfile : str
+        File path of the tab-separated output table of association rules
+
+    min_support: float
+        Minimum support
+
+    min_confidence: float
+        Minimum confidence
+
+    min_lift: float
+        Minimum lift
+
+    min_conviction: float
+        Minimum conviction
+
+    max_length: int
+        Maximum length of the itemsets generated (passed to fpgrowth as max_len)
+
+    """
+    warnings.simplefilter('ignore')
+
+    with open(inputs, 'r') as param_handler:
+        params = json.load(param_handler)
+
+    input_header = params['header0']
+    header = 'infer' if input_header else None
+
+    with open(infile) as fp:
+        lines = fp.read().splitlines()
+
+    if header is not None:
+        lines = lines[1:]
+
+    dataset = []
+    for line in lines:
+        line_items = line.split("\t")
+        dataset.append(line_items)
+
+    # TransactionEncoder learns the unique labels in the dataset and transforms the
+    # input dataset (a Python list of lists) into a one-hot encoded NumPy boolean array
+    te = TransactionEncoder()
+    te_ary = te.fit_transform(dataset)
+
+    # Turn the encoded NumPy array into a DataFrame
+    df = pd.DataFrame(te_ary, columns=te.columns_)
+
+    # Extract frequent itemsets for association rule mining
+    # use_colnames: Use DataFrames' column names in the returned DataFrame instead of column indices
+    frequent_itemsets = fpgrowth(df, min_support=min_support, use_colnames=True, max_len=max_length)
+
+    # Get association rules, with confidence larger than min_confidence
+    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)
+
+    # Filter association rules, keeping rules with lift and conviction of at least min_lift and min_conviction
+    rules = rules[(rules['lift'] >= min_lift) & (rules['conviction'] >= min_conviction)]
+
+    # Convert columns from frozenset to list (more readable)
+    rules['antecedents'] = rules['antecedents'].apply(list)
+    rules['consequents'] = rules['consequents'].apply(list)
+
+    # The next 3 steps are intended to fix the order of the association
+    # rules generated, so tests that rely on diff'ing the actual output
+    # against an expected output can pass
+
+    # 1) Sort the items within every entry of columns 'antecedents' and 'consequents'
+    rules['antecedents'] = rules['antecedents'].apply(lambda row: sorted(row))
+    rules['consequents'] = rules['consequents'].apply(lambda row: sorted(row))
+
+    # 2) Create two temporary string columns to sort on
+    rules['ant_str'] = rules['antecedents'].apply(lambda row: " ".join(row))
+    rules['con_str'] = rules['consequents'].apply(lambda row: " ".join(row))
+
+    # 3) Sort results so they are reproducible
+    rules.sort_values(by=['ant_str', 'con_str'], inplace=True)
+    del rules['ant_str']
+    del rules['con_str']
+    rules.reset_index(drop=True, inplace=True)
+
+    # Write association rules and metrics to file
+    rules.to_csv(outfile, sep="\t", index=False)
+
+
+if __name__ == '__main__':
+    aparser = argparse.ArgumentParser()
+    aparser.add_argument("-i", "--inputs", dest="inputs", required=True)
+    aparser.add_argument("-y", "--infile", dest="infile", required=True)
+    aparser.add_argument("-o", "--outfile", dest="outfile", required=True)
+    aparser.add_argument("-s", "--support", dest="support", default=0.5)
+    aparser.add_argument("-c", "--confidence", dest="confidence", default=0.5)
+    aparser.add_argument("-l", "--lift", dest="lift", default=1.0)
+    aparser.add_argument("-v", "--conviction", dest="conviction", default=1.0)
+    aparser.add_argument("-t", "--length", dest="length", default=5)
+    args = aparser.parse_args()
+
+    main(args.inputs, args.infile, args.outfile,
+         min_support=float(args.support), min_confidence=float(args.confidence),
+         min_lift=float(args.lift), min_conviction=float(args.conviction), max_length=int(args.length))
--- a/fitted_model_eval.py	Tue Apr 13 21:23:33 2021 +0000
+++ b/fitted_model_eval.py	Fri Apr 30 23:31:41 2021 +0000
@@ -30,7 +30,9 @@
     # tabular input
     if input_type == "tabular":
         header = "infer" if params["input_options"]["header1"] else None
-        column_option = params["input_options"]["column_selector_options_1"]["selected_column_selector_option"]
+        column_option = params["input_options"]["column_selector_options_1"][
+            "selected_column_selector_option"
+        ]
         if column_option in [
             "by_index_number",
             "all_but_by_index_number",
@@ -52,7 +54,9 @@
 
     # Get target y
     header = "infer" if params["input_options"]["header2"] else None
-    column_option = params["input_options"]["column_selector_options_2"]["selected_column_selector_option2"]
+    column_option = params["input_options"]["column_selector_options_2"][
+        "selected_column_selector_option2"
+    ]
     if column_option in [
         "by_index_number",
         "all_but_by_index_number",
@@ -70,7 +74,9 @@
         infile2 = pd.read_csv(infile2, sep="\t", header=header, parse_dates=True)
         loaded_df[df_key] = infile2
 
-    y = read_columns(infile2, c=c, c_option=column_option, sep="\t", header=header, parse_dates=True)
+    y = read_columns(
+        infile2, c=c, c_option=column_option, sep="\t", header=header, parse_dates=True
+    )
     if len(y.shape) == 2 and y.shape[1] == 1:
         y = y.ravel()
 
@@ -123,7 +129,8 @@
     if hasattr(main_est, "config") and hasattr(main_est, "load_weights"):
         if not infile_weights or infile_weights == "None":
             raise ValueError(
-                "The selected model skeleton asks for weights, " "but no dataset for weights was provided!"
+                "The selected model skeleton asks for weights, "
+                "but no dataset for weights was provided!"
             )
         main_est.load_weights(infile_weights)
 
@@ -142,7 +149,9 @@
     scorer, _ = _check_multimetric_scoring(estimator, scoring=scorer)
 
     if hasattr(estimator, "evaluate"):
-        scores = estimator.evaluate(X_test, y_test=y_test, scorer=scorer, is_multimetric=True)
+        scores = estimator.evaluate(
+            X_test, y_test=y_test, scorer=scorer, is_multimetric=True
+        )
     else:
         scores = _score(estimator, X_test, y_test, scorer, is_multimetric=True)
 
--- a/keras_deep_learning.py	Tue Apr 13 21:23:33 2021 +0000
+++ b/keras_deep_learning.py	Fri Apr 30 23:31:41 2021 +0000
@@ -10,12 +10,12 @@
 from galaxy_ml.utils import get_search_params, SafeEval, try_get_attr
 from keras.models import Model, Sequential
 
-
 safe_eval = SafeEval()
 
 
 def _handle_shape(literal):
-    """Eval integer or list/tuple of integers from string
+    """
+    Eval integer or list/tuple of integers from string
 
     Parameters:
     -----------
@@ -32,7 +32,8 @@
 
 
 def _handle_regularizer(literal):
-    """Construct regularizer from string literal
+    """
+    Construct regularizer from string literal
 
     Parameters
     ----------
@@ -48,15 +49,16 @@
         return None
 
     if l1 is None:
-        l1 = 0.
+        l1 = 0.0
     if l2 is None:
-        l2 = 0.
+        l2 = 0.0
 
     return keras.regularizers.l1_l2(l1=l1, l2=l2)
 
 
 def _handle_constraint(config):
-    """Construct constraint from galaxy tool parameters.
+    """
+    Construct constraint from galaxy tool parameters.
     Suppose correct dictionary format
 
     Parameters
@@ -72,14 +74,14 @@
                 "MinMaxNorm"
             }
     """
-    constraint_type = config['constraint_type']
-    if constraint_type in ('None', ''):
+    constraint_type = config["constraint_type"]
+    if constraint_type in ("None", ""):
         return None
 
     klass = getattr(keras.constraints, constraint_type)
-    options = config.get('constraint_options', {})
-    if 'axis' in options:
-        options['axis'] = literal_eval(options['axis'])
+    options = config.get("constraint_options", {})
+    if "axis" in options:
+        options["axis"] = literal_eval(options["axis"])
 
     return klass(**options)
 
@@ -89,62 +91,82 @@
 
 
 def _handle_layer_parameters(params):
-    """Access to handle all kinds of parameters
+    """
+    Access to handle all kinds of parameters
     """
     for key, value in six.iteritems(params):
-        if value in ('None', ''):
+        if value in ("None", ""):
             params[key] = None
             continue
 
-        if type(value) in [int, float, bool]\
-                or (type(value) is str and value.isalpha()):
+        if type(value) in [int, float, bool] or (
+            type(value) is str and value.isalpha()
+        ):
             continue
 
-        if key in ['input_shape', 'noise_shape', 'shape', 'batch_shape',
-                   'target_shape', 'dims', 'kernel_size', 'strides',
-                   'dilation_rate', 'output_padding', 'cropping', 'size',
-                   'padding', 'pool_size', 'axis', 'shared_axes'] \
-                and isinstance(value, str):
+        if (
+            key
+            in [
+                "input_shape",
+                "noise_shape",
+                "shape",
+                "batch_shape",
+                "target_shape",
+                "dims",
+                "kernel_size",
+                "strides",
+                "dilation_rate",
+                "output_padding",
+                "cropping",
+                "size",
+                "padding",
+                "pool_size",
+                "axis",
+                "shared_axes",
+            ]
+            and isinstance(value, str)
+        ):
             params[key] = _handle_shape(value)
 
-        elif key.endswith('_regularizer') and isinstance(value, dict):
+        elif key.endswith("_regularizer") and isinstance(value, dict):
             params[key] = _handle_regularizer(value)
 
-        elif key.endswith('_constraint') and isinstance(value, dict):
+        elif key.endswith("_constraint") and isinstance(value, dict):
             params[key] = _handle_constraint(value)
 
-        elif key == 'function':  # No support for lambda/function eval
+        elif key == "function":  # No support for lambda/function eval
             params.pop(key)
 
     return params
 
 
 def get_sequential_model(config):
-    """Construct keras Sequential model from Galaxy tool parameters
+    """
+    Construct keras Sequential model from Galaxy tool parameters
 
     Parameters:
     -----------
     config : dictionary, galaxy tool parameters loaded by JSON
     """
     model = Sequential()
-    input_shape = _handle_shape(config['input_shape'])
-    layers = config['layers']
+    input_shape = _handle_shape(config["input_shape"])
+    layers = config["layers"]
     for layer in layers:
-        options = layer['layer_selection']
-        layer_type = options.pop('layer_type')
+        options = layer["layer_selection"]
+        layer_type = options.pop("layer_type")
         klass = getattr(keras.layers, layer_type)
-        kwargs = options.pop('kwargs', '')
+        kwargs = options.pop("kwargs", "")
 
         # parameters needs special care
         options = _handle_layer_parameters(options)
 
         if kwargs:
-            kwargs = safe_eval('dict(' + kwargs + ')')
+            kwargs = safe_eval("dict(" + kwargs + ")")
             options.update(kwargs)
 
         # add input_shape to the first layer only
-        if not getattr(model, '_layers') and input_shape is not None:
-            options['input_shape'] = input_shape
+        if not getattr(model, "_layers") and input_shape is not None:
+            options["input_shape"] = input_shape
 
         model.add(klass(**options))
 
@@ -152,31 +174,32 @@
 
 
 def get_functional_model(config):
-    """Construct keras functional model from Galaxy tool parameters
+    """
+    Construct keras functional model from Galaxy tool parameters
 
     Parameters
     -----------
     config : dictionary, galaxy tool parameters loaded by JSON
     """
-    layers = config['layers']
+    layers = config["layers"]
     all_layers = []
     for layer in layers:
-        options = layer['layer_selection']
-        layer_type = options.pop('layer_type')
+        options = layer["layer_selection"]
+        layer_type = options.pop("layer_type")
         klass = getattr(keras.layers, layer_type)
-        inbound_nodes = options.pop('inbound_nodes', None)
-        kwargs = options.pop('kwargs', '')
+        inbound_nodes = options.pop("inbound_nodes", None)
+        kwargs = options.pop("kwargs", "")
 
         # parameters needs special care
         options = _handle_layer_parameters(options)
 
         if kwargs:
-            kwargs = safe_eval('dict(' + kwargs + ')')
+            kwargs = safe_eval("dict(" + kwargs + ")")
             options.update(kwargs)
 
         # merge layers
-        if 'merging_layers' in options:
-            idxs = literal_eval(options.pop('merging_layers'))
+        if "merging_layers" in options:
+            idxs = literal_eval(options.pop("merging_layers"))
             merging_layers = [all_layers[i - 1] for i in idxs]
             new_layer = klass(**options)(merging_layers)
         # non-input layers
@@ -188,41 +211,43 @@
 
         all_layers.append(new_layer)
 
-    input_indexes = _handle_shape(config['input_layers'])
+    input_indexes = _handle_shape(config["input_layers"])
     input_layers = [all_layers[i - 1] for i in input_indexes]
 
-    output_indexes = _handle_shape(config['output_layers'])
+    output_indexes = _handle_shape(config["output_layers"])
     output_layers = [all_layers[i - 1] for i in output_indexes]
 
     return Model(inputs=input_layers, outputs=output_layers)
 
 
 def get_batch_generator(config):
-    """Construct keras online data generator from Galaxy tool parameters
+    """
+    Construct keras online data generator from Galaxy tool parameters
 
     Parameters
     -----------
     config : dictionary, galaxy tool parameters loaded by JSON
     """
-    generator_type = config.pop('generator_type')
-    if generator_type == 'none':
+    generator_type = config.pop("generator_type")
+    if generator_type == "none":
         return None
 
-    klass = try_get_attr('galaxy_ml.preprocessors', generator_type)
+    klass = try_get_attr("galaxy_ml.preprocessors", generator_type)
 
-    if generator_type == 'GenomicIntervalBatchGenerator':
-        config['ref_genome_path'] = 'to_be_determined'
-        config['intervals_path'] = 'to_be_determined'
-        config['target_path'] = 'to_be_determined'
-        config['features'] = 'to_be_determined'
+    if generator_type == "GenomicIntervalBatchGenerator":
+        config["ref_genome_path"] = "to_be_determined"
+        config["intervals_path"] = "to_be_determined"
+        config["target_path"] = "to_be_determined"
+        config["features"] = "to_be_determined"
     else:
-        config['fasta_path'] = 'to_be_determined'
+        config["fasta_path"] = "to_be_determined"
 
     return klass(**config)
 
 
 def config_keras_model(inputs, outfile):
-    """ config keras model layers and output JSON
+    """
+    config keras model layers and output JSON
 
     Parameters
     ----------
@@ -232,23 +257,30 @@
     outfile : str
         Path to galaxy dataset containing keras model JSON.
     """
-    model_type = inputs['model_selection']['model_type']
-    layers_config = inputs['model_selection']
+    model_type = inputs["model_selection"]["model_type"]
+    layers_config = inputs["model_selection"]
 
-    if model_type == 'sequential':
+    if model_type == "sequential":
         model = get_sequential_model(layers_config)
     else:
         model = get_functional_model(layers_config)
 
     json_string = model.to_json()
 
-    with open(outfile, 'w') as f:
+    with open(outfile, "w") as f:
         json.dump(json.loads(json_string), f, indent=2)
 
 
-def build_keras_model(inputs, outfile, model_json, infile_weights=None,
-                      batch_mode=False, outfile_params=None):
-    """ for `keras_model_builder` tool
+def build_keras_model(
+    inputs,
+    outfile,
+    model_json,
+    infile_weights=None,
+    batch_mode=False,
+    outfile_params=None,
+):
+    """
+    for `keras_model_builder` tool
 
     Parameters
     ----------
@@ -265,75 +297,81 @@
     outfile_params : str, default=None
         File path to search parameters output.
     """
-    with open(model_json, 'r') as f:
+    with open(model_json, "r") as f:
         json_model = json.load(f)
 
-    config = json_model['config']
+    config = json_model["config"]
 
     options = {}
 
-    if json_model['class_name'] == 'Sequential':
-        options['model_type'] = 'sequential'
+    if json_model["class_name"] == "Sequential":
+        options["model_type"] = "sequential"
         klass = Sequential
-    elif json_model['class_name'] == 'Model':
-        options['model_type'] = 'functional'
+    elif json_model["class_name"] == "Model":
+        options["model_type"] = "functional"
         klass = Model
     else:
-        raise ValueError("Unknow Keras model class: %s"
-                         % json_model['class_name'])
+        raise ValueError("Unknow Keras model class: %s" % json_model["class_name"])
 
     # load prefitted model
-    if inputs['mode_selection']['mode_type'] == 'prefitted':
+    if inputs["mode_selection"]["mode_type"] == "prefitted":
         estimator = klass.from_config(config)
         estimator.load_weights(infile_weights)
     # build train model
     else:
-        cls_name = inputs['mode_selection']['learning_type']
-        klass = try_get_attr('galaxy_ml.keras_galaxy_models', cls_name)
+        cls_name = inputs["mode_selection"]["learning_type"]
+        klass = try_get_attr("galaxy_ml.keras_galaxy_models", cls_name)
 
-        options['loss'] = (inputs['mode_selection']
-                           ['compile_params']['loss'])
-        options['optimizer'] =\
-            (inputs['mode_selection']['compile_params']
-             ['optimizer_selection']['optimizer_type']).lower()
+        options["loss"] = inputs["mode_selection"]["compile_params"]["loss"]
+        options["optimizer"] = (
+            inputs["mode_selection"]["compile_params"]["optimizer_selection"][
+                "optimizer_type"
+            ]
+        ).lower()
 
-        options.update((inputs['mode_selection']['compile_params']
-                        ['optimizer_selection']['optimizer_options']))
+        options.update(
+            (
+                inputs["mode_selection"]["compile_params"]["optimizer_selection"][
+                    "optimizer_options"
+                ]
+            )
+        )
 
-        train_metrics = inputs['mode_selection']['compile_params']['metrics']
-        if train_metrics[-1] == 'none':
+        train_metrics = inputs["mode_selection"]["compile_params"]["metrics"]
+        if train_metrics[-1] == "none":
             train_metrics = train_metrics[:-1]
-        options['metrics'] = train_metrics
+        options["metrics"] = train_metrics
 
-        options.update(inputs['mode_selection']['fit_params'])
-        options['seed'] = inputs['mode_selection']['random_seed']
+        options.update(inputs["mode_selection"]["fit_params"])
+        options["seed"] = inputs["mode_selection"]["random_seed"]
 
         if batch_mode:
-            generator = get_batch_generator(inputs['mode_selection']
-                                            ['generator_selection'])
-            options['data_batch_generator'] = generator
-            options['prediction_steps'] = \
-                inputs['mode_selection']['prediction_steps']
-            options['class_positive_factor'] = \
-                inputs['mode_selection']['class_positive_factor']
+            generator = get_batch_generator(
+                inputs["mode_selection"]["generator_selection"]
+            )
+            options["data_batch_generator"] = generator
+            options["prediction_steps"] = inputs["mode_selection"]["prediction_steps"]
+            options["class_positive_factor"] = inputs["mode_selection"][
+                "class_positive_factor"
+            ]
         estimator = klass(config, **options)
         if outfile_params:
             hyper_params = get_search_params(estimator)
             # TODO: remove this after making `verbose` tunable
             for h_param in hyper_params:
-                if h_param[1].endswith('verbose'):
-                    h_param[0] = '@'
-            df = pd.DataFrame(hyper_params, columns=['', 'Parameter', 'Value'])
-            df.to_csv(outfile_params, sep='\t', index=False)
+                if h_param[1].endswith("verbose"):
+                    h_param[0] = "@"
+            df = pd.DataFrame(hyper_params, columns=["", "Parameter", "Value"])
+            df.to_csv(outfile_params, sep="\t", index=False)
 
     print(repr(estimator))
     # save model by pickle
-    with open(outfile, 'wb') as f:
+    with open(outfile, "wb") as f:
         pickle.dump(estimator, f, pickle.HIGHEST_PROTOCOL)
 
 
-if __name__ == '__main__':
-    warnings.simplefilter('ignore')
+if __name__ == "__main__":
+    warnings.simplefilter("ignore")
 
     aparser = argparse.ArgumentParser()
     aparser.add_argument("-i", "--inputs", dest="inputs", required=True)
@@ -345,7 +383,7 @@
     args = aparser.parse_args()
 
     input_json_path = args.inputs
-    with open(input_json_path, 'r') as param_handler:
+    with open(input_json_path, "r") as param_handler:
         inputs = json.load(param_handler)
 
     tool_id = args.tool_id
@@ -355,18 +393,20 @@
     infile_weights = args.infile_weights
 
     # for keras_model_config tool
-    if tool_id == 'keras_model_config':
+    if tool_id == "keras_model_config":
         config_keras_model(inputs, outfile)
 
     # for keras_model_builder tool
     else:
         batch_mode = False
-        if tool_id == 'keras_batch_models':
+        if tool_id == "keras_batch_models":
             batch_mode = True
 
-        build_keras_model(inputs=inputs,
-                          model_json=model_json,
-                          infile_weights=infile_weights,
-                          batch_mode=batch_mode,
-                          outfile=outfile,
-                          outfile_params=outfile_params)
+        build_keras_model(
+            inputs=inputs,
+            model_json=model_json,
+            infile_weights=infile_weights,
+            batch_mode=batch_mode,
+            outfile=outfile,
+            outfile_params=outfile_params,
+        )
--- a/keras_train_and_eval.py	Tue Apr 13 21:23:33 2021 +0000
+++ b/keras_train_and_eval.py	Fri Apr 30 23:31:41 2021 +0000
@@ -11,16 +11,9 @@
 from galaxy_ml.externals.selene_sdk.utils import compute_score
 from galaxy_ml.keras_galaxy_models import _predict_generator
 from galaxy_ml.model_validations import train_test_split
-from galaxy_ml.utils import (
-    clean_params,
-    get_main_estimator,
-    get_module,
-    get_scoring,
-    load_model,
-    read_columns,
-    SafeEval,
-    try_get_attr,
-)
+from galaxy_ml.utils import (clean_params, get_main_estimator,
+                             get_module, get_scoring, load_model, read_columns,
+                             SafeEval, try_get_attr)
 from scipy.io import mmread
 from sklearn.metrics.scorer import _check_multimetric_scoring
 from sklearn.model_selection import _search, _validation
@@ -28,7 +21,6 @@
 from sklearn.pipeline import Pipeline
 from sklearn.utils import indexable, safe_indexing
 
-
 _fit_and_score = try_get_attr("galaxy_ml.model_validations", "_fit_and_score")
 setattr(_search, "_fit_and_score", _fit_and_score)
 setattr(_validation, "_fit_and_score", _fit_and_score)
@@ -56,7 +48,10 @@
 
         param_name = p["sp_name"]
         if param_name.lower().endswith(NON_SEARCHABLE):
-            warnings.warn("Warning: `%s` is not eligible for search and was " "omitted!" % param_name)
+            warnings.warn(
+                "Warning: `%s` is not eligible for search and was "
+                "omitted!" % param_name
+            )
             continue
 
         if not swap_value.startswith(":"):
@@ -99,7 +94,11 @@
         index_arr = np.arange(n_samples)
         test = index_arr[np.isin(groups, group_names)]
         train = index_arr[~np.isin(groups, group_names)]
-        rval = list(chain.from_iterable((safe_indexing(a, train), safe_indexing(a, test)) for a in new_arrays))
+        rval = list(
+            chain.from_iterable(
+                (safe_indexing(a, train), safe_indexing(a, test)) for a in new_arrays
+            )
+        )
     else:
         rval = train_test_split(*new_arrays, **kwargs)
 
@@ -127,14 +126,22 @@
         pred_labels = (pred_probas > 0.5).astype("int32")
         targets = y_true.ravel().astype("int32")
         if not is_multimetric:
-            preds = pred_labels if scorer.__class__.__name__ == "_PredictScorer" else pred_probas
+            preds = (
+                pred_labels
+                if scorer.__class__.__name__ == "_PredictScorer"
+                else pred_probas
+            )
             score = scorer._score_func(targets, preds, **scorer._kwargs)
 
             return score
         else:
             scores = {}
             for name, one_scorer in scorer.items():
-                preds = pred_labels if one_scorer.__class__.__name__ == "_PredictScorer" else pred_probas
+                preds = (
+                    pred_labels
+                    if one_scorer.__class__.__name__ == "_PredictScorer"
+                    else pred_probas
+                )
                 score = one_scorer._score_func(targets, preds, **one_scorer._kwargs)
                 scores[name] = score
 
@@ -144,13 +151,21 @@
         pred_labels = (pred_probas > 0.5).astype("int32")
         targets = y_true.astype("int32")
         if not is_multimetric:
-            preds = pred_labels if scorer.__class__.__name__ == "_PredictScorer" else pred_probas
+            preds = (
+                pred_labels
+                if scorer.__class__.__name__ == "_PredictScorer"
+                else pred_probas
+            )
             score, _ = compute_score(preds, targets, scorer._score_func)
             return score
         else:
             scores = {}
             for name, one_scorer in scorer.items():
-                preds = pred_labels if one_scorer.__class__.__name__ == "_PredictScorer" else pred_probas
+                preds = (
+                    pred_labels
+                    if one_scorer.__class__.__name__ == "_PredictScorer"
+                    else pred_probas
+                )
                 score, _ = compute_score(preds, targets, one_scorer._score_func)
                 scores[name] = score
 
@@ -243,7 +258,9 @@
     # tabular input
     if input_type == "tabular":
         header = "infer" if params["input_options"]["header1"] else None
-        column_option = params["input_options"]["column_selector_options_1"]["selected_column_selector_option"]
+        column_option = params["input_options"]["column_selector_options_1"][
+            "selected_column_selector_option"
+        ]
         if column_option in [
             "by_index_number",
             "all_but_by_index_number",
@@ -295,7 +312,9 @@
 
     # Get target y
     header = "infer" if params["input_options"]["header2"] else None
-    column_option = params["input_options"]["column_selector_options_2"]["selected_column_selector_option2"]
+    column_option = params["input_options"]["column_selector_options_2"][
+        "selected_column_selector_option2"
+    ]
     if column_option in [
         "by_index_number",
         "all_but_by_index_number",
@@ -313,12 +332,9 @@
         infile2 = pd.read_csv(infile2, sep="\t", header=header, parse_dates=True)
         loaded_df[df_key] = infile2
 
-    y = read_columns(infile2,
-                     c=c,
-                     c_option=column_option,
-                     sep='\t',
-                     header=header,
-                     parse_dates=True)
+    y = read_columns(
+        infile2, c=c, c_option=column_option, sep="\t", header=header, parse_dates=True
+    )
     if len(y.shape) == 2 and y.shape[1] == 1:
         y = y.ravel()
     if input_type == "refseq_and_interval":
@@ -328,10 +344,14 @@
 
     # load groups
     if groups:
-        groups_selector = (params["experiment_schemes"]["test_split"]["split_algos"]).pop("groups_selector")
+        groups_selector = (
+            params["experiment_schemes"]["test_split"]["split_algos"]
+        ).pop("groups_selector")
 
         header = "infer" if groups_selector["header_g"] else None
-        column_option = groups_selector["column_selector_options_g"]["selected_column_selector_option_g"]
+        column_option = groups_selector["column_selector_options_g"][
+            "selected_column_selector_option_g"
+        ]
         if column_option in [
             "by_index_number",
             "all_but_by_index_number",
@@ -346,12 +366,14 @@
         if df_key in loaded_df:
             groups = loaded_df[df_key]
 
-        groups = read_columns(groups,
-                              c=c,
-                              c_option=column_option,
-                              sep='\t',
-                              header=header,
-                              parse_dates=True)
+        groups = read_columns(
+            groups,
+            c=c,
+            c_option=column_option,
+            sep="\t",
+            header=header,
+            parse_dates=True,
+        )
         groups = groups.ravel()
 
     # del loaded_df
@@ -364,7 +386,7 @@
         main_est.set_params(memory=memory)
 
     # handle scorer, convert to scorer dict
-    scoring = params['experiment_schemes']['metrics']['scoring']
+    scoring = params["experiment_schemes"]["metrics"]["scoring"]
     if scoring is not None:
         # get_scoring() expects secondary_scoring to be a comma separated string (not a list)
         # Check if secondary_scoring is specified
@@ -385,7 +407,9 @@
         if y is not None:
             test_split_options["labels"] = y
         else:
-            raise ValueError("Stratified shuffle split is not " "applicable on empty target values!")
+            raise ValueError(
+                "Stratified shuffle split is not " "applicable on empty target values!"
+            )
 
     (
         X_train,
@@ -408,7 +432,10 @@
             if y_train is not None:
                 val_split_options["labels"] = y_train
             else:
-                raise ValueError("Stratified shuffle split is not " "applicable on empty target values!")
+                raise ValueError(
+                    "Stratified shuffle split is not "
+                    "applicable on empty target values!"
+                )
 
         (
             X_train,
@@ -431,8 +458,12 @@
     if hasattr(estimator, "evaluate"):
         steps = estimator.prediction_steps
         batch_size = estimator.batch_size
-        generator = estimator.data_generator_.flow(X_test, y=y_test, batch_size=batch_size)
-        predictions, y_true = _predict_generator(estimator.model_, generator, steps=steps)
+        generator = estimator.data_generator_.flow(
+            X_test, y=y_test, batch_size=batch_size
+        )
+        predictions, y_true = _predict_generator(
+            estimator.model_, generator, steps=steps
+        )
         scores = _evaluate(y_true, predictions, scorer, is_multimetric=True)
 
     else:
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/label_encoder.py	Fri Apr 30 23:31:41 2021 +0000
@@ -0,0 +1,48 @@
+import argparse
+import json
+import warnings
+
+import numpy as np
+import pandas as pd
+from sklearn.preprocessing import LabelEncoder
+
+
+def main(inputs, infile, outfile):
+    """
+    Parameters
+    ----------
+    inputs : str
+        File path to the Galaxy tool parameters (JSON)
+
+    infile : str
+        File path of the tab-separated input vector
+
+    outfile : str
+        File path to output vector
+
+    """
+    warnings.simplefilter('ignore')
+
+    with open(inputs, 'r') as param_handler:
+        params = json.load(param_handler)
+
+    input_header = params['header0']
+    header = 'infer' if input_header else None
+
+    input_vector = pd.read_csv(infile, sep='\t', header=header)
+
+    le = LabelEncoder()
+
+    output_vector = le.fit_transform(input_vector)
+
+    np.savetxt(outfile, output_vector, fmt="%d", delimiter='\t')
+
+
+if __name__ == '__main__':
+    aparser = argparse.ArgumentParser()
+    aparser.add_argument("-i", "--inputs", dest="inputs", required=True)
+    aparser.add_argument("-y", "--infile", dest="infile")
+    aparser.add_argument("-o", "--outfile", dest="outfile")
+    args = aparser.parse_args()
+
+    main(args.inputs, args.infile, args.outfile)
--- a/ml_visualization_ex.py	Tue Apr 13 21:23:33 2021 +0000
+++ b/ml_visualization_ex.py	Fri Apr 30 23:31:41 2021 +0000
@@ -13,10 +13,10 @@
 from keras.models import model_from_json
 from keras.utils import plot_model
 from sklearn.feature_selection.base import SelectorMixin
-from sklearn.metrics import auc, average_precision_score, confusion_matrix, precision_recall_curve, roc_curve
+from sklearn.metrics import (auc, average_precision_score, confusion_matrix,
+                             precision_recall_curve, roc_curve)
 from sklearn.pipeline import Pipeline
 
-
 safe_eval = SafeEval()
 
 # plotly default colors
@@ -51,7 +51,9 @@
         y_true = df1.iloc[:, idx].values
         y_score = df2.iloc[:, idx].values
 
-        precision, recall, _ = precision_recall_curve(y_true, y_score, pos_label=pos_label)
+        precision, recall, _ = precision_recall_curve(
+            y_true, y_score, pos_label=pos_label
+        )
         ap = average_precision_score(y_true, y_score, pos_label=pos_label or 1)
 
         trace = go.Scatter(
@@ -111,7 +113,9 @@
         y_true = df1.iloc[:, idx].values
         y_score = df2.iloc[:, idx].values
 
-        precision, recall, _ = precision_recall_curve(y_true, y_score, pos_label=pos_label)
+        precision, recall, _ = precision_recall_curve(
+            y_true, y_score, pos_label=pos_label
+        )
         ap = average_precision_score(y_true, y_score, pos_label=pos_label or 1)
 
         plt.step(
@@ -155,7 +159,9 @@
         y_true = df1.iloc[:, idx].values
         y_score = df2.iloc[:, idx].values
 
-        fpr, tpr, _ = roc_curve(y_true, y_score, pos_label=pos_label, drop_intermediate=drop_intermediate)
+        fpr, tpr, _ = roc_curve(
+            y_true, y_score, pos_label=pos_label, drop_intermediate=drop_intermediate
+        )
         roc_auc = auc(fpr, tpr)
 
         trace = go.Scatter(
@@ -168,7 +174,9 @@
         data.append(trace)
 
     layout = go.Layout(
-        xaxis=dict(title="False Positive Rate", linecolor="lightslategray", linewidth=1),
+        xaxis=dict(
+            title="False Positive Rate", linecolor="lightslategray", linewidth=1
+        ),
         yaxis=dict(title="True Positive Rate", linecolor="lightslategray", linewidth=1),
         title=dict(
             text=title or "Receiver Operating Characteristic (ROC) Curve",
@@ -204,7 +212,9 @@
     os.rename("output.html", "output")
 
 
-def visualize_roc_curve_matplotlib(df1, df2, pos_label, drop_intermediate=True, title=None):
+def visualize_roc_curve_matplotlib(
+    df1, df2, pos_label, drop_intermediate=True, title=None
+):
     """visualize roc-curve using matplotlib and output svg image"""
     backend = matplotlib.get_backend()
     if "inline" not in backend:
@@ -216,7 +226,9 @@
         y_true = df1.iloc[:, idx].values
         y_score = df2.iloc[:, idx].values
 
-        fpr, tpr, _ = roc_curve(y_true, y_score, pos_label=pos_label, drop_intermediate=drop_intermediate)
+        fpr, tpr, _ = roc_curve(
+            y_true, y_score, pos_label=pos_label, drop_intermediate=drop_intermediate
+        )
         roc_auc = auc(fpr, tpr)
 
         plt.step(
@@ -253,11 +265,15 @@
         col = plot_selection[column_name]["col1"]
     else:
         col = None
-    _, input_df = read_columns(file_path, c=col,
-                               c_option=column_option,
-                               return_df=True,
-                               sep='\t', header=header,
-                               parse_dates=True)
+    _, input_df = read_columns(
+        file_path,
+        c=col,
+        c_option=column_option,
+        return_df=True,
+        sep="\t",
+        header=header,
+        parse_dates=True,
+    )
     return input_df
 
 
@@ -344,7 +360,9 @@
         with open(infile_estimator, "rb") as estimator_handler:
             estimator = load_model(estimator_handler)
 
-        column_option = params["plotting_selection"]["column_selector_options"]["selected_column_selector_option"]
+        column_option = params["plotting_selection"]["column_selector_options"][
+            "selected_column_selector_option"
+        ]
         if column_option in [
             "by_index_number",
             "all_but_by_index_number",
@@ -379,7 +397,11 @@
         else:
             coefs = getattr(estimator, "feature_importances_", None)
         if coefs is None:
-            raise RuntimeError("The classifier does not expose " '"coef_" or "feature_importances_" ' "attributes")
+            raise RuntimeError(
+                "The classifier does not expose "
+                '"coef_" or "feature_importances_" '
+                "attributes"
+            )
 
         threshold = params["plotting_selection"]["threshold"]
         if threshold is not None:
@@ -454,7 +476,9 @@
         layout = go.Layout(
             xaxis=dict(title="Number of features selected"),
             yaxis=dict(title="Cross validation score"),
-            title=dict(text=title or None, x=0.5, y=0.92, xanchor="center", yanchor="top"),
+            title=dict(
+                text=title or None, x=0.5, y=0.92, xanchor="center", yanchor="top"
+            ),
             font=dict(family="sans-serif", size=11),
             # control backgroud colors
             plot_bgcolor="rgba(255,255,255,0)",
@@ -548,9 +572,13 @@
 
     elif plot_type == "classification_confusion_matrix":
         plot_selection = params["plotting_selection"]
-        input_true = get_dataframe(true_labels, plot_selection, "header_true", "column_selector_options_true")
+        input_true = get_dataframe(
+            true_labels, plot_selection, "header_true", "column_selector_options_true"
+        )
         header_predicted = "infer" if plot_selection["header_predicted"] else None
-        input_predicted = pd.read_csv(predicted_labels, sep="\t", parse_dates=True, header=header_predicted)
+        input_predicted = pd.read_csv(
+            predicted_labels, sep="\t", parse_dates=True, header=header_predicted
+        )
         true_classes = input_true.iloc[:, -1].copy()
         predicted_classes = input_predicted.iloc[:, -1].copy()
         axis_labels = list(set(true_classes))
--- a/model_prediction.py	Tue Apr 13 21:23:33 2021 +0000
+++ b/model_prediction.py	Fri Apr 30 23:31:41 2021 +0000
@@ -63,7 +63,8 @@
     if hasattr(main_est, "config") and hasattr(main_est, "load_weights"):
         if not infile_weights or infile_weights == "None":
             raise ValueError(
-                "The selected model skeleton asks for weights, " "but dataset for weights wan not selected!"
+                "The selected model skeleton asks for weights, "
+                "but dataset for weights wan not selected!"
             )
         main_est.load_weights(infile_weights)
 
@@ -72,7 +73,9 @@
     # tabular input
     if input_type == "tabular":
         header = "infer" if params["input_options"]["header1"] else None
-        column_option = params["input_options"]["column_selector_options_1"]["selected_column_selector_option"]
+        column_option = params["input_options"]["column_selector_options_1"][
+            "selected_column_selector_option"
+        ]
         if column_option in [
             "by_index_number",
             "all_but_by_index_number",
@@ -122,9 +125,13 @@
         pred_data_generator = klass(fasta_path, seq_length=seq_length)
 
         if params["method"] == "predict":
-            preds = estimator.predict(X, data_generator=pred_data_generator, steps=steps)
+            preds = estimator.predict(
+                X, data_generator=pred_data_generator, steps=steps
+            )
         else:
-            preds = estimator.predict_proba(X, data_generator=pred_data_generator, steps=steps)
+            preds = estimator.predict_proba(
+                X, data_generator=pred_data_generator, steps=steps
+            )
 
     # vcf input
     elif input_type == "variant_effect":
@@ -135,7 +142,9 @@
         if options["blacklist_regions"] == "none":
             options["blacklist_regions"] = None
 
-        pred_data_generator = klass(ref_genome_path=ref_seq, vcf_path=vcf_path, **options)
+        pred_data_generator = klass(
+            ref_genome_path=ref_seq, vcf_path=vcf_path, **options
+        )
 
         pred_data_generator.set_processing_attrs()
 
--- a/search_model_validation.py	Tue Apr 13 21:23:33 2021 +0000
+++ b/search_model_validation.py	Fri Apr 30 23:31:41 2021 +0000
@@ -11,31 +11,16 @@
 import numpy as np
 import pandas as pd
 import skrebate
-from galaxy_ml.utils import (
-    clean_params,
-    get_cv,
-    get_main_estimator,
-    get_module,
-    get_scoring,
-    load_model,
-    read_columns,
-    SafeEval,
-    try_get_attr
-)
+from galaxy_ml.utils import (clean_params, get_cv,
+                             get_main_estimator, get_module, get_scoring,
+                             load_model, read_columns, SafeEval, try_get_attr)
 from scipy.io import mmread
-from sklearn import (
-    cluster,
-    decomposition,
-    feature_selection,
-    kernel_approximation,
-    model_selection,
-    preprocessing,
-)
+from sklearn import (cluster, decomposition, feature_selection,
+                     kernel_approximation, model_selection, preprocessing)
 from sklearn.exceptions import FitFailedWarning
 from sklearn.model_selection import _search, _validation
 from sklearn.model_selection._validation import _score, cross_validate
 
-
 _fit_and_score = try_get_attr("galaxy_ml.model_validations", "_fit_and_score")
 setattr(_search, "_fit_and_score", _fit_and_score)
 setattr(_validation, "_fit_and_score", _fit_and_score)
@@ -57,7 +42,10 @@
 
         param_name = p["sp_name"]
         if param_name.lower().endswith(NON_SEARCHABLE):
-            print("Warning: `%s` is not eligible for search and was " "omitted!" % param_name)
+            print(
+                "Warning: `%s` is not eligible for search and was "
+                "omitted!" % param_name
+            )
             continue
 
         if not search_list.startswith(":"):
@@ -90,7 +78,9 @@
                 decomposition.IncrementalPCA(),
                 decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS),
                 decomposition.LatentDirichletAllocation(random_state=0, n_jobs=N_JOBS),
-                decomposition.MiniBatchDictionaryLearning(random_state=0, n_jobs=N_JOBS),
+                decomposition.MiniBatchDictionaryLearning(
+                    random_state=0, n_jobs=N_JOBS
+                ),
                 decomposition.MiniBatchSparsePCA(random_state=0, n_jobs=N_JOBS),
                 decomposition.NMF(random_state=0),
                 decomposition.PCA(random_state=0),
@@ -107,14 +97,26 @@
                 skrebate.MultiSURF(n_jobs=N_JOBS),
                 skrebate.MultiSURFstar(n_jobs=N_JOBS),
                 imblearn.under_sampling.ClusterCentroids(random_state=0, n_jobs=N_JOBS),
-                imblearn.under_sampling.CondensedNearestNeighbour(random_state=0, n_jobs=N_JOBS),
-                imblearn.under_sampling.EditedNearestNeighbours(random_state=0, n_jobs=N_JOBS),
-                imblearn.under_sampling.RepeatedEditedNearestNeighbours(random_state=0, n_jobs=N_JOBS),
+                imblearn.under_sampling.CondensedNearestNeighbour(
+                    random_state=0, n_jobs=N_JOBS
+                ),
+                imblearn.under_sampling.EditedNearestNeighbours(
+                    random_state=0, n_jobs=N_JOBS
+                ),
+                imblearn.under_sampling.RepeatedEditedNearestNeighbours(
+                    random_state=0, n_jobs=N_JOBS
+                ),
                 imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS),
-                imblearn.under_sampling.InstanceHardnessThreshold(random_state=0, n_jobs=N_JOBS),
+                imblearn.under_sampling.InstanceHardnessThreshold(
+                    random_state=0, n_jobs=N_JOBS
+                ),
                 imblearn.under_sampling.NearMiss(random_state=0, n_jobs=N_JOBS),
-                imblearn.under_sampling.NeighbourhoodCleaningRule(random_state=0, n_jobs=N_JOBS),
-                imblearn.under_sampling.OneSidedSelection(random_state=0, n_jobs=N_JOBS),
+                imblearn.under_sampling.NeighbourhoodCleaningRule(
+                    random_state=0, n_jobs=N_JOBS
+                ),
+                imblearn.under_sampling.OneSidedSelection(
+                    random_state=0, n_jobs=N_JOBS
+                ),
                 imblearn.under_sampling.RandomUnderSampler(random_state=0),
                 imblearn.under_sampling.TomekLinks(random_state=0, n_jobs=N_JOBS),
                 imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS),
@@ -122,7 +124,9 @@
                 imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS),
                 imblearn.over_sampling.SVMSMOTE(random_state=0, n_jobs=N_JOBS),
                 imblearn.over_sampling.BorderlineSMOTE(random_state=0, n_jobs=N_JOBS),
-                imblearn.over_sampling.SMOTENC(categorical_features=[], random_state=0, n_jobs=N_JOBS),
+                imblearn.over_sampling.SMOTENC(
+                    categorical_features=[], random_state=0, n_jobs=N_JOBS
+                ),
                 imblearn.combine.SMOTEENN(random_state=0),
                 imblearn.combine.SMOTETomek(random_state=0),
             )
@@ -205,7 +209,9 @@
     # tabular input
     if input_type == "tabular":
         header = "infer" if params["input_options"]["header1"] else None
-        column_option = params["input_options"]["column_selector_options_1"]["selected_column_selector_option"]
+        column_option = params["input_options"]["column_selector_options_1"][
+            "selected_column_selector_option"
+        ]
         if column_option in [
             "by_index_number",
             "all_but_by_index_number",
@@ -261,7 +267,9 @@
 
     # Get target y
     header = "infer" if params["input_options"]["header2"] else None
-    column_option = params["input_options"]["column_selector_options_2"]["selected_column_selector_option2"]
+    column_option = params["input_options"]["column_selector_options_2"][
+        "selected_column_selector_option2"
+    ]
     if column_option in [
         "by_index_number",
         "all_but_by_index_number",
@@ -279,7 +287,9 @@
         infile2 = pd.read_csv(infile2, sep="\t", header=header, parse_dates=True)
         loaded_df[df_key] = infile2
 
-    y = read_columns(infile2, c=c, c_option=column_option, sep="\t", header=header, parse_dates=True)
+    y = read_columns(
+        infile2, c=c, c_option=column_option, sep="\t", header=header, parse_dates=True
+    )
     if len(y.shape) == 2 and y.shape[1] == 1:
         y = y.ravel()
     if input_type == "refseq_and_interval":
@@ -378,12 +388,16 @@
         X, X_test, y, y_test = train_test_split(X, y, **split_options)
     elif split_options["shuffle"] == "group":
         if groups is None:
-            raise ValueError("No group based CV option was choosen for " "group shuffle!")
+            raise ValueError(
+                "No group based CV option was choosen for " "group shuffle!"
+            )
         split_options["labels"] = groups
         if y is None:
             X, X_test, groups, _ = train_test_split(X, groups, **split_options)
         else:
-            X, X_test, y, y_test, groups, _ = train_test_split(X, y, groups, **split_options)
+            X, X_test, y, y_test, groups, _ = train_test_split(
+                X, y, groups, **split_options
+            )
     else:
         if split_options["shuffle"] == "None":
             split_options["shuffle"] = None
@@ -411,9 +425,13 @@
 
     # TODO Solve deep learning models in pipeline
     if best_estimator_.__class__.__name__ == "KerasGBatchClassifier":
-        test_score = best_estimator_.evaluate(X_test, scorer=scorer_, is_multimetric=is_multimetric)
+        test_score = best_estimator_.evaluate(
+            X_test, scorer=scorer_, is_multimetric=is_multimetric
+        )
     else:
-        test_score = _score(best_estimator_, X_test, y_test, scorer_, is_multimetric=is_multimetric)
+        test_score = _score(
+            best_estimator_, X_test, y_test, scorer_, is_multimetric=is_multimetric
+        )
 
     if not is_multimetric:
         test_score = {primary_scoring: test_score}
@@ -487,7 +505,9 @@
         params = json.load(param_handler)
 
     # Override the refit parameter
-    params["search_schemes"]["options"]["refit"] = True if params["save"] != "nope" else False
+    params["search_schemes"]["options"]["refit"] = (
+        True if params["save"] != "nope" else False
+    )
 
     with open(infile_estimator, "rb") as estimator_handler:
         estimator = load_model(estimator_handler)
@@ -499,17 +519,21 @@
     options = params["search_schemes"]["options"]
 
     if groups:
-        header = "infer" if (options["cv_selector"]["groups_selector"]["header_g"]) else None
-        column_option = options["cv_selector"]["groups_selector"]["column_selector_options_g"][
-            "selected_column_selector_option_g"
-        ]
+        header = (
+            "infer" if (options["cv_selector"]["groups_selector"]["header_g"]) else None
+        )
+        column_option = options["cv_selector"]["groups_selector"][
+            "column_selector_options_g"
+        ]["selected_column_selector_option_g"]
         if column_option in [
             "by_index_number",
             "all_but_by_index_number",
             "by_header_name",
             "all_but_by_header_name",
         ]:
-            c = options["cv_selector"]["groups_selector"]["column_selector_options_g"]["col_g"]
+            c = options["cv_selector"]["groups_selector"]["column_selector_options_g"][
+                "col_g"
+            ]
         else:
             c = None
 
@@ -537,12 +561,14 @@
     secondary_scoring = options["scoring"].get("secondary_scoring", None)
     if secondary_scoring is not None:
         # If secondary_scoring is specified, convert the list into comman separated string
-        options["scoring"]["secondary_scoring"] = ",".join(options["scoring"]["secondary_scoring"])
+        options["scoring"]["secondary_scoring"] = ",".join(
+            options["scoring"]["secondary_scoring"]
+        )
     options["scoring"] = get_scoring(options["scoring"])
     if options["error_score"]:
         options["error_score"] = "raise"
     else:
-        options["error_score"] = np.NaN
+        options["error_score"] = np.nan
     if options["refit"] and isinstance(options["scoring"], dict):
         options["refit"] = primary_scoring
     if "pre_dispatch" in options and options["pre_dispatch"] == "":
@@ -588,7 +614,9 @@
         # make sure refit is choosen
         # this could be True for sklearn models, but not the case for
         # deep learning models
-        if not options["refit"] and not all(hasattr(estimator, attr) for attr in ("config", "model_type")):
+        if not options["refit"] and not all(
+            hasattr(estimator, attr) for attr in ("config", "model_type")
+        ):
             warnings.warn("Refit is change to `True` for nested validation!")
             setattr(searcher, "refit", True)
 
@@ -687,7 +715,9 @@
 
         cv_results = pd.DataFrame(searcher.cv_results_)
         cv_results = cv_results[sorted(cv_results.columns)]
-        cv_results.to_csv(path_or_buf=outfile_result, sep="\t", header=True, index=False)
+        cv_results.to_csv(
+            path_or_buf=outfile_result, sep="\t", header=True, index=False
+        )
 
     memory.clear(warn=False)
 
--- a/simple_model_fit.py	Tue Apr 13 21:23:33 2021 +0000
+++ b/simple_model_fit.py	Fri Apr 30 23:31:41 2021 +0000
@@ -7,7 +7,6 @@
 from scipy.io import mmread
 from sklearn.pipeline import Pipeline
 
-
 N_JOBS = int(__import__("os").environ.get("GALAXY_SLOTS", 1))
 
 
@@ -36,7 +35,7 @@
         if name == "memory" or name.endswith("__memory") or name.endswith("_path"):
             new_p = {name: None}
             estimator.set_params(**new_p)
-        elif n_jobs is not None and (name == 'n_jobs' or name.endswith('__n_jobs')):
+        elif n_jobs is not None and (name == "n_jobs" or name.endswith("__n_jobs")):
             new_p = {name: n_jobs}
             estimator.set_params(**new_p)
         elif name.endswith("callbacks"):
@@ -68,7 +67,9 @@
     # tabular input
     if input_type == "tabular":
         header = "infer" if params["input_options"]["header1"] else None
-        column_option = params["input_options"]["column_selector_options_1"]["selected_column_selector_option"]
+        column_option = params["input_options"]["column_selector_options_1"][
+            "selected_column_selector_option"
+        ]
         if column_option in [
             "by_index_number",
             "all_but_by_index_number",
@@ -90,7 +91,9 @@
 
     # Get target y
     header = "infer" if params["input_options"]["header2"] else None
-    column_option = params["input_options"]["column_selector_options_2"]["selected_column_selector_option2"]
+    column_option = params["input_options"]["column_selector_options_2"][
+        "selected_column_selector_option2"
+    ]
     if column_option in [
         "by_index_number",
         "all_but_by_index_number",
@@ -108,12 +111,9 @@
         infile2 = pd.read_csv(infile2, sep="\t", header=header, parse_dates=True)
         loaded_df[df_key] = infile2
 
-    y = read_columns(infile2,
-                     c=c,
-                     c_option=column_option,
-                     sep='\t',
-                     header=header,
-                     parse_dates=True)
+    y = read_columns(
+        infile2, c=c, c_option=column_option, sep="\t", header=header, parse_dates=True
+    )
     if len(y.shape) == 2 and y.shape[1] == 1:
         y = y.ravel()
 
--- a/stacking_ensembles.py	Tue Apr 13 21:23:33 2021 +0000
+++ b/stacking_ensembles.py	Fri Apr 30 23:31:41 2021 +0000
@@ -8,8 +8,8 @@
 import mlxtend.classifier
 import mlxtend.regressor
 import pandas as pd
-from galaxy_ml.utils import get_cv, get_estimator, get_search_params, load_model
-
+from galaxy_ml.utils import (get_cv, get_estimator, get_search_params,
+                             load_model)
 
 warnings.filterwarnings("ignore")
 
@@ -62,7 +62,9 @@
             with open(meta_path, "rb") as f:
                 meta_estimator = load_model(f)
         else:
-            estimator_json = params["algo_selection"]["meta_estimator"]["estimator_selector"]
+            estimator_json = params["algo_selection"]["meta_estimator"][
+                "estimator_selector"
+            ]
             meta_estimator = get_estimator(estimator_json)
 
     options = params["algo_selection"]["options"]
@@ -89,10 +91,14 @@
         ensemble_estimator = klass(base_estimators, **options)
 
     elif mod == mlxtend.classifier:
-        ensemble_estimator = klass(classifiers=base_estimators, meta_classifier=meta_estimator, **options)
+        ensemble_estimator = klass(
+            classifiers=base_estimators, meta_classifier=meta_estimator, **options
+        )
 
     else:
-        ensemble_estimator = klass(regressors=base_estimators, meta_regressor=meta_estimator, **options)
+        ensemble_estimator = klass(
+            regressors=base_estimators, meta_regressor=meta_estimator, **options
+        )
 
     print(ensemble_estimator)
     for base_est in base_estimators:
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/le_input_w_header.tabular	Fri Apr 30 23:31:41 2021 +0000
@@ -0,0 +1,5 @@
+Class
+Liverpool
+Real Madrid
+Bayern Munich
+A.C. Milan
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/le_input_wo_header.tabular	Fri Apr 30 23:31:41 2021 +0000
@@ -0,0 +1,4 @@
+Liverpool
+Real Madrid
+Bayern Munich
+A.C. Milan
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/le_output.tabular	Fri Apr 30 23:31:41 2021 +0000
@@ -0,0 +1,4 @@
+2
+3
+1
+0
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mba_input_int_w.tabular	Fri Apr 30 23:31:41 2021 +0000
@@ -0,0 +1,6 @@
+Transactions
+10	11	12	13	14	15
+16	11	12	13	14	15
+10	17	13	14
+10	18	19	13	15
+19	11	11	13	20	14
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mba_input_int_wo.tabular	Fri Apr 30 23:31:41 2021 +0000
@@ -0,0 +1,5 @@
+10	11	12	13	14	15
+16	11	12	13	14	15
+10	17	13	14
+10	18	19	13	15
+19	11	11	13	20	14
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mba_input_str_w.tabular	Fri Apr 30 23:31:41 2021 +0000
@@ -0,0 +1,6 @@
+Transactions
+Milk	Onion	Nutmeg	Kidney Beans	Eggs	Yogurt
+Dill	Onion	Nutmeg	Kidney Beans	Eggs	Yogurt
+Milk	Apple	Kidney Beans	Eggs
+Milk	Unicorn	Corn	Kidney Beans	Yogurt
+Corn	Onion	Onion	Kidney Beans	Ice cream	Eggs
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mba_input_str_wo.tabular	Fri Apr 30 23:31:41 2021 +0000
@@ -0,0 +1,5 @@
+Milk	Onion	Nutmeg	Kidney Beans	Eggs	Yogurt
+Dill	Onion	Nutmeg	Kidney Beans	Eggs	Yogurt
+Milk	Apple	Kidney Beans	Eggs
+Milk	Unicorn	Corn	Kidney Beans	Yogurt
+Corn	Onion	Onion	Kidney Beans	Ice cream	Eggs
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mba_out_str.tabular	Fri Apr 30 23:31:41 2021 +0000
@@ -0,0 +1,7 @@
+antecedents	consequents	antecedent support	consequent support	support	confidence	lift	leverage	conviction
+['Eggs']	['Kidney Beans', 'Onion']	0.8	0.6	0.6	0.7499999999999999	1.2499999999999998	0.12	1.5999999999999994
+['Eggs']	['Onion']	0.8	0.6	0.6	0.7499999999999999	1.2499999999999998	0.12	1.5999999999999994
+['Eggs', 'Kidney Beans']	['Onion']	0.8	0.6	0.6	0.7499999999999999	1.2499999999999998	0.12	1.5999999999999994
+['Kidney Beans', 'Onion']	['Eggs']	0.6	0.8	0.6	1.0	1.25	0.12	inf
+['Onion']	['Eggs']	0.6	0.8	0.6	1.0	1.25	0.12	inf
+['Onion']	['Eggs', 'Kidney Beans']	0.6	0.8	0.6	1.0	1.25	0.12	inf
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mba_output_int.tabular	Fri Apr 30 23:31:41 2021 +0000
@@ -0,0 +1,7 @@
+antecedents	consequents	antecedent support	consequent support	support	confidence	lift	leverage	conviction
+['11']	['13', '14']	0.6	0.8	0.6	1.0	1.25	0.12	inf
+['11']	['14']	0.6	0.8	0.6	1.0	1.25	0.12	inf
+['11', '13']	['14']	0.6	0.8	0.6	1.0	1.25	0.12	inf
+['13', '14']	['11']	0.8	0.6	0.6	0.7499999999999999	1.2499999999999998	0.12	1.5999999999999994
+['14']	['11']	0.8	0.6	0.6	0.7499999999999999	1.2499999999999998	0.12	1.5999999999999994
+['14']	['11', '13']	0.8	0.6	0.6	0.7499999999999999	1.2499999999999998	0.12	1.5999999999999994
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mba_output_str.tabular	Fri Apr 30 23:31:41 2021 +0000
@@ -0,0 +1,7 @@
+antecedents	consequents	antecedent support	consequent support	support	confidence	lift	leverage	conviction
+['Eggs']	['Kidney Beans', 'Onion']	0.8	0.6	0.6	0.7499999999999999	1.2499999999999998	0.12	1.5999999999999994
+['Eggs']	['Onion']	0.8	0.6	0.6	0.7499999999999999	1.2499999999999998	0.12	1.5999999999999994
+['Eggs', 'Kidney Beans']	['Onion']	0.8	0.6	0.6	0.7499999999999999	1.2499999999999998	0.12	1.5999999999999994
+['Kidney Beans', 'Onion']	['Eggs']	0.6	0.8	0.6	1.0	1.25	0.12	inf
+['Onion']	['Eggs']	0.6	0.8	0.6	1.0	1.25	0.12	inf
+['Onion']	['Eggs', 'Kidney Beans']	0.6	0.8	0.6	1.0	1.25	0.12	inf
--- a/to_categorical.py	Tue Apr 13 21:23:33 2021 +0000
+++ b/to_categorical.py	Fri Apr 30 23:31:41 2021 +0000
@@ -43,7 +43,9 @@
     aparser = argparse.ArgumentParser()
     aparser.add_argument("-i", "--inputs", dest="inputs", required=True)
     aparser.add_argument("-y", "--infile", dest="infile")
-    aparser.add_argument("-n", "--num_classes", dest="num_classes", type=int, default=None)
+    aparser.add_argument(
+        "-n", "--num_classes", dest="num_classes", type=int, default=None
+    )
     aparser.add_argument("-o", "--outfile", dest="outfile")
     args = aparser.parse_args()
 
--- a/train_test_eval.py	Tue Apr 13 21:23:33 2021 +0000
+++ b/train_test_eval.py	Fri Apr 30 23:31:41 2021 +0000
@@ -9,14 +9,8 @@
 import numpy as np
 import pandas as pd
 from galaxy_ml.model_validations import train_test_split
-from galaxy_ml.utils import (
-    get_module,
-    get_scoring,
-    load_model,
-    read_columns,
-    SafeEval,
-    try_get_attr,
-)
+from galaxy_ml.utils import (get_module, get_scoring, load_model,
+                             read_columns, SafeEval, try_get_attr)
 from scipy.io import mmread
 from sklearn import pipeline
 from sklearn.metrics.scorer import _check_multimetric_scoring
@@ -24,7 +18,6 @@
 from sklearn.model_selection._validation import _score
 from sklearn.utils import indexable, safe_indexing
 
-
 _fit_and_score = try_get_attr("galaxy_ml.model_validations", "_fit_and_score")
 setattr(_search, "_fit_and_score", _fit_and_score)
 setattr(_validation, "_fit_and_score", _fit_and_score)
@@ -262,12 +255,9 @@
         infile2 = pd.read_csv(infile2, sep="\t", header=header, parse_dates=True)
         loaded_df[df_key] = infile2
 
-    y = read_columns(infile2,
-                     c=c,
-                     c_option=column_option,
-                     sep='\t',
-                     header=header,
-                     parse_dates=True)
+    y = read_columns(
+        infile2, c=c, c_option=column_option, sep="\t", header=header, parse_dates=True
+    )
     if len(y.shape) == 2 and y.shape[1] == 1:
         y = y.ravel()
     if input_type == "refseq_and_interval":
@@ -299,12 +289,14 @@
         if df_key in loaded_df:
             groups = loaded_df[df_key]
 
-        groups = read_columns(groups,
-                              c=c,
-                              c_option=column_option,
-                              sep='\t',
-                              header=header,
-                              parse_dates=True)
+        groups = read_columns(
+            groups,
+            c=c,
+            c_option=column_option,
+            sep="\t",
+            header=header,
+            parse_dates=True,
+        )
         groups = groups.ravel()
 
     # del loaded_df
@@ -371,9 +363,14 @@
                 "Stratified shuffle split is not " "applicable on empty target values!"
             )
 
-    X_train, X_test, y_train, y_test, groups_train, _groups_test = train_test_split_none(
-        X, y, groups, **test_split_options
-    )
+    (
+        X_train,
+        X_test,
+        y_train,
+        y_test,
+        groups_train,
+        _groups_test,
+    ) = train_test_split_none(X, y, groups, **test_split_options)
 
     exp_scheme = params["experiment_schemes"]["selected_exp_scheme"]
 
--- a/train_test_split.py	Tue Apr 13 21:23:33 2021 +0000
+++ b/train_test_split.py	Fri Apr 30 23:31:41 2021 +0000
@@ -28,17 +28,23 @@
 
     # read groups
     if infile_groups:
-        header = "infer" if (params["mode_selection"]["cv_selector"]["groups_selector"]["header_g"]) else None
-        column_option = params["mode_selection"]["cv_selector"]["groups_selector"]["column_selector_options_g"][
-            "selected_column_selector_option_g"
-        ]
+        header = (
+            "infer"
+            if (params["mode_selection"]["cv_selector"]["groups_selector"]["header_g"])
+            else None
+        )
+        column_option = params["mode_selection"]["cv_selector"]["groups_selector"][
+            "column_selector_options_g"
+        ]["selected_column_selector_option_g"]
         if column_option in [
             "by_index_number",
             "all_but_by_index_number",
             "by_header_name",
             "all_but_by_header_name",
         ]:
-            c = params["mode_selection"]["cv_selector"]["groups_selector"]["column_selector_options_g"]["col_g"]
+            c = params["mode_selection"]["cv_selector"]["groups_selector"][
+                "column_selector_options_g"
+            ]["col_g"]
         else:
             c = None
 
@@ -67,7 +73,10 @@
 
     total_n_splits = splitter.get_n_splits(array.values, y=y, groups=groups)
     if nth_split > total_n_splits:
-        raise ValueError("Total number of splits is {}, but got `nth_split` " "= {}".format(total_n_splits, nth_split))
+        raise ValueError(
+            "Total number of splits is {}, but got `nth_split` "
+            "= {}".format(total_n_splits, nth_split)
+        )
 
     i = 1
     for train_index, test_index in splitter.split(array.values, y=y, groups=groups):
@@ -137,7 +146,9 @@
 
     # cv splitter
     else:
-        train, test = _get_single_cv_split(params, array, infile_labels=infile_labels, infile_groups=infile_groups)
+        train, test = _get_single_cv_split(
+            params, array, infile_labels=infile_labels, infile_groups=infile_groups
+        )
 
     print("Input shape: %s" % repr(array.shape))
     print("Train shape: %s" % repr(train.shape))