Mercurial > repos > bgruening > sklearn_numeric_clustering
diff fitted_model_eval.py @ 40:006e27f0a7ef draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 208a8d348e7c7a182cfbe1b6f17868146428a7e2"
author | bgruening |
---|---|
date | Tue, 13 Apr 2021 20:52:41 +0000 |
parents | e38a2675db5e |
children | 156835c25f62 |
line wrap: on
line diff
--- a/fitted_model_eval.py Wed Mar 11 17:22:07 2020 +0000 +++ b/fitted_model_eval.py Tue Apr 13 20:52:41 2021 +0000 @@ -1,17 +1,17 @@ import argparse import json -import pandas as pd import warnings +import pandas as pd +from galaxy_ml.utils import get_scoring, load_model, read_columns from scipy.io import mmread -from sklearn.pipeline import Pipeline from sklearn.metrics.scorer import _check_multimetric_scoring from sklearn.model_selection._validation import _score -from galaxy_ml.utils import get_scoring, load_model, read_columns +from sklearn.pipeline import Pipeline def _get_X_y(params, infile1, infile2): - """ read from inputs and output X and y + """read from inputs and output X and y Parameters ---------- @@ -26,35 +26,40 @@ # store read dataframe object loaded_df = {} - input_type = params['input_options']['selected_input'] + input_type = params["input_options"]["selected_input"] # tabular input - if input_type == 'tabular': - header = 'infer' if params['input_options']['header1'] else None - column_option = (params['input_options']['column_selector_options_1'] - ['selected_column_selector_option']) - if column_option in ['by_index_number', 'all_but_by_index_number', - 'by_header_name', 'all_but_by_header_name']: - c = params['input_options']['column_selector_options_1']['col1'] + if input_type == "tabular": + header = "infer" if params["input_options"]["header1"] else None + column_option = params["input_options"]["column_selector_options_1"]["selected_column_selector_option"] + if column_option in [ + "by_index_number", + "all_but_by_index_number", + "by_header_name", + "all_but_by_header_name", + ]: + c = params["input_options"]["column_selector_options_1"]["col1"] else: c = None df_key = infile1 + repr(header) - df = pd.read_csv(infile1, sep='\t', header=header, - parse_dates=True) + df = pd.read_csv(infile1, sep="\t", header=header, parse_dates=True) loaded_df[df_key] = df X = read_columns(df, c=c, c_option=column_option).astype(float) # sparse input - elif input_type == 'sparse': - X = mmread(open(infile1, 'r')) + elif input_type == "sparse": + X = mmread(open(infile1, "r")) # Get target y - header = 'infer' if params['input_options']['header2'] else None - column_option = (params['input_options']['column_selector_options_2'] - ['selected_column_selector_option2']) - if column_option in ['by_index_number', 'all_but_by_index_number', - 'by_header_name', 'all_but_by_header_name']: - c = params['input_options']['column_selector_options_2']['col2'] + header = "infer" if params["input_options"]["header2"] else None + column_option = params["input_options"]["column_selector_options_2"]["selected_column_selector_option2"] + if column_option in [ + "by_index_number", + "all_but_by_index_number", + "by_header_name", + "all_but_by_header_name", + ]: + c = params["input_options"]["column_selector_options_2"]["col2"] else: c = None @@ -62,26 +67,24 @@ if df_key in loaded_df: infile2 = loaded_df[df_key] else: - infile2 = pd.read_csv(infile2, sep='\t', - header=header, parse_dates=True) + infile2 = pd.read_csv(infile2, sep="\t", header=header, parse_dates=True) loaded_df[df_key] = infile2 - y = read_columns( - infile2, - c=c, - c_option=column_option, - sep='\t', - header=header, - parse_dates=True) + y = read_columns(infile2, c=c, c_option=column_option, sep="\t", header=header, parse_dates=True) if len(y.shape) == 2 and y.shape[1] == 1: y = y.ravel() return X, y -def main(inputs, infile_estimator, outfile_eval, - infile_weights=None, infile1=None, - infile2=None): +def main( + inputs, + infile_estimator, + outfile_eval, + infile_weights=None, + infile1=None, + infile2=None, +): """ Parameter --------- @@ -103,49 +106,55 @@ infile2 : str File path to dataset containing target values """ - warnings.filterwarnings('ignore') + warnings.filterwarnings("ignore") - with open(inputs, 'r') as param_handler: + with open(inputs, "r") as param_handler: params = json.load(param_handler) X_test, y_test = _get_X_y(params, infile1, infile2) # load model - with open(infile_estimator, 'rb') as est_handler: + with open(infile_estimator, "rb") as est_handler: estimator = load_model(est_handler) main_est = estimator if isinstance(estimator, Pipeline): main_est = estimator.steps[-1][-1] - if hasattr(main_est, 'config') and hasattr(main_est, 'load_weights'): - if not infile_weights or infile_weights == 'None': - raise ValueError("The selected model skeleton asks for weights, " - "but no dataset for weights was provided!") + if hasattr(main_est, "config") and hasattr(main_est, "load_weights"): + if not infile_weights or infile_weights == "None": + raise ValueError( + "The selected model skeleton asks for weights, " "but no dataset for weights was provided!" + ) main_est.load_weights(infile_weights) # handle scorer, convert to scorer dict - scoring = params['scoring'] + # Check if scoring is specified + scoring = params["scoring"] + if scoring is not None: + # get_scoring() expects secondary_scoring to be a comma separated string (not a list) + # Check if secondary_scoring is specified + secondary_scoring = scoring.get("secondary_scoring", None) + if secondary_scoring is not None: + # If secondary_scoring is specified, convert the list into comman separated string + scoring["secondary_scoring"] = ",".join(scoring["secondary_scoring"]) + scorer = get_scoring(scoring) scorer, _ = _check_multimetric_scoring(estimator, scoring=scorer) - if hasattr(estimator, 'evaluate'): - scores = estimator.evaluate(X_test, y_test=y_test, - scorer=scorer, - is_multimetric=True) + if hasattr(estimator, "evaluate"): + scores = estimator.evaluate(X_test, y_test=y_test, scorer=scorer, is_multimetric=True) else: - scores = _score(estimator, X_test, y_test, scorer, - is_multimetric=True) + scores = _score(estimator, X_test, y_test, scorer, is_multimetric=True) # handle output for name, score in scores.items(): scores[name] = [score] df = pd.DataFrame(scores) df = df[sorted(df.columns)] - df.to_csv(path_or_buf=outfile_eval, sep='\t', - header=True, index=False) + df.to_csv(path_or_buf=outfile_eval, sep="\t", header=True, index=False) -if __name__ == '__main__': +if __name__ == "__main__": aparser = argparse.ArgumentParser() aparser.add_argument("-i", "--inputs", dest="inputs", required=True) aparser.add_argument("-e", "--infile_estimator", dest="infile_estimator") @@ -155,6 +164,11 @@ aparser.add_argument("-O", "--outfile_eval", dest="outfile_eval") args = aparser.parse_args() - main(args.inputs, args.infile_estimator, args.outfile_eval, - infile_weights=args.infile_weights, infile1=args.infile1, - infile2=args.infile2) + main( + args.inputs, + args.infile_estimator, + args.outfile_eval, + infile_weights=args.infile_weights, + infile1=args.infile1, + infile2=args.infile2, + )