comparison fitted_model_eval.py @ 0:bdf3f88c60e0 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 208a8d348e7c7a182cfbe1b6f17868146428a7e2"
author bgruening
date Tue, 13 Apr 2021 21:33:38 +0000
parents
children 2cb67aeee0d9
import argparse
import json
import warnings

import pandas as pd
from galaxy_ml.utils import get_scoring, load_model, read_columns
from scipy.io import mmread
from sklearn.metrics.scorer import _check_multimetric_scoring
from sklearn.model_selection._validation import _score
from sklearn.pipeline import Pipeline


def _get_X_y(params, infile1, infile2):
    """read from inputs and output X and y

    Parameters
    ----------
    params : dict
        Tool inputs parameter
    infile1 : str
        File path to dataset containing features
    infile2 : str
        File path to dataset containing target values

    """
    # store read dataframe object
    loaded_df = {}

    input_type = params["input_options"]["selected_input"]
    # tabular input
    if input_type == "tabular":
        header = "infer" if params["input_options"]["header1"] else None
        column_option = params["input_options"]["column_selector_options_1"]["selected_column_selector_option"]
        if column_option in [
            "by_index_number",
            "all_but_by_index_number",
            "by_header_name",
            "all_but_by_header_name",
        ]:
            c = params["input_options"]["column_selector_options_1"]["col1"]
        else:
            c = None

        df_key = infile1 + repr(header)
        df = pd.read_csv(infile1, sep="\t", header=header, parse_dates=True)
        loaded_df[df_key] = df

        X = read_columns(df, c=c, c_option=column_option).astype(float)
    # sparse input
    elif input_type == "sparse":
        X = mmread(open(infile1, "r"))

    # Get target y
    header = "infer" if params["input_options"]["header2"] else None
    column_option = params["input_options"]["column_selector_options_2"]["selected_column_selector_option2"]
    if column_option in [
        "by_index_number",
        "all_but_by_index_number",
        "by_header_name",
        "all_but_by_header_name",
    ]:
        c = params["input_options"]["column_selector_options_2"]["col2"]
    else:
        c = None

    df_key = infile2 + repr(header)
    if df_key in loaded_df:
        infile2 = loaded_df[df_key]
    else:
        infile2 = pd.read_csv(infile2, sep="\t", header=header, parse_dates=True)
        loaded_df[df_key] = infile2

    y = read_columns(infile2, c=c, c_option=column_option, sep="\t", header=header, parse_dates=True)
    if len(y.shape) == 2 and y.shape[1] == 1:
        y = y.ravel()

    return X, y


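# Illustrative sketch of the "input_options" block that _get_X_y() expects in the
# params JSON. The exact content is produced by the Galaxy tool form; keys below are
# the ones read above, while the values are only an assumed example:
#     {
#         "input_options": {
#             "selected_input": "tabular",
#             "header1": true,
#             "column_selector_options_1": {
#                 "selected_column_selector_option": "all_but_by_header_name",
#                 "col1": ["target"],
#             },
#             "header2": true,
#             "column_selector_options_2": {
#                 "selected_column_selector_option2": "by_header_name",
#                 "col2": ["target"],
#             },
#         }
#     }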
def main(
    inputs,
    infile_estimator,
    outfile_eval,
    infile_weights=None,
    infile1=None,
    infile2=None,
):
    """
    Parameters
    ----------
    inputs : str
        File path to galaxy tool parameter

    infile_estimator : str
        File path to trained estimator input

    outfile_eval : str
        File path to save the evaluation results, tabular

    infile_weights : str
        File path to weights input

    infile1 : str
        File path to dataset containing features

    infile2 : str
        File path to dataset containing target values
    """
    warnings.filterwarnings("ignore")

    with open(inputs, "r") as param_handler:
        params = json.load(param_handler)

    X_test, y_test = _get_X_y(params, infile1, infile2)

    # load model
    with open(infile_estimator, "rb") as est_handler:
        estimator = load_model(est_handler)

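    # The attribute check below targets estimators that store their architecture and
    # weights separately (e.g. galaxy_ml's Keras wrappers); this reading is an inference
    # from the `config`/`load_weights` attributes, not something stated in the params.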
    main_est = estimator
    if isinstance(estimator, Pipeline):
        main_est = estimator.steps[-1][-1]
    if hasattr(main_est, "config") and hasattr(main_est, "load_weights"):
        if not infile_weights or infile_weights == "None":
            raise ValueError(
                "The selected model skeleton asks for weights, " "but no dataset for weights was provided!"
            )
        main_est.load_weights(infile_weights)

    # handle scorer, convert to scorer dict
    # Check if scoring is specified
    scoring = params["scoring"]
    if scoring is not None:
        # get_scoring() expects secondary_scoring to be a comma-separated string (not a list)
        # Check if secondary_scoring is specified
        secondary_scoring = scoring.get("secondary_scoring", None)
        if secondary_scoring is not None:
            # If secondary_scoring is specified, convert the list into a comma-separated string
            scoring["secondary_scoring"] = ",".join(scoring["secondary_scoring"])

    scorer = get_scoring(scoring)
    scorer, _ = _check_multimetric_scoring(estimator, scoring=scorer)
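    # Assumed shape of params["scoring"] (only "secondary_scoring" is referenced above;
    # the other key is an illustration of what get_scoring() typically receives):
    #     {"primary_scoring": "accuracy", "secondary_scoring": "f1_macro,precision_macro"}
    # _check_multimetric_scoring() then normalizes the result into a {name: scorer} dict.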

    if hasattr(estimator, "evaluate"):
        scores = estimator.evaluate(X_test, y_test=y_test, scorer=scorer, is_multimetric=True)
    else:
        scores = _score(estimator, X_test, y_test, scorer, is_multimetric=True)

    # handle output
    for name, score in scores.items():
        scores[name] = [score]
    df = pd.DataFrame(scores)
    df = df[sorted(df.columns)]
    df.to_csv(path_or_buf=outfile_eval, sep="\t", header=True, index=False)
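    # The output is a single-row, tab-separated table with one column per metric,
    # sorted by metric name, e.g. (column names and values illustrative):
    #     accuracy    f1_macro
    #     0.9         0.88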


if __name__ == "__main__":
    aparser = argparse.ArgumentParser()
    aparser.add_argument("-i", "--inputs", dest="inputs", required=True)
    aparser.add_argument("-e", "--infile_estimator", dest="infile_estimator")
    aparser.add_argument("-w", "--infile_weights", dest="infile_weights")
    aparser.add_argument("-X", "--infile1", dest="infile1")
    aparser.add_argument("-y", "--infile2", dest="infile2")
    aparser.add_argument("-O", "--outfile_eval", dest="outfile_eval")
    args = aparser.parse_args()

    main(
        args.inputs,
        args.infile_estimator,
        args.outfile_eval,
        infile_weights=args.infile_weights,
        infile1=args.infile1,
        infile2=args.infile2,
    )