Comparison of column_maker.py @ 9:33b81f9ea109 (draft), Mercurial repository devteam/column_maker
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/column_maker commit fe76077775aaca531f6a563fdfcbd73fbf1528e7
author | iuc |
---|---|
date | Thu, 28 Jul 2022 15:27:54 +0000 |
parents | 227e82286a0e |
children | beec6ecc7d3c |
8:227e82286a0e (parent) | 9:33b81f9ea109 (this revision)
---|---
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 """ | 2 """ |
3 This tool takes a tab-delimited textfile as input and creates another column in | 3 This tool takes a tab-delimited textfile as input and creates new columns in |
4 the file which is the result of a computation performed on every row in the | 4 the file which are the result of a computation performed on every row in the |
5 original file. The tool will skip over invalid lines within the file, | 5 original file. The tool will skip over empty and comment (starting with a #) |
6 informing the user about the number of lines skipped. | 6 lines within the file. It does not change the formatting of any original, |
7 retained columns. | |
7 """ | 8 """ |
8 | 9 |
9 import argparse | 10 import argparse |
10 import json | 11 import enum |
11 import re | 12 import re |
12 import sys | 13 import sys |
13 # functions that may be used in the compute expression | 14 # Functions that may be used in the compute expression |
14 from math import ( # noqa: F401 | 15 from math import ( # noqa: F401 |
15 ceil, | 16 ceil, |
16 exp, | 17 exp, |
17 floor, | 18 floor, |
18 log, | 19 log, |
19 log10, | 20 log10, |
20 sqrt | 21 sqrt, |
21 ) | 22 ) |
22 | 23 |
23 from numpy import format_float_positional # noqa: F401 | 24 from numpy import format_float_positional |
25 | |
26 | |
27 class Mode(enum.Enum): | |
28 APPEND = '' | |
29 INSERT = 'I' | |
30 REPLACE = 'R' | |
31 | |
32 | |
33 def from_str(s, to_type): | |
34 if to_type is list: | |
35 return [part.strip(' ') for part in s.split(',')] | |
36 else: | |
37 return to_type(s) | |
38 | |
39 | |
40 def to_str(obj): | |
41 if type(obj) is list: | |
42 return ','.join([to_str(i) for i in obj]) | |
43 if args.avoid_scientific_notation and type(obj) is float: | |
44 return format_float_positional(obj) | |
45 return str(obj) | |
46 | |
24 | 47 |
25 parser = argparse.ArgumentParser() | 48 parser = argparse.ArgumentParser() |
26 parser.add_argument('input', type=argparse.FileType('r'), help="input file") | 49 parser.add_argument('input', type=str, help='input file') |
27 parser.add_argument('output', type=argparse.FileType('wt'), help="output file") | 50 parser.add_argument('output', type=str, help='output file') |
28 parser.add_argument('cond', nargs='?', type=str, help="expression") | 51 parser.add_argument( |
29 parser.add_argument('columns', nargs='?', type=int, help="number of columns") | 52 '-t', '--column-types', nargs='?', required=True, |
30 parser.add_argument('column_types', nargs='?', type=str, help="comma separated list of column types") | 53 help='A comma-separated list of column types in the input file' |
31 parser.add_argument('--round', action="store_true", | 54 ) |
32 help="round result") | 55 parser.add_argument( |
33 parser.add_argument('--avoid_scientific_notation', action="store_true", | 56 '--avoid-scientific-notation', action='store_true', |
34 help="avoid scientific notation") | 57 help='avoid scientific notation' |
35 parser.add_argument('--header_new_column_name', default=None, type=str, | 58 ) |
36 help="First line of input is a header line with column " | 59 parser.add_argument( |
37 "names and this should become the name of the new " | 60 '--header', action='store_true', |
38 "column") | 61 help='The input has a header line with column names. ' |
39 parser.add_argument('--load_json', default=None, type=argparse.FileType('r'), | 62 'Actions must specify names of newly calculated columns.' |
40 help="overwrite parsed arguments from json file") | 63 ) |
64 parser.add_argument( | |
65 '--fail-on-non-existent-columns', action='store_true', | |
66 help='If an action references a column number that is not existent ' | |
67 'when the expression gets computed, the default behavior is to treat ' | |
68 'this as a case of rows for which the expression cannot be computed. ' | |
69 'The behavior of the tool will then depend on which of the ' | |
70 'non-computable switches is in effect. With this flag, in contrast, ' | |
71 'the tool will fail directly upon encountering a non-existing column.' | |
72 ) | |
73 non_computable = parser.add_mutually_exclusive_group() | |
74 non_computable.add_argument('--fail-on-non-computable', action='store_true') | |
75 non_computable.add_argument('--skip-non-computable', action='store_true') | |
76 non_computable.add_argument('--keep-non-computable', action='store_true') | |
77 non_computable.add_argument('--non-computable-blank', action='store_true') | |
78 non_computable.add_argument('--non-computable-default') | |
79 | |
80 group = parser.add_mutually_exclusive_group(required=True) | |
81 group.add_argument( | |
82 '-a', '--actions', nargs='*', type=str, | |
83 help='One or more action(s) of the format EXPR;[COL_ADD_SPEC];[COL_NAME]' | |
84 ) | |
85 group.add_argument( | |
86 '-f', '--file', type=str, | |
87 help='File to read actions from (mutually exclusive with -a)' | |
88 ) | |
41 args = parser.parse_args() | 89 args = parser.parse_args() |
42 | 90 |
43 argparse_dict = vars(args) | 91 if not args.column_types: |
44 if args.load_json: | 92 with open(args.input) as fh: |
45 json_dict = json.load(args.load_json) | 93 if not fh.readline(): |
46 argparse_dict.update(json_dict) | 94 # Generally, the input must have at least one column to be |
47 | 95 # considered tabular, but empty files are ok and should produce |
48 fh = argparse_dict['input'] | 96 # empty output. |
49 out = argparse_dict['output'] | 97 with open(args.output, 'w') as out: |
50 expr = argparse_dict['cond'] | 98 pass |
51 round_result = argparse_dict['round'] | 99 sys.exit() |
52 avoid_scientific_notation = argparse_dict['avoid_scientific_notation'] | 100 sys.exit( |
53 | 101 "Missing column types. " |
54 if argparse_dict['header_new_column_name'] is not None: | 102 "In Galaxy, click the pencil icon on the history item and " |
55 header_line = fh.readline().strip('\n') | 103 "select the Auto-detect option to correct it. " |
56 out.write( | 104 "This tool can only be used with tab-delimited data." |
57 '{0}\t{1}\n'.format( | 105 ) |
58 header_line, argparse_dict['header_new_column_name'] | 106 |
59 ) | 107 in_column_types = [t.strip() for t in args.column_types.split(',')] |
60 ) | 108 in_columns = len(in_column_types) |
109 | |
110 # Prepare initial column variable names and type cast representations | |
111 # for column data types | |
112 cols, type_casts = [], [] | |
113 for n, col_type in enumerate(in_column_types, start=1): | |
114 col_name = "c%d" % n | |
115 cols.append(col_name) | |
116 col_str = ', '.join(cols) # 'c1, c2, c3, c4' | |
117 | |
118 # Define lambda for type-casting of original row fields | |
61 try: | 119 try: |
62 in_columns = int(argparse_dict['columns']) | 120 cast_types = eval( |
63 if in_columns < 1: | 121 'lambda fields: [from_str(s, t) for s, t in zip(fields, [%s])]' |
64 # To be considered tabular, data must have at least one column. | 122 % args.column_types |
65 raise ValueError | 123 ) |
66 except Exception: | 124 except Exception as e: |
67 if not fh.readline(): | 125 sys.exit( |
68 # empty file content is ok and should produce empty output | 126 'While parsing column types, the following problem occurred: "%s"' |
69 out.close() | 127 % e |
70 sys.exit() | 128 ) |
71 sys.exit("Missing or invalid 'columns' metadata value, click the pencil icon in the history item and select the Auto-detect option to correct it. This tool can only be used with tab-delimited data.") | 129 |
72 try: | 130 # Get and parse actions |
73 in_column_types = argparse_dict['column_types'].split(',') | 131 if args.file: |
74 except Exception: | 132 actions = [] |
75 sys.exit("Missing or invalid 'column_types' metadata value, click the pencil icon in the history item and select the Auto-detect option to correct it. This tool can only be used with tab-delimited data.") | 133 with open(args.file) as i: |
76 if len(in_column_types) != in_columns: | 134 for line in i: |
77 sys.exit("The 'columns' metadata setting does not conform to the 'column_types' metadata setting, click the pencil icon in the history item and select the Auto-detect option to correct it. This tool can only be used with tab-delimited data.") | 135 line = line.strip() |
78 | 136 if line: |
137 actions.append(line) | |
138 else: | |
139 actions = args.actions | |
140 | |
141 # each action must be a full data row manipulation instruction of the form: | |
142 # EXPR;[COL_ADD_SPEC];[COL_NAME] | |
143 # where EXPR is the actual expression to compute on the row, | |
144 # COL_ADD_SPEC consists of a column index and a mode identifier for how the | |
145 # new column should be added. | |
146 # Examples: 3I (insert new col before current column 3), | |
147 # 2R (replace current column 2 with new column); | |
148 # a missing COL_ADD_SPEC is interpreted as mode A (append new column at the | |
149 # end of the row). | |
150 # COL_NAME is required with the --header option and specifies the name of the | |
151 # new column; without --header, any COL_NAME gets ignored. | |
79 operators = 'is|not|or|and' | 152 operators = 'is|not|or|and' |
80 builtin_and_math_functions = 'abs|all|any|bin|chr|cmp|complex|divmod|float|bool|hex|int|len|long|max|min|oct|ord|pow|range|reversed|round|sorted|str|sum|type|unichr|unicode|log|log10|exp|sqrt|ceil|floor' | 153 builtin_and_math_functions = ( |
81 string_and_list_methods = [name for name in dir('') + dir([]) if not name.startswith('_')] | 154 'abs|all|any|ascii|bin|bool|chr|complex|divmod|float|format|hex|int|len|' |
82 whitelist = r"^([c0-9\+\-\*\/\(\)\.\'\"><=,:! ]|%s|%s|%s)*$" % (operators, builtin_and_math_functions, '|'.join(string_and_list_methods)) | 155 'list|map|max|min|oct|ord|pow|range|reversed|round|set|sorted|str|sum|type|' |
83 if not re.compile(whitelist).match(expr): | 156 'log|log10|exp|sqrt|ceil|floor' |
84 sys.exit("Invalid expression") | 157 ) |
85 if avoid_scientific_notation: | 158 imported_numpy_function = 'format_float_positional' |
86 expr = "format_float_positional(%s)" % expr | 159 string_and_list_methods = [ |
87 | 160 name for name in dir('') + dir([]) if not name.startswith('_') |
88 # Prepare the column variable names and wrappers for column data types | 161 ] |
89 cols, type_casts = [], [] | 162 whitelist = r"^([c0-9\+\-\*\/\(\)\.\'\"><=,:! ]|%s|%s|%s|%s)*$" % ( |
90 for col in range(1, in_columns + 1): | 163 operators, |
91 col_name = "c%d" % col | 164 builtin_and_math_functions, |
92 cols.append(col_name) | 165 imported_numpy_function, |
93 col_type = in_column_types[col - 1].strip() | 166 '|'.join(string_and_list_methods) |
94 if not round_result and col_type == 'int': | 167 ) |
95 col_type = 'float' | 168 valid_pat = re.compile(whitelist) |
96 type_cast = "%s(%s)" % (col_type, col_name) | 169 ops = [] |
97 type_casts.append(type_cast) | 170 num_cols = in_columns |
98 | 171 for ac in actions: |
99 col_str = ', '.join(cols) # 'c1, c2, c3, c4' | 172 try: |
100 type_cast_str = ', '.join(type_casts) # 'str(c1), int(c2), int(c3), str(c4)' | 173 expr_string, col_add_spec, new_col_name = ac.split(';') |
101 assign = "%s = line.split('\\t')" % col_str | 174 except ValueError: |
102 if len(cols) == 1: | 175 sys.exit( |
103 # Single column, unpacking by assignment won't work | 176 'Invalid Action: "%s". ' |
104 assign += '[0]' | 177 'Required format: EXPR;[COL_ADD_SPEC];[COL_NAME]' % ac |
105 wrap = "%s = %s" % (col_str, type_cast_str) | 178 ) |
179 if not valid_pat.match(expr_string): | |
180 sys.exit('Invalid expression: "%s"' % expr_string) | |
181 try: | |
182 expr_lambda = eval('lambda %s: %s' % (col_str, expr_string)) | |
183 except Exception as e: | |
184 if str(e).startswith('invalid syntax'): | |
185 sys.exit( | |
186 'Expression "%s" caused a syntax error during parsing.' | |
187 % expr_string | |
188 ) | |
189 else: | |
190 sys.exit( | |
191 'While parsing expression "%s" the following problem occurred: ' | |
192 '"%s"' % (expr_string, str(e)) | |
193 ) | |
194 try: | |
195 new_col_idx = int(col_add_spec[:-1] or '0') - 1 | |
196 except ValueError: | |
197 sys.exit( | |
198 'COL_ADD_SPECS need to start with a (1-based) column index. ' | |
199 'Could not parse a column index from "%s"' % col_add_spec | |
200 ) | |
201 try: | |
202 mode = Mode(col_add_spec[-1:]) | |
203 except ValueError: | |
204 sys.exit( | |
205 'COL_ADD_SPECS need to end in a single-character mode identifier ' | |
206 '("I", or "R"), or be empty (for Append mode). ' | |
207 'Could not parse a valid identifier from "%s"' % col_add_spec | |
208 ) | |
209 if mode is Mode.REPLACE: | |
210 if new_col_idx < 0 or new_col_idx >= num_cols: | |
211 sys.exit( | |
212 'Cannot replace the contents of column %d as specified by ' | |
213 'action "%s". No such column at this point of the ' | |
214 'computation' % (new_col_idx + 1, ac) | |
215 ) | |
216 if not new_col_name and args.header: | |
217 sys.exit( | |
218 'A name is required for any new columns when using an existing ' | |
219 'header line (--header option), but found none in action: ' | |
220 '"%s"' % ac | |
221 ) | |
222 # Successfully parsed the instruction | |
223 # Store the expression lambda, the index and name of the new column, and | |
224 # the original string representation of the expression (for use in | |
225 # potential later error messages). | |
226 ops.append([expr_lambda, new_col_idx, mode, new_col_name, expr_string]) | |
227 if mode is Mode.APPEND or mode is Mode.INSERT: | |
228 # If the current expression results in an additional column, | |
229 # we need to handle the new field in subsequent lambda functions. | |
230 num_cols += 1 | |
231 col_str += ', c%d' % num_cols | |
232 | |
233 | |
234 # ready to start parsing the input file | |
235 print( | |
236 'Computing %d new columns with instructions %s' | |
237 % (num_cols - in_columns, actions) | |
238 ) | |
106 skipped_lines = 0 | 239 skipped_lines = 0 |
107 first_invalid_line = 0 | 240 first_invalid_line = 0 |
108 invalid_line = None | 241 invalid_line = None |
109 lines_kept = 0 | 242 lines_computed = 0 |
110 total_lines = 0 | 243 total_lines = 0 |
111 | 244 non_existent_col_pat = re.compile(r"name 'c\d+' is not defined") |
112 # Read input file, skipping invalid lines, and perform computation that will result in a new column | 245 |
113 code = ''' | 246 with open(args.input, encoding='utf-8') as fh, \ |
114 for i, line in enumerate(fh): | 247 open(args.output, 'w', encoding='utf-8') as out: |
115 total_lines += 1 | 248 if args.header: |
116 line = line.rstrip('\\r\\n') | 249 # compute new header line from original |
117 if not line or line.startswith('#'): | 250 header_cols = fh.readline().strip('\n').split('\t') |
118 skipped_lines += 1 | 251 for _, col_idx, mode, col_name, _ in ops: |
119 if not invalid_line: | 252 if mode is Mode.INSERT: |
120 first_invalid_line = i + 1 | 253 header_cols.insert(col_idx, col_name) |
121 invalid_line = line | 254 elif mode is Mode.REPLACE: |
122 continue | 255 header_cols[col_idx] = col_name |
123 try: | 256 else: |
124 %s | 257 header_cols.append(col_name) |
125 %s | 258 out.write('\t'.join(header_cols) + '\n') |
126 new_val = %s | 259 |
127 if round_result: | 260 # read data, skipping empty and comment lines, and perform computations |
128 new_val = int(round(new_val)) | 261 # that will result in new columns |
129 new_line = line + '\\t' + str(new_val) + "\\n" | 262 for i, line in enumerate(fh): |
130 out.write(new_line) | 263 total_lines += 1 |
131 lines_kept += 1 | 264 line = line.rstrip('\n') |
132 except Exception: | 265 if not line or line.startswith('#'): |
133 skipped_lines += 1 | 266 skipped_lines += 1 |
134 if not invalid_line: | 267 if not invalid_line: |
135 first_invalid_line = i + 1 | 268 first_invalid_line = i + 1 |
136 invalid_line = line | 269 invalid_line = line |
137 fh.close() | 270 continue |
138 ''' % (assign, wrap, expr) | 271 fields = line.split('\t') |
139 | 272 if len(fields) == in_columns: |
140 valid_expr = True | 273 try: |
141 try: | 274 typed_fields = cast_types(fields) |
142 exec(code) | 275 except ValueError as e: |
143 except Exception as e: | 276 sys.exit( |
144 out.close() | 277 'Failed to convert some of the columns in line #%d to their ' |
145 if str(e).startswith('invalid syntax'): | 278 'expected types. The error was: "%s" for the line: "%s"' |
146 valid_expr = False | 279 % (i, str(e), line) |
147 sys.exit('Expression "%s" likely invalid. See tool tips, syntax and examples.' % expr) | 280 ) |
148 else: | 281 else: |
149 sys.exit(str(e)) | 282 # A "suspicious" line with fewer or more fields than expected |
150 | 283 # Type-casting for it might fail or not, but it is pointless to |
151 if valid_expr: | 284 # even try because subsequent computation of any expression will |
152 out.close() | 285 # fail anyway as expression lambdas expect a fixed number of |
153 valid_lines = total_lines - skipped_lines | 286 # arguments. |
154 print('Creating column %d with expression %s' % (in_columns + 1, expr)) | 287 # Let's pass in a copy of the original string fields, let |
155 if valid_lines > 0: | 288 # the computation of the first expression fail, then have that |
156 print('kept %4.2f%% of %d lines.' % (100.0 * lines_kept / valid_lines, | 289 # situation handled according to the non-computable settings in |
157 total_lines)) | 290 # effect. |
158 else: | 291 typed_fields = fields[:] |
159 print('Possible invalid expression "%s" or non-existent column referenced. See tool tips, syntax and examples.' % expr) | 292 for fun, col_idx, mode, col_name, ex in ops: |
160 if skipped_lines > 0: | 293 try: |
161 print('Skipped %d invalid lines starting at line #%d: "%s"' % | 294 try: |
162 (skipped_lines, first_invalid_line, invalid_line)) | 295 new_val = fun(*typed_fields) |
296 except NameError as e: | |
297 # Python 3.10+ would have the problematic name | |
298 # available as e.name | |
299 if non_existent_col_pat.fullmatch(str(e)) and ( | |
300 not args.fail_on_non_existent_columns | |
301 ): | |
302 # Looks like a reference to a non-existent column | |
303 # and we are not supposed to fail on it directly. | |
304 # Reraise and have it handled as a non-computable | |
305 # row. | |
306 raise | |
307 # NameErrors are not row-specific, but indicate a | |
308 # general problem with the user-supplied expression. | |
309 sys.exit( | |
310 'While parsing expression "%s" the following ' | |
311 'problem occurred: "%s"' % (ex, str(e)) | |
312 ) | |
313 except Exception as e: | |
314 if args.skip_non_computable: | |
315 # log that a line got skipped, then stop computing | |
316 # for this line | |
317 skipped_lines += 1 | |
318 if not invalid_line: | |
319 first_invalid_line = i + 1 | |
320 invalid_line = line | |
321 break | |
322 if args.keep_non_computable: | |
323 # write the original line unchanged and stop computing | |
324 # for this line | |
325 out.write(line + '\n') | |
326 break | |
327 if args.non_computable_blank: | |
328 new_val = '' | |
329 elif args.non_computable_default is not None: | |
330 new_val = args.non_computable_default | |
331 else: | |
332 # --fail_on_non_computable | |
333 # (which is default behavior, too) | |
334 sys.exit( | |
335 'Could not compute a new column value using "%s" on ' | |
336 'line #%d: "%s". Error was "%s"' | |
337 % (ex, i, line, str(e)) | |
338 ) | |
339 if mode is Mode.INSERT: | |
340 fields.insert(col_idx, new_val) | |
341 typed_fields.insert(col_idx, new_val) | |
342 elif mode is Mode.REPLACE: | |
343 if col_idx > len(fields): | |
344 # Intentionally allow "replacing" one column beyond | |
345 # current fields since this can be used to fix | |
346 # short lines in the input. | |
347 sys.exit( | |
348 'Cannot replace column #%d in line with %d columns: ' | |
349 '"%s"' % (col_idx + 1, len(fields), line) | |
350 ) | |
351 fields[col_idx:col_idx + 1] = [new_val] | |
352 typed_fields[col_idx:col_idx + 1] = [new_val] | |
353 else: | |
354 fields.append(new_val) | |
355 typed_fields.append(new_val) | |
356 else: | |
357 fields = [to_str(field) for field in fields] | |
358 out.write('\t'.join(fields) + '\n') | |
359 lines_computed += 1 | |
360 | |
361 | |
362 valid_lines = total_lines - skipped_lines | |
363 if valid_lines > 0: | |
364 print( | |
365 'Computed new column values for %4.2f%% of %d lines written.' | |
366 % (100.0 * lines_computed / valid_lines, valid_lines) | |
367 ) | |
368 elif args.fail_on_non_existent_columns: | |
369 # Warn the user that there could be an issue with an expression. | |
370 print( | |
371 'Could not compute a new column for any input row! ' | |
372 'Please check your expression(s) "%s" for problems.' | |
373 % actions | |
374 ) | |
375 else: | |
376 # Same, but the problem could also be a reference to a non-existent | |
377 # column. | |
378 print( | |
379 'Could not compute a new column for any input row! ' | |
380 'Please check your expression(s) "%s" for references to non-existent ' | |
381 'columns or other problems.' | |
382 % actions | |
383 ) | |
384 if skipped_lines > 0: | |
385 print('Skipped %d invalid lines starting at line #%d: "%s"' % | |
386 (skipped_lines, first_invalid_line, invalid_line)) | |
387 if lines_computed < valid_lines: | |
388 print( | |
389 'Rewrote %d lines unmodified because computation of a new value failed' | |
390 % (valid_lines - lines_computed) | |
391 ) |
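The new docstring at the top of the right-hand file describes a row-wise computation over tab-delimited input in which empty and comment lines are passed over. The following is a minimal sketch of that idea only, with an invented two-column integer table and the expression c1 + c2; it is not the tool itself and omits typing, modes, and error handling.

```python
# Toy illustration; the input rows and the expression are invented.
rows = ["1\t2", "3\t4", "# a comment line", ""]
output = []
for line in rows:
    if not line or line.startswith('#'):
        continue                                   # empty and comment lines are skipped
    c1, c2 = [int(v) for v in line.split('\t')]    # type-cast the original fields
    output.append('%s\t%d' % (line, c1 + c2))      # append the computed column
print(output)                                      # ['1\t2\t3', '3\t4\t7']
```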
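The comment block introducing the action format (EXPR;[COL_ADD_SPEC];[COL_NAME], new lines 141-151) can be illustrated with a short sketch of how one action string is split and compiled into an expression lambda over the typed fields of a row. The action string and the row values here are assumptions for illustration; the real script additionally validates the expression against a whitelist and handles the append and replace modes.

```python
# Hypothetical action: compute c2/c3 and insert the result before column 3.
action = "float(c2) / c3;3I;ratio"                 # EXPR;COL_ADD_SPEC;COL_NAME

expr_string, col_add_spec, new_col_name = action.split(';')
new_col_idx = int(col_add_spec[:-1] or '0') - 1    # 1-based spec -> 0-based index
mode = col_add_spec[-1:]                           # '' append, 'I' insert, 'R' replace

# c1, c2, c3 name the (already type-cast) fields of the current row.
expr_lambda = eval('lambda c1, c2, c3: %s' % expr_string)

row = ['geneA', 10, 4]
new_val = expr_lambda(*row)                        # 2.5
if mode == 'I':
    row.insert(new_col_idx, new_val)
print(row)                                         # ['geneA', 10, 2.5, 4]
```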
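Based on the argparse definitions added in this revision, an invocation of the rewritten script could look like the sketch below. The file names, column types, actions, and default value are assumptions for illustration; the flags themselves (-t/--column-types, --header, -a/--actions, --non-computable-default) are the ones defined in the new code.

```python
# Hypothetical command line assembled with subprocess; adjust paths and actions as needed.
import subprocess

subprocess.run(
    [
        "python", "column_maker.py",
        "input.tsv", "output.tsv",          # positional input and output files (assumed names)
        "-t", "str,int,int",                # types of the three input columns
        "--header",                         # first input line carries column names
        "-a",                               # one or more EXPR;[COL_ADD_SPEC];[COL_NAME] actions
        "c2 + c3;;total",                   # append a column named "total"
        "c3;2R;count_copy",                 # replace column 2 with the value of c3
        "--non-computable-default", "NA",   # value to write when an expression cannot be computed
    ],
    check=True,
)
```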