comparison column_maker.py @ 9:33b81f9ea109 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/column_maker commit fe76077775aaca531f6a563fdfcbd73fbf1528e7
author iuc
date Thu, 28 Jul 2022 15:27:54 +0000
parents 227e82286a0e
children beec6ecc7d3c
comparison
equal deleted inserted replaced
8:227e82286a0e 9:33b81f9ea109
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 """ 2 """
3 This tool takes a tab-delimited textfile as input and creates another column in 3 This tool takes a tab-delimited textfile as input and creates new columns in
4 the file which is the result of a computation performed on every row in the 4 the file which are the result of a computation performed on every row in the
5 original file. The tool will skip over invalid lines within the file, 5 original file. The tool will skip over empty and comment (starting with a #)
6 informing the user about the number of lines skipped. 6 lines within the file. It does not change the formatting of any original,
7 retained columns.
7 """ 8 """
8 9
9 import argparse 10 import argparse
10 import json 11 import enum
11 import re 12 import re
12 import sys 13 import sys
13 # functions that may be used in the compute expression 14 # Functions that may be used in the compute expression
14 from math import ( # noqa: F401 15 from math import ( # noqa: F401
15 ceil, 16 ceil,
16 exp, 17 exp,
17 floor, 18 floor,
18 log, 19 log,
19 log10, 20 log10,
20 sqrt 21 sqrt,
21 ) 22 )
22 23
23 from numpy import format_float_positional # noqa: F401 24 from numpy import format_float_positional
25
26
27 class Mode(enum.Enum):
28 APPEND = ''
29 INSERT = 'I'
30 REPLACE = 'R'
31
32
33 def from_str(s, to_type):
34 if to_type is list:
35 return [part.strip(' ') for part in s.split(',')]
36 else:
37 return to_type(s)
38
39
40 def to_str(obj):
41 if type(obj) is list:
42 return ','.join([to_str(i) for i in obj])
43 if args.avoid_scientific_notation and type(obj) is float:
44 return format_float_positional(obj)
45 return str(obj)
46
24 47
25 parser = argparse.ArgumentParser() 48 parser = argparse.ArgumentParser()
26 parser.add_argument('input', type=argparse.FileType('r'), help="input file") 49 parser.add_argument('input', type=str, help='input file')
27 parser.add_argument('output', type=argparse.FileType('wt'), help="output file") 50 parser.add_argument('output', type=str, help='output file')
28 parser.add_argument('cond', nargs='?', type=str, help="expression") 51 parser.add_argument(
29 parser.add_argument('columns', nargs='?', type=int, help="number of columns") 52 '-t', '--column-types', nargs='?', required=True,
30 parser.add_argument('column_types', nargs='?', type=str, help="comma separated list of column types") 53 help='A comma-separated list of column types in the input file'
31 parser.add_argument('--round', action="store_true", 54 )
32 help="round result") 55 parser.add_argument(
33 parser.add_argument('--avoid_scientific_notation', action="store_true", 56 '--avoid-scientific-notation', action='store_true',
34 help="avoid scientific notation") 57 help='avoid scientific notation'
35 parser.add_argument('--header_new_column_name', default=None, type=str, 58 )
36 help="First line of input is a header line with column " 59 parser.add_argument(
37 "names and this should become the name of the new " 60 '--header', action='store_true',
38 "column") 61 help='The input has a header line with column names. '
39 parser.add_argument('--load_json', default=None, type=argparse.FileType('r'), 62 'Actions must specify names of newly calculated columns.'
40 help="overwrite parsed arguments from json file") 63 )
64 parser.add_argument(
65 '--fail-on-non-existent-columns', action='store_true',
66 help='If an action references a column number that is not existent '
67 'when the expression gets computed, the default behavior is to treat '
68 'this as a case of rows for which the expression cannot be computed. '
69 'The behavior of the tool will then depend on which of the '
70 'non-computable switches is in effect. With this flag, in contrast, '
71 'the tool will fail directly upon encountering a non-existing column.'
72 )
73 non_computable = parser.add_mutually_exclusive_group()
74 non_computable.add_argument('--fail-on-non-computable', action='store_true')
75 non_computable.add_argument('--skip-non-computable', action='store_true')
76 non_computable.add_argument('--keep-non-computable', action='store_true')
77 non_computable.add_argument('--non-computable-blank', action='store_true')
78 non_computable.add_argument('--non-computable-default')
79
80 group = parser.add_mutually_exclusive_group(required=True)
81 group.add_argument(
82 '-a', '--actions', nargs='*', type=str,
83 help='One or more action(s) of the format EXPR;[COL_ADD_SPEC];[COL_NAME]'
84 )
85 group.add_argument(
86 '-f', '--file', type=str,
87 help='File to read actions from (mutually exclusive with -a)'
88 )
41 args = parser.parse_args() 89 args = parser.parse_args()
42 90
43 argparse_dict = vars(args) 91 if not args.column_types:
44 if args.load_json: 92 with open(args.input) as fh:
45 json_dict = json.load(args.load_json) 93 if not fh.readline():
46 argparse_dict.update(json_dict) 94 # Generally, the input must have at least one column to be
47 95 # considered tabular, but empty files are ok and should produce
48 fh = argparse_dict['input'] 96 # empty output.
49 out = argparse_dict['output'] 97 with open(args.output, 'w') as out:
50 expr = argparse_dict['cond'] 98 pass
51 round_result = argparse_dict['round'] 99 sys.exit()
52 avoid_scientific_notation = argparse_dict['avoid_scientific_notation'] 100 sys.exit(
53 101 "Missing column types. "
54 if argparse_dict['header_new_column_name'] is not None: 102 "In Galaxy, click the pencil icon on the history item and "
55 header_line = fh.readline().strip('\n') 103 "select the Auto-detect option to correct it. "
56 out.write( 104 "This tool can only be used with tab-delimited data."
57 '{0}\t{1}\n'.format( 105 )
58 header_line, argparse_dict['header_new_column_name'] 106
59 ) 107 in_column_types = [t.strip() for t in args.column_types.split(',')]
60 ) 108 in_columns = len(in_column_types)
109
110 # Prepare initial column variable names and type cast representations
111 # for column data types
112 cols, type_casts = [], []
113 for n, col_type in enumerate(in_column_types, start=1):
114 col_name = "c%d" % n
115 cols.append(col_name)
116 col_str = ', '.join(cols) # 'c1, c2, c3, c4'
117
118 # Define lambda for type-casting of original row fields
61 try: 119 try:
62 in_columns = int(argparse_dict['columns']) 120 cast_types = eval(
63 if in_columns < 1: 121 'lambda fields: [from_str(s, t) for s, t in zip(fields, [%s])]'
64 # To be considered tabular, data must have at least one column. 122 % args.column_types
65 raise ValueError 123 )
66 except Exception: 124 except Exception as e:
67 if not fh.readline(): 125 sys.exit(
68 # empty file content is ok and should produce empty output 126 'While parsing column types, the following problem occured: "%s"'
69 out.close() 127 % e
70 sys.exit() 128 )
71 sys.exit("Missing or invalid 'columns' metadata value, click the pencil icon in the history item and select the Auto-detect option to correct it. This tool can only be used with tab-delimited data.") 129
72 try: 130 # Get and parse actions
73 in_column_types = argparse_dict['column_types'].split(',') 131 if args.file:
74 except Exception: 132 actions = []
75 sys.exit("Missing or invalid 'column_types' metadata value, click the pencil icon in the history item and select the Auto-detect option to correct it. This tool can only be used with tab-delimited data.") 133 with open(args.file) as i:
76 if len(in_column_types) != in_columns: 134 for line in i:
77 sys.exit("The 'columns' metadata setting does not conform to the 'column_types' metadata setting, click the pencil icon in the history item and select the Auto-detect option to correct it. This tool can only be used with tab-delimited data.") 135 line = line.strip()
78 136 if line:
137 actions.append(line)
138 else:
139 actions = args.actions
140
141 # each action must be a full data row manipulation instruction of the form:
142 # EXPR;[COL_ADD_SPEC];[COL_NAME]
143 # where EXPR is the actual expression to compute on the row,
144 # COL_ADD_SPEC consists of a column index and a mode identifier for how the
145 # new column should be added.
146 # Examples: 3I (insert new col before current column 3),
147 # 2R (replace current column 2 with new column);
148 # a missing COL_ADD_SPEC is interpreted as mode A (append new column at the
149 # end of the row).
150 # COL_NAME is required with the --header option and specifies the name of the
151 # new column; without --header, any COL_NAME gets ignored.
79 operators = 'is|not|or|and' 152 operators = 'is|not|or|and'
80 builtin_and_math_functions = 'abs|all|any|bin|chr|cmp|complex|divmod|float|bool|hex|int|len|long|max|min|oct|ord|pow|range|reversed|round|sorted|str|sum|type|unichr|unicode|log|log10|exp|sqrt|ceil|floor' 153 builtin_and_math_functions = (
81 string_and_list_methods = [name for name in dir('') + dir([]) if not name.startswith('_')] 154 'abs|all|any|ascii|bin|bool|chr|complex|divmod|float|format|hex|int|len|'
82 whitelist = r"^([c0-9\+\-\*\/\(\)\.\'\"><=,:! ]|%s|%s|%s)*$" % (operators, builtin_and_math_functions, '|'.join(string_and_list_methods)) 155 'list|map|max|min|oct|ord|pow|range|reversed|round|set|sorted|str|sum|type|'
83 if not re.compile(whitelist).match(expr): 156 'log|log10|exp|sqrt|ceil|floor'
84 sys.exit("Invalid expression") 157 )
85 if avoid_scientific_notation: 158 imported_numpy_function = 'format_float_positional'
86 expr = "format_float_positional(%s)" % expr 159 string_and_list_methods = [
87 160 name for name in dir('') + dir([]) if not name.startswith('_')
88 # Prepare the column variable names and wrappers for column data types 161 ]
89 cols, type_casts = [], [] 162 whitelist = r"^([c0-9\+\-\*\/\(\)\.\'\"><=,:! ]|%s|%s|%s|%s)*$" % (
90 for col in range(1, in_columns + 1): 163 operators,
91 col_name = "c%d" % col 164 builtin_and_math_functions,
92 cols.append(col_name) 165 imported_numpy_function,
93 col_type = in_column_types[col - 1].strip() 166 '|'.join(string_and_list_methods)
94 if not round_result and col_type == 'int': 167 )
95 col_type = 'float' 168 valid_pat = re.compile(whitelist)
96 type_cast = "%s(%s)" % (col_type, col_name) 169 ops = []
97 type_casts.append(type_cast) 170 num_cols = in_columns
98 171 for ac in actions:
99 col_str = ', '.join(cols) # 'c1, c2, c3, c4' 172 try:
100 type_cast_str = ', '.join(type_casts) # 'str(c1), int(c2), int(c3), str(c4)' 173 expr_string, col_add_spec, new_col_name = ac.split(';')
101 assign = "%s = line.split('\\t')" % col_str 174 except ValueError:
102 if len(cols) == 1: 175 sys.exit(
103 # Single column, unpacking by assignment won't work 176 'Invalid Action: "%s". '
104 assign += '[0]' 177 'Required format: EXPR;[COL_ADD_SPEC];[COL_NAME]' % ac
105 wrap = "%s = %s" % (col_str, type_cast_str) 178 )
179 if not valid_pat.match(expr_string):
180 sys.exit('Invalid expression: "%s"' % expr_string)
181 try:
182 expr_lambda = eval('lambda %s: %s' % (col_str, expr_string))
183 except Exception as e:
184 if str(e).startswith('invalid syntax'):
185 sys.exit(
186 'Expression "%s" caused a syntax error during parsing.'
187 % expr_string
188 )
189 else:
190 sys.exit(
191 'While parsing expression "%s" the following problem occured: '
192 '"%s"' % (expr_string, str(e))
193 )
194 try:
195 new_col_idx = int(col_add_spec[:-1] or '0') - 1
196 except ValueError:
197 sys.exit(
198 'COL_ADD_SPECS need to start with a (1-based) column index. '
199 'Could not parse a column index from "%s"' % col_add_spec
200 )
201 try:
202 mode = Mode(col_add_spec[-1:])
203 except ValueError:
204 sys.exit(
205 'COL_ADD_SPECS need to end in a single-character mode identifier '
206 '("I", or "R"), or be empty (for Append mode). '
207 'Could not parse a valid identifier from "%s"' % col_add_spec
208 )
209 if mode is Mode.REPLACE:
210 if new_col_idx < 0 or new_col_idx >= num_cols:
211 sys.exit(
212 'Cannot replace the contents of column %d as specified by '
213 'action "%s". No such column at this point of the '
214 'computation' % (new_col_idx + 1, ac)
215 )
216 if not new_col_name and args.header:
217 sys.exit(
218 'A name is required for any new columns when using an existing '
219 'header line (--header option), but found none in action: '
220 '"%s"' % ac
221 )
222 # Successfully parsed the instruction
223 # Store the expression lambda, the index and name of the new column, and
224 # the original string representation of the expression (for use in
225 # potential later error messages).
226 ops.append([expr_lambda, new_col_idx, mode, new_col_name, expr_string])
227 if mode is Mode.APPEND or mode is Mode.INSERT:
228 # If the current expression results in an additional column,
229 # we need to handle the new field in subsequent lambda functions.
230 num_cols += 1
231 col_str += ', c%d' % num_cols
232
233
234 # ready to start parsing the input file
235 print(
236 'Computing %d new columns with instructions %s'
237 % (num_cols - in_columns, actions)
238 )
106 skipped_lines = 0 239 skipped_lines = 0
107 first_invalid_line = 0 240 first_invalid_line = 0
108 invalid_line = None 241 invalid_line = None
109 lines_kept = 0 242 lines_computed = 0
110 total_lines = 0 243 total_lines = 0
111 244 non_existent_col_pat = re.compile(r"name 'c\d+' is not defined")
112 # Read input file, skipping invalid lines, and perform computation that will result in a new column 245
113 code = ''' 246 with open(args.input, encoding='utf-8') as fh, \
114 for i, line in enumerate(fh): 247 open(args.output, 'w', encoding='utf-8') as out:
115 total_lines += 1 248 if args.header:
116 line = line.rstrip('\\r\\n') 249 # compute new header line from original
117 if not line or line.startswith('#'): 250 header_cols = fh.readline().strip('\n').split('\t')
118 skipped_lines += 1 251 for _, col_idx, mode, col_name, _ in ops:
119 if not invalid_line: 252 if mode is Mode.INSERT:
120 first_invalid_line = i + 1 253 header_cols.insert(col_idx, col_name)
121 invalid_line = line 254 elif mode is Mode.REPLACE:
122 continue 255 header_cols[col_idx] = col_name
123 try: 256 else:
124 %s 257 header_cols.append(col_name)
125 %s 258 out.write('\t'.join(header_cols) + '\n')
126 new_val = %s 259
127 if round_result: 260 # read data, skipping empty and comment lines, and perform computations
128 new_val = int(round(new_val)) 261 # that will result in new columns
129 new_line = line + '\\t' + str(new_val) + "\\n" 262 for i, line in enumerate(fh):
130 out.write(new_line) 263 total_lines += 1
131 lines_kept += 1 264 line = line.rstrip('\n')
132 except Exception: 265 if not line or line.startswith('#'):
133 skipped_lines += 1 266 skipped_lines += 1
134 if not invalid_line: 267 if not invalid_line:
135 first_invalid_line = i + 1 268 first_invalid_line = i + 1
136 invalid_line = line 269 invalid_line = line
137 fh.close() 270 continue
138 ''' % (assign, wrap, expr) 271 fields = line.split('\t')
139 272 if len(fields) == in_columns:
140 valid_expr = True 273 try:
141 try: 274 typed_fields = cast_types(fields)
142 exec(code) 275 except ValueError as e:
143 except Exception as e: 276 sys.exit(
144 out.close() 277 'Failed to convert some of the columns in line #%d to their '
145 if str(e).startswith('invalid syntax'): 278 'expected types. The error was: "%s" for the line: "%s"'
146 valid_expr = False 279 % (i, str(e), line)
147 sys.exit('Expression "%s" likely invalid. See tool tips, syntax and examples.' % expr) 280 )
148 else: 281 else:
149 sys.exit(str(e)) 282 # A "suspicious" line with less or more fields than expected
150 283 # Type-casting for it might fail or not, but it is pointless to
151 if valid_expr: 284 # even try because subsequent computation of any expression will
152 out.close() 285 # fail anyway as expression lambdas expect a fixed number of
153 valid_lines = total_lines - skipped_lines 286 # arguments.
154 print('Creating column %d with expression %s' % (in_columns + 1, expr)) 287 # Lets pass in a copy of the original string fields, let
155 if valid_lines > 0: 288 # the computation of the first expression fail, then have that
156 print('kept %4.2f%% of %d lines.' % (100.0 * lines_kept / valid_lines, 289 # situation handled according to the non-computable settings in
157 total_lines)) 290 # effect.
158 else: 291 typed_fields = fields[:]
159 print('Possible invalid expression "%s" or non-existent column referenced. See tool tips, syntax and examples.' % expr) 292 for fun, col_idx, mode, col_name, ex in ops:
160 if skipped_lines > 0: 293 try:
161 print('Skipped %d invalid lines starting at line #%d: "%s"' % 294 try:
162 (skipped_lines, first_invalid_line, invalid_line)) 295 new_val = fun(*typed_fields)
296 except NameError as e:
297 # Python 3.10+ would have the problematic name
298 # available as e.name
299 if non_existent_col_pat.fullmatch(str(e)) and (
300 not args.fail_on_non_existent_columns
301 ):
302 # Looks like a reference to a non-existent column
303 # and we are not supposed to fail on it directly.
304 # Reraise and have it handled as a non-computable
305 # row.
306 raise
307 # NameErrors are not row-specific, but indicate a
308 # general problem with the user-supplied expression.
309 sys.exit(
310 'While parsing expression "%s" the following '
311 'problem occured: "%s"' % (ex, str(e))
312 )
313 except Exception as e:
314 if args.skip_non_computable:
315 # log that a line got skipped, then stop computing
316 # for this line
317 skipped_lines += 1
318 if not invalid_line:
319 first_invalid_line = i + 1
320 invalid_line = line
321 break
322 if args.keep_non_computable:
323 # write the original line unchanged and stop computing
324 # for this line
325 out.write(line + '\n')
326 break
327 if args.non_computable_blank:
328 new_val = ''
329 elif args.non_computable_default is not None:
330 new_val = args.non_computable_default
331 else:
332 # --fail_on_non_computable
333 # (which is default behavior, too)
334 sys.exit(
335 'Could not compute a new column value using "%s" on '
336 'line #%d: "%s". Error was "%s"'
337 % (ex, i, line, str(e))
338 )
339 if mode is Mode.INSERT:
340 fields.insert(col_idx, new_val)
341 typed_fields.insert(col_idx, new_val)
342 elif mode is Mode.REPLACE:
343 if col_idx > len(fields):
344 # Intentionally allow "replacing" one column beyond
345 # current fields since this can be used to fix
346 # short lines in the input.
347 sys.exit(
348 'Cannot replace column #%d in line with %d columns: '
349 '"%s"' % (col_idx + 1, len(fields), line)
350 )
351 fields[col_idx:col_idx + 1] = [new_val]
352 typed_fields[col_idx:col_idx + 1] = [new_val]
353 else:
354 fields.append(new_val)
355 typed_fields.append(new_val)
356 else:
357 fields = [to_str(field) for field in fields]
358 out.write('\t'.join(fields) + '\n')
359 lines_computed += 1
360
361
362 valid_lines = total_lines - skipped_lines
363 if valid_lines > 0:
364 print(
365 'Computed new column values for %4.2f%% of %d lines written.'
366 % (100.0 * lines_computed / valid_lines, valid_lines)
367 )
368 elif args.fail_on_non_existent_columns:
369 # Warn the user that there could be an issue with an expression.
370 print(
371 'Could not compute a new column for any input row! '
372 'Please check your expression(s) "%s" for problems.'
373 % actions
374 )
375 else:
376 # Same, but the problem could also be a reference to a non-existent
377 # column.
378 print(
379 'Could not compute a new column for any input row! '
380 'Please check your expression(s) "%s" for references to non-existent '
381 'columns or other problems.'
382 % actions
383 )
384 if skipped_lines > 0:
385 print('Skipped %d invalid lines starting at line #%d: "%s"' %
386 (skipped_lines, first_invalid_line, invalid_line))
387 if lines_computed < valid_lines:
388 print(
389 'Rewrote %d lines unmodified because computation of a new value failed'
390 % (valid_lines - lines_computed)
391 )