changeset 9:33b81f9ea109 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/column_maker commit fe76077775aaca531f6a563fdfcbd73fbf1528e7
author iuc
date Thu, 28 Jul 2022 15:27:54 +0000
parents 227e82286a0e
children beec6ecc7d3c
files column_maker.py column_maker.xml test-data/bed12.bed test-data/bed12_modified.bed test-data/bed_from_chrom_pos_ref.bed test-data/chrom_pos_ref.tab test-data/olympics.tsv test-data/olympics_bmi_out.tab test-data/short_line_test.tab test-data/short_line_test_out.tab
diffstat 10 files changed, 736 insertions(+), 211 deletions(-) [+]
line wrap: on
line diff
--- a/column_maker.py	Wed Feb 24 05:19:12 2021 +0000
+++ b/column_maker.py	Thu Jul 28 15:27:54 2022 +0000
@@ -1,162 +1,391 @@
 #!/usr/bin/env python
 """
-This tool takes a tab-delimited textfile as input and creates another column in
-the file which is the result of a computation performed on every row in the
-original file. The tool will skip over invalid lines within the file,
-informing the user about the number of lines skipped.
+This tool takes a tab-delimited textfile as input and creates new columns in
+the file which are the result of a computation performed on every row in the
+original file. The tool will skip over empty and comment (starting with a #)
+lines within the file. It does not change the formatting of any original,
+retained columns.
 """
 
 import argparse
-import json
+import enum
 import re
 import sys
-# functions that may be used in the compute expression
+# Functions that may be used in the compute expression
 from math import (  # noqa: F401
     ceil,
     exp,
     floor,
     log,
     log10,
-    sqrt
+    sqrt,
 )
 
-from numpy import format_float_positional  # noqa: F401
+from numpy import format_float_positional
+
+
+class Mode(enum.Enum):
+    APPEND = ''
+    INSERT = 'I'
+    REPLACE = 'R'
+
+
+def from_str(s, to_type):
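+    # e.g. from_str('73,69', list) -> ['73', '69'] and from_str('42', int) -> 42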
+    if to_type is list:
+        return [part.strip(' ') for part in s.split(',')]
+    else:
+        return to_type(s)
+
+
+def to_str(obj):
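+    # e.g. to_str(['73', '69']) -> '73,69'; for floats, --avoid-scientific-notation
+    # switches the representation to numpy's format_float_positional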
+    if type(obj) is list:
+        return ','.join([to_str(i) for i in obj])
+    if args.avoid_scientific_notation and type(obj) is float:
+        return format_float_positional(obj)
+    return str(obj)
+
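+# Example invocation, mirroring the argument order used by the Galaxy wrapper
+# (file names are hypothetical):
+#   python column_maker.py --column-types str,int,int,str,int,str \
+#       --file actions.txt input.bed output.tab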
 
 parser = argparse.ArgumentParser()
-parser.add_argument('input', type=argparse.FileType('r'), help="input file")
-parser.add_argument('output', type=argparse.FileType('wt'), help="output file")
-parser.add_argument('cond', nargs='?', type=str, help="expression")
-parser.add_argument('columns', nargs='?', type=int, help="number of columns")
-parser.add_argument('column_types', nargs='?', type=str, help="comma separated list of column types")
-parser.add_argument('--round', action="store_true",
-                    help="round result")
-parser.add_argument('--avoid_scientific_notation', action="store_true",
-                    help="avoid scientific notation")
-parser.add_argument('--header_new_column_name', default=None, type=str,
-                    help="First line of input is a header line with column "
-                         "names and this should become the name of the new "
-                         "column")
-parser.add_argument('--load_json', default=None, type=argparse.FileType('r'),
-                    help="overwrite parsed arguments from json file")
+parser.add_argument('input', type=str, help='input file')
+parser.add_argument('output', type=str, help='output file')
+parser.add_argument(
+    '-t', '--column-types', nargs='?', required=True,
+    help='A comma-separated list of column types in the input file'
+)
+parser.add_argument(
+    '--avoid-scientific-notation', action='store_true',
+    help='avoid scientific notation'
+)
+parser.add_argument(
+    '--header', action='store_true',
+    help='The input has a header line with column names. '
+         'Actions must specify names of newly calculated columns.'
+)
+parser.add_argument(
+    '--fail-on-non-existent-columns', action='store_true',
+    help='If an action references a column number that does not exist '
+         'when the expression gets computed, the default behavior is to treat '
+         'this as a case of rows for which the expression cannot be computed. '
+         'The behavior of the tool will then depend on which of the '
+         'non-computable switches is in effect. With this flag, in contrast, '
+         'the tool will fail directly upon encountering a non-existing column.'
+)
+non_computable = parser.add_mutually_exclusive_group()
+non_computable.add_argument('--fail-on-non-computable', action='store_true')
+non_computable.add_argument('--skip-non-computable', action='store_true')
+non_computable.add_argument('--keep-non-computable', action='store_true')
+non_computable.add_argument('--non-computable-blank', action='store_true')
+non_computable.add_argument('--non-computable-default')
+
+group = parser.add_mutually_exclusive_group(required=True)
+group.add_argument(
+    '-a', '--actions', nargs='*', type=str,
+    help='One or more action(s) of the format EXPR;[COL_ADD_SPEC];[COL_NAME]'
+)
+group.add_argument(
+    '-f', '--file', type=str,
+    help='File to read actions from (mutually exclusive with -a)'
+)
 args = parser.parse_args()
 
-argparse_dict = vars(args)
-if args.load_json:
-    json_dict = json.load(args.load_json)
-    argparse_dict.update(json_dict)
-
-fh = argparse_dict['input']
-out = argparse_dict['output']
-expr = argparse_dict['cond']
-round_result = argparse_dict['round']
-avoid_scientific_notation = argparse_dict['avoid_scientific_notation']
+if not args.column_types:
+    with open(args.input) as fh:
+        if not fh.readline():
+            # Generally, the input must have at least one column to be
+            # considered tabular, but empty files are ok and should produce
+            # empty output.
+            with open(args.output, 'w') as out:
+                pass
+            sys.exit()
+        sys.exit(
+            "Missing column types. "
+            "In Galaxy, click the pencil icon on the history item and "
+            "select the Auto-detect option to correct it.  "
+            "This tool can only be used with tab-delimited data."
+        )
 
-if argparse_dict['header_new_column_name'] is not None:
-    header_line = fh.readline().strip('\n')
-    out.write(
-        '{0}\t{1}\n'.format(
-            header_line, argparse_dict['header_new_column_name']
-        )
-    )
-try:
-    in_columns = int(argparse_dict['columns'])
-    if in_columns < 1:
-        # To be considered tabular, data must have at least one column.
-        raise ValueError
-except Exception:
-    if not fh.readline():
-        # empty file content is ok and should produce empty output
-        out.close()
-        sys.exit()
-    sys.exit("Missing or invalid 'columns' metadata value, click the pencil icon in the history item and select the Auto-detect option to correct it.  This tool can only be used with tab-delimited data.")
+in_column_types = [t.strip() for t in args.column_types.split(',')]
+in_columns = len(in_column_types)
+
+# Prepare initial column variable names and type cast representations
+# for column data types
+cols, type_casts = [], []
+for n, col_type in enumerate(in_column_types, start=1):
+    col_name = "c%d" % n
+    cols.append(col_name)
+col_str = ', '.join(cols)    # 'c1, c2, c3, c4'
+
+# Define lambda for type-casting of original row fields
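+# For example, with --column-types 'str,int,int' the eval below effectively
+# builds: lambda fields: [from_str(s, t) for s, t in zip(fields, [str, int, int])]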
 try:
-    in_column_types = argparse_dict['column_types'].split(',')
-except Exception:
-    sys.exit("Missing or invalid 'column_types' metadata value, click the pencil icon in the history item and select the Auto-detect option to correct it.  This tool can only be used with tab-delimited data.")
-if len(in_column_types) != in_columns:
-    sys.exit("The 'columns' metadata setting does not conform to the 'column_types' metadata setting, click the pencil icon in the history item and select the Auto-detect option to correct it.  This tool can only be used with tab-delimited data.")
+    cast_types = eval(
+        'lambda fields: [from_str(s, t) for s, t in zip(fields, [%s])]'
+        % args.column_types
+    )
+except Exception as e:
+    sys.exit(
+        'While parsing column types, the following problem occurred: "%s"'
+        % e
+    )
 
-operators = 'is|not|or|and'
-builtin_and_math_functions = 'abs|all|any|bin|chr|cmp|complex|divmod|float|bool|hex|int|len|long|max|min|oct|ord|pow|range|reversed|round|sorted|str|sum|type|unichr|unicode|log|log10|exp|sqrt|ceil|floor'
-string_and_list_methods = [name for name in dir('') + dir([]) if not name.startswith('_')]
-whitelist = r"^([c0-9\+\-\*\/\(\)\.\'\"><=,:! ]|%s|%s|%s)*$" % (operators, builtin_and_math_functions, '|'.join(string_and_list_methods))
-if not re.compile(whitelist).match(expr):
-    sys.exit("Invalid expression")
-if avoid_scientific_notation:
-    expr = "format_float_positional(%s)" % expr
+# Get and parse actions
+if args.file:
+    actions = []
+    with open(args.file) as i:
+        for line in i:
+            line = line.strip()
+            if line:
+                actions.append(line)
+else:
+    actions = args.actions
 
-# Prepare the column variable names and wrappers for column data types
-cols, type_casts = [], []
-for col in range(1, in_columns + 1):
-    col_name = "c%d" % col
-    cols.append(col_name)
-    col_type = in_column_types[col - 1].strip()
-    if not round_result and col_type == 'int':
-        col_type = 'float'
-    type_cast = "%s(%s)" % (col_type, col_name)
-    type_casts.append(type_cast)
+# each action must be a full data row manipulation instruction of the form:
+# EXPR;[COL_ADD_SPEC];[COL_NAME]
+# where EXPR is the actual expression to compute on the row,
+# COL_ADD_SPEC consists of a column index and a mode identifier for how the
+# new column should be added.
+# Examples: 3I (insert new col before current column 3),
+# 2R (replace current column 2 with new column);
+# a missing COL_ADD_SPEC is interpreted as Append mode (the new column gets
+# added at the end of the row).
+# COL_NAME is required with the --header option and specifies the name of the
+# new column; without --header, any COL_NAME gets ignored.
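+#
+# Examples of complete actions (illustrative only; the column names matter
+# only when --header is used):
+#   'c3 - c2;;length'         -> append result as a new column named "length"
+#   'c4 * 1.;5R;score'        -> replace column 5 with the result
+#   'round(c1 / 10);1I;tenth' -> insert the result before current column 1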
+operators = 'is|not|or|and'
+builtin_and_math_functions = (
+    'abs|all|any|ascii|bin|bool|chr|complex|divmod|float|format|hex|int|len|'
+    'list|map|max|min|oct|ord|pow|range|reversed|round|set|sorted|str|sum|type|'
+    'log|log10|exp|sqrt|ceil|floor'
+)
+imported_numpy_function = 'format_float_positional'
+string_and_list_methods = [
+    name for name in dir('') + dir([]) if not name.startswith('_')
+]
+whitelist = r"^([c0-9\+\-\*\/\(\)\.\'\"><=,:! ]|%s|%s|%s|%s)*$" % (
+    operators,
+    builtin_and_math_functions,
+    imported_numpy_function,
+    '|'.join(string_and_list_methods)
+)
+valid_pat = re.compile(whitelist)
+ops = []
+num_cols = in_columns
+for ac in actions:
+    try:
+        expr_string, col_add_spec, new_col_name = ac.split(';')
+    except ValueError:
+        sys.exit(
+            'Invalid Action: "%s".  '
+            'Required format: EXPR;[COL_ADD_SPEC];[COL_NAME]' % ac
+        )
+    if not valid_pat.match(expr_string):
+        sys.exit('Invalid expression: "%s"' % expr_string)
+    try:
+        expr_lambda = eval('lambda %s: %s' % (col_str, expr_string))
+    except Exception as e:
+        if str(e).startswith('invalid syntax'):
+            sys.exit(
+                'Expression "%s" caused a syntax error during parsing.'
+                % expr_string
+            )
+        else:
+            sys.exit(
+                'While parsing expression "%s" the following problem occurred: '
+                '"%s"' % (expr_string, str(e))
+            )
+    try:
+        new_col_idx = int(col_add_spec[:-1] or '0') - 1
+    except ValueError:
+        sys.exit(
+            'COL_ADD_SPECS need to start with a (1-based) column index. '
+            'Could not parse a column index from "%s"' % col_add_spec
+        )
+    try:
+        mode = Mode(col_add_spec[-1:])
+    except ValueError:
+        sys.exit(
+            'COL_ADD_SPECS need to end in a single-character mode identifier '
+            '("I", or "R"), or be empty (for Append mode).  '
+            'Could not parse a valid identifier from "%s"' % col_add_spec
+        )
+    if mode is Mode.REPLACE:
+        if new_col_idx < 0 or new_col_idx >= num_cols:
+            sys.exit(
+                'Cannot replace the contents of column %d as specified by '
+                'action "%s".  No such column at this point of the '
+                'computation' % (new_col_idx + 1, ac)
+            )
+    if not new_col_name and args.header:
+        sys.exit(
+            'A name is required for any new columns when using an existing '
+            'header line (--header option), but found none in action: '
+            '"%s"' % ac
+        )
+    # Successfully parsed the instruction
+    # Store the expression lambda, the index and name of the new column, and
+    # the original string representation of the expression (for use in
+    # potential later error messages).
+    ops.append([expr_lambda, new_col_idx, mode, new_col_name, expr_string])
+    if mode is Mode.APPEND or mode is Mode.INSERT:
+        # If the current expression results in an additional column,
+        # we need to handle the new field in subsequent lambda functions.
+        num_cols += 1
+        col_str += ', c%d' % num_cols
 
-col_str = ', '.join(cols)    # 'c1, c2, c3, c4'
-type_cast_str = ', '.join(type_casts)  # 'str(c1), int(c2), int(c3), str(c4)'
-assign = "%s = line.split('\\t')" % col_str
-if len(cols) == 1:
-    # Single column, unpacking by assignment won't work
-    assign += '[0]'
-wrap = "%s = %s" % (col_str, type_cast_str)
+
+# ready to start parsing the input file
+print(
+    'Computing %d new columns with instructions %s'
+    % (num_cols - in_columns, actions)
+)
 skipped_lines = 0
 first_invalid_line = 0
 invalid_line = None
-lines_kept = 0
+lines_computed = 0
 total_lines = 0
+non_existent_col_pat = re.compile(r"name 'c\d+' is not defined")
+
+with open(args.input, encoding='utf-8') as fh, \
+     open(args.output, 'w', encoding='utf-8') as out:
+    if args.header:
+        # compute new header line from original
+        header_cols = fh.readline().strip('\n').split('\t')
+        for _, col_idx, mode, col_name, _ in ops:
+            if mode is Mode.INSERT:
+                header_cols.insert(col_idx, col_name)
+            elif mode is Mode.REPLACE:
+                header_cols[col_idx] = col_name
+            else:
+                header_cols.append(col_name)
+        out.write('\t'.join(header_cols) + '\n')
 
-# Read input file, skipping invalid lines, and perform computation that will result in a new column
-code = '''
-for i, line in enumerate(fh):
-    total_lines += 1
-    line = line.rstrip('\\r\\n')
-    if not line or line.startswith('#'):
-        skipped_lines += 1
-        if not invalid_line:
-            first_invalid_line = i + 1
-            invalid_line = line
-        continue
-    try:
-        %s
-        %s
-        new_val = %s
-        if round_result:
-            new_val = int(round(new_val))
-        new_line = line + '\\t' + str(new_val) + "\\n"
-        out.write(new_line)
-        lines_kept += 1
-    except Exception:
-        skipped_lines += 1
-        if not invalid_line:
-            first_invalid_line = i + 1
-            invalid_line = line
-fh.close()
-''' % (assign, wrap, expr)
+    # read data, skipping empty and comment lines, and perform computations
+    # that will result in new columns
+    for i, line in enumerate(fh):
+        total_lines += 1
+        line = line.rstrip('\n')
+        if not line or line.startswith('#'):
+            skipped_lines += 1
+            if not invalid_line:
+                first_invalid_line = i + 1
+                invalid_line = line
+            continue
+        fields = line.split('\t')
+        if len(fields) == in_columns:
+            try:
+                typed_fields = cast_types(fields)
+            except ValueError as e:
+                sys.exit(
+                    'Failed to convert some of the columns in line #%d to their '
+                    'expected types.  The error was: "%s" for the line: "%s"'
+                    % (i, str(e), line)
+                )
+        else:
+            # A "suspicious" line with fewer or more fields than expected
+            # Type-casting for it might fail or not, but it is pointless to
+            # even try because subsequent computation of any expression will
+            # fail anyway as expression lambdas expect a fixed number of
+            # arguments.
+            # Let's pass in a copy of the original string fields, let
+            # the computation of the first expression fail, then have that
+            # situation handled according to the non-computable settings in
+            # effect.
+            typed_fields = fields[:]
+        for fun, col_idx, mode, col_name, ex in ops:
+            try:
+                try:
+                    new_val = fun(*typed_fields)
+                except NameError as e:
+                    # Python 3.10+ would have the problematic name
+                    # available as e.name
+                    if non_existent_col_pat.fullmatch(str(e)) and (
+                        not args.fail_on_non_existent_columns
+                    ):
+                        # Looks like a reference to a non-existent column
+                        # and we are not supposed to fail on it directly.
+                        # Reraise and have it handled as a non-computable
+                        # row.
+                        raise
+                    # NameErrors are not row-specific, but indicate a
+                    # general problem with the user-supplied expression.
+                    sys.exit(
+                        'While parsing expression "%s" the following '
+                        'problem occurred: "%s"' % (ex, str(e))
+                    )
+            except Exception as e:
+                if args.skip_non_computable:
+                    # log that a line got skipped, then stop computing
+                    # for this line
+                    skipped_lines += 1
+                    if not invalid_line:
+                        first_invalid_line = i + 1
+                        invalid_line = line
+                    break
+                if args.keep_non_computable:
+                    # write the original line unchanged and stop computing
+                    # for this line
+                    out.write(line + '\n')
+                    break
+                if args.non_computable_blank:
+                    new_val = ''
+                elif args.non_computable_default is not None:
+                    new_val = args.non_computable_default
+                else:
+                    # --fail-on-non-computable
+                    # (which is default behavior, too)
+                    sys.exit(
+                        'Could not compute a new column value using "%s" on '
+                        'line #%d: "%s".  Error was "%s"'
+                        % (ex, i, line, str(e))
+                    )
+            if mode is Mode.INSERT:
+                fields.insert(col_idx, new_val)
+                typed_fields.insert(col_idx, new_val)
+            elif mode is Mode.REPLACE:
+                # Note: col_idx == len(fields) is intentionally allowed here;
+                # "replacing" one column just beyond the current fields can be
+                # used to fix short lines in the input.
+                if col_idx > len(fields):
+                    sys.exit(
+                        'Cannot replace column #%d in line with %d columns: '
+                        '"%s"' % (col_idx + 1, len(fields), line)
+                    )
+                fields[col_idx:col_idx + 1] = [new_val]
+                typed_fields[col_idx:col_idx + 1] = [new_val]
+            else:
+                fields.append(new_val)
+                typed_fields.append(new_val)
+        else:
+            fields = [to_str(field) for field in fields]
+            out.write('\t'.join(fields) + '\n')
+            lines_computed += 1
 
-valid_expr = True
-try:
-    exec(code)
-except Exception as e:
-    out.close()
-    if str(e).startswith('invalid syntax'):
-        valid_expr = False
-        sys.exit('Expression "%s" likely invalid. See tool tips, syntax and examples.' % expr)
-    else:
-        sys.exit(str(e))
 
-if valid_expr:
-    out.close()
-    valid_lines = total_lines - skipped_lines
-    print('Creating column %d with expression %s' % (in_columns + 1, expr))
-    if valid_lines > 0:
-        print('kept %4.2f%% of %d lines.' % (100.0 * lines_kept / valid_lines,
-                                             total_lines))
-    else:
-        print('Possible invalid expression "%s" or non-existent column referenced. See tool tips, syntax and examples.' % expr)
-    if skipped_lines > 0:
-        print('Skipped %d invalid lines starting at line #%d: "%s"' %
-              (skipped_lines, first_invalid_line, invalid_line))
+valid_lines = total_lines - skipped_lines
+if valid_lines > 0:
+    print(
+        'Computed new column values for %4.2f%% of %d lines written.'
+        % (100.0 * lines_computed / valid_lines, valid_lines)
+    )
+elif args.fail_on_non_existent_columns:
+    # Warn the user that there could be an issue with an expression.
+    print(
+        'Could not compute a new column for any input row!  '
+        'Please check your expression(s) "%s" for problems.'
+        % actions
+    )
+else:
+    # Same, but the problem could also be a reference to a non-existent
+    # column.
+    print(
+        'Could not compute a new column for any input row!  '
+        'Please check your expression(s) "%s" for references to non-existent '
+        'columns or other problems.'
+        % actions
+    )
+if skipped_lines > 0:
+    print('Skipped %d invalid lines starting at line #%d: "%s"' %
+          (skipped_lines, first_invalid_line, invalid_line))
+if lines_computed < valid_lines:
+    print(
+        'Rewrote %d lines unmodified because computation of a new value failed'
+        % (valid_lines - lines_computed)
+    )
--- a/column_maker.xml	Wed Feb 24 05:19:12 2021 +0000
+++ b/column_maker.xml	Thu Jul 28 15:27:54 2022 +0000
@@ -1,103 +1,161 @@
-<tool id="Add_a_column1" name="Compute" version="1.6">
-    <description>an expression on every row</description>
+<tool id="Add_a_column1" name="Compute" version="2.0">
+    <description>on rows</description>
+    <macros>
+        <xml name="compute_repeat">
+            <repeat name="expressions" title="Expressions" min="1" default="1">
+                <param name="cond" type="text" value="c3-c2" label="Add expression">
+                    <sanitizer>
+                        <valid initial="default">
+                            <add value="&lt;" />
+                            <add value="&gt;" />
+                            <add value="&quot;" />
+                            <add value="&apos;" />
+                        </valid>
+                    </sanitizer>
+                </param>
+                <conditional name="add_column">
+                    <param name="mode" type="select" label="Mode of the operation">
+                        <option value="">Append</option>
+                        <option value="I">Insert</option>
+                        <option value="R">Replace</option>
+                    </param>
+                    <when value="">
+                        <param name="pos" type="hidden" value="" />
+                    </when>
+                    <when value="I">
+                        <param name="pos" type="integer" min="1" value="1" label="Insert new column before existing column number" />
+                    </when>
+                    <when value="R">
+                        <param name="pos" type="integer" min="1" value="1" label="Use new column to replace column number" />
+                    </when>
+                </conditional>
+                <yield />
+            </repeat>
+        </xml>
+    </macros>
     <requirements>
         <requirement type="package" version="3.8">python</requirement>
-        <requirement type="package" version="1.19.1">numpy</requirement>
+        <requirement type="package" version="1.23.1">numpy</requirement>
     </requirements>
     <command detect_errors="aggressive"><![CDATA[
-        ln -s '$input' data &&
-
-        ## inject colums and column_types metadata into inputs json
-        #import json
-        #set inputs_dict = json.load(open($inputs))
-        #set inputs_dict['columns'] = $input.metadata.columns
-        #set inputs_dict['column_types'] = $input.metadata.column_types
-        ## flatten conditional
-        #if $header_lines_conditional.header_lines_select == "yes":
-            #set inputs_dict['header_new_column_name'] = str($header_lines_conditional.header_new_column_name)
-        #end if
-        #set x = json.dump($inputs_dict, open($inputs, 'w'))
-
-        python '$__tool_directory__/column_maker.py'
-             data '$out_file1'
-             --load_json '$inputs'
+python '$__tool_directory__/column_maker.py'
+#if str($error_handling.auto_col_types) == 'on':
+    #set $col_types = $input.metadata.column_types
+#else:
+    #set $col_types = ','.join(['str' for t in $input.metadata.column_types.split(',')])
+#end if
+--column-types $col_types
+$avoid_scientific_notation
+#if str($ops.header_lines_select) == 'yes':
+    --header
+#end if
+--file '$expressions_file'
+$error_handling.fail_on_non_existent_columns
+$error_handling.non_computable.action
+#if str($error_handling.non_computable.action) == '--non-computable-default':
+    '$error_handling.non_computable.default_value'
+#end if
+'$input'
+'$out_file1'
     ]]></command>
     <configfiles>
-      <inputs name="inputs"/>
+      <configfile name="expressions_file"><![CDATA[
+#if str($ops.header_lines_select) == 'yes':
+    #for $expr in $ops.expressions:
+${expr.cond};${expr.add_column.pos}${expr.add_column.mode};${expr.new_column_name}
+    #end for
+#else:
+    #for $expr in $ops.expressions:
+${expr.cond};${expr.add_column.pos}${expr.add_column.mode};
+    #end for
+#end if
+]]></configfile>
     </configfiles>
     <inputs>
-        <param name="cond" type="text" value="c3-c2" label="Add expression">
-            <sanitizer>
-                <valid initial="default">
-                    <add value="&lt;" />
-                    <add value="&gt;" />
-                    <add value="&quot;" />
-                    <add value="&apos;" />
-                </valid>
-            </sanitizer>
-        </param>
-        <param format="tabular" name="input" type="data" label="as a new column to" help="Dataset missing? See TIP below"/>
-        <param name="round" type="boolean" truevalue="yes" falsevalue="no" label="Round result?" />
-        <param name="avoid_scientific_notation" type="boolean" truevalue="yes" falsevalue="no"
-        label="Avoid scientific notation"
-        help="If yes, use fully expanded decimal representation when writing new columns (use only if expression produces decimal numbers)." />
-        <conditional name="header_lines_conditional">
+        <param name="input" type="data" format="tabular" label="Input file" help="Dataset missing? See TIP below" />
+        <conditional name="ops">
             <param name="header_lines_select" type="select"
             label="Input has a header line with column names?"
-            help="Select Yes to be able to specify a name for the new column and have it added to the header line. If you select No, the first line will be treated as a regular line: If it is empty or starts with a # character it will be skipped, otherwise the tool will attempt to compute the specified expression on it." >
-                <option value="no" >No</option>
-                <option value="yes" >Yes</option>
+            help="Select Yes to be able to specify names for new columns and have them added to the header line. If you select No, the first line will be treated as a regular line: If it is empty or starts with a # character it will be skipped, otherwise the tool will attempt to compute the specified expression on it." >
+                <option value="no">No</option>
+                <option value="yes">Yes</option>
             </param>
             <when value="no">
+                <expand macro="compute_repeat" />
             </when>
             <when value="yes">
-                <param name="header_new_column_name" type="text" value="New Column" label="The new column name" />
+                <expand macro="compute_repeat">
+                    <param name="new_column_name" type="text" value="New Column" label="The new column name" />
+                </expand>
             </when>
         </conditional>
+        <param name="avoid_scientific_notation" type="boolean" truevalue="--avoid-scientific-notation" falsevalue=""
+        label="Avoid scientific notation in any newly computed columns"
+        help="If yes, use fully expanded decimal representation when writing new columns with floating point values. To prevent scientific notation in just specific new columns, you can use numpy's format_float_positional function in the corresponding expression." />
+        <section name="error_handling" title="Error handling">
+            <param name="auto_col_types" type="boolean" truevalue="on" falsevalue="off" checked="true" label="Autodetect column types"
+            help="By default, try to use the column types that Galaxy has recorded for the input. This simplifies expressions, but can occasionally cause problems on its own. If disabled all column values are assumed to be strings and you will have to handle conversions to different types explicitly in the expression." />
+            <param argument="--fail-on-non-existent-columns" type="boolean" truevalue="--fail-on-non-existent-columns" falsevalue="" checked="true" label="Fail on references to non-existent columns"
+            help="If any expression references a column number that does not exist when that expression gets computed, the tool run will fail. Uncheck to have such a situation handled as a case of a non-computable expression as configured below." />
+            <conditional name="non_computable">
+                <param name="action" type="select" label="If an expression cannot be computed for a row">
+                    <option value="--fail-on-non-computable">Fail the entire tool run</option>
+                    <option value="--skip-non-computable">Skip the row</option>
+                    <option value="--keep-non-computable">Keep the row unchanged</option>
+                    <option value="--non-computable-blank">Produce an empty column value for the row</option>
+                    <option value="--non-computable-default">Fill in a replacement value</option>
+                </param>
+                <when value="--fail-on-non-computable" />
+                <when value="--skip-non-computable" />
+                <when value="--keep-non-computable" />
+                <when value="--non-computable-blank" />
+                <when value="--non-computable-default">
+                    <param name="default_value" type="text" label="Replacement value" help="Pick from suggestions or enter your own.">
+                        <option value="nan">nan (not a number)</option>
+                        <option value="inf">inf (infinity)</option>
+                        <option value="-inf">-inf (negative infinity)</option>
+                        <option value="NA">NA (not available)</option>
+                        <option value=".">.</option>
+                    </param>
+                </when>
+            </conditional>
+        </section>
     </inputs>
     <outputs>
-        <data format_source="input" name="out_file1" metadata_source="input"/>
+        <data name="out_file1" format_source="input" metadata_source="input"/>
     </outputs>
     <tests>
         <test>
-            <param name="cond" value="c3-c2"/>
-            <param name="input" value="1.bed"/>
-            <param name="round" value="false"/>
+            <param name="cond" value="float(c3-c2)"/>
+            <param name="input" value="1.bed" ftype="bed" />
             <output name="out_file1" file="column_maker_out1.interval"/>
         </test>
         <test>
-            <param name="cond" value="c4*1"/>
-            <param name="input" value="1.interval"/>
-            <param name="round" value="false"/>
+            <param name="cond" value="c4*1."/>
+            <param name="input" value="1.interval" ftype="interval" />
             <output name="out_file1" file="column_maker_out2.interval"/>
         </test>
         <test>
-            <param name="cond" value="c4*1"/>
-            <param name="input" value="1.header.tsv"/>
-            <param name="round" value="false"/>
-            <conditional name="header_lines_conditional">
-                <param name="header_lines_select" value="yes" />
-                <param name="header_new_column_name" value="value1_again" />
-            </conditional>
+            <param name="cond" value="c4*1."/>
+            <param name="input" value="1.header.tsv" ftype="tabular" />
+            <param name="header_lines_select" value="yes" />
+            <param name="new_column_name" value="value1_again" />
             <output name="out_file1" file="column_maker_out2.header.tsv"/>
         </test>
         <test>
-            <param name="cond" value="c4*1"/>
+            <param name="cond" value="round(c4*1)"/>
             <param name="input" value="1.interval"/>
-            <param name="round" value="true"/>
             <output name="out_file1" file="column_maker_out3.interval"/>
         </test>
         <test>
              <!-- test that single column input works -->
              <param name="cond" value="c1/10"/>
-             <param name="input" value="1.tab" ftype="tabular"/>
-             <param name="round" value="no"/>
+             <param name="input" value="1.tab" ftype="tabular" />
              <output name="out_file1" file="column_maker_out4.tab"/>
          </test>
          <test>
             <param name="cond" value="float(.0000000000001)"/>
             <param name="input" value="1.bed"/>
-            <param name="round" value="false"/>
             <output name="out_file1">
                 <assert_contents>
                     <has_text text="CCDS10397" />
@@ -107,8 +165,7 @@
         </test>
         <test>
             <param name="cond" value="float(.0000000000001)"/>
-            <param name="input" value="1.bed"/>
-            <param name="round" value="false"/>
+            <param name="input" value="1.bed" ftype="bed" />
             <param name="avoid_scientific_notation" value="true"/>
             <output name="out_file1">
                 <assert_contents>
@@ -118,9 +175,124 @@
                 </assert_contents>
             </output>
         </test>
+        <test>
+            <param name="input" value="1.tab" ftype="tabular" />
+            <repeat name="expressions">
+                <param name="cond" value="c1/10" />
+                <conditional name="add_column">
+                    <param name="mode" value="R" />
+                    <param name="pos" value="1" />
+                </conditional>
+            </repeat>
+            <repeat name="expressions">
+                <param name="cond" value="round(c1*10)" />
+                <conditional name="add_column">
+                    <param name="mode" value="I" />
+                    <param name="pos" value="1" />
+                </conditional>
+            </repeat>
+            <output name="out_file1" file="column_maker_out4.tab" />
+        </test>
+        <!-- Test list column type in input -->
+        <test>
+            <param name="input" value="bed12.bed" ftype="bed12" />
+            <!-- get largest blocksize from column 11 of bed12 and use it as
+            new score value -->
+            <param name="cond" value="max(map(int, c11))" />
+            <conditional name="add_column">
+                <param name="mode" value="R" />
+                <param name="pos" value="5" />
+            </conditional>
+            <output name="out_file1" file="bed12_modified.bed" />
+        </test>
+        <!-- Test error handling example from help section -->
+        <test>
+            <param name="input" value="short_line_test.tab" ftype="tabular" />
+            <param name="cond" value="c6" />
+            <conditional name="add_column">
+                <param name="mode" value="R" />
+                <param name="pos" value="6" />
+            </conditional>
+            <param name="fail_on_non_existent_columns" value="false" />
+            <param name="action" value="--non-computable-default" />
+            <param name="default_value" value="." />
+            <output name="out_file1" file="short_line_test_out.tab" />
+        </test>
+        <!-- Test athletes BMI calculation in presence of NA values as in
+        https://training.galaxyproject.org/training-material/topics/introduction/tutorials/data-manipulation-olympics/tutorial.html#exercises-4
+        -->
+        <test>
+            <param name="input" value="olympics.tsv" ftype="tabular" />
+            <param name="header_lines_select" value="yes" />
+            <param name="new_column_name" value="BMI" />
+            <param name="cond" value="int(c8) / (int(c7) * int(c7)) * 10000" />
+            <param name="auto_col_types" value="false" />
+            <param name="action" value="--non-computable-default" />
+            <param name="default_value" value="NA" />
+            <output name="out_file1" file="olympics_bmi_out.tab" />
+        </test>
+        <!-- Test operation used by iwc SARS-CoV-2 consensus building WF that
+        turns a 3-column CHROM POS REF tabular dataset into a 3-column BED
+        dataset. -->
+        <test>
+            <param name="input" value="chrom_pos_ref.tab" ftype="tabular" />
+            <repeat name="expressions">
+                <param name="cond" value="int(c2) - (len(c3) == 1)" />
+                <conditional name="add_column">
+                    <param name="mode" value="R" />
+                    <param name="pos" value="2" />
+                </conditional>
+            </repeat>
+            <repeat name="expressions">
+                <param name="cond" value="int(c2) + ((len(c3) - 1) or 1)" />
+                <conditional name="add_column">
+                    <param name="mode" value="R" />
+                    <param name="pos" value="3" />
+                </conditional>
+            </repeat>
+            <output name="out_file1" file="bed_from_chrom_pos_ref.bed" />
+        </test>
+        <!-- Test failure on expression syntax errors -->
+        <test expect_failure="true">
+            <param name="cond" value="c3- = c2"/>
+            <param name="input" value="1.bed" ftype="bed" />
+            <assert_stderr>
+                <has_text text="syntax error during parsing." />
+            </assert_stderr>
+        </test>
+        <!-- Test failure on expression NameErrors -->
+        <test expect_failure="true">
+            <param name="cond" value="floatfloat(c3-c2)"/>
+            <param name="input" value="1.bed" ftype="bed" />
+            <assert_stderr>
+                <has_text text="name 'floatfloat' is not defined" />
+            </assert_stderr>
+        </test>
+        <!-- Test failure on non-existent column ref -->
+        <test expect_failure="true">
+            <param name="cond" value="c7 - c2"/>
+            <param name="input" value="1.bed" ftype="bed" />
+            <assert_stderr>
+                <has_text text="name 'c7' is not defined" />
+            </assert_stderr>
+        </test>
+        <!-- Test failure on non-computable expression -->
+        <test expect_failure="true">
+            <param name="cond" value="c3 / 0"/>
+            <param name="input" value="1.bed" ftype="bed" />
+            <assert_stderr>
+                <has_text text="division by zero" />
+            </assert_stderr>
+        </test>
+        <!-- Test keep-non-computable prevents failure -->
+        <test>
+            <param name="cond" value="c3 / 0"/>
+            <param name="input" value="1.bed" ftype="bed" />
+            <param name="action" value="--keep-non-computable" />
+            <output name="out_file1" file="1.bed" />
+        </test>
     </tests>
     <help><![CDATA[
-
  .. class:: infomark
 
 **TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert*
@@ -129,52 +301,138 @@
 
 **What it does**
 
-This tool computes an expression for every row of a dataset and appends the result as a new column (field).
+This tool computes an expression on every row of a dataset and appends or inserts the result as a new column (field), or uses it to replace the values of an existing column.
+
+Several expressions can be specified and will be applied sequentially to each row.
+
+**Expression rules**
 
 - Columns are referenced with **c** and a **number**. For example, **c1** refers to the first column of a tab-delimited file
 
-- **c3-c2** will add a length column to the dataset if **c2** and **c3** are start and end position
+- The following built-in Python and math functions are available for use in expressions::
+
+    abs | all | any | ascii | bin | bool | chr | ceil | complex | divmod
+
+    exp | float | floor | format | hex | int | len | list | log | log10
+
+    map | max | min | oct | ord | pow | range | reversed
+
+    round | set | sorted | sqrt | str | sum | type
+
+- In addition, the numpy function ``format_float_positional`` is available to
+  control the formatting of floating point numbers.
+
+- Expressions can be chained, and the tool will keep track of newly added
+  columns while working through the chain. This means you can reference a column
+  that was created as the result of a previous expression in later ones.
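+
+  For instance (an illustrative sketch): on a standard six-column BED-like
+  input, a first expression "c3 - c2" computed in Append mode becomes column
+  **c7**, and a later expression such as "c7 >= 100" can then refer to it.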
 
 -----
 
-**Example**
+**Simple examples**
 
 If this is your input::
 
    chr1  151077881  151077918  2  200  -
    chr1  151081985  151082078  3  500  +
 
-computing "c4*c5" will produce::
-
-   chr1  151077881  151077918  2  200  -   400.0
-   chr1  151081985  151082078  3  500  +  1500.0
-
-if, at the same time, "Round result?" is set to **YES** results will look like this::
+computing "c4 * c5" will produce::
 
    chr1  151077881  151077918  2  200  -   400
    chr1  151081985  151082078  3  500  +  1500
 
-You can also use this tool to evaluate expressions. For example, computing "c3>=c2" for Input will result in the following::
+You can also use this tool to evaluate expressions.
+For example, computing "c3 >= c2" for the input above will result in the following::
+
+   chr1  151077881  151077918  2  200  -  True
+   chr1  151081985  151082078  3  500  +  True
+
+Similarly, computing "type(c2) == type(c3)" will return::
 
    chr1  151077881  151077918  2  200  -  True
    chr1  151081985  151082078  3  500  +  True
 
-or computing "type(c2)==type('') for Input will return::
+-----
+
+**Error handling**
+
+The tool will always fail on syntax errors and other unrecoverable parsing
+errors in any of your expressions. For other problems, however, it offers
+control over how they should be handled:
 
-   chr1  151077881  151077918  2  200  -  False
-   chr1  151081985  151082078  3  500  +  False
+1. The default for "Autodetect column types" is "Yes", which means the tool
+   will evaluate each column value as the type that Galaxy assumes for the
+   column. This default behavior will allow you to write simpler expressions.
+   The arithmetic expression "c4 * c5" from the first simple example,
+   for instance, works only because Galaxy realizes that c4 and c5 are integer
+   columns. Occasionally, this autodetection can cause issues. A common
+   such situation is missing values in columns that Galaxy thinks are of
+   numeric type. If you're getting errors like "Failed to convert some of the
+   columns in line #X ...", a solution might be to turn off column type
+   autodetection. The price you will have to pay for doing so is that now you
+   will have to handle type conversions yourself. In the first example you would
+   now have to use the expression: "int(c4) * int(c5)".
 
+2. By default, if any expression references columns that do not exist before
+   that expression gets computed, the tool will fail, but you can uncheck the
+   "Fail on references to non-existent columns" option. If you do so, the result
+   will depend on your choice for "If an expression cannot be computed for a row"
+   (see 3.)
 
-The following built-in functions are available::
+3. The default for rows for which an expression fails to compute is, again, to
+   fail the tool run, but you can also choose to:
+
+   - skip the row on output
+
+     This is a simple way to only keep lines conforming to an expected standard.
+     It is also easy to mask problems with your expressions with this option, so
+     take a look at the results and try to understand what gets skipped and for
+     what reasons (the stdout of the tool will contain information about both).
+
+   - keep the row unchanged
 
-  abs | all | any | bin | bool | chr | ceil | cmp | complex
+     This can be a good solution if your input contains special separator lines
+     that don't follow the general tabular format of other lines and you would
+     like to keep those lines.
+
+   - produce an empty column value for the row
 
-  divmod | exp | float | log | log10 | floor | hex | int | len | long
+     This will use the empty string as a substitute for non-computable items.
+     Different from the "keep the row unchanged" option, the problematic line will
+     have a column added or changed. This option is a good choice for inputs
+     in which all rows have the same tabular layout and you want to make sure
+     that the same is true for the output, i.e. that all output lines still have
+     the same number of columns.
+
+   - fill in a replacement value
+
+     This option is very similar to the previous one, but lets you control the
+     replacement value.
+
+**Example**
+
+In the following input::
 
-  max | min | oct | ord | pow | range | reversed
+   chr1  151077881  151077918  2  200  -
+   chr1  151081985  151082078  3  500  +
+   chr1  151090031  151090938  4  700
+
+the last line does not have a strand column. This violates the bed file format
+specification, which says that unknown strand is to be encoded as ``.`` in the
+strand column.
+
+You can fix the file with the following tool run:
 
-  round | sorted | sqrt | str | sum | type | unichr | unicode |
+**Add expression**: `c6`
+
+**Mode of the operation**: `Replace`
+
+**Use new column to replace column number**: `6`
 
+**Fail on references to non-existent columns**: `No`
+
+**If an expression cannot be computed for a row**: `Fill in a replacement value`
+
+**Replacement value**: `.`
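+
+With these settings, the two complete rows are written unchanged (their column
+6 is simply replaced with its own value), while the short third row receives
+the replacement value::
+
+   chr1  151077881  151077918  2  200  -
+   chr1  151081985  151082078  3  500  +
+   chr1  151090031  151090938  4  700  .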
     ]]></help>
     <citations />
 </tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/bed12.bed	Thu Jul 28 15:27:54 2022 +0000
@@ -0,0 +1,3 @@
+chr1	14756	15038	JUNC00000001	294	-	14756	15038	255,0,0	2	73,69	0,213
+chr1	14969	15836	JUNC00000002	144	-	14969	15836	255,0,0	2	69,41	0,826
+chr1	15905	16677	JUNC00000003	12	-	15905	16677	255,0,0	2	42,71	0,701
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/bed12_modified.bed	Thu Jul 28 15:27:54 2022 +0000
@@ -0,0 +1,3 @@
+chr1	14756	15038	JUNC00000001	73	-	14756	15038	255,0,0	2	73,69	0,213
+chr1	14969	15836	JUNC00000002	69	-	14969	15836	255,0,0	2	69,41	0,826
+chr1	15905	16677	JUNC00000003	71	-	15905	16677	255,0,0	2	42,71	0,701
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/bed_from_chrom_pos_ref.bed	Thu Jul 28 15:27:54 2022 +0000
@@ -0,0 +1,6 @@
+NC_045512.2	28361	28370
+NC_045512.2	28880	28881
+NC_045512.2	28881	28882
+NC_045512.2	28882	28883
+NC_045512.2	29509	29510
+NC_045512.2	29733	29759
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/chrom_pos_ref.tab	Thu Jul 28 15:27:54 2022 +0000
@@ -0,0 +1,6 @@
+NC_045512.2	28361	GGAGAACGCA
+NC_045512.2	28881	G
+NC_045512.2	28882	G
+NC_045512.2	28883	G
+NC_045512.2	29510	A
+NC_045512.2	29733	CGAGGCCACGCGGAGTACGATCGAGTG
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/olympics.tsv	Thu Jul 28 15:27:54 2022 +0000
@@ -0,0 +1,7 @@
+athlete_id	name	sex	birth_year	birth_day	birth_place	height	weight	team	noc	games	year	season	city	sport	event	medal
+1	Jean-François Blanchy	M	1886	12 December 	Bordeaux, Gironde (FRA)	NA	NA	France	FRA	1912 Summer Olympics	1912	Summer	Stockholm	Tennis	Doubles, Men	NA
+1	Jean-François Blanchy	M	1886	12 December 	Bordeaux, Gironde (FRA)	NA	NA	France	FRA	1912 Summer Olympics	1912	Summer	Stockholm	Tennis	Singles, Men	NA
+1	Jean-François Blanchy	M	1886	12 December 	Bordeaux, Gironde (FRA)	NA	NA	France	FRA	1920 Summer Olympics	1920	Summer	Antwerpen	Tennis	Doubles, Men	NA
+1	Jean-François Blanchy	M	1886	12 December 	Bordeaux, Gironde (FRA)	NA	NA	France	FRA	1920 Summer Olympics	1920	Summer	Antwerpen	Tennis	Doubles, Mixed	NA
+1	Jean-François Blanchy	M	1886	12 December 	Bordeaux, Gironde (FRA)	NA	NA	France	FRA	1920 Summer Olympics	1920	Summer	Antwerpen	Tennis	Singles, Men	NA
+2	Arnaud Boetsch	M	1969	1 April 	Meulan, Yvelines (FRA)	183	76	France	FRA	1996 Summer Olympics	1996	Summer	Atlanta	Tennis	Doubles, Men	NA
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/olympics_bmi_out.tab	Thu Jul 28 15:27:54 2022 +0000
@@ -0,0 +1,7 @@
+athlete_id	name	sex	birth_year	birth_day	birth_place	height	weight	team	noc	games	year	season	city	sport	event	medal	BMI
+1	Jean-François Blanchy	M	1886	12 December 	Bordeaux, Gironde (FRA)	NA	NA	France	FRA	1912 Summer Olympics	1912	Summer	Stockholm	Tennis	Doubles, Men	NA	NA
+1	Jean-François Blanchy	M	1886	12 December 	Bordeaux, Gironde (FRA)	NA	NA	France	FRA	1912 Summer Olympics	1912	Summer	Stockholm	Tennis	Singles, Men	NA	NA
+1	Jean-François Blanchy	M	1886	12 December 	Bordeaux, Gironde (FRA)	NA	NA	France	FRA	1920 Summer Olympics	1920	Summer	Antwerpen	Tennis	Doubles, Men	NA	NA
+1	Jean-François Blanchy	M	1886	12 December 	Bordeaux, Gironde (FRA)	NA	NA	France	FRA	1920 Summer Olympics	1920	Summer	Antwerpen	Tennis	Doubles, Mixed	NA	NA
+1	Jean-François Blanchy	M	1886	12 December 	Bordeaux, Gironde (FRA)	NA	NA	France	FRA	1920 Summer Olympics	1920	Summer	Antwerpen	Tennis	Singles, Men	NA	NA
+2	Arnaud Boetsch	M	1969	1 April 	Meulan, Yvelines (FRA)	183	76	France	FRA	1996 Summer Olympics	1996	Summer	Atlanta	Tennis	Doubles, Men	NA	22.694018931589476
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/short_line_test.tab	Thu Jul 28 15:27:54 2022 +0000
@@ -0,0 +1,3 @@
+chr1	151077881	151077918	2	200	-
+chr1	151081985	151082078	3	500	+
+chr1	151090031	151090938	4	700
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/short_line_test_out.tab	Thu Jul 28 15:27:54 2022 +0000
@@ -0,0 +1,3 @@
+chr1	151077881	151077918	2	200	-
+chr1	151081985	151082078	3	500	+
+chr1	151090031	151090938	4	700	.