Mercurial > repos > devteam > column_maker

--- a/column_maker.py	Thu Oct 25 17:31:05 2018 -0400
+++ b/column_maker.py	Thu Jun 04 09:03:25 2020 +0000
@@ -1,37 +1,36 @@
 #!/usr/bin/env python
-# This tool takes a tab-delimited textfile as input and creates another column in the file which is the result of
-# a computation performed on every row in the original file.  The tool will skip over invalid lines within the file,
-# informing the user about the number of lines skipped.
+"""
+This tool takes a tab-delimited text file as input and creates another column in
+the file which is the result of a computation performed on every row in the
+original file. The tool will skip over invalid lines within the file,
+informing the user about the number of lines skipped.
+"""
+from __future__ import print_function

-import sys, re
-# These functions may be used in compute expression:
-from math import log,exp,sqrt,ceil,floor
-
+import re
+import sys

-assert sys.version_info[:2] >= ( 2, 4 )
-
-def stop_err( msg ):
-    sys.stderr.write( msg )
-    sys.exit()
+assert sys.version_info[:2] >= (2, 4)

 inp_file = sys.argv[1]
 out_file = sys.argv[2]
 expr = sys.argv[3]
 round_result = sys.argv[4]
 try:
-    in_columns = int( sys.argv[5] )
-except:
-    stop_err( "Missing or invalid 'columns' metadata value, click the pencil icon in the history item and select the Auto-detect option to correct it.  This tool can only be used with tab-delimited data." )
+    in_columns = int(sys.argv[5])
+except Exception:
+    exit("Missing or invalid 'columns' metadata value, click the pencil icon in the history item and select the Auto-detect option to correct it.  This tool can only be used with tab-delimited data.")
 if in_columns < 2:
     # To be considered tabular, data must fulfill requirements of the sniff.is_column_based() method.
-    stop_err( "Missing or invalid 'columns' metadata value, click the pencil icon in the history item and select the Auto-detect option to correct it.  This tool can only be used with tab-delimited data." )
+    exit("Missing or invalid 'columns' metadata value, click the pencil icon in the history item and select the Auto-detect option to correct it.  This tool can only be used with tab-delimited data.")
 try:
-    in_column_types = sys.argv[6].split( ',' )
-except:
-    stop_err( "Missing or invalid 'column_types' metadata value, click the pencil icon in the history item and select the Auto-detect option to correct it.  This tool can only be used with tab-delimited data." )
-if len( in_column_types ) != in_columns:
-    stop_err( "The 'columns' metadata setting does not conform to the 'column_types' metadata setting, click the pencil icon in the history item and select the Auto-detect option to correct it.  This tool can only be used with tab-delimited data." )
-
+    in_column_types = sys.argv[6].split(',')
+except Exception:
+    exit("Missing or invalid 'column_types' metadata value, click the pencil icon in the history item and select the Auto-detect option to correct it.  This tool can only be used with tab-delimited data.")
+if len(in_column_types) != in_columns:
+    exit("The 'columns' metadata setting does not conform to the 'column_types' metadata setting, click the pencil icon in the history item and select the Auto-detect option to correct it.  This tool can only be used with tab-delimited data.")
+avoid_scientific_notation = sys.argv[7]
+
 # Unescape if input has been escaped
 mapped_str = {
     '__lt__': '<',
@@ -44,43 +43,56 @@
     '__dq__': '"',
 }
 for key, value in mapped_str.items():
-    expr = expr.replace( key, value )
+    expr = expr.replace(key, value)

 operators = 'is|not|or|and'
 builtin_and_math_functions = 'abs|all|any|bin|chr|cmp|complex|divmod|float|bool|hex|int|len|long|max|min|oct|ord|pow|range|reversed|round|sorted|str|sum|type|unichr|unicode|log|exp|sqrt|ceil|floor'
-string_and_list_methods = [ name for name in dir('') + dir([]) if not name.startswith('_') ]
-whitelist = "^([c0-9\+\-\*\/\(\)\.\'\"><=,:! ]|%s|%s|%s)*$" % (operators, builtin_and_math_functions, '|'.join(string_and_list_methods))
+string_and_list_methods = [name for name in dir('') + dir([]) if not name.startswith('_')]
+whitelist = r"^([c0-9\+\-\*\/\(\)\.\'\"><=,:! ]|%s|%s|%s)*$" % (operators, builtin_and_math_functions, '|'.join(string_and_list_methods))
 if not re.compile(whitelist).match(expr):
-    stop_err("Invalid expression")
+    exit("Invalid expression")
+if avoid_scientific_notation == "yes":
+    expr = "format_float_positional(%s)" % expr

 # Prepare the column variable names and wrappers for column data types
 cols, type_casts = [], []
-for col in range( 1, in_columns + 1 ):
+for col in range(1, in_columns + 1):
     col_name = "c%d" % col
-    cols.append( col_name )
-    col_type = in_column_types[ col - 1 ].strip()
+    cols.append(col_name)
+    col_type = in_column_types[col - 1].strip()
     if round_result == 'no' and col_type == 'int':
         col_type = 'float'
-    type_cast = "%s(%s)" % ( col_type, col_name )
-    type_casts.append( type_cast )
-
-col_str = ', '.join( cols )    # 'c1, c2, c3, c4'
-type_cast_str = ', '.join( type_casts )  # 'str(c1), int(c2), int(c3), str(c4)'
-assign = "%s = line.split( '\\t' )" % col_str
-wrap = "%s = %s" % ( col_str, type_cast_str )
+    type_cast = "%s(%s)" % (col_type, col_name)
+    type_casts.append(type_cast)
+
+col_str = ', '.join(cols)    # 'c1, c2, c3, c4'
+type_cast_str = ', '.join(type_casts)  # 'str(c1), int(c2), int(c3), str(c4)'
+assign = "%s = line.split('\\t')" % col_str
+wrap = "%s = %s" % (col_str, type_cast_str)
 skipped_lines = 0
 first_invalid_line = 0
 invalid_line = None
 lines_kept = 0
 total_lines = 0
-out = open( out_file, 'wt' )
+out = open(out_file, 'wt')

 # Read input file, skipping invalid lines, and perform computation that will result in a new column
 code = '''
-for i, line in enumerate( file( inp_file ) ):
+# Imported here, not at module level: only the user's expression uses these names, so flake8 would flag them as unused.
+from math import (
+    ceil,
+    exp,
+    floor,
+    log,
+    sqrt
+)
+from numpy import format_float_positional
+
+fh = open(inp_file)
+for i, line in enumerate(fh):
     total_lines += 1
-    line = line.rstrip( '\\r\\n' )
-    if not line or line.startswith( '#' ):
+    line = line.rstrip('\\r\\n')
+    if not line or line.startswith('#'):
         skipped_lines += 1
         if not invalid_line:
             first_invalid_line = i + 1
@@ -91,35 +103,38 @@
         %s
         new_val = %s
         if round_result == "yes":
-            new_val = int( round( new_val ) )
-        new_line = line + '\\t' + str( new_val )
-        print >> out, new_line
+            new_val = int(round(new_val))
+        new_line = line + '\\t' + str(new_val) + "\\n"
+        out.write(new_line)
         lines_kept += 1
-    except:
+    except Exception:
         skipped_lines += 1
         if not invalid_line:
             first_invalid_line = i + 1
             invalid_line = line
-''' % ( assign, wrap, expr )
+fh.close()
+''' % (assign, wrap, expr)

 valid_expr = True
 try:
-    exec code
-except Exception, e:
+    exec(code)
+except Exception as e:
     out.close()
-    if str( e ).startswith( 'invalid syntax' ):
+    if str(e).startswith('invalid syntax'):
         valid_expr = False
-        stop_err( 'Expression "%s" likely invalid. See tool tips, syntax and examples.' % expr )
+        exit('Expression "%s" likely invalid. See tool tips, syntax and examples.' % expr)
     else:
-        stop_err( str( e ) )
+        exit(str(e))

 if valid_expr:
     out.close()
     valid_lines = total_lines - skipped_lines
-    print 'Creating column %d with expression %s' % ( in_columns + 1, expr )
+    print('Creating column %d with expression %s' % (in_columns + 1, expr))
     if valid_lines > 0:
-        print 'kept %4.2f%% of %d lines.' % ( 100.0*lines_kept/valid_lines, total_lines )
+        print('kept %4.2f%% of %d lines.' % (100.0 * lines_kept / valid_lines,
+                                             total_lines))
     else:
-        print 'Possible invalid expression "%s" or non-existent column referenced. See tool tips, syntax and examples.' % expr
+        print('Possible invalid expression "%s" or non-existent column referenced. See tool tips, syntax and examples.' % expr)
     if skipped_lines > 0:
-        print 'Skipped %d invalid lines starting at line #%d: "%s"' % ( skipped_lines, first_invalid_line, invalid_line )
+        print('Skipped %d invalid lines starting at line #%d: "%s"' %
+              (skipped_lines, first_invalid_line, invalid_line))
--- a/column_maker.xml	Thu Oct 25 17:31:05 2018 -0400
+++ b/column_maker.xml	Thu Jun 04 09:03:25 2020 +0000
@@ -1,8 +1,9 @@
-<tool id="Add_a_column1" name="Compute" version="1.2.0">
+<tool id="Add_a_column1" name="Compute" version="1.3.0">
     <description>an expression on every row</description>
     <requirements>
         <requirement type="package" version="2.7.13">python</requirement>
         <requirement type="package" version="4.4">sed</requirement>
+        <requirement type="package" version="1.14">numpy</requirement>
     </requirements>
     <command detect_errors="aggressive"><![CDATA[
         #if $header_lines_conditional.header_lines_select == "yes":
@@ -18,7 +19,8 @@
             "$cond"
             $round
             ${input.metadata.columns}
-            "${input.metadata.column_types}" &&
+            "${input.metadata.column_types}"
+            $avoid_scientific_notation &&
         cat header column_maker_output > '$out_file1'
     ]]></command>
     <inputs>
@@ -39,6 +41,10 @@
                 <param name="header_new_column_name" type="text" value="New Column" label="The new column name" />
             </when>
         </conditional>
+        <param name="avoid_scientific_notation" type="select" label="Avoid scientific notation" help="If yes, use fully expanded decimal representation when writing new columns (use only if expression produces decimal numbers).">
+            <option value="no">no</option>
+            <option value="yes">yes</option>
+        </param>
     </inputs>
     <outputs>
         <data format_source="input" name="out_file1" metadata_source="input"/>
@@ -72,6 +78,30 @@
             <param name="round" value="yes"/>
             <output name="out_file1" file="column_maker_out3.interval"/>
         </test>
+        <test>
+            <param name="cond" value="float(.0000000000001)"/>
+            <param name="input" value="1.bed"/>
+            <param name="round" value="no"/>
+            <output name="out_file1">
+                <assert_contents>
+                    <has_text text="CCDS10397" />
+                    <has_text text="1e-13" />
+                </assert_contents>
+            </output>
+        </test>
+        <test>
+            <param name="cond" value="float(.0000000000001)"/>
+            <param name="input" value="1.bed"/>
+            <param name="round" value="no"/>
+            <param name="avoid_scientific_notation" value="yes"/>
+            <output name="out_file1">
+                <assert_contents>
+                    <has_text text="CCDS10397" />
+                    <has_text text=".0000000000001" />
+                    <not_has_text text="1e-13" />
+                </assert_contents>
+            </output>
+        </test>
     </tests>
     <help>