diff filtering.py @ 3:48edb538c102 draft default tip

Uploaded 3.3.0
author greg
date Wed, 03 Oct 2012 13:22:53 -0400
parents f28d5018f9cb
children
line wrap: on
line diff
--- a/filtering.py	Tue Aug 07 09:20:08 2012 -0400
+++ b/filtering.py	Wed Oct 03 13:22:53 2012 -0400
@@ -32,7 +32,7 @@
 cond_text = sys.argv[3]
 try:
     in_columns = int( sys.argv[4] )
-    assert sys.argv[5]  #check to see that the column types varaible isn't null
+    assert sys.argv[5]  #check to see that the column types variable isn't null
     in_column_types = sys.argv[5].split( ',' )
 except:
     stop_err( "Data does not appear to be tabular.  This tool can only be used with tab-delimited data." )
@@ -61,20 +61,32 @@
         if operand in secured:
             stop_err( "Illegal value '%s' in condition '%s'" % ( operand, cond_text ) )
 
-# Prepare the column variable names and wrappers for column data types
+# Work out which columns are used in the filter (stored using 1-based counting)
+used_cols = sorted(set(int(match.group()[1:]) \
+                   for match in re.finditer('c(\d)+', cond_text))) 
+largest_col_index = max(used_cols)
+
+# Prepare the column variable names and wrappers for column data types. Only 
+# cast columns used in the filter.
 cols, type_casts = [], []
-for col in range( 1, in_columns + 1 ):
+for col in range( 1, largest_col_index + 1 ):
     col_name = "c%d" % col
     cols.append( col_name )
     col_type = in_column_types[ col - 1 ]
-    type_cast = "%s(%s)" % ( col_type, col_name )
+    if col in used_cols:
+        type_cast = "%s(%s)" % ( col_type, col_name )
+    else:
+        #If we don't use this column, don't cast it.
+        #Otherwise we get errors on things like optional integer columns.
+        type_cast = col_name
     type_casts.append( type_cast )
  
 col_str = ', '.join( cols )    # 'c1, c2, c3, c4'
 type_cast_str = ', '.join( type_casts )  # 'str(c1), int(c2), int(c3), str(c4)'
-assign = "%s = line.split( '\\t' )" % col_str
+assign = "%s, = line.split( '\\t' )[:%i]" % ( col_str, largest_col_index )
 wrap = "%s = %s" % ( col_str, type_cast_str )
 skipped_lines = 0
+invalid_lines = 0
 first_invalid_line = 0
 invalid_line = None
 lines_kept = 0
@@ -88,9 +100,6 @@
     line = line.rstrip( '\\r\\n' )
     if not line or line.startswith( '#' ):
         skipped_lines += 1
-        if not invalid_line:
-            first_invalid_line = i + 1
-            invalid_line = line
         continue
     try:
         %s
@@ -99,7 +108,7 @@
             lines_kept += 1
             print >> out, line
     except:
-        skipped_lines += 1
+        invalid_lines += 1
         if not invalid_line:
             first_invalid_line = i + 1
             invalid_line = line
@@ -121,8 +130,10 @@
     valid_lines = total_lines - skipped_lines
     print 'Filtering with %s, ' % cond_text
     if valid_lines > 0:
-        print 'kept %4.2f%% of %d lines.' % ( 100.0*lines_kept/valid_lines, total_lines )
+        print 'kept %4.2f%% of %d valid lines (%d total lines).' % ( 100.0*lines_kept/valid_lines, valid_lines, total_lines )
     else:
         print 'Possible invalid filter condition "%s" or non-existent column referenced. See tool tips, syntax and examples.' % cond_text
-    if skipped_lines > 0:
-        print 'Skipped %d invalid lines starting at line #%d: "%s"' % ( skipped_lines, first_invalid_line, invalid_line )
+    if invalid_lines:
+        print 'Skipped %d invalid line(s) starting at line #%d: "%s"' % ( invalid_lines, first_invalid_line, invalid_line )
+    if skipped_lines:
+        print 'Skipped %i comment (starting with #) or blank line(s)' % skipped_lines