gregs_filter: filtering.py comparison

comparison filtering.py @ 3:48edb538c102 draft default tip

Uploaded 3.3.0

author	greg
date	Wed, 03 Oct 2012 13:22:53 -0400
parents	f28d5018f9cb
children

comparison

Legend: equal, deleted, inserted, replaced

-:24884dd48421
+:48edb538c102
 in_fname = sys.argv[1]
 out_fname = sys.argv[2]
 cond_text = sys.argv[3]
 try:
 in_columns = int( sys.argv[4] )
-assert sys.argv[5]  #check to see that the column types varaible isn't null
+assert sys.argv[5]  #check to see that the column types variable isn't null
 in_column_types = sys.argv[5].split( ',' )
 except:
 stop_err( "Data does not appear to be tabular.  This tool can only be used with tab-delimited data." )
 # Unescape if input has been escaped
 check = int( operand )
 except:
 if operand in secured:
 stop_err( "Illegal value '%s' in condition '%s'" % ( operand, cond_text ) )
-# Prepare the column variable names and wrappers for column data types
+# Work out which columns are used in the filter (save using 1 based counting)
+used_cols = sorted(set(int(match.group()[1:]) \
+for match in re.finditer('c(\d)+', cond_text)))
+largest_col_index = max(used_cols)
+# Prepare the column variable names and wrappers for column data types. Only
+# cast columns used in the filter.
 cols, type_casts = [], []
-for col in range( 1, in_columns + 1 ):
+for col in range( 1, largest_col_index + 1 ):
 col_name = "c%d" % col
 cols.append( col_name )
 col_type = in_column_types[ col - 1 ]
-type_cast = "%s(%s)" % ( col_type, col_name )
+if col in used_cols:
+type_cast = "%s(%s)" % ( col_type, col_name )
+else:
+#If we don't use this column, don't cast it.
+#Otherwise we get errors on things like optional integer columns.
+type_cast = col_name
 type_casts.append( type_cast )
 col_str = ', '.join( cols )    # 'c1, c2, c3, c4'
 type_cast_str = ', '.join( type_casts )  # 'str(c1), int(c2), int(c3), str(c4)'
-assign = "%s = line.split( '\\t' )" % col_str
+assign = "%s, = line.split( '\\t' )[:%i]" % ( col_str, largest_col_index )
 wrap = "%s = %s" % ( col_str, type_cast_str )
 skipped_lines = 0
+invalid_lines = 0
 first_invalid_line = 0
 invalid_line = None
 lines_kept = 0
 total_lines = 0
 out = open( out_fname, 'wt' )
 for i, line in enumerate( file( in_fname ) ):
 total_lines += 1
 line = line.rstrip( '\\r\\n' )
 if not line or line.startswith( '#' ):
 skipped_lines += 1
-if not invalid_line:
-first_invalid_line = i + 1
-invalid_line = line
 continue
 try:
 %s
 %s
 if %s:
 lines_kept += 1
 print >> out, line
 except:
-skipped_lines += 1
+invalid_lines += 1
 if not invalid_line:
 first_invalid_line = i + 1
 invalid_line = line
 ''' % ( assign, wrap, cond_text )
 if valid_filter:
 out.close()
 valid_lines = total_lines - skipped_lines
 print 'Filtering with %s, ' % cond_text
 if valid_lines > 0:
-print 'kept %4.2f%% of %d lines.' % ( 100.0*lines_kept/valid_lines, total_lines )
+print 'kept %4.2f%% of %d valid lines (%d total lines).' % ( 100.0*lines_kept/valid_lines, valid_lines, total_lines )
 else:
 print 'Possible invalid filter condition "%s" or non-existent column referenced. See tool tips, syntax and examples.' % cond_text
-if skipped_lines > 0:
+if invalid_lines:
-print 'Skipped %d invalid lines starting at line #%d: "%s"' % ( skipped_lines, first_invalid_line, invalid_line )
+print 'Skipped %d invalid line(s) starting at line #%d: "%s"' % ( invalid_lines, first_invalid_line, invalid_line )
+if skipped_lines:
+print 'Skipped %i comment (starting with #) or blank line(s)' % skipped_lines

Mercurial > repos > greg > gregs_filter

comparison filtering.py @ 3:48edb538c102 draft default tip