Mercurial > repos > greg > gregs_filter
diff filtering.py @ 3:48edb538c102 draft default tip
Uploaded 3.3.0
author | greg |
---|---|
date | Wed, 03 Oct 2012 13:22:53 -0400 |
parents | f28d5018f9cb |
children |
line wrap: on
line diff
```diff
--- a/filtering.py Tue Aug 07 09:20:08 2012 -0400
+++ b/filtering.py Wed Oct 03 13:22:53 2012 -0400
@@ -32,7 +32,7 @@
 cond_text = sys.argv[3]
 try:
     in_columns = int( sys.argv[4] )
-    assert sys.argv[5] #check to see that the column types varaible isn't null
+    assert sys.argv[5] #check to see that the column types variable isn't null
     in_column_types = sys.argv[5].split( ',' )
 except:
     stop_err( "Data does not appear to be tabular. This tool can only be used with tab-delimited data." )
@@ -61,20 +61,32 @@
         if operand in secured:
             stop_err( "Illegal value '%s' in condition '%s'" % ( operand, cond_text ) )
 
-# Prepare the column variable names and wrappers for column data types
+# Work out which columns are used in the filter (save using 1 based counting)
+used_cols = sorted(set(int(match.group()[1:]) \
+                for match in re.finditer('c(\d)+', cond_text)))
+largest_col_index = max(used_cols)
+
+# Prepare the column variable names and wrappers for column data types. Only
+# cast columns used in the filter.
 cols, type_casts = [], []
-for col in range( 1, in_columns + 1 ):
+for col in range( 1, largest_col_index + 1 ):
     col_name = "c%d" % col
     cols.append( col_name )
     col_type = in_column_types[ col - 1 ]
-    type_cast = "%s(%s)" % ( col_type, col_name )
+    if col in used_cols:
+        type_cast = "%s(%s)" % ( col_type, col_name )
+    else:
+        #If we don't use this column, don't cast it.
+        #Otherwise we get errors on things like optional integer columns.
+        type_cast = col_name
     type_casts.append( type_cast )
 
 col_str = ', '.join( cols ) # 'c1, c2, c3, c4'
 type_cast_str = ', '.join( type_casts ) # 'str(c1), int(c2), int(c3), str(c4)'
-assign = "%s = line.split( '\\t' )" % col_str
+assign = "%s, = line.split( '\\t' )[:%i]" % ( col_str, largest_col_index )
 wrap = "%s = %s" % ( col_str, type_cast_str )
 skipped_lines = 0
+invalid_lines = 0
 first_invalid_line = 0
 invalid_line = None
 lines_kept = 0
@@ -88,9 +100,6 @@
     line = line.rstrip( '\\r\\n' )
     if not line or line.startswith( '#' ):
         skipped_lines += 1
-        if not invalid_line:
-            first_invalid_line = i + 1
-            invalid_line = line
         continue
     try:
         %s
@@ -99,7 +108,7 @@
             lines_kept += 1
             print >> out, line
     except:
-        skipped_lines += 1
+        invalid_lines += 1
         if not invalid_line:
             first_invalid_line = i + 1
             invalid_line = line
@@ -121,8 +130,10 @@
     valid_lines = total_lines - skipped_lines
     print 'Filtering with %s, ' % cond_text
     if valid_lines > 0:
-        print 'kept %4.2f%% of %d lines.' % ( 100.0*lines_kept/valid_lines, total_lines )
+        print 'kept %4.2f%% of %d valid lines (%d total lines).' % ( 100.0*lines_kept/valid_lines, valid_lines, total_lines )
     else:
         print 'Possible invalid filter condition "%s" or non-existent column referenced. See tool tips, syntax and examples.' % cond_text
-    if skipped_lines > 0:
-        print 'Skipped %d invalid lines starting at line #%d: "%s"' % ( skipped_lines, first_invalid_line, invalid_line )
+    if invalid_lines:
+        print 'Skipped %d invalid line(s) starting at line #%d: "%s"' % ( invalid_lines, first_invalid_line, invalid_line )
+    if skipped_lines:
+        print 'Skipped %i comment (starting with #) or blank line(s)' % skipped_lines
```