# HG changeset patch
# User greg
# Date 1349284973 14400
# Node ID 48edb538c102b0dffa9047a2ed4f6f75ee7a8b57
# Parent 24884dd4842150c1201325286c4cd8b04fe7f578
Uploaded 3.3.0

diff -r 24884dd48421 -r 48edb538c102 READ_ME
--- a/READ_ME	Tue Aug 07 09:20:08 2012 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,1 +0,0 @@
-Hello...
diff -r 24884dd48421 -r 48edb538c102 filtering.py
--- a/filtering.py	Tue Aug 07 09:20:08 2012 -0400
+++ b/filtering.py	Wed Oct 03 13:22:53 2012 -0400
@@ -32,7 +32,7 @@
 cond_text = sys.argv[3]
 try:
     in_columns = int( sys.argv[4] )
-    assert sys.argv[5] #check to see that the column types varaible isn't null
+    assert sys.argv[5] #check to see that the column types variable isn't null
     in_column_types = sys.argv[5].split( ',' )
 except:
     stop_err( "Data does not appear to be tabular. This tool can only be used with tab-delimited data." )
@@ -61,20 +61,32 @@
     if operand in secured:
         stop_err( "Illegal value '%s' in condition '%s'" % ( operand, cond_text ) )
 
-# Prepare the column variable names and wrappers for column data types
+# Work out which columns are used in the filter (save using 1 based counting)
+used_cols = sorted(set(int(match.group()[1:]) \
+                       for match in re.finditer('c(\d)+', cond_text)))
+largest_col_index = max(used_cols)
+
+# Prepare the column variable names and wrappers for column data types. Only
+# cast columns used in the filter.
 cols, type_casts = [], []
-for col in range( 1, in_columns + 1 ):
+for col in range( 1, largest_col_index + 1 ):
     col_name = "c%d" % col
     cols.append( col_name )
     col_type = in_column_types[ col - 1 ]
-    type_cast = "%s(%s)" % ( col_type, col_name )
+    if col in used_cols:
+        type_cast = "%s(%s)" % ( col_type, col_name )
+    else:
+        #If we don't use this column, don't cast it.
+        #Otherwise we get errors on things like optional integer columns.
+        type_cast = col_name
     type_casts.append( type_cast )
 col_str = ', '.join( cols )  # 'c1, c2, c3, c4'
 type_cast_str = ', '.join( type_casts )  # 'str(c1), int(c2), int(c3), str(c4)'
-assign = "%s = line.split( '\\t' )" % col_str
+assign = "%s, = line.split( '\\t' )[:%i]" % ( col_str, largest_col_index )
 wrap = "%s = %s" % ( col_str, type_cast_str )
 skipped_lines = 0
+invalid_lines = 0
 first_invalid_line = 0
 invalid_line = None
 lines_kept = 0
@@ -88,9 +100,6 @@
     line = line.rstrip( '\\r\\n' )
     if not line or line.startswith( '#' ):
         skipped_lines += 1
-        if not invalid_line:
-            first_invalid_line = i + 1
-            invalid_line = line
         continue
     try:
         %s
@@ -99,7 +108,7 @@
             lines_kept += 1
             print >> out, line
     except:
-        skipped_lines += 1
+        invalid_lines += 1
         if not invalid_line:
             first_invalid_line = i + 1
             invalid_line = line
@@ -121,8 +130,10 @@
     valid_lines = total_lines - skipped_lines
     print 'Filtering with %s, ' % cond_text
     if valid_lines > 0:
-        print 'kept %4.2f%% of %d lines.' % ( 100.0*lines_kept/valid_lines, total_lines )
+        print 'kept %4.2f%% of %d valid lines (%d total lines).' % ( 100.0*lines_kept/valid_lines, valid_lines, total_lines )
     else:
         print 'Possible invalid filter condition "%s" or non-existent column referenced. See tool tips, syntax and examples.' % cond_text
-    if skipped_lines > 0:
-        print 'Skipped %d invalid lines starting at line #%d: "%s"' % ( skipped_lines, first_invalid_line, invalid_line )
+    if invalid_lines:
+        print 'Skipped %d invalid line(s) starting at line #%d: "%s"' % ( invalid_lines, first_invalid_line, invalid_line )
+    if skipped_lines:
+        print 'Skipped %i comment (starting with #) or blank line(s)' % skipped_lines
diff -r 24884dd48421 -r 48edb538c102 filtering.xml
--- a/filtering.xml	Tue Aug 07 09:20:08 2012 -0400
+++ b/filtering.xml	Wed Oct 03 13:22:53 2012 -0400
@@ -1,10 +1,10 @@
-
+
 data on any column using simple expressions
 filtering.py $input $out_file1 "$cond" ${input.metadata.columns} "${input.metadata.column_types}"
-
+
@@ -23,6 +23,17 @@
+
+
+
+
+
+
+
+
+
+
+
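
The filtering.py hunks above build the assign/wrap statements differently: only the columns actually named in the filter expression are type-cast, and the line is split only up to the highest referenced column. Below is a minimal standalone sketch of that logic for reviewers. It is not part of the patch, and the cond_text and in_column_types values are made-up examples, not taken from this changeset.

import re

# Illustrative inputs (assumed for this sketch): a filter expression and the
# declared column types of a four-column tabular file.
cond_text = "c3 > 100 and c1 == 'chr1'"
in_column_types = ['str', 'int', 'int', 'str']

# Columns referenced by the expression, 1-based, as in the patched script.
used_cols = sorted(set(int(match.group()[1:])
                       for match in re.finditer(r'c(\d)+', cond_text)))
largest_col_index = max(used_cols)

cols, type_casts = [], []
for col in range(1, largest_col_index + 1):
    col_name = "c%d" % col
    cols.append(col_name)
    col_type = in_column_types[col - 1]
    if col in used_cols:
        # Cast only the columns the expression actually uses.
        type_casts.append("%s(%s)" % (col_type, col_name))
    else:
        # Leave unused columns as raw strings so an empty optional column
        # cannot raise a conversion error.
        type_casts.append(col_name)

col_str = ', '.join(cols)
type_cast_str = ', '.join(type_casts)

# The generated statements that the tool would exec for every data line.
assign = "%s, = line.split( '\\t' )[:%i]" % (col_str, largest_col_index)
wrap = "%s = %s" % (col_str, type_cast_str)

print(assign)   # c1, c2, c3, = line.split( '\t' )[:3]
print(wrap)     # c1, c2, c3 = str(c1), c2, int(c3)

Per the comments added in the patch, the point of the change is that rows with empty or optional columns no longer fail a cast for columns the filter never inspects, and such failures are now counted separately (invalid_lines) from comment or blank lines (skipped_lines) in the tool's summary output.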