Mercurial > repos > greg > gregs_filter
changeset 3:48edb538c102 draft default tip
Uploaded 3.3.0
author | greg |
---|---|
date | Wed, 03 Oct 2012 13:22:53 -0400 |
parents | 24884dd48421 |
children | |
files | READ_ME filtering.py filtering.xml |
diffstat | 3 files changed, 36 insertions(+), 15 deletions(-) [+] |
line wrap: on
line diff
--- a/READ_ME Tue Aug 07 09:20:08 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,1 +0,0 @@
-Hello...
--- a/filtering.py Tue Aug 07 09:20:08 2012 -0400 +++ b/filtering.py Wed Oct 03 13:22:53 2012 -0400 @@ -32,7 +32,7 @@ cond_text = sys.argv[3] try: in_columns = int( sys.argv[4] ) - assert sys.argv[5] #check to see that the column types varaible isn't null + assert sys.argv[5] #check to see that the column types variable isn't null in_column_types = sys.argv[5].split( ',' ) except: stop_err( "Data does not appear to be tabular. This tool can only be used with tab-delimited data." ) @@ -61,20 +61,32 @@ if operand in secured: stop_err( "Illegal value '%s' in condition '%s'" % ( operand, cond_text ) ) -# Prepare the column variable names and wrappers for column data types +# Work out which columns are used in the filter (save using 1 based counting) +used_cols = sorted(set(int(match.group()[1:]) \ + for match in re.finditer('c(\d)+', cond_text))) +largest_col_index = max(used_cols) + +# Prepare the column variable names and wrappers for column data types. Only +# cast columns used in the filter. cols, type_casts = [], [] -for col in range( 1, in_columns + 1 ): +for col in range( 1, largest_col_index + 1 ): col_name = "c%d" % col cols.append( col_name ) col_type = in_column_types[ col - 1 ] - type_cast = "%s(%s)" % ( col_type, col_name ) + if col in used_cols: + type_cast = "%s(%s)" % ( col_type, col_name ) + else: + #If we don't use this column, don't cast it. + #Otherwise we get errors on things like optional integer columns. 
+ type_cast = col_name type_casts.append( type_cast ) col_str = ', '.join( cols ) # 'c1, c2, c3, c4' type_cast_str = ', '.join( type_casts ) # 'str(c1), int(c2), int(c3), str(c4)' -assign = "%s = line.split( '\\t' )" % col_str +assign = "%s, = line.split( '\\t' )[:%i]" % ( col_str, largest_col_index ) wrap = "%s = %s" % ( col_str, type_cast_str ) skipped_lines = 0 +invalid_lines = 0 first_invalid_line = 0 invalid_line = None lines_kept = 0 @@ -88,9 +100,6 @@ line = line.rstrip( '\\r\\n' ) if not line or line.startswith( '#' ): skipped_lines += 1 - if not invalid_line: - first_invalid_line = i + 1 - invalid_line = line continue try: %s @@ -99,7 +108,7 @@ lines_kept += 1 print >> out, line except: - skipped_lines += 1 + invalid_lines += 1 if not invalid_line: first_invalid_line = i + 1 invalid_line = line @@ -121,8 +130,10 @@ valid_lines = total_lines - skipped_lines print 'Filtering with %s, ' % cond_text if valid_lines > 0: - print 'kept %4.2f%% of %d lines.' % ( 100.0*lines_kept/valid_lines, total_lines ) + print 'kept %4.2f%% of %d valid lines (%d total lines).' % ( 100.0*lines_kept/valid_lines, valid_lines, total_lines ) else: print 'Possible invalid filter condition "%s" or non-existent column referenced. See tool tips, syntax and examples.' % cond_text - if skipped_lines > 0: - print 'Skipped %d invalid lines starting at line #%d: "%s"' % ( skipped_lines, first_invalid_line, invalid_line ) + if invalid_lines: + print 'Skipped %d invalid line(s) starting at line #%d: "%s"' % ( invalid_lines, first_invalid_line, invalid_line ) + if skipped_lines: + print 'Skipped %i comment (starting with #) or blank line(s)' % skipped_lines
--- a/filtering.xml Tue Aug 07 09:20:08 2012 -0400
+++ b/filtering.xml Wed Oct 03 13:22:53 2012 -0400
@@ -1,10 +1,10 @@
-<tool id="Filter1" name="Filter (new behavior)" version="1.0.2">
+<tool id="Filter1" name="Filter" version="3.3.0">
   <description>data on any column using simple expressions</description>
   <command interpreter="python">
     filtering.py $input $out_file1 "$cond" ${input.metadata.columns} "${input.metadata.column_types}"
   </command>
   <inputs>
-    <param format="tabular" name="input" type="data" label="Filter" help="Query missing? See TIP below."/>
+    <param format="tabular" name="input" type="data" label="Filter" help="Dataset missing? See TIP below."/>
     <param name="cond" size="40" type="text" value="c1=='chr22'" label="With following condition" help="Double equal signs, ==, must be used as shown above. To filter for an arbitrary string, use the Select tool.">
       <validator type="empty_field" message="Enter a valid filtering condition, see syntax and examples below."/>
     </param>
@@ -23,6 +23,17 @@
       <param name="cond" value="c1=='chr1' and c3-c2>=2000 and c6=='+'"/>
       <output name="out_file1" file="filter1_test2.bed"/>
     </test>
+    <!-- Test filtering of file with a variable number of columns. -->
+    <test>
+      <param name="input" value="filter1_in3.sam"/>
+      <param name="cond" value="c3=='chr1' and c5>5"/>
+      <output name="out_file1" file="filter1_test3.sam"/>
+    </test>
+    <test>
+      <param name="input" value="filter1_inbad.bed"/>
+      <param name="cond" value="c1=='chr22'"/>
+      <output name="out_file1" file="filter1_test4.bed"/>
+    </test>
   </tests>
   <help>