changeset 3:48edb538c102 draft default tip

Uploaded 3.3.0
author greg
date Wed, 03 Oct 2012 13:22:53 -0400
parents 24884dd48421
children
files READ_ME filtering.py filtering.xml
diffstat 3 files changed, 36 insertions(+), 15 deletions(-) [+]
line wrap: on
line diff
--- a/READ_ME	Tue Aug 07 09:20:08 2012 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,1 +0,0 @@
-Hello...
--- a/filtering.py	Tue Aug 07 09:20:08 2012 -0400
+++ b/filtering.py	Wed Oct 03 13:22:53 2012 -0400
@@ -32,7 +32,7 @@
 cond_text = sys.argv[3]
 try:
     in_columns = int( sys.argv[4] )
-    assert sys.argv[5]  #check to see that the column types varaible isn't null
+    assert sys.argv[5]  #check to see that the column types variable isn't null
     in_column_types = sys.argv[5].split( ',' )
 except:
     stop_err( "Data does not appear to be tabular.  This tool can only be used with tab-delimited data." )
@@ -61,20 +61,32 @@
         if operand in secured:
             stop_err( "Illegal value '%s' in condition '%s'" % ( operand, cond_text ) )
 
-# Prepare the column variable names and wrappers for column data types
+# Work out which columns are used in the filter (saved using 1-based counting)
+used_cols = sorted(set(int(match.group()[1:]) \
+                   for match in re.finditer('c(\d)+', cond_text))) 
+largest_col_index = max(used_cols)
+
+# Prepare the column variable names and wrappers for column data types. Only 
+# cast columns used in the filter.
 cols, type_casts = [], []
-for col in range( 1, in_columns + 1 ):
+for col in range( 1, largest_col_index + 1 ):
     col_name = "c%d" % col
     cols.append( col_name )
     col_type = in_column_types[ col - 1 ]
-    type_cast = "%s(%s)" % ( col_type, col_name )
+    if col in used_cols:
+        type_cast = "%s(%s)" % ( col_type, col_name )
+    else:
+        #If we don't use this column, don't cast it.
+        #Otherwise we get errors on things like optional integer columns.
+        type_cast = col_name
     type_casts.append( type_cast )
  
 col_str = ', '.join( cols )    # 'c1, c2, c3, c4'
 type_cast_str = ', '.join( type_casts )  # 'str(c1), int(c2), int(c3), str(c4)'
-assign = "%s = line.split( '\\t' )" % col_str
+assign = "%s, = line.split( '\\t' )[:%i]" % ( col_str, largest_col_index )
 wrap = "%s = %s" % ( col_str, type_cast_str )
 skipped_lines = 0
+invalid_lines = 0
 first_invalid_line = 0
 invalid_line = None
 lines_kept = 0
@@ -88,9 +100,6 @@
     line = line.rstrip( '\\r\\n' )
     if not line or line.startswith( '#' ):
         skipped_lines += 1
-        if not invalid_line:
-            first_invalid_line = i + 1
-            invalid_line = line
         continue
     try:
         %s
@@ -99,7 +108,7 @@
             lines_kept += 1
             print >> out, line
     except:
-        skipped_lines += 1
+        invalid_lines += 1
         if not invalid_line:
             first_invalid_line = i + 1
             invalid_line = line
@@ -121,8 +130,10 @@
     valid_lines = total_lines - skipped_lines
     print 'Filtering with %s, ' % cond_text
     if valid_lines > 0:
-        print 'kept %4.2f%% of %d lines.' % ( 100.0*lines_kept/valid_lines, total_lines )
+        print 'kept %4.2f%% of %d valid lines (%d total lines).' % ( 100.0*lines_kept/valid_lines, valid_lines, total_lines )
     else:
         print 'Possible invalid filter condition "%s" or non-existent column referenced. See tool tips, syntax and examples.' % cond_text
-    if skipped_lines > 0:
-        print 'Skipped %d invalid lines starting at line #%d: "%s"' % ( skipped_lines, first_invalid_line, invalid_line )
+    if invalid_lines:
+        print 'Skipped %d invalid line(s) starting at line #%d: "%s"' % ( invalid_lines, first_invalid_line, invalid_line )
+    if skipped_lines:
+        print 'Skipped %i comment (starting with #) or blank line(s)' % skipped_lines
--- a/filtering.xml	Tue Aug 07 09:20:08 2012 -0400
+++ b/filtering.xml	Wed Oct 03 13:22:53 2012 -0400
@@ -1,10 +1,10 @@
-<tool id="Filter1" name="Filter (new behavior)" version="1.0.2">
+<tool id="Filter1" name="Filter" version="3.3.0">
   <description>data on any column using simple expressions</description>
   <command interpreter="python">
     filtering.py $input $out_file1 "$cond" ${input.metadata.columns} "${input.metadata.column_types}"
   </command>
   <inputs>
-    <param format="tabular" name="input" type="data" label="Filter" help="Query missing? See TIP below."/>
+    <param format="tabular" name="input" type="data" label="Filter" help="Dataset missing? See TIP below."/>
     <param name="cond" size="40" type="text" value="c1=='chr22'" label="With following condition" help="Double equal signs, ==, must be used as shown above. To filter for an arbitrary string, use the Select tool.">
       <validator type="empty_field" message="Enter a valid filtering condition, see syntax and examples below."/>
     </param>
@@ -23,6 +23,17 @@
       <param name="cond" value="c1=='chr1' and c3-c2>=2000 and c6=='+'"/>
       <output name="out_file1" file="filter1_test2.bed"/>
     </test>
+    <!-- Test filtering of file with a variable number of columns. -->
+    <test>
+      <param name="input" value="filter1_in3.sam"/>
+      <param name="cond" value="c3=='chr1' and c5>5"/>
+      <output name="out_file1" file="filter1_test3.sam"/>
+    </test>
+    <test>
+      <param name="input" value="filter1_inbad.bed"/>
+      <param name="cond" value="c1=='chr22'"/>
+      <output name="out_file1" file="filter1_test4.bed"/>
+    </test>
   </tests>
   <help>