Mercurial > repos > peterjc > seq_filter_by_id

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/k12_hypothetical_alt.tabular	Thu Nov 27 08:41:36 2014 -0500
@@ -0,0 +1,2 @@
+#ID and Description	Length
+gi|16127999|ref|NP_414546.1| hypothetical protein b0005 [Escherichia coli str. K-12 substr. MG1655]	98
--- a/tools/seq_filter_by_id/README.rst	Wed Nov 19 10:05:51 2014 -0500
+++ b/tools/seq_filter_by_id/README.rst	Thu Nov 27 08:41:36 2014 -0500
@@ -81,6 +81,8 @@
 v0.2.1  - Use Biopython instead of Galaxy for FASTQ handling.
         - Tool definition now embeds citation information.
         - Include input dataset name in output dataset names.
+        - If the requested tabular field contains white space then only its
+          first word is used as the identifier (with a warning to stderr).
 ======= ======================================================================


@@ -97,7 +99,7 @@
 For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball use
 the following command from the Galaxy root folder::

-    $ tar -czf seq_filter_by_id.tar.gz tools/seq_filter_by_id/README.rst tools/seq_filter_by_id/seq_filter_by_id.* tools/seq_filter_by_id/tool_dependencies.xml test-data/k12_ten_proteins.fasta test-data/k12_hypothetical.fasta test-data/k12_hypothetical.tabular test-data/sanger-pairs-mixed.fastq test-data/sanger-pairs-names.tabular test-data/sanger-sample.fastq test-data/empty_file.dat
+    $ tar -czf seq_filter_by_id.tar.gz tools/seq_filter_by_id/README.rst tools/seq_filter_by_id/seq_filter_by_id.* tools/seq_filter_by_id/tool_dependencies.xml test-data/k12_ten_proteins.fasta test-data/k12_hypothetical.fasta test-data/k12_hypothetical_alt.tabular test-data/k12_hypothetical.tabular test-data/sanger-pairs-mixed.fastq test-data/sanger-pairs-names.tabular test-data/sanger-sample.fastq test-data/empty_file.dat

 Check this worked::

@@ -109,6 +111,7 @@
     test-data/k12_ten_proteins.fasta
     test-data/k12_hypothetical.fasta
     test-data/k12_hypothetical.tabular
+    test-data/k12_hypothetical_alt.tabular
     test-data/sanger-pairs-mixed.fastq
     test-data/sanger-pairs-names.tabular
     test-data/sanger-sample.fastq
--- a/tools/seq_filter_by_id/seq_filter_by_id.py	Wed Nov 19 10:05:51 2014 -0500
+++ b/tools/seq_filter_by_id/seq_filter_by_id.py	Thu Nov 27 08:41:36 2014 -0500
@@ -137,9 +137,19 @@
         stop_err("Expect one-based column numbers (not zero-based counting), got %r" % cols_arg)
     identifiers.append((tabular_file, columns))

+name_warn = False
+def check_white_space(name):
+    parts = name.split(None, 1)
+    global name_warn
+    if not name_warn and len(parts) > 1:
+        name_warn = "WARNING: Some of your identifiers had white space in them, " + \
+                    "using first word only. e.g.:\n%s\n" % name
+    return parts[0]
+
 if drop_suffices:
     def clean_name(name):
         """Remove suffix."""
+        name = check_white_space(name)
         match = re_suffix.search(name)
         if match:
             # Use the fact this is a suffix, and regular expression will be
@@ -155,9 +165,9 @@
     assert clean_name("baz.p1") == "baz"
     assert clean_name("baz.q2") == "baz"
 else:
-    def clean_name(name):
-        """Do nothing!"""
-        return name
+    # No suffix to drop: just keep the first word (warning about white space)
+    clean_name = check_white_space
+

 mapped_chars = { '>' :'__gt__',
                  '<' :'__lt__',
@@ -217,7 +227,8 @@
         print "Have %i IDs combined from %i tabular files" % (len(ids), len(identifiers))
     else:
         print "Have %i IDs in common from %i tabular files" % (len(ids), len(identifiers))
-
+if name_warn:
+    sys.stderr.write(name_warn)

 def crude_fasta_iterator(handle):
     """Yields tuples, record ID and the full record as a string."""
--- a/tools/seq_filter_by_id/seq_filter_by_id.xml	Wed Nov 19 10:05:51 2014 -0500
+++ b/tools/seq_filter_by_id/seq_filter_by_id.xml	Thu Nov 27 08:41:36 2014 -0500
@@ -100,7 +100,7 @@
         </test>
         <test>
             <param name="input_file" value="k12_ten_proteins.fasta" ftype="fasta" />
-            <param name="input_tabular" value="k12_hypothetical.tabular" ftype="tabular" />
+            <param name="input_tabular" value="k12_hypothetical_alt.tabular" ftype="tabular" />
             <param name="columns" value="1" />
             <param name="output_choice" value="pos" />
             <param name="adv_opts_selector" value="advanced" />