changeset 15:1fc238a9fa95 draft

Uploaded v0.0.9a, handle white space in identifiers
author peterjc
date Thu, 27 Nov 2014 06:38:33 -0500
parents 9c2b10d7b8ac
children 9aa51c1a17cc
files test-data/k12_hypothetical_alt.tabular tools/seq_select_by_id/README.rst tools/seq_select_by_id/seq_select_by_id.py tools/seq_select_by_id/seq_select_by_id.xml
diffstat 4 files changed, 29 insertions(+), 6 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/k12_hypothetical_alt.tabular	Thu Nov 27 06:38:33 2014 -0500
@@ -0,0 +1,2 @@
+#ID and Description	Length
+gi|16127999|ref|NP_414546.1| hypothetical protein b0005 [Escherichia coli str. K-12 substr. MG1655]	98
--- a/tools/seq_select_by_id/README.rst	Fri Nov 21 08:32:19 2014 -0500
+++ b/tools/seq_select_by_id/README.rst	Thu Nov 27 06:38:33 2014 -0500
@@ -77,6 +77,8 @@
 v0.0.9  - Simplified XML to apply input format to output data.
         - Tool definition now embeds citation information.
         - Include input dataset name in output dataset names.
+        - If white space is found in the requested tabular field then only
+          the first word is used as the identifier (with a warning to stderr).
 ======= ======================================================================
 
 
@@ -92,7 +94,7 @@
 For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball use
 the following command from the Galaxy root folder::
 
-    $ tar -czf seq_select_by_id.tar.gz tools/seq_select_by_id/README.rst tools/seq_select_by_id/seq_select_by_id.* tools/seq_select_by_id/tool_dependencies.xml test-data/k12_ten_proteins.fasta test-data/k12_hypothetical.fasta test-data/k12_hypothetical.tabular
+    $ tar -czf seq_select_by_id.tar.gz tools/seq_select_by_id/README.rst tools/seq_select_by_id/seq_select_by_id.* tools/seq_select_by_id/tool_dependencies.xml test-data/k12_ten_proteins.fasta test-data/k12_hypothetical.fasta test-data/k12_hypothetical.tabular test-data/k12_hypothetical_alt.tabular
 
 Check this worked::
 
@@ -104,6 +106,7 @@
     test-data/k12_ten_proteins.fasta
     test-data/k12_hypothetical.fasta
     test-data/k12_hypothetical.tabular
+    test-data/k12_hypothetical_alt.tabular
 
 
 Licence (MIT)
--- a/tools/seq_select_by_id/seq_select_by_id.py	Fri Nov 21 08:32:19 2014 -0500
+++ b/tools/seq_select_by_id/seq_select_by_id.py	Thu Nov 27 06:38:33 2014 -0500
@@ -19,8 +19,6 @@
 This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute UK.
 All rights reserved. See accompanying text file for licence details (MIT
 license).
-
-This is version 0.0.6 of the script.
 """
 import sys
 
@@ -29,7 +27,7 @@
     sys.exit(err)
 
 if "-v" in sys.argv or "--version" in sys.argv:
-    print "v0.0.6"
+    print "v0.0.9"
     sys.exit(0)
 
 #Parse Command Line
@@ -66,12 +64,25 @@
 
 
 def parse_ids(tabular_file, col):
-    """Read tabular file and record all specified identifiers."""
+    """Read tabular file and record all specified identifiers.
+
+    Will print a single warning to stderr if any of the fields have
+    non-trailing white space (only the first word will be used as
+    the identifier).
+    """
     handle = open(tabular_file, "rU")
+    warn = False
     for line in handle:
         if line.strip() and not line.startswith("#"):
-            yield line.rstrip("\n").split("\t")[col].strip()
+            field = line.rstrip("\n").split("\t")[col].strip()
+            parts = field.split(None, 1)
+            if len(parts) > 1 and not warn:
+                warn = "WARNING: Some of your identifiers had white space in them, " + \
+                       "using first word only. e.g.:\n%s\n" % field
+            yield parts[0]
     handle.close()
+    if warn:
+        sys.stderr.write(warn)
 
 #Index the sequence file.
 #If very big, could use SeqIO.index_db() to avoid memory bottleneck...
--- a/tools/seq_select_by_id/seq_select_by_id.xml	Fri Nov 21 08:32:19 2014 -0500
+++ b/tools/seq_select_by_id/seq_select_by_id.xml	Thu Nov 27 06:38:33 2014 -0500
@@ -28,6 +28,13 @@
             <param name="column" value="1" />
             <output name="output_file" file="k12_hypothetical.fasta" ftype="fasta" />
         </test>
+        <!-- this version has white space in the identifier column (id and description) -->
+        <test>
+            <param name="input_file" value="k12_ten_proteins.fasta" ftype="fasta" />
+            <param name="input_tabular" value="k12_hypothetical_alt.tabular" ftype="tabular" />
+            <param name="column" value="1" />
+            <output name="output_file" file="k12_hypothetical.fasta" ftype="fasta" />
+        </test>
     </tests>
     <help>
 **What it does**