# HG changeset patch
# User peterjc
# Date 1417088313 18000
# Node ID 1fc238a9fa95f991712d8bc4d34091c5ecd6ce2c
# Parent 9c2b10d7b8ac0294a66a5d858f1e9768ef084b6a
Uploaded v0.0.9a, handle white space in identifiers
diff -r 9c2b10d7b8ac -r 1fc238a9fa95 test-data/k12_hypothetical_alt.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/k12_hypothetical_alt.tabular Thu Nov 27 06:38:33 2014 -0500
@@ -0,0 +1,2 @@
+#ID and Description Length
+gi|16127999|ref|NP_414546.1| hypothetical protein b0005 [Escherichia coli str. K-12 substr. MG1655] 98
diff -r 9c2b10d7b8ac -r 1fc238a9fa95 tools/seq_select_by_id/README.rst
--- a/tools/seq_select_by_id/README.rst Fri Nov 21 08:32:19 2014 -0500
+++ b/tools/seq_select_by_id/README.rst Thu Nov 27 06:38:33 2014 -0500
@@ -77,6 +77,8 @@
v0.0.9 - Simplified XML to apply input format to output data.
- Tool definition now embeds citation information.
- Include input dataset name in output dataset names.
+ - If white space is found in the requested tabular field then only
+ the first word is used as the identifier (with a warning to stderr).
======= ======================================================================
@@ -92,7 +94,7 @@
For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball use
the following command from the Galaxy root folder::
- $ tar -czf seq_select_by_id.tar.gz tools/seq_select_by_id/README.rst tools/seq_select_by_id/seq_select_by_id.* tools/seq_select_by_id/tool_dependencies.xml test-data/k12_ten_proteins.fasta test-data/k12_hypothetical.fasta test-data/k12_hypothetical.tabular
+ $ tar -czf seq_select_by_id.tar.gz tools/seq_select_by_id/README.rst tools/seq_select_by_id/seq_select_by_id.* tools/seq_select_by_id/tool_dependencies.xml test-data/k12_ten_proteins.fasta test-data/k12_hypothetical.fasta test-data/k12_hypothetical.tabular test-data/k12_hypothetical_alt.tabular
Check this worked::
@@ -104,6 +106,7 @@
test-data/k12_ten_proteins.fasta
test-data/k12_hypothetical.fasta
test-data/k12_hypothetical.tabular
+ test-data/k12_hypothetical_alt.tabular
Licence (MIT)
diff -r 9c2b10d7b8ac -r 1fc238a9fa95 tools/seq_select_by_id/seq_select_by_id.py
--- a/tools/seq_select_by_id/seq_select_by_id.py Fri Nov 21 08:32:19 2014 -0500
+++ b/tools/seq_select_by_id/seq_select_by_id.py Thu Nov 27 06:38:33 2014 -0500
@@ -19,8 +19,6 @@
This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute UK.
All rights reserved. See accompanying text file for licence details (MIT
license).
-
-This is version 0.0.6 of the script.
"""
import sys
@@ -29,7 +27,7 @@
sys.exit(err)
if "-v" in sys.argv or "--version" in sys.argv:
- print "v0.0.6"
+ print "v0.0.9"
sys.exit(0)
#Parse Command Line
@@ -66,12 +64,25 @@
def parse_ids(tabular_file, col):
- """Read tabular file and record all specified identifiers."""
+ """Read tabular file and record all specified identifiers.
+
+ Will print a single warning to stderr if any of the fields have
+ internal white space after stripping leading and trailing white
+ space (only the first word will be used as the identifier).
+ """
handle = open(tabular_file, "rU")
+ warn = False
for line in handle:
if line.strip() and not line.startswith("#"):
- yield line.rstrip("\n").split("\t")[col].strip()
+ field = line.rstrip("\n").split("\t")[col].strip()
+ parts = field.split(None, 1)
+ if len(parts) > 1 and not warn:
+ warn = "WARNING: Some of your identifiers had white space in them, " + \
+ "using first word only. e.g.:\n%s\n" % field
+ yield parts[0]
handle.close()
+ if warn:
+ sys.stderr.write(warn)
#Index the sequence file.
#If very big, could use SeqIO.index_db() to avoid memory bottleneck...
diff -r 9c2b10d7b8ac -r 1fc238a9fa95 tools/seq_select_by_id/seq_select_by_id.xml
--- a/tools/seq_select_by_id/seq_select_by_id.xml Fri Nov 21 08:32:19 2014 -0500
+++ b/tools/seq_select_by_id/seq_select_by_id.xml Thu Nov 27 06:38:33 2014 -0500
@@ -28,6 +28,13 @@
+
+
+
+
+
+
+
**What it does**