Mercurial > repos > peterjc > seq_select_by_id
changeset 15:1fc238a9fa95 draft
Uploaded v0.0.9a, handle white space in identifiers
author | peterjc |
---|---|
date | Thu, 27 Nov 2014 06:38:33 -0500 |
parents | 9c2b10d7b8ac |
children | 9aa51c1a17cc |
files | test-data/k12_hypothetical_alt.tabular tools/seq_select_by_id/README.rst tools/seq_select_by_id/seq_select_by_id.py tools/seq_select_by_id/seq_select_by_id.xml |
diffstat | 4 files changed, 29 insertions(+), 6 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/k12_hypothetical_alt.tabular Thu Nov 27 06:38:33 2014 -0500
@@ -0,0 +1,2 @@
+#ID and Description Length
+gi|16127999|ref|NP_414546.1| hypothetical protein b0005 [Escherichia coli str. K-12 substr. MG1655] 98
--- a/tools/seq_select_by_id/README.rst Fri Nov 21 08:32:19 2014 -0500 +++ b/tools/seq_select_by_id/README.rst Thu Nov 27 06:38:33 2014 -0500 @@ -77,6 +77,8 @@ v0.0.9 - Simplified XML to apply input format to output data. - Tool definition now embeds citation information. - Include input dataset name in output dataset names. + - If white space is found in the requested tabular field then only + the first word is used as the identifier (with a warning to stderr). ======= ====================================================================== @@ -92,7 +94,7 @@ For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball use the following command from the Galaxy root folder:: - $ tar -czf seq_select_by_id.tar.gz tools/seq_select_by_id/README.rst tools/seq_select_by_id/seq_select_by_id.* tools/seq_select_by_id/tool_dependencies.xml test-data/k12_ten_proteins.fasta test-data/k12_hypothetical.fasta test-data/k12_hypothetical.tabular + $ tar -czf seq_select_by_id.tar.gz tools/seq_select_by_id/README.rst tools/seq_select_by_id/seq_select_by_id.* tools/seq_select_by_id/tool_dependencies.xml test-data/k12_ten_proteins.fasta test-data/k12_hypothetical.fasta test-data/k12_hypothetical.tabular test-data/k12_hypothetical_alt.tabular Check this worked:: @@ -104,6 +106,7 @@ test-data/k12_ten_proteins.fasta test-data/k12_hypothetical.fasta test-data/k12_hypothetical.tabular + test-data/k12_hypothetical_alt.tabular Licence (MIT)
--- a/tools/seq_select_by_id/seq_select_by_id.py Fri Nov 21 08:32:19 2014 -0500 +++ b/tools/seq_select_by_id/seq_select_by_id.py Thu Nov 27 06:38:33 2014 -0500 @@ -19,8 +19,6 @@ This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute UK. All rights reserved. See accompanying text file for licence details (MIT license). - -This is version 0.0.6 of the script. """ import sys @@ -29,7 +27,7 @@ sys.exit(err) if "-v" in sys.argv or "--version" in sys.argv: - print "v0.0.6" + print "v0.0.9" sys.exit(0) #Parse Command Line @@ -66,12 +64,25 @@ def parse_ids(tabular_file, col): - """Read tabular file and record all specified identifiers.""" + """Read tabular file and record all specified identifiers. + + Will print a single warning to stderr if any of the fields have + non-trailing white space (only the first word will be used as + the identifier). + """ handle = open(tabular_file, "rU") + warn = False for line in handle: if line.strip() and not line.startswith("#"): - yield line.rstrip("\n").split("\t")[col].strip() + field = line.rstrip("\n").split("\t")[col].strip() + parts = field.split(None, 1) + if len(parts) > 1 and not warn: + warn = "WARNING: Some of your identifiers had white space in them, " + \ + "using first word only. e.g.:\n%s\n" % field + yield parts[0] handle.close() + if warn: + sys.stderr.write(warn) #Index the sequence file. #If very big, could use SeqIO.index_db() to avoid memory bottleneck...
--- a/tools/seq_select_by_id/seq_select_by_id.xml Fri Nov 21 08:32:19 2014 -0500
+++ b/tools/seq_select_by_id/seq_select_by_id.xml Thu Nov 27 06:38:33 2014 -0500
@@ -28,6 +28,13 @@
             <param name="column" value="1" />
             <output name="output_file" file="k12_hypothetical.fasta" ftype="fasta" />
         </test>
+        <!-- this version has white space in the identifier column (id and description) -->
+        <test>
+            <param name="input_file" value="k12_ten_proteins.fasta" ftype="fasta" />
+            <param name="input_tabular" value="k12_hypothetical_alt.tabular" ftype="tabular" />
+            <param name="column" value="1" />
+            <output name="output_file" file="k12_hypothetical.fasta" ftype="fasta" />
+        </test>
     </tests>
     <help>
 **What it does**