# HG changeset patch
# User peterjc
# Date 1417094454 18000
# Node ID ba3ec1b3e6350350603e6ea91788e8586bbef3df
# Parent 61cec46f6be595505f649c62ba5d988ce85d3476
Uploaded v0.0.6, handle white space in identifier columns, embed citation, simpler XML
diff -r 61cec46f6be5 -r ba3ec1b3e635 test-data/four_human_proteins.rename.tabular
--- a/test-data/four_human_proteins.rename.tabular Wed Nov 20 12:11:32 2013 -0500
+++ b/test-data/four_human_proteins.rename.tabular Thu Nov 27 08:20:54 2014 -0500
@@ -1,5 +1,5 @@
#FASTA ID
-sp|Q9BS26|ERP44_HUMAN Q9BS26
+sp|Q9BS26|ERP44_HUMAN Q9BS26 and ignore this description
sp|Q9NSY1|BMP2K_HUMAN Q9NSY1
-sp|P06213|INSR_HUMAN P06213
+sp|P06213|INSR_HUMAN and ignore this description P06213
sp|P08100|OPSD_HUMAN P08100
diff -r 61cec46f6be5 -r ba3ec1b3e635 tools/seq_rename/README.rst
--- a/tools/seq_rename/README.rst Wed Nov 20 12:11:32 2013 -0500
+++ b/tools/seq_rename/README.rst Thu Nov 27 08:20:54 2014 -0500
@@ -1,7 +1,7 @@
Galaxy tool to rename FASTA, QUAL, FASTQ or SFF sequences
=========================================================
-This tool is copyright 2011-2013 by Peter Cock, The James Hutton Institute
+This tool is copyright 2011-2014 by Peter Cock, The James Hutton Institute
(formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved.
See the licence text below.
@@ -35,20 +35,20 @@
There are just two files to install to use this tool from within Galaxy:
-* seq_rename.py (the Python script)
-* seq_rename.xml (the Galaxy tool definition)
+* ``seq_rename.py`` (the Python script)
+* ``seq_rename.xml`` (the Galaxy tool definition)
-The suggested location is in a dedicated tools/seq_rename folder.
+The suggested location is in a dedicated ``tools/seq_rename`` folder.
-You will also need to modify the tools_conf.xml file to tell Galaxy to offer the
+You will also need to modify the ``tools_conf.xml`` file to tell Galaxy to offer the
tool. One suggested location is in the filters section. Simply add the line::
-If you wish to run the unit tests, also add this to tools_conf.xml.sample
-and move/copy the test-data files under Galaxy's test-data folder. Then::
+If you wish to run the unit tests, also move/copy the ``test-data/`` files
+under Galaxy's ``test-data/`` folder. Then::
- $ ./run_functional_tests.sh -id seq_rename
+ $ ./run_tests.sh -id seq_rename
You will also need to install Biopython 1.54 or later. That's it.
@@ -70,7 +70,11 @@
- Updated citation information (Cock et al. 2013).
- Development moved to GitHub, https://github.com/peterjc/pico_galaxy
- Renamed folder and adopted README.rst naming.
-v0.0.5 - Correct automated dependency definition
+v0.0.5 - Correct automated dependency definition.
+v0.0.6 - Simplified XML to apply input format to output data.
+ - Tool definition now embeds citation information.
+ - If white space is found in the requested tabular field then only
+ the first word is used as the identifier (with a warning to stderr).
======= ======================================================================
diff -r 61cec46f6be5 -r ba3ec1b3e635 tools/seq_rename/seq_rename.py
--- a/tools/seq_rename/seq_rename.py Wed Nov 20 12:11:32 2013 -0500
+++ b/tools/seq_rename/seq_rename.py Thu Nov 27 08:20:54 2014 -0500
@@ -20,13 +20,11 @@
This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute UK.
All rights reserved. See accompanying text file for licence details (MIT
license).
-
-This is version 0.0.4 of the script.
"""
import sys
if "-v" in sys.argv or "--version" in sys.argv:
- print "v0.0.4"
+ print "v0.0.6"
sys.exit(0)
def stop_err(msg, err=1):
@@ -57,13 +55,38 @@
stop_err("Old and new column arguments are the same!")
def parse_ids(tabular_file, old_col, new_col):
- """Read tabular file and record all specified ID mappings."""
+ """Read tabular file and record all specified ID mappings.
+
+ Will print at most one warning per column (old and new) to stderr if
+ any entries in that column have non-trailing white space (only the
+ first word will be used as the identifier).
+
+ Internal white space in either column is not kept: text after the first word is dropped.
+ """
handle = open(tabular_file, "rU")
+ old_warn = False
+ new_warn = False
for line in handle:
+ if not line.strip():
+ # Ignore blank lines
+ continue
if not line.startswith("#"):
parts = line.rstrip("\n").split("\t")
- yield parts[old_col].strip(), parts[new_col].strip()
+ old = parts[old_col].strip().split(None, 1)
+ new = parts[new_col].strip().split(None, 1)
+ if not old_warn and len(old) > 1:
+ old_warn = "WARNING: Some of your old identifiers had white space in them, " + \
+ "using first word only. e.g.:\n%s\n" % parts[old_col].strip()
+ if not new_warn and len(new) > 1:
+ new_warn = "WARNING: Some of your new identifiers had white space in them, " + \
+ "using first word only. e.g.:\n%s\n" % parts[new_col].strip()
+ yield old[0], new[0]
handle.close()
+ if old_warn:
+ sys.stderr.write(old_warn)
+ if new_warn:
+ sys.stderr.write(new_warn)
+
#Load the rename mappings
rename = dict(parse_ids(tabular_file, old_column, new_column))
diff -r 61cec46f6be5 -r ba3ec1b3e635 tools/seq_rename/seq_rename.xml
--- a/tools/seq_rename/seq_rename.xml Wed Nov 20 12:11:32 2013 -0500
+++ b/tools/seq_rename/seq_rename.xml Thu Nov 27 08:20:54 2014 -0500
@@ -1,4 +1,4 @@
-
+
with ID mapping from a tabular file
biopython
@@ -20,17 +20,7 @@
-
-
-
-
-
-
-
-
-
-
-
+
@@ -55,12 +45,17 @@
new sequence file (of the same format) where the sequence identifiers have been
renamed according to the specified columns in your tabular file.
+Any original description is preserved (N/A for the SFF file format).
+
WARNING: If you have any duplicates in the input sequence file, you will still
have duplicate sequences in the output.
WARNING: If the tabular file has more than one new name for any old ID, the
last one is used.
+WARNING: The old and new names in your tabular file should not contain white space.
+If they do, only the first word is used as the identifier.
+
**References**
If you use this Galaxy tool in work leading to a scientific publication please
@@ -81,4 +76,8 @@
This tool is available to install into other Galaxy Instances via the Galaxy
Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/seq_rename
+
+ 10.7717/peerj.167
+ 10.1093/bioinformatics/btp163
+
diff -r 61cec46f6be5 -r ba3ec1b3e635 tools/seq_rename/tool_dependencies.xml
--- a/tools/seq_rename/tool_dependencies.xml Wed Nov 20 12:11:32 2013 -0500
+++ b/tools/seq_rename/tool_dependencies.xml Thu Nov 27 08:20:54 2014 -0500
@@ -1,6 +1,6 @@
-
+