# HG changeset patch # User peterjc # Date 1417095696 18000 # Node ID b5967bd17660a287679905000fa6ab25aa91dfe0 # Parent a6cb323413c04b928b955c9c30f8fb1248d3e686 Uploaded v0.2.1a, handle white space in identifier fields diff -r a6cb323413c0 -r b5967bd17660 test-data/k12_hypothetical_alt.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/k12_hypothetical_alt.tabular Thu Nov 27 08:41:36 2014 -0500 @@ -0,0 +1,2 @@ +#ID and Description Length +gi|16127999|ref|NP_414546.1| hypothetical protein b0005 [Escherichia coli str. K-12 substr. MG1655] 98 diff -r a6cb323413c0 -r b5967bd17660 tools/seq_filter_by_id/README.rst --- a/tools/seq_filter_by_id/README.rst Wed Nov 19 10:05:51 2014 -0500 +++ b/tools/seq_filter_by_id/README.rst Thu Nov 27 08:41:36 2014 -0500 @@ -81,6 +81,8 @@ v0.2.1 - Use Biopython instead of Galaxy for FASTQ handling. - Tool definition now embeds citation information. - Include input dataset name in output dataset names. + - If white space is found in the requested tabular field then only + the first word is used as the identifier (with a warning to stderr). ======= ====================================================================== @@ -97,7 +99,7 @@ For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball use the following command from the Galaxy root folder:: - $ tar -czf seq_filter_by_id.tar.gz tools/seq_filter_by_id/README.rst tools/seq_filter_by_id/seq_filter_by_id.* tools/seq_filter_by_id/tool_dependencies.xml test-data/k12_ten_proteins.fasta test-data/k12_hypothetical.fasta test-data/k12_hypothetical.tabular test-data/sanger-pairs-mixed.fastq test-data/sanger-pairs-names.tabular test-data/sanger-sample.fastq test-data/empty_file.dat + $ tar -czf seq_filter_by_id.tar.gz tools/seq_filter_by_id/README.rst tools/seq_filter_by_id/seq_filter_by_id.* tools/seq_filter_by_id/tool_dependencies.xml test-data/k12_ten_proteins.fasta test-data/k12_hypothetical.fasta test-data/k12_hypothetical_alt.tabular test-data/k12_hypothetical.tabular test-data/sanger-pairs-mixed.fastq test-data/sanger-pairs-names.tabular test-data/sanger-sample.fastq test-data/empty_file.dat Check this worked:: @@ -109,6 +111,7 @@ test-data/k12_ten_proteins.fasta test-data/k12_hypothetical.fasta test-data/k12_hypothetical.tabular + test-data/k12_hypothetical_alt.tabular test-data/sanger-pairs-mixed.fastq test-data/sanger-pairs-names.tabular test-data/sanger-sample.fastq diff -r a6cb323413c0 -r b5967bd17660 tools/seq_filter_by_id/seq_filter_by_id.py --- a/tools/seq_filter_by_id/seq_filter_by_id.py Wed Nov 19 10:05:51 2014 -0500 +++ b/tools/seq_filter_by_id/seq_filter_by_id.py Thu Nov 27 08:41:36 2014 -0500 @@ -137,9 +137,19 @@ stop_err("Expect one-based column numbers (not zero-based counting), got %r" % cols_arg) identifiers.append((tabular_file, columns)) +name_warn = False +def check_white_space(name): + parts = name.split(None, 1) + global name_warn + if not name_warn and len(parts) > 1: + name_warn = "WARNING: Some of your identifiers had white space in them, " + \ + "using first word only. e.g.:\n%s\n" % name + return parts[0] + if drop_suffices: def clean_name(name): """Remove suffix.""" + name = check_white_space(name) match = re_suffix.search(name) if match: # Use the fact this is a suffix, and regular expression will be @@ -155,9 +165,9 @@ assert clean_name("baz.p1") == "baz" assert clean_name("baz.q2") == "baz" else: - def clean_name(name): - """Do nothing!""" - return name + # Just check the white space + clean_name = check_white_space + mapped_chars = { '>' :'__gt__', '<' :'__lt__', @@ -217,7 +227,8 @@ print "Have %i IDs combined from %i tabular files" % (len(ids), len(identifiers)) else: print "Have %i IDs in common from %i tabular files" % (len(ids), len(identifiers)) - +if name_warn: + sys.stderr.write(name_warn) def crude_fasta_iterator(handle): """Yields tuples, record ID and the full record as a string.""" diff -r a6cb323413c0 -r b5967bd17660 tools/seq_filter_by_id/seq_filter_by_id.xml --- a/tools/seq_filter_by_id/seq_filter_by_id.xml Wed Nov 19 10:05:51 2014 -0500 +++ b/tools/seq_filter_by_id/seq_filter_by_id.xml Thu Nov 27 08:41:36 2014 -0500 @@ -100,7 +100,7 @@ - +