Mercurial > repos > peterjc > seq_filter_by_id
changeset 16:b5967bd17660 draft
Uploaded v0.2.1a, handle white space in identifier fields
author | peterjc |
---|---|
date | Thu, 27 Nov 2014 08:41:36 -0500 |
parents | a6cb323413c0 |
children | 4ae973da053b |
files | test-data/k12_hypothetical_alt.tabular tools/seq_filter_by_id/README.rst tools/seq_filter_by_id/seq_filter_by_id.py tools/seq_filter_by_id/seq_filter_by_id.xml |
diffstat | 4 files changed, 22 insertions(+), 6 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/k12_hypothetical_alt.tabular Thu Nov 27 08:41:36 2014 -0500 @@ -0,0 +1,2 @@ +#ID and Description Length +gi|16127999|ref|NP_414546.1| hypothetical protein b0005 [Escherichia coli str. K-12 substr. MG1655] 98
--- a/tools/seq_filter_by_id/README.rst Wed Nov 19 10:05:51 2014 -0500 +++ b/tools/seq_filter_by_id/README.rst Thu Nov 27 08:41:36 2014 -0500 @@ -81,6 +81,8 @@ v0.2.1 - Use Biopython instead of Galaxy for FASTQ handling. - Tool definition now embeds citation information. - Include input dataset name in output dataset names. + - If white space is found in the requested tabular field then only + the first word is used as the identifier (with a warning to stderr). ======= ====================================================================== @@ -97,7 +99,7 @@ For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball use the following command from the Galaxy root folder:: - $ tar -czf seq_filter_by_id.tar.gz tools/seq_filter_by_id/README.rst tools/seq_filter_by_id/seq_filter_by_id.* tools/seq_filter_by_id/tool_dependencies.xml test-data/k12_ten_proteins.fasta test-data/k12_hypothetical.fasta test-data/k12_hypothetical.tabular test-data/sanger-pairs-mixed.fastq test-data/sanger-pairs-names.tabular test-data/sanger-sample.fastq test-data/empty_file.dat + $ tar -czf seq_filter_by_id.tar.gz tools/seq_filter_by_id/README.rst tools/seq_filter_by_id/seq_filter_by_id.* tools/seq_filter_by_id/tool_dependencies.xml test-data/k12_ten_proteins.fasta test-data/k12_hypothetical.fasta test-data/k12_hypothetical_alt.tabular test-data/k12_hypothetical.tabular test-data/sanger-pairs-mixed.fastq test-data/sanger-pairs-names.tabular test-data/sanger-sample.fastq test-data/empty_file.dat Check this worked:: @@ -109,6 +111,7 @@ test-data/k12_ten_proteins.fasta test-data/k12_hypothetical.fasta test-data/k12_hypothetical.tabular + test-data/k12_hypothetical_alt.tabular test-data/sanger-pairs-mixed.fastq test-data/sanger-pairs-names.tabular test-data/sanger-sample.fastq
--- a/tools/seq_filter_by_id/seq_filter_by_id.py Wed Nov 19 10:05:51 2014 -0500 +++ b/tools/seq_filter_by_id/seq_filter_by_id.py Thu Nov 27 08:41:36 2014 -0500 @@ -137,9 +137,19 @@ stop_err("Expect one-based column numbers (not zero-based counting), got %r" % cols_arg) identifiers.append((tabular_file, columns)) +name_warn = False +def check_white_space(name): + parts = name.split(None, 1) + global name_warn + if not name_warn and len(parts) > 1: + name_warn = "WARNING: Some of your identifiers had white space in them, " + \ + "using first word only. e.g.:\n%s\n" % name + return parts[0] + if drop_suffices: def clean_name(name): """Remove suffix.""" + name = check_white_space(name) match = re_suffix.search(name) if match: # Use the fact this is a suffix, and regular expression will be @@ -155,9 +165,9 @@ assert clean_name("baz.p1") == "baz" assert clean_name("baz.q2") == "baz" else: - def clean_name(name): - """Do nothing!""" - return name + # Just check the white space + clean_name = check_white_space + mapped_chars = { '>' :'__gt__', '<' :'__lt__', @@ -217,7 +227,8 @@ print "Have %i IDs combined from %i tabular files" % (len(ids), len(identifiers)) else: print "Have %i IDs in common from %i tabular files" % (len(ids), len(identifiers)) - +if name_warn: + sys.stderr.write(name_warn) def crude_fasta_iterator(handle): """Yields tuples, record ID and the full record as a string."""
--- a/tools/seq_filter_by_id/seq_filter_by_id.xml Wed Nov 19 10:05:51 2014 -0500 +++ b/tools/seq_filter_by_id/seq_filter_by_id.xml Thu Nov 27 08:41:36 2014 -0500 @@ -100,7 +100,7 @@ </test> <test> <param name="input_file" value="k12_ten_proteins.fasta" ftype="fasta" /> - <param name="input_tabular" value="k12_hypothetical.tabular" ftype="tabular" /> + <param name="input_tabular" value="k12_hypothetical_alt.tabular" ftype="tabular" /> <param name="columns" value="1" /> <param name="output_choice" value="pos" /> <param name="adv_opts_selector" value="advanced" />