Mercurial > repos > peterjc > tmhmm_and_signalp
changeset 26:20139cb4c844 draft
planemo upload for repository https://github.com/peterjc/pico_galaxy/tools/protein_analysis commit 221d4187992cbb993e02dc3ea0ef0150c7916a4a-dirty
author | peterjc |
---|---|
date | Wed, 13 May 2015 06:14:42 -0400 |
parents | 41a42022f815 |
children | 9e36a1b9302d |
files | tools/protein_analysis/README.rst tools/protein_analysis/promoter2.py tools/protein_analysis/promoter2.xml tools/protein_analysis/psortb.py tools/protein_analysis/psortb.xml tools/protein_analysis/rxlr_motifs.py tools/protein_analysis/rxlr_motifs.xml tools/protein_analysis/seq_analysis_utils.py tools/protein_analysis/signalp3.py tools/protein_analysis/signalp3.xml tools/protein_analysis/suite_config.xml tools/protein_analysis/tmhmm2.py tools/protein_analysis/tmhmm2.xml tools/protein_analysis/wolf_psort.py tools/protein_analysis/wolf_psort.xml |
diffstat | 15 files changed, 290 insertions(+), 223 deletions(-) [+] |
line wrap: on
line diff
--- a/tools/protein_analysis/README.rst Fri Nov 21 08:17:36 2014 -0500 +++ b/tools/protein_analysis/README.rst Wed May 13 06:14:42 2015 -0400 @@ -14,7 +14,7 @@ To use these Galaxy wrappers you must first install the command line tools. At the time of writing they are all free for academic use, or open source. -These wrappers are copyright 2010-2013 by Peter Cock, James Hutton Institute +These wrappers are copyright 2010-2015 by Peter Cock, James Hutton Institute (formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved. Contributions/revisions copyright 2011 Konrad Paszkiewicz. All rights reserved. See the included LICENCE file for details (MIT open source licence). @@ -174,6 +174,9 @@ v0.2.6 - Use the new ``$GALAXY_SLOTS`` environment variable for thread count. - Updated the ``suite_config.xml`` file (overdue). - Tool definition now embeds citation information. +v0.2.7 - Style cleanup in Python scripts. +v0.2.8 - Reorder XML elements (internal change only). + - Planemo for Tool Shed upload (``.shed.yml``, internal change only). ======= ====================================================================== @@ -187,10 +190,61 @@ Development has now moved to a dedicated GitHub repository: https://github.com/peterjc/pico_galaxy/tree/master/tools -For making the "Galaxy Tool Shed" http://community.g2.bx.psu.edu/ tarball use -the following command from the Galaxy root folder:: + +For pushing a release to the test or main "Galaxy Tool Shed", use the following +Planemo commands (which requires you have set your Tool Shed access details in +``~/.planemo.yml`` and that you have access rights on the Tool Shed):: + + $ planemo shed_upload --shed_target testtoolshed --check_diff ~/repositories/pico_galaxy/tools/protein_analysis/ + ... + +or:: + + $ planemo shed_upload --shed_target toolshed --check_diff ~/repositories/pico_galaxy/tools/protein_analysis/ + ... + +To just build and check the tar ball, use:: - $ ./tools/protein_analysis/make_tmhmm_and_signalp.sh + $ planemo shed_upload --tar_only ~/repositories/pico_galaxy/tools/protein_analysis/ + ... + $ tar -tzf shed_upload.tar.gz + test-data/Adenovirus.fasta + test-data/Adenovirus.promoter2.tabular + test-data/empty.fasta + test-data/empty_promoter2.tabular + test-data/empty_psortb_terse.tabular + test-data/empty_rxlr.Bhattacharjee2006.tabular + test-data/empty_rxlr.Whisson2007.tabular + test-data/empty_rxlr.Win2007.tabular + test-data/empty_signalp3.tabular + test-data/empty_tmhmm2.tabular + test-data/empty_wolf_psort.tabular + test-data/four_human_proteins.fasta + test-data/four_human_proteins.signalp3.tabular + test-data/four_human_proteins.tmhmm2.tabular + test-data/four_human_proteins.wolf_psort.tabular + test-data/k12_ten_proteins.fasta + test-data/k12_ten_proteins_psortb_p_terse.tabular + test-data/rxlr_win_et_al_2007.fasta + test-data/rxlr_win_et_al_2007.tabular + test-data/rxlr_win_et_al_2007_sp3.tabular + tools/protein_analysis/LICENSE.txt + tools/protein_analysis/README.rst + tools/protein_analysis/promoter2.py + tools/protein_analysis/promoter2.xml + tools/protein_analysis/psortb.py + tools/protein_analysis/psortb.xml + tools/protein_analysis/rxlr_motifs.py + tools/protein_analysis/rxlr_motifs.xml + tools/protein_analysis/seq_analysis_utils.py + tools/protein_analysis/signalp3.py + tools/protein_analysis/signalp3.xml + tools/protein_analysis/suite_config.xml + tools/protein_analysis/tmhmm2.py + tools/protein_analysis/tmhmm2.xml + tools/protein_analysis/whisson_et_al_rxlr_eer_cropped.hmm + tools/protein_analysis/wolf_psort.py + tools/protein_analysis/wolf_psort.xml This simplifies ensuring a consistent set of files is bundled each time, including all the relevant test files.
--- a/tools/protein_analysis/promoter2.py Fri Nov 21 08:17:36 2014 -0500 +++ b/tools/protein_analysis/promoter2.py Wed May 13 06:14:42 2015 -0400 @@ -30,12 +30,12 @@ import os import commands import tempfile -from seq_analysis_utils import stop_err, split_fasta, run_jobs, thread_count +from seq_analysis_utils import sys_exit, split_fasta, run_jobs, thread_count FASTA_CHUNK = 500 if len(sys.argv) != 4: - stop_err("Require three arguments, number of threads (int), input DNA FASTA file & output tabular file. " + sys_exit("Require three arguments, number of threads (int), input DNA FASTA file & output tabular file. " "Got %i arguments." % (len(sys.argv)-1)) num_threads = thread_count(sys.argv[3],default=4) @@ -48,7 +48,7 @@ platform = commands.getoutput("uname") #e.g. Linux shell_script = commands.getoutput("which promoter") if not os.path.isfile(shell_script): - stop_err("ERROR: Missing promoter executable shell script") + sys_exit("ERROR: Missing promoter executable shell script") path = None for line in open(shell_script): if line.startswith("setenv"): #could then be tab or space! @@ -56,12 +56,12 @@ if parts[0] == "setenv" and parts[1] == "PROM": path = parts[2] if not path: - stop_err("ERROR: Could not find promoter path (PROM) in %r" % shell_script) + sys_exit("ERROR: Could not find promoter path (PROM) in %r" % shell_script) if not os.path.isdir(path): - stop_error("ERROR: %r is not a directory" % path) + sys_exit("ERROR: %r is not a directory" % path) bin = "%s/bin/promoter_%s" % (path, platform) if not os.path.isfile(bin): - stop_err("ERROR: Missing promoter binary %r" % bin) + sys_exit("ERROR: Missing promoter binary %r" % bin) return path, bin def make_tabular(raw_handle, out_handle): @@ -86,19 +86,19 @@ except ValueError: print "WARNING: Problem with line: %r" % line continue - #stop_err("ERROR: Problem with line: %r" % line) + #sys_exit("ERROR: Problem with line: %r" % line) if likelihood not in ["ignored", "Marginal prediction", "Medium likely prediction", "Highly likely prediction"]: - stop_err("ERROR: Problem with line: %r" % line) + sys_exit("ERROR: Problem with line: %r" % line) out_handle.write("%s\t%s\t%s\t%s\n" % (identifier, position, score, likelihood)) return queries working_dir, bin = get_path_and_binary() if not os.path.isfile(fasta_file): - stop_err("ERROR: Missing input FASTA file %r" % fasta_file) + sys_exit("ERROR: Missing input FASTA file %r" % fasta_file) #Note that if the input FASTA file contains no sequences, #split_fasta returns an empty list (i.e. zero temp files). @@ -133,7 +133,7 @@ except IOError: output = "" clean_up(fasta_files + temp_files) - stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), + sys_exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), error_level) del results @@ -148,7 +148,7 @@ data_handle.close() if not count: clean_up(fasta_files + temp_files) - stop_err("No output from promoter2") + sys_exit("No output from promoter2") queries += count out_handle.close()
--- a/tools/protein_analysis/promoter2.xml Fri Nov 21 08:17:36 2014 -0500 +++ b/tools/protein_analysis/promoter2.xml Wed May 13 06:14:42 2015 -0400 @@ -1,27 +1,28 @@ -<tool id="promoter2" name="Promoter 2.0" version="0.0.8"> +<tool id="promoter2" name="Promoter 2.0" version="0.0.9"> <description>Find eukaryotic PolII promoters in DNA sequences</description> <!-- If job splitting is enabled, break up the query file into parts --> <!-- Using 2000 per chunk so 4 threads each doing 500 is ideal --> <parallelism method="basic" split_inputs="fasta_file" split_mode="to_size" split_size="2000" merge_outputs="tabular_file"></parallelism> + <requirements> + <requirement type="binary">promoter</requirement> + <requirement type="package">promoter</requirement> + </requirements> + <stdio> + <!-- Anything other than zero is an error --> + <exit_code range="1:" /> + <exit_code range=":-1" /> + </stdio> <command interpreter="python"> promoter2.py "\$GALAXY_SLOTS" "$fasta_file" "$tabular_file" ##If the environment variable isn't set, get "", and the python wrapper ##defaults to four threads. </command> - <stdio> - <!-- Anything other than zero is an error --> - <exit_code range="1:" /> - <exit_code range=":-1" /> - </stdio> <inputs> <param name="fasta_file" type="data" format="fasta" label="FASTA file of DNA sequences"/> </inputs> <outputs> <data name="tabular_file" format="tabular" label="Promoter2 on ${fasta_file.name}" /> </outputs> - <requirements> - <requirement type="binary">promoter</requirement> - </requirements> <tests> <test> <param name="fasta_file" value="Adenovirus.fasta" ftype="fasta"/>
--- a/tools/protein_analysis/psortb.py Fri Nov 21 08:17:36 2014 -0500 +++ b/tools/protein_analysis/psortb.py Wed May 13 06:14:42 2015 -0400 @@ -24,7 +24,7 @@ import sys import os import tempfile -from seq_analysis_utils import stop_err, split_fasta, run_jobs, thread_count +from seq_analysis_utils import sys_exit, split_fasta, run_jobs, thread_count FASTA_CHUNK = 500 @@ -33,7 +33,7 @@ sys.exit(os.system("psort --version")) if len(sys.argv) != 8: - stop_err("Require 7 arguments, number of threads (int), type (e.g. archaea), " + sys_exit("Require 7 arguments, number of threads (int), type (e.g. archaea), " "output (e.g. terse/normal/long), cutoff, divergent, input protein " "FASTA file & output tabular file") @@ -56,7 +56,7 @@ if out_type == "terse": header = ['SeqID', 'Localization', 'Score'] elif out_type == "normal": - stop_err("Normal output not implemented yet, sorry.") + sys_exit("Normal output not implemented yet, sorry.") elif out_type == "long": if org_type == "-n": #Gram negative bacteria @@ -93,9 +93,9 @@ 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score', 'Secondary_Localization', 'PSortb_Version'] else: - stop_err("Expected -n, -p or -a for the organism type, not %r" % org_type) + sys_exit("Expected -n, -p or -a for the organism type, not %r" % org_type) else: - stop_err("Expected terse, normal or long for the output type, not %r" % out_type) + sys_exit("Expected terse, normal or long for the output type, not %r" % out_type) tmp_dir = tempfile.mkdtemp() @@ -149,7 +149,7 @@ except IOError: output = "" clean_up(fasta_files + temp_files) - stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), + sys_exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), error_level) del results del jobs @@ -163,7 +163,7 @@ data_handle.close() if not count: clean_up(fasta_files + temp_files) - stop_err("No output from psortb") + sys_exit("No output from psortb") out_handle.close() print "%i records" % count
--- a/tools/protein_analysis/psortb.xml Fri Nov 21 08:17:36 2014 -0500 +++ b/tools/protein_analysis/psortb.xml Wed May 13 06:14:42 2015 -0400 @@ -1,61 +1,62 @@ -<tool id="Psortb" name="psortb" version="0.0.5"> - <description>Determines sub-cellular localisation of bacterial/archaeal protein sequences</description> - <!-- If job splitting is enabled, break up the query file into parts --> - <!-- Using 2000 chunks meaning 4 threads doing 500 each is ideal --> - <parallelism method="basic" split_inputs="fasta_file" split_mode="to_size" split_size="2000" merge_outputs="tabular_file"></parallelism> - <version_command interpreter="python">psortb.py --version</version_command> - <command interpreter="python"> - psortb.py "\$GALAXY_SLOTS" "$type" "$long" "$cutoff" "$divergent" "$sequence" "$outfile" - ##If the environment variable isn't set, get "", and python wrapper - ##defaults to four threads. - </command> - <stdio> - <!-- Anything other than zero is an error --> - <exit_code range="1:" /> - <exit_code range=":-1" /> - </stdio> - <inputs> - <param format="fasta" name="sequence" type="data" - label="Input sequences for which to predict localisation (protein FASTA format)" /> - <param name="type" type="select" - label="Organism type (N.B. all sequences in the above file must be of the same type)" > - <option value="-p">Gram positive bacteria</option> - <option value="-n">Gram negative bacteria</option> - <option value="-a">Archaea</option> - </param> - <param name="long" type="select" label="Output type"> - <option value="terse">Short (terse, tabular with 3 columns)</option> - <!-- The normal output is text, not tabular - worth offering? - <option value="normal">Normal</option> - --> - <option value="long">Long (verbose, tabular with about 30 columns, depending on organism type)</option> - </param> - <param name="cutoff" size="10" type="float" optional="true" value="" - label="Sets a cutoff value for reported results (e.g. 7.5)" - help="Leave blank or use zero for no cutoff." /> - <param name="divergent" size="10" type="float" optional="true" value="" - label="Sets a cutoff value for the multiple localization flag (e.g. 4.5)" - help="Leave blank or use zero for no cutoff." /> - </inputs> - <outputs> - <data format="tabular" name="outfile" /> - </outputs> - <requirements> - <requirement type="binary">psort</requirement> - </requirements> - <tests> - <test> - <param name="sequence" value="empty.fasta" ftype="fasta"/> - <param name="long" value="terse"/> - <output name="outfile" file="empty_psortb_terse.tabular" ftype="tabular"/> - </test> - <test> - <param name="sequence" value="k12_ten_proteins.fasta" ftype="fasta"/> - <param name="long" value="terse"/> - <output name="outfile" file="k12_ten_proteins_psortb_p_terse.tabular" ftype="tabular"/> - </test> - </tests> - <help> +<tool id="Psortb" name="psortb" version="0.0.7"> + <description>Determines sub-cellular localisation of bacterial/archaeal protein sequences</description> + <!-- If job splitting is enabled, break up the query file into parts --> + <!-- Using 2000 chunks meaning 4 threads doing 500 each is ideal --> + <parallelism method="basic" split_inputs="fasta_file" split_mode="to_size" split_size="2000" merge_outputs="tabular_file"></parallelism> + <requirements> + <requirement type="binary">psort</requirement> + <requirement type="package">psort</requirement> + </requirements> + <stdio> + <!-- Anything other than zero is an error --> + <exit_code range="1:" /> + <exit_code range=":-1" /> + </stdio> + <version_command interpreter="python">psortb.py --version</version_command> + <command interpreter="python"> +psortb.py "\$GALAXY_SLOTS" "$type" "$long" "$cutoff" "$divergent" "$sequence" "$outfile" +##If the environment variable isn't set, get "", and python wrapper +##defaults to four threads. + </command> + <inputs> + <param format="fasta" name="sequence" type="data" + label="Input sequences for which to predict localisation (protein FASTA format)" /> + <param name="type" type="select" + label="Organism type (N.B. all sequences in the above file must be of the same type)" > + <option value="-p">Gram positive bacteria</option> + <option value="-n">Gram negative bacteria</option> + <option value="-a">Archaea</option> + </param> + <param name="long" type="select" label="Output type"> + <option value="terse">Short (terse, tabular with 3 columns)</option> + <!-- The normal output is text, not tabular - worth offering? + <option value="normal">Normal</option> + --> + <option value="long">Long (verbose, tabular with about 30 columns, depending on organism type)</option> + </param> + <param name="cutoff" size="10" type="float" optional="true" value="" + label="Sets a cutoff value for reported results (e.g. 7.5)" + help="Leave blank or use zero for no cutoff." /> + <param name="divergent" size="10" type="float" optional="true" value="" + label="Sets a cutoff value for the multiple localization flag (e.g. 4.5)" + help="Leave blank or use zero for no cutoff." /> + </inputs> + <outputs> + <data format="tabular" name="outfile" /> + </outputs> + <tests> + <test> + <param name="sequence" value="empty.fasta" ftype="fasta"/> + <param name="long" value="terse"/> + <output name="outfile" file="empty_psortb_terse.tabular" ftype="tabular"/> + </test> + <test> + <param name="sequence" value="k12_ten_proteins.fasta" ftype="fasta"/> + <param name="long" value="terse"/> + <output name="outfile" file="k12_ten_proteins_psortb_p_terse.tabular" ftype="tabular"/> + </test> + </tests> + <help> **What it does** @@ -99,9 +100,9 @@ This wrapper is available to install into other Galaxy Instances via the Galaxy Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/tmhmm_and_signalp + </help> <citations> <citation type="doi">10.7717/peerj.167</citation> <citation type="doi">10.1093/bioinformatics/btq249</citation> </citations> - </help> </tool>
--- a/tools/protein_analysis/rxlr_motifs.py Fri Nov 21 08:17:36 2014 -0500 +++ b/tools/protein_analysis/rxlr_motifs.py Wed May 13 06:14:42 2015 -0400 @@ -40,10 +40,14 @@ import sys import re import subprocess -from seq_analysis_utils import stop_err, fasta_iterator +from seq_analysis_utils import sys_exit, fasta_iterator + +if "-v" in sys.argv: + print("RXLR Motifs v0.0.10") + sys.exit(0) if len(sys.argv) != 5: - stop_err("Requires four arguments: protein FASTA filename, threads, model, and output filename") + sys_exit("Requires four arguments: protein FASTA filename, threads, model, and output filename") fasta_file, threads, model, tabular_file = sys.argv[1:] hmm_output_file = tabular_file + ".hmm.tmp" @@ -53,36 +57,36 @@ hmmer_search = "hmmsearch2" if model == "Bhattacharjee2006": - signalp_trunc = 70 - re_rxlr = re.compile("R.LR") - min_sp = 10 - max_sp = 40 - max_sp_rxlr = 100 - min_rxlr_start = 1 - #Allow signal peptide to be at most 40aa, and want RXLR to be - #within 100aa, therefore for the prescreen the max start is 140: - max_rxlr_start = max_sp + max_sp_rxlr + signalp_trunc = 70 + re_rxlr = re.compile("R.LR") + min_sp = 10 + max_sp = 40 + max_sp_rxlr = 100 + min_rxlr_start = 1 + # Allow signal peptide to be at most 40aa, and want RXLR to be + # within 100aa, therefore for the prescreen the max start is 140: + max_rxlr_start = max_sp + max_sp_rxlr elif model == "Win2007": - signalp_trunc = 70 - re_rxlr = re.compile("R.LR") - min_sp = 10 - max_sp = 40 - min_rxlr_start = 30 - max_rxlr_start = 60 - #No explicit limit on separation of signal peptide clevage - #and RXLR, but shortest signal peptide is 10, and furthest - #away RXLR is 60, so effectively limit is 50. - max_sp_rxlr = max_rxlr_start - min_sp + 1 + signalp_trunc = 70 + re_rxlr = re.compile("R.LR") + min_sp = 10 + max_sp = 40 + min_rxlr_start = 30 + max_rxlr_start = 60 + # No explicit limit on separation of signal peptide clevage + # and RXLR, but shortest signal peptide is 10, and furthest + # away RXLR is 60, so effectively limit is 50. + max_sp_rxlr = max_rxlr_start - min_sp + 1 elif model == "Whisson2007": - signalp_trunc = 0 #zero for no truncation - re_rxlr = re.compile("R.LR.{,40}[ED][ED][KR]") - min_sp = 10 - max_sp = 40 - max_sp_rxlr = 100 - min_rxlr_start = 1 - max_rxlr_start = max_sp + max_sp_rxlr + signalp_trunc = 0 # zero for no truncation + re_rxlr = re.compile("R.LR.{,40}[ED][ED][KR]") + min_sp = 10 + max_sp = 40 + max_sp_rxlr = 100 + min_rxlr_start = 1 + max_rxlr_start = max_sp + max_sp_rxlr else: - stop_err("Did not recognise the model name %r\n" + sys_exit("Did not recognise the model name %r\n" "Use Bhattacharjee2006, Win2007, or Whisson2007" % model) @@ -108,49 +112,49 @@ hmm_file = os.path.join(os.path.split(sys.argv[0])[0], "whisson_et_al_rxlr_eer_cropped.hmm") if not os.path.isfile(hmm_file): - stop_err("Missing HMM file for Whisson et al. (2007)") + sys_exit("Missing HMM file for Whisson et al. (2007)") if not get_hmmer_version(hmmer_search, "HMMER 2.3.2 (Oct 2003)"): - stop_err("Missing HMMER 2.3.2 (Oct 2003) binary, %s" % hmmer_searcher) + sys_exit("Missing HMMER 2.3.2 (Oct 2003) binary, %s" % hmmer_search) hmm_hits = set() valid_ids = set() for title, seq in fasta_iterator(fasta_file): name = title.split(None,1)[0] if name in valid_ids: - stop_err("Duplicated identifier %r" % name) + sys_exit("Duplicated identifier %r" % name) else: valid_ids.add(name) if not valid_ids: - #Special case, don't need to run HMMER if there are no sequences + # Special case, don't need to run HMMER if there are no sequences pass else: - #I've left the code to handle HMMER 3 in situ, in case - #we revisit the choice to insist on HMMER 2. + # I've left the code to handle HMMER 3 in situ, in case + # we revisit the choice to insist on HMMER 2. hmmer3 = (3 == get_hmmer_version(hmmer_search)) - #Using zero (or 5.6?) for bitscore threshold + # Using zero (or 5.6?) for bitscore threshold if hmmer3: - #The HMMER3 table output is easy to parse - #In HMMER3 can't use both -T and -E + # The HMMER3 table output is easy to parse + # In HMMER3 can't use both -T and -E cmd = "%s -T 0 --tblout %s --noali %s %s > /dev/null" \ % (hmmer_search, hmm_output_file, hmm_file, fasta_file) else: - #For HMMER2 we are stuck with parsing stdout - #Put 1e6 to effectively have no expectation threshold (otherwise - #HMMER defaults to 10 and the calculated e-value depends on the - #input FASTA file, and we can loose hits of interest). + # For HMMER2 we are stuck with parsing stdout + # Put 1e6 to effectively have no expectation threshold (otherwise + # HMMER defaults to 10 and the calculated e-value depends on the + # input FASTA file, and we can loose hits of interest). cmd = "%s -T 0 -E 1e6 %s %s > %s" \ % (hmmer_search, hmm_file, fasta_file, hmm_output_file) return_code = os.system(cmd) if return_code: - stop_err("Error %i from hmmsearch:\n%s" % (return_code, cmd), return_code) + sys_exit("Error %i from hmmsearch:\n%s" % (return_code, cmd), return_code) handle = open(hmm_output_file) for line in handle: if not line.strip(): - #We expect blank lines in the HMMER2 stdout + # We expect blank lines in the HMMER2 stdout continue elif line.startswith("#"): - #Header + # Header continue else: name = line.split(None,1)[0] @@ -159,18 +163,18 @@ if name in valid_ids: hmm_hits.add(name) elif hmmer3: - stop_err("Unexpected identifer %r in hmmsearch output" % name) + sys_exit("Unexpected identifer %r in hmmsearch output" % name) handle.close() - #if hmmer3: - # print "HMMER3 hits for %i/%i" % (len(hmm_hits), len(valid_ids)) - #else: - # print "HMMER2 hits for %i/%i" % (len(hmm_hits), len(valid_ids)) - #print "%i/%i matched HMM" % (len(hmm_hits), len(valid_ids)) + # if hmmer3: + # print "HMMER3 hits for %i/%i" % (len(hmm_hits), len(valid_ids)) + # else: + # print "HMMER2 hits for %i/%i" % (len(hmm_hits), len(valid_ids)) + # print "%i/%i matched HMM" % (len(hmm_hits), len(valid_ids)) os.remove(hmm_output_file) del valid_ids -#Prepare short list of candidates containing RXLR to pass to SignalP +# Prepare short list of candidates containing RXLR to pass to SignalP assert min_rxlr_start > 0, "Min value one, since zero based counting" count = 0 total = 0 @@ -180,27 +184,28 @@ name = title.split(None,1)[0] match = re_rxlr.search(seq[min_rxlr_start-1:].upper()) if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start: - #This is a potential RXLR, depending on the SignalP results. - #Might as well truncate the sequence now, makes the temp file smaller + # This is a potential RXLR, depending on the SignalP results. + # Might as well truncate the sequence now, makes the temp file smaller if signalp_trunc: handle.write(">%s (truncated)\n%s\n" % (name, seq[:signalp_trunc])) else: - #Does it matter we don't line wrap? + # Does it matter we don't line wrap? handle.write(">%s\n%s\n" % (name, seq)) count += 1 handle.close() -#print "Running SignalP on %i/%i potentials." % (count, total) +# print "Running SignalP on %i/%i potentials." % (count, total) -#Run SignalP (using our wrapper script to get multi-core support etc) +# Run SignalP (using our wrapper script to get multi-core support etc) signalp_script = os.path.join(os.path.split(sys.argv[0])[0], "signalp3.py") if not os.path.isfile(signalp_script): - stop_err("Error - missing signalp3.py script") + sys_exit("Error - missing signalp3.py script") cmd = "python %s euk %i %s %s %s" % (signalp_script, signalp_trunc, threads, signalp_input_file, signalp_output_file) return_code = os.system(cmd) if return_code: - stop_err("Error %i from SignalP:\n%s" % (return_code, cmd)) -#print "SignalP done" + sys_exit("Error %i from SignalP:\n%s" % (return_code, cmd)) +# print "SignalP done" + def parse_signalp(filename): """Parse SignalP output, yield tuples of ID, HMM_Sprob_score and NN predicted signal peptide length. @@ -217,7 +222,7 @@ handle.close() -#Parse SignalP results and apply the strict RXLR criteria +# Parse SignalP results and apply the strict RXLR criteria total = 0 tally = dict() handle = open(tabular_file, "w") @@ -230,26 +235,26 @@ match = re_rxlr.search(seq[min_rxlr_start-1:].upper()) if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start: del match - #This was the criteria for calling SignalP, + # This was the criteria for calling SignalP, #so it will be in the SignalP results. sp_id, sp_hmm_score, sp_nn_len = signalp_results.next() assert name == sp_id, "%s vs %s" % (name, sp_id) if sp_hmm_score >= min_signalp_hmm and min_sp <= sp_nn_len <= max_sp: match = re_rxlr.search(seq[sp_nn_len:].upper()) - if match and match.start() + 1 <= max_sp_rxlr: #1-based counting + if match and match.start() + 1 <= max_sp_rxlr: # 1-based counting rxlr_start = sp_nn_len + match.start() + 1 if min_rxlr_start <= rxlr_start <= max_rxlr_start: rxlr = "Y" if model == "Whisson2007": - #Combine the signalp with regular expression heuristic and the HMM + # Combine the signalp with regular expression heuristic and the HMM if name in hmm_hits and rxlr == "N": - rxlr = "hmm" #HMM only + rxlr = "hmm" # HMM only elif rxlr == "N": - rxlr = "neither" #Don't use N (no) + rxlr = "neither" # Don't use N (no) elif name not in hmm_hits and rxlr == "Y": - rxlr = "re" #Heuristic only - #Now have a four way classifier: Y, hmm, re, neither - #and count is the number of Y results (both HMM and heuristic) + rxlr = "re" # Heuristic only + # Now have a four way classifier: Y, hmm, re, neither + # and count is the number of Y results (both HMM and heuristic) handle.write("%s\t%s\n" % (name, rxlr)) try: tally[rxlr] += 1 @@ -258,17 +263,17 @@ handle.close() assert sum(tally.values()) == total -#Check the iterator is finished +# Check the iterator is finished try: signalp_results.next() assert False, "Unexpected data in SignalP output" except StopIteration: pass -#Cleanup +# Cleanup os.remove(signalp_input_file) os.remove(signalp_output_file) -#Short summary to stdout for Galaxy's info display +# Short summary to stdout for Galaxy's info display print "%s for %i sequences:" % (model, total) print ", ".join("%s = %i" % kv for kv in sorted(tally.iteritems()))
--- a/tools/protein_analysis/rxlr_motifs.xml Fri Nov 21 08:17:36 2014 -0500 +++ b/tools/protein_analysis/rxlr_motifs.xml Wed May 13 06:14:42 2015 -0400 @@ -1,13 +1,22 @@ -<tool id="rxlr_motifs" name="RXLR Motifs" version="0.0.9"> +<tool id="rxlr_motifs" name="RXLR Motifs" version="0.0.11"> <description>Find RXLR Effectors of Plant Pathogenic Oomycetes</description> - <command interpreter="python"> - rxlr_motifs.py "$fasta_file" "\$GALAXY_SLOTS" $model "$tabular_file" - </command> + <requirements> + <!-- Need SignalP for all the models --> + <requirement type="binary">signalp</requirement> + <requirement type="package">signalp</requirement> + <!-- Need HMMER for Whisson et al. (2007) --> + <requirement type="binary">hmmsearch</requirement> + <requirement type="package">hmmsearch</requirement> + </requirements> <stdio> <!-- Anything other than zero is an error --> <exit_code range="1:" /> <exit_code range=":-1" /> </stdio> + <version_command interpreter="python">rxlr_motifs.py -v</version_command> + <command interpreter="python"> + rxlr_motifs.py "$fasta_file" "\$GALAXY_SLOTS" $model "$tabular_file" + </command> <inputs> <param name="fasta_file" type="data" format="fasta" label="FASTA file of protein sequences" /> <param name="model" type="select" label="Which RXLR model?"> @@ -19,12 +28,6 @@ <outputs> <data name="tabular_file" format="tabular" label="$model.value_label" /> </outputs> - <requirements> - <!-- Need SignalP for all the models --> - <requirement type="binary">signalp</requirement> - <!-- Need HMMER for Whisson et al. (2007) --> - <requirement type="binary">hmmsearch</requirement> - </requirements> <tests> <test> <param name="fasta_file" value="rxlr_win_et_al_2007.fasta" ftype="fasta" />
--- a/tools/protein_analysis/seq_analysis_utils.py Fri Nov 21 08:17:36 2014 -0500 +++ b/tools/protein_analysis/seq_analysis_utils.py Wed May 13 06:14:42 2015 -0400 @@ -14,7 +14,7 @@ __version__ = "0.0.1" -def stop_err(msg, error_level=1): +def sys_exit(msg, error_level=1): """Print error message to stdout and quit with given error level.""" sys.stderr.write("%s\n" % msg) sys.exit(error_level) @@ -57,7 +57,7 @@ except: num = default if num < 1: - stop_err("Threads argument %r is not a positive integer" % command_line_arg) + sys_exit("Threads argument %r is not a positive integer" % command_line_arg) #Cap this with the pysical limit of the machine, try: num = min(num, cpu_count())
--- a/tools/protein_analysis/signalp3.py Fri Nov 21 08:17:36 2014 -0500 +++ b/tools/protein_analysis/signalp3.py Wed May 13 06:14:42 2015 -0400 @@ -56,28 +56,28 @@ import sys import os import tempfile -from seq_analysis_utils import stop_err, split_fasta, fasta_iterator +from seq_analysis_utils import sys_exit, split_fasta, fasta_iterator from seq_analysis_utils import run_jobs, thread_count FASTA_CHUNK = 500 MAX_LEN = 6000 #Found by trial and error if len(sys.argv) not in [6,8]: - stop_err("Require five (or 7) arguments, organism, truncate, threads, " + sys_exit("Require five (or 7) arguments, organism, truncate, threads, " "input protein FASTA file & output tabular file (plus " "optionally cut method and GFF3 output file). " "Got %i arguments." % (len(sys.argv)-1)) organism = sys.argv[1] if organism not in ["euk", "gram+", "gram-"]: - stop_err("Organism argument %s is not one of euk, gram+ or gram-" % organism) + sys_exit("Organism argument %s is not one of euk, gram+ or gram-" % organism) try: truncate = int(sys.argv[2]) except: truncate = 0 if truncate < 0: - stop_err("Truncate argument %s is not a positive integer (or zero)" % sys.argv[2]) + sys_exit("Truncate argument %s is not a positive integer (or zero)" % sys.argv[2]) num_threads = thread_count(sys.argv[3], default=4) fasta_file = sys.argv[4] @@ -86,7 +86,7 @@ if len(sys.argv) == 8: cut_method = sys.argv[6] if cut_method not in ["NN_Cmax", "NN_Ymax", "NN_Smax", "HMM_Cmax"]: - stop_err("Invalid cut method %r" % cut_method) + sys_exit("Invalid cut method %r" % cut_method) gff3_file = sys.argv[7] else: cut_method = None @@ -197,7 +197,7 @@ output = "(no output)" if error_level or output.lower().startswith("error running"): clean_up(fasta_files + temp_files) - stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), + sys_exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), error_level) del results
--- a/tools/protein_analysis/signalp3.xml Fri Nov 21 08:17:36 2014 -0500 +++ b/tools/protein_analysis/signalp3.xml Wed May 13 06:14:42 2015 -0400 @@ -1,18 +1,22 @@ -<tool id="signalp3" name="SignalP 3.0" version="0.0.14"> +<tool id="signalp3" name="SignalP 3.0" version="0.0.15"> <description>Find signal peptides in protein sequences</description> <!-- If job splitting is enabled, break up the query file into parts --> <!-- Using 2000 chunks meaning 4 threads doing 500 each is ideal --> <parallelism method="basic" split_inputs="fasta_file" split_mode="to_size" split_size="2000" merge_outputs="tabular_file"></parallelism> + <requirements> + <requirement type="binary">signalp</requirement> + <requirement type="package">signalp</requirement> + </requirements> + <stdio> + <!-- Anything other than zero is an error --> + <exit_code range="1:" /> + <exit_code range=":-1" /> + </stdio> <command interpreter="python"> signalp3.py $organism $truncate "\$GALAXY_SLOTS" $fasta_file $tabular_file ##If the environment variable isn't set, get "", and the python wrapper ##defaults to four threads. </command> - <stdio> - <!-- Anything other than zero is an error --> - <exit_code range="1:" /> - <exit_code range=":-1" /> - </stdio> <inputs> <param name="fasta_file" type="data" format="fasta" label="FASTA file of protein sequences"/> <param name="organism" type="select" display="radio" label="Organism"> @@ -27,9 +31,6 @@ <outputs> <data name="tabular_file" format="tabular" label="SignalP $organism results" /> </outputs> - <requirements> - <requirement type="binary">signalp</requirement> - </requirements> <tests> <test> <param name="fasta_file" value="four_human_proteins.fasta" ftype="fasta"/>
--- a/tools/protein_analysis/suite_config.xml Fri Nov 21 08:17:36 2014 -0500 +++ b/tools/protein_analysis/suite_config.xml Wed May 13 06:14:42 2015 -0400 @@ -1,21 +1,21 @@ - <suite id="tmhmm_and_signalp" name="Protein/gene sequence analysis tools" version="0.2.6"> + <suite id="tmhmm_and_signalp" name="Protein/gene sequence analysis tools" version="0.2.8"> <description>TMHMM, SignalP, RXLR motifs, WoLF PSORT</description> - <tool id="tmhmm2" name="TMHMM 2.0" version="0.0.12"> + <tool id="tmhmm2" name="TMHMM 2.0" version="0.0.14"> <description>Find transmembrane domains in protein sequences</description> </tool> - <tool id="signalp3" name="SignalP 3.0" version="0.0.13"> + <tool id="signalp3" name="SignalP 3.0" version="0.0.15"> <description>Find signal peptides in protein sequences</description> </tool> - <tool id="promoter2" name="Promoter 2.0" version="0.0.7"> + <tool id="promoter2" name="Promoter 2.0" version="0.0.9"> <description>Find eukaryotic PolII promoters in DNA sequences</description> </tool> - <tool id="psortb" name="PSORTb" version="0.0.4"> + <tool id="psortb" name="PSORTb" version="0.0.6"> <description>Bacteria/archaea protein subcellular localization prediction</description> </tool> - <tool id="wolf_psort" name="WoLF PSORT" version="0.0.7"> + <tool id="wolf_psort" name="WoLF PSORT" version="0.0.9"> <description>Eukaryote protein subcellular localization prediction</description> </tool> - <tool id="rxlr_motifs" name="RXLR Motifs" version="0.0.8"> + <tool id="rxlr_motifs" name="RXLR Motifs" version="0.0.11"> <description>Find RXLR Effectors of Plant Pathogenic Oomycetes</description> </tool> </suite>
--- a/tools/protein_analysis/tmhmm2.py Fri Nov 21 08:17:36 2014 -0500 +++ b/tools/protein_analysis/tmhmm2.py Wed May 13 06:14:42 2015 -0400 @@ -43,12 +43,12 @@ import sys import os import tempfile -from seq_analysis_utils import stop_err, split_fasta, run_jobs, thread_count +from seq_analysis_utils import sys_exit, split_fasta, run_jobs, thread_count FASTA_CHUNK = 500 if len(sys.argv) != 4: - stop_err("Require three arguments, number of threads (int), input protein FASTA file & output tabular file") + sys_exit("Require three arguments, number of threads (int), input protein FASTA file & output tabular file") num_threads = thread_count(sys.argv[1], default=4) fasta_file = sys.argv[2] @@ -68,7 +68,7 @@ identifier, length, expAA, first60, predhel, topology = parts except: assert len(parts)!=6 - stop_err("Bad line: %r" % line) + sys_exit("Bad line: %r" % line) assert length.startswith("len="), line length = length[4:] assert expAA.startswith("ExpAA="), line @@ -112,7 +112,7 @@ except IOError: output = "" clean_up(fasta_files + temp_files) - stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), + sys_exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), error_level) del results del jobs @@ -125,7 +125,7 @@ data_handle.close() if not count: clean_up(fasta_files + temp_files) - stop_err("No output from tmhmm2") + sys_exit("No output from tmhmm2") out_handle.close() clean_up(fasta_files + temp_files)
--- a/tools/protein_analysis/tmhmm2.xml Fri Nov 21 08:17:36 2014 -0500 +++ b/tools/protein_analysis/tmhmm2.xml Wed May 13 06:14:42 2015 -0400 @@ -1,18 +1,22 @@ -<tool id="tmhmm2" name="TMHMM 2.0" version="0.0.13"> +<tool id="tmhmm2" name="TMHMM 2.0" version="0.0.14"> <description>Find transmembrane domains in protein sequences</description> <!-- If job splitting is enabled, break up the query file into parts --> <!-- Using 2000 chunks meaning 4 threads doing 500 each is ideal --> <parallelism method="basic" split_inputs="fasta_file" split_mode="to_size" split_size="2000" merge_outputs="tabular_file"></parallelism> + <requirements> + <requirement type="binary">tmhmm</requirement> + <requirement type="package">tmhmm</requirement> + </requirements> + <stdio> + <!-- Anything other than zero is an error --> + <exit_code range="1:" /> + <exit_code range=":-1" /> + </stdio> <command interpreter="python"> tmhmm2.py "\$GALAXY_SLOTS" $fasta_file $tabular_file ##If the environment variable isn't set, get "", and the python wrapper ##defaults to four threads. </command> - <stdio> - <!-- Anything other than zero is an error --> - <exit_code range="1:" /> - <exit_code range=":-1" /> - </stdio> <inputs> <param name="fasta_file" type="data" format="fasta" label="FASTA file of protein sequences"/> <!-- @@ -25,9 +29,6 @@ <outputs> <data name="tabular_file" format="tabular" label="TMHMM results" /> </outputs> - <requirements> - <requirement type="binary">tmhmm</requirement> - </requirements> <tests> <test> <param name="fasta_file" value="four_human_proteins.fasta" ftype="fasta"/>
--- a/tools/protein_analysis/wolf_psort.py Fri Nov 21 08:17:36 2014 -0500 +++ b/tools/protein_analysis/wolf_psort.py Wed May 13 06:14:42 2015 -0400 @@ -35,7 +35,7 @@ """ import sys import os -from seq_analysis_utils import stop_err, split_fasta, run_jobs, thread_count +from seq_analysis_utils import sys_exit, split_fasta, run_jobs, thread_count FASTA_CHUNK = 500 exe = "runWolfPsortSummary" @@ -59,11 +59,11 @@ """ if len(sys.argv) != 5: - stop_err("Require four arguments, organism, threads, input protein FASTA file & output tabular file") + sys_exit("Require four arguments, organism, threads, input protein FASTA file & output tabular file") organism = sys.argv[1] if organism not in ["animal", "plant", "fungi"]: - stop_err("Organism argument %s is not one of animal, plant, fungi" % organism) + sys_exit("Organism argument %s is not one of animal, plant, fungi" % organism) num_threads = thread_count(sys.argv[2], default=4) fasta_file = sys.argv[3] @@ -106,7 +106,7 @@ if error_level or output.lower().startswith("error running"): clean_up(fasta_files) clean_up(temp_files) - stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), + sys_exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), error_level) del results
--- a/tools/protein_analysis/wolf_psort.xml Fri Nov 21 08:17:36 2014 -0500 +++ b/tools/protein_analysis/wolf_psort.xml Wed May 13 06:14:42 2015 -0400 @@ -1,15 +1,19 @@ -<tool id="wolf_psort" name="WoLF PSORT" version="0.0.8"> +<tool id="wolf_psort" name="WoLF PSORT" version="0.0.9"> <description>Eukaryote protein subcellular localization prediction</description> + <requirements> + <requirement type="binary">runWolfPsortSummary</requirement> + <requirement type="binary">psort</requirement> + </requirements> + <stdio> + <!-- Anything other than zero is an error --> + <exit_code range="1:" /> + <exit_code range=":-1" /> + </stdio> <command interpreter="python"> wolf_psort.py $organism "\$GALAXY_SLOTS" "$fasta_file" "$tabular_file" ##If the environment variable isn't set, get "", and python wrapper ##defaults to four threads. </command> - <stdio> - <!-- Anything other than zero is an error --> - <exit_code range="1:" /> - <exit_code range=":-1" /> - </stdio> <inputs> <param name="fasta_file" type="data" format="fasta" label="FASTA file of protein sequences"/> <param name="organism" type="select" display="radio" label="Organism"> @@ -21,9 +25,6 @@ <outputs> <data name="tabular_file" format="tabular" label="WoLF PSORT $organism results" /> </outputs> - <requirements> - <requirement type="binary">runWolfPsortSummary</requirement> - </requirements> <tests> <test> <param name="fasta_file" value="four_human_proteins.fasta" ftype="fasta"/>