changeset 7:5e62aefb2918 draft

Uploaded v0.1.2 to Test Tool Shed
author peterjc
date Tue, 26 Mar 2013 14:24:56 -0400
parents 39a6e46cdda3
children 391a142c1e60
files test-data/four_human_proteins.fasta.orig test-data/four_human_proteins.predictnls.tabular test-data/four_human_proteins.signalp3.tsv test-data/four_human_proteins.tmhmm2.tsv tools/protein_analysis/README tools/protein_analysis/promoter2.py tools/protein_analysis/promoter2.xml tools/protein_analysis/rxlr_motifs.xml tools/protein_analysis/seq_analysis_utils.py tools/protein_analysis/signalp3.py tools/protein_analysis/signalp3.xml tools/protein_analysis/suite_config.xml tools/protein_analysis/tmhmm2.py tools/protein_analysis/tmhmm2.xml tools/protein_analysis/wolf_psort.py tools/protein_analysis/wolf_psort.xml
diffstat 16 files changed, 537 insertions(+), 78 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/four_human_proteins.fasta.orig	Tue Mar 26 14:24:56 2013 -0400
@@ -0,0 +1,61 @@
+>sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1
+MHPAVFLSLPDLRCSLLLLVTWVFTPVTTEITSLDTENIDEILNNADVALVNFYADWCRF
+SQMLHPIFEEASDVIKEEFPNENQVVFARVDCDQHSDIAQRYRISKYPTLKLFRNGMMMK
+REYRGQRSVKALADYIRQQKSDPIQEIRDLAEITTLDRSKRNIIGYFEQKDSDNYRVFER
+VANILHDDCAFLSAFGDVSKPERYSGDNIIYKPPGHSAPDMVYLGAMTNFDVTYNWIQDK
+CVPLVREITFENGEELTEEGLPFLILFHMKEDTESLEIFQNEVARQLISEKGTINFLHAD
+CDKFRHPLLHIQKTPADCPVIAIDSFRHMYVFGDFKDVLIPGKLKQFVFDLHSGKLHREF
+HHGPDPTDTAPGEQAQDVASSPPESSFQKLAPSEYRYTLLRDRDEL
+>sp|Q9NSY1|BMP2K_HUMAN BMP-2-inducible protein kinase OS=Homo sapiens GN=BMP2K PE=1 SV=2
+MKKFSRMPKSEGGSGGGAAGGGAGGAGAGAGCGSGGSSVGVRVFAVGRHQVTLEESLAEG
+GFSTVFLVRTHGGIRCALKRMYVNNMPDLNVCKREITIMKELSGHKNIVGYLDCAVNSIS
+DNVWEVLILMEYCRAGQVVNQMNKKLQTGFTEPEVLQIFCDTCEAVARLHQCKTPIIHRD
+LKVENILLNDGGNYVLCDFGSATNKFLNPQKDGVNVVEEEIKKYTTLSYRAPEMINLYGG
+KPITTKADIWALGCLLYKLCFFTLPFGESQVAICDGNFTIPDNSRYSRNIHCLIRFMLEP
+DPEHRPDIFQVSYFAFKFAKKDCPVSNINNSSIPSALPEPMTASEAAARKSQIKARITDT
+IGPTETSIAPRQRPKANSATTATPSVLTIQSSATPVKVLAPGEFGNHRPKGALRPGNGPE
+ILLGQGPPQQPPQQHRVLQQLQQGDWRLQQLHLQHRHPHQQQQQQQQQQQQQQQQQQQQQ
+QQQQQQHHHHHHHHLLQDAYMQQYQHATQQQQMLQQQFLMHSVYQPQPSASQYPTMMPQY
+QQAFFQQQMLAQHQPSQQQASPEYLTSPQEFSPALVSYTSSLPAQVGTIMDSSYSANRSV
+ADKEAIANFTNQKNISNPPDMSGWNPFGEDNFSKLTEEELLDREFDLLRSNRLEERASSD
+KNVDSLSAPHNHPPEDPFGSVPFISHSGSPEKKAEHSSINQENGTANPIKNGKTSPASKD
+QRTGKKTSVQGQVQKGNDESESDFESDPPSPKSSEEEEQDDEEVLQGEQGDFNDDDTEPE
+NLGHRPLLMDSEDEEEEEKHSSDSDYEQAKAKYSDMSSVYRDRSGSGPTQDLNTILLTSA
+QLSSDVAVETPKQEFDVFGAVPFFAVRAQQPQQEKNEKNLPQHRFPAAGLEQEEFDVFTK
+APFSKKVNVQECHAVGPEAHTIPGYPKSVDVFGSTPFQPFLTSTSKSESNEDLFGLVPFD
+EITGSQQQKVKQRSLQKLSSRQRRTKQDMSKSNGKRHHGTPTSTKKTLKPTYRTPERARR
+HKKVGRRDSQSSNEFLTISDSKENISVALTDGKDRGNVLQPEESLLDPFGAKPFHSPDLS
+WHPPHQGLSDIRADHNTVLPGRPRQNSLHGSFHSADVLKMDDFGAVPFTELVVQSITPHQ
+SQQSQPVELDPFGAAPFPSKQ
+>sp|P06213|INSR_HUMAN Insulin receptor OS=Homo sapiens GN=INSR PE=1 SV=4
+MATGGRRGAAAAPLLVAVAALLLGAAGHLYPGEVCPGMDIRNNLTRLHELENCSVIEGHL
+QILLMFKTRPEDFRDLSFPKLIMITDYLLLFRVYGLESLKDLFPNLTVIRGSRLFFNYAL
+VIFEMVHLKELGLYNLMNITRGSVRIEKNNELCYLATIDWSRILDSVEDNYIVLNKDDNE
+ECGDICPGTAKGKTNCPATVINGQFVERCWTHSHCQKVCPTICKSHGCTAEGLCCHSECL
+GNCSQPDDPTKCVACRNFYLDGRCVETCPPPYYHFQDWRCVNFSFCQDLHHKCKNSRRQG
+CHQYVIHNNKCIPECPSGYTMNSSNLLCTPCLGPCPKVCHLLEGEKTIDSVTSAQELRGC
+TVINGSLIINIRGGNNLAAELEANLGLIEEISGYLKIRRSYALVSLSFFRKLRLIRGETL
+EIGNYSFYALDNQNLRQLWDWSKHNLTITQGKLFFHYNPKLCLSEIHKMEEVSGTKGRQE
+RNDIALKTNGDQASCENELLKFSYIRTSFDKILLRWEPYWPPDFRDLLGFMLFYKEAPYQ
+NVTEFDGQDACGSNSWTVVDIDPPLRSNDPKSQNHPGWLMRGLKPWTQYAIFVKTLVTFS
+DERRTYGAKSDIIYVQTDATNPSVPLDPISVSNSSSQIILKWKPPSDPNGNITHYLVFWE
+RQAEDSELFELDYCLKGLKLPSRTWSPPFESEDSQKHNQSEYEDSAGECCSCPKTDSQIL
+KELEESSFRKTFEDYLHNVVFVPRKTSSGTGAEDPRPSRKRRSLGDVGNVTVAVPTVAAF
+PNTSSTSVPTSPEEHRPFEKVVNKESLVISGLRHFTGYRIELQACNQDTPEERCSVAAYV
+SARTMPEAKADDIVGPVTHEIFENNVVHLMWQEPKEPNGLIVLYEVSYRRYGDEELHLCV
+SRKHFALERGCRLRGLSPGNYSVRIRATSLAGNGSWTEPTYFYVTDYLDVPSNIAKIIIG
+PLIFVFLFSVVIGSIYLFLRKRQPDGPLGPLYASSNPEYLSASDVFPCSVYVPDEWEVSR
+EKITLLRELGQGSFGMVYEGNARDIIKGEAETRVAVKTVNESASLRERIEFLNEASVMKG
+FTCHHVVRLLGVVSKGQPTLVVMELMAHGDLKSYLRSLRPEAENNPGRPPPTLQEMIQMA
+AEIADGMAYLNAKKFVHRDLAARNCMVAHDFTVKIGDFGMTRDIYETDYYRKGGKGLLPV
+RWMAPESLKDGVFTTSSDMWSFGVVLWEITSLAEQPYQGLSNEQVLKFVMDGGYLDQPDN
+CPERVTDLMRMCWQFNPKMRPTFLEIVNLLKDDLHPSFPEVSFFHSEENKAPESEELEME
+FEDMENVPLDRSSHCQREEAGGRDGGSSLGFKRSYEEHIPYTHMNGGKKNGRILTLPRSN
+PS
+>sp|P08100|OPSD_HUMAN Rhodopsin OS=Homo sapiens GN=RHO PE=1 SV=1
+MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLY
+VTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLG
+GEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIP
+EGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQES
+ATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAI
+YNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/four_human_proteins.predictnls.tabular	Tue Mar 26 14:24:56 2013 -0400
@@ -0,0 +1,2 @@
+#ID	NLS start	NLS seq	NLS pattern	Type	ProtCount	%NucProt	ProtList	ProtLoci
+sp|P06213|INSR_HUMAN	758	SRKRRS	[STQM]RKRR[STQM]	Potential	1	100	mklp_human	nuc
--- a/test-data/four_human_proteins.signalp3.tsv	Tue Jun 07 17:41:38 2011 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,5 +0,0 @@
-#ID	NN_Cmax_score	NN_Cmax_pos	NN_Cmax_pred	NN_Ymax_score	NN_Ymax_pos	NN_Ymax_pred	NN_Smax_score	NN_Smax_pos	NN_Smax_pred	NN_Smean_score	NN_Smean_pred	NN_D_score	NN_D_pred	HMM_type	HMM_Cmax_score	HMM_Cmax_pos	HMM_Cmax_pred	HMM_Sprob_score	HMM_Sprob_pred
-sp|Q9BS26|ERP44_HUMAN	0.565	30	Y	0.686	30	Y	0.986	12	Y	0.818	Y	0.752	Y	S	0.945	30	Y	0.966	Y
-sp|Q9NSY1|BMP2K_HUMAN	0.153	869	N	0.050	270	N	0.229	12	N	0.008	N	0.029	N	Q	0.000	0	N	0.000	N
-sp|P06213|INSR_HUMAN	0.396	28	Y	0.561	28	Y	0.993	19	Y	0.902	Y	0.731	Y	Q	0.205	28	N	0.341	N
-sp|P08100|OPSD_HUMAN	0.211	52	N	0.344	52	Y	0.945	50	Y	0.245	N	0.295	N	Q	0.000	52	N	0.000	N
--- a/test-data/four_human_proteins.tmhmm2.tsv	Tue Jun 07 17:41:38 2011 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,5 +0,0 @@
-#ID	len	ExpAA	First60	PredHel	Topology
-sp|Q9BS26|ERP44_HUMAN	406	0.23	0.23	0	o
-sp|Q9NSY1|BMP2K_HUMAN	1161	0.35	0.26	0	o
-sp|P06213|INSR_HUMAN	1382	50.22	20.76	2	i9-31o957-979i
-sp|P08100|OPSD_HUMAN	348	157.99	21.69	7	o39-61i74-96o111-133i153-175o202-224i254-276o286-308i
--- a/tools/protein_analysis/README	Tue Jun 07 17:41:38 2011 -0400
+++ b/tools/protein_analysis/README	Tue Mar 26 14:24:56 2013 -0400
@@ -1,7 +1,7 @@
 This package contains Galaxy wrappers for a selection of standalone command
 line protein analysis tools:
 
-* SignalP 3.0 and THMHMM 2.0, from the Center for Biological
+* SignalP 3.0, THMHMM 2.0, Promoter 2.0 from the Center for Biological
   Sequence Analysis at the Technical University of Denmark,
   http://www.cbs.dtu.dk/cbs/
 
@@ -12,9 +12,9 @@
 To use these Galaxy wrappers you must first install the command line tools.
 At the time of writing they are all free for academic use.
 
-These wrappers are copyright 2010-2011 by Peter Cock, James Hutton Institute
+These wrappers are copyright 2010-2012 by Peter Cock, James Hutton Institute
 (formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved.
-See the included LICENCE file for details.
+See the included LICENCE file for details (an MIT style open source licence).
 
 Requirements
 ============
@@ -27,22 +27,27 @@
 2. Install the command line version of TMHMM 2.0 and ensure "tmhmm" is on
    the PATH, see: http://www.cbs.dtu.dk/services/TMHMM/
 
-3. Install the WoLF PSORT v0.2 package, and ensure "runWolfPsortSummary"
+3. Install the command line version of Promoter 2.0 and ensure "promoter" is
+   on the PATH, see: http://www.cbs.dtu.dk/services/Promoter
+
+4. Install the WoLF PSORT v0.2 package, and ensure "runWolfPsortSummary"
    is on the PATH (we use an extra wrapper script to change to the WoLF PSORT
    directory, run runWolfPsortSummary, and then change back to the original
    directory), see: http://wolfpsort.org/WoLFPSORT_package/version0.2/
 
-4. Install hmmsearch from HMMER 2.3.2 (the last stable release of HMMER 2)
+5. Install hmmsearch from HMMER 2.3.2 (the last stable release of HMMER 2)
    but put it on the path under the name hmmsearch2 (allowing it to co-exist
    with HMMER 3), or edit rlxr_motif.py accordingly.
 
 Verify each of the tools is installed and working from the command line
-(when logged in at the Galaxy user if appropriate).
+(when logged in as the Galaxy user if appropriate).
 
-Installation
-============
+Manual Installation
+===================
 
 1. Create a folder tools/protein_analysis under your Galaxy installation.
+   This folder name is not critical, and can be changed if desired - you
+   must update the paths used in tool_conf.xml to match.
 
 2. Copy/move the following files (from this archive) there:
 
@@ -52,6 +57,9 @@
 signalp3.xml (Galaxy tool definition)
 signalp3.py (Python wrapper script)
 
+promoter2.xml (Galaxy tool definition)
+promoter2.py (Python wrapper script)
+
 wolf_psort.xml (Galaxy tool definition)
 wolf_psort.py (Python wrapper script)
 
@@ -59,7 +67,8 @@
 rxlr_motifs.py (Python script)
 
 seq_analysis_utils.py (shared Python code)
-README (optional)
+LICENCE
+README (this file)
 
 3. Edit your Galaxy conjuration file tool_conf.xml (to use the tools) AND
    also tool_conf.xml.sample (to run the tests) to include the new tools
@@ -71,6 +80,9 @@
     <tool file="protein_analysis/wolf_psort.xml" />
     <tool file="protein_analysis/rxlr_motifs.xml" />
   </section>
+  <section name="Nucleotide sequence analysis" id="nucleotide_analysis">
+    <tool file="protein_analysis/promoter2.xml" />
+  </section>
 
    Leave out the lines for any tools you do not wish to use in Galaxy.
 
@@ -112,6 +124,11 @@
          SignalP webservice.
 v0.0.8 - Added WoLF PSORT wrapper to the suite.
 v0.0.9 - Added our RXLR motifs tool to the suite.
+v0.1.0 - Added Promoter 2.0 wrapper (similar to SignalP & TMHMM wrappers)
+       - Support Galaxy's <parallelism> tag for SignalP, TMHMM & Promoter
+v0.1.1 - Fixed an error in the header of the tabular output from Promoter
+v0.1.2 - Use the new <stdio> settings in the XML wrappers to catch errors
+       - Use SGE style $NSLOTS for thread count (otherwise default to 4)
 
 
 Developers
@@ -126,7 +143,7 @@
 For making the "Galaxy Tool Shed" http://community.g2.bx.psu.edu/ tarball use
 the following command from the Galaxy root folder:
 
-tar -czf tmhmm_signalp_etc.tar.gz tools/protein_analysis/LICENSE tools/protein_analysis/README tools/protein_analysis/suite_config.xml tools/protein_analysis/seq_analysis_utils.py tools/protein_analysis/signalp3.xml tools/protein_analysis/signalp3.py tools/protein_analysis/tmhmm2.xml tools/protein_analysis/tmhmm2.py tools/protein_analysis/wolf_psort.xml tools/protein_analysis/wolf_psort.py tools/protein_analysis/rxlr_motifs.xml tools/protein_analysis/rxlr_motifs.py  test-data/four_human_proteins.* test-data/empty.fasta test-data/empty_tmhmm2.tabular test-data/empty_signalp3.tabular
+tar -czf ~/tmhmm_signalp_etc.tar.gz tools/protein_analysis/LICENSE tools/protein_analysis/README tools/protein_analysis/suite_config.xml tools/protein_analysis/seq_analysis_utils.py tools/protein_analysis/signalp3.xml tools/protein_analysis/signalp3.py tools/protein_analysis/tmhmm2.xml tools/protein_analysis/tmhmm2.py tools/protein_analysis/promoter2.xml tools/protein_analysis/promoter2.py tools/protein_analysis/wolf_psort.xml tools/protein_analysis/wolf_psort.py tools/protein_analysis/rxlr_motifs.xml tools/protein_analysis/rxlr_motifs.py  test-data/four_human_proteins.* test-data/empty.fasta test-data/empty_tmhmm2.tabular test-data/empty_signalp3.tabular
 
 Check this worked:
 
@@ -139,6 +156,8 @@
 tools/protein_analysis/signalp3.py
 tools/protein_analysis/tmhmm2.xml
 tools/protein_analysis/tmhmm2.py
+tools/protein_analysis/promoter2.xml
+tools/protein_analysis/promoter2.py
 tools/protein_analysis/wolf_psort.xml
 tools/protein_analysis/wolf_psort.py
 tools/protein_analysis/rxlr_motifs.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/protein_analysis/promoter2.py	Tue Mar 26 14:24:56 2013 -0400
@@ -0,0 +1,156 @@
+#!/usr/bin/env python
+"""Wrapper for Promoter 2.0 for use in Galaxy.
+
+This script takes exactly three command line arguments:
+ * number of threads
+ * an input DNA FASTA filename
+ * output tabular filename.
+
+It calls the Promoter 2.0 binary (e.g. .../promoter-2.0/bin/promoter_Linux,
+bypassing the Perl wrapper script 'promoter' which imposes a significant
+performace overhead for no benefit here (we don't need HTML output for
+example).
+
+The main feature is this Python wrapper script parsers the bespoke
+tabular output from Promoter 2.0 and reformats it into a Galaxy friendly
+tab separated table.
+
+Additionally, in order to take advantage of multiple cores the input FASTA
+file is broken into chunks and multiple copies of promoter run at once.
+This can be used in combination with the job-splitting available in Galaxy.
+
+Note that rewriting the FASTA input file allows us to avoid a bug in
+promoter 2 with long descriptions in the FASTA header line (over 200
+characters) which produces stray fragements of the description in the
+output file, making parsing non-trivial.
+
+TODO - Automatically extract the sequence containing a promoter prediction?
+"""
+import sys
+import os
+import commands
+import tempfile
+from seq_analysis_utils import stop_err, split_fasta, run_jobs, thread_count
+
+FASTA_CHUNK = 500
+
+if len(sys.argv) != 4:
+    stop_err("Require three arguments, number of threads (int), input DNA FASTA file & output tabular file. "
+             "Got %i arguments." % (len(sys.argv)-1))
+
+num_threads = thread_count(sys.argv[3],default=4)
+fasta_file = os.path.abspath(sys.argv[2])
+tabular_file = os.path.abspath(sys.argv[3])
+
+tmp_dir = tempfile.mkdtemp()
+
+def get_path_and_binary():
+    platform = commands.getoutput("uname") #e.g. Linux
+    shell_script = commands.getoutput("which promoter")
+    if not os.path.isfile(shell_script):
+        stop_err("ERROR: Missing promoter executable shell script")
+    path = None
+    for line in open(shell_script):
+        if line.startswith("setenv"): #could then be tab or space!
+            parts = line.rstrip().split(None, 2)
+            if parts[0] == "setenv" and parts[1] == "PROM":
+                path = parts[2]
+    if not path:
+        stop_err("ERROR: Could not find promoter path (PROM) in %r" % shell_script)
+    if not os.path.isdir(path):
+        stop_error("ERROR: %r is not a directory" % path)
+    bin = "%s/bin/promoter_%s" % (path, platform)
+    if not os.path.isfile(bin):
+        stop_err("ERROR: Missing promoter binary %r" % bin)
+    return path, bin
+
+def make_tabular(raw_handle, out_handle):
+    """Parse text output into tabular, return query count."""
+    identifier = None
+    queries = 0
+    for line in raw_handle:
+        #print repr(line)
+        if not line.strip() or line == "Promoter prediction:\n":
+            pass
+        elif line[0] != " ":
+            identifier = line.strip().replace("\t", " ").split(None,1)[0]
+            queries += 1
+        elif line == "  No promoter predicted\n":
+            #End of a record
+            identifier = None
+        elif line == "  Position  Score  Likelihood\n":
+            assert identifier
+        else:
+            try:
+                position, score, likelihood = line.strip().split(None,2)
+            except ValueError:
+                print "WARNING: Problem with line: %r" % line
+                continue
+                #stop_err("ERROR: Problem with line: %r" % line)
+            if likelihood not in ["ignored",
+                                  "Marginal prediction",
+                                  "Medium likely prediction",
+                                  "Highly likely prediction"]:
+                stop_err("ERROR: Problem with line: %r" % line)
+            out_handle.write("%s\t%s\t%s\t%s\n" % (identifier, position, score, likelihood))
+    return queries
+    
+working_dir, bin = get_path_and_binary()
+
+if not os.path.isfile(fasta_file):
+   stop_err("ERROR: Missing input FASTA file %r" % fasta_file)
+
+#Note that if the input FASTA file contains no sequences,
+#split_fasta returns an empty list (i.e. zero temp files).
+#We deliberately omit the FASTA descriptions to avoid a
+#bug in promoter2 with descriptions over 200 characters.
+fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "promoter"), FASTA_CHUNK, keep_descr=False)
+temp_files = [f+".out" for f in fasta_files]
+jobs = ["%s %s > %s" % (bin, fasta, temp)
+        for fasta, temp in zip(fasta_files, temp_files)]
+
+def clean_up(file_list):
+    for f in file_list:
+        if os.path.isfile(f):
+            os.remove(f)
+    try:
+        os.rmdir(tmp_dir)
+    except:
+        pass
+
+if len(jobs) > 1 and num_threads > 1:
+    #A small "info" message for Galaxy to show the user.
+    print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs))
+cur_dir = os.path.abspath(os.curdir)
+os.chdir(working_dir)
+results = run_jobs(jobs, num_threads)
+os.chdir(cur_dir)
+for fasta, temp, cmd in zip(fasta_files, temp_files, jobs):
+    error_level = results[cmd]
+    if error_level:
+        try:
+            output = open(temp).readline()
+        except IOError:
+            output = ""
+        clean_up(fasta_files + temp_files)
+        stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output),
+                 error_level)
+
+del results
+del jobs
+
+out_handle = open(tabular_file, "w")
+out_handle.write("#Identifier\tPosition\tScore\tLikelihood\n")
+queries = 0
+for temp in temp_files:
+    data_handle = open(temp)
+    count = make_tabular(data_handle, out_handle)
+    data_handle.close()
+    if not count:
+        clean_up(fasta_files + temp_files)
+        stop_err("No output from promoter2")
+    queries += count
+out_handle.close()
+
+clean_up(fasta_files + temp_files)
+print "Results for %i queries" % queries
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/protein_analysis/promoter2.xml	Tue Mar 26 14:24:56 2013 -0400
@@ -0,0 +1,62 @@
+<tool id="promoter2" name="Promoter 2.0" version="0.0.3">
+    <description>Find eukaryotic PolII promoters in DNA sequences</description>
+    <!-- If job splitting is enabled, break up the query file into parts -->
+    <!-- Using 2000 per chunk so 4 threads each doing 500 is ideal -->
+    <parallelism method="basic" split_inputs="fasta_file" split_mode="to_size" split_size="2000" merge_outputs="tabular_file"></parallelism>
+    <command interpreter="python">
+        promoter2.py "\$NSLOTS" $fasta_file $tabular_file
+        ##Set the number of threads in the runner entry in universe_wsgi.ini
+        ##which (on SGE at least) will set the $NSLOTS environment variable.
+        ##If the environment variable isn't set, get "", and defaults to one.
+    </command>
+    <stdio>
+        <!-- Anything other than zero is an error -->
+        <exit_code range="1:" />
+        <exit_code range=":-1" />
+    </stdio>
+    <inputs>
+        <param name="fasta_file" type="data" format="fasta" label="FASTA file of DNA sequences"/> 
+    </inputs>
+    <outputs>
+        <data name="tabular_file" format="tabular" label="Promoter2 on ${fasta_file.name}" />
+    </outputs>
+    <requirements>
+        <requirement type="binary">promoter</requirement>
+    </requirements>
+    <help>
+    
+**What it does**
+
+This calls the Promoter 2.0 tool for prediction of eukaryotic PolII promoter sequences using a Neural Network (NN) model.
+
+The input is a FASTA file of nucleotide sequences (e.g. upstream regions of your genes), and the output is tabular with five columns (one row per promoter):
+
+ 1. Sequence identifier (first word of FASTA header)
+ 2. Promoter position, e.g. 600
+ 3. Promoter score, e.g. 1.063
+ 4. Promoter likelihood, e.g. Highly likely prediction
+
+The scores are classified very simply as follows:
+
+========= ========================
+Score     Description
+--------- ------------------------
+below 0.5 ignored
+0.5 - 0.8 Marginal prediction
+0.8 - 1.0 Medium likely prediction
+above 1.0 Highly likely prediction 
+========= ========================
+
+Internally the input FASTA file is divided into parts (to allow multiple processors to be used), and the raw output is reformatted into this tabular layout suitable for downstream analysis within Galaxy.
+
+**References**
+
+Knudsen.
+Promoter2.0: for the recognition of PolII promoter sequences.
+Bioinformatics, 15:356-61, 1999.
+http://dx.doi.org/10.1093/bioinformatics/15.5.356
+
+http://www.cbs.dtu.dk/services/Promoter/output.php
+
+    </help>
+</tool>
--- a/tools/protein_analysis/rxlr_motifs.xml	Tue Jun 07 17:41:38 2011 -0400
+++ b/tools/protein_analysis/rxlr_motifs.xml	Tue Mar 26 14:24:56 2013 -0400
@@ -1,9 +1,14 @@
-<tool id="rxlr_motifs" name="RXLR Motifs" version="0.0.5">
+<tool id="rxlr_motifs" name="RXLR Motifs" version="0.0.6">
     <description>Find RXLR Effectors of Plant Pathogenic Oomycetes</description>
     <command interpreter="python">
       rxlr_motifs.py $fasta_file 8 $model $tabular_file
       ##I want the number of threads to be a Galaxy config option...
     </command>
+    <stdio>
+        <!-- Anything other than zero is an error -->
+        <exit_code range="1:" />
+        <exit_code range=":-1" />
+    </stdio>
     <inputs>
         <param name="fasta_file" type="data" format="fasta" label="FASTA file of protein sequences" /> 
         <param name="model" type="select" label="Which RXLR model?">
@@ -32,10 +37,10 @@
     
 **Background**
 
-Many effector proteins from Oomycete plant pathogens for manipulating the host
+Many effector proteins from oomycete plant pathogens for manipulating the host
 have been found to contain a signal peptide followed by a conserved RXLR motif
 (Arg, any amino acid, Leu, Arg), and then sometimes EER (Glu, Glu, Arg). There
-are stiking parallels with the malarial host-targeting signal (Plasmodium
+are striking parallels with the malarial host-targeting signal (Plasmodium
 export element, or "Pexel" for short).
 
 -----
--- a/tools/protein_analysis/seq_analysis_utils.py	Tue Jun 07 17:41:38 2011 -0400
+++ b/tools/protein_analysis/seq_analysis_utils.py	Tue Mar 26 14:24:56 2013 -0400
@@ -19,6 +19,56 @@
     sys.stderr.write("%s\n" % msg)
     sys.exit(error_level)
 
+try:
+    from multiprocessing import cpu_count
+except ImportError:
+    #Must be under Python 2.5, this is copied from multiprocessing:
+    def cpu_count():
+        """Returns the number of CPUs in the system."""
+        if sys.platform == 'win32':
+            try:
+                num = int(os.environ['NUMBER_OF_PROCESSORS'])
+            except (ValueError, KeyError):
+                num = 0
+        elif 'bsd' in sys.platform or sys.platform == 'darwin':
+            comm = '/sbin/sysctl -n hw.ncpu'
+            if sys.platform == 'darwin':
+                comm = '/usr' + comm
+                try:
+                    with os.popen(comm) as p:
+                        num = int(p.read())
+                except ValueError:
+                    num = 0
+        else:
+            try:
+                num = os.sysconf('SC_NPROCESSORS_ONLN')
+            except (ValueError, OSError, AttributeError):
+                num = 0
+
+        if num >= 1:
+            return num
+        else:
+            raise NotImplementedError('cannot determine number of cpus')
+
+
+def thread_count(command_line_arg, default=1):
+    try:
+        num = int(command_line_arg)
+    except:
+        num = default
+    if num < 1:
+        stop_err("Threads argument %r is not a positive integer" % command_line_arg)
+    #Cap this with the pysical limit of the machine,
+    try:
+        num = min(num, cpu_count())
+    except NotImplementedError:
+        pass
+    #For debugging,
+    #hostname = os.environ.get("HOSTNAME", "this machine")
+    #sys.stderr.write("Using %i cores on %s\n" % (num, hostname))
+    return num
+
+
 def fasta_iterator(filename, max_len=None, truncate=None):
     """Simple FASTA parser yielding tuples of (title, sequence) strings."""
     handle = open(filename)
@@ -100,6 +150,8 @@
             if os.path.isfile(f):
                 os.remove(f)
         raise err
+    for f in files:
+        assert os.path.isfile(f), "Missing split file %r (!??)" % f
     return files
 
 def run_jobs(jobs, threads, pause=10, verbose=False):
@@ -107,6 +159,11 @@
     pending = jobs[:]
     running = []
     results = {}
+    if threads == 1:
+        #Special case this for speed, don't need the waits
+        for cmd in jobs:
+            results[cmd] = subprocess.call(cmd, shell=True)
+        return results
     while pending or running:
         #See if any have finished
         for (cmd, process) in running:
--- a/tools/protein_analysis/signalp3.py	Tue Jun 07 17:41:38 2011 -0400
+++ b/tools/protein_analysis/signalp3.py	Tue Mar 26 14:24:56 2013 -0400
@@ -4,10 +4,14 @@
 This script takes exactly five command line arguments:
  * the organism type (euk, gram+ or gram-)
  * length to truncate sequences to (integer)
- * number of threads to use (integer)
+ * number of threads to use (integer, defaults to one)
  * an input protein FASTA filename
  * output tabular filename.
 
+There are two further optional arguments
+ * cut type (NN_Cmax, NN_Ymax, NN_Smax or HMM_Cmax)
+ * output GFF3 filename
+
 It then calls the standalone SignalP v3.0 program (not the webservice)
 requesting the short output (one line per protein) using both NN and HMM
 for predictions.
@@ -41,16 +45,28 @@
 run multiple copies of TMHMM in parallel. I would normally use Python's
 multiprocessing library in this situation but it requires at least Python 2.6
 and at the time of writing Galaxy still supports Python 2.4.
+
+Note that this is somewhat redundant with job-splitting available in Galaxy
+itself (see the SignalP XML file for settings).
+
+Finally, you can opt to have a GFF3 file produced which will describe the
+predicted signal peptide and mature peptide for each protein (using one of
+the predictors which gives a cleavage site). *WORK IN PROGRESS*
 """
 import sys
 import os
-from seq_analysis_utils import stop_err, split_fasta, run_jobs
+import tempfile
+from seq_analysis_utils import stop_err, split_fasta, fasta_iterator
+from seq_analysis_utils import run_jobs, thread_count
 
 FASTA_CHUNK = 500
 MAX_LEN = 6000 #Found by trial and error
 
-if len(sys.argv) != 6:
-   stop_err("Require five arguments, organism, truncate, threads, input protein FASTA file & output tabular file")
+if len(sys.argv) not in  [6,8]:
+   stop_err("Require five (or 7) arguments, organism, truncate, threads, "
+            "input protein FASTA file & output tabular file (plus "
+            "optionally cut method and GFF3 output file). "
+            "Got %i arguments." % (len(sys.argv)-1))
 
 organism = sys.argv[1]
 if organism not in ["euk", "gram+", "gram-"]:
@@ -63,19 +79,31 @@
 if truncate < 0:
    stop_err("Truncate argument %s is not a positive integer (or zero)" % sys.argv[2])
 
-try:
-   num_threads = int(sys.argv[3])
-except:
-   num_threads = 0
-if num_threads < 1:
-   stop_err("Threads argument %s is not a positive integer" % sys.argv[3])
-
+num_threads = thread_count(sys.argv[3], default=4)
 fasta_file = sys.argv[4]
-
 tabular_file = sys.argv[5]
 
-def clean_tabular(raw_handle, out_handle):
+if len(sys.argv) == 8:
+   cut_method = sys.argv[6]
+   if cut_method not in ["NN_Cmax", "NN_Ymax", "NN_Smax", "HMM_Cmax"]:
+      stop_err("Invalid cut method %r" % cut_method)
+   gff3_file = sys.argv[7]
+else:
+   cut_method = None
+   gff3_file = None
+
+
+tmp_dir = tempfile.mkdtemp()
+
+def clean_tabular(raw_handle, out_handle, gff_handle=None, cut_method=None):
     """Clean up SignalP output to make it tabular."""
+    if cut_method:
+       cut_col = {"NN_Cmax" : 2,
+                  "NN_Ymax" : 5,
+                  "NN_Smax" : 8,
+                  "HMM_Cmax" : 16}[cut_method]
+    else:
+       cut_col = None
     for line in raw_handle:
         if not line or line.startswith("#"):
             continue
@@ -87,7 +115,59 @@
         parts = parts[14:15] + parts[1:14] + parts[15:]
         out_handle.write("\t".join(parts) + "\n")
 
-fasta_files = split_fasta(fasta_file, tabular_file, n=FASTA_CHUNK, truncate=truncate, max_len=MAX_LEN)
+def make_gff(fasta_file, tabular_file, gff_file, cut_method):
+    cut_col, score_col = {"NN_Cmax" : (2,1),
+                          "NN_Ymax" : (5,4),
+                          "NN_Smax" : (8,7),
+                          "HMM_Cmax" : (16,15),
+                          }[cut_method]
+
+    source = "SignalP"
+    strand = "." #not stranded
+    phase = "." #not phased
+    tags = "Note=%s" % cut_method
+    
+    tab_handle = open(tabular_file)
+    line = tab_handle.readline()
+    assert line.startswith("#ID\t"), line
+
+    gff_handle = open(gff_file, "w")
+    gff_handle.write("##gff-version 3\n")
+
+    for (title, seq), line in zip(fasta_iterator(fasta_file), tab_handle):
+        parts = line.rstrip("\n").split("\t")
+        seqid = parts[0]
+        assert title.startswith(seqid), "%s vs %s" % (seqid, title)
+        if len(seq)==0:
+            #Is it possible to have a zero length reference in GFF3?
+            continue
+        cut = int(parts[cut_col])
+        if cut == 0:
+            assert cut_method == "HMM_Cmax", cut_method
+            #TODO - Why does it do this?
+            cut = 1
+        assert 1 <= cut <= len(seq), "%i for %s len %i" % (cut, seqid, len(seq))
+        score = parts[score_col]
+        gff_handle.write("##sequence-region %s %i %i\n" \
+                          % (seqid, 1, len(seq)))
+        #If the cut is at the very begining, there is no signal peptide!
+        if cut > 1:
+            #signal_peptide = SO:0000418
+            gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" \
+                             % (seqid, source,
+                                "signal_peptide", 1, cut-1,
+                                score, strand, phase, tags))
+        #mature_protein_region = SO:0000419
+        gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" \
+                         % (seqid, source,
+                            "mature_protein_region", cut, len(seq),
+                            score, strand, phase, tags))
+        tab_handle.close()
+    gff_handle.close()
+
+
+fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "signalp"),
+                          n=FASTA_CHUNK, truncate=truncate, max_len=MAX_LEN)
 temp_files = [f+".out" for f in fasta_files]
 assert len(fasta_files) == len(temp_files)
 jobs = ["signalp -short -t %s %s > %s" % (organism, fasta, temp)
@@ -98,6 +178,10 @@
     for f in file_list:
         if os.path.isfile(f):
             os.remove(f)
+    try:
+        os.rmdir(tmp_dir)
+    except:
+        pass
 
 if len(jobs) > 1 and num_threads > 1:
     #A small "info" message for Galaxy to show the user.
@@ -109,10 +193,9 @@
     try:
         output = open(temp).readline()
     except IOError:
-        output = ""
+        output = "(no output)"
     if error_level or output.lower().startswith("error running"):
-        clean_up(fasta_files)
-        clean_up(temp_files)
+        clean_up(fasta_files + temp_files)
         stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output),
                  error_level)
 del results
@@ -133,5 +216,8 @@
     data_handle.close()
 out_handle.close()
 
-clean_up(fasta_files)
-clean_up(temp_files)
+#GFF3:
+if cut_method:
+   make_gff(fasta_file, tabular_file, gff3_file, cut_method)
+
+clean_up(fasta_files + temp_files)
--- a/tools/protein_analysis/signalp3.xml	Tue Jun 07 17:41:38 2011 -0400
+++ b/tools/protein_analysis/signalp3.xml	Tue Mar 26 14:24:56 2013 -0400
@@ -1,9 +1,19 @@
-<tool id="signalp3" name="SignalP 3.0" version="0.0.8">
+<tool id="signalp3" name="SignalP 3.0" version="0.0.10">
     <description>Find signal peptides in protein sequences</description>
+    <!-- If job splitting is enabled, break up the query file into parts -->
+    <!-- Using 2000 chunks meaning 4 threads doing 500 each is ideal -->
+    <parallelism method="basic" split_inputs="fasta_file" split_mode="to_size" split_size="2000" merge_outputs="tabular_file"></parallelism>
     <command interpreter="python">
-      signalp3.py $organism $truncate 8 $fasta_file $tabular_file
-      ##I want the number of threads to be a Galaxy config option...
+      signalp3.py $organism $truncate "\$NSLOTS" $fasta_file $tabular_file
+      ##Set the number of threads in the runner entry in universe_wsgi.ini
+      ##which (on SGE at least) will set the $NSLOTS environment variable.
+      ##If the environment variable isn't set, get "", and defaults to one.
     </command>
+    <stdio>
+        <!-- Anything other than zero is an error -->
+        <exit_code range="1:" />
+        <exit_code range=":-1" />
+    </stdio>
     <inputs>
         <param name="fasta_file" type="data" format="fasta" label="FASTA file of protein sequences"/> 
         <param name="organism" type="select" display="radio" label="Organism">
--- a/tools/protein_analysis/suite_config.xml	Tue Jun 07 17:41:38 2011 -0400
+++ b/tools/protein_analysis/suite_config.xml	Tue Mar 26 14:24:56 2013 -0400
@@ -1,4 +1,4 @@
-    <suite id="tmhmm_and_signalp" name="TMHMM, SignalP, WoLF PSORT" version="0.0.9">
+    <suite id="tmhmm_and_signalp" name="Protein sequence analysis tools" version="0.0.9">
         <description>TMHMM, SignalP, RXLR motifs, WoLF PSORT</description>
         <tool id="tmhmm2" name="TMHMM 2.0" version="0.0.7">
             <description>Find transmembrane domains in protein sequences</description>
--- a/tools/protein_analysis/tmhmm2.py	Tue Jun 07 17:41:38 2011 -0400
+++ b/tools/protein_analysis/tmhmm2.py	Tue Mar 26 14:24:56 2013 -0400
@@ -1,10 +1,10 @@
 #!/usr/bin/env python
 """Wrapper for TMHMM v2.0 for use in Galaxy.
 
-This script takes exactly two command line arguments - an input protein FASTA
-filename and an output tabular filename. It then calls the standalone TMHMM
-v2.0 program (not the webservice) requesting the short output (one line per
-protein).
+This script takes exactly three command line arguments - number of threads,
+an input protein FASTA filename, and an output tabular filename. It then
+calls the standalone TMHMM v2.0 program (not the webservice) requesting
+the short output (one line per protein).
 
 The first major feature is cleaning up the tabular output. The short form raw
 output from TMHMM v2.0 looks like this (six columns tab separated):
@@ -33,27 +33,29 @@
 use Python's multiprocessing library in this situation but it requires at
 least Python 2.6 and at the time of writing Galaxy still supports Python 2.4.
 
+Note that this is somewhat redundant with job-splitting available in Galaxy
+itself (see the SignalP XML file for settings).
+
 Also tmhmm2 can fail without returning an error code, for example if run on a
 64 bit machine with only the 32 bit binaries installed. This script will spot
 when there is no output from tmhmm2, and raise an error.
 """
 import sys
 import os
-from seq_analysis_utils import stop_err, split_fasta, run_jobs
+import tempfile
+from seq_analysis_utils import stop_err, split_fasta, run_jobs, thread_count
 
 FASTA_CHUNK = 500
 
 if len(sys.argv) != 4:
    stop_err("Require three arguments, number of threads (int), input protein FASTA file & output tabular file")
-try:
-   num_threads = int(sys.argv[1])
-except:
-   num_threads = 0
-if num_threads < 1:
-   stop_err("Threads argument %s is not a positive integer" % sys.argv[1])
+
+num_threads = thread_count(sys.argv[1], default=4)
 fasta_file = sys.argv[2]
 tabular_file = sys.argv[3]
 
+tmp_dir = tempfile.mkdtemp()
+
 def clean_tabular(raw_handle, out_handle):
     """Clean up tabular TMHMM output, returns output line count."""
     count = 0
@@ -84,7 +86,7 @@
 
 #Note that if the input FASTA file contains no sequences,
 #split_fasta returns an empty list (i.e. zero temp files).
-fasta_files = split_fasta(fasta_file, tabular_file, FASTA_CHUNK)
+fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "tmhmm"), FASTA_CHUNK)
 temp_files = [f+".out" for f in fasta_files]
 jobs = ["tmhmm -short %s > %s" % (fasta, temp)
         for fasta, temp in zip(fasta_files, temp_files)]
@@ -93,6 +95,10 @@
     for f in file_list:
         if os.path.isfile(f):
             os.remove(f)
+    try:
+        os.rmdir(tmp_dir)
+    except:
+        pass
 
 if len(jobs) > 1 and num_threads > 1:
     #A small "info" message for Galaxy to show the user.
@@ -105,8 +111,7 @@
             output = open(temp).readline()
         except IOError:
             output = ""
-        clean_up(fasta_files)
-        clean_up(temp_files)
+        clean_up(fasta_files + temp_files)
         stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output),
                  error_level)
 del results
@@ -119,10 +124,8 @@
     count = clean_tabular(data_handle, out_handle)
     data_handle.close()
     if not count:
-        clean_up(fasta_files)
-        clean_up(temp_files)
+        clean_up(fasta_files + temp_files)
         stop_err("No output from tmhmm2")
 out_handle.close()
 
-clean_up(fasta_files)
-clean_up(temp_files)
+clean_up(fasta_files + temp_files)
--- a/tools/protein_analysis/tmhmm2.xml	Tue Jun 07 17:41:38 2011 -0400
+++ b/tools/protein_analysis/tmhmm2.xml	Tue Mar 26 14:24:56 2013 -0400
@@ -1,9 +1,19 @@
-<tool id="tmhmm2" name="TMHMM 2.0" version="0.0.7">
+<tool id="tmhmm2" name="TMHMM 2.0" version="0.0.9">
     <description>Find transmembrane domains in protein sequences</description>
+    <!-- If job splitting is enabled, break up the query file into parts -->
+    <!-- Using 2000 chunks meaning 4 threads doing 500 each is ideal -->
+    <parallelism method="basic" split_inputs="fasta_file" split_mode="to_size" split_size="2000" merge_outputs="tabular_file"></parallelism>
     <command interpreter="python">
-      tmhmm2.py 8 $fasta_file $tabular_file
-      ##I want the number of threads to be a Galaxy config option...
+      tmhmm2.py "\$NSLOTS" $fasta_file $tabular_file
+      ##Set the number of threads in the runner entry in universe_wsgi.ini
+      ##which (on SGE at least) will set the $NSLOTS environment variable.
+      ##If the environment variable isn't set, get "", and defaults to one.
     </command>
+    <stdio>
+        <!-- Anything other than zero is an error -->
+        <exit_code range="1:" />
+        <exit_code range=":-1" />
+    </stdio>
     <inputs>
         <param name="fasta_file" type="data" format="fasta" label="FASTA file of protein sequences"/> 
         <!--
--- a/tools/protein_analysis/wolf_psort.py	Tue Jun 07 17:41:38 2011 -0400
+++ b/tools/protein_analysis/wolf_psort.py	Tue Mar 26 14:24:56 2013 -0400
@@ -35,13 +35,13 @@
 """
 import sys
 import os
-from seq_analysis_utils import stop_err, split_fasta, run_jobs
+from seq_analysis_utils import stop_err, split_fasta, run_jobs, thread_count
 
 FASTA_CHUNK = 500
 exe = "runWolfPsortSummary"
 
 """
-Note: I had trouble getting runWolfPsortSummary on the path (via a link, other
+Note: I had trouble getting runWolfPsortSummary on the path (via a link), other
 than by including all of /opt/WoLFPSORT_package_v0.2/bin , so used a wrapper
 python script called runWolfPsortSummary as follows:
 
@@ -65,15 +65,8 @@
 if organism not in ["animal", "plant", "fungi"]:
    stop_err("Organism argument %s is not one of animal, plant, fungi" % organism)
 
-try:
-   num_threads = int(sys.argv[2])
-except:
-   num_threads = 0
-if num_threads < 1:
-   stop_err("Threads argument %s is not a positive integer" % sys.argv[2])
-
+num_threads = thread_count(sys.argv[2], default=4)
 fasta_file = sys.argv[3]
-
 tabular_file = sys.argv[4]
 
 def clean_tabular(raw_handle, out_handle):
--- a/tools/protein_analysis/wolf_psort.xml	Tue Jun 07 17:41:38 2011 -0400
+++ b/tools/protein_analysis/wolf_psort.xml	Tue Mar 26 14:24:56 2013 -0400
@@ -1,9 +1,14 @@
-<tool id="wolf_psort" name="WoLF PSORT" version="0.0.1">
+<tool id="wolf_psort" name="WoLF PSORT" version="0.0.2">
     <description>Eukaryote protein subcellular localization prediction</description>
     <command interpreter="python">
       wolf_psort.py $organism 8 $fasta_file $tabular_file
       ##I want the number of threads to be a Galaxy config option...
     </command>
+    <stdio>
+        <!-- Anything other than zero is an error -->
+        <exit_code range="1:" />
+        <exit_code range=":-1" />
+    </stdio>
     <inputs>
         <param name="fasta_file" type="data" format="fasta" label="FASTA file of protein sequences"/> 
         <param name="organism" type="select" display="radio" label="Organism">