Mercurial > repos > peterjc > tmhmm_and_signalp
changeset 7:5e62aefb2918 draft
Uploaded v0.1.2 to Test Tool Shed
author | peterjc |
---|---|
date | Tue, 26 Mar 2013 14:24:56 -0400 |
parents | 39a6e46cdda3 |
children | 391a142c1e60 |
files | test-data/four_human_proteins.fasta.orig test-data/four_human_proteins.predictnls.tabular test-data/four_human_proteins.signalp3.tsv test-data/four_human_proteins.tmhmm2.tsv tools/protein_analysis/README tools/protein_analysis/promoter2.py tools/protein_analysis/promoter2.xml tools/protein_analysis/rxlr_motifs.xml tools/protein_analysis/seq_analysis_utils.py tools/protein_analysis/signalp3.py tools/protein_analysis/signalp3.xml tools/protein_analysis/suite_config.xml tools/protein_analysis/tmhmm2.py tools/protein_analysis/tmhmm2.xml tools/protein_analysis/wolf_psort.py tools/protein_analysis/wolf_psort.xml |
diffstat | 16 files changed, 537 insertions(+), 78 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/four_human_proteins.fasta.orig Tue Mar 26 14:24:56 2013 -0400 @@ -0,0 +1,61 @@ +>sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1 +MHPAVFLSLPDLRCSLLLLVTWVFTPVTTEITSLDTENIDEILNNADVALVNFYADWCRF +SQMLHPIFEEASDVIKEEFPNENQVVFARVDCDQHSDIAQRYRISKYPTLKLFRNGMMMK +REYRGQRSVKALADYIRQQKSDPIQEIRDLAEITTLDRSKRNIIGYFEQKDSDNYRVFER +VANILHDDCAFLSAFGDVSKPERYSGDNIIYKPPGHSAPDMVYLGAMTNFDVTYNWIQDK +CVPLVREITFENGEELTEEGLPFLILFHMKEDTESLEIFQNEVARQLISEKGTINFLHAD +CDKFRHPLLHIQKTPADCPVIAIDSFRHMYVFGDFKDVLIPGKLKQFVFDLHSGKLHREF +HHGPDPTDTAPGEQAQDVASSPPESSFQKLAPSEYRYTLLRDRDEL +>sp|Q9NSY1|BMP2K_HUMAN BMP-2-inducible protein kinase OS=Homo sapiens GN=BMP2K PE=1 SV=2 +MKKFSRMPKSEGGSGGGAAGGGAGGAGAGAGCGSGGSSVGVRVFAVGRHQVTLEESLAEG +GFSTVFLVRTHGGIRCALKRMYVNNMPDLNVCKREITIMKELSGHKNIVGYLDCAVNSIS +DNVWEVLILMEYCRAGQVVNQMNKKLQTGFTEPEVLQIFCDTCEAVARLHQCKTPIIHRD +LKVENILLNDGGNYVLCDFGSATNKFLNPQKDGVNVVEEEIKKYTTLSYRAPEMINLYGG +KPITTKADIWALGCLLYKLCFFTLPFGESQVAICDGNFTIPDNSRYSRNIHCLIRFMLEP +DPEHRPDIFQVSYFAFKFAKKDCPVSNINNSSIPSALPEPMTASEAAARKSQIKARITDT +IGPTETSIAPRQRPKANSATTATPSVLTIQSSATPVKVLAPGEFGNHRPKGALRPGNGPE +ILLGQGPPQQPPQQHRVLQQLQQGDWRLQQLHLQHRHPHQQQQQQQQQQQQQQQQQQQQQ +QQQQQQHHHHHHHHLLQDAYMQQYQHATQQQQMLQQQFLMHSVYQPQPSASQYPTMMPQY +QQAFFQQQMLAQHQPSQQQASPEYLTSPQEFSPALVSYTSSLPAQVGTIMDSSYSANRSV +ADKEAIANFTNQKNISNPPDMSGWNPFGEDNFSKLTEEELLDREFDLLRSNRLEERASSD +KNVDSLSAPHNHPPEDPFGSVPFISHSGSPEKKAEHSSINQENGTANPIKNGKTSPASKD +QRTGKKTSVQGQVQKGNDESESDFESDPPSPKSSEEEEQDDEEVLQGEQGDFNDDDTEPE +NLGHRPLLMDSEDEEEEEKHSSDSDYEQAKAKYSDMSSVYRDRSGSGPTQDLNTILLTSA +QLSSDVAVETPKQEFDVFGAVPFFAVRAQQPQQEKNEKNLPQHRFPAAGLEQEEFDVFTK +APFSKKVNVQECHAVGPEAHTIPGYPKSVDVFGSTPFQPFLTSTSKSESNEDLFGLVPFD +EITGSQQQKVKQRSLQKLSSRQRRTKQDMSKSNGKRHHGTPTSTKKTLKPTYRTPERARR +HKKVGRRDSQSSNEFLTISDSKENISVALTDGKDRGNVLQPEESLLDPFGAKPFHSPDLS +WHPPHQGLSDIRADHNTVLPGRPRQNSLHGSFHSADVLKMDDFGAVPFTELVVQSITPHQ +SQQSQPVELDPFGAAPFPSKQ +>sp|P06213|INSR_HUMAN Insulin receptor OS=Homo sapiens GN=INSR PE=1 SV=4 +MATGGRRGAAAAPLLVAVAALLLGAAGHLYPGEVCPGMDIRNNLTRLHELENCSVIEGHL +QILLMFKTRPEDFRDLSFPKLIMITDYLLLFRVYGLESLKDLFPNLTVIRGSRLFFNYAL +VIFEMVHLKELGLYNLMNITRGSVRIEKNNELCYLATIDWSRILDSVEDNYIVLNKDDNE +ECGDICPGTAKGKTNCPATVINGQFVERCWTHSHCQKVCPTICKSHGCTAEGLCCHSECL +GNCSQPDDPTKCVACRNFYLDGRCVETCPPPYYHFQDWRCVNFSFCQDLHHKCKNSRRQG +CHQYVIHNNKCIPECPSGYTMNSSNLLCTPCLGPCPKVCHLLEGEKTIDSVTSAQELRGC +TVINGSLIINIRGGNNLAAELEANLGLIEEISGYLKIRRSYALVSLSFFRKLRLIRGETL +EIGNYSFYALDNQNLRQLWDWSKHNLTITQGKLFFHYNPKLCLSEIHKMEEVSGTKGRQE +RNDIALKTNGDQASCENELLKFSYIRTSFDKILLRWEPYWPPDFRDLLGFMLFYKEAPYQ +NVTEFDGQDACGSNSWTVVDIDPPLRSNDPKSQNHPGWLMRGLKPWTQYAIFVKTLVTFS +DERRTYGAKSDIIYVQTDATNPSVPLDPISVSNSSSQIILKWKPPSDPNGNITHYLVFWE +RQAEDSELFELDYCLKGLKLPSRTWSPPFESEDSQKHNQSEYEDSAGECCSCPKTDSQIL +KELEESSFRKTFEDYLHNVVFVPRKTSSGTGAEDPRPSRKRRSLGDVGNVTVAVPTVAAF +PNTSSTSVPTSPEEHRPFEKVVNKESLVISGLRHFTGYRIELQACNQDTPEERCSVAAYV +SARTMPEAKADDIVGPVTHEIFENNVVHLMWQEPKEPNGLIVLYEVSYRRYGDEELHLCV +SRKHFALERGCRLRGLSPGNYSVRIRATSLAGNGSWTEPTYFYVTDYLDVPSNIAKIIIG +PLIFVFLFSVVIGSIYLFLRKRQPDGPLGPLYASSNPEYLSASDVFPCSVYVPDEWEVSR +EKITLLRELGQGSFGMVYEGNARDIIKGEAETRVAVKTVNESASLRERIEFLNEASVMKG +FTCHHVVRLLGVVSKGQPTLVVMELMAHGDLKSYLRSLRPEAENNPGRPPPTLQEMIQMA +AEIADGMAYLNAKKFVHRDLAARNCMVAHDFTVKIGDFGMTRDIYETDYYRKGGKGLLPV +RWMAPESLKDGVFTTSSDMWSFGVVLWEITSLAEQPYQGLSNEQVLKFVMDGGYLDQPDN +CPERVTDLMRMCWQFNPKMRPTFLEIVNLLKDDLHPSFPEVSFFHSEENKAPESEELEME +FEDMENVPLDRSSHCQREEAGGRDGGSSLGFKRSYEEHIPYTHMNGGKKNGRILTLPRSN +PS +>sp|P08100|OPSD_HUMAN Rhodopsin OS=Homo sapiens GN=RHO PE=1 SV=1 +MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLY +VTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLG +GEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIP +EGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQES +ATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAI +YNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/four_human_proteins.predictnls.tabular Tue Mar 26 14:24:56 2013 -0400 @@ -0,0 +1,2 @@ +#ID NLS start NLS seq NLS pattern Type ProtCount %NucProt ProtList ProtLoci +sp|P06213|INSR_HUMAN 758 SRKRRS [STQM]RKRR[STQM] Potential 1 100 mklp_human nuc
--- a/test-data/four_human_proteins.signalp3.tsv Tue Jun 07 17:41:38 2011 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,5 +0,0 @@ -#ID NN_Cmax_score NN_Cmax_pos NN_Cmax_pred NN_Ymax_score NN_Ymax_pos NN_Ymax_pred NN_Smax_score NN_Smax_pos NN_Smax_pred NN_Smean_score NN_Smean_pred NN_D_score NN_D_pred HMM_type HMM_Cmax_score HMM_Cmax_pos HMM_Cmax_pred HMM_Sprob_score HMM_Sprob_pred -sp|Q9BS26|ERP44_HUMAN 0.565 30 Y 0.686 30 Y 0.986 12 Y 0.818 Y 0.752 Y S 0.945 30 Y 0.966 Y -sp|Q9NSY1|BMP2K_HUMAN 0.153 869 N 0.050 270 N 0.229 12 N 0.008 N 0.029 N Q 0.000 0 N 0.000 N -sp|P06213|INSR_HUMAN 0.396 28 Y 0.561 28 Y 0.993 19 Y 0.902 Y 0.731 Y Q 0.205 28 N 0.341 N -sp|P08100|OPSD_HUMAN 0.211 52 N 0.344 52 Y 0.945 50 Y 0.245 N 0.295 N Q 0.000 52 N 0.000 N
--- a/test-data/four_human_proteins.tmhmm2.tsv Tue Jun 07 17:41:38 2011 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,5 +0,0 @@ -#ID len ExpAA First60 PredHel Topology -sp|Q9BS26|ERP44_HUMAN 406 0.23 0.23 0 o -sp|Q9NSY1|BMP2K_HUMAN 1161 0.35 0.26 0 o -sp|P06213|INSR_HUMAN 1382 50.22 20.76 2 i9-31o957-979i -sp|P08100|OPSD_HUMAN 348 157.99 21.69 7 o39-61i74-96o111-133i153-175o202-224i254-276o286-308i
--- a/tools/protein_analysis/README Tue Jun 07 17:41:38 2011 -0400 +++ b/tools/protein_analysis/README Tue Mar 26 14:24:56 2013 -0400 @@ -1,7 +1,7 @@ This package contains Galaxy wrappers for a selection of standalone command line protein analysis tools: -* SignalP 3.0 and THMHMM 2.0, from the Center for Biological +* SignalP 3.0, THMHMM 2.0, Promoter 2.0 from the Center for Biological Sequence Analysis at the Technical University of Denmark, http://www.cbs.dtu.dk/cbs/ @@ -12,9 +12,9 @@ To use these Galaxy wrappers you must first install the command line tools. At the time of writing they are all free for academic use. -These wrappers are copyright 2010-2011 by Peter Cock, James Hutton Institute +These wrappers are copyright 2010-2012 by Peter Cock, James Hutton Institute (formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved. -See the included LICENCE file for details. +See the included LICENCE file for details (an MIT style open source licence). Requirements ============ @@ -27,22 +27,27 @@ 2. Install the command line version of TMHMM 2.0 and ensure "tmhmm" is on the PATH, see: http://www.cbs.dtu.dk/services/TMHMM/ -3. Install the WoLF PSORT v0.2 package, and ensure "runWolfPsortSummary" +3. Install the command line version of Promoter 2.0 and ensure "promoter" is + on the PATH, see: http://www.cbs.dtu.dk/services/Promoter + +4. Install the WoLF PSORT v0.2 package, and ensure "runWolfPsortSummary" is on the PATH (we use an extra wrapper script to change to the WoLF PSORT directory, run runWolfPsortSummary, and then change back to the original directory), see: http://wolfpsort.org/WoLFPSORT_package/version0.2/ -4. Install hmmsearch from HMMER 2.3.2 (the last stable release of HMMER 2) +5. Install hmmsearch from HMMER 2.3.2 (the last stable release of HMMER 2) but put it on the path under the name hmmsearch2 (allowing it to co-exist with HMMER 3), or edit rlxr_motif.py accordingly. Verify each of the tools is installed and working from the command line -(when logged in at the Galaxy user if appropriate). +(when logged in as the Galaxy user if appropriate). -Installation -============ +Manual Installation +=================== 1. Create a folder tools/protein_analysis under your Galaxy installation. + This folder name is not critical, and can be changed if desired - you + must update the paths used in tool_conf.xml to match. 2. Copy/move the following files (from this archive) there: @@ -52,6 +57,9 @@ signalp3.xml (Galaxy tool definition) signalp3.py (Python wrapper script) +promoter2.xml (Galaxy tool definition) +promoter2.py (Python wrapper script) + wolf_psort.xml (Galaxy tool definition) wolf_psort.py (Python wrapper script) @@ -59,7 +67,8 @@ rxlr_motifs.py (Python script) seq_analysis_utils.py (shared Python code) -README (optional) +LICENCE +README (this file) 3. Edit your Galaxy conjuration file tool_conf.xml (to use the tools) AND also tool_conf.xml.sample (to run the tests) to include the new tools @@ -71,6 +80,9 @@ <tool file="protein_analysis/wolf_psort.xml" /> <tool file="protein_analysis/rxlr_motifs.xml" /> </section> + <section name="Nucleotide sequence analysis" id="nucleotide_analysis"> + <tool file="protein_analysis/promoter2.xml" /> + </section> Leave out the lines for any tools you do not wish to use in Galaxy. @@ -112,6 +124,11 @@ SignalP webservice. v0.0.8 - Added WoLF PSORT wrapper to the suite. v0.0.9 - Added our RXLR motifs tool to the suite. +v0.1.0 - Added Promoter 2.0 wrapper (similar to SignalP & TMHMM wrappers) + - Support Galaxy's <parallelism> tag for SignalP, TMHMM & Promoter +v0.1.1 - Fixed an error in the header of the tabular output from Promoter +v0.1.2 - Use the new <stdio> settings in the XML wrappers to catch errors + - Use SGE style $NSLOTS for thread count (otherwise default to 4) Developers @@ -126,7 +143,7 @@ For making the "Galaxy Tool Shed" http://community.g2.bx.psu.edu/ tarball use the following command from the Galaxy root folder: -tar -czf tmhmm_signalp_etc.tar.gz tools/protein_analysis/LICENSE tools/protein_analysis/README tools/protein_analysis/suite_config.xml tools/protein_analysis/seq_analysis_utils.py tools/protein_analysis/signalp3.xml tools/protein_analysis/signalp3.py tools/protein_analysis/tmhmm2.xml tools/protein_analysis/tmhmm2.py tools/protein_analysis/wolf_psort.xml tools/protein_analysis/wolf_psort.py tools/protein_analysis/rxlr_motifs.xml tools/protein_analysis/rxlr_motifs.py test-data/four_human_proteins.* test-data/empty.fasta test-data/empty_tmhmm2.tabular test-data/empty_signalp3.tabular +tar -czf ~/tmhmm_signalp_etc.tar.gz tools/protein_analysis/LICENSE tools/protein_analysis/README tools/protein_analysis/suite_config.xml tools/protein_analysis/seq_analysis_utils.py tools/protein_analysis/signalp3.xml tools/protein_analysis/signalp3.py tools/protein_analysis/tmhmm2.xml tools/protein_analysis/tmhmm2.py tools/protein_analysis/promoter2.xml tools/protein_analysis/promoter2.py tools/protein_analysis/wolf_psort.xml tools/protein_analysis/wolf_psort.py tools/protein_analysis/rxlr_motifs.xml tools/protein_analysis/rxlr_motifs.py test-data/four_human_proteins.* test-data/empty.fasta test-data/empty_tmhmm2.tabular test-data/empty_signalp3.tabular Check this worked: @@ -139,6 +156,8 @@ tools/protein_analysis/signalp3.py tools/protein_analysis/tmhmm2.xml tools/protein_analysis/tmhmm2.py +tools/protein_analysis/promoter2.xml +tools/protein_analysis/promoter2.py tools/protein_analysis/wolf_psort.xml tools/protein_analysis/wolf_psort.py tools/protein_analysis/rxlr_motifs.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/protein_analysis/promoter2.py Tue Mar 26 14:24:56 2013 -0400 @@ -0,0 +1,156 @@ +#!/usr/bin/env python +"""Wrapper for Promoter 2.0 for use in Galaxy. + +This script takes exactly three command line arguments: + * number of threads + * an input DNA FASTA filename + * output tabular filename. + +It calls the Promoter 2.0 binary (e.g. .../promoter-2.0/bin/promoter_Linux, +bypassing the Perl wrapper script 'promoter' which imposes a significant +performace overhead for no benefit here (we don't need HTML output for +example). + +The main feature is this Python wrapper script parsers the bespoke +tabular output from Promoter 2.0 and reformats it into a Galaxy friendly +tab separated table. + +Additionally, in order to take advantage of multiple cores the input FASTA +file is broken into chunks and multiple copies of promoter run at once. +This can be used in combination with the job-splitting available in Galaxy. + +Note that rewriting the FASTA input file allows us to avoid a bug in +promoter 2 with long descriptions in the FASTA header line (over 200 +characters) which produces stray fragements of the description in the +output file, making parsing non-trivial. + +TODO - Automatically extract the sequence containing a promoter prediction? +""" +import sys +import os +import commands +import tempfile +from seq_analysis_utils import stop_err, split_fasta, run_jobs, thread_count + +FASTA_CHUNK = 500 + +if len(sys.argv) != 4: + stop_err("Require three arguments, number of threads (int), input DNA FASTA file & output tabular file. " + "Got %i arguments." % (len(sys.argv)-1)) + +num_threads = thread_count(sys.argv[3],default=4) +fasta_file = os.path.abspath(sys.argv[2]) +tabular_file = os.path.abspath(sys.argv[3]) + +tmp_dir = tempfile.mkdtemp() + +def get_path_and_binary(): + platform = commands.getoutput("uname") #e.g. Linux + shell_script = commands.getoutput("which promoter") + if not os.path.isfile(shell_script): + stop_err("ERROR: Missing promoter executable shell script") + path = None + for line in open(shell_script): + if line.startswith("setenv"): #could then be tab or space! + parts = line.rstrip().split(None, 2) + if parts[0] == "setenv" and parts[1] == "PROM": + path = parts[2] + if not path: + stop_err("ERROR: Could not find promoter path (PROM) in %r" % shell_script) + if not os.path.isdir(path): + stop_error("ERROR: %r is not a directory" % path) + bin = "%s/bin/promoter_%s" % (path, platform) + if not os.path.isfile(bin): + stop_err("ERROR: Missing promoter binary %r" % bin) + return path, bin + +def make_tabular(raw_handle, out_handle): + """Parse text output into tabular, return query count.""" + identifier = None + queries = 0 + for line in raw_handle: + #print repr(line) + if not line.strip() or line == "Promoter prediction:\n": + pass + elif line[0] != " ": + identifier = line.strip().replace("\t", " ").split(None,1)[0] + queries += 1 + elif line == " No promoter predicted\n": + #End of a record + identifier = None + elif line == " Position Score Likelihood\n": + assert identifier + else: + try: + position, score, likelihood = line.strip().split(None,2) + except ValueError: + print "WARNING: Problem with line: %r" % line + continue + #stop_err("ERROR: Problem with line: %r" % line) + if likelihood not in ["ignored", + "Marginal prediction", + "Medium likely prediction", + "Highly likely prediction"]: + stop_err("ERROR: Problem with line: %r" % line) + out_handle.write("%s\t%s\t%s\t%s\n" % (identifier, position, score, likelihood)) + return queries + +working_dir, bin = get_path_and_binary() + +if not os.path.isfile(fasta_file): + stop_err("ERROR: Missing input FASTA file %r" % fasta_file) + +#Note that if the input FASTA file contains no sequences, +#split_fasta returns an empty list (i.e. zero temp files). +#We deliberately omit the FASTA descriptions to avoid a +#bug in promoter2 with descriptions over 200 characters. +fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "promoter"), FASTA_CHUNK, keep_descr=False) +temp_files = [f+".out" for f in fasta_files] +jobs = ["%s %s > %s" % (bin, fasta, temp) + for fasta, temp in zip(fasta_files, temp_files)] + +def clean_up(file_list): + for f in file_list: + if os.path.isfile(f): + os.remove(f) + try: + os.rmdir(tmp_dir) + except: + pass + +if len(jobs) > 1 and num_threads > 1: + #A small "info" message for Galaxy to show the user. + print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) +cur_dir = os.path.abspath(os.curdir) +os.chdir(working_dir) +results = run_jobs(jobs, num_threads) +os.chdir(cur_dir) +for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): + error_level = results[cmd] + if error_level: + try: + output = open(temp).readline() + except IOError: + output = "" + clean_up(fasta_files + temp_files) + stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), + error_level) + +del results +del jobs + +out_handle = open(tabular_file, "w") +out_handle.write("#Identifier\tPosition\tScore\tLikelihood\n") +queries = 0 +for temp in temp_files: + data_handle = open(temp) + count = make_tabular(data_handle, out_handle) + data_handle.close() + if not count: + clean_up(fasta_files + temp_files) + stop_err("No output from promoter2") + queries += count +out_handle.close() + +clean_up(fasta_files + temp_files) +print "Results for %i queries" % queries
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/protein_analysis/promoter2.xml Tue Mar 26 14:24:56 2013 -0400 @@ -0,0 +1,62 @@ +<tool id="promoter2" name="Promoter 2.0" version="0.0.3"> + <description>Find eukaryotic PolII promoters in DNA sequences</description> + <!-- If job splitting is enabled, break up the query file into parts --> + <!-- Using 2000 per chunk so 4 threads each doing 500 is ideal --> + <parallelism method="basic" split_inputs="fasta_file" split_mode="to_size" split_size="2000" merge_outputs="tabular_file"></parallelism> + <command interpreter="python"> + promoter2.py "\$NSLOTS" $fasta_file $tabular_file + ##Set the number of threads in the runner entry in universe_wsgi.ini + ##which (on SGE at least) will set the $NSLOTS environment variable. + ##If the environment variable isn't set, get "", and defaults to one. + </command> + <stdio> + <!-- Anything other than zero is an error --> + <exit_code range="1:" /> + <exit_code range=":-1" /> + </stdio> + <inputs> + <param name="fasta_file" type="data" format="fasta" label="FASTA file of DNA sequences"/> + </inputs> + <outputs> + <data name="tabular_file" format="tabular" label="Promoter2 on ${fasta_file.name}" /> + </outputs> + <requirements> + <requirement type="binary">promoter</requirement> + </requirements> + <help> + +**What it does** + +This calls the Promoter 2.0 tool for prediction of eukaryotic PolII promoter sequences using a Neural Network (NN) model. + +The input is a FASTA file of nucleotide sequences (e.g. upstream regions of your genes), and the output is tabular with five columns (one row per promoter): + + 1. Sequence identifier (first word of FASTA header) + 2. Promoter position, e.g. 600 + 3. Promoter score, e.g. 1.063 + 4. Promoter likelihood, e.g. Highly likely prediction + +The scores are classified very simply as follows: + +========= ======================== +Score Description +--------- ------------------------ +below 0.5 ignored +0.5 - 0.8 Marginal prediction +0.8 - 1.0 Medium likely prediction +above 1.0 Highly likely prediction +========= ======================== + +Internally the input FASTA file is divided into parts (to allow multiple processors to be used), and the raw output is reformatted into this tabular layout suitable for downstream analysis within Galaxy. + +**References** + +Knudsen. +Promoter2.0: for the recognition of PolII promoter sequences. +Bioinformatics, 15:356-61, 1999. +http://dx.doi.org/10.1093/bioinformatics/15.5.356 + +http://www.cbs.dtu.dk/services/Promoter/output.php + + </help> +</tool>
--- a/tools/protein_analysis/rxlr_motifs.xml Tue Jun 07 17:41:38 2011 -0400 +++ b/tools/protein_analysis/rxlr_motifs.xml Tue Mar 26 14:24:56 2013 -0400 @@ -1,9 +1,14 @@ -<tool id="rxlr_motifs" name="RXLR Motifs" version="0.0.5"> +<tool id="rxlr_motifs" name="RXLR Motifs" version="0.0.6"> <description>Find RXLR Effectors of Plant Pathogenic Oomycetes</description> <command interpreter="python"> rxlr_motifs.py $fasta_file 8 $model $tabular_file ##I want the number of threads to be a Galaxy config option... </command> + <stdio> + <!-- Anything other than zero is an error --> + <exit_code range="1:" /> + <exit_code range=":-1" /> + </stdio> <inputs> <param name="fasta_file" type="data" format="fasta" label="FASTA file of protein sequences" /> <param name="model" type="select" label="Which RXLR model?"> @@ -32,10 +37,10 @@ **Background** -Many effector proteins from Oomycete plant pathogens for manipulating the host +Many effector proteins from oomycete plant pathogens for manipulating the host have been found to contain a signal peptide followed by a conserved RXLR motif (Arg, any amino acid, Leu, Arg), and then sometimes EER (Glu, Glu, Arg). There -are stiking parallels with the malarial host-targeting signal (Plasmodium +are striking parallels with the malarial host-targeting signal (Plasmodium export element, or "Pexel" for short). -----
--- a/tools/protein_analysis/seq_analysis_utils.py Tue Jun 07 17:41:38 2011 -0400 +++ b/tools/protein_analysis/seq_analysis_utils.py Tue Mar 26 14:24:56 2013 -0400 @@ -19,6 +19,56 @@ sys.stderr.write("%s\n" % msg) sys.exit(error_level) +try: + from multiprocessing import cpu_count +except ImportError: + #Must be under Python 2.5, this is copied from multiprocessing: + def cpu_count(): + """Returns the number of CPUs in the system.""" + if sys.platform == 'win32': + try: + num = int(os.environ['NUMBER_OF_PROCESSORS']) + except (ValueError, KeyError): + num = 0 + elif 'bsd' in sys.platform or sys.platform == 'darwin': + comm = '/sbin/sysctl -n hw.ncpu' + if sys.platform == 'darwin': + comm = '/usr' + comm + try: + with os.popen(comm) as p: + num = int(p.read()) + except ValueError: + num = 0 + else: + try: + num = os.sysconf('SC_NPROCESSORS_ONLN') + except (ValueError, OSError, AttributeError): + num = 0 + + if num >= 1: + return num + else: + raise NotImplementedError('cannot determine number of cpus') + + +def thread_count(command_line_arg, default=1): + try: + num = int(command_line_arg) + except: + num = default + if num < 1: + stop_err("Threads argument %r is not a positive integer" % command_line_arg) + #Cap this with the pysical limit of the machine, + try: + num = min(num, cpu_count()) + except NotImplementedError: + pass + #For debugging, + #hostname = os.environ.get("HOSTNAME", "this machine") + #sys.stderr.write("Using %i cores on %s\n" % (num, hostname)) + return num + + def fasta_iterator(filename, max_len=None, truncate=None): """Simple FASTA parser yielding tuples of (title, sequence) strings.""" handle = open(filename) @@ -100,6 +150,8 @@ if os.path.isfile(f): os.remove(f) raise err + for f in files: + assert os.path.isfile(f), "Missing split file %r (!??)" % f return files def run_jobs(jobs, threads, pause=10, verbose=False): @@ -107,6 +159,11 @@ pending = jobs[:] running = [] results = {} + if threads == 1: + #Special case this for speed, don't need the waits + for cmd in jobs: + results[cmd] = subprocess.call(cmd, shell=True) + return results while pending or running: #See if any have finished for (cmd, process) in running:
--- a/tools/protein_analysis/signalp3.py Tue Jun 07 17:41:38 2011 -0400 +++ b/tools/protein_analysis/signalp3.py Tue Mar 26 14:24:56 2013 -0400 @@ -4,10 +4,14 @@ This script takes exactly five command line arguments: * the organism type (euk, gram+ or gram-) * length to truncate sequences to (integer) - * number of threads to use (integer) + * number of threads to use (integer, defaults to one) * an input protein FASTA filename * output tabular filename. +There are two further optional arguments + * cut type (NN_Cmax, NN_Ymax, NN_Smax or HMM_Cmax) + * output GFF3 filename + It then calls the standalone SignalP v3.0 program (not the webservice) requesting the short output (one line per protein) using both NN and HMM for predictions. @@ -41,16 +45,28 @@ run multiple copies of TMHMM in parallel. I would normally use Python's multiprocessing library in this situation but it requires at least Python 2.6 and at the time of writing Galaxy still supports Python 2.4. + +Note that this is somewhat redundant with job-splitting available in Galaxy +itself (see the SignalP XML file for settings). + +Finally, you can opt to have a GFF3 file produced which will describe the +predicted signal peptide and mature peptide for each protein (using one of +the predictors which gives a cleavage site). *WORK IN PROGRESS* """ import sys import os -from seq_analysis_utils import stop_err, split_fasta, run_jobs +import tempfile +from seq_analysis_utils import stop_err, split_fasta, fasta_iterator +from seq_analysis_utils import run_jobs, thread_count FASTA_CHUNK = 500 MAX_LEN = 6000 #Found by trial and error -if len(sys.argv) != 6: - stop_err("Require five arguments, organism, truncate, threads, input protein FASTA file & output tabular file") +if len(sys.argv) not in [6,8]: + stop_err("Require five (or 7) arguments, organism, truncate, threads, " + "input protein FASTA file & output tabular file (plus " + "optionally cut method and GFF3 output file). " + "Got %i arguments." % (len(sys.argv)-1)) organism = sys.argv[1] if organism not in ["euk", "gram+", "gram-"]: @@ -63,19 +79,31 @@ if truncate < 0: stop_err("Truncate argument %s is not a positive integer (or zero)" % sys.argv[2]) -try: - num_threads = int(sys.argv[3]) -except: - num_threads = 0 -if num_threads < 1: - stop_err("Threads argument %s is not a positive integer" % sys.argv[3]) - +num_threads = thread_count(sys.argv[3], default=4) fasta_file = sys.argv[4] - tabular_file = sys.argv[5] -def clean_tabular(raw_handle, out_handle): +if len(sys.argv) == 8: + cut_method = sys.argv[6] + if cut_method not in ["NN_Cmax", "NN_Ymax", "NN_Smax", "HMM_Cmax"]: + stop_err("Invalid cut method %r" % cut_method) + gff3_file = sys.argv[7] +else: + cut_method = None + gff3_file = None + + +tmp_dir = tempfile.mkdtemp() + +def clean_tabular(raw_handle, out_handle, gff_handle=None, cut_method=None): """Clean up SignalP output to make it tabular.""" + if cut_method: + cut_col = {"NN_Cmax" : 2, + "NN_Ymax" : 5, + "NN_Smax" : 8, + "HMM_Cmax" : 16}[cut_method] + else: + cut_col = None for line in raw_handle: if not line or line.startswith("#"): continue @@ -87,7 +115,59 @@ parts = parts[14:15] + parts[1:14] + parts[15:] out_handle.write("\t".join(parts) + "\n") -fasta_files = split_fasta(fasta_file, tabular_file, n=FASTA_CHUNK, truncate=truncate, max_len=MAX_LEN) +def make_gff(fasta_file, tabular_file, gff_file, cut_method): + cut_col, score_col = {"NN_Cmax" : (2,1), + "NN_Ymax" : (5,4), + "NN_Smax" : (8,7), + "HMM_Cmax" : (16,15), + }[cut_method] + + source = "SignalP" + strand = "." #not stranded + phase = "." #not phased + tags = "Note=%s" % cut_method + + tab_handle = open(tabular_file) + line = tab_handle.readline() + assert line.startswith("#ID\t"), line + + gff_handle = open(gff_file, "w") + gff_handle.write("##gff-version 3\n") + + for (title, seq), line in zip(fasta_iterator(fasta_file), tab_handle): + parts = line.rstrip("\n").split("\t") + seqid = parts[0] + assert title.startswith(seqid), "%s vs %s" % (seqid, title) + if len(seq)==0: + #Is it possible to have a zero length reference in GFF3? + continue + cut = int(parts[cut_col]) + if cut == 0: + assert cut_method == "HMM_Cmax", cut_method + #TODO - Why does it do this? + cut = 1 + assert 1 <= cut <= len(seq), "%i for %s len %i" % (cut, seqid, len(seq)) + score = parts[score_col] + gff_handle.write("##sequence-region %s %i %i\n" \ + % (seqid, 1, len(seq))) + #If the cut is at the very begining, there is no signal peptide! + if cut > 1: + #signal_peptide = SO:0000418 + gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" \ + % (seqid, source, + "signal_peptide", 1, cut-1, + score, strand, phase, tags)) + #mature_protein_region = SO:0000419 + gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" \ + % (seqid, source, + "mature_protein_region", cut, len(seq), + score, strand, phase, tags)) + tab_handle.close() + gff_handle.close() + + +fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "signalp"), + n=FASTA_CHUNK, truncate=truncate, max_len=MAX_LEN) temp_files = [f+".out" for f in fasta_files] assert len(fasta_files) == len(temp_files) jobs = ["signalp -short -t %s %s > %s" % (organism, fasta, temp) @@ -98,6 +178,10 @@ for f in file_list: if os.path.isfile(f): os.remove(f) + try: + os.rmdir(tmp_dir) + except: + pass if len(jobs) > 1 and num_threads > 1: #A small "info" message for Galaxy to show the user. @@ -109,10 +193,9 @@ try: output = open(temp).readline() except IOError: - output = "" + output = "(no output)" if error_level or output.lower().startswith("error running"): - clean_up(fasta_files) - clean_up(temp_files) + clean_up(fasta_files + temp_files) stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), error_level) del results @@ -133,5 +216,8 @@ data_handle.close() out_handle.close() -clean_up(fasta_files) -clean_up(temp_files) +#GFF3: +if cut_method: + make_gff(fasta_file, tabular_file, gff3_file, cut_method) + +clean_up(fasta_files + temp_files)
--- a/tools/protein_analysis/signalp3.xml Tue Jun 07 17:41:38 2011 -0400 +++ b/tools/protein_analysis/signalp3.xml Tue Mar 26 14:24:56 2013 -0400 @@ -1,9 +1,19 @@ -<tool id="signalp3" name="SignalP 3.0" version="0.0.8"> +<tool id="signalp3" name="SignalP 3.0" version="0.0.10"> <description>Find signal peptides in protein sequences</description> + <!-- If job splitting is enabled, break up the query file into parts --> + <!-- Using 2000 chunks meaning 4 threads doing 500 each is ideal --> + <parallelism method="basic" split_inputs="fasta_file" split_mode="to_size" split_size="2000" merge_outputs="tabular_file"></parallelism> <command interpreter="python"> - signalp3.py $organism $truncate 8 $fasta_file $tabular_file - ##I want the number of threads to be a Galaxy config option... + signalp3.py $organism $truncate "\$NSLOTS" $fasta_file $tabular_file + ##Set the number of threads in the runner entry in universe_wsgi.ini + ##which (on SGE at least) will set the $NSLOTS environment variable. + ##If the environment variable isn't set, get "", and defaults to one. </command> + <stdio> + <!-- Anything other than zero is an error --> + <exit_code range="1:" /> + <exit_code range=":-1" /> + </stdio> <inputs> <param name="fasta_file" type="data" format="fasta" label="FASTA file of protein sequences"/> <param name="organism" type="select" display="radio" label="Organism">
--- a/tools/protein_analysis/suite_config.xml Tue Jun 07 17:41:38 2011 -0400 +++ b/tools/protein_analysis/suite_config.xml Tue Mar 26 14:24:56 2013 -0400 @@ -1,4 +1,4 @@ - <suite id="tmhmm_and_signalp" name="TMHMM, SignalP, WoLF PSORT" version="0.0.9"> + <suite id="tmhmm_and_signalp" name="Protein sequence analysis tools" version="0.0.9"> <description>TMHMM, SignalP, RXLR motifs, WoLF PSORT</description> <tool id="tmhmm2" name="TMHMM 2.0" version="0.0.7"> <description>Find transmembrane domains in protein sequences</description>
--- a/tools/protein_analysis/tmhmm2.py Tue Jun 07 17:41:38 2011 -0400 +++ b/tools/protein_analysis/tmhmm2.py Tue Mar 26 14:24:56 2013 -0400 @@ -1,10 +1,10 @@ #!/usr/bin/env python """Wrapper for TMHMM v2.0 for use in Galaxy. -This script takes exactly two command line arguments - an input protein FASTA -filename and an output tabular filename. It then calls the standalone TMHMM -v2.0 program (not the webservice) requesting the short output (one line per -protein). +This script takes exactly three command line arguments - number of threads, +an input protein FASTA filename, and an output tabular filename. It then +calls the standalone TMHMM v2.0 program (not the webservice) requesting +the short output (one line per protein). The first major feature is cleaning up the tabular output. The short form raw output from TMHMM v2.0 looks like this (six columns tab separated): @@ -33,27 +33,29 @@ use Python's multiprocessing library in this situation but it requires at least Python 2.6 and at the time of writing Galaxy still supports Python 2.4. +Note that this is somewhat redundant with job-splitting available in Galaxy +itself (see the SignalP XML file for settings). + Also tmhmm2 can fail without returning an error code, for example if run on a 64 bit machine with only the 32 bit binaries installed. This script will spot when there is no output from tmhmm2, and raise an error. """ import sys import os -from seq_analysis_utils import stop_err, split_fasta, run_jobs +import tempfile +from seq_analysis_utils import stop_err, split_fasta, run_jobs, thread_count FASTA_CHUNK = 500 if len(sys.argv) != 4: stop_err("Require three arguments, number of threads (int), input protein FASTA file & output tabular file") -try: - num_threads = int(sys.argv[1]) -except: - num_threads = 0 -if num_threads < 1: - stop_err("Threads argument %s is not a positive integer" % sys.argv[1]) + +num_threads = thread_count(sys.argv[1], default=4) fasta_file = sys.argv[2] tabular_file = sys.argv[3] +tmp_dir = tempfile.mkdtemp() + def clean_tabular(raw_handle, out_handle): """Clean up tabular TMHMM output, returns output line count.""" count = 0 @@ -84,7 +86,7 @@ #Note that if the input FASTA file contains no sequences, #split_fasta returns an empty list (i.e. zero temp files). -fasta_files = split_fasta(fasta_file, tabular_file, FASTA_CHUNK) +fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "tmhmm"), FASTA_CHUNK) temp_files = [f+".out" for f in fasta_files] jobs = ["tmhmm -short %s > %s" % (fasta, temp) for fasta, temp in zip(fasta_files, temp_files)] @@ -93,6 +95,10 @@ for f in file_list: if os.path.isfile(f): os.remove(f) + try: + os.rmdir(tmp_dir) + except: + pass if len(jobs) > 1 and num_threads > 1: #A small "info" message for Galaxy to show the user. @@ -105,8 +111,7 @@ output = open(temp).readline() except IOError: output = "" - clean_up(fasta_files) - clean_up(temp_files) + clean_up(fasta_files + temp_files) stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), error_level) del results @@ -119,10 +124,8 @@ count = clean_tabular(data_handle, out_handle) data_handle.close() if not count: - clean_up(fasta_files) - clean_up(temp_files) + clean_up(fasta_files + temp_files) stop_err("No output from tmhmm2") out_handle.close() -clean_up(fasta_files) -clean_up(temp_files) +clean_up(fasta_files + temp_files)
--- a/tools/protein_analysis/tmhmm2.xml Tue Jun 07 17:41:38 2011 -0400 +++ b/tools/protein_analysis/tmhmm2.xml Tue Mar 26 14:24:56 2013 -0400 @@ -1,9 +1,19 @@ -<tool id="tmhmm2" name="TMHMM 2.0" version="0.0.7"> +<tool id="tmhmm2" name="TMHMM 2.0" version="0.0.9"> <description>Find transmembrane domains in protein sequences</description> + <!-- If job splitting is enabled, break up the query file into parts --> + <!-- Using 2000 chunks meaning 4 threads doing 500 each is ideal --> + <parallelism method="basic" split_inputs="fasta_file" split_mode="to_size" split_size="2000" merge_outputs="tabular_file"></parallelism> <command interpreter="python"> - tmhmm2.py 8 $fasta_file $tabular_file - ##I want the number of threads to be a Galaxy config option... + tmhmm2.py "\$NSLOTS" $fasta_file $tabular_file + ##Set the number of threads in the runner entry in universe_wsgi.ini + ##which (on SGE at least) will set the $NSLOTS environment variable. + ##If the environment variable isn't set, get "", and defaults to one. </command> + <stdio> + <!-- Anything other than zero is an error --> + <exit_code range="1:" /> + <exit_code range=":-1" /> + </stdio> <inputs> <param name="fasta_file" type="data" format="fasta" label="FASTA file of protein sequences"/> <!--
--- a/tools/protein_analysis/wolf_psort.py Tue Jun 07 17:41:38 2011 -0400 +++ b/tools/protein_analysis/wolf_psort.py Tue Mar 26 14:24:56 2013 -0400 @@ -35,13 +35,13 @@ """ import sys import os -from seq_analysis_utils import stop_err, split_fasta, run_jobs +from seq_analysis_utils import stop_err, split_fasta, run_jobs, thread_count FASTA_CHUNK = 500 exe = "runWolfPsortSummary" """ -Note: I had trouble getting runWolfPsortSummary on the path (via a link, other +Note: I had trouble getting runWolfPsortSummary on the path (via a link), other than by including all of /opt/WoLFPSORT_package_v0.2/bin , so used a wrapper python script called runWolfPsortSummary as follows: @@ -65,15 +65,8 @@ if organism not in ["animal", "plant", "fungi"]: stop_err("Organism argument %s is not one of animal, plant, fungi" % organism) -try: - num_threads = int(sys.argv[2]) -except: - num_threads = 0 -if num_threads < 1: - stop_err("Threads argument %s is not a positive integer" % sys.argv[2]) - +num_threads = thread_count(sys.argv[2], default=4) fasta_file = sys.argv[3] - tabular_file = sys.argv[4] def clean_tabular(raw_handle, out_handle):
--- a/tools/protein_analysis/wolf_psort.xml Tue Jun 07 17:41:38 2011 -0400 +++ b/tools/protein_analysis/wolf_psort.xml Tue Mar 26 14:24:56 2013 -0400 @@ -1,9 +1,14 @@ -<tool id="wolf_psort" name="WoLF PSORT" version="0.0.1"> +<tool id="wolf_psort" name="WoLF PSORT" version="0.0.2"> <description>Eukaryote protein subcellular localization prediction</description> <command interpreter="python"> wolf_psort.py $organism 8 $fasta_file $tabular_file ##I want the number of threads to be a Galaxy config option... </command> + <stdio> + <!-- Anything other than zero is an error --> + <exit_code range="1:" /> + <exit_code range=":-1" /> + </stdio> <inputs> <param name="fasta_file" type="data" format="fasta" label="FASTA file of protein sequences"/> <param name="organism" type="select" display="radio" label="Organism">