Mercurial > repos > peterjc > clinod
changeset 0:f886a0b8b117 draft
Uploaded v0.0.4
author | peterjc |
---|---|
date | Tue, 16 Apr 2013 13:06:20 -0400 |
parents | |
children | a4881144d6f4 |
files | test-data/four_human_proteins.clinod-1.3.tabular test-data/four_human_proteins.fasta tools/protein_analysis/clinod.txt tools/protein_analysis/clinod.xml |
diffstat | 4 files changed, 224 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/four_human_proteins.clinod-1.3.tabular Tue Apr 16 13:06:20 2013 -0400 @@ -0,0 +1,4 @@ +#ID Start End NOLS +sp|Q9NSY1|BMP2K_HUMAN 965 998 SQQQKVKQRSLQKLSSRQRRTKQDMSKSNGKRHH +sp|Q9NSY1|BMP2K_HUMAN 1000 1035 TPTSTKKTLKPTYRTPERARRHKKVGRRDSQSSNEF +sp|P06213|INSR_HUMAN 286 307 CQDLHHKCKNSRRQGCHQYVIH
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/four_human_proteins.fasta Tue Apr 16 13:06:20 2013 -0400 @@ -0,0 +1,61 @@ +>sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1 +MHPAVFLSLPDLRCSLLLLVTWVFTPVTTEITSLDTENIDEILNNADVALVNFYADWCRF +SQMLHPIFEEASDVIKEEFPNENQVVFARVDCDQHSDIAQRYRISKYPTLKLFRNGMMMK +REYRGQRSVKALADYIRQQKSDPIQEIRDLAEITTLDRSKRNIIGYFEQKDSDNYRVFER +VANILHDDCAFLSAFGDVSKPERYSGDNIIYKPPGHSAPDMVYLGAMTNFDVTYNWIQDK +CVPLVREITFENGEELTEEGLPFLILFHMKEDTESLEIFQNEVARQLISEKGTINFLHAD +CDKFRHPLLHIQKTPADCPVIAIDSFRHMYVFGDFKDVLIPGKLKQFVFDLHSGKLHREF +HHGPDPTDTAPGEQAQDVASSPPESSFQKLAPSEYRYTLLRDRDEL +>sp|Q9NSY1|BMP2K_HUMAN BMP-2-inducible protein kinase OS=Homo sapiens GN=BMP2K PE=1 SV=2 +MKKFSRMPKSEGGSGGGAAGGGAGGAGAGAGCGSGGSSVGVRVFAVGRHQVTLEESLAEG +GFSTVFLVRTHGGIRCALKRMYVNNMPDLNVCKREITIMKELSGHKNIVGYLDCAVNSIS +DNVWEVLILMEYCRAGQVVNQMNKKLQTGFTEPEVLQIFCDTCEAVARLHQCKTPIIHRD +LKVENILLNDGGNYVLCDFGSATNKFLNPQKDGVNVVEEEIKKYTTLSYRAPEMINLYGG +KPITTKADIWALGCLLYKLCFFTLPFGESQVAICDGNFTIPDNSRYSRNIHCLIRFMLEP +DPEHRPDIFQVSYFAFKFAKKDCPVSNINNSSIPSALPEPMTASEAAARKSQIKARITDT +IGPTETSIAPRQRPKANSATTATPSVLTIQSSATPVKVLAPGEFGNHRPKGALRPGNGPE +ILLGQGPPQQPPQQHRVLQQLQQGDWRLQQLHLQHRHPHQQQQQQQQQQQQQQQQQQQQQ +QQQQQQHHHHHHHHLLQDAYMQQYQHATQQQQMLQQQFLMHSVYQPQPSASQYPTMMPQY +QQAFFQQQMLAQHQPSQQQASPEYLTSPQEFSPALVSYTSSLPAQVGTIMDSSYSANRSV +ADKEAIANFTNQKNISNPPDMSGWNPFGEDNFSKLTEEELLDREFDLLRSNRLEERASSD +KNVDSLSAPHNHPPEDPFGSVPFISHSGSPEKKAEHSSINQENGTANPIKNGKTSPASKD +QRTGKKTSVQGQVQKGNDESESDFESDPPSPKSSEEEEQDDEEVLQGEQGDFNDDDTEPE +NLGHRPLLMDSEDEEEEEKHSSDSDYEQAKAKYSDMSSVYRDRSGSGPTQDLNTILLTSA +QLSSDVAVETPKQEFDVFGAVPFFAVRAQQPQQEKNEKNLPQHRFPAAGLEQEEFDVFTK +APFSKKVNVQECHAVGPEAHTIPGYPKSVDVFGSTPFQPFLTSTSKSESNEDLFGLVPFD +EITGSQQQKVKQRSLQKLSSRQRRTKQDMSKSNGKRHHGTPTSTKKTLKPTYRTPERARR +HKKVGRRDSQSSNEFLTISDSKENISVALTDGKDRGNVLQPEESLLDPFGAKPFHSPDLS +WHPPHQGLSDIRADHNTVLPGRPRQNSLHGSFHSADVLKMDDFGAVPFTELVVQSITPHQ +SQQSQPVELDPFGAAPFPSKQ +>sp|P06213|INSR_HUMAN Insulin receptor OS=Homo 
sapiens GN=INSR PE=1 SV=4 +MATGGRRGAAAAPLLVAVAALLLGAAGHLYPGEVCPGMDIRNNLTRLHELENCSVIEGHL +QILLMFKTRPEDFRDLSFPKLIMITDYLLLFRVYGLESLKDLFPNLTVIRGSRLFFNYAL +VIFEMVHLKELGLYNLMNITRGSVRIEKNNELCYLATIDWSRILDSVEDNYIVLNKDDNE +ECGDICPGTAKGKTNCPATVINGQFVERCWTHSHCQKVCPTICKSHGCTAEGLCCHSECL +GNCSQPDDPTKCVACRNFYLDGRCVETCPPPYYHFQDWRCVNFSFCQDLHHKCKNSRRQG +CHQYVIHNNKCIPECPSGYTMNSSNLLCTPCLGPCPKVCHLLEGEKTIDSVTSAQELRGC +TVINGSLIINIRGGNNLAAELEANLGLIEEISGYLKIRRSYALVSLSFFRKLRLIRGETL +EIGNYSFYALDNQNLRQLWDWSKHNLTITQGKLFFHYNPKLCLSEIHKMEEVSGTKGRQE +RNDIALKTNGDQASCENELLKFSYIRTSFDKILLRWEPYWPPDFRDLLGFMLFYKEAPYQ +NVTEFDGQDACGSNSWTVVDIDPPLRSNDPKSQNHPGWLMRGLKPWTQYAIFVKTLVTFS +DERRTYGAKSDIIYVQTDATNPSVPLDPISVSNSSSQIILKWKPPSDPNGNITHYLVFWE +RQAEDSELFELDYCLKGLKLPSRTWSPPFESEDSQKHNQSEYEDSAGECCSCPKTDSQIL +KELEESSFRKTFEDYLHNVVFVPRKTSSGTGAEDPRPSRKRRSLGDVGNVTVAVPTVAAF +PNTSSTSVPTSPEEHRPFEKVVNKESLVISGLRHFTGYRIELQACNQDTPEERCSVAAYV +SARTMPEAKADDIVGPVTHEIFENNVVHLMWQEPKEPNGLIVLYEVSYRRYGDEELHLCV +SRKHFALERGCRLRGLSPGNYSVRIRATSLAGNGSWTEPTYFYVTDYLDVPSNIAKIIIG +PLIFVFLFSVVIGSIYLFLRKRQPDGPLGPLYASSNPEYLSASDVFPCSVYVPDEWEVSR +EKITLLRELGQGSFGMVYEGNARDIIKGEAETRVAVKTVNESASLRERIEFLNEASVMKG +FTCHHVVRLLGVVSKGQPTLVVMELMAHGDLKSYLRSLRPEAENNPGRPPPTLQEMIQMA +AEIADGMAYLNAKKFVHRDLAARNCMVAHDFTVKIGDFGMTRDIYETDYYRKGGKGLLPV +RWMAPESLKDGVFTTSSDMWSFGVVLWEITSLAEQPYQGLSNEQVLKFVMDGGYLDQPDN +CPERVTDLMRMCWQFNPKMRPTFLEIVNLLKDDLHPSFPEVSFFHSEENKAPESEELEME +FEDMENVPLDRSSHCQREEAGGRDGGSSLGFKRSYEEHIPYTHMNGGKKNGRILTLPRSN +PS +>sp|P08100|OPSD_HUMAN Rhodopsin OS=Homo sapiens GN=RHO PE=1 SV=1 +MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLY +VTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLG +GEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIP +EGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQES +ATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAI +YNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/protein_analysis/clinod.txt Tue Apr 16 13:06:20 2013 -0400 @@ -0,0 +1,89 @@ +Galaxy wrapper for Command line NoD predictor (v1.3) +==================================================== + +This wrapper is copyright 2011 by Peter Cock, The James Hutton Institute +(formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved. +See the licence text below. + +Command line NoD predictor is a tool for predicting nucleolar localization +sequences (NoLSs) in a FASTA file of proteins using a neural network. There +is also a webtool version at http://www.compbio.dundee.ac.uk/www-nod/ + + +Installation +============ +This wrapper expects the java binary to be on the system PATH, and to be able +to access command line NoD as /opt/clinod/clinod-1.3.jar (edit clinod.xml if +you wish to use a different location). + +Internally NoD calls the binary batchman from the Stuttgart Neural Network +Simulator (SNNS) v 4.2 software suite. This binary can either be on the system +path or located next to the JAR file, i.e. /opt/clinod/batchman + +To install the wrapper copy or move the following files under the Galaxy tools +folder, e.g. in a tools/protein_analysis folder: + +* clinod.xml (the Galaxy tool definition) +* clinod.txt (this README file) + +You will also need to modify the tool_conf.xml file to tell Galaxy to offer the +tool. If you are using other protein analysis tools like TMHMM or SignalP, put +it next to them. Just add the line: + +<tool file="protein_analysis/clinod.xml" /> + +That's it. 
+ + +History +======= + +v0.0.1 - Initial public release +v0.0.2 - Treat non-zero return codes as errors +v0.0.3 - Describe output table in help +v0.0.4 - Added unit test + + +Developers +========== + +This script and related tools are being developed on the following hg branch: +http://bitbucket.org/peterjc/galaxy-central/src/tools + +For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball use +the following command from the Galaxy root folder: + +$ tar -czf clinod.tar.gz tools/protein_analysis/clinod.xml tools/protein_analysis/clinod.txt test-data/four_human_proteins.fasta test-data/four_human_proteins.clinod-1.3.tabular + +Check this worked: + +$ tar -tzf clinod.tar.gz +tools/protein_analysis/clinod.xml +tools/protein_analysis/clinod.txt +test-data/four_human_proteins.fasta +test-data/four_human_proteins.clinod-1.3.tabular + + +Licence (MIT/BSD style) +======================= + +Permission to use, copy, modify, and distribute this software and its +documentation with or without modifications and for any purpose and +without fee is hereby granted, provided that any copyright notices +appear in all copies and that both those copyright notices and this +permission notice appear in supporting documentation, and that the +names of the contributors or copyright holders not be used in +advertising or publicity pertaining to distribution of the software +without specific prior permission. + +THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL +WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE +CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT +OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +OR PERFORMANCE OF THIS SOFTWARE. 
+ +NOTE: This is the licence for the Galaxy Wrapper only. Command line +NoD is available and licenced separately.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/protein_analysis/clinod.xml Tue Apr 16 13:06:20 2013 -0400 @@ -0,0 +1,70 @@ +<tool id="clinod" name="Nucleolar localization sequence Detector (NoD)" version="0.0.4"> + <description>Find nucleolar localization signals (NoLSs) in protein sequences</description> + <command> + java -jar /opt/clinod/clinod-1.3.jar -in="$fasta_file" -out="$tabular_file" -t=8 -f=MEDIUM_TAB -nonols -clean_sequence + ##I want the number of threads to be a Galaxy config option... + ##TODO - Make the -clean_sequence argument a parameter? + </command> + <stdio> + <!-- Assume anything other than zero is an error --> + <exit_code range="1:" /> + <exit_code range=":-1" /> + </stdio> + <inputs> + <param name="fasta_file" type="data" format="fasta" label="FASTA file of protein sequences"/> + </inputs> + <outputs> + <data name="tabular_file" format="tabular" label="NoD results" /> + </outputs> + <requirements> + <requirement type="binary">java</requirement> + </requirements> + <tests> + <test> + <param name="fasta_file" value="four_human_proteins.fasta" ftype="fasta" /> + <output name="tabular_file" file="four_human_proteins.clinod-1.3.tabular" ftype="tabular" /> + </test> + </tests> + <help> + +**What it does** + +This calls the command line version of the NoD tool from the Barton Group for +prediction of nucleolar localization sequences (NoLSs). The NoD tool uses an +artificial neural network trained on a set of human NoLSs. + +The nucleolus is a sub-compartment of the nucleus, thus an NoLS can be regarded +as a special nuclear localization sequence (NLS). 
+ +The input is a FASTA file of protein sequences, and the output is tabular with +four columns (multiple rows per protein): + +====== =================== +Column Description +------ ------------------- + 1 Sequence identifier + 2 Start of NoLS + 3 End of NoLS + 4 NoLS sequence +====== =================== + +If a sequence has no predicted NoLS, then there is no line in the output file +for it. + + +**References** + +M. S. Scott, F. M. Boisvert, M. D. McDowall, A. I. Lamond and G. J. Barton. +Characterization and prediction of protein nucleolar localization sequences. +Nucleic Acids Research 38(21), 7388-7399, 2010. +http://dx.doi.org/10.1093/nar/gkq653 + +M. S. Scott, P. V. Troshin and G. J. Barton. +NoD: a Nucleolar localization sequence detector for eukaryotic and viral proteins. +BMC Bioinformatics, 12:317, 2011. +http://dx.doi.org/10.1186/1471-2105-12-317 + +http://www.compbio.dundee.ac.uk/www-nod/ + + </help> +</tool>