Mercurial > repos > peterjc > ncbi_blast_plus
changeset 40:f83e5d79b6ab draft
Uploaded v0.1.0 preview 3, adds a missing test file & more tests for makeblastdb
author | peterjc |
---|---|
date | Wed, 26 Feb 2014 10:35:01 -0500 |
parents | 22b7cdcf4960 |
children | af4da561893b |
files | test-data/blastn_rhodopsin_vs_three_human_converted.tabular test-data/convert2blastmask_four_human_masked.maskinfo-asn1 test-data/convert2blastmask_four_human_masked.maskinfo-asn1-binary test-data/four_human_proteins.fasta.log test-data/four_human_proteins.fasta.log.txt test-data/four_human_proteins_taxid.fasta.log test-data/four_human_proteins_taxid.fasta.log.txt test-data/four_human_proteins_taxid.fasta.pin test-data/segmasker_four_human.fasta test-data/segmasker_four_human.maskinfo-asn1 test-data/segmasker_four_human.maskinfo-asn1-binary tools/ncbi_blast_plus/README.rst tools/ncbi_blast_plus/ncbi_convert2blastmask_wrapper.xml tools/ncbi_blast_plus/ncbi_dustmasker_wrapper.xml tools/ncbi_blast_plus/ncbi_makeblastdb.xml tools/ncbi_blast_plus/ncbi_segmasker_wrapper.xml |
diffstat | 16 files changed, 605 insertions(+), 91 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/blastn_rhodopsin_vs_three_human_converted.tabular Wed Feb 26 10:35:01 2014 -0500 @@ -0,0 +1,7 @@ +gi|57163782|ref|NM_001009242.1| ENA|BC112106|BC112106.1 92.07 1047 83 0 1 1047 88 1134 0.0 1474 +gi|283855845|gb|GQ290303.1| ENA|BC112106|BC112106.1 91.59 333 28 0 1 333 118 450 4e-132 460 +gi|283855845|gb|GQ290303.1| ENA|BC112106|BC112106.1 91.36 243 19 2 3127 3368 782 1023 3e-93 331 +gi|283855845|gb|GQ290303.1| ENA|BC112106|BC112106.1 94.22 173 10 0 1410 1582 448 620 3e-73 265 +gi|283855845|gb|GQ290303.1| ENA|BC112106|BC112106.1 92.94 170 12 0 2854 3023 615 784 3e-68 248 +gi|283855822|gb|GQ290312.1| ENA|BC112106|BC112106.1 91.55 959 81 0 1 959 118 1076 0.0 1323 +gi|18148870|dbj|AB062417.1| ENA|BC112106|BC112106.1 87.50 1048 129 2 1 1047 88 1134 0.0 1208
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/convert2blastmask_four_human_masked.maskinfo-asn1 Wed Feb 26 10:35:01 2014 -0500 @@ -0,0 +1,158 @@ +Blast-db-mask-info ::= { + algo-id 0, + algo-program seg, + algo-options "window=12; locut=2.2; hicut=2.5", + masks { + masks { + int { + from 6, + to 18, + id swissprot { + name "ERP44_HUMAN", + accession "Q9BS26", + release "reviewed" + } + }, + packed-int { + { + from 11, + to 46, + id swissprot { + name "BMP2K_HUMAN", + accession "Q9NSY1", + release "reviewed" + } + }, + { + from 325, + to 332, + id swissprot { + name "BMP2K_HUMAN", + accession "Q9NSY1", + release "reviewed" + } + }, + { + from 421, + to 496, + id swissprot { + name "BMP2K_HUMAN", + accession "Q9NSY1", + release "reviewed" + } + }, + { + from 501, + to 516, + id swissprot { + name "BMP2K_HUMAN", + accession "Q9NSY1", + release "reviewed" + } + }, + { + from 536, + to 558, + id swissprot { + name "BMP2K_HUMAN", + accession "Q9NSY1", + release "reviewed" + } + }, + { + from 636, + to 648, + id swissprot { + name "BMP2K_HUMAN", + accession "Q9NSY1", + release "reviewed" + } + }, + { + from 737, + to 762, + id swissprot { + name "BMP2K_HUMAN", + accession "Q9NSY1", + release "reviewed" + } + }, + { + from 789, + to 806, + id swissprot { + name "BMP2K_HUMAN", + accession "Q9NSY1", + release "reviewed" + } + }, + { + from 970, + to 983, + id swissprot { + name "BMP2K_HUMAN", + accession "Q9NSY1", + release "reviewed" + } + }, + { + from 999, + to 1010, + id swissprot { + name "BMP2K_HUMAN", + accession "Q9NSY1", + release "reviewed" + } + } + }, + packed-int { + { + from 3, + to 26, + id swissprot { + name "INSR_HUMAN", + accession "P06213", + release "reviewed" + } + }, + { + from 372, + to 390, + id swissprot { + name "INSR_HUMAN", + accession "P06213", + release "reviewed" + } + }, + { + from 766, + to 791, + id swissprot { + name "INSR_HUMAN", + accession "P06213", + release "reviewed" + } + }, + { + from 1312, + to 1324, + id swissprot { 
+ name "INSR_HUMAN", + accession "P06213", + release "reviewed" + } + } + }, + int { + from 230, + to 246, + id swissprot { + name "OPSD_HUMAN", + accession "P08100", + release "reviewed" + } + } + }, + more FALSE + } +}
--- a/test-data/four_human_proteins.fasta.log Thu Feb 20 05:39:48 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,10 +0,0 @@ - - -Building a new DB, current time: 11/21/2013 11:16:27 -New DB name: /tmp/tmpnSjpCP/tmpwAbNo4/database/files/000/dataset_2_files/blastdb -New DB title: Just 4 human proteins -Sequence type: Protein -Keep Linkouts: T -Keep MBits: T -Maximum file size: 1000000000B -Adding sequences from FASTA; added 4 sequences in 0.00202417 seconds.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/four_human_proteins.fasta.log.txt Wed Feb 26 10:35:01 2014 -0500 @@ -0,0 +1,5 @@ +New DB title: Just 4 human proteins +Sequence type: Protein +Keep Linkouts: T +Keep MBits: T +Maximum file size: 1000000000B
--- a/test-data/four_human_proteins_taxid.fasta.log Thu Feb 20 05:39:48 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,10 +0,0 @@ - - -Building a new DB, current time: 02/10/2014 18:40:09 -New DB name: four_human_proteins_taxid.fasta -New DB title: Just 4 human proteins -Sequence type: Protein -Keep Linkouts: T -Keep MBits: T -Maximum file size: 1000000000B -Adding sequences from FASTA; added 4 sequences in 0.00230002 seconds.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/four_human_proteins_taxid.fasta.log.txt Wed Feb 26 10:35:01 2014 -0500 @@ -0,0 +1,5 @@ +New DB title: Just 4 human proteins +Sequence type: Protein +Keep Linkouts: T +Keep MBits: T +Maximum file size: 1000000000B
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/segmasker_four_human.fasta Wed Feb 26 10:35:01 2014 -0500 @@ -0,0 +1,61 @@ +>sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1 +MHPAVFlslpdlrcsllllVTWVFTPVTTEITSLDTENIDEILNNADVALVNFYADWCRF +SQMLHPIFEEASDVIKEEFPNENQVVFARVDCDQHSDIAQRYRISKYPTLKLFRNGMMMK +REYRGQRSVKALADYIRQQKSDPIQEIRDLAEITTLDRSKRNIIGYFEQKDSDNYRVFER +VANILHDDCAFLSAFGDVSKPERYSGDNIIYKPPGHSAPDMVYLGAMTNFDVTYNWIQDK +CVPLVREITFENGEELTEEGLPFLILFHMKEDTESLEIFQNEVARQLISEKGTINFLHAD +CDKFRHPLLHIQKTPADCPVIAIDSFRHMYVFGDFKDVLIPGKLKQFVFDLHSGKLHREF +HHGPDPTDTAPGEQAQDVASSPPESSFQKLAPSEYRYTLLRDRDEL +>sp|Q9NSY1|BMP2K_HUMAN BMP-2-inducible protein kinase OS=Homo sapiens GN=BMP2K PE=1 SV=2 +MKKFSRMPKSEggsgggaagggaggagagagcgsggssvgvrvfavgRHQVTLEESLAEG +GFSTVFLVRTHGGIRCALKRMYVNNMPDLNVCKREITIMKELSGHKNIVGYLDCAVNSIS +DNVWEVLILMEYCRAGQVVNQMNKKLQTGFTEPEVLQIFCDTCEAVARLHQCKTPIIHRD +LKVENILLNDGGNYVLCDFGSATNKFLNPQKDGVNVVEEEIKKYTTLSYRAPEMINLYGG +KPITTKADIWALGCLLYKLCFFTLPFGESQVAICDGNFTIPDNSRYSRNIHCLIRFMLEP +DPEHRPDIFQVSYFAFKFAKKDCPVsninnssiPSALPEPMTASEAAARKSQIKARITDT +IGPTETSIAPRQRPKANSATTATPSVLTIQSSATPVKVLAPGEFGNHRPKGALRPGNGPE +Illgqgppqqppqqhrvlqqlqqgdwrlqqlhlqhrhphqqqqqqqqqqqqqqqqqqqqq +qqqqqqhhhhhhhhllqDAYMqqyqhatqqqqmlqqqFLMHSVYQPQPSASQYPTMmpqy +qqaffqqqmlaqhqpsqqqASPEYLTSPQEFSPALVSYTSSLPAQVGTIMDSSYSANRSV +ADKEAIANFTNQKNISNPPDMSGWNPFGEDNFSKLTeeelldrefdllrSNRLEERASSD +KNVDSLSAPHNHPPEDPFGSVPFISHSGSPEKKAEHSSINQENGTANPIKNGKTSPASKD +QRTGKKTSVQGQVQKGNdesesdfesdppspksseeeeqddeeVLQGEQGDFNDDDTEPE +NLGHRPLLMdsedeeeeekhssdsdyeQAKAKYSDMSSVYRDRSGSGPTQDLNTILLTSA +QLSSDVAVETPKQEFDVFGAVPFFAVRAQQPQQEKNEKNLPQHRFPAAGLEQEEFDVFTK +APFSKKVNVQECHAVGPEAHTIPGYPKSVDVFGSTPFQPFLTSTSKSESNEDLFGLVPFD +EITGSQQQKVkqrslqklssrqrrTKQDMSKSNGKRHHGtptstkktlkptYRTPERARR +HKKVGRRDSQSSNEFLTISDSKENISVALTDGKDRGNVLQPEESLLDPFGAKPFHSPDLS +WHPPHQGLSDIRADHNTVLPGRPRQNSLHGSFHSADVLKMDDFGAVPFTELVVQSITPHQ +SQQSQPVELDPFGAAPFPSKQ +>sp|P06213|INSR_HUMAN Insulin receptor OS=Homo 
sapiens GN=INSR PE=1 SV=4 +MATggrrgaaaapllvavaalllgaagHLYPGEVCPGMDIRNNLTRLHELENCSVIEGHL +QILLMFKTRPEDFRDLSFPKLIMITDYLLLFRVYGLESLKDLFPNLTVIRGSRLFFNYAL +VIFEMVHLKELGLYNLMNITRGSVRIEKNNELCYLATIDWSRILDSVEDNYIVLNKDDNE +ECGDICPGTAKGKTNCPATVINGQFVERCWTHSHCQKVCPTICKSHGCTAEGLCCHSECL +GNCSQPDDPTKCVACRNFYLDGRCVETCPPPYYHFQDWRCVNFSFCQDLHHKCKNSRRQG +CHQYVIHNNKCIPECPSGYTMNSSNLLCTPCLGPCPKVCHLLEGEKTIDSVTSAQELRGC +TVINGSLIINIRggnnlaaeleanlglieeiSGYLKIRRSYALVSLSFFRKLRLIRGETL +EIGNYSFYALDNQNLRQLWDWSKHNLTITQGKLFFHYNPKLCLSEIHKMEEVSGTKGRQE +RNDIALKTNGDQASCENELLKFSYIRTSFDKILLRWEPYWPPDFRDLLGFMLFYKEAPYQ +NVTEFDGQDACGSNSWTVVDIDPPLRSNDPKSQNHPGWLMRGLKPWTQYAIFVKTLVTFS +DERRTYGAKSDIIYVQTDATNPSVPLDPISVSNSSSQIILKWKPPSDPNGNITHYLVFWE +RQAEDSELFELDYCLKGLKLPSRTWSPPFESEDSQKHNQSEYEDSAGECCSCPKTDSQIL +KELEESSFRKTFEDYLHNVVFVPRKTSSGTGAEDPRPSRKRRSLGDvgnvtvavptvaaf +pntsstsvptspEEHRPFEKVVNKESLVISGLRHFTGYRIELQACNQDTPEERCSVAAYV +SARTMPEAKADDIVGPVTHEIFENNVVHLMWQEPKEPNGLIVLYEVSYRRYGDEELHLCV +SRKHFALERGCRLRGLSPGNYSVRIRATSLAGNGSWTEPTYFYVTDYLDVPSNIAKIIIG +PLIFVFLFSVVIGSIYLFLRKRQPDGPLGPLYASSNPEYLSASDVFPCSVYVPDEWEVSR +EKITLLRELGQGSFGMVYEGNARDIIKGEAETRVAVKTVNESASLRERIEFLNEASVMKG +FTCHHVVRLLGVVSKGQPTLVVMELMAHGDLKSYLRSLRPEAENNPGRPPPTLQEMIQMA +AEIADGMAYLNAKKFVHRDLAARNCMVAHDFTVKIGDFGMTRDIYETDYYRKGGKGLLPV +RWMAPESLKDGVFTTSSDMWSFGVVLWEITSLAEQPYQGLSNEQVLKFVMDGGYLDQPDN +CPERVTDLMRMCWQFNPKMRPTFLEIVNLLKDDLHPSFPEVSFFHSEENKAPeseeleme +fedmeNVPLDRSSHCQREEAGGRDGGSSLGFKRSYEEHIPYTHMNGGKKNGRILTLPRSN +PS +>sp|P08100|OPSD_HUMAN Rhodopsin OS=Homo sapiens GN=RHO PE=1 SV=1 +MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLY +VTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLG +GEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIP +EGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVkeaaaqqqes +attqkaeKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAI +YNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/segmasker_four_human.maskinfo-asn1 Wed Feb 26 10:35:01 2014 -0500 @@ -0,0 +1,114 @@ +Blast-db-mask-info ::= { + algo-id 1, + algo-program seg, + algo-options "window=12; locut=2.2; hicut=2.5", + masks { + masks { + int { + from 6, + to 18, + id local id 1 + }, + packed-int { + { + from 11, + to 46, + id local id 2 + }, + { + from 325, + to 332, + id local id 2 + }, + { + from 421, + to 443, + id local id 2 + }, + { + from 437, + to 450, + id local id 2 + }, + { + from 447, + to 496, + id local id 2 + }, + { + from 501, + to 516, + id local id 2 + }, + { + from 536, + to 554, + id local id 2 + }, + { + from 545, + to 558, + id local id 2 + }, + { + from 636, + to 648, + id local id 2 + }, + { + from 737, + to 762, + id local id 2 + }, + { + from 789, + to 806, + id local id 2 + }, + { + from 970, + to 983, + id local id 2 + }, + { + from 999, + to 1010, + id local id 2 + } + }, + packed-int { + { + from 3, + to 26, + id local id 3 + }, + { + from 372, + to 390, + id local id 3 + }, + { + from 766, + to 782, + id local id 3 + }, + { + from 780, + to 791, + id local id 3 + }, + { + from 1312, + to 1324, + id local id 3 + } + }, + int { + from 230, + to 246, + id local id 4 + } + }, + more FALSE + } +}
--- a/tools/ncbi_blast_plus/README.rst Thu Feb 20 05:39:48 2014 -0500 +++ b/tools/ncbi_blast_plus/README.rst Wed Feb 26 10:35:01 2014 -0500 @@ -136,25 +136,29 @@ - Development moved to GitHub, https://github.com/peterjc/galaxy_blast - Updated citation information (Cock et al. 2013). v0.0.21 - Use macros to simplify the XML wrappers. - - Added wrapper for dustmasker - - Enabled masking for makeblastdb - - Requires 'maskinfo-asn1' and 'maskinfo-asn1-binary' datatypes + - Added wrapper for dustmasker. + - Enabled masking for makeblastdb. + - Requires 'maskinfo-asn1' and 'maskinfo-asn1-binary' datatypes defined in updated blast_datatypes on Galaxy ToolShed. - - Tests updated for BLAST+ 2.2.27 instead of BLAST+ 2.2.26 - - Now depends on package_blast_plus_2_2_27 in ToolShed -v0.0.22 - More use macros to simplify the wrappers - - Set number of threads via $GALAXY_SLOTS environment variable - - More descriptive default output names - - Tests require updated BLAST DB definitions (blast_datatypes v0.0.18) + - Tests updated for BLAST+ 2.2.27 instead of BLAST+ 2.2.26. + - Now depends on package_blast_plus_2_2_27 in ToolShed. +v0.0.22 - More use macros to simplify the wrappers. + - Set number of threads via $GALAXY_SLOTS environment variable. + - More descriptive default output names. + - Tests require updated BLAST DB definitions (blast_datatypes v0.0.18). - Pre-check for duplicate identifiers in makeblastdb wrapper. - - Tests updated for BLAST+ 2.2.28 instead of BLAST+ 2.2.27 - - Now depends on package_blast_plus_2_2_28 in ToolShed + - Tests updated for BLAST+ 2.2.28 instead of BLAST+ 2.2.27. + - Now depends on package_blast_plus_2_2_28 in ToolShed. - Extended tabular output includes 'salltitles' as column 25. -v0.1.00 - Now depends on package_blast_plus_2_2_29 in ToolShed - - Tablar output now includes option to pick specific columns +v0.1.00 - Now depends on package_blast_plus_2_2_29 in ToolShed. + - Tabular output now includes option to pick specific columns. 
- BLAST XML to tabular tool supports multiple input files. - - More detailed descriptions for BLASTN and BLASTP task option + - More detailed descriptions for BLASTN and BLASTP task option. + - Wrappers for segmasker, dustmasker and convert2blastmask. + - Supports using maskinfo with makeblastdb wrapper. - Supports setting a taxonomy ID in makeblastdb wrapper. + - Subtle changes like new conditional settings will require some old + workflows be updated to cope. ======= ======================================================================
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/ncbi_blast_plus/ncbi_convert2blastmask_wrapper.xml Wed Feb 26 10:35:01 2014 -0500 @@ -0,0 +1,87 @@ +<tool id="ncbi_convert2blastmask_wrapper" name="NCBI BLAST+ convert2blastmask" version="0.1.00"> + <description>Convert masking information in lower-case masked FASTA input to file formats suitable for makeblastdb</description> + <macros> + <token name="@BINARY@">convert2blastmask</token> + <import>ncbi_macros.xml</import> + </macros> + <expand macro="requirements" /> + <command> +## The command is a Cheetah template which allows some Python based syntax. +## Lines starting hash hash are comments. Galaxy will turn newlines into spaces +convert2blastmask +-in $infile +-masking_algorithm "$masking_algorithm" +-masking_options "$masking_options" +$parse_seqids +-out "$outfile" +-outfmt $outformat + </command> + <expand macro="stdio" /> + <inputs> + <param name="infile" type="data" format="fasta" label="masked FASTA file"/> + <param name="masking_algorithm" type="select" label="Used masking algorithm"> + <option value="dust">DUST</option> + <option value="seg" selected="true">SEG</option> + <option value="windowmasker">windowmasker</option> + <option value="repeat">repeat</option> + <option value="other">other</option> + </param> + <param name="masking_options" type="text" value="" size="20" label="Masking algorithm options to create the masked input" + help ="free text to describe the options used to create the masking files. 
(-masking_options)"> + <sanitizer invalid_char=""> + <valid initial="string.printable" /> + </sanitizer> + </param> + <param name="parse_seqids" type="boolean" truevalue="-parse_seqids" falsevalue="" checked="true" label="Parse Seq-ids in FASTA input" help="(-parse_seqids)" /> + <param name="outformat" type="select" label="Output format"> + <option value="maskinfo_asn1_bin">maskinfo ASN.1 binary</option> + <option value="maskinfo_asn1_text" selected="True">maskinfo ASN.1 text</option> + <option value="maskinfo_xml">maskinfo_xml</option> + </param> + </inputs> + <outputs> + <data name="outfile" format="maskinfo-asn1" label="SEG Masked File"> + <change_format> + <when input="outformat" value="maskinfo_asn1_bin" format="maskinfo-asn1-binary" /> + <!-- + <when input="outformat" value="maskinfo_asn1_text" format="maskinfo-asn1" /> + --> + <when input="outformat" value="maskinfo_xml" format="xml" /> + </change_format> + </data> + </outputs> + <tests> + <test> + <param name="infile" value="four_human_proteins_masked.fasta" ftype="fasta" /> + <param name="masking_algorithm" value="seg" /> + <param name="masking_options" value="window=12; locut=2.2; hicut=2.5" /> + <param name="parse_seqids" value="True" /> + <param name="outformat" value="maskinfo_asn1_bin" /> + <output name="outfile" file="convert2blastmask_four_human_masked.maskinfo-asn1-binary" /> + </test> + <test> + <param name="infile" value="four_human_proteins_masked.fasta" ftype="fasta" /> + <param name="masking_algorithm" value="seg" /> + <param name="masking_options" value="window=12; locut=2.2; hicut=2.5" /> + <param name="parse_seqids" value="True" /> + <param name="outformat" value="maskinfo_asn1_text" /> + <output name="outfile" file="convert2blastmask_four_human_masked.maskinfo-asn1" /> + </test> + </tests> + <help> +**What it does** + +Convert masking information in lower-case masked FASTA input to file formats suitable for makeblastdb. 
+ +More information about convert2blastmask can be found in the `BLAST Command Line Applications User Manual`_. + +.. _BLAST Command Line Applications User Manual: http://www.ncbi.nlm.nih.gov/books/NBK1763/ + +**References** + +If you use this Galaxy tool in work leading to a scientific publication please +cite the following papers (a more specific paper covering this wrapper is planned): + +@REFERENCES@ + </help> +</tool>
--- a/tools/ncbi_blast_plus/ncbi_dustmasker_wrapper.xml Thu Feb 20 05:39:48 2014 -0500 +++ b/tools/ncbi_blast_plus/ncbi_dustmasker_wrapper.xml Wed Feb 26 10:35:01 2014 -0500 @@ -27,27 +27,24 @@ <param name="level" type="integer" value="20" label="DUST level" help="Score threshold for subwindows" /> <param name="linker" type="integer" value="1" label="DUST linker" help="How close masked intervals should be to get merged together" /> <param name="outformat" type="select" label="Output format"> -<!-- acclist and maskinfo_xml are listed as possible output formats in - "dustmasker -help", but were not recognized by NCBI BLAST up to - release 2.2.27+. Fixed in BLAST 2.2.28+. - seqloc_* formats are not very useful --> -<!-- <option value="acclist">acclist</option>--> + <!-- seqloc_* formats are not very useful + and what BLAST+ calls 'interval' is not what Galaxy calls interval format + --> <option value="fasta">FASTA</option> - <option value="interval" selected="true">interval</option> <option value="maskinfo_asn1_bin">maskinfo ASN.1 binary</option> - <option value="maskinfo_asn1_text">maskinfo ASN.1 text</option> -<!-- <option value="maskinfo_xml">maskinfo_xml</option> - <option value="seqloc_asn1_bin">seqloc_asn1_bin</option> - <option value="seqloc_asn1_text">seqloc_asn1_text</option> - <option value="seqloc_xml">seqloc_xml</option>--> + <option value="maskinfo_asn1_text" selected="true">maskinfo ASN.1 text</option> + <option value="maskinfo_xml">maskinfo_xml</option> </param> </inputs> <outputs> - <data name="outfile" format="interval" label="DUST Masked File"> + <data name="outfile" format="maskinfo-asn1" label="DUST Masked File"> <change_format> <when input="outformat" value="fasta" format="fasta" /> <when input="outformat" value="maskinfo_asn1_bin" format="maskinfo-asn1-binary" /> + <!-- <when input="outformat" value="maskinfo_asn1_text" format="maskinfo-asn1" /> + --> + <when input="outformat" value="maskinfo_xml" format="xml" /> </change_format> </data> 
</outputs>
--- a/tools/ncbi_blast_plus/ncbi_makeblastdb.xml Thu Feb 20 05:39:48 2014 -0500 +++ b/tools/ncbi_blast_plus/ncbi_makeblastdb.xml Wed Feb 26 10:35:01 2014 -0500 @@ -8,21 +8,15 @@ <command interpreter="python">check_no_duplicates.py ##First check for duplicates (since BLAST+ 2.2.28 fails to do so) ##and abort (via the ampersand ampersand trick) if any are found. -#for $i in $in -"${i.file}" -#end for +#for i in $input_file#"${i}" #end for# && makeblastdb -out "${os.path.join($outfile.extra_files_path,'blastdb')}" $parse_seqids $hash_index ## Single call to -in with multiple filenames space separated with outer quotes ## (presumably any filenames with spaces would be a problem). Note this gives -## some extra spaces, e.g. -in " file1 file2 file3 " but BLAST seems happy: --in " -#for $i in $in -${i.file} -#end for -" +## some extra spaces, e.g. -in "file1 file2 file3 " but BLAST seems happy: +-in "#for i in $input_file#${i} #end for#" #if $title: -title "$title" #else: @@ -33,20 +27,13 @@ ## -------------------------------------------------------------------- ## Masking ## -------------------------------------------------------------------- -#set $mask_string = '' -#set $sep = '-mask_data ' -#for $i in $mask_data -#set $mask_string += $sep + str($i.file) -#set $sep = ',' +## HACK: If no mask files, evaluates as a list with just None in it: +## See Trello issue https://trello.com/c/lp5YmA1O +#if ' '.join( map(str, $mask_data_file) ) != 'None': +#for i in $mask_data_file: +-mask_data "${i}" #end for -$mask_string -## #set $gi_mask_string = '' -## #set $sep = '-gi_mask -gi_mask_name ' -## #for $i in $gi_mask -## #set $gi_mask_string += $sep + str($i.file) -## #set $sep = ',' -## #end for -## $gi_mask_string +#end if ## -------------------------------------------------------------------- ## Taxonomy ## -------------------------------------------------------------------- @@ -58,7 +45,7 @@ #end if ## -------------------------------------------------------------------- ## 
Capture the stdout log information to the primary file (plain text): ->> "$outfile" +> "$outfile" </command> <expand macro="stdio" /> <inputs> @@ -66,29 +53,18 @@ <option value="prot">protein</option> <option value="nucl">nucleotide</option> </param> - <!-- TODO Allow merging of existing BLAST databases (conditional on the database type) + <!-- TODO Allow merging of existing BLAST databases (conditional on the database type)? NOTE Double check the new database would be self contained first - <repeat name="in" title="BLAST or FASTA Database" min="1"> - <param name="file" type="data" format="fasta,blastdbn,blastdbp" label="BLAST or FASTA database" /> - </repeat> --> - <!-- TODO Switch this to using <param ... multiple="true" /> instead of <repeat> block? --> - <repeat name="in" title="FASTA file" min="1"> - <param name="file" type="data" format="fasta" /> - </repeat> + <!-- Note this is a mandatory parameter - default should be most recent FASTA file --> + <param name="input_file" type="data" multiple="true" optional="false" format="fasta" label="Input FASTA files(s)" help="One or more FASTA files" /> <param name="title" type="text" value="" label="Title for BLAST database" help="This is the database name shown in BLAST search output" /> <param name="parse_seqids" type="boolean" truevalue="-parse_seqids" falsevalue="" checked="False" label="Parse the sequence identifiers" help="This is only advised if your FASTA file follows the NCBI naming conventions using pipe '|' symbols" /> <param name="hash_index" type="boolean" truevalue="-hash_index" falsevalue="" checked="true" label="Enable the creation of sequence hash values" help="These hash values can then be used to quickly determine if a given sequence data exists in this BLAST database." 
/> <!-- SEQUENCE MASKING OPTIONS --> - <repeat name="mask_data" title="Masking data file"> - <param name="mask_data_file" type="data" format="maskinfo-asn1,maskinfo-asn1-binary" label="ASN.1 file containing masking data" help="As produced by NCBI masking applications (e.g. dustmasker, segmasker, windowmasker)" /> - </repeat> - <!-- TODO - <repeat name="gi_mask" title="Create GI indexed masking data"> - <param name="gi_mask_file" type="data" format="asnb" label="Masking data output file" /> - </repeat> - --> - + <!-- Note this is an optional parameter - default should be NO files --> + <param name="mask_data_file" type="data" multiple="true" optional="true" value="" format="maskinfo-asn1,maskinfo-asn1-binary" label="Optional ASN.1 file(s) containing masking data" help="As produced by NCBI masking applications (e.g. dustmasker, segmasker, windowmasker)" /> + <!-- TODO - Option to create GI indexed masking data? via -gi_mask and -gi_mask_name? --> <!-- TAXONOMY OPTIONS --> <conditional name="tax"> <param name="taxselect" type="select" label="Taxonomy options"> @@ -121,16 +97,17 @@ </outputs> <tests> <!-- Note the (two line) PIN file is not reproducible run to run. - Likewise there is a datestamp in the log file as well. + Likewise there is a datestamp in the log file as well, so use contains comparison + With and without the masking makes no difference. With and without the taxid the only real difference is in the *.phr file. 
--> <test> <param name="dbtype" value="prot" /> - <param name="file" value="four_human_proteins.fasta" ftype="fasta" /> + <param name="input_file" value="four_human_proteins.fasta" ftype="fasta" /> <param name="title" value="Just 4 human proteins" /> <param name="parse_seqids" value="" /> <param name="hash_index" value="true" /> - <output name="out_file" file="four_human_proteins.fasta.log" ftype="blastdbp" lines_diff="6"> + <output name="out_file" compare="contains" file="four_human_proteins.fasta.log.txt" ftype="blastdbp"> <extra_files type="file" value="four_human_proteins.fasta.phr" name="blastdb.phr" /> <extra_files type="file" value="four_human_proteins.fasta.pin" name="blastdb.pin" lines_diff="2" /> <extra_files type="file" value="four_human_proteins.fasta.psq" name="blastdb.psq" /> @@ -143,13 +120,13 @@ </test> <test> <param name="dbtype" value="prot" /> - <param name="file" value="four_human_proteins.fasta" ftype="fasta" /> + <param name="input_file" value="four_human_proteins.fasta" ftype="fasta" /> <param name="title" value="Just 4 human proteins" /> <param name="parse_seqids" value="" /> <param name="hash_index" value="true" /> <param name="taxselect" value="id" /> <param name="taxid" value="9606" /> - <output name="out_file" file="four_human_proteins_taxid.fasta.log" ftype="blastdbp" lines_diff="6"> + <output name="out_file" compare="contains" file="four_human_proteins_taxid.fasta.log.txt" ftype="blastdbp"> <extra_files type="file" value="four_human_proteins_taxid.fasta.phr" name="blastdb.phr" /> <extra_files type="file" value="four_human_proteins_taxid.fasta.pin" name="blastdb.pin" lines_diff="2" /> <extra_files type="file" value="four_human_proteins_taxid.fasta.psq" name="blastdb.psq" /> @@ -160,6 +137,24 @@ <extra_files type="file" value="four_human_proteins_taxid.fasta.psi" name="blastdb.psi" /> </output> </test> + <test> + <param name="dbtype" value="prot" /> + <param name="input_file" value="four_human_proteins.fasta" ftype="fasta" /> + <param 
name="title" value="Just 4 human proteins" /> + <param name="parse_seqids" value="" /> + <param name="hash_index" value="true" /> + <param name="mask_data_file" value="segmasker_four_human.maskinfo-asn1" ftype="maskinfo-asn1" /> + <output name="out_file" compare="contains" file="four_human_proteins.fasta.log.txt" ftype="blastdbp"> + <extra_files type="file" value="four_human_proteins.fasta.phr" name="blastdb.phr" /> + <extra_files type="file" value="four_human_proteins.fasta.pin" name="blastdb.pin" lines_diff="2" /> + <extra_files type="file" value="four_human_proteins.fasta.psq" name="blastdb.psq" /> + <extra_files type="file" value="four_human_proteins.fasta.pog" name="blastdb.pog" /> + <extra_files type="file" value="four_human_proteins.fasta.phd" name="blastdb.phd" /> + <extra_files type="file" value="four_human_proteins.fasta.phi" name="blastdb.phi" /> + <extra_files type="file" value="four_human_proteins.fasta.psd" name="blastdb.psd" /> + <extra_files type="file" value="four_human_proteins.fasta.psi" name="blastdb.psi" /> + </output> + </test> </tests> <help> **What it does**
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/ncbi_blast_plus/ncbi_segmasker_wrapper.xml Wed Feb 26 10:35:01 2014 -0500 @@ -0,0 +1,101 @@ +<tool id="ncbi_segmasker_wrapper" name="NCBI BLAST+ segmasker" version="0.1.00"> + <description>low-complexity regions in protein sequences</description> + <macros> + <token name="@BINARY@">segmasker</token> + <import>ncbi_macros.xml</import> + </macros> + <expand macro="requirements" /> + <command> +## The command is a Cheetah template which allows some Python based syntax. +## Lines starting hash hash are comments. Galaxy will turn newlines into spaces +segmasker +#if $db_opts.db_opts_selector == "db": + -in "${db_opts.database.fields.path}" -infmt blastdb +#elif $db_opts.db_opts_selector == "histdb": + -in "${os.path.join($db_opts.histdb.extra_files_path, 'blastdb')}" -infmt blastdb +#else: + -in "$subject" -infmt fasta +#end if +-out "$outfile" +-window $window +-locut $locut +-hicut $hicut +-outfmt $outformat + </command> + <expand macro="stdio" /> + <inputs> + <expand macro="input_conditional_protein_db" /> + <param name="window" type="integer" value="12" label="SEG window length" help="(-window)" /> + <param name="locut" type="float" value="2.2" label="SEG low cutoff" help="(-locut)" /> + <param name="hicut" type="float" value="2.5" label="SEG high cutoff" help="(-hicut)" /> + <param name="outformat" type="select" label="Output format"> + <!-- seqloc_* formats are not very useful + and what BLAST+ calls 'interval' is not what Galaxy calls interval format + --> + <option value="fasta">FASTA</option> + <option value="maskinfo_asn1_bin">maskinfo ASN.1 binary</option> + <option value="maskinfo_asn1_text" selected="true">maskinfo ASN.1 text</option> + <option value="maskinfo_xml">maskinfo_xml</option> + </param> + </inputs> + <outputs> + <data name="outfile" format="maskinfo-asn1" label="SEG Masked File"> + <change_format> + <when input="outformat" value="fasta" format="fasta" /> + <when input="outformat" 
value="maskinfo_asn1_bin" format="maskinfo-asn1-binary" /> + <!-- + <when input="outformat" value="maskinfo_asn1_text" format="maskinfo-asn1" /> + --> + <when input="outformat" value="maskinfo_xml" format="xml" /> + </change_format> + </data> + </outputs> + <tests> + <test> + <param name="db_opts_selector" value="file" /> + <param name="subject" value="four_human_proteins.fasta" ftype="fasta" /> + <param name="window" value="12" /> + <param name="locut" value="2.2" /> + <param name="hicut" value="2.5" /> + <param name="outformat" value="fasta" /> + <output name="outfile" file="segmasker_four_human.fasta" /> + </test> + <test> + <param name="db_opts_selector" value="file" /> + <param name="subject" value="four_human_proteins.fasta" ftype="fasta" /> + <param name="window" value="12" /> + <param name="locut" value="2.2" /> + <param name="hicut" value="2.5" /> + <param name="outformat" value="maskinfo_asn1_bin" /> + <output name="outfile" file="segmasker_four_human.maskinfo-asn1-binary" /> + </test> + <test> + <param name="db_opts_selector" value="file" /> + <param name="subject" value="four_human_proteins.fasta" ftype="fasta" /> + <param name="window" value="12" /> + <param name="locut" value="2.2" /> + <param name="hicut" value="2.5" /> + <param name="outformat" value="maskinfo_asn1_text" /> + <output name="outfile" file="segmasker_four_human.maskinfo-asn1" /> + </test> + </tests> + <help> +**What it does** + +This tool identifies and masks out low complexity regions of a protein database (or proteins in FASTA format) by using the SEG_ algorithm. + +If you select *maskinfo ASN.1* (binary or text) as output format, the output file can be used as masking data for NCBI BLAST+ makeblastdb tool. + +More information about segmasker can be found in the `BLAST Command Line Applications User Manual`_. + +.. _BLAST Command Line Applications User Manual: http://www.ncbi.nlm.nih.gov/books/NBK1763/ +.. 
_SEG: http://www.ncbi.nlm.nih.gov/pubmed/8743706 + +**References** + +If you use this Galaxy tool in work leading to a scientific publication please +cite the following papers (a more specific paper covering this wrapper is planned): + +@REFERENCES@ + </help> +</tool>