changeset 40:f83e5d79b6ab draft

Uploaded v0.1.0 preview 3, adds a missing test file & more tests for makeblastdb
author peterjc
date Wed, 26 Feb 2014 10:35:01 -0500
parents 22b7cdcf4960
children af4da561893b
files test-data/blastn_rhodopsin_vs_three_human_converted.tabular test-data/convert2blastmask_four_human_masked.maskinfo-asn1 test-data/convert2blastmask_four_human_masked.maskinfo-asn1-binary test-data/four_human_proteins.fasta.log test-data/four_human_proteins.fasta.log.txt test-data/four_human_proteins_taxid.fasta.log test-data/four_human_proteins_taxid.fasta.log.txt test-data/four_human_proteins_taxid.fasta.pin test-data/segmasker_four_human.fasta test-data/segmasker_four_human.maskinfo-asn1 test-data/segmasker_four_human.maskinfo-asn1-binary tools/ncbi_blast_plus/README.rst tools/ncbi_blast_plus/ncbi_convert2blastmask_wrapper.xml tools/ncbi_blast_plus/ncbi_dustmasker_wrapper.xml tools/ncbi_blast_plus/ncbi_makeblastdb.xml tools/ncbi_blast_plus/ncbi_segmasker_wrapper.xml
diffstat 16 files changed, 605 insertions(+), 91 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/blastn_rhodopsin_vs_three_human_converted.tabular	Wed Feb 26 10:35:01 2014 -0500
@@ -0,0 +1,7 @@
+gi|57163782|ref|NM_001009242.1|	ENA|BC112106|BC112106.1	92.07	1047	83	0	1	1047	88	1134	0.0	1474
+gi|283855845|gb|GQ290303.1|	ENA|BC112106|BC112106.1	91.59	333	28	0	1	333	118	450	4e-132	460
+gi|283855845|gb|GQ290303.1|	ENA|BC112106|BC112106.1	91.36	243	19	2	3127	3368	782	1023	3e-93	331
+gi|283855845|gb|GQ290303.1|	ENA|BC112106|BC112106.1	94.22	173	10	0	1410	1582	448	620	3e-73	265
+gi|283855845|gb|GQ290303.1|	ENA|BC112106|BC112106.1	92.94	170	12	0	2854	3023	615	784	3e-68	248
+gi|283855822|gb|GQ290312.1|	ENA|BC112106|BC112106.1	91.55	959	81	0	1	959	118	1076	0.0	1323
+gi|18148870|dbj|AB062417.1|	ENA|BC112106|BC112106.1	87.50	1048	129	2	1	1047	88	1134	0.0	1208
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/convert2blastmask_four_human_masked.maskinfo-asn1	Wed Feb 26 10:35:01 2014 -0500
@@ -0,0 +1,158 @@
+Blast-db-mask-info ::= {
+  algo-id 0,
+  algo-program seg,
+  algo-options "window=12; locut=2.2; hicut=2.5",
+  masks {
+    masks {
+      int {
+        from 6,
+        to 18,
+        id swissprot {
+          name "ERP44_HUMAN",
+          accession "Q9BS26",
+          release "reviewed"
+        }
+      },
+      packed-int {
+        {
+          from 11,
+          to 46,
+          id swissprot {
+            name "BMP2K_HUMAN",
+            accession "Q9NSY1",
+            release "reviewed"
+          }
+        },
+        {
+          from 325,
+          to 332,
+          id swissprot {
+            name "BMP2K_HUMAN",
+            accession "Q9NSY1",
+            release "reviewed"
+          }
+        },
+        {
+          from 421,
+          to 496,
+          id swissprot {
+            name "BMP2K_HUMAN",
+            accession "Q9NSY1",
+            release "reviewed"
+          }
+        },
+        {
+          from 501,
+          to 516,
+          id swissprot {
+            name "BMP2K_HUMAN",
+            accession "Q9NSY1",
+            release "reviewed"
+          }
+        },
+        {
+          from 536,
+          to 558,
+          id swissprot {
+            name "BMP2K_HUMAN",
+            accession "Q9NSY1",
+            release "reviewed"
+          }
+        },
+        {
+          from 636,
+          to 648,
+          id swissprot {
+            name "BMP2K_HUMAN",
+            accession "Q9NSY1",
+            release "reviewed"
+          }
+        },
+        {
+          from 737,
+          to 762,
+          id swissprot {
+            name "BMP2K_HUMAN",
+            accession "Q9NSY1",
+            release "reviewed"
+          }
+        },
+        {
+          from 789,
+          to 806,
+          id swissprot {
+            name "BMP2K_HUMAN",
+            accession "Q9NSY1",
+            release "reviewed"
+          }
+        },
+        {
+          from 970,
+          to 983,
+          id swissprot {
+            name "BMP2K_HUMAN",
+            accession "Q9NSY1",
+            release "reviewed"
+          }
+        },
+        {
+          from 999,
+          to 1010,
+          id swissprot {
+            name "BMP2K_HUMAN",
+            accession "Q9NSY1",
+            release "reviewed"
+          }
+        }
+      },
+      packed-int {
+        {
+          from 3,
+          to 26,
+          id swissprot {
+            name "INSR_HUMAN",
+            accession "P06213",
+            release "reviewed"
+          }
+        },
+        {
+          from 372,
+          to 390,
+          id swissprot {
+            name "INSR_HUMAN",
+            accession "P06213",
+            release "reviewed"
+          }
+        },
+        {
+          from 766,
+          to 791,
+          id swissprot {
+            name "INSR_HUMAN",
+            accession "P06213",
+            release "reviewed"
+          }
+        },
+        {
+          from 1312,
+          to 1324,
+          id swissprot {
+            name "INSR_HUMAN",
+            accession "P06213",
+            release "reviewed"
+          }
+        }
+      },
+      int {
+        from 230,
+        to 246,
+        id swissprot {
+          name "OPSD_HUMAN",
+          accession "P08100",
+          release "reviewed"
+        }
+      }
+    },
+    more FALSE
+  }
+}
Binary file test-data/convert2blastmask_four_human_masked.maskinfo-asn1-binary has changed
--- a/test-data/four_human_proteins.fasta.log	Thu Feb 20 05:39:48 2014 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,10 +0,0 @@
-
-
-Building a new DB, current time: 11/21/2013 11:16:27
-New DB name:   /tmp/tmpnSjpCP/tmpwAbNo4/database/files/000/dataset_2_files/blastdb
-New DB title:  Just 4 human proteins
-Sequence type: Protein
-Keep Linkouts: T
-Keep MBits: T
-Maximum file size: 1000000000B
-Adding sequences from FASTA; added 4 sequences in 0.00202417 seconds.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/four_human_proteins.fasta.log.txt	Wed Feb 26 10:35:01 2014 -0500
@@ -0,0 +1,5 @@
+New DB title:  Just 4 human proteins
+Sequence type: Protein
+Keep Linkouts: T
+Keep MBits: T
+Maximum file size: 1000000000B
--- a/test-data/four_human_proteins_taxid.fasta.log	Thu Feb 20 05:39:48 2014 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,10 +0,0 @@
-
-
-Building a new DB, current time: 02/10/2014 18:40:09
-New DB name:   four_human_proteins_taxid.fasta
-New DB title:  Just 4 human proteins
-Sequence type: Protein
-Keep Linkouts: T
-Keep MBits: T
-Maximum file size: 1000000000B
-Adding sequences from FASTA; added 4 sequences in 0.00230002 seconds.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/four_human_proteins_taxid.fasta.log.txt	Wed Feb 26 10:35:01 2014 -0500
@@ -0,0 +1,5 @@
+New DB title:  Just 4 human proteins
+Sequence type: Protein
+Keep Linkouts: T
+Keep MBits: T
+Maximum file size: 1000000000B
Binary file test-data/four_human_proteins_taxid.fasta.pin has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/segmasker_four_human.fasta	Wed Feb 26 10:35:01 2014 -0500
@@ -0,0 +1,61 @@
+>sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1
+MHPAVFlslpdlrcsllllVTWVFTPVTTEITSLDTENIDEILNNADVALVNFYADWCRF
+SQMLHPIFEEASDVIKEEFPNENQVVFARVDCDQHSDIAQRYRISKYPTLKLFRNGMMMK
+REYRGQRSVKALADYIRQQKSDPIQEIRDLAEITTLDRSKRNIIGYFEQKDSDNYRVFER
+VANILHDDCAFLSAFGDVSKPERYSGDNIIYKPPGHSAPDMVYLGAMTNFDVTYNWIQDK
+CVPLVREITFENGEELTEEGLPFLILFHMKEDTESLEIFQNEVARQLISEKGTINFLHAD
+CDKFRHPLLHIQKTPADCPVIAIDSFRHMYVFGDFKDVLIPGKLKQFVFDLHSGKLHREF
+HHGPDPTDTAPGEQAQDVASSPPESSFQKLAPSEYRYTLLRDRDEL
+>sp|Q9NSY1|BMP2K_HUMAN BMP-2-inducible protein kinase OS=Homo sapiens GN=BMP2K PE=1 SV=2
+MKKFSRMPKSEggsgggaagggaggagagagcgsggssvgvrvfavgRHQVTLEESLAEG
+GFSTVFLVRTHGGIRCALKRMYVNNMPDLNVCKREITIMKELSGHKNIVGYLDCAVNSIS
+DNVWEVLILMEYCRAGQVVNQMNKKLQTGFTEPEVLQIFCDTCEAVARLHQCKTPIIHRD
+LKVENILLNDGGNYVLCDFGSATNKFLNPQKDGVNVVEEEIKKYTTLSYRAPEMINLYGG
+KPITTKADIWALGCLLYKLCFFTLPFGESQVAICDGNFTIPDNSRYSRNIHCLIRFMLEP
+DPEHRPDIFQVSYFAFKFAKKDCPVsninnssiPSALPEPMTASEAAARKSQIKARITDT
+IGPTETSIAPRQRPKANSATTATPSVLTIQSSATPVKVLAPGEFGNHRPKGALRPGNGPE
+Illgqgppqqppqqhrvlqqlqqgdwrlqqlhlqhrhphqqqqqqqqqqqqqqqqqqqqq
+qqqqqqhhhhhhhhllqDAYMqqyqhatqqqqmlqqqFLMHSVYQPQPSASQYPTMmpqy
+qqaffqqqmlaqhqpsqqqASPEYLTSPQEFSPALVSYTSSLPAQVGTIMDSSYSANRSV
+ADKEAIANFTNQKNISNPPDMSGWNPFGEDNFSKLTeeelldrefdllrSNRLEERASSD
+KNVDSLSAPHNHPPEDPFGSVPFISHSGSPEKKAEHSSINQENGTANPIKNGKTSPASKD
+QRTGKKTSVQGQVQKGNdesesdfesdppspksseeeeqddeeVLQGEQGDFNDDDTEPE
+NLGHRPLLMdsedeeeeekhssdsdyeQAKAKYSDMSSVYRDRSGSGPTQDLNTILLTSA
+QLSSDVAVETPKQEFDVFGAVPFFAVRAQQPQQEKNEKNLPQHRFPAAGLEQEEFDVFTK
+APFSKKVNVQECHAVGPEAHTIPGYPKSVDVFGSTPFQPFLTSTSKSESNEDLFGLVPFD
+EITGSQQQKVkqrslqklssrqrrTKQDMSKSNGKRHHGtptstkktlkptYRTPERARR
+HKKVGRRDSQSSNEFLTISDSKENISVALTDGKDRGNVLQPEESLLDPFGAKPFHSPDLS
+WHPPHQGLSDIRADHNTVLPGRPRQNSLHGSFHSADVLKMDDFGAVPFTELVVQSITPHQ
+SQQSQPVELDPFGAAPFPSKQ
+>sp|P06213|INSR_HUMAN Insulin receptor OS=Homo sapiens GN=INSR PE=1 SV=4
+MATggrrgaaaapllvavaalllgaagHLYPGEVCPGMDIRNNLTRLHELENCSVIEGHL
+QILLMFKTRPEDFRDLSFPKLIMITDYLLLFRVYGLESLKDLFPNLTVIRGSRLFFNYAL
+VIFEMVHLKELGLYNLMNITRGSVRIEKNNELCYLATIDWSRILDSVEDNYIVLNKDDNE
+ECGDICPGTAKGKTNCPATVINGQFVERCWTHSHCQKVCPTICKSHGCTAEGLCCHSECL
+GNCSQPDDPTKCVACRNFYLDGRCVETCPPPYYHFQDWRCVNFSFCQDLHHKCKNSRRQG
+CHQYVIHNNKCIPECPSGYTMNSSNLLCTPCLGPCPKVCHLLEGEKTIDSVTSAQELRGC
+TVINGSLIINIRggnnlaaeleanlglieeiSGYLKIRRSYALVSLSFFRKLRLIRGETL
+EIGNYSFYALDNQNLRQLWDWSKHNLTITQGKLFFHYNPKLCLSEIHKMEEVSGTKGRQE
+RNDIALKTNGDQASCENELLKFSYIRTSFDKILLRWEPYWPPDFRDLLGFMLFYKEAPYQ
+NVTEFDGQDACGSNSWTVVDIDPPLRSNDPKSQNHPGWLMRGLKPWTQYAIFVKTLVTFS
+DERRTYGAKSDIIYVQTDATNPSVPLDPISVSNSSSQIILKWKPPSDPNGNITHYLVFWE
+RQAEDSELFELDYCLKGLKLPSRTWSPPFESEDSQKHNQSEYEDSAGECCSCPKTDSQIL
+KELEESSFRKTFEDYLHNVVFVPRKTSSGTGAEDPRPSRKRRSLGDvgnvtvavptvaaf
+pntsstsvptspEEHRPFEKVVNKESLVISGLRHFTGYRIELQACNQDTPEERCSVAAYV
+SARTMPEAKADDIVGPVTHEIFENNVVHLMWQEPKEPNGLIVLYEVSYRRYGDEELHLCV
+SRKHFALERGCRLRGLSPGNYSVRIRATSLAGNGSWTEPTYFYVTDYLDVPSNIAKIIIG
+PLIFVFLFSVVIGSIYLFLRKRQPDGPLGPLYASSNPEYLSASDVFPCSVYVPDEWEVSR
+EKITLLRELGQGSFGMVYEGNARDIIKGEAETRVAVKTVNESASLRERIEFLNEASVMKG
+FTCHHVVRLLGVVSKGQPTLVVMELMAHGDLKSYLRSLRPEAENNPGRPPPTLQEMIQMA
+AEIADGMAYLNAKKFVHRDLAARNCMVAHDFTVKIGDFGMTRDIYETDYYRKGGKGLLPV
+RWMAPESLKDGVFTTSSDMWSFGVVLWEITSLAEQPYQGLSNEQVLKFVMDGGYLDQPDN
+CPERVTDLMRMCWQFNPKMRPTFLEIVNLLKDDLHPSFPEVSFFHSEENKAPeseeleme
+fedmeNVPLDRSSHCQREEAGGRDGGSSLGFKRSYEEHIPYTHMNGGKKNGRILTLPRSN
+PS
+>sp|P08100|OPSD_HUMAN Rhodopsin OS=Homo sapiens GN=RHO PE=1 SV=1
+MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLY
+VTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLG
+GEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIP
+EGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVkeaaaqqqes
+attqkaeKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAI
+YNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/segmasker_four_human.maskinfo-asn1	Wed Feb 26 10:35:01 2014 -0500
@@ -0,0 +1,114 @@
+Blast-db-mask-info ::= {
+  algo-id 1,
+  algo-program seg,
+  algo-options "window=12; locut=2.2; hicut=2.5",
+  masks {
+    masks {
+      int {
+        from 6,
+        to 18,
+        id local id 1
+      },
+      packed-int {
+        {
+          from 11,
+          to 46,
+          id local id 2
+        },
+        {
+          from 325,
+          to 332,
+          id local id 2
+        },
+        {
+          from 421,
+          to 443,
+          id local id 2
+        },
+        {
+          from 437,
+          to 450,
+          id local id 2
+        },
+        {
+          from 447,
+          to 496,
+          id local id 2
+        },
+        {
+          from 501,
+          to 516,
+          id local id 2
+        },
+        {
+          from 536,
+          to 554,
+          id local id 2
+        },
+        {
+          from 545,
+          to 558,
+          id local id 2
+        },
+        {
+          from 636,
+          to 648,
+          id local id 2
+        },
+        {
+          from 737,
+          to 762,
+          id local id 2
+        },
+        {
+          from 789,
+          to 806,
+          id local id 2
+        },
+        {
+          from 970,
+          to 983,
+          id local id 2
+        },
+        {
+          from 999,
+          to 1010,
+          id local id 2
+        }
+      },
+      packed-int {
+        {
+          from 3,
+          to 26,
+          id local id 3
+        },
+        {
+          from 372,
+          to 390,
+          id local id 3
+        },
+        {
+          from 766,
+          to 782,
+          id local id 3
+        },
+        {
+          from 780,
+          to 791,
+          id local id 3
+        },
+        {
+          from 1312,
+          to 1324,
+          id local id 3
+        }
+      },
+      int {
+        from 230,
+        to 246,
+        id local id 4
+      }
+    },
+    more FALSE
+  }
+}
Binary file test-data/segmasker_four_human.maskinfo-asn1-binary has changed
--- a/tools/ncbi_blast_plus/README.rst	Thu Feb 20 05:39:48 2014 -0500
+++ b/tools/ncbi_blast_plus/README.rst	Wed Feb 26 10:35:01 2014 -0500
@@ -136,25 +136,29 @@
         - Development moved to GitHub, https://github.com/peterjc/galaxy_blast
         - Updated citation information (Cock et al. 2013).
 v0.0.21 - Use macros to simplify the XML wrappers.
-        - Added wrapper for dustmasker
-        - Enabled masking for makeblastdb
-        - Requires 'maskinfo-asn1' and 'maskinfo-asn1-binary' datatypes
+        - Added wrapper for dustmasker.
+        - Enabled masking for makeblastdb.
+        - Requires 'maskinfo-asn1' and 'maskinfo-asn1-binary' datatypes
           defined in updated blast_datatypes on Galaxy ToolShed.
-        - Tests updated for BLAST+ 2.2.27 instead of BLAST+ 2.2.26
-        - Now depends on package_blast_plus_2_2_27 in ToolShed
-v0.0.22 - More use macros to simplify the wrappers
-        - Set number of threads via $GALAXY_SLOTS environment variable
-        - More descriptive default output names
-        - Tests require updated BLAST DB definitions (blast_datatypes v0.0.18)
+        - Tests updated for BLAST+ 2.2.27 instead of BLAST+ 2.2.26.
+        - Now depends on package_blast_plus_2_2_27 in ToolShed.
+v0.0.22 - More use of macros to simplify the wrappers.
+        - Set number of threads via $GALAXY_SLOTS environment variable.
+        - More descriptive default output names.
+        - Tests require updated BLAST DB definitions (blast_datatypes v0.0.18).
         - Pre-check for duplicate identifiers in makeblastdb wrapper.
-        - Tests updated for BLAST+ 2.2.28 instead of BLAST+ 2.2.27
-        - Now depends on package_blast_plus_2_2_28 in ToolShed
+        - Tests updated for BLAST+ 2.2.28 instead of BLAST+ 2.2.27.
+        - Now depends on package_blast_plus_2_2_28 in ToolShed.
         - Extended tabular output includes 'salltitles' as column 25.
-v0.1.00 - Now depends on package_blast_plus_2_2_29 in ToolShed
-        - Tablar output now includes option to pick specific columns
+v0.1.00 - Now depends on package_blast_plus_2_2_29 in ToolShed.
+        - Tabular output now includes option to pick specific columns.
         - BLAST XML to tabular tool supports multiple input files.
-        - More detailed descriptions for BLASTN and BLASTP task option
+        - More detailed descriptions for BLASTN and BLASTP task option.
+        - Wrappers for segmasker, dustmasker and convert2blastmask.
+        - Supports using maskinfo with makeblastdb wrapper.
         - Supports setting a taxonomy ID in makeblastdb wrapper.
+        - Subtle changes, like new conditional settings, will require some
+          old workflows to be updated to cope.
 ======= ======================================================================
 
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ncbi_blast_plus/ncbi_convert2blastmask_wrapper.xml	Wed Feb 26 10:35:01 2014 -0500
@@ -0,0 +1,87 @@
+<tool id="ncbi_convert2blastmask_wrapper" name="NCBI BLAST+ convert2blastmask" version="0.1.00">
+    <description>Convert masking information in lower-case masked FASTA input to file formats suitable for makeblastdb</description>
+    <macros>
+        <token name="@BINARY@">convert2blastmask</token>
+        <import>ncbi_macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+    <command>
+## The command is a Cheetah template which allows some Python based syntax.
+## Lines starting hash hash are comments. Galaxy will turn newlines into spaces
+convert2blastmask
+-in "$infile"
+-masking_algorithm "$masking_algorithm"
+-masking_options "$masking_options"
+$parse_seqids
+-out "$outfile"
+-outfmt $outformat
+    </command>
+    <expand macro="stdio" />
+    <inputs>
+        <param name="infile" type="data" format="fasta" label="masked FASTA file"/>
+        <param name="masking_algorithm" type="select" label="Used masking algorithm">
+            <option value="dust">DUST</option>
+            <option value="seg" selected="true">SEG</option>
+            <option value="windowmasker">windowmasker</option>
+            <option value="repeat">repeat</option>
+            <option value="other">other</option>
+        </param>
+        <param name="masking_options" type="text" value="" size="20" label="Masking algorithm options to create the masked input"
+            help="Free text to describe the options used to create the masking files. (-masking_options)">
+            <sanitizer invalid_char="">
+                <valid initial="string.printable" />
+            </sanitizer>
+        </param>
+        <param name="parse_seqids" type="boolean" truevalue="-parse_seqids" falsevalue="" checked="true" label="Parse Seq-ids in FASTA input" help="(-parse_seqids)" />
+        <param name="outformat" type="select" label="Output format">
+            <option value="maskinfo_asn1_bin">maskinfo ASN.1 binary</option>
+            <option value="maskinfo_asn1_text" selected="true">maskinfo ASN.1 text</option>
+            <option value="maskinfo_xml">maskinfo_xml</option>
+        </param>
+    </inputs>
+    <outputs>
+        <data name="outfile" format="maskinfo-asn1" label="Converted Masking Information">
+            <change_format>
+                <when input="outformat" value="maskinfo_asn1_bin" format="maskinfo-asn1-binary" />
+                <!-- No <when> entry is listed for maskinfo_asn1_text: it maps to
+                     the default format ("maskinfo-asn1") already declared on
+                     the enclosing <data> element. -->
+                <when input="outformat" value="maskinfo_xml" format="xml" />
+            </change_format>
+        </data>
+    </outputs>
+    <tests>
+        <test>
+            <param name="infile" value="four_human_proteins_masked.fasta" ftype="fasta" />
+            <param name="masking_algorithm" value="seg" />
+            <param name="masking_options" value="window=12; locut=2.2; hicut=2.5" />
+            <param name="parse_seqids" value="True" />
+            <param name="outformat" value="maskinfo_asn1_bin" />
+            <output name="outfile" file="convert2blastmask_four_human_masked.maskinfo-asn1-binary" />
+        </test>
+        <test>
+            <param name="infile" value="four_human_proteins_masked.fasta" ftype="fasta" />
+            <param name="masking_algorithm" value="seg" />
+            <param name="masking_options" value="window=12; locut=2.2; hicut=2.5" />
+            <param name="parse_seqids" value="True" />
+            <param name="outformat" value="maskinfo_asn1_text" />
+            <output name="outfile" file="convert2blastmask_four_human_masked.maskinfo-asn1" />
+        </test>
+    </tests>
+    <help>
+**What it does**
+
+Convert masking information in lower-case masked FASTA input to file formats suitable for makeblastdb.
+
+More information about convert2blastmask can be found in the `BLAST Command Line Applications User Manual`_.
+
+.. _BLAST Command Line Applications User Manual: http://www.ncbi.nlm.nih.gov/books/NBK1763/
+
+**References**
+
+If you use this Galaxy tool in work leading to a scientific publication please
+cite the following papers (a more specific paper covering this wrapper is planned):
+
+@REFERENCES@
+    </help>
+</tool>
--- a/tools/ncbi_blast_plus/ncbi_dustmasker_wrapper.xml	Thu Feb 20 05:39:48 2014 -0500
+++ b/tools/ncbi_blast_plus/ncbi_dustmasker_wrapper.xml	Wed Feb 26 10:35:01 2014 -0500
@@ -27,27 +27,24 @@
         <param name="level" type="integer" value="20" label="DUST level" help="Score threshold for subwindows" />
         <param name="linker" type="integer" value="1" label="DUST linker" help="How close masked intervals should be to get merged together" />
         <param name="outformat" type="select" label="Output format">
-<!-- acclist and maskinfo_xml are listed as possible output formats in
-     "dustmasker -help", but were not recognized by NCBI BLAST up to
-     release 2.2.27+. Fixed in BLAST 2.2.28+.
-     seqloc_* formats are not very useful -->
-<!--            <option value="acclist">acclist</option>-->
+            <!-- seqloc_* formats are not very useful
+                 and what BLAST+ calls 'interval' is not what Galaxy calls interval format
+            -->
             <option value="fasta">FASTA</option>
-            <option value="interval" selected="true">interval</option>
             <option value="maskinfo_asn1_bin">maskinfo ASN.1 binary</option>
-            <option value="maskinfo_asn1_text">maskinfo ASN.1 text</option>
-<!--            <option value="maskinfo_xml">maskinfo_xml</option>
-            <option value="seqloc_asn1_bin">seqloc_asn1_bin</option>
-            <option value="seqloc_asn1_text">seqloc_asn1_text</option>
-            <option value="seqloc_xml">seqloc_xml</option>-->
+            <option value="maskinfo_asn1_text" selected="true">maskinfo ASN.1 text</option>
+            <option value="maskinfo_xml">maskinfo_xml</option>
         </param>
     </inputs>
     <outputs>
-        <data name="outfile" format="interval" label="DUST Masked File">
+        <data name="outfile" format="maskinfo-asn1" label="DUST Masked File">
             <change_format>
                 <when input="outformat" value="fasta" format="fasta" />
                 <when input="outformat" value="maskinfo_asn1_bin" format="maskinfo-asn1-binary" />
+		<!--
                 <when input="outformat" value="maskinfo_asn1_text" format="maskinfo-asn1" />
+		-->
+                <when input="outformat" value="maskinfo_xml" format="xml" />
             </change_format>
         </data>
     </outputs>
--- a/tools/ncbi_blast_plus/ncbi_makeblastdb.xml	Thu Feb 20 05:39:48 2014 -0500
+++ b/tools/ncbi_blast_plus/ncbi_makeblastdb.xml	Wed Feb 26 10:35:01 2014 -0500
@@ -8,21 +8,15 @@
     <command interpreter="python">check_no_duplicates.py
 ##First check for duplicates (since BLAST+ 2.2.28 fails to do so)
 ##and abort (via the ampersand ampersand trick) if any are found.
-#for $i in $in
-"${i.file}"
-#end for
+#for i in $input_file#"${i}" #end for#
 &amp;&amp;
 makeblastdb -out "${os.path.join($outfile.extra_files_path,'blastdb')}"
 $parse_seqids
 $hash_index
 ## Single call to -in with multiple filenames space separated with outer quotes
 ## (presumably any filenames with spaces would be a problem). Note this gives
-## some extra spaces, e.g. -in " file1 file2 file3  " but BLAST seems happy:
--in "
-#for $i in $in
-${i.file}
-#end for
-"
+## some extra spaces, e.g. -in "file1 file2 file3 " but BLAST seems happy:
+-in "#for i in $input_file#${i} #end for#"
 #if $title:
 -title "$title"
 #else:
@@ -33,20 +27,13 @@
 ## --------------------------------------------------------------------
 ## Masking
 ## --------------------------------------------------------------------
-#set $mask_string = ''
-#set $sep = '-mask_data '
-#for $i in $mask_data
-#set $mask_string += $sep + str($i.file)
-#set $sep = ','
+## HACK: If no mask files, evaluates as a list with just None in it:
+## See Trello issue https://trello.com/c/lp5YmA1O
+#if ' '.join( map(str, $mask_data_file) ) != 'None':
+#for i in $mask_data_file:
+-mask_data "${i}"
 #end for
-$mask_string
-## #set $gi_mask_string = ''
-## #set $sep = '-gi_mask -gi_mask_name '
-## #for $i in $gi_mask
-## #set $gi_mask_string += $sep + str($i.file)
-## #set $sep = ','
-## #end for
-## $gi_mask_string
+#end if
 ## --------------------------------------------------------------------
 ## Taxonomy
 ## --------------------------------------------------------------------
@@ -58,7 +45,7 @@
 #end if
 ## --------------------------------------------------------------------
 ## Capture the stdout log information to the primary file (plain text):
-&gt;&gt; "$outfile"
+&gt; "$outfile"
     </command>
     <expand macro="stdio" />
     <inputs>
@@ -66,29 +53,18 @@
             <option value="prot">protein</option>
             <option value="nucl">nucleotide</option>
         </param>
-        <!-- TODO Allow merging of existing BLAST databases (conditional on the database type)
+        <!-- TODO Allow merging of existing BLAST databases (conditional on the database type)?
              NOTE Double check the new database would be self contained first
-        <repeat name="in" title="BLAST or FASTA Database" min="1">
-            <param name="file" type="data" format="fasta,blastdbn,blastdbp" label="BLAST or FASTA database" />
-        </repeat>
         -->
-        <!-- TODO Switch this to using <param ... multiple="true" /> instead of <repeat> block? -->
-        <repeat name="in" title="FASTA file" min="1">
-            <param name="file" type="data" format="fasta" />
-        </repeat>
+        <!-- Note this is a mandatory parameter - default should be most recent FASTA file -->
+        <param name="input_file" type="data" multiple="true" optional="false" format="fasta" label="Input FASTA files(s)" help="One or more FASTA files" />
         <param name="title" type="text" value="" label="Title for BLAST database" help="This is the database name shown in BLAST search output" />
         <param name="parse_seqids" type="boolean" truevalue="-parse_seqids" falsevalue="" checked="False" label="Parse the sequence identifiers" help="This is only advised if your FASTA file follows the NCBI naming conventions using pipe '|' symbols" />
         <param name="hash_index" type="boolean" truevalue="-hash_index" falsevalue="" checked="true" label="Enable the creation of sequence hash values" help="These hash values can then be used to quickly determine if a given sequence data exists in this BLAST database." />
         <!-- SEQUENCE MASKING OPTIONS -->
-        <repeat name="mask_data" title="Masking data file">
-            <param name="mask_data_file" type="data" format="maskinfo-asn1,maskinfo-asn1-binary" label="ASN.1 file containing masking data" help="As produced by NCBI masking applications (e.g. dustmasker, segmasker, windowmasker)" />
-        </repeat>
-        <!-- TODO
-        <repeat name="gi_mask" title="Create GI indexed masking data">
-            <param name="gi_mask_file" type="data" format="asnb" label="Masking data output file" />
-        </repeat>
-        -->
-
+        <!-- Note this is an optional parameter - default should be NO files -->
+        <param name="mask_data_file" type="data" multiple="true" optional="true" value="" format="maskinfo-asn1,maskinfo-asn1-binary" label="Optional ASN.1 file(s) containing masking data" help="As produced by NCBI masking applications (e.g. dustmasker, segmasker, windowmasker)" />
+        <!-- TODO - Option to create GI indexed masking data? via -gi_mask and -gi_mask_name? -->
         <!-- TAXONOMY OPTIONS -->
         <conditional name="tax">
             <param name="taxselect" type="select" label="Taxonomy options">
@@ -121,16 +97,17 @@
     </outputs>
     <tests>
         <!-- Note the (two line) PIN file is not reproducible run to run.
-             Likewise there is a datestamp in the log file as well.
+             Likewise there is a datestamp in the log file as well, so use a 'contains' comparison.
+             With and without the masking makes no difference.
              With and without the taxid the only real difference is in the *.phr file.
         -->
         <test>
             <param name="dbtype" value="prot" />
-            <param name="file" value="four_human_proteins.fasta" ftype="fasta" />
+            <param name="input_file" value="four_human_proteins.fasta" ftype="fasta" />
             <param name="title" value="Just 4 human proteins" />
             <param name="parse_seqids" value="" />
             <param name="hash_index" value="true" />
-            <output name="out_file" file="four_human_proteins.fasta.log" ftype="blastdbp" lines_diff="6">
+            <output name="out_file" compare="contains" file="four_human_proteins.fasta.log.txt" ftype="blastdbp">
                 <extra_files type="file" value="four_human_proteins.fasta.phr" name="blastdb.phr" />
                 <extra_files type="file" value="four_human_proteins.fasta.pin" name="blastdb.pin" lines_diff="2" />
                 <extra_files type="file" value="four_human_proteins.fasta.psq" name="blastdb.psq" />
@@ -143,13 +120,13 @@
         </test>
         <test>
             <param name="dbtype" value="prot" />
-            <param name="file" value="four_human_proteins.fasta" ftype="fasta" />
+            <param name="input_file" value="four_human_proteins.fasta" ftype="fasta" />
             <param name="title" value="Just 4 human proteins" />
             <param name="parse_seqids" value="" />
             <param name="hash_index" value="true" />
             <param name="taxselect" value="id" />
             <param name="taxid" value="9606" />
-            <output name="out_file" file="four_human_proteins_taxid.fasta.log" ftype="blastdbp" lines_diff="6">
+            <output name="out_file" compare="contains" file="four_human_proteins_taxid.fasta.log.txt" ftype="blastdbp">
                 <extra_files type="file" value="four_human_proteins_taxid.fasta.phr" name="blastdb.phr" />
                 <extra_files type="file" value="four_human_proteins_taxid.fasta.pin" name="blastdb.pin" lines_diff="2" />
                 <extra_files type="file" value="four_human_proteins_taxid.fasta.psq" name="blastdb.psq" />
@@ -160,6 +137,24 @@
                 <extra_files type="file" value="four_human_proteins_taxid.fasta.psi" name="blastdb.psi" />
             </output>
         </test>
+        <test>
+            <param name="dbtype" value="prot" />
+            <param name="input_file" value="four_human_proteins.fasta" ftype="fasta" />
+            <param name="title" value="Just 4 human proteins" />
+            <param name="parse_seqids" value="" />
+            <param name="hash_index" value="true" />
+            <param name="mask_data_file" value="segmasker_four_human.maskinfo-asn1" ftype="maskinfo-asn1" />
+            <output name="out_file" compare="contains" file="four_human_proteins.fasta.log.txt" ftype="blastdbp">
+                <extra_files type="file" value="four_human_proteins.fasta.phr" name="blastdb.phr" />
+                <extra_files type="file" value="four_human_proteins.fasta.pin" name="blastdb.pin" lines_diff="2" />
+                <extra_files type="file" value="four_human_proteins.fasta.psq" name="blastdb.psq" />
+                <extra_files type="file" value="four_human_proteins.fasta.pog" name="blastdb.pog" />
+                <extra_files type="file" value="four_human_proteins.fasta.phd" name="blastdb.phd" />
+                <extra_files type="file" value="four_human_proteins.fasta.phi" name="blastdb.phi" />
+                <extra_files type="file" value="four_human_proteins.fasta.psd" name="blastdb.psd" />
+                <extra_files type="file" value="four_human_proteins.fasta.psi" name="blastdb.psi" />
+            </output>
+        </test>
     </tests>
     <help>
 **What it does**
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ncbi_blast_plus/ncbi_segmasker_wrapper.xml	Wed Feb 26 10:35:01 2014 -0500
@@ -0,0 +1,101 @@
+<tool id="ncbi_segmasker_wrapper" name="NCBI BLAST+ segmasker" version="0.1.00">
+    <description>low-complexity regions in protein sequences</description>
+    <macros>
+        <token name="@BINARY@">segmasker</token>
+        <import>ncbi_macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+    <command>
+## The command is a Cheetah template which allows some Python based syntax.
+## Lines starting with hash hash (##) are comments. Galaxy will turn newlines into spaces.
+segmasker
+#if $db_opts.db_opts_selector == "db":
+  -in "${db_opts.database.fields.path}" -infmt blastdb
+#elif $db_opts.db_opts_selector == "histdb":
+  -in "${os.path.join($db_opts.histdb.extra_files_path, 'blastdb')}" -infmt blastdb
+#else:
+  -in "$subject" -infmt fasta
+#end if
+-out "$outfile"
+-window $window
+-locut $locut
+-hicut $hicut
+-outfmt $outformat
+    </command>
+    <expand macro="stdio" />
+    <inputs>
+        <expand macro="input_conditional_protein_db" />
+        <param name="window" type="integer" value="12" label="SEG window length" help="(-window)" />
+        <param name="locut" type="float" value="2.2" label="SEG low cutoff" help="(-locut)" />
+        <param name="hicut" type="float" value="2.5" label="SEG high cutoff" help="(-hicut)" />
+        <param name="outformat" type="select" label="Output format">
+            <!-- Deliberately not offered: the seqloc_* formats (not very useful) and 'interval'
+                 (what BLAST+ calls 'interval' is not what Galaxy calls interval format).
+            -->
+            <option value="fasta">FASTA</option>
+            <option value="maskinfo_asn1_bin">maskinfo ASN.1 binary</option>
+            <option value="maskinfo_asn1_text" selected="true">maskinfo ASN.1 text</option>
+            <option value="maskinfo_xml">maskinfo_xml</option>
+        </param>
+    </inputs>
+    <outputs>
+        <data name="outfile" format="maskinfo-asn1" label="SEG Masked File">
+            <change_format>
+                <when input="outformat" value="fasta" format="fasta" />
+                <when input="outformat" value="maskinfo_asn1_bin" format="maskinfo-asn1-binary" />
+                <!-- Not needed: maskinfo-asn1 is already the default format declared on the data element.
+                <when input="outformat" value="maskinfo_asn1_text" format="maskinfo-asn1" />
+                -->
+                <when input="outformat" value="maskinfo_xml" format="xml" />
+            </change_format>
+        </data>
+    </outputs>
+    <tests>
+        <test>
+            <param name="db_opts_selector" value="file" />
+            <param name="subject" value="four_human_proteins.fasta" ftype="fasta" />
+            <param name="window" value="12" />
+            <param name="locut" value="2.2" />
+            <param name="hicut" value="2.5" />
+            <param name="outformat" value="fasta" />
+            <output name="outfile" file="segmasker_four_human.fasta" />
+        </test>
+        <test>
+            <param name="db_opts_selector" value="file" />
+            <param name="subject" value="four_human_proteins.fasta" ftype="fasta" />
+            <param name="window" value="12" />
+            <param name="locut" value="2.2" />
+            <param name="hicut" value="2.5" />
+            <param name="outformat" value="maskinfo_asn1_bin" />
+            <output name="outfile" file="segmasker_four_human.maskinfo-asn1-binary" />
+        </test>
+        <test>
+            <param name="db_opts_selector" value="file" />
+            <param name="subject" value="four_human_proteins.fasta" ftype="fasta" />
+            <param name="window" value="12" />
+            <param name="locut" value="2.2" />
+            <param name="hicut" value="2.5" />
+            <param name="outformat" value="maskinfo_asn1_text" />
+            <output name="outfile" file="segmasker_four_human.maskinfo-asn1" />
+        </test>
+    </tests>
+    <help>
+**What it does**
+
+This tool identifies and masks out low-complexity regions of a protein database (or proteins in FASTA format) using the SEG_ algorithm.
+
+If you select *maskinfo ASN.1* (binary or text) as the output format, the output file can be used as masking data for the NCBI BLAST+ makeblastdb tool.
+
+More information about segmasker can be found in the `BLAST Command Line Applications User Manual`_.
+
+.. _BLAST Command Line Applications User Manual: http://www.ncbi.nlm.nih.gov/books/NBK1763/
+.. _SEG: http://www.ncbi.nlm.nih.gov/pubmed/8743706
+
+**References**
+
+If you use this Galaxy tool in work leading to a scientific publication, please
+cite the following papers (a more specific paper covering this wrapper is planned):
+
+@REFERENCES@
+    </help>
+</tool>