Mercurial > repos > devteam > ncbi_blast_plus
changeset 33:42e6f199d11f draft
v0.3.0 Updated for NCBI BLAST+ 2.7.1
author | peterjc |
---|---|
date | Sat, 30 Jun 2018 15:13:38 -0400 |
parents | 360352490a06 |
children | 8f82e05831dc |
files | test-data/chimera.fasta.gz test-data/rhodopsin_nucs.fasta.gz test-data/three_human_mRNA.fasta.gz tools/ncbi_blast_plus/README.rst tools/ncbi_blast_plus/check_no_duplicates.py tools/ncbi_blast_plus/ncbi_blastn_wrapper.xml tools/ncbi_blast_plus/ncbi_macros.xml tools/ncbi_blast_plus/ncbi_makeblastdb.xml |
diffstat | 8 files changed, 134 insertions(+), 86 deletions(-) [+] |
line wrap: on
line diff
--- a/tools/ncbi_blast_plus/README.rst Tue Jun 05 11:42:10 2018 -0400 +++ b/tools/ncbi_blast_plus/README.rst Sat Jun 30 15:13:38 2018 -0400 @@ -1,10 +1,9 @@ Galaxy wrappers for NCBI BLAST+ suite ===================================== -These wrappers are copyright 2010-2017 by Peter Cock (The James Hutton Institute, -UK) and additional contributors including Edward Kirton, John Chilton, -Nicola Soranzo, Jim Johnson, Bjoern Gruening, and Caleb Easterly. - +These wrappers are copyright 2010-2018 by Peter Cock (James Hutton Institute, +UK) and additional contributors including Edward Kirton, John Chilton, Nicola +Soranzo, Jim Johnson, Bjoern Gruening, Caleb Easterly, and Anton Nekrutenko. See the licence text below. Note this does not work with the NCBI 'legacy' BLAST suite written in C @@ -259,6 +258,7 @@ - Depends on BioConda or legacy ToolShed ``package_blast_plus_2_7_1``. - Document the BLAST+ 2.6.0 change in the standard 12 column output from ``qacc,sacc,...`` to ``qaccver,saccver,...`` instead. + - Accept gzipped FASTA inputs (contribution from Anton Nekrutenko). ======= ======================================================================
--- a/tools/ncbi_blast_plus/check_no_duplicates.py Tue Jun 05 11:42:10 2018 -0400 +++ b/tools/ncbi_blast_plus/check_no_duplicates.py Sat Jun 30 15:13:38 2018 -0400 @@ -9,10 +9,11 @@ will return a non-zero error if any duplicate identifiers are found. """ - +import gzip import os import sys + if "-v" in sys.argv or "--version" in sys.argv: print("v0.0.23") sys.exit(0) @@ -24,7 +25,19 @@ sys.stderr.write("Missing FASTA file %r\n" % filename) sys.exit(2) files += 1 - handle = open(filename) + + with open(filename, "rb") as binary_handle: + magic = binary_handle.read(2) + if not magic: + # Empty file, special case + continue + elif magic == b'\x1f\x8b': + # Gzipped + handle = gzip.open(filename, "rt") + elif magic[0:1] == b">": + # Not gzipped, shoudl be plain FASTA + handle = open(filename, "r") + for line in handle: if line.startswith(">"): # The split will also take care of the new line character,
--- a/tools/ncbi_blast_plus/ncbi_blastn_wrapper.xml Tue Jun 05 11:42:10 2018 -0400 +++ b/tools/ncbi_blast_plus/ncbi_blastn_wrapper.xml Sat Jun 30 15:13:38 2018 -0400 @@ -7,24 +7,29 @@ <expand macro="parallelism" /> <expand macro="preamble" /> <command detect_errors="aggressive"> +<![CDATA[ ## The command is a Cheetah template which allows some Python based syntax. ## Lines starting hash hash are comments. Galaxy will turn newlines into spaces blastn --query '$query' +#if $query.is_of_type('fasta.gz'): +-query <(gunzip -c '${query}') +#else: +-query '${query}' +#end if @BLAST_DB_SUBJECT@ --task $blast_type --evalue $evalue_cutoff +-task '${blast_type}' +-evalue '${evalue_cutoff}' @BLAST_OUTPUT@ @THREADS@ #if $adv_opts.adv_opts_selector=="advanced": -$adv_opts.strand +${adv_opts.strand} @ADV_FILTER_QUERY@ @ADV_MAX_HITS@ @ADV_WORD_SIZE@ #if (str($adv_opts.identity_cutoff) and float(str($adv_opts.identity_cutoff)) > 0 ): --perc_identity $adv_opts.identity_cutoff +-perc_identity '${adv_opts.identity_cutoff}' #end if -$adv_opts.ungapped +${adv_opts.ungapped} @ADV_ID_LIST_FILTER@ @ADV_QCOV_HSP_PERC@ ## only use window size if dc-megablast mode is used @@ -35,9 +40,10 @@ @ADV_GAPEXTEND@ ## End of advanced options: #end if +]]> </command> <inputs> - <param argument="-query" type="data" format="fasta" label="Nucleotide query sequence(s)"/> + <param argument="-query" type="data" format="fasta,fasta.gz" label="Nucleotide query sequence(s)"/> <expand macro="input_conditional_nucleotide_db" /> <param name="blast_type" argument="-task" type="select" display="radio" label="Type of BLAST"> <option value="megablast">megablast - Traditional megablast used to find very similar (e.g., intraspecies or closely related species) sequences</option> @@ -102,6 +108,16 @@ <test> <param name="query" value="rhodopsin_nucs.fasta" ftype="fasta" /> <param name="db_opts_selector" value="file" /> + <param name="subject" value="three_human_mRNA.fasta.gz" ftype="fasta.gz" /> + <param name="database" value="" /> + <param name="evalue_cutoff" value="1e-40" /> + <param name="out_format" value="6" /> + <param name="adv_opts_selector" value="basic" /> + <output name="output1" file="blastn_rhodopsin_vs_three_human.tabular" ftype="tabular" /> + </test> + <test> + <param name="query" value="rhodopsin_nucs.fasta" ftype="fasta" /> + <param name="db_opts_selector" value="file" /> <param name="subject" value="three_human_mRNA.fasta" ftype="fasta" /> <param name="database" value="" /> <param name="evalue_cutoff" value="1e-40" />
--- a/tools/ncbi_blast_plus/ncbi_macros.xml Tue Jun 05 11:42:10 2018 -0400 +++ b/tools/ncbi_blast_plus/ncbi_macros.xml Sat Jun 30 15:13:38 2018 -0400 @@ -357,7 +357,7 @@ <when value="file"> <param name="database" type="hidden" value="" /> <param name="histdb" type="hidden" value="" /> - <param argument="-subject" type="data" format="fasta" label="Nucleotide FASTA subject file to use instead of a database"/> + <param argument="-subject" type="data" format="fasta,fasta.gz" label="Nucleotide FASTA subject file to use instead of a database"/> </when> </conditional> </xml> @@ -533,42 +533,46 @@ </conditional> </xml> <!--Tokens--> - <token name="@ADV_MATRIX_GAPCOSTS@"> + <token name="@ADV_MATRIX_GAPCOSTS@"><![CDATA[ #if str($adv_opts.matrix_gapcosts.matrix): - -matrix $adv_opts.matrix_gapcosts.matrix - $adv_opts.matrix_gapcosts.gap_costs + -matrix '${adv_opts.matrix_gapcosts.matrix}' + ${adv_opts.matrix_gapcosts.gap_costs} #end if - </token> + ]]></token> - <token name="@ADV_QCOV_HSP_PERC@"> -#if float(str($adv_opts.qcov_hsp_perc)) > 0: - -qcov_hsp_perc $adv_opts.qcov_hsp_perc + <token name="@ADV_QCOV_HSP_PERC@"><![CDATA[ +#if float(str($adv_opts.qcov_hsp_perc)) > 0: + -qcov_hsp_perc '${adv_opts.qcov_hsp_perc}' #end if - </token> + ]]></token> - <token name="@ADV_ID_LIST_FILTER@"> + <token name="@ADV_ID_LIST_FILTER@"><![CDATA[ #if $adv_opts.adv_optional_id_files_opts.adv_optional_id_files_opts_selector == 'negative_gilist': - -negative_gilist $adv_opts.adv_optional_id_files_opts.negative_gilist + -negative_gilist '${adv_opts.adv_optional_id_files_opts.negative_gilist}' #elif $adv_opts.adv_optional_id_files_opts.adv_optional_id_files_opts_selector == 'gilist': - -gilist $adv_opts.adv_optional_id_files_opts.gilist + -gilist '{$adv_opts.adv_optional_id_files_opts.gilist}' #elif $adv_opts.adv_optional_id_files_opts.adv_optional_id_files_opts_selector == 'seqidlist': - -seqidlist $adv_opts.adv_optional_id_files_opts.seqidlist + -seqidlist '${adv_opts.adv_optional_id_files_opts.seqidlist}' #end if - </token> + ]]></token> <token name="@THREADS@">-num_threads "\${GALAXY_SLOTS:-8}"</token> - <token name="@BLAST_DB_SUBJECT@"> + <token name="@BLAST_DB_SUBJECT@"><![CDATA[ #if $db_opts.db_opts_selector == "db": -db '${" ".join(str($db_opts.database.fields.path).split(","))}' #elif $db_opts.db_opts_selector == "histdb": -db '${os.path.join($db_opts.histdb.extra_files_path, "blastdb")}' #else: - -subject '$db_opts.subject' + #if $db_opts.subject.is_of_type('fasta.gz'): + -subject <(gunzip -c '${$db_opts.subject}') + #else: + -subject '${db_opts.subject}' + #end if #end if - </token> + ]]></token> - <token name="@BLAST_OUTPUT@">-out '$output1' + <token name="@BLAST_OUTPUT@"><![CDATA[ -out '$output1' ##Set the extended list here so when we add things, saved workflows are not affected #if str($output.out_format)=="ext": -outfmt '6 std sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen salltitles' @@ -579,79 +583,81 @@ -outfmt '6 $cols' #else: ## Note do not quote this as can be '0 -html' which is really two arguments - -outfmt $output.out_format + -outfmt ${output.out_format} #end if - </token> + ]]></token> <token name="@ADV_FILTER_QUERY@">$adv_opts.filter_query</token> - <token name="@ADV_MAX_HITS@"> + <token name="@ADV_MAX_HITS@"><![CDATA[ ## Need int(str(...)) because $adv_opts.max_hits is an InputValueWrapper object not a string ## Note -max_target_seqs used to simply override -num_descriptions and -num_alignments ## but this was changed in BLAST+ 2.2.27 onwards to force their use (raised with NCBI) #if (str($adv_opts.max_hits) and int(str($adv_opts.max_hits)) > 0): #if str($output.out_format) in ["6", "ext", "cols", "5"]: ## Most output formats use this, including tabular and XML: - -max_target_seqs $adv_opts.max_hits + -max_target_seqs '${adv_opts.max_hits}' #else ## Text and HTML output formats 0-4 currently need this instead: -num_descriptions $adv_opts.max_hits -num_alignments $adv_opts.max_hits #end if #end if #if str($adv_opts.max_hsps) - -max_hsps $adv_opts.max_hsps + -max_hsps '${adv_opts.max_hsps}' #end if - </token> - <token name="@ADV_WORD_SIZE@"> + ]]></token> + <token name="@ADV_WORD_SIZE@"><![CDATA[ #if str($adv_opts.word_size): --word_size $adv_opts.word_size +-word_size '${adv_opts.word_size}' #end if $adv_opts.parse_deflines - </token> - <token name="@ADV_WINDOW_SIZE@"> + ]]></token> + <token name="@ADV_WINDOW_SIZE@"><![CDATA[ #if str($adv_opts.window_size): --window_size $adv_opts.window_size +-window_size '${adv_opts.window_size}' #end if - </token> + ]]></token> - <token name="@ADV_THRESHOLD@"> + <token name="@ADV_THRESHOLD@"><![CDATA[ #if str($adv_opts.threshold): --threshold $adv_opts.threshold +-threshold '${adv_opts.threshold}' #end if - </token> + ]]></token> - <token name="@ADV_COMP_BASED_STATS@"> + <token name="@ADV_COMP_BASED_STATS@"><![CDATA[ #if str($adv_opts.comp_based_stats): --comp_based_stats $adv_opts.comp_based_stats +-comp_based_stats '${adv_opts.comp_based_stats}' #end if - </token> + ]]></token> - <token name="@ADV_GAPOPEN@"> + <token name="@ADV_GAPOPEN@"><![CDATA[ #if str($adv_opts.gapopen): --gapopen $adv_opts.gapopen +-gapopen '${adv_opts.gapopen}' #end if - </token> + ]]></token> - <token name="@ADV_GAPEXTEND@"> + <token name="@ADV_GAPEXTEND@"><![CDATA[ #if str($adv_opts.gapextend): --gapextend $adv_opts.gapextend +-gapextend '${adv_opts.gapextend}' #end if - </token> + ]]></token> - <token name="@ADV_MATRIX@"> + <token name="@ADV_MATRIX@"><![CDATA[ #if str($adv_opts.matrix): --matrix $adv_opts.matrix +-matrix '${adv_opts.matrix}' #end if - </token> + ]]></token> <!-- @ON_DB_SUBJECT@ is for use with @BLAST_DB_SUBJECT@ --> - <token name="@ON_DB_SUBJECT@">#if str($db_opts.db_opts_selector)=='db' + <token name="@ON_DB_SUBJECT@"><![CDATA[ +#if str($db_opts.db_opts_selector)=='db' '${db_opts.database}' #elif str($db_opts.db_opts_selector)=='histdb' '${db_opts.histdb.name}' #else '${db_opts.subject.name}' -#end if</token> +#end if +]]></token> - <token name="@REFERENCES@"> + <token name="@REFERENCES@"><![CDATA[ Peter J. A. Cock, John M. Chilton, Björn Grüning, James E. Johnson, Nicola Soranzo (2015). NCBI BLAST+ integrated into Galaxy. *GigaScience* 4:39 https://doi.org/10.1186/s13742-015-0080-7 @@ -663,14 +669,16 @@ This wrapper is available to install into other Galaxy Instances via the Galaxy Tool Shed at http://toolshed.g2.bx.psu.edu/view/devteam/ncbi_blast_plus - </token> + ]]></token> <xml name="blast_citations"> <citations> + <citation type="doi">10.1093/nar/25.17.3389</citation> <citation type="doi">10.1186/1471-2105-10-421</citation> <citation type="doi">10.1186/s13742-015-0080-7</citation> </citations> </xml> - <token name="@OUTPUT_FORMAT@">**Output format** + <token name="@OUTPUT_FORMAT@"><![CDATA[ +**Output format** Because Galaxy focuses on processing tabular data, the default output of this tool is tabular. The standard BLAST+ tabular output contains 12 columns: @@ -720,7 +728,7 @@ 22 sseq Aligned part of subject sequence 23 qlen Query sequence length 24 slen Subject sequence length - 25 salltitles All subject title(s), separated by a '<>' + 25 salltitles All subject title(s), separated by a '<>' ====== ============= =========================================== The third option is to customise the tabular output by selecting which @@ -735,8 +743,9 @@ The pairwise output (the default on the NCBI BLAST website) shows each match as a pairwise alignment with the query. The two query anchored outputs show a multiple sequence alignment between the query and all the matches, and differ in how insertions are shown (marked as insertions or with gap characters added to the other sequences). - </token> - <token name="@FASTA_WARNING@">.. class:: warningmark + ]]></token> + <token name="@FASTA_WARNING@"><![CDATA[ +.. class:: warningmark You can also search against a FASTA file of subject (target) sequences. This is *not* advised because it is slower (only one @@ -744,25 +753,26 @@ searches (very small e-values which will look overly signficiant). In most cases you should instead turn the other FASTA file into a database first using *makeblastdb* and search against that. - </token> - <token name="@SEARCH_TIME_WARNING@">.. class:: warningmark + ]]></token> + <token name="@SEARCH_TIME_WARNING@"><![CDATA[ +.. class:: warningmark **Note**. Database searches may take a substantial amount of time. For large input datasets it is advisable to allow overnight processing. ----- - </token> - - <token name="@CLI_OPTIONS@">**Advanced Options** + ]]></token> + <token name="@CLI_OPTIONS@"><![CDATA[ +**Advanced Options** For help with advanced options and their default values, visit the NCBI BLAST® Command Line Applications User Manual, Appendices, `Options for the command-line applications -<https://www.ncbi.nlm.nih.gov/books/NBK279684/#_appendices_Options_for_the_commandline_a_>`_. +<https://www.ncbi.nlm.nih.gov/books/NBK279684/#_appendices_Options_for_the_commandline_a_>`_. For amino acid substitution matrices, see `BLAST Substitution Matrices -<https://www.ncbi.nlm.nih.gov/books/NBK279684/#_appendices_BLAST_Substitution_Matrices_>`_ in the same +<https://www.ncbi.nlm.nih.gov/books/NBK279684/#_appendices_BLAST_Substitution_Matrices_>`_ in the same appendices. - </token> + ]]></token> </macros>
--- a/tools/ncbi_blast_plus/ncbi_makeblastdb.xml Tue Jun 05 11:42:10 2018 -0400 +++ b/tools/ncbi_blast_plus/ncbi_makeblastdb.xml Sat Jun 30 15:13:38 2018 -0400 @@ -5,21 +5,30 @@ <import>ncbi_macros.xml</import> </macros> <expand macro="preamble" /> - <command detect_errors="aggressive" strict="true"> + <command detect_errors="aggressive" strict="true"><![CDATA[ python $__tool_directory__/check_no_duplicates.py ##First check for duplicates (since BLAST+ 2.2.28 fails to do so) ##and abort (via the ampersand ampersand trick) if any are found. #for i in $input_file#'${i}' #end for# -&& -makeblastdb -out '${os.path.join($outfile.files_path, "blastdb")}' +&& +##makeblastdb does not like input redirects of the sort +##makeblastdb -in <(gunzip -c gzipped_fasta_file) +##therefore we're cramming everything +##into a single cat command below +cat +#for i in $input_file: + #if $i.is_of_type('fasta.gz'): + <(gunzip -c ${i}) + #else: + ${i} + #end if +#end for +| makeblastdb -out '${os.path.join($outfile.files_path, "blastdb")}' $parse_seqids $hash_index -## Single call to -in with multiple filenames space separated with outer quotes -## (presumably any filenames with spaces would be a problem). Note this gives -## some extra spaces, e.g. -in "file1 file2 file3 " but BLAST seems happy: --in '#for i in $input_file#${i} #end for#' +-in - #if $title: --title '$title' +-title '${title}' #else: ##Would default to being based on the cryptic Galaxy filenames, which is unhelpful -title 'BLAST Database' @@ -46,8 +55,8 @@ #end if ## -------------------------------------------------------------------- ## Capture the stdout log information to the primary file (plain text): -> "$outfile" - </command> +> '$outfile' + ]]></command> <inputs> <param argument="-dbtype" type="select" display="radio" label="Molecule type of input"> <option value="prot">protein</option> @@ -57,7 +66,7 @@ NOTE Double check the new database would be self contained first --> <!-- Note this is a mandatory parameter - default should be most recent FASTA file --> - <param name="input_file" argument="-in" type="data" multiple="true" optional="false" format="fasta" label="Input FASTA files(s)" help="One or more FASTA files" /> + <param name="input_file" argument="-in" type="data" multiple="true" optional="false" format="fasta,fasta.gz" label="Input FASTA files(s)" help="One or more FASTA files" /> <param argument="-title" type="text" value="" label="Title for BLAST database" help="This is the database name shown in BLAST search output" /> <param argument="-parse_seqids" type="boolean" truevalue="-parse_seqids" falsevalue="" checked="false" label="Parse the sequence identifiers" help="This is only advised if your FASTA file follows the NCBI naming conventions using pipe '|' symbols" /> <param argument="-hash_index" type="boolean" truevalue="-hash_index" falsevalue="" checked="true" label="Enable the creation of sequence hash values" help="These hash values can then be used to quickly determine if a given sequence data exists in this BLAST database." /> @@ -158,7 +167,7 @@ </test> <test> <param name="dbtype" value="nucl" /> - <param name="input_file" value="three_human_mRNA.fasta" ftype="fasta" /> + <param name="input_file" value="three_human_mRNA.fasta.gz" ftype="fasta.gz" /> <param name="title" value="Just 3 human mRNA sequences" /> <param name="parse_seqids" value="" /> <param name="hash_index" value="true" />