Mercurial > repos > peterjc > blastxml_to_top_descr
changeset 0:b23e0a323bee draft
Uploaded v0.0.2b
author | peterjc |
---|---|
date | Mon, 29 Apr 2013 13:08:32 -0400 |
parents | |
children | ec31e95c56e1 |
files | tools/ncbi_blast_plus/blastxml_to_top_descr.py tools/ncbi_blast_plus/blastxml_to_top_descr.txt tools/ncbi_blast_plus/blastxml_to_top_descr.xml tools/ncbi_blast_plus/repository_dependencies.xml |
diffstat | 4 files changed, 261 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/ncbi_blast_plus/blastxml_to_top_descr.py Mon Apr 29 13:08:32 2013 -0400 @@ -0,0 +1,115 @@ +#!/usr/bin/env python +"""Convert a BLAST XML file to a top hits description table. + +Takes three command line options, input BLAST XML filename, output tabular +BLAST filename, number of hits to collect the descriptions of. +""" +import sys +import re + +if sys.version_info[:2] >= ( 2, 5 ): + import xml.etree.cElementTree as ElementTree +else: + from galaxy import eggs + import pkg_resources; pkg_resources.require( "elementtree" ) + from elementtree import ElementTree + +def stop_err( msg ): + sys.stderr.write("%s\n" % msg) + sys.exit(1) + +#Parse Command Line +try: + in_file, out_file, topN = sys.argv[1:] +except: + stop_err("Expect 3 arguments: input BLAST XML file, output tabular file, number of hits") + + +try: + topN = int(topN) +except ValueError: + stop_err("Number of hits argument should be an integer (at least 1)") +if topN < 1: + stop_err("Number of hits argument should be an integer (at least 1)") + +# get an iterable +try: + context = ElementTree.iterparse(in_file, events=("start", "end")) +except: + stop_err("Invalid data format.") +# turn it into an iterator +context = iter(context) +# get the root element +try: + event, root = context.next() +except: + stop_err( "Invalid data format." ) + + +re_default_query_id = re.compile("^Query_\d+$") +assert re_default_query_id.match("Query_101") +assert not re_default_query_id.match("Query_101a") +assert not re_default_query_id.match("MyQuery_101") +re_default_subject_id = re.compile("^Subject_\d+$") +assert re_default_subject_id.match("Subject_1") +assert not re_default_subject_id.match("Subject_") +assert not re_default_subject_id.match("Subject_12a") +assert not re_default_subject_id.match("TheSubject_1") + + +count = 0 +outfile = open(out_file, 'w') +outfile.write("#Query\t%s\n" % "\t".join("BLAST hit %i" % (i+1) for i in range(topN))) +for event, elem in context: + # for every <Iteration> tag + if event == "end" and elem.tag == "Iteration": + #Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA + # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID> + # <Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def> + # <Iteration_query-len>406</Iteration_query-len> + # <Iteration_hits></Iteration_hits> + # + #Or, from BLAST 2.2.24+ run online + # <Iteration_query-ID>Query_1</Iteration_query-ID> + # <Iteration_query-def>Sample</Iteration_query-def> + # <Iteration_query-len>516</Iteration_query-len> + # <Iteration_hits>... + qseqid = elem.findtext("Iteration_query-ID") + if qseqid is None: + stop_err("Missing <Iteration_query-ID> (could be really old BLAST XML data?)") + if re_default_query_id.match(qseqid): + #Place holder ID, take the first word of the query definition + qseqid = elem.findtext("Iteration_query-def").split(None,1)[0] + # for every <Hit> within <Iteration> + hit_descrs = [] + for hit in elem.findall("Iteration_hits/Hit"): + #Expecting either this, + # <Hit_id>gi|3024260|sp|P56514.1|OPSD_BUFBU</Hit_id> + # <Hit_def>RecName: Full=Rhodopsin</Hit_def> + # <Hit_accession>P56514</Hit_accession> + #or, + # <Hit_id>Subject_1</Hit_id> + # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def> + # <Hit_accession>Subject_1</Hit_accession> + # + #apparently depending on the parse_deflines switch + sseqid = hit.findtext("Hit_id").split(None,1)[0] + hit_def = sseqid + " " + hit.findtext("Hit_def") + if re_default_subject_id.match(sseqid) \ + and sseqid == hit.findtext("Hit_accession"): + #Place holder ID, take the first word of the subject definition + hit_def = hit.findtext("Hit_def") + sseqid = hit_def.split(None,1)[0] + assert hit_def not in hit_descrs + hit_descrs.append(hit_def) + #print "%r has %i hits" % (qseqid, len(hit_descrs)) + hit_descrs = hit_descrs[:topN] + while len(hit_descrs) < topN: + hit_descrs.append("") + outfile.write("%s\t%s\n" % (qseqid, "\t".join(hit_descrs))) + count += 1 + # prevents ElementTree from growing large datastructure + root.clear() + elem.clear() +outfile.close() +print "%i BLAST results" % count
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/ncbi_blast_plus/blastxml_to_top_descr.txt Mon Apr 29 13:08:32 2013 -0400 @@ -0,0 +1,94 @@ +Galaxy tool to extract top BLAST hit descriptions from BLAST XML +================================================================ + +This tool is copyright 2012-2013 by Peter Cock, The James Hutton Institute +(formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved. +See the licence text below. + +This tool is a short Python script to parse a BLAST XML file, and extract the +identifiers with description for the top matches (by default the top 3), and +output these as a simple tabular file along with the query identifiers. + +This requires the 'blast_datatypes' repository from the Galaxy Tool Shed +to provide the 'blastxml' file format definition. + + +Automated Installation +====================== + +This should be straightforward, Galaxy should automatically install the +'blast_datatypes' dependency. + + + +Manual Installation +=================== + +If you haven't done so before, first install the 'blast_datatypes' repository. + +There are just two files to install (if doing this manually): + +* blastxml_to_top_descr.py (the Python script) +* blastxml_to_top_descr.xml (the Galaxy tool definition) + +The suggested location is in the Galaxy folder tools/ncbi_blast_plus next to +the NCBI BLAST+ tool wrappers. + +You will also need to modify the tools_conf.xml file to tell Galaxy to offer +the tool. e.g. next to the NCBI BLAST+ tools. Simply add the line: + +<tool file="filters/seq_select_by_id.xml" /> + +To run the tool's tests, also add this line to tools_conf.xml.sample then: + +$ sh run_functional_tests.sh -id blastxml_to_top_descr + + +History +======= + +v0.0.1 - Initial version. +v0.0.2 - Since BLAST+ was moved out of the Galaxy core, now have a dependency + on the 'blast_datatypes' repository in the Tool Shed. + + +Developers +========== + +This script and related tools are being developed on the following hg branch: +http://bitbucket.org/peterjc/galaxy-central/src/tools + +For making the "Galaxy Tool Shed" http://community.g2.bx.psu.edu/ tarball use +the following command from the Galaxy root folder: + +$ tar -czf blastxml_to_top_descr.tar.gz tools/ncbi_blast_plus/blastxml_to_top_descr.* tools/ncbi_blast_plus/repository_dependencies.xml + +Check this worked: + +$ tar -tzf blastxml_to_top_descr.tar.gz +tools/ncbi_blast_plus/blastxml_to_top_descr.py +tools/ncbi_blast_plus/blastxml_to_top_descr.txt +tools/ncbi_blast_plus/blastxml_to_top_descr.xml +tools/ncbi_blast_plus/repository_dependencies.xml + + +Licence (MIT/BSD style) +======================= + +Permission to use, copy, modify, and distribute this software and its +documentation with or without modifications and for any purpose and +without fee is hereby granted, provided that any copyright notices +appear in all copies and that both those copyright notices and this +permission notice appear in supporting documentation, and that the +names of the contributors or copyright holders not be used in +advertising or publicity pertaining to distribution of the software +without specific prior permission. + +THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL +WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE +CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT +OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE +OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE +OR PERFORMANCE OF THIS SOFTWARE.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/ncbi_blast_plus/blastxml_to_top_descr.xml Mon Apr 29 13:08:32 2013 -0400 @@ -0,0 +1,47 @@ +<tool id="blastxml_to_top_descr" name="BLAST top hit descriptions" version="0.0.1"> + <description>Make a table from BLAST XML</description> + <command interpreter="python"> + blastxml_to_top_descr.py $blastxml_file $tabular_file $topN + </command> + <inputs> + <param name="blastxml_file" type="data" format="blastxml" label="BLAST results as XML"/> + <param name="topN" type="integer" min="1" max="100" optional="false" label="Number of descriptions" value="3"/> + </inputs> + <outputs> + <data name="tabular_file" format="tabular" label="Top $topN descriptions from $blastxml_file.name" /> + </outputs> + <requirements> + </requirements> + <tests> + <test> + <param name="blastxml_file" value="blastp_four_human_vs_rhodopsin.xml" ftype="blastxml" /> + <param name="topN" value="3" /> + <output name="tabular_file" file="blastp_four_human_vs_rhodopsin_top3.tabular" ftype="tabular" /> + </test> + </tests> + <help> + +**What it does** + +NCBI BLAST+ (and the older NCBI 'legacy' BLAST) can output in a range of +formats including text, tabular and a more detailed XML format. You can +do a lot of things with tabular files in Galaxy (sorting, filtering, joins, +etc) however currently the BLAST tabular output omits the hit descriptions +found in the other output formats. + +This tool turns a BLAST XML file into a simple tabular file containing +one row per query sequence, containing the query identifier and then +the three (by default) top hit descriptions. If a query doesn't have +that many hits, then these entries are left blank. + +**Example Usage** + +One simple usage would be to take a transcriptome assembly or set of +gene predictions, run a BLAST search against the NCBI NR database, and +then use this tool to make a table of the top three BLAST hits. This +can give you a 'quick and dirty' crude annotation, potentially enough +to spot some problems (e.g. bacterial contaimination could be very +obvious). + + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/ncbi_blast_plus/repository_dependencies.xml Mon Apr 29 13:08:32 2013 -0400 @@ -0,0 +1,5 @@ +<?xml version="1.0"?> +<repositories description="This requires the BLAST datatype definitions (e.g. the BLAST XML format)."> +<!-- Revision 4:f9a7783ed7b6 on the main tool shed is v0.0.14 which added BLAST databases --> +<repository toolshed="http://toolshed.g2.bx.psu.edu" name="blast_datatypes" owner="devteam" changeset_revision="f9a7783ed7b6" /> +</repositories>