Mercurial > repos > peterjc > blast2go
changeset 21:b80762947b27 draft
Uploaded v0.0.8 preview 16, split out XML formatter into standalone script
author | peterjc |
---|---|
date | Mon, 16 Sep 2013 15:02:33 -0400 |
parents | fe9dbe7817cc |
children | 8c462f7b2c8d |
files | blast2go/README.rst blast2go/blast2go.py blast2go/massage_xml_for_blast2go.py |
diffstat | 3 files changed, 108 insertions(+), 51 deletions(-) [+] |
line wrap: on
line diff
--- a/blast2go/README.rst Mon Sep 16 08:59:42 2013 -0400 +++ b/blast2go/README.rst Mon Sep 16 15:02:33 2013 -0400 @@ -67,6 +67,7 @@ * blast2go.xml (the Galaxy tool definition) * blast2go.py (the Python wrapper script) +* massage_xml_for_blast2go.py (Python XML reformatting script) * README.rst (this file) For a manual installation of the wrapper you will also need to modify the @@ -148,6 +149,7 @@ - Use reStructuredText for this README file. - Updated citation information (Cock et al. 2013). - Development moved to GitHub, https://github.com/peterjc/galaxy_blast + - Split out massage_xml_for_blast2go.py as a standalone file. ======= ====================================================================== @@ -164,7 +166,7 @@ For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball I use the following command from the Galaxy root folder:: - $ tar -czf blast2go.tar.gz blast2go/README.rst blast2go/blast2go.xml blast2go/blast2go.py blast2go/repository_dependencies.xml blast2go/tool_dependencies.xml tool-data/blast2go.loc.sample test-data/blastp_sample.xml test-data/blastp_sample.blast2go.tabular + $ tar -czf blast2go.tar.gz blast2go/README.rst blast2go/blast2go.xml blast2go/blast2go.py blast2go/massage_xml_for_blast2go.py blast2go/repository_dependencies.xml blast2go/tool_dependencies.xml tool-data/blast2go.loc.sample test-data/blastp_sample.xml test-data/blastp_sample.blast2go.tabular Check this worked:: @@ -172,6 +174,7 @@ blast2go/README.rst blast2go/blast2go.xml blast2go/blast2go.py + blast2go/massage_xml_for_blast2go.py blast2go/repository_dependencies.xml blast2go/tool_dependencies.xml tool-data/blast2go.loc.sample
--- a/blast2go/blast2go.py Mon Sep 16 08:59:42 2013 -0400 +++ b/blast2go/blast2go.py Mon Sep 16 15:02:33 2013 -0400 @@ -18,6 +18,13 @@ It then calls the Java command line tool, and moves the output file to the location Galaxy is expecting, and removes the temporary XML file. + +This script is called from my Galaxy wrapper for Blast2GO for pipelines, +available from the Galaxy Tool Shed here: +http://toolshed.g2.bx.psu.edu/view/peterjc/blast2go + +This script is under version control here: +https://github.com/peterjc/galaxy_blast/tree/master/blast2go """ import sys import os @@ -32,6 +39,11 @@ sys.stderr.write("%s\n" % msg) sys.exit(error_level) +try: + from massage_xml_for_blast2go import prepare_xml +except ImportError: + stop_err("Missing sister file massage_xml_for_blast2go.py") + if len(sys.argv) != 4: stop_err("Require three arguments: XML filename, properties filename, output tabular filename") @@ -55,56 +67,6 @@ stop_err("Blast2GO configuration file not found: %s" % prop_file) del tmp -def prepare_xml(original_xml, mangled_xml): - """Reformat BLAST XML to suit Blast2GO. - - Blast2GO can't cope with 1000s of <Iteration> tags within a - single <BlastResult> tag, so instead split this into one - full XML record per interation (i.e. per query). This gives - a concatenated XML file mimicing old versions of BLAST. - - This also checks for BLASTP or BLASTX output, and outputs - the number of queries. Galaxy will show this as "info". - """ - in_handle = open(original_xml) - footer = " </BlastOutput_iterations>\n</BlastOutput>\n" - header = "" - while True: - line = in_handle.readline() - if not line: - #No hits? 
- stop_err("Problem with XML file?") - if line.strip() == "<Iteration>": - break - header += line - - if "<BlastOutput_program>blastx</BlastOutput_program>" in header: - print "BLASTX output identified" - elif "<BlastOutput_program>blastp</BlastOutput_program>" in header: - print "BLASTP output identified" - else: - in_handle.close() - stop_err("Expect BLASTP or BLASTX output") - - out_handle = open(mangled_xml, "w") - out_handle.write(header) - out_handle.write(line) - count = 1 - while True: - line = in_handle.readline() - if not line: - break - elif line.strip() == "<Iteration>": - #Insert footer/header - out_handle.write(footer) - out_handle.write(header) - count += 1 - out_handle.write(line) - - out_handle.close() - in_handle.close() - print "Input has %i queries" % count - def run(cmd): #Avoid using shell=True when we call subprocess to ensure if the Python
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/blast2go/massage_xml_for_blast2go.py Mon Sep 16 15:02:33 2013 -0400 @@ -0,0 +1,92 @@ +#!/usr/bin/env python +"""Script for reformatting Blast XML to suit Blast2GO. + +This script takes exactly two command line arguments: + * Input BLAST XML filename + * Output BLAST XML filename + +Sadly b2g4pipe (at least v2.3.5 to v2.5.0) cannot cope with current +style large BLAST XML files (e.g. from BLAST 2.2.25+), so we reformat +these to avoid it crashing with a Java heap space OutOfMemoryError. + +As part of this reformatting, we check for BLASTP or BLASTX output +(otherwise raise an error), and print the query count. + +This script is called from my Galaxy wrapper for Blast2GO for pipelines, +available from the Galaxy Tool Shed here: +http://toolshed.g2.bx.psu.edu/view/peterjc/blast2go + +This script is under version control here: +https://github.com/peterjc/galaxy_blast/tree/master/blast2go +""" +import sys +import os +import subprocess + +def stop_err(msg, error_level=1): + """Print error message to stderr and quit with given error level.""" + sys.stderr.write("%s\n" % msg) + sys.exit(error_level) + +def prepare_xml(original_xml, mangled_xml): + """Reformat BLAST XML to suit Blast2GO. + + Blast2GO can't cope with 1000s of <Iteration> tags within a + single <BlastResult> tag, so instead split this into one + full XML record per iteration (i.e. per query). This gives + a concatenated XML file mimicking old versions of BLAST. + + This also checks for BLASTP or BLASTX output, and outputs + the number of queries. Galaxy will show this as "info". + """ + in_handle = open(original_xml) + footer = " </BlastOutput_iterations>\n</BlastOutput>\n" + header = "" + while True: + line = in_handle.readline() + if not line: + #No hits? 
+ stop_err("Problem with XML file?") + if line.strip() == "<Iteration>": + break + header += line + + if "<BlastOutput_program>blastx</BlastOutput_program>" in header: + print "BLASTX output identified" + elif "<BlastOutput_program>blastp</BlastOutput_program>" in header: + print "BLASTP output identified" + else: + in_handle.close() + stop_err("Expect BLASTP or BLASTX output") + + out_handle = open(mangled_xml, "w") + out_handle.write(header) + out_handle.write(line) + count = 1 + while True: + line = in_handle.readline() + if not line: + break + elif line.strip() == "<Iteration>": + #Insert footer/header + out_handle.write(footer) + out_handle.write(header) + count += 1 + out_handle.write(line) + + out_handle.close() + in_handle.close() + print "Input has %i queries" % count + + +if __name__ == "__main__": + # Run the conversion... + if len(sys.argv) != 3: + stop_err("Require two arguments: XML input filename, XML output filename") + + xml_file, out_xml_file = sys.argv[1:] + + if not os.path.isfile(xml_file): + stop_err("Input BLAST XML file not found: %s" % xml_file) + + prepare_xml(xml_file, out_xml_file)