changeset 21:b80762947b27 draft

Uploaded v0.0.8 preview 16, split out XML formatter into standalone script
author peterjc
date Mon, 16 Sep 2013 15:02:33 -0400
parents fe9dbe7817cc
children 8c462f7b2c8d
files blast2go/README.rst blast2go/blast2go.py blast2go/massage_xml_for_blast2go.py
diffstat 3 files changed, 108 insertions(+), 51 deletions(-) [+]
line wrap: on
line diff
--- a/blast2go/README.rst	Mon Sep 16 08:59:42 2013 -0400
+++ b/blast2go/README.rst	Mon Sep 16 15:02:33 2013 -0400
@@ -67,6 +67,7 @@
 
 * blast2go.xml (the Galaxy tool definition)
 * blast2go.py (the Python wrapper script)
+* massage_xml_for_blast2go.py (Python XML reformatting script)
 * README.rst (this file)
 
 For a manual installation of the wrapper you will also need to modify the
@@ -148,6 +149,7 @@
         - Use reStructuredText for this README file.
         - Updated citation information (Cock et al. 2013).
         - Development moved to GitHub, https://github.com/peterjc/galaxy_blast
+        - Split out massage_xml_for_blast2go.py as a standalone file.
 ======= ======================================================================
 
 
@@ -164,7 +166,7 @@
 For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball I use
 the following command from the Galaxy root folder::
 
-    $ tar -czf blast2go.tar.gz blast2go/README.rst blast2go/blast2go.xml blast2go/blast2go.py blast2go/repository_dependencies.xml blast2go/tool_dependencies.xml tool-data/blast2go.loc.sample test-data/blastp_sample.xml test-data/blastp_sample.blast2go.tabular
+    $ tar -czf blast2go.tar.gz blast2go/README.rst blast2go/blast2go.xml blast2go/blast2go.py blast2go/massage_xml_for_blast2go.py blast2go/repository_dependencies.xml blast2go/tool_dependencies.xml tool-data/blast2go.loc.sample test-data/blastp_sample.xml test-data/blastp_sample.blast2go.tabular
 
 Check this worked::
 
@@ -172,6 +174,7 @@
     blast2go/README.rst
     blast2go/blast2go.xml
     blast2go/blast2go.py
+    blast2go/massage_xml_for_blast2go.py
     blast2go/repository_dependencies.xml
     blast2go/tool_dependencies.xml
     tool-data/blast2go.loc.sample
--- a/blast2go/blast2go.py	Mon Sep 16 08:59:42 2013 -0400
+++ b/blast2go/blast2go.py	Mon Sep 16 15:02:33 2013 -0400
@@ -18,6 +18,13 @@
 
 It then calls the Java command line tool, and moves the output file to
 the location Galaxy is expecting, and removes the tempory XML file.
+
+This script is called from my Galaxy wrapper for Blast2GO for pipelines,
+available from the Galaxy Tool Shed here:
+http://toolshed.g2.bx.psu.edu/view/peterjc/blast2go
+
+This script is under version control here:
+https://github.com/peterjc/galaxy_blast/tree/master/blast2go
 """
 import sys
 import os
@@ -32,6 +39,11 @@
     sys.stderr.write("%s\n" % msg)
     sys.exit(error_level)
 
+try:
+    from massage_xml_for_blast2go import prepare_xml
+except ImportError:
+    stop_err("Missing sister file massage_xml_for_blast2go.py")
+
 if len(sys.argv) != 4:
     stop_err("Require three arguments: XML filename, properties filename, output tabular filename")
 
@@ -55,56 +67,6 @@
         stop_err("Blast2GO configuration file not found: %s" % prop_file)
     del tmp
 
-def prepare_xml(original_xml, mangled_xml):
-    """Reformat BLAST XML to suit Blast2GO.
-
-    Blast2GO can't cope with 1000s of <Iteration> tags within a
-    single <BlastResult> tag, so instead split this into one
-    full XML record per interation (i.e. per query). This gives
-    a concatenated XML file mimicing old versions of BLAST.
-
-    This also checks for BLASTP or BLASTX output, and outputs
-    the number of queries. Galaxy will show this as "info".
-    """
-    in_handle = open(original_xml)
-    footer = "  </BlastOutput_iterations>\n</BlastOutput>\n"
-    header = ""
-    while True:
-        line = in_handle.readline()
-        if not line:
-            #No hits?
-            stop_err("Problem with XML file?")
-        if line.strip() == "<Iteration>":
-            break
-        header += line
-
-    if "<BlastOutput_program>blastx</BlastOutput_program>" in header:
-        print "BLASTX output identified"
-    elif "<BlastOutput_program>blastp</BlastOutput_program>" in header:
-        print "BLASTP output identified"
-    else:
-        in_handle.close()
-        stop_err("Expect BLASTP or BLASTX output")
-
-    out_handle = open(mangled_xml, "w")
-    out_handle.write(header)
-    out_handle.write(line)
-    count = 1
-    while True:
-        line = in_handle.readline()
-        if not line:
-            break
-        elif line.strip() == "<Iteration>":
-           #Insert footer/header
-           out_handle.write(footer)
-           out_handle.write(header)
-           count += 1
-        out_handle.write(line)
-
-    out_handle.close()
-    in_handle.close()
-    print "Input has %i queries" % count
-
 
 def run(cmd):
     #Avoid using shell=True when we call subprocess to ensure if the Python
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/blast2go/massage_xml_for_blast2go.py	Mon Sep 16 15:02:33 2013 -0400
@@ -0,0 +1,92 @@
+#!/usr/bin/env python
+"""Script for reformatting Blast XML to suit Blast2GO.
+
+This script takes exactly two command line arguments:
+ * Input BLAST XML filename
+ * Output BLAST XML filename
+
+Sadly b2g4pipe (at least v2.3.5 to v2.5.0) cannot cope with current
+style large BLAST XML files (e.g. from BLAST 2.2.25+), so we reformat
+these to avoid it crashing with a Java heap space OutOfMemoryError.
+
+As part of this reformatting, we check for BLASTP or BLASTX output
+(otherwise raise an error), and print the query count.
+
+This script is called from my Galaxy wrapper for Blast2GO for pipelines,
+available from the Galaxy Tool Shed here:
+http://toolshed.g2.bx.psu.edu/view/peterjc/blast2go 
+
+This script is under version control here:
+https://github.com/peterjc/galaxy_blast/tree/master/blast2go
+"""
+import sys
+import os
+import subprocess
+
+def stop_err(msg, error_level=1):
+    """Print error message to stderr and quit with given error level."""
+    sys.stderr.write("%s\n" % msg)
+    sys.exit(error_level)
+
+def prepare_xml(original_xml, mangled_xml):
+    """Reformat BLAST XML to suit Blast2GO.
+
+    Blast2GO can't cope with 1000s of <Iteration> tags within a
+    single <BlastOutput> tag, so instead split this into one
+    full XML record per iteration (i.e. per query). This gives
+    a concatenated XML file mimicking old versions of BLAST.
+
+    This also checks for BLASTP or BLASTX output, and outputs
+    the number of queries. Galaxy will show this as "info".
+    """
+    in_handle = open(original_xml)
+    footer = "  </BlastOutput_iterations>\n</BlastOutput>\n"
+    header = ""
+    while True:
+        line = in_handle.readline()
+        if not line:
+            #No hits?
+            stop_err("Problem with XML file?")
+        if line.strip() == "<Iteration>":
+            break
+        header += line
+
+    if "<BlastOutput_program>blastx</BlastOutput_program>" in header:
+        print "BLASTX output identified"
+    elif "<BlastOutput_program>blastp</BlastOutput_program>" in header:
+        print "BLASTP output identified"
+    else:
+        in_handle.close()
+        stop_err("Expect BLASTP or BLASTX output")
+
+    out_handle = open(mangled_xml, "w")
+    out_handle.write(header)
+    out_handle.write(line)
+    count = 1
+    while True:
+        line = in_handle.readline()
+        if not line:
+            break
+        elif line.strip() == "<Iteration>":
+           #Insert footer/header
+           out_handle.write(footer)
+           out_handle.write(header)
+           count += 1
+        out_handle.write(line)
+
+    out_handle.close()
+    in_handle.close()
+    print "Input has %i queries" % count
+
+
+if __name__ == "__main__":
+    # Run the conversion...
+    if len(sys.argv) != 3:
+        stop_err("Require two arguments: XML input filename, XML output filename")
+
+    xml_file, out_xml_file = sys.argv[1:]
+
+    if not os.path.isfile(xml_file):
+        stop_err("Input BLAST XML file not found: %s" % xml_file)
+
+    prepare_xml(xml_file, out_xml_file)