# HG changeset patch # User peterjc # Date 1307476306 14400 # Node ID 7b53cc52e7eda18a49312bfab66a962d4b6ada71 # Parent 4bfd64cf18ab5d0fe74e14afdb6634d8a5f9abb2 Migrated tool version 0.0.2 from old tool shed archive to new tool shed repository diff -r 4bfd64cf18ab -r 7b53cc52e7ed tools/ncbi_blast_plus/blast2go.py --- a/tools/ncbi_blast_plus/blast2go.py Tue Jun 07 15:51:25 2011 -0400 +++ b/tools/ncbi_blast_plus/blast2go.py Tue Jun 07 15:51:46 2011 -0400 @@ -6,8 +6,15 @@ * Blast2GO properties filename (settings file) * Output tabular filename +Sadly b2g4pipe v2.3.5 cannot cope with current style large BLAST XML +files (e.g. from BLAST 2.2.25+), so we have to reformat these to +avoid it crashing with a Java heap space OutOfMemoryError. + +As part of this reformatting, we check for BLASTP or BLASTX output +(otherwise raise an error), and print the query count. + It then calls the Java command line tool, and moves the output file to -the location Galaxy is expecting. +the location Galaxy is expecting, and removes the tempory XML file. """ import sys import os @@ -27,12 +34,66 @@ xml_file, prop_file, tabular_file = sys.argv[1:] +#We should have write access here: +tmp_xml_file = tabular_file + ".tmp.xml" + if not os.path.isfile(xml_file): stop_err("Input BLAST XML file not found: %s" % xml_file) if not os.path.isfile(prop_file): stop_err("Blast2GO configuration file not found: %s" % prop_file) +def prepare_xml(original_xml, mangled_xml): + """Reformat BLAST XML to suit Blast2GO. + + Blast2GO can't cope with 1000s of tags within a + single tag, so instead split this into one + full XML record per interation (i.e. per query). This gives + a concatenated XML file mimicing old versions of BLAST. + + This also checks for BLASTP or BLASTX output, and outputs + the number of queries. Galaxy will show this as "info". + """ + in_handle = open(original_xml) + footer = " \n\n" + header = "" + while True: + line = in_handle.readline() + if not line: + #No hits? + stop_err("Problem with XML file?") + if line.strip() == "": + break + header += line + + if "blastx" in header: + print "BLASTX output identified" + elif "blastp" in header: + print "BLASTP output identified" + else: + in_handle.close() + stop_err("Expect BLASTP or BLASTX output") + + out_handle = open(mangled_xml, "w") + out_handle.write(header) + out_handle.write(line) + count = 1 + while True: + line = in_handle.readline() + if not line: + break + elif line.strip() == "": + #Insert footer/header + out_handle.write(footer) + out_handle.write(header) + count += 1 + out_handle.write(line) + + out_handle.close() + in_handle.close() + print "Input has %i queries" % count + + def run(cmd): #Avoid using shell=True when we call subprocess to ensure if the Python #script is killed, so too is the child process. @@ -44,10 +105,11 @@ stdout, stderr = child.communicate() return_code = child.returncode if return_code: + cmd_str = " ".join(cmd) if stderr and stdout: - stop_err("Return code %i from command:\n%s\n\n%s\n\n%s" % (return_code, err, stdout, stderr)) + stop_err("Return code %i from command:\n%s\n\n%s\n\n%s" % (return_code, cmd_str, stdout, stderr)) else: - stop_err("Return code %i from command:\n%s\n%s" % (return_code, err, stderr)) + stop_err("Return code %i from command:\n%s\n%s" % (return_code, cmd_str, stderr)) #For early diagnostics, else: print stdout @@ -56,16 +118,25 @@ if not os.path.isfile(blast2go_jar): stop_err("Blast2GO JAR file not found: %s" % blast2go_jar) -#We will have write access whereever the output should be, +prepare_xml(xml_file, tmp_xml_file) +#print "XML file prepared for Blast2GO" + +#We will have write access wherever the output should be, #so we'll ask Blast2GO to use that as the stem for its output #(it will append .annot to the filename) cmd = ["java", "-jar", blast2go_jar, - "-in", xml_file, + "-in", tmp_xml_file, "-prop", prop_file, - "-out", tabular_file, - "-a"] + "-out", tabular_file, #Used as base name for output files + "-a", # Generate *.annot tabular file + #"-img", # Generate images, feature not in v2.3.5 + ] +#print " ".join(cmd) run(cmd) +#Remove the temp XML file +os.remove(tmp_xml_file) + out_file = tabular_file + ".annot" if not os.path.isfile(out_file): stop_err("ERROR - No output annotation file from Blast2GO") diff -r 4bfd64cf18ab -r 7b53cc52e7ed tools/ncbi_blast_plus/blast2go.txt --- a/tools/ncbi_blast_plus/blast2go.txt Tue Jun 07 15:51:25 2011 -0400 +++ b/tools/ncbi_blast_plus/blast2go.txt Tue Jun 07 15:51:46 2011 -0400 @@ -76,6 +76,11 @@ ======= v0.0.1 - Initial public release +v0.0.2 - Documentation clarifications, e.g. concatenated BLAST XML is allowed. + - Fixed error handler in wrapper script (for when b2g4pipe fails). + - Reformats the XML to use old NCBI-style concatenated BLAST XML since + b2g4pipe crashes with heap space error on with large files using + current NCBI output. Developers diff -r 4bfd64cf18ab -r 7b53cc52e7ed tools/ncbi_blast_plus/blast2go.xml --- a/tools/ncbi_blast_plus/blast2go.xml Tue Jun 07 15:51:25 2011 -0400 +++ b/tools/ncbi_blast_plus/blast2go.xml Tue Jun 07 15:51:46 2011 -0400 @@ -1,4 +1,4 @@ - + Maps BLAST results to GO annotation terms blast2go.py $xml ${prop.fields.path} $tab @@ -35,8 +35,13 @@ for use in pipelines. It takes as input BLAST XML results against a protein database, typically -the NCBI non-redundant (NR) database. The BLAST matches are used to assign -Gene Ontology (GO) annotation terms to each query sequence. +the NCBI non-redundant (NR) database. This tool will accept concatenated +BLAST XML files (although they are technically invalid XML), which is very +useful if you have sub-divided your protein FASTA files and run BLAST on +them in batches. + +The BLAST matches are used to assign Gene Ontology (GO) annotation terms +to each query sequence. The output from this tool is a tabular file containing three columns, with the order taken from query order in the original BLAST XML file: @@ -52,6 +57,16 @@ Note that if no GO terms are assigned to a sequence (e.g. if it had no BLAST matches), then it will not be present in the output file. + +**Advanced Settings** + +Blast2GO has a properties setting file which includes which database +server to connect to (e.g. the public server in Valencia, Spain, or a +local server), as well as more advanced options such as thresholds and +evidence code weights. To change these settings, your Galaxy administrator +must create a new properties file, and add it to the drop down menu above. + + **References** S. Götz et al.