annotate blast2go-7b53cc52e7ed/tools/ncbi_blast_plus/blast2go.py @ 0:c5c578f2a1ba draft

Uploaded
author devteam
date Fri, 17 May 2013 07:34:10 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
1 #!/usr/bin/env python
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
2 """Galaxy wrapper for Blast2GO for pipelines, b2g4pipe v2.3.5.
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
3
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
4 This script takes exactly three command line arguments:
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
5 * Input BLAST XML filename
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
6 * Blast2GO properties filename (settings file)
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
7 * Output tabular filename
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
8
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
9 Sadly b2g4pipe v2.3.5 cannot cope with current style large BLAST XML
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
10 files (e.g. from BLAST 2.2.25+), so we have to reformat these to
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
11 avoid it crashing with a Java heap space OutOfMemoryError.
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
12
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
13 As part of this reformatting, we check for BLASTP or BLASTX output
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
14 (otherwise raise an error), and print the query count.
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
15
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
16 It then calls the Java command line tool, and moves the output file to
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
17 the location Galaxy is expecting, and removes the tempory XML file.
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
18 """
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
19 import sys
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
20 import os
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
21 import subprocess
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
22
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
23 #You may need to edit this to match your local setup,
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
24 blast2go_jar = "/opt/b2g4pipe/blast2go.jar"
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
25
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
26
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
27 def stop_err(msg, error_level=1):
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
28 """Print error message to stdout and quit with given error level."""
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
29 sys.stderr.write("%s\n" % msg)
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
30 sys.exit(error_level)
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
31
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
32 if len(sys.argv) != 4:
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
33 stop_err("Require three arguments: XML filename, properties filename, output tabular filename")
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
34
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
35 xml_file, prop_file, tabular_file = sys.argv[1:]
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
36
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
37 #We should have write access here:
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
38 tmp_xml_file = tabular_file + ".tmp.xml"
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
39
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
40 if not os.path.isfile(xml_file):
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
41 stop_err("Input BLAST XML file not found: %s" % xml_file)
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
42
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
43 if not os.path.isfile(prop_file):
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
44 stop_err("Blast2GO configuration file not found: %s" % prop_file)
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
45
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
46 def prepare_xml(original_xml, mangled_xml):
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
47 """Reformat BLAST XML to suit Blast2GO.
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
48
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
49 Blast2GO can't cope with 1000s of <Iteration> tags within a
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
50 single <BlastResult> tag, so instead split this into one
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
51 full XML record per interation (i.e. per query). This gives
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
52 a concatenated XML file mimicing old versions of BLAST.
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
53
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
54 This also checks for BLASTP or BLASTX output, and outputs
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
55 the number of queries. Galaxy will show this as "info".
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
56 """
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
57 in_handle = open(original_xml)
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
58 footer = " </BlastOutput_iterations>\n</BlastOutput>\n"
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
59 header = ""
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
60 while True:
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
61 line = in_handle.readline()
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
62 if not line:
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
63 #No hits?
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
64 stop_err("Problem with XML file?")
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
65 if line.strip() == "<Iteration>":
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
66 break
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
67 header += line
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
68
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
69 if "<BlastOutput_program>blastx</BlastOutput_program>" in header:
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
70 print "BLASTX output identified"
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
71 elif "<BlastOutput_program>blastp</BlastOutput_program>" in header:
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
72 print "BLASTP output identified"
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
73 else:
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
74 in_handle.close()
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
75 stop_err("Expect BLASTP or BLASTX output")
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
76
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
77 out_handle = open(mangled_xml, "w")
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
78 out_handle.write(header)
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
79 out_handle.write(line)
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
80 count = 1
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
81 while True:
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
82 line = in_handle.readline()
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
83 if not line:
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
84 break
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
85 elif line.strip() == "<Iteration>":
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
86 #Insert footer/header
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
87 out_handle.write(footer)
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
88 out_handle.write(header)
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
89 count += 1
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
90 out_handle.write(line)
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
91
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
92 out_handle.close()
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
93 in_handle.close()
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
94 print "Input has %i queries" % count
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
95
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
96
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
97 def run(cmd):
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
98 #Avoid using shell=True when we call subprocess to ensure if the Python
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
99 #script is killed, so too is the child process.
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
100 try:
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
101 child = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
102 except Exception, err:
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
103 stop_err("Error invoking command:\n%s\n\n%s\n" % (" ".join(cmd), err))
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
104 #Use .communicate as can get deadlocks with .wait(),
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
105 stdout, stderr = child.communicate()
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
106 return_code = child.returncode
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
107 if return_code:
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
108 cmd_str = " ".join(cmd)
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
109 if stderr and stdout:
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
110 stop_err("Return code %i from command:\n%s\n\n%s\n\n%s" % (return_code, cmd_str, stdout, stderr))
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
111 else:
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
112 stop_err("Return code %i from command:\n%s\n%s" % (return_code, cmd_str, stderr))
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
113 #For early diagnostics,
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
114 else:
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
115 print stdout
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
116 print stderr
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
117
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
118 if not os.path.isfile(blast2go_jar):
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
119 stop_err("Blast2GO JAR file not found: %s" % blast2go_jar)
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
120
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
121 prepare_xml(xml_file, tmp_xml_file)
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
122 #print "XML file prepared for Blast2GO"
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
123
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
124 #We will have write access wherever the output should be,
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
125 #so we'll ask Blast2GO to use that as the stem for its output
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
126 #(it will append .annot to the filename)
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
127 cmd = ["java", "-jar", blast2go_jar,
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
128 "-in", tmp_xml_file,
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
129 "-prop", prop_file,
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
130 "-out", tabular_file, #Used as base name for output files
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
131 "-a", # Generate *.annot tabular file
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
132 #"-img", # Generate images, feature not in v2.3.5
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
133 ]
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
134 #print " ".join(cmd)
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
135 run(cmd)
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
136
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
137 #Remove the temp XML file
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
138 os.remove(tmp_xml_file)
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
139
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
140 out_file = tabular_file + ".annot"
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
141 if not os.path.isfile(out_file):
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
142 stop_err("ERROR - No output annotation file from Blast2GO")
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
143
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
144 #Move the output file where Galaxy expects it to be:
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
145 os.rename(out_file, tabular_file)
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
146
c5c578f2a1ba Uploaded
devteam
parents:
diff changeset
147 print "Done"