Mercurial > repos > peterjc > blast2go
annotate tools/blast2go/massage_xml_for_blast2go.py @ 24:05eef6b222af draft
planemo upload for repository https://github.com/peterjc/galaxy_blast/tools/blast2go commit 6f3c1a8da279f3b34d3bc627c97713d8dfe5f8ed
author | peterjc |
---|---|
date | Fri, 15 May 2015 06:01:16 -0400 |
parents | 31cb702eb5a8 |
children | 242cf17c3bf9 |
rev | line source |
---|---|
23 | 1 #!/usr/bin/env python |
2 """Script for reformatting Blast XML to suit Blast2GO. | |
3 | |
4 This script takes exactly two command line arguments: | |
5 * Input BLAST XML filename | |
6 * Output BLAST XML filename | |
7 | |
8 Sadly b2g4pipe (at least v2.3.5 to v2.5.0) cannot cope with current | |
9 style large BLAST XML files (e.g. from BLAST 2.2.25+), so we reformat | |
10 these to avoid it crashing with a Java heap space OutOfMemoryError. | |
11 | |
12 As part of this reformatting, we check for BLASTP or BLASTX output | |
13 (otherwise raise an error), and print the query count. | |
14 | |
15 This script is called from my Galaxy wrapper for Blast2GO for pipelines, | |
16 available from the Galaxy Tool Shed here: | |
17 http://toolshed.g2.bx.psu.edu/view/peterjc/blast2go | |
18 | |
19 This script is under version control here: | |
20 https://github.com/peterjc/galaxy_blast/tree/master/blast2go | |
21 """ | |
22 import sys | |
23 import os | |
24 | |
def stop_err(msg, error_level=1):
    """Write an error message to stderr and exit with the given status.

    The message is followed by a newline. Despite what older versions of
    this docstring claimed, the output goes to stderr, not stdout.

    Arguments:
     - msg - the message to report (formatted with ``%s``).
     - error_level - exit status handed to ``sys.exit`` (default 1).

    Raises SystemExit (via ``sys.exit``); this function never returns.
    """
    # Wrap msg in a one-element tuple so that a tuple-valued message is
    # formatted as a whole rather than being unpacked as format arguments.
    sys.stderr.write("%s\n" % (msg,))
    sys.exit(error_level)
29 | |
def prepare_xml(original_xml, mangled_xml):
    """Reformat BLAST XML to suit Blast2GO.

    Blast2GO can't cope with 1000s of <Iteration> tags within a
    single <BlastResult> tag, so instead split this into one
    full XML record per iteration (i.e. per query). This gives
    a concatenated XML file mimicking old versions of BLAST.

    This also checks for BLASTP or BLASTX output, and outputs
    the number of queries. Galaxy will show this as "info".

    Arguments:
     - original_xml - filename of the BLAST XML file to read.
     - mangled_xml - filename of the reformatted XML file to write.

    Everything before the first line consisting solely of "<Iteration>"
    (ignoring surrounding whitespace) is treated as the header. Before
    each later "<Iteration>" line a closing footer and a fresh copy of
    the header are inserted; all other lines are copied unchanged.

    Calls stop_err (which exits the script) if no "<Iteration>" line is
    found, or if the header does not identify blastx or blastp output.
    In both cases the output file is not created.
    """
    footer = " </BlastOutput_iterations>\n</BlastOutput>\n"

    # Context managers guarantee both handles are closed on every exit
    # path, including the SystemExit raised by stop_err.
    with open(original_xml) as in_handle:
        header_lines = []
        first_iteration = None
        for line in in_handle:
            if line.strip() == "<Iteration>":
                first_iteration = line
                break
            header_lines.append(line)
        if first_iteration is None:
            # Reached end of file without seeing any iteration. No hits?
            stop_err("Problem with XML file?")
        header = "".join(header_lines)

        if "<BlastOutput_program>blastx</BlastOutput_program>" in header:
            print("BLASTX output identified")
        elif "<BlastOutput_program>blastp</BlastOutput_program>" in header:
            print("BLASTP output identified")
        else:
            stop_err("Expect BLASTP or BLASTX output")

        with open(mangled_xml, "w") as out_handle:
            out_handle.write(header)
            out_handle.write(first_iteration)
            count = 1  # the iteration line just written
            for line in in_handle:
                if line.strip() == "<Iteration>":
                    # Close the previous record and start a new one.
                    out_handle.write(footer)
                    out_handle.write(header)
                    count += 1
                out_handle.write(line)

    print("Input has %i queries" % count)
23 | 79 |
80 | |
if __name__ == "__main__":
    # Command line entry point: expects exactly two positional arguments,
    # the input BLAST XML filename followed by the output XML filename.
    if not len(sys.argv) == 3:
        stop_err("Require two arguments: XML input filename, XML output filename")

    xml_file = sys.argv[1]
    out_xml_file = sys.argv[2]

    # Fail early with a clear message if the input is missing.
    input_exists = os.path.isfile(xml_file)
    if not input_exists:
        stop_err("Input BLAST XML file not found: %s" % xml_file)

    # Run the conversion...
    prepare_xml(xml_file, out_xml_file)