Mercurial > repos > peterjc > blast2go
annotate tools/blast2go/massage_xml_for_blast2go.py @ 25:242cf17c3bf9 draft default tip
"planemo upload for repository https://github.com/peterjc/galaxy_blast/tree/master/tools/blast2go commit 0c82b9ef284c686cbffd30582d2586e4fb52881e"
author | peterjc |
---|---|
date | Wed, 09 Sep 2020 15:01:39 +0000 |
parents | 05eef6b222af |
children |
rev | line source |
---|---|
23 | 1 #!/usr/bin/env python |
2 """Script for reformatting Blast XML to suit Blast2GO. | |
3 | |
4 This script takes exactly two command line arguments: | |
5 * Input BLAST XML filename | |
6 * Output BLAST XML filename | |
7 | |
8 Sadly b2g4pipe (at least v2.3.5 to v2.5.0) cannot cope with current | |
9 style large BLAST XML files (e.g. from BLAST 2.2.25+), so we reformat | |
10 these to avoid it crashing with a Java heap space OutOfMemoryError. | |
11 | |
12 As part of this reformatting, we check for BLASTP or BLASTX output | |
13 (otherwise raise an error), and print the query count. | |
14 | |
15 This script is called from my Galaxy wrapper for Blast2GO for pipelines, | |
16 available from the Galaxy Tool Shed here: | |
25
242cf17c3bf9
"planemo upload for repository https://github.com/peterjc/galaxy_blast/tree/master/tools/blast2go commit 0c82b9ef284c686cbffd30582d2586e4fb52881e"
peterjc
parents:
24
diff
changeset
|
17 http://toolshed.g2.bx.psu.edu/view/peterjc/blast2go |
23 | 18 |
19 This script is under version control here: | |
20 https://github.com/peterjc/galaxy_blast/tree/master/tools/blast2go | |
21 """ | |
22 | |
25
242cf17c3bf9
"planemo upload for repository https://github.com/peterjc/galaxy_blast/tree/master/tools/blast2go commit 0c82b9ef284c686cbffd30582d2586e4fb52881e"
peterjc
parents:
24
diff
changeset
|
23 import os |
242cf17c3bf9
"planemo upload for repository https://github.com/peterjc/galaxy_blast/tree/master/tools/blast2go commit 0c82b9ef284c686cbffd30582d2586e4fb52881e"
peterjc
parents:
24
diff
changeset
|
24 import sys |
242cf17c3bf9
"planemo upload for repository https://github.com/peterjc/galaxy_blast/tree/master/tools/blast2go commit 0c82b9ef284c686cbffd30582d2586e4fb52881e"
peterjc
parents:
24
diff
changeset
|
25 |
23 | 26 |
def prepare_xml(original_xml, mangled_xml):
    """Reformat BLAST XML to suit Blast2GO.

    Blast2GO can't cope with 1000s of <Iteration> tags within a
    single <BlastResult> tag, so instead split this into one
    full XML record per iteration (i.e. per query). This gives
    a concatenated XML file mimicking old versions of BLAST.

    This also checks for BLASTP or BLASTX output, and outputs
    the number of queries. Galaxy will show this as "info".

    Arguments:
     - original_xml - filename of the BLAST XML file to read.
     - mangled_xml - filename of the reformatted XML file to write.

    Exits via sys.exit (raising SystemExit) with an error message if
    no <Iteration> line is found, or if the header does not name
    blastx or blastp as the program. In both cases the output file
    is not created or touched.
    """
    footer = " </BlastOutput_iterations>\n</BlastOutput>\n"

    # The with-statements ensure both files are closed on every exit
    # path, including the sys.exit calls and any I/O error.
    with open(original_xml) as in_handle:
        # Everything before the first <Iteration> line is the header,
        # which gets repeated in front of every later iteration.
        header_lines = []
        for line in in_handle:
            if line.strip() == "<Iteration>":
                first_iteration = line
                break
            header_lines.append(line)
        else:
            # Reached the end of the file without seeing <Iteration>.
            # No hits?
            sys.exit("Problem with XML file?")
        header = "".join(header_lines)

        if "<BlastOutput_program>blastx</BlastOutput_program>" in header:
            print("BLASTX output identified")
        elif "<BlastOutput_program>blastp</BlastOutput_program>" in header:
            print("BLASTP output identified")
        else:
            sys.exit("Expect BLASTP or BLASTX output")

        count = 1
        with open(mangled_xml, "w") as out_handle:
            out_handle.write(header)
            out_handle.write(first_iteration)
            # Resume reading where the header scan stopped.
            for line in in_handle:
                if line.strip() == "<Iteration>":
                    # Close the previous record and start a new one.
                    out_handle.write(footer)
                    out_handle.write(header)
                    count += 1
                out_handle.write(line)

    print("Input has %i queries" % count)
23 | 76 |
77 | |
if __name__ == "__main__":
    # Command line entry point: expects exactly the input and output
    # BLAST XML filenames after the script name.
    if not len(sys.argv) == 3:
        sys.exit("Require two arguments: XML input filename, XML output filename")

    xml_file = sys.argv[1]
    out_xml_file = sys.argv[2]

    # Stop with a clear message rather than a traceback from open().
    input_exists = os.path.isfile(xml_file)
    if not input_exists:
        sys.exit("Input BLAST XML file not found: %s" % xml_file)

    # Run the conversion...
    prepare_xml(xml_file, out_xml_file)