Mercurial > repos > devteam > megablast_xml_parser
comparison megablast_xml_parser.py @ 1:3ce5d56297ed draft default tip
"planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tools/megablast_xml_parser commit 3ccddd4e2032535ead030efa401e690ffb80d145"
author | devteam |
---|---|
date | Wed, 09 Sep 2020 10:27:20 +0000 |
parents | 35ff246876fc |
children |
comparison
equal
deleted
inserted
replaced
0:35ff246876fc | 1:3ce5d56297ed |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 | |
3 import sys, os, re | |
4 | 2 |
5 if sys.version_info[:2] >= ( 2, 5 ): | 3 import re |
6 import xml.etree.cElementTree as ElementTree | 4 import sys |
7 else: | 5 import xml.etree.cElementTree as ElementTree |
8 from galaxy import eggs | |
9 import pkg_resources; pkg_resources.require( "elementtree" ) | |
10 from elementtree import ElementTree | |
11 | 6 |
12 def stop_err( msg ): | |
13 sys.stderr.write( "%s\n" % msg ) | |
14 sys.exit() | |
15 | 7 |
16 def __main__(): | 8 def __main__(): |
17 source = sys.argv[1] | 9 source = sys.argv[1] |
18 hspTags = [ | 10 hspTags = ["Hsp_bit-score", |
19 "Hsp_bit-score", | 11 "Hsp_evalue", |
20 "Hsp_evalue", | 12 "Hsp_query-from", |
21 "Hsp_query-from", | 13 "Hsp_query-to", |
22 "Hsp_query-to", | 14 "Hsp_hit-from", |
23 "Hsp_hit-from", | 15 "Hsp_hit-to", |
24 "Hsp_hit-to", | 16 "Hsp_query-frame", |
25 "Hsp_query-frame", | 17 "Hsp_hit-frame", |
26 "Hsp_hit-frame", | 18 "Hsp_identity", |
27 "Hsp_identity", | 19 "Hsp_align-len", |
28 "Hsp_align-len", | 20 "Hsp_qseq", |
29 "Hsp_qseq", | 21 "Hsp_hseq", |
30 "Hsp_hseq", | 22 "Hsp_midline"] |
31 "Hsp_midline" | |
32 ] | |
33 hspData = [] | |
34 | 23 |
35 # get an iterable | 24 # get an iterable |
36 try: | 25 try: |
37 context = ElementTree.iterparse( source, events=( "start", "end" ) ) | 26 context = ElementTree.iterparse(source, events=("start", "end")) |
38 except: | 27 except Exception: |
39 stop_err( "Invalid data format." ) | 28 sys.exit("Invalid data format.") |
40 # turn it into an iterator | 29 # turn it into an iterator |
41 context = iter( context ) | 30 context = iter(context) |
42 # get the root element | 31 # get the root element |
43 try: | 32 try: |
44 event, root = context.next() | 33 event, root = next(context) |
45 except: | 34 except Exception: |
46 stop_err( "Invalid data format." ) | 35 sys.exit("Invalid data format.") |
47 | 36 |
48 outfile = open( sys.argv[2], 'w' ) | 37 with open(sys.argv[2], 'w') as outfile: |
49 try: | 38 try: |
50 for event, elem in context: | 39 for event, elem in context: |
51 # for every <Iteration> tag | 40 # for every <Iteration> tag |
52 if event == "end" and elem.tag == "Iteration": | 41 if event == "end" and elem.tag == "Iteration": |
53 query = elem.findtext( "Iteration_query-def" ) | 42 query = elem.findtext("Iteration_query-def") |
54 qLen = elem.findtext( "Iteration_query-len" ) | 43 qLen = elem.findtext("Iteration_query-len") |
55 # for every <Hit> within <Iteration> | 44 # for every <Hit> within <Iteration> |
56 for hit in elem.findall( "Iteration_hits/Hit" ): | 45 for hit in elem.findall("Iteration_hits/Hit"): |
57 subject = hit.findtext( "Hit_id" ) | 46 subject = hit.findtext("Hit_id") |
58 if re.search( '^gi', subject ): | 47 if re.search('^gi', subject): |
59 subject = subject.split('|')[1] | 48 subject = subject.split('|')[1] |
60 sLen = hit.findtext( "Hit_len" ) | 49 sLen = hit.findtext("Hit_len") |
61 # for every <Hsp> within <Hit> | 50 # for every <Hsp> within <Hit> |
62 for hsp in hit.findall( "Hit_hsps/Hsp" ): | 51 for hsp in hit.findall("Hit_hsps/Hsp"): |
63 outfile.write( "%s\t%s\t%s\t%s" % ( query, qLen, subject, sLen ) ) | 52 outfile.write("%s\t%s\t%s\t%s" % (query, qLen, subject, sLen)) |
64 for tag in hspTags: | 53 for tag in hspTags: |
65 outfile.write("\t%s" %(hsp.findtext( tag ))) | 54 outfile.write("\t%s" % (hsp.findtext(tag))) |
66 #hspData.append( hsp.findtext( tag ) ) | 55 outfile.write('\n') |
67 #hspData = [] | 56 # prevents ElementTree from growing large datastructure |
68 outfile.write('\n') | 57 root.clear() |
69 # prevents ElementTree from growing large datastructure | 58 elem.clear() |
70 root.clear() | 59 except Exception: |
71 elem.clear() | 60 sys.exit("The input data is malformed, or there is more than one dataset in the input file. Error: %s" % sys.exc_info()[1]) |
72 except: | |
73 outfile.close() | |
74 stop_err( "The input data is malformed, or there is more than one dataset in the input file. Error: %s" % sys.exc_info()[1] ) | |
75 | 61 |
76 outfile.close() | |
77 | 62 |
78 if __name__ == "__main__": __main__() | 63 if __name__ == "__main__": |
64 __main__() |