annotate megablast_xml_parser.py @ 1:3ce5d56297ed draft default tip

"planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tools/megablast_xml_parser commit 3ccddd4e2032535ead030efa401e690ffb80d145"
author devteam
date Wed, 09 Sep 2020 10:27:20 +0000
parents 35ff246876fc
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
35ff246876fc Imported from capsule None
devteam
parents:
diff changeset
1 #!/usr/bin/env python
35ff246876fc Imported from capsule None
devteam
parents:
diff changeset
2
1
3ce5d56297ed "planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tools/megablast_xml_parser commit 3ccddd4e2032535ead030efa401e690ffb80d145"
devteam
parents: 0
diff changeset
3 import re
3ce5d56297ed "planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tools/megablast_xml_parser commit 3ccddd4e2032535ead030efa401e690ffb80d145"
devteam
parents: 0
diff changeset
4 import sys
3ce5d56297ed "planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tools/megablast_xml_parser commit 3ccddd4e2032535ead030efa401e690ffb80d145"
devteam
parents: 0
diff changeset
5 import xml.etree.cElementTree as ElementTree
0
35ff246876fc Imported from capsule None
devteam
parents:
diff changeset
6
35ff246876fc Imported from capsule None
devteam
parents:
diff changeset
7
35ff246876fc Imported from capsule None
devteam
parents:
diff changeset
def _subject_id(hit_id):
    """Return the subject identifier to report for a <Hit_id> value.

    Identifiers starting with "gi" are reduced to their second
    "|"-separated field (e.g. "gi|123|ref|NM_1|" -> "123").  Any other
    identifier, including one that starts with "gi" but contains no "|",
    is returned unchanged.
    """
    if hit_id.startswith("gi"):
        fields = hit_id.split("|")
        # Guard the index: an id such as "gibbon_chr1" has no second field.
        if len(fields) > 1:
            return fields[1]
    return hit_id


def __main__():
    """Convert a BLAST XML report into a tab-separated table, one row per HSP.

    The input path is taken from sys.argv[1] and the output path from
    sys.argv[2].  Every row holds the query definition, the query length,
    the subject id, the subject length and then the text of each HSP child
    element listed in ``hspTags``, in that order.  An element that is
    absent is written as the text "None".

    The function terminates through sys.exit() with a message when fewer
    than two arguments are given, when the input cannot be opened or its
    first parse event cannot be read, or when any error occurs while the
    document is being processed.
    """
    if len(sys.argv) < 3:
        sys.exit("Usage: megablast_xml_parser.py <blast_xml_input> <tabular_output>")

    source = sys.argv[1]
    hspTags = ["Hsp_bit-score",
               "Hsp_evalue",
               "Hsp_query-from",
               "Hsp_query-to",
               "Hsp_hit-from",
               "Hsp_hit-to",
               "Hsp_query-frame",
               "Hsp_hit-frame",
               "Hsp_identity",
               "Hsp_align-len",
               "Hsp_qseq",
               "Hsp_hseq",
               "Hsp_midline"]

    # get an iterable
    try:
        context = ElementTree.iterparse(source, events=("start", "end"))
    except Exception:
        sys.exit("Invalid data format.")
    # turn it into an iterator
    context = iter(context)
    # get the root element (the first event delivered by the parser)
    try:
        event, root = next(context)
    except Exception:
        sys.exit("Invalid data format.")

    with open(sys.argv[2], 'w') as outfile:
        try:
            for event, elem in context:
                # only a completed <Iteration> element is of interest
                if event != "end" or elem.tag != "Iteration":
                    continue
                query = elem.findtext("Iteration_query-def")
                qLen = elem.findtext("Iteration_query-len")
                # for every <Hit> within <Iteration>
                for hit in elem.findall("Iteration_hits/Hit"):
                    subject = _subject_id(hit.findtext("Hit_id"))
                    sLen = hit.findtext("Hit_len")
                    # for every <Hsp> within <Hit>
                    for hsp in hit.findall("Hit_hsps/Hsp"):
                        row = [query, qLen, subject, sLen]
                        row.extend(hsp.findtext(tag) for tag in hspTags)
                        # Build the whole row first so that a failure never
                        # leaves a partially written line in the output.
                        outfile.write("\t".join("%s" % (value,) for value in row) + "\n")
                # prevents ElementTree from growing large datastructure
                root.clear()
                elem.clear()
        except Exception as err:
            sys.exit("The input data is malformed, or there is more than one dataset in the input file. Error: %s" % err)
0
35ff246876fc Imported from capsule None
devteam
parents:
diff changeset
61
35ff246876fc Imported from capsule None
devteam
parents:
diff changeset
62
1
3ce5d56297ed "planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tools/megablast_xml_parser commit 3ccddd4e2032535ead030efa401e690ffb80d145"
devteam
parents: 0
diff changeset
# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    __main__()