Mercurial > repos > devteam > megablast_xml_parser
annotate megablast_xml_parser.py @ 1:3ce5d56297ed draft default tip
"planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tools/megablast_xml_parser commit 3ccddd4e2032535ead030efa401e690ffb80d145"
author | devteam |
---|---|
date | Wed, 09 Sep 2020 10:27:20 +0000 |
parents | 35ff246876fc |
children |
rev | line source |
---|---|
0 | 1 #!/usr/bin/env python |
2 | |
1
3ce5d56297ed
"planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tools/megablast_xml_parser commit 3ccddd4e2032535ead030efa401e690ffb80d145"
devteam
parents:
0
diff
changeset
|
import re
import sys

try:
    import xml.etree.cElementTree as ElementTree
except ImportError:
    # xml.etree.cElementTree was removed in Python 3.9; the plain module
    # exposes the same API.
    import xml.etree.ElementTree as ElementTree
0 | 6 |
7 | |
def _subject_from_hit_id(hit_id):
    """Return the subject identifier to report for a <Hit_id> value.

    Identifiers that start with "gi" and are pipe-delimited (for example
    ``gi|12345|ref|NM_000001.1|``) are reduced to their second field.
    Any other identifier is returned unchanged.
    """
    if hit_id.startswith("gi"):
        fields = hit_id.split("|")
        # A "gi"-prefixed id without any "|" has no second field; keep the
        # id whole instead of failing with IndexError.
        if len(fields) > 1:
            return fields[1]
    return hit_id


def __main__():
    """Convert a megablast XML report into a tab-separated table.

    Reads the XML file named by ``sys.argv[1]`` and writes one line per
    <Hsp> element to the file named by ``sys.argv[2]``.  Each line holds the
    query definition, query length, subject id, subject length and then the
    text of every HSP field listed in ``hspTags``, in that order.  A field
    that is absent from the XML is written as the string ``None``.

    Exits through ``sys.exit`` with a message when the arguments are
    missing, when the input cannot be opened or parsed, or when an error
    occurs while converting the records.
    """
    if len(sys.argv) < 3:
        sys.exit("Usage: %s <blast_xml_input> <tabular_output>" % sys.argv[0])

    source = sys.argv[1]
    hspTags = [
        "Hsp_bit-score",
        "Hsp_evalue",
        "Hsp_query-from",
        "Hsp_query-to",
        "Hsp_hit-from",
        "Hsp_hit-to",
        "Hsp_query-frame",
        "Hsp_hit-frame",
        "Hsp_identity",
        "Hsp_align-len",
        "Hsp_qseq",
        "Hsp_hseq",
        "Hsp_midline",
    ]

    # get an iterable
    try:
        context = ElementTree.iterparse(source, events=("start", "end"))
    except Exception:
        sys.exit("Invalid data format.")
    # turn it into an iterator
    context = iter(context)
    # get the root element (the first "start" event)
    try:
        event, root = next(context)
    except Exception:
        sys.exit("Invalid data format.")

    with open(sys.argv[2], 'w') as outfile:
        try:
            for event, elem in context:
                # only act once an <Iteration> has been read completely
                if event != "end" or elem.tag != "Iteration":
                    continue
                query = elem.findtext("Iteration_query-def")
                qLen = elem.findtext("Iteration_query-len")
                # for every <Hit> within <Iteration>
                for hit in elem.findall("Iteration_hits/Hit"):
                    # A missing <Hit_id> yields None here; the resulting
                    # error is reported as malformed input below.
                    subject = _subject_from_hit_id(hit.findtext("Hit_id"))
                    sLen = hit.findtext("Hit_len")
                    # for every <Hsp> within <Hit>
                    for hsp in hit.findall("Hit_hsps/Hsp"):
                        row = [query, qLen, subject, sLen]
                        row.extend(hsp.findtext(tag) for tag in hspTags)
                        # "%s" keeps missing fields rendered as "None"
                        outfile.write("\t".join("%s" % value for value in row) + "\n")
                # prevents ElementTree from growing large datastructure
                root.clear()
                elem.clear()
        except Exception as err:
            sys.exit("The input data is malformed, or there is more than one dataset in the input file. Error: %s" % err)


# Run the conversion only when executed as a script.
if __name__ == "__main__":
    __main__()