Mercurial > repos > devteam > megablast_xml_parser
view megablast_xml_parser.py @ 1:3ce5d56297ed draft default tip
"planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tools/megablast_xml_parser commit 3ccddd4e2032535ead030efa401e690ffb80d145"
author | devteam |
---|---|
date | Wed, 09 Sep 2020 10:27:20 +0000 |
parents | 35ff246876fc |
children |
line wrap: on
line source
#!/usr/bin/env python
"""Flatten BLAST XML output into a tab-separated table, one row per HSP.

Usage: <script> <input.xml> <output.tsv>

Each row holds the query definition, query length, subject id and subject
length, followed by the HSP fields listed in ``HSP_TAGS``.  A field whose
element is absent from the XML is written as the string ``None``.
"""

import re
import sys
# xml.etree.cElementTree was deprecated in Python 3.3 and removed in 3.9;
# the plain ElementTree module uses the C accelerator automatically.
import xml.etree.ElementTree as ElementTree

# Child elements of <Hsp> that are emitted, in output column order.
HSP_TAGS = (
    "Hsp_bit-score",
    "Hsp_evalue",
    "Hsp_query-from",
    "Hsp_query-to",
    "Hsp_hit-from",
    "Hsp_hit-to",
    "Hsp_query-frame",
    "Hsp_hit-frame",
    "Hsp_identity",
    "Hsp_align-len",
    "Hsp_qseq",
    "Hsp_hseq",
    "Hsp_midline",
)


def _subject_id(hit_id):
    """Return the identifier to report for a ``<Hit_id>`` value.

    Ids starting with ``gi`` are reduced to their second ``|``-separated
    field (``gi|12345|ref|X|`` becomes ``12345``).  An id that starts with
    ``gi`` but contains no ``|`` is returned unchanged instead of raising
    ``IndexError``.  All other ids are returned unchanged.

    Raises ``TypeError`` when ``hit_id`` is ``None`` (no ``<Hit_id>``
    element); the caller reports that as malformed input.
    """
    if re.search('^gi', hit_id):
        parts = hit_id.split('|')
        if len(parts) > 1:
            return parts[1]
    return hit_id


def __main__():
    """Parse the XML file named by ``sys.argv[1]`` into ``sys.argv[2]``.

    Terminates through ``sys.exit`` with a message when the input cannot be
    opened or parsed, or when an error occurs while rows are being written.
    """
    source = sys.argv[1]

    # Build the incremental parser; failures here are reported as bad input.
    try:
        context = ElementTree.iterparse(source, events=("start", "end"))
    except Exception:
        sys.exit("Invalid data format.")
    context = iter(context)

    # The first event yields the root element, kept so it can be cleared
    # after every processed <Iteration>.
    try:
        event, root = next(context)
    except Exception:
        sys.exit("Invalid data format.")

    with open(sys.argv[2], 'w') as outfile:
        try:
            for event, elem in context:
                # Only a fully parsed <Iteration> is of interest.
                if event != "end" or elem.tag != "Iteration":
                    continue

                query = elem.findtext("Iteration_query-def")
                query_len = elem.findtext("Iteration_query-len")

                for hit in elem.findall("Iteration_hits/Hit"):
                    subject = _subject_id(hit.findtext("Hit_id"))
                    subject_len = hit.findtext("Hit_len")

                    for hsp in hit.findall("Hit_hsps/Hsp"):
                        fields = [query, query_len, subject, subject_len]
                        fields.extend(hsp.findtext(tag) for tag in HSP_TAGS)
                        # "%s" keeps the original rendering of missing
                        # values (None is written as "None").
                        row = "\t".join("%s" % field for field in fields)
                        outfile.write(row + "\n")

                # prevents ElementTree from growing large datastructure
                root.clear()
                elem.clear()
        except Exception as err:
            sys.exit("The input data is malformed, or there is more than one dataset in the input file. Error: %s" % err)


if __name__ == "__main__":
    __main__()