# HG changeset patch
# User devteam
# Date 1345209031 14400
# Node ID e1c29f3023014106d9997a433c9a4948f83af4a1
# Uploaded
#
# NOTE(review): this is the xml.py payload of the changeset above, re-flowed
# from a whitespace-collapsed rendering of the patch. The accompanying
# datatypes_conf.xml hunk carried no recoverable content in that rendering
# and is therefore not reproduced here.
"""
BlastXml class
"""

from galaxy.datatypes.data import get_file_peek
from galaxy.datatypes.data import Text
from galaxy.datatypes.xml import GenericXml

# NOTE(review): in the reviewed text every marker literal below was an empty
# string (the angle-bracket markup appears to have been stripped in transit),
# which made each "marker in line" test trivially true.  They are restored
# here to the standard NCBI BLAST XML markers -- confirm against the upstream
# file before merging.
_XML_DECLARATION = '<?xml version="1.0"?>'
_DOCTYPES = (
    '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" '
    '"http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">',
    '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" '
    '"NCBI_BlastOutput.dtd">',
)
_ROOT_OPEN = '<BlastOutput>'
_ITERATION_OPEN = '<Iteration>'
_ITERATIONS_CLOSE = '</BlastOutput_iterations>'

# Give up on a header once this many characters have been buffered, so a
# malformed input cannot be pulled into memory wholesale.
_MAX_HEADER_CHARS = 10000
# Only this many leading characters of each header are compared across files.
_HEADER_MATCH_CHARS = 300


def _read_blast_header(handle, filename, diagnostics):
    """Read and validate the header of one BLAST XML file.

    Consumes ``handle`` up to and including the first line that contains the
    iteration opening tag and returns everything read as a single string.

    ``filename`` is used only in error messages.  ``diagnostics`` is a
    writable file object: when the header is rejected part-way through, the
    text read so far is written to it before the error is raised.

    Raises ValueError if the file is empty, does not start with the expected
    XML declaration and DOCTYPE, ends before an iteration starts, has a header
    longer than ``_MAX_HEADER_CHARS`` characters, or lacks the root tag.
    """
    header = handle.readline()
    if not header:
        raise ValueError("BLAST XML file %s was empty" % filename)
    if header.strip() != _XML_DECLARATION:
        diagnostics.write(header)  # for diagnosis
        raise ValueError("%s is not an XML file!" % filename)

    line = handle.readline()
    header += line
    if line.strip() not in _DOCTYPES:
        diagnostics.write(header)  # for diagnosis
        raise ValueError("%s is not a BLAST XML file!" % filename)

    while True:
        line = handle.readline()
        if not line:
            diagnostics.write(header)  # for diagnosis
            raise ValueError("BLAST XML file %s ended prematurely" % filename)
        header += line
        if _ITERATION_OPEN in line:
            break
        if len(header) > _MAX_HEADER_CHARS:
            # Something has gone wrong, don't load too much into memory!
            # Write what we have to the merged file for diagnostics.
            diagnostics.write(header)
            raise ValueError("BLAST XML file %s has too long a header!" % filename)

    if _ROOT_OPEN not in header:
        raise ValueError("%s is not a BLAST XML file:\n%s\n..." % (filename, header))
    return header


class BlastXml(GenericXml):
    """NCBI Blast XML Output data"""
    file_ext = "blastxml"

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text on ``dataset``.

        A purged dataset gets placeholder text; otherwise the peek is taken
        from the dataset's file via ``get_file_peek``.
        """
        if not dataset.dataset.purged:
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = 'NCBI Blast XML data'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """
        Determines whether the file is blastxml

        Only the first three lines are inspected: the XML declaration, the
        BlastOutput DOCTYPE and the root opening tag.

        >>> fname = get_test_fname( 'megablast_xml_parser_test1.blastxml' )
        >>> BlastXml().sniff( fname )
        True
        >>> fname = get_test_fname( 'tblastn_four_human_vs_rhodopsin.xml' )
        >>> BlastXml().sniff( fname )
        True
        >>> fname = get_test_fname( 'interval.interval' )
        >>> BlastXml().sniff( fname )
        False
        """
        # The context manager closes the handle on every exit path,
        # including an unexpected I/O error while reading.
        with open(filename) as handle:
            if handle.readline().strip() != _XML_DECLARATION:
                return False
            if handle.readline().strip() not in _DOCTYPES:
                return False
            return handle.readline().strip() == _ROOT_OPEN

    @staticmethod
    def merge(split_files, output_file):
        """Merge several BLAST XML files into ``output_file``.

        A single input is handed to ``Text.merge`` unchanged.  Otherwise the
        header of the first file is written once, the iteration records of
        every file are appended in order, and the closing tags are written at
        the end.

        Raises ValueError if any input has a malformed header, or if the first
        ``_HEADER_MATCH_CHARS`` characters of a header differ from those of
        the first file.  The partially written output file is left in place
        in that case.
        """
        if len(split_files) == 1:
            # For one file only, use base class method (move/copy)
            return Text.merge(split_files, output_file)

        first_header = None
        with open(output_file, "w") as out:
            # Compare by position rather than by path so that a path listed
            # twice does not get the full header written a second time.
            for index, path in enumerate(split_files):
                with open(path) as handle:
                    header = _read_blast_header(handle, path, out)
                    if index == 0:
                        out.write(header)
                        first_header = header
                    elif first_header[:_HEADER_MATCH_CHARS] != header[:_HEADER_MATCH_CHARS]:
                        # Enough to check and match
                        raise ValueError(
                            "BLAST XML headers don't match for %s and %s - have:\n%s\n...\n\nAnd:\n%s\n...\n"
                            % (split_files[0], path,
                               first_header[:_HEADER_MATCH_CHARS],
                               header[:_HEADER_MATCH_CHARS]))
                    else:
                        # The header (dropped for this file) ended with the
                        # iteration opening tag, so re-emit just that tag.
                        out.write("    " + _ITERATION_OPEN + "\n")
                    for line in handle:
                        if _ITERATIONS_CLOSE in line:
                            break
                        # TODO - Increment and if required automatic query
                        # names like Query_3 to be increasing?
                        out.write(line)
            out.write("  " + _ITERATIONS_CLOSE + "\n")
            out.write("</BlastOutput>\n")