devteam / blast_datatypes (Mercurial repository)
changeset 14:623a3fbe5340 draft
planemo upload for repository https://github.com/peterjc/galaxy_blast/tree/master/datatypes/blast_datatypes/ commit 3f6b5c953d522a724bbcd403bcb86f1e2757a556-dirty
author    peterjc
date      Fri, 03 Feb 2017 12:34:03 -0500
parents   e96b515847dd
children  310ec0f47485
files     README.rst blast.py
diffstat  2 files changed, 70 insertions(+), 69 deletions(-)
--- a/README.rst	Fri Sep 04 07:09:45 2015 -0400
+++ b/README.rst	Fri Feb 03 12:34:03 2017 -0500
@@ -48,6 +48,7 @@
           as ``*.smp``), used as input files for makeprofiledb.
 v0.0.20 - Added "NCBI BLAST+ integrated into Galaxy" preprint citation.
 v0.0.21 - Updated citation information with GigaScience paper.
+v0.0.22 - Removed unused imports in ``blast.py`` (internal change only).
 ======= ======================================================================
--- a/blast.py	Fri Sep 04 07:09:45 2015 -0400
+++ b/blast.py	Fri Feb 03 12:34:03 2017 -0500
@@ -1,46 +1,47 @@
+"""NCBI BLAST datatypes.
+
+Covers the ``blastxml`` format and the BLAST databases.
 """
-BlastXml class
-"""
+
+import logging
+import os
+from time import sleep
 
 from galaxy.datatypes.data import get_file_peek
-from galaxy.datatypes.data import Text, Data, GenericAsn1
+from galaxy.datatypes.data import Data, Text
 from galaxy.datatypes.xml import GenericXml
-from galaxy.datatypes.metadata import MetadataElement
-
-from time import sleep
-import os
-import logging
 
 log = logging.getLogger(__name__)
 
-class BlastXml( GenericXml ):
+
+class BlastXml(GenericXml):
     """NCBI Blast XML Output data"""
     file_ext = "blastxml"
 
-    def set_peek( self, dataset, is_multi_byte=False ):
+    def set_peek(self, dataset, is_multi_byte=False):
         """Set the peek and blurb text"""
         if not dataset.dataset.purged:
-            dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
+            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
             dataset.blurb = 'NCBI Blast XML data'
         else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
 
-    def sniff( self, filename ):
-        """
-        Determines whether the file is blastxml
-
-        >>> fname = get_test_fname( 'megablast_xml_parser_test1.blastxml' )
-        >>> BlastXml().sniff( fname )
+    def sniff(self, filename):
+        """Determines whether the file is blastxml
+
+        >>> from galaxy.datatypes.sniff import get_test_fname
+        >>> fname = get_test_fname('megablast_xml_parser_test1.blastxml')
+        >>> BlastXml().sniff(fname)
         True
-        >>> fname = get_test_fname( 'tblastn_four_human_vs_rhodopsin.xml' )
-        >>> BlastXml().sniff( fname )
+        >>> fname = get_test_fname('tblastn_four_human_vs_rhodopsin.xml')
+        >>> BlastXml().sniff(fname)
         True
-        >>> fname = get_test_fname( 'interval.interval' )
-        >>> BlastXml().sniff( fname )
+        >>> fname = get_test_fname('interval.interval')
+        >>> BlastXml().sniff(fname)
         False
         """
-        #TODO - Use a context manager on Python 2.5+ to close handle
+        # TODO - Use a context manager on Python 2.5+ to close handle
        handle = open(filename)
        line = handle.readline()
        if line.strip() != '<?xml version="1.0"?>':
@@ -57,14 +58,14 @@
             return False
         handle.close()
         return True
-    
+
     def merge(split_files, output_file):
         """Merging multiple XML files is non-trivial and must be done in subclasses."""
         if len(split_files) == 1:
-            #For one file only, use base class method (move/copy)
+            # For one file only, use base class method (move/copy)
             return Text.merge(split_files, output_file)
         if not split_files:
-            raise ValueError("Given no BLAST XML files, %r, to merge into %s" \
+            raise ValueError("Given no BLAST XML files, %r, to merge into %s"
                              % (split_files, output_file))
         out = open(output_file, "w")
         h = None
@@ -80,7 +81,7 @@
             if not header:
                 out.close()
                 h.close()
-                #Retry, could be transient error with networked file system...
+                # Retry, could be transient error with networked file system...
                 log.warning("BLAST XML file %s empty, retry in 1s..." % f)
                 sleep(1)
                 h = open(f)
@@ -89,7 +90,7 @@
                 log.error("BLAST XML file %s was empty" % f)
                 raise ValueError("BLAST XML file %s was empty" % f)
             if header.strip() != '<?xml version="1.0"?>':
-                out.write(header) #for diagnosis
+                out.write(header)  # for diagnosis
                 out.close()
                 h.close()
                 raise ValueError("%s is not an XML file!" % f)
@@ -97,14 +98,14 @@
             header += line
             if line.strip() not in ['<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">',
                                     '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "NCBI_BlastOutput.dtd">']:
-                out.write(header) #for diagnosis
+                out.write(header)  # for diagnosis
                 out.close()
                 h.close()
                 raise ValueError("%s is not a BLAST XML file!" % f)
             while True:
                 line = h.readline()
                 if not line:
-                    out.write(header) #for diagnosis
+                    out.write(header)  # for diagnosis
                     out.close()
                     h.close()
                     raise ValueError("BLAST XML file %s ended prematurely" % f)
@@ -112,12 +113,12 @@
                 if "<Iteration>" in line:
                     break
                 if len(header) > 10000:
-                    #Something has gone wrong, don't load too much into memory!
-                    #Write what we have to the merged file for diagnostics
+                    # Something has gone wrong, don't load too much into memory!
+                    # Write what we have to the merged file for diagnostics
                     out.write(header)
                     out.close()
                     h.close()
-                    raise ValueError("BLAST XML file %s has too long a header!" % f)
+                    raise ValueError("The header in BLAST XML file %s is too long" % f)
             if "<BlastOutput>" not in header:
                 out.close()
                 h.close()
@@ -126,18 +127,18 @@
                 out.write(header)
                 old_header = header
             elif old_header[:300] != header[:300]:
-                #Enough to check <BlastOutput_program> and <BlastOutput_version> match
+                # Enough to check <BlastOutput_program> and <BlastOutput_version> match
                 out.close()
                 h.close()
-                raise ValueError("BLAST XML headers don't match for %s and %s - have:\n%s\n...\n\nAnd:\n%s\n...\n" \
+                raise ValueError("BLAST XML headers don't match for %s and %s - have:\n%s\n...\n\nAnd:\n%s\n...\n"
                                  % (split_files[0], f, old_header[:300], header[:300]))
             else:
                 out.write("    <Iteration>\n")
             for line in h:
                 if "</BlastOutput_iterations>" in line:
                     break
-                #TODO - Increment <Iteration_iter-num> and if required automatic query names
-                #like <Iteration_query-ID>Query_3</Iteration_query-ID> to be increasing?
+                # TODO - Increment <Iteration_iter-num> and if required automatic query names
+                # like <Iteration_query-ID>Query_3</Iteration_query-ID> to be increasing?
                 out.write(line)
             h.close()
         out.write("    </BlastOutput_iterations>\n")
@@ -149,20 +150,20 @@
 class _BlastDb(object):
     """Base class for BLAST database datatype."""
 
-    def set_peek( self, dataset, is_multi_byte=False ):
+    def set_peek(self, dataset, is_multi_byte=False):
         """Set the peek and blurb text."""
         if not dataset.dataset.purged:
-            dataset.peek = "BLAST database (multiple files)" 
+            dataset.peek = "BLAST database (multiple files)"
             dataset.blurb = "BLAST database (multiple files)"
         else:
             dataset.peek = 'file does not exist'
             dataset.blurb = 'file purged from disk'
 
-    def display_peek( self, dataset ):
+    def display_peek(self, dataset):
         """Create HTML content, used for displaying peek."""
         try:
             return dataset.peek
-        except:
+        except Exception:
             return "BLAST database (multiple files)"
 
     def display_data(self, trans, data, preview=False, filename=None,
@@ -172,44 +173,43 @@
         This allows us to format the data shown in the central pane via the "eye" icon.
         """
         if filename is not None and filename != "index":
-            #Change nothing - important for the unit tests to access child files:
+            # Change nothing - important for the unit tests to access child files:
             return Data.display_data(self, trans, data, preview, filename,
                                      to_ext, size, offset, **kwd)
         if self.file_ext == "blastdbn":
             title = "This is a nucleotide BLAST database"
-        elif self.file_ext =="blastdbp":
+        elif self.file_ext == "blastdbp":
             title = "This is a protein BLAST database"
-        elif self.file_ext =="blastdbd":
+        elif self.file_ext == "blastdbd":
             title = "This is a domain BLAST database"
         else:
-            #Error?
+            # Error?
             title = "This is a BLAST database."
         msg = ""
         try:
-            #Try to use any text recorded in the dummy index file:
+            # Try to use any text recorded in the dummy index file:
             handle = open(data.file_name, "rU")
             msg = handle.read().strip()
             handle.close()
-        except Exception, err:
-            #msg = str(err)
+        except Exception:
             pass
         if not msg:
             msg = title
-        #Galaxy assumes HTML for the display of composite datatypes,
+        # Galaxy assumes HTML for the display of composite datatypes,
         return "<html><head><title>%s</title></head><body><pre>%s</pre></body></html>" % (title, msg)
 
     def merge(split_files, output_file):
         """Merge BLAST databases (not implemented for now)."""
         raise NotImplementedError("Merging BLAST databases is non-trivial (do this via makeblastdb?)")
 
-    def split( cls, input_datasets, subdir_generator_function, split_params):
+    def split(cls, input_datasets, subdir_generator_function, split_params):
         """Split a BLAST database (not implemented for now)."""
         if split_params is None:
             return None
         raise NotImplementedError("Can't split BLAST databases")
 
 
-class BlastNucDb( _BlastDb, Data ):
+class BlastNucDb(_BlastDb, Data):
     """Class for nucleotide BLAST database files."""
     file_ext = 'blastdbn'
     allow_datatype_change = False
@@ -217,27 +217,27 @@
 
     def __init__(self, **kwd):
         Data.__init__(self, **kwd)
-        self.add_composite_file('blastdb.nhr', is_binary=True) # sequence headers
-        self.add_composite_file('blastdb.nin', is_binary=True) # index file
-        self.add_composite_file('blastdb.nsq', is_binary=True) # nucleotide sequences
-        self.add_composite_file('blastdb.nal', is_binary=False, optional=True) # alias ( -gi_mask option of makeblastdb)
-        self.add_composite_file('blastdb.nhd', is_binary=True, optional=True) # sorted sequence hash values ( -hash_index option of makeblastdb)
-        self.add_composite_file('blastdb.nhi', is_binary=True, optional=True) # index of sequence hash values ( -hash_index option of makeblastdb)
-        self.add_composite_file('blastdb.nnd', is_binary=True, optional=True) # sorted GI values ( -parse_seqids option of makeblastdb and gi present in the description lines)
-        self.add_composite_file('blastdb.nni', is_binary=True, optional=True) # index of GI values ( -parse_seqids option of makeblastdb and gi present in the description lines)
-        self.add_composite_file('blastdb.nog', is_binary=True, optional=True) # OID->GI lookup file ( -hash_index or -parse_seqids option of makeblastdb)
-        self.add_composite_file('blastdb.nsd', is_binary=True, optional=True) # sorted sequence accession values ( -hash_index or -parse_seqids option of makeblastdb)
-        self.add_composite_file('blastdb.nsi', is_binary=True, optional=True) # index of sequence accession values ( -hash_index or -parse_seqids option of makeblastdb)
-#        self.add_composite_file('blastdb.00.idx', is_binary=True, optional=True) # first volume of the MegaBLAST index generated by makembindex
+        self.add_composite_file('blastdb.nhr', is_binary=True)  # sequence headers
+        self.add_composite_file('blastdb.nin', is_binary=True)  # index file
+        self.add_composite_file('blastdb.nsq', is_binary=True)  # nucleotide sequences
+        self.add_composite_file('blastdb.nal', is_binary=False, optional=True)  # alias ( -gi_mask option of makeblastdb)
+        self.add_composite_file('blastdb.nhd', is_binary=True, optional=True)  # sorted sequence hash values ( -hash_index option of makeblastdb)
+        self.add_composite_file('blastdb.nhi', is_binary=True, optional=True)  # index of sequence hash values ( -hash_index option of makeblastdb)
+        self.add_composite_file('blastdb.nnd', is_binary=True, optional=True)  # sorted GI values ( -parse_seqids option of makeblastdb and gi present in the description lines)
+        self.add_composite_file('blastdb.nni', is_binary=True, optional=True)  # index of GI values ( -parse_seqids option of makeblastdb and gi present in the description lines)
+        self.add_composite_file('blastdb.nog', is_binary=True, optional=True)  # OID->GI lookup file ( -hash_index or -parse_seqids option of makeblastdb)
+        self.add_composite_file('blastdb.nsd', is_binary=True, optional=True)  # sorted sequence accession values ( -hash_index or -parse_seqids option of makeblastdb)
+        self.add_composite_file('blastdb.nsi', is_binary=True, optional=True)  # index of sequence accession values ( -hash_index or -parse_seqids option of makeblastdb)
+#        self.add_composite_file('blastdb.00.idx', is_binary=True, optional=True)  # first volume of the MegaBLAST index generated by makembindex
 # The previous line should be repeated for each index volume, with filename extensions like '.01.idx', '.02.idx', etc.
-        self.add_composite_file('blastdb.shd', is_binary=True, optional=True) # MegaBLAST index superheader (-old_style_index false option of makembindex)
-#        self.add_composite_file('blastdb.naa', is_binary=True, optional=True) # index of a WriteDB column for e.g. mask data
-#        self.add_composite_file('blastdb.nab', is_binary=True, optional=True) # data of a WriteDB column
-#        self.add_composite_file('blastdb.nac', is_binary=True, optional=True) # multiple byte order for a WriteDB column
+        self.add_composite_file('blastdb.shd', is_binary=True, optional=True)  # MegaBLAST index superheader (-old_style_index false option of makembindex)
+#        self.add_composite_file('blastdb.naa', is_binary=True, optional=True)  # index of a WriteDB column for e.g. mask data
+#        self.add_composite_file('blastdb.nab', is_binary=True, optional=True)  # data of a WriteDB column
+#        self.add_composite_file('blastdb.nac', is_binary=True, optional=True)  # multiple byte order for a WriteDB column
 # The previous 3 lines should be repeated for each WriteDB column, with filename extensions like ('.nba', '.nbb', '.nbc'), ('.nca', '.ncb', '.ncc'), etc.
 
 
-class BlastProtDb( _BlastDb, Data ):
+class BlastProtDb(_BlastDb, Data):
     """Class for protein BLAST database files."""
     file_ext = 'blastdbp'
     allow_datatype_change = False
@@ -248,7 +248,7 @@
         # Component file comments are as in BlastNucDb except where noted
         self.add_composite_file('blastdb.phr', is_binary=True)
         self.add_composite_file('blastdb.pin', is_binary=True)
-        self.add_composite_file('blastdb.psq', is_binary=True) # protein sequences
+        self.add_composite_file('blastdb.psq', is_binary=True)  # protein sequences
         self.add_composite_file('blastdb.phd', is_binary=True, optional=True)
         self.add_composite_file('blastdb.phi', is_binary=True, optional=True)
         self.add_composite_file('blastdb.pnd', is_binary=True, optional=True)
@@ -262,7 +262,7 @@
 # The last 3 lines should be repeated for each WriteDB column, with filename extensions like ('.pba', '.pbb', '.pbc'), ('.pca', '.pcb', '.pcc'), etc.
 
 
-class BlastDomainDb( _BlastDb, Data ):
+class BlastDomainDb(_BlastDb, Data):
     """Class for domain BLAST database files."""
     file_ext = 'blastdbd'
     allow_datatype_change = False
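A side note for reviewers, not part of the changeset: the updated ``BlastXml.sniff()`` docstring now imports ``get_test_fname`` explicitly, so the doctests are self-contained. A minimal sketch of running them locally, assuming a Galaxy checkout where ``galaxy.datatypes.blast`` (or this repository's ``blast.py``) is importable and the referenced test files are on disk, might look like:

    # Minimal sketch, not from the repository: runs the doctests embedded in
    # blast.py, e.g. BlastXml.sniff(). Assumes galaxy.datatypes.blast is on
    # the Python path and the doctest data files are available.
    import doctest

    from galaxy.datatypes import blast

    if __name__ == "__main__":
        results = doctest.testmod(blast, verbose=True)
        print("attempted=%d, failed=%d" % (results.attempted, results.failed))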