Mercurial > repos > bgruening > chemical_datatypes
view datatypes/molFiles.py @ 0:af7b6c6ee439 draft
initial commit
author | bgruening |
---|---|
date | Tue, 25 Dec 2012 05:16:25 -0500 |
parents | |
children |
line wrap: on
line source
# -*- coding: utf-8 -*- from galaxy.datatypes import data import logging from galaxy.datatypes.sniff import * import commands import pybel import openbabel openbabel.obErrorLog.StopLogging() from galaxy.datatypes.metadata import MetadataElement from galaxy.datatypes import metadata log = logging.getLogger(__name__) class GenericMolFile( data.Text ): MetadataElement( name="molecules", default=0, desc="Number of molecules", readonly=True, visible=False, optional=True, no_value=0 ) file_ext = "mol2/sdf/drf" def check_filetype( self,filename ): self.no_mols = commands.getstatusoutput("grep -c \\$\\$\\$\\$ "+filename) if (self.no_mols[0] == 0) & (self.no_mols[1] > 0): self.file_ext="sdf" return True self.no_mols = commands.getstatusoutput("grep -c @\<TRIPOS\>MOLECULE "+filename) if (self.no_mols[0] == 0) & (self.no_mols[1] > 0): self.file_ext="mol2" return True self.no_mols = commands.getstatusoutput("grep -c \"ligand id\" "+filename) if (self.no_mols[0] == 0) & (self.no_mols[1] > 0): self.file_ext="drf" return True self.no_mols = commands.getstatusoutput("grep -c HEADER "+filename) if (self.no_mols[0] == 0) & (self.no_mols[1] > 0): self.file_ext="pdb" return True return False def set_peek( self, dataset, is_multi_byte=False ): if not dataset.dataset.purged: if(self.check_filetype(dataset.file_name)) : if (self.no_mols[1] == '1'): dataset.blurb = "1 molecule" else: dataset.blurb = "%s molecules" % self.no_mols[1] dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte ) else: dataset.peek = 'file does not exist' dataset.blurb = 'file purged from disk' def get_mime(self): return 'text/plain' class GenericMultiMolFile( GenericMolFile ): def set_peek( self, dataset, is_multi_byte=False ): if not dataset.dataset.purged: self.sniff(dataset.file_name) if (self.no_mols[1] == '1'): dataset.blurb = "1 molecule" else: dataset.blurb = "%s molecules" % self.no_mols[1] dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte ) else: dataset.peek = 'file does not exist' dataset.blurb = 'file purged from disk' class SDF( GenericMultiMolFile ): file_ext = "sdf" def sniff( self, filename ): self.no_mols = commands.getstatusoutput("grep -c \\$\\$\\$\\$ "+filename) if (self.no_mols[0] == 0) & (self.no_mols[1] > 0): return True else: return False class MOL2( GenericMultiMolFile ): file_ext = "mol2" def sniff( self, filename ): self.no_mols = commands.getstatusoutput("grep -c @\<TRIPOS\>MOLECULE "+filename) if (self.no_mols[0] == 0) & (self.no_mols[1] > 0): return True else: return False class FPS( GenericMultiMolFile ): file_ext = "fps" def sniff( self, filename ): self.no_mols = commands.getstatusoutput("grep -c -v '^#' "+filename) with open(filename) as in_handle: for line_counter, line in enumerate(in_handle): line = line.strip() if line.startswith('#FPS1'): return True if line_counter > 10: return False class DRF( GenericMultiMolFile ): file_ext = "drf" def sniff( self, filename ): self.no_mols = commands.getstatusoutput("grep -c \"ligand id\" "+filename) if (self.no_mols[0] == 0) & (self.no_mols[1] > 0): return True else: return False class PHAR( GenericMultiMolFile ): MetadataElement( name="base_name", desc="base name", default='Phar', readonly=True, set_in_upload=True) file_ext = "phar" def sniff( self, filename ): self.no_mols = commands.getstatusoutput("grep -c -v '^#' "+filename) return False class PDB( GenericMolFile ): file_ext = "pdb" def sniff( self, filename ): self.no_mols = commands.getstatusoutput("grep -c HEADER "+filename) if (self.no_mols[0] == 0) & (self.no_mols[1] > 0): return True else: return False def set_peek( self, dataset, is_multi_byte=False ): #def set_peek( self, dataset, line_count=None, is_multi_byte=False ): if not dataset.dataset.purged: res = commands.getstatusoutput("lib/galaxy/datatypes/countResidues.sh "+dataset.file_name) dataset.peek = res[1] self.sniff(dataset.file_name) if (self.no_mols[1] == '1'): dataset.blurb = "1 protein structure" else: dataset.blurb = "%s protein structures"%self.no_mols[1] else: dataset.peek = 'file does not exist' dataset.blurb = 'file purged from disk' class grd ( data.Text ) : file_ext = "grd" def set_peek( self, dataset, is_multi_byte=False ): if not dataset.dataset.purged: #dataset.peek = "" dataset.blurb = "score-grids for docking" else: dataset.peek = 'file does not exist' dataset.blurb = 'file purged from disk' class grdtgz ( data.Text ) : file_ext = "grd.tgz" def set_peek( self, dataset, is_multi_byte=False ): if not dataset.dataset.purged: #dataset.peek = "" dataset.blurb = "compressed score-grids for docking" else: dataset.peek = 'file does not exist' dataset.blurb = 'file purged from disk' class InChI( GenericMultiMolFile ): file_ext = "inchi" def sniff( self, filename ): self.no_mols = commands.getstatusoutput("grep -c '^InChI=' "+filename) word_count = commands.getoutput("wc -w "+filename).split()[0] if self.no_mols[1] != word_count: return False if (self.no_mols[0] == 0) & (self.no_mols[1] > 0): return True else: return False def set_meta( self, dataset, **kwd ): """ Set the number of sequences and the number of data lines in dataset. """ if self.max_optional_metadata_filesize >= 0 and dataset.get_size() > self.max_optional_metadata_filesize: dataset.metadata.data_lines = None dataset.metadata.sequences = None return #word_count = commands.getoutput("wc -w "+filename).split()[0] # word_count are the lines of the file, if word_count and molecule count # are the same, that must hold to be an InChI File, then that should be # the same number as all non-empty lines #dataset.metadata.data_lines = word_count #int(commands.getoutput("grep -cve '^\s*$' "+filename)) #dataset.metadata.molecules = word_count class SMILES( GenericMultiMolFile ): file_ext = "smi" def sniff( self, filename ): """ Its hard or impossible to sniff a SMILES File. All what i know is the word_count must be the same as the non-empty line count. And that i can try to import the first SMILES and check if it is a molecule. """ # that corresponds to non-empty line count self.no_mols = commands.getstatusoutput("grep -cve '^\s*$' "+filename) word_count = int(commands.getoutput("wc -w "+filename).split()[0]) if int(self.no_mols[1]) != word_count: return False if (self.no_mols[0] == 0) & (int(self.no_mols[1]) > 0): for line in open(filename): line = line.strip() if line: # if we have atoms, we have a molecule try: if len(pybel.readstring('smi', line).atoms) > 0: return True else: return False except: # if convert fails its not a smiles string return False return True else: return False def set_meta( self, dataset, **kwd ): """ Set the number of sequences and the number of data lines in dataset. """ if self.max_optional_metadata_filesize >= 0 and dataset.get_size() > self.max_optional_metadata_filesize: dataset.metadata.data_lines = None dataset.metadata.sequences = None return #word_count = int(commands.getoutput("wc -w "+filename).split()[0]) # word_count are the lines of the file, if word_count and molecule count # are the same, that must hold to be an InChI File, then that should be # the same number as all non-empty lines #dataset.metadata.data_lines = word_count #dataset.metadata.molecules = word_count