molecules.py @ 0:7cb4c02f61e6 (uploaded)
author: bgruening
date:   Tue, 26 Mar 2013 13:14:13 -0400
children: e533de975501
# -*- coding: utf-8 -*-

from galaxy.datatypes import data
import logging
from galaxy.datatypes.sniff import get_headers, get_test_fname
from galaxy.datatypes.data import get_file_peek
from galaxy.datatypes.tabular import Tabular
from galaxy.datatypes.binary import Binary
import subprocess
import os
#import pybel
#import openbabel
#openbabel.obErrorLog.StopLogging()

from galaxy.datatypes.metadata import MetadataElement
from galaxy.datatypes import metadata

log = logging.getLogger(__name__)

def count_special_lines( word, filename, invert = False ):
    """
    Count the occurrences of a special 'word' in a file with the grep
    tool. grep is used to speed up the searching and counting; the
    number of hits is returned.
    """
    try:
        cmd = ["grep", "-c"]
        if invert:
            cmd.append('-v')
        cmd.extend([word, filename])
        out = subprocess.Popen(cmd, stdout=subprocess.PIPE)
        return int(out.communicate()[0].split()[0])
    except:
        # grep is missing or the call failed; fall back to zero hits
        pass
    return 0
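
# A hypothetical usage sketch (not part of the module): given a file
# 'mols.sdf' (an assumed example name) containing three records, each
# terminated by a '$$$$' line, the grep-backed counter reports three hits.
# Note that the pattern is a grep basic regular expression, so the '$'
# characters have to be escaped:
#
#   >>> count_special_lines("^\$\$\$\$", "mols.sdf")
#   3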

def count_lines( filename, non_empty = False):
    """
    Count the number of lines in the file 'filename'. If 'non_empty' is
    set, lines that contain only whitespace are ignored.
    """
    try:
        if non_empty:
            out = subprocess.Popen(['grep', '-cve', '^\s*$', filename], stdout=subprocess.PIPE)
        else:
            out = subprocess.Popen(['wc', '-l', filename], stdout=subprocess.PIPE)
        return int(out.communicate()[0].split()[0])
    except:
        # grep/wc is missing or the call failed; fall back to zero lines
        pass
    return 0
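
# A hypothetical sketch of the 'non_empty' switch: for a file 'mols.smi'
# (an assumed name) with two data lines separated by one blank line, the
# two counting modes differ by one:
#
#   >>> count_lines("mols.smi")
#   3
#   >>> count_lines("mols.smi", non_empty = True)
#   2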


class GenericMolFile( data.Text ):
    """
    Abstract class for most of the molecule files.
    """
    MetadataElement( name="number_of_molecules", default=0, desc="Number of molecules", readonly=True, visible=True, optional=True, no_value=0 )

    def set_peek( self, dataset, is_multi_byte=False ):
        if not dataset.dataset.purged:
            dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            if dataset.metadata.number_of_molecules == 1:
                dataset.blurb = "1 molecule"
            else:
                dataset.blurb = "%s molecules" % dataset.metadata.number_of_molecules
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def get_mime(self):
        return 'text/plain'


class SDF( GenericMolFile ):
    file_ext = "sdf"

    def sniff( self, filename ):
        # an SDF record is terminated by a line starting with '$$$$'
        return count_special_lines("^\$\$\$\$", filename) > 0

    def set_meta( self, dataset, **kwd ):
        """
        Set the number of molecules in dataset.
        """
        dataset.metadata.number_of_molecules = count_special_lines("^\$\$\$\$", dataset.file_name)

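    # For illustration only (a made-up, minimal record): an SDF record is
    # a molfile block, optional '> <tag>' data items and a terminating
    # '$$$$' line, e.g.
    #
    #   methane
    #     comment line
    #
    #     1  0  0  0  0  0  0  0  0  0999 V2000
    #       0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    #   M  END
    #   > <name>
    #   methane
    #
    #   $$$$
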
    def split( cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by molecule records.
        """
        if split_params is None:
            return None

        if len(input_datasets) > 1:
            raise Exception("SD-file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            raise Exception('Split mode "%s" is currently not implemented for SD-files.' % split_params['split_mode'])
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        def _read_sdf_records( filename ):
            # yield one record at a time; records end with a '$$$$' line
            lines = []
            with open(filename) as handle:
                for line in handle:
                    lines.append( line )
                    if line.startswith("$$$$"):
                        yield lines
                        lines = []

        def _write_part_sdf_file( accumulated_lines ):
            part_dir = subdir_generator_function()
            part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
            part_file = open(part_path, 'w')
            part_file.writelines( accumulated_lines )
            part_file.close()

        try:
            sdf_records = _read_sdf_records( input_files[0] )
            sdf_lines_accumulated = []
            for counter, sdf_record in enumerate( sdf_records, start = 1):
                sdf_lines_accumulated.extend( sdf_record )
                if counter % chunk_size == 0:
                    _write_part_sdf_file( sdf_lines_accumulated )
                    sdf_lines_accumulated = []
            if sdf_lines_accumulated:
                _write_part_sdf_file( sdf_lines_accumulated )
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            raise
    split = classmethod(split)
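
    # A worked sketch of the 'to_size' mode above (numbers made up for
    # illustration): with split_size = 100 and an SD-file of 250 records,
    # split() writes three parts holding 100, 100 and 50 records; every
    # part contains only complete records because the file is cut at
    # '$$$$' boundaries exclusively.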


class MOL2( GenericMolFile ):
    file_ext = "mol2"

    def sniff( self, filename ):
        # the pattern is passed to grep, where '\<' and '\>' denote word
        # boundaries; the literal '<' and '>' must therefore not be escaped
        return count_special_lines("@<TRIPOS>MOLECULE", filename) > 0

    def set_meta( self, dataset, **kwd ):
        """
        Set the number of molecules in dataset.
        """
        dataset.metadata.number_of_molecules = count_special_lines("@<TRIPOS>MOLECULE", dataset.file_name)

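    # For illustration only (a made-up, truncated record): every molecule
    # starts with an '@<TRIPOS>MOLECULE' record type indicator, e.g.
    #
    #   @<TRIPOS>MOLECULE
    #   methane
    #     1    0    0    0    0
    #   SMALL
    #   NO_CHARGES
    #   @<TRIPOS>ATOM
    #        1 C1    0.0000  0.0000  0.0000 C.3
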
    def split( cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by molecule records.
        """
        if split_params is None:
            return None

        if len(input_datasets) > 1:
            raise Exception("MOL2-file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            raise Exception('Split mode "%s" is currently not implemented for MOL2-files.' % split_params['split_mode'])
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        def _read_mol2_records( filename ):
            # a record starts with an '@<TRIPOS>MOLECULE' line and runs
            # until the next one (or until the end of the file)
            lines = []
            start = True
            with open(filename) as handle:
                for line in handle:
                    if line.startswith("@<TRIPOS>MOLECULE"):
                        if start:
                            start = False
                        else:
                            yield lines
                            lines = []
                    lines.append( line )
            # do not drop the last record of the file
            if lines:
                yield lines

        def _write_part_mol2_file( accumulated_lines ):
            part_dir = subdir_generator_function()
            part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
            part_file = open(part_path, 'w')
            part_file.writelines( accumulated_lines )
            part_file.close()

        try:
            mol2_records = _read_mol2_records( input_files[0] )
            mol2_lines_accumulated = []
            for counter, mol2_record in enumerate( mol2_records, start = 1):
                mol2_lines_accumulated.extend( mol2_record )
                if counter % chunk_size == 0:
                    _write_part_mol2_file( mol2_lines_accumulated )
                    mol2_lines_accumulated = []
            if mol2_lines_accumulated:
                _write_part_mol2_file( mol2_lines_accumulated )
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            raise
    split = classmethod(split)


class FPS( GenericMolFile ):
    """
    chemfp fingerprint file: http://code.google.com/p/chem-fingerprints/wiki/FPS
    """
    file_ext = "fps"

    def sniff( self, filename ):
        header = get_headers( filename, sep='\t', count=1 )
        return header[0][0].strip() == '#FPS1'

    def set_meta( self, dataset, **kwd ):
        """
        Set the number of molecules in dataset.
        """
        dataset.metadata.number_of_molecules = count_special_lines('^#', dataset.file_name, invert = True)

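    # For illustration only (a made-up, truncated example): an FPS file
    # starts with the magic line '#FPS1', followed by '#'-prefixed header
    # lines and one hex-encoded fingerprint plus identifier per line
    # (tab-separated), e.g.
    #
    #   #FPS1
    #   #num_bits=166
    #   017c1c000000...	CID12345
    #
    # set_meta() above therefore counts every non-'#' line as one molecule.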

    def split( cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by fingerprint records.
        """
        if split_params is None:
            return None

        if len(input_datasets) > 1:
            raise Exception("FPS-file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            raise Exception('Split mode "%s" is currently not implemented for FPS-files.' % split_params['split_mode'])
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        def _write_part_fingerprint_file( accumulated_lines ):
            part_dir = subdir_generator_function()
            part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
            part_file = open(part_path, 'w')
            part_file.writelines( accumulated_lines )
            part_file.close()

        try:
            header_lines = []
            lines_accumulated = []
            fingerprint_counter = 0
            for line in open( input_files[0] ):
                if not line.strip():
                    continue
                if line.startswith('#'):
                    # every part gets a copy of the complete header
                    header_lines.append( line )
                else:
                    fingerprint_counter += 1
                    lines_accumulated.append( line )
                if fingerprint_counter != 0 and fingerprint_counter % chunk_size == 0:
                    _write_part_fingerprint_file( header_lines + lines_accumulated )
                    lines_accumulated = []
            if lines_accumulated:
                _write_part_fingerprint_file( header_lines + lines_accumulated )
        except Exception, e:
            log.error('Unable to split files: %s' % str(e))
            raise
    split = classmethod(split)

    def merge(split_files, output_file):
        """
        Merging fps files requires merging the header manually.
        We take the header from the first file.
        """
        if len(split_files) == 1:
            # for a single file, use the base class method (move/copy)
            return data.Text.merge(split_files, output_file)
        if not split_files:
            raise ValueError("No fps files given, %r, to merge into %s" \
                % (split_files, output_file))
        out = open(output_file, "w")
        first = True
        for filename in split_files:
            with open(filename) as handle:
                for line in handle:
                    if line.startswith('#'):
                        if first:
                            out.write(line)
                    else:
                        # the first data line marks the end of the first
                        # header; from now on, skip all further header lines
                        first = False
                        out.write(line)
        out.close()
    merge = staticmethod(merge)
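
    # A worked sketch (made-up contents): merging two parts that each
    # carry the same '#FPS1' header yields one copy of the header, taken
    # from the first part, followed by the fingerprint lines of both
    # parts, in order.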


class OBFS( Binary ):
    """OpenBabel Fastsearch format (fs)."""
    file_ext = 'fs'
    composite_type = 'basic'
    allow_datatype_change = False

    MetadataElement( name="base_name", default='OpenBabel Fastsearch Index',
                     readonly=True, visible=True, optional=True,)

    def __init__(self, **kwd):
        """
        A Fastsearch Index consists of a binary file with the fingerprints
        and a pointer to the actual molecule file.
        """
        Binary.__init__(self, **kwd)
        self.add_composite_file('molecule.fs', is_binary = True,
                                description = 'OpenBabel Fastsearch Index' )
        self.add_composite_file('molecule.sdf', optional=True,
                                is_binary = False, description = 'Molecule File' )
        self.add_composite_file('molecule.smi', optional=True,
                                is_binary = False, description = 'Molecule File' )
        self.add_composite_file('molecule.inchi', optional=True,
                                is_binary = False, description = 'Molecule File' )
        self.add_composite_file('molecule.mol2', optional=True,
                                is_binary = False, description = 'Molecule File' )
        self.add_composite_file('molecule.cml', optional=True,
                                is_binary = False, description = 'Molecule File' )
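
    # A hypothetical command-line sketch (not executed by this module) of
    # how such an index is typically built and queried with OpenBabel;
    # the SMARTS pattern is just an example:
    #
    #   obabel molecule.sdf -O molecule.fs            # build the index
    #   obabel molecule.fs -O hits.sdf -s "c1ccccc1"  # substructure search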

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text."""
        if not dataset.dataset.purged:
            dataset.peek = "OpenBabel Fastsearch Index"
            dataset.blurb = "OpenBabel Fastsearch Index"
        else:
            dataset.peek = "file does not exist"
            dataset.blurb = "file purged from disk"

    def display_peek( self, dataset ):
        """Create HTML content, used for displaying peek."""
        try:
            return dataset.peek
        except:
            return "OpenBabel Fastsearch Index"

    def display_data(self, trans, data, preview=False, filename=None,
                     to_ext=None, size=None, offset=None, **kwd):
        """Apparently an old display method, but it still gets called.

        This allows us to format the data shown in the central pane via the "eye" icon.
        """
        return "This is an OpenBabel Fastsearch format. You can speed up your similarity and substructure searches with it."

    def get_mime(self):
        """Returns the mime type of the datatype (pretend it is text for peek)"""
        return 'text/plain'

    def merge(split_files, output_file, extra_merge_args):
        """Merging Fastsearch indices is not supported."""
        raise NotImplementedError("Merging Fastsearch indices is not supported.")

    def split( cls, input_datasets, subdir_generator_function, split_params):
        """Splitting Fastsearch indices is not supported."""
        if split_params is None:
            return None
        raise NotImplementedError("Splitting Fastsearch indices is not possible.")


class DRF( GenericMolFile ):
    file_ext = "drf"

    def set_meta( self, dataset, **kwd ):
        """
        Set the number of molecules in dataset.
        """
        dataset.metadata.number_of_molecules = count_special_lines('"ligand id"', dataset.file_name, invert = True)


class PHAR( GenericMolFile ):
    file_ext = "phar"

    def set_peek( self, dataset, is_multi_byte=False ):
        if not dataset.dataset.purged:
            dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = "pharmacophore"
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'


class PDB( GenericMolFile ):
    file_ext = "pdb"

    def sniff( self, filename ):
        headers = get_headers( filename, sep=' ', count=300 )
        h = t = c = s = k = e = False
        for line in headers:
            section_name = line[0].strip()
            if section_name == 'HEADER':
                h = True
            elif section_name == 'TITLE':
                t = True
            elif section_name == 'COMPND':
                c = True
            elif section_name == 'SOURCE':
                s = True
            elif section_name == 'KEYWDS':
                k = True
            elif section_name == 'EXPDTA':
                e = True
        # a PDB file is only recognised if all six record types occur
        return h and t and c and s and k and e

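    # For illustration only (a made-up, truncated header): the records the
    # sniffer looks for appear at the top of a PDB entry, e.g.
    #
    #   HEADER    HYDROLASE                               01-JAN-00   0XXX
    #   TITLE     EXAMPLE STRUCTURE
    #   COMPND    MOL_ID: 1;
    #   SOURCE    MOL_ID: 1;
    #   KEYWDS    HYDROLASE
    #   EXPDTA    X-RAY DIFFRACTION
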
    def set_peek( self, dataset, is_multi_byte=False ):
        if not dataset.dataset.purged:
            atom_numbers = count_special_lines("^ATOM", dataset.file_name)
            hetatm_numbers = count_special_lines("^HETATM", dataset.file_name)
            dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = "%s atoms and %s HET-atoms" % (atom_numbers, hetatm_numbers)
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'


class grd( data.Text ):
    file_ext = "grd"

    def set_peek( self, dataset, is_multi_byte=False ):
        if not dataset.dataset.purged:
            dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = "grids for docking"
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'


class grdtgz( Binary ):
    file_ext = "grd.tgz"

    def set_peek( self, dataset, is_multi_byte=False ):
        if not dataset.dataset.purged:
            dataset.peek = 'binary data'
            dataset.blurb = "compressed grids for docking"
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'


class InChI( Tabular ):
    file_ext = "inchi"
    column_names = [ 'InChI' ]
    MetadataElement( name="columns", default=1, desc="Number of columns", readonly=True, visible=False )
    MetadataElement( name="column_types", default=['str'], param=metadata.ColumnTypesParameter, desc="Column types", readonly=True, visible=False )
    MetadataElement( name="number_of_molecules", default=0, desc="Number of molecules", readonly=True, visible=True, optional=True, no_value=0 )

    def set_meta( self, dataset, **kwd ):
        """
        Set the number of molecules in dataset.
        """
        dataset.metadata.number_of_molecules = self.count_data_lines(dataset)

    def set_peek( self, dataset, is_multi_byte=False ):
        if not dataset.dataset.purged:
            dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            if dataset.metadata.number_of_molecules == 1:
                dataset.blurb = "1 molecule"
            else:
                dataset.blurb = "%s molecules" % dataset.metadata.number_of_molecules
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff( self, filename ):
        """
        InChI files start with 'InChI='.
        """
        inchi_lines = get_headers( filename, sep=' ', count=10 )
        for inchi in inchi_lines:
            if not inchi[0].startswith('InChI='):
                return False
        return True

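    # For illustration only: one standard InChI string per line, e.g.
    # methane and ethanol
    #
    #   InChI=1S/CH4/h1H4
    #   InChI=1S/C2H6O/c1-2-3/h3H,2H2,1H3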

class SMILES( Tabular ):
    file_ext = "smi"
    column_names = [ 'SMILES', 'TITLE' ]
    MetadataElement( name="columns", default=2, desc="Number of columns", readonly=True, visible=False )
    MetadataElement( name="column_types", default=['str','str'], param=metadata.ColumnTypesParameter, desc="Column types", readonly=True, visible=False )
    MetadataElement( name="number_of_molecules", default=0, desc="Number of molecules", readonly=True, visible=True, optional=True, no_value=0 )

    def set_meta( self, dataset, **kwd ):
        """
        Set the number of molecules in dataset.
        """
        dataset.metadata.number_of_molecules = self.count_data_lines(dataset)

    def set_peek( self, dataset, is_multi_byte=False ):
        if not dataset.dataset.purged:
            dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            if dataset.metadata.number_of_molecules == 1:
                dataset.blurb = "1 molecule"
            else:
                dataset.blurb = "%s molecules" % dataset.metadata.number_of_molecules
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

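    # For illustration only: one SMILES string per line with an optional,
    # tab-separated title, e.g.
    #
    #   c1ccccc1	benzene
    #   CCO	ethanol
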
    '''
    def sniff( self, filename ):
        """
        It is hard or even impossible to sniff a SMILES file. We could
        try to import the first SMILES string and check whether it is a
        molecule, but it is currently not possible to use external
        libraries from the toolshed in datatype definition files. TODO
        """
        self.molecule_number = count_lines( filename, non_empty = True )
        word_count = count_lines( filename )

        if self.molecule_number != word_count:
            return False

        if self.molecule_number > 0:
            # test the first 3 SMILES strings
            smiles_lines = get_headers( filename, sep='\t', count=3 )
            for smiles_line in smiles_lines:
                if len(smiles_line) > 2:
                    return False
                smiles = smiles_line[0]
                try:
                    # if we have atoms, we have a molecule
                    if not len( pybel.readstring('smi', smiles).atoms ) > 0:
                        return False
                except:
                    # if the conversion fails, it is not a SMILES string
                    return False
            return True
        else:
            return False
    '''


if __name__ == '__main__':
    """
    TODO: We need to figure out how to put example files under
    /lib/galaxy/datatypes/test/ from a toolshed, so that doctest can work properly.
    """
    inchi = get_test_fname('drugbank_drugs.inchi')
    smiles = get_test_fname('drugbank_drugs.smi')
    sdf = get_test_fname('drugbank_drugs.sdf')
    fps = get_test_fname('50_chemfp_fingerprints_FPS1.fps')
    pdb = get_test_fname('2zbz.pdb')

    print 'SMILES test'
    print SMILES().sniff(smiles), 'smi'
    print SMILES().sniff(inchi)
    print SMILES().sniff(pdb)

    print 'InChI test'
    print InChI().sniff(smiles)
    print InChI().sniff(sdf)
    print InChI().sniff(inchi), 'inchi'

    print 'FPS test'
    print FPS().sniff(smiles)
    print FPS().sniff(sdf)
    f = FPS()
    print f.sniff(fps)

    print 'SDF test'
    print SDF().sniff(smiles)
    print SDF().sniff(sdf), 'sdf'
    print SDF().sniff(fps)

    print 'PDB test'
    print PDB().sniff(smiles)
    print PDB().sniff(sdf)
    print PDB().sniff(fps)
    print PDB().sniff(pdb), 'pdb'