0
|
1 # -*- coding: utf-8 -*-
|
|
2
|
|
3 from galaxy.datatypes import data
|
|
4 import logging
|
|
5 from galaxy.datatypes.sniff import get_headers, get_test_fname
|
|
6 from galaxy.datatypes.data import get_file_peek
|
|
7 from galaxy.datatypes.tabular import Tabular
|
|
8 from galaxy.datatypes.binary import Binary
|
4
|
9 from galaxy.datatypes.xml import GenericXml
|
0
|
10 import subprocess
|
|
11 import os
|
|
12 #import pybel
|
|
13 #import openbabel
|
|
14 #openbabel.obErrorLog.StopLogging()
|
|
15
|
|
16 from galaxy.datatypes.metadata import MetadataElement
|
|
17 from galaxy.datatypes import metadata
|
|
18
|
|
19 log = logging.getLogger(__name__)
|
|
20
|
|
21 def count_special_lines( word, filename, invert = False ):
|
|
22 """
|
|
23 searching for special 'words' using the grep tool
|
|
24 grep is used to speed up the searching and counting
|
|
25 The number of hits is returned.
|
|
26 """
|
|
27 try:
|
|
28 cmd = ["grep", "-c"]
|
|
29 if invert:
|
|
30 cmd.append('-v')
|
|
31 cmd.extend([word, filename])
|
|
32 out = subprocess.Popen(cmd, stdout=subprocess.PIPE)
|
|
33 return int(out.communicate()[0].split()[0])
|
|
34 except:
|
|
35 pass
|
|
36 return 0
|
|
37
|
|
38 def count_lines( filename, non_empty = False):
|
|
39 """
|
|
40 counting the number of lines from the 'filename' file
|
|
41 """
|
|
42 try:
|
|
43 if non_empty:
|
|
44 out = subprocess.Popen(['grep', '-cve', '^\s*$', filename], stdout=subprocess.PIPE)
|
|
45 else:
|
|
46 out = subprocess.Popen(['wc', '-l', filename], stdout=subprocess.PIPE)
|
|
47 return int(out.communicate()[0].split()[0])
|
|
48 except:
|
|
49 pass
|
|
50 return 0
|
|
51
|
|
52
|
|
53 class GenericMolFile( data.Text ):
|
|
54 """
|
|
55 abstract class for most of the molecule files
|
|
56 """
|
|
57 MetadataElement( name="number_of_molecules", default=0, desc="Number of molecules", readonly=True, visible=True, optional=True, no_value=0 )
|
|
58
|
|
59 def set_peek( self, dataset, is_multi_byte=False ):
|
|
60 if not dataset.dataset.purged:
|
|
61 dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
|
|
62 if (dataset.metadata.number_of_molecules == 1):
|
|
63 dataset.blurb = "1 molecule"
|
|
64 else:
|
|
65 dataset.blurb = "%s molecules" % dataset.metadata.number_of_molecules
|
|
66 dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
|
|
67 else:
|
|
68 dataset.peek = 'file does not exist'
|
|
69 dataset.blurb = 'file purged from disk'
|
|
70
|
|
71 def get_mime(self):
|
|
72 return 'text/plain'
|
|
73
|
2
|
74 class MOL( GenericMolFile ):
|
|
75 file_ext = "mol"
|
|
76 def sniff( self, filename ):
|
3
|
77 if count_special_lines("^M\s*END", filename) == 1:
|
2
|
78 return True
|
|
79 else:
|
|
80 return False
|
|
81
|
|
82 def set_meta( self, dataset, **kwd ):
|
|
83 """
|
|
84 Set the number molecules, in the case of MOL its always one.
|
|
85 """
|
|
86 dataset.metadata.number_of_molecules = 1
|
0
|
87
|
|
88
|
|
89 class SDF( GenericMolFile ):
|
|
90 file_ext = "sdf"
|
|
91 def sniff( self, filename ):
|
|
92 if count_special_lines("^\$\$\$\$", filename) > 0:
|
|
93 return True
|
|
94 else:
|
|
95 return False
|
|
96
|
|
97 def set_meta( self, dataset, **kwd ):
|
|
98 """
|
2
|
99 Set the number of molecules in dataset.
|
0
|
100 """
|
2
|
101 dataset.metadata.number_of_molecules = count_special_lines("^\$\$\$\$", dataset.file_name)
|
0
|
102
|
|
103 def split( cls, input_datasets, subdir_generator_function, split_params):
|
|
104 """
|
|
105 Split the input files by molecule records.
|
|
106 """
|
|
107 if split_params is None:
|
|
108 return None
|
|
109
|
|
110 if len(input_datasets) > 1:
|
|
111 raise Exception("SD-file splitting does not support multiple files")
|
|
112 input_files = [ds.file_name for ds in input_datasets]
|
|
113
|
|
114 chunk_size = None
|
|
115 if split_params['split_mode'] == 'number_of_parts':
|
|
116 raise Exception('Split mode "%s" is currently not implemented for SD-files.' % split_params['split_mode'])
|
|
117 elif split_params['split_mode'] == 'to_size':
|
|
118 chunk_size = int(split_params['split_size'])
|
|
119 else:
|
|
120 raise Exception('Unsupported split mode %s' % split_params['split_mode'])
|
|
121
|
|
122 def _read_sdf_records( filename ):
|
|
123 lines = []
|
|
124 with open(filename) as handle:
|
|
125 for line in handle:
|
|
126 lines.append( line )
|
|
127 if line.startswith("$$$$"):
|
|
128 yield lines
|
|
129 lines = []
|
|
130
|
|
131 def _write_part_sdf_file( accumulated_lines ):
|
|
132 part_dir = subdir_generator_function()
|
|
133 part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
|
|
134 part_file = open(part_path, 'w')
|
|
135 part_file.writelines( accumulated_lines )
|
|
136 part_file.close()
|
|
137
|
|
138 try:
|
|
139 sdf_records = _read_sdf_records( input_files[0] )
|
|
140 sdf_lines_accumulated = []
|
|
141 for counter, sdf_record in enumerate( sdf_records, start = 1):
|
|
142 sdf_lines_accumulated.extend( sdf_record )
|
|
143 if counter % chunk_size == 0:
|
|
144 _write_part_sdf_file( sdf_lines_accumulated )
|
|
145 sdf_lines_accumulated = []
|
|
146 if sdf_lines_accumulated:
|
|
147 _write_part_sdf_file( sdf_lines_accumulated )
|
|
148 except Exception, e:
|
|
149 log.error('Unable to split files: %s' % str(e))
|
|
150 raise
|
|
151 split = classmethod(split)
|
|
152
|
|
153
|
|
154 class MOL2( GenericMolFile ):
|
|
155 file_ext = "mol2"
|
|
156 def sniff( self, filename ):
|
|
157 if count_special_lines("@\<TRIPOS\>MOLECULE", filename) > 0:
|
|
158 return True
|
|
159 else:
|
|
160 return False
|
|
161
|
|
162 def set_meta( self, dataset, **kwd ):
|
|
163 """
|
|
164 Set the number of lines of data in dataset.
|
|
165 """
|
|
166 dataset.metadata.number_of_molecules = count_special_lines("@<TRIPOS>MOLECULE", dataset.file_name)#self.count_data_lines(dataset)
|
|
167
|
|
168 def split( cls, input_datasets, subdir_generator_function, split_params):
|
|
169 """
|
|
170 Split the input files by molecule records.
|
|
171 """
|
|
172 if split_params is None:
|
|
173 return None
|
|
174
|
|
175 if len(input_datasets) > 1:
|
|
176 raise Exception("MOL2-file splitting does not support multiple files")
|
|
177 input_files = [ds.file_name for ds in input_datasets]
|
|
178
|
|
179 chunk_size = None
|
|
180 if split_params['split_mode'] == 'number_of_parts':
|
|
181 raise Exception('Split mode "%s" is currently not implemented for MOL2-files.' % split_params['split_mode'])
|
|
182 elif split_params['split_mode'] == 'to_size':
|
|
183 chunk_size = int(split_params['split_size'])
|
|
184 else:
|
|
185 raise Exception('Unsupported split mode %s' % split_params['split_mode'])
|
|
186
|
4
|
187 def _read_mol2_records( filename ):
|
0
|
188 lines = []
|
|
189 start = True
|
|
190 with open(filename) as handle:
|
|
191 for line in handle:
|
|
192 if line.startswith("@<TRIPOS>MOLECULE"):
|
|
193 if start:
|
|
194 start = False
|
|
195 else:
|
|
196 yield lines
|
|
197 lines = []
|
|
198 lines.append( line )
|
|
199
|
|
200 def _write_part_mol2_file( accumulated_lines ):
|
|
201 part_dir = subdir_generator_function()
|
|
202 part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
|
|
203 part_file = open(part_path, 'w')
|
|
204 part_file.writelines( accumulated_lines )
|
|
205 part_file.close()
|
|
206
|
|
207 try:
|
4
|
208 mol2_records = _read_mol2_records( input_files[0] )
|
|
209 mol2_lines_accumulated = []
|
|
210 for counter, mol2_record in enumerate( mol2_records, start = 1):
|
|
211 mol2_lines_accumulated.extend( mol2_record )
|
0
|
212 if counter % chunk_size == 0:
|
4
|
213 _write_part_mol2_file( mol2_lines_accumulated )
|
|
214 mol2_lines_accumulated = []
|
|
215 if mol2_lines_accumulated:
|
|
216 _write_part_mol2_file( mol2_lines_accumulated )
|
0
|
217 except Exception, e:
|
|
218 log.error('Unable to split files: %s' % str(e))
|
|
219 raise
|
|
220 split = classmethod(split)
|
|
221
|
|
222
|
|
223
|
|
224 class FPS( GenericMolFile ):
|
|
225 """
|
4
|
226 chemfp fingerprint file: http://code.google.com/p/chem-fingerprints/wiki/FPS
|
0
|
227 """
|
|
228 file_ext = "fps"
|
|
229 def sniff( self, filename ):
|
|
230 header = get_headers( filename, sep='\t', count=1 )
|
|
231 if header[0][0].strip() == '#FPS1':
|
|
232 return True
|
|
233 else:
|
|
234 return False
|
|
235
|
|
236 def set_meta( self, dataset, **kwd ):
|
|
237 """
|
|
238 Set the number of lines of data in dataset.
|
|
239 """
|
4
|
240 dataset.metadata.number_of_molecules = count_special_lines('^#', dataset.file_name, invert = True)
|
0
|
241
|
|
242
|
|
243 def split( cls, input_datasets, subdir_generator_function, split_params):
|
|
244 """
|
|
245 Split the input files by fingerprint records.
|
|
246 """
|
|
247 if split_params is None:
|
|
248 return None
|
|
249
|
|
250 if len(input_datasets) > 1:
|
|
251 raise Exception("FPS-file splitting does not support multiple files")
|
|
252 input_files = [ds.file_name for ds in input_datasets]
|
|
253
|
|
254 chunk_size = None
|
|
255 if split_params['split_mode'] == 'number_of_parts':
|
|
256 raise Exception('Split mode "%s" is currently not implemented for MOL2-files.' % split_params['split_mode'])
|
|
257 elif split_params['split_mode'] == 'to_size':
|
|
258 chunk_size = int(split_params['split_size'])
|
|
259 else:
|
|
260 raise Exception('Unsupported split mode %s' % split_params['split_mode'])
|
|
261
|
|
262
|
|
263 def _write_part_fingerprint_file( accumulated_lines ):
|
|
264 part_dir = subdir_generator_function()
|
|
265 part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
|
|
266 part_file = open(part_path, 'w')
|
|
267 part_file.writelines( accumulated_lines )
|
|
268 part_file.close()
|
|
269
|
|
270 try:
|
|
271 header_lines = []
|
|
272 lines_accumulated = []
|
|
273 fingerprint_counter = 0
|
|
274 for line in open( input_files[0] ):
|
|
275 if not line.strip():
|
|
276 continue
|
|
277 if line.startswith('#'):
|
|
278 header_lines.append( line )
|
|
279 else:
|
|
280 fingerprint_counter += 1
|
|
281 lines_accumulated.append( line )
|
|
282 if fingerprint_counter != 0 and fingerprint_counter % chunk_size == 0:
|
|
283 _write_part_fingerprint_file( header_lines + lines_accumulated )
|
|
284 lines_accumulated = []
|
|
285 if lines_accumulated:
|
|
286 _write_part_fingerprint_file( header_lines + lines_accumulated )
|
|
287 except Exception, e:
|
|
288 log.error('Unable to split files: %s' % str(e))
|
|
289 raise
|
|
290 split = classmethod(split)
|
|
291
|
|
292
|
|
293 def merge(split_files, output_file):
|
|
294 """
|
|
295 Merging fps files requires merging the header manually.
|
|
296 We take the header from the first file.
|
|
297 """
|
|
298 if len(split_files) == 1:
|
|
299 #For one file only, use base class method (move/copy)
|
|
300 return data.Text.merge(split_files, output_file)
|
|
301 if not split_files:
|
|
302 raise ValueError("No fps files given, %r, to merge into %s" \
|
|
303 % (split_files, output_file))
|
|
304 out = open(output_file, "w")
|
|
305 first = True
|
|
306 for filename in split_files:
|
|
307 with open(filename) as handle:
|
|
308 for line in handle:
|
|
309 if line.startswith('#'):
|
|
310 if first:
|
|
311 out.write(line)
|
|
312 else:
|
|
313 # line is no header and not a comment, we assume the first header is written to out and we set 'first' to False
|
|
314 first = False
|
|
315 out.write(line)
|
|
316 out.close()
|
|
317 merge = staticmethod(merge)
|
|
318
|
|
319
|
|
320
|
|
321 class OBFS( Binary ):
|
|
322 """OpenBabel Fastsearch format (fs)."""
|
|
323 file_ext = 'fs'
|
|
324 composite_type ='basic'
|
|
325 allow_datatype_change = False
|
|
326
|
|
327 MetadataElement( name="base_name", default='OpenBabel Fastsearch Index',
|
|
328 readonly=True, visible=True, optional=True,)
|
|
329
|
|
330 def __init__(self,**kwd):
|
|
331 """
|
|
332 A Fastsearch Index consists of a binary file with the fingerprints
|
|
333 and a pointer the actual molecule file.
|
|
334 """
|
|
335 Binary.__init__(self, **kwd)
|
|
336 self.add_composite_file('molecule.fs', is_binary = True,
|
|
337 description = 'OpenBabel Fastsearch Index' )
|
|
338 self.add_composite_file('molecule.sdf', optional=True,
|
|
339 is_binary = False, description = 'Molecule File' )
|
|
340 self.add_composite_file('molecule.smi', optional=True,
|
|
341 is_binary = False, description = 'Molecule File' )
|
|
342 self.add_composite_file('molecule.inchi', optional=True,
|
|
343 is_binary = False, description = 'Molecule File' )
|
|
344 self.add_composite_file('molecule.mol2', optional=True,
|
|
345 is_binary = False, description = 'Molecule File' )
|
|
346 self.add_composite_file('molecule.cml', optional=True,
|
|
347 is_binary = False, description = 'Molecule File' )
|
|
348
|
|
349 def set_peek( self, dataset, is_multi_byte=False ):
|
|
350 """Set the peek and blurb text."""
|
|
351 if not dataset.dataset.purged:
|
|
352 dataset.peek = "OpenBabel Fastsearch Index"
|
|
353 dataset.blurb = "OpenBabel Fastsearch Index"
|
|
354 else:
|
|
355 dataset.peek = "file does not exist"
|
|
356 dataset.blurb = "file purged from disk"
|
|
357
|
|
358 def display_peek( self, dataset ):
|
|
359 """Create HTML content, used for displaying peek."""
|
|
360 try:
|
|
361 return dataset.peek
|
|
362 except:
|
|
363 return "OpenBabel Fastsearch Index"
|
|
364
|
|
365 def display_data(self, trans, data, preview=False, filename=None,
|
|
366 to_ext=None, size=None, offset=None, **kwd):
|
|
367 """Apparently an old display method, but still gets called.
|
|
368
|
|
369 This allows us to format the data shown in the central pane via the "eye" icon.
|
|
370 """
|
|
371 return "This is a OpenBabel Fastsearch format. You can speed up your similarity and substructure search with it."
|
|
372
|
|
373 def get_mime(self):
|
|
374 """Returns the mime type of the datatype (pretend it is text for peek)"""
|
|
375 return 'text/plain'
|
|
376
|
|
377 def merge(split_files, output_file, extra_merge_args):
|
|
378 """Merging Fastsearch indices is not supported."""
|
|
379 raise NotImplementedError("Merging Fastsearch indices is not supported.")
|
|
380
|
|
381 def split( cls, input_datasets, subdir_generator_function, split_params):
|
|
382 """Splitting Fastsearch indices is not supported."""
|
|
383 if split_params is None:
|
|
384 return None
|
|
385 raise NotImplementedError("Splitting Fastsearch indices is not possible.")
|
|
386
|
|
387
|
|
388
|
|
389 class DRF( GenericMolFile ):
|
|
390 file_ext = "drf"
|
|
391
|
|
392 def set_meta( self, dataset, **kwd ):
|
|
393 """
|
|
394 Set the number of lines of data in dataset.
|
|
395 """
|
4
|
396 dataset.metadata.number_of_molecules = count_special_lines('\"ligand id\"', dataset.file_name, invert = True)
|
0
|
397
|
|
398
|
|
399 class PHAR( GenericMolFile ):
|
4
|
400 """
|
|
401 Pharmacophore database format from silicos-it.
|
|
402 """
|
0
|
403 file_ext = "phar"
|
|
404 def set_peek( self, dataset, is_multi_byte=False ):
|
|
405 if not dataset.dataset.purged:
|
|
406 dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
|
|
407 dataset.blurb = "pharmacophore"
|
|
408 else:
|
|
409 dataset.peek = 'file does not exist'
|
|
410 dataset.blurb = 'file purged from disk'
|
|
411
|
|
412
|
|
413 class PDB( GenericMolFile ):
|
4
|
414 """
|
|
415 Protein Databank format.
|
|
416 http://www.wwpdb.org/documentation/format33/v3.3.html
|
|
417 """
|
0
|
418 file_ext = "pdb"
|
|
419 def sniff( self, filename ):
|
|
420 headers = get_headers( filename, sep=' ', count=300 )
|
|
421 h = t = c = s = k = e = False
|
|
422 for line in headers:
|
|
423 section_name = line[0].strip()
|
|
424 if section_name == 'HEADER':
|
|
425 h = True
|
|
426 elif section_name == 'TITLE':
|
|
427 t = True
|
|
428 elif section_name == 'COMPND':
|
|
429 c = True
|
|
430 elif section_name == 'SOURCE':
|
|
431 s = True
|
|
432 elif section_name == 'KEYWDS':
|
|
433 k = True
|
|
434 elif section_name == 'EXPDTA':
|
|
435 e = True
|
|
436
|
|
437 if h*t*c*s*k*e == True:
|
|
438 return True
|
|
439 else:
|
|
440 return False
|
|
441
|
|
442 def set_peek( self, dataset, is_multi_byte=False ):
|
|
443 if not dataset.dataset.purged:
|
|
444 atom_numbers = count_special_lines("^ATOM", dataset.file_name)
|
|
445 hetatm_numbers = count_special_lines("^HETATM", dataset.file_name)
|
|
446 dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
|
|
447 dataset.blurb = "%s atoms and %s HET-atoms" % (atom_numbers, hetatm_numbers)
|
|
448 else:
|
|
449 dataset.peek = 'file does not exist'
|
|
450 dataset.blurb = 'file purged from disk'
|
|
451
|
|
452
|
|
453 class grd( data.Text ):
|
|
454 file_ext = "grd"
|
|
455 def set_peek( self, dataset, is_multi_byte=False ):
|
|
456 if not dataset.dataset.purged:
|
|
457 dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
|
|
458 dataset.blurb = "grids for docking"
|
|
459 else:
|
|
460 dataset.peek = 'file does not exist'
|
|
461 dataset.blurb = 'file purged from disk'
|
|
462
|
|
463
|
|
464 class grdtgz( Binary ):
|
|
465 file_ext = "grd.tgz"
|
|
466 def set_peek( self, dataset, is_multi_byte=False ):
|
|
467 if not dataset.dataset.purged:
|
|
468 dataset.peek = 'binary data'
|
|
469 dataset.blurb = "compressed grids for docking"
|
|
470 else:
|
|
471 dataset.peek = 'file does not exist'
|
|
472 dataset.blurb = 'file purged from disk'
|
|
473
|
|
474
|
|
475 class InChI( Tabular ):
|
|
476 file_ext = "inchi"
|
|
477 column_names = [ 'InChI' ]
|
|
478 MetadataElement( name="columns", default=2, desc="Number of columns", readonly=True, visible=False )
|
|
479 MetadataElement( name="column_types", default=['str'], param=metadata.ColumnTypesParameter, desc="Column types", readonly=True, visible=False )
|
|
480 MetadataElement( name="number_of_molecules", default=0, desc="Number of molecules", readonly=True, visible=True, optional=True, no_value=0 )
|
|
481
|
|
482 def set_meta( self, dataset, **kwd ):
|
|
483 """
|
|
484 Set the number of lines of data in dataset.
|
|
485 """
|
|
486 dataset.metadata.number_of_molecules = self.count_data_lines(dataset)
|
|
487
|
|
488 def set_peek( self, dataset, is_multi_byte=False ):
|
|
489 if not dataset.dataset.purged:
|
|
490 dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
|
|
491 if (dataset.metadata.number_of_molecules == 1):
|
|
492 dataset.blurb = "1 molecule"
|
|
493 else:
|
|
494 dataset.blurb = "%s molecules" % dataset.metadata.number_of_molecules
|
|
495 dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
|
|
496 else:
|
|
497 dataset.peek = 'file does not exist'
|
|
498 dataset.blurb = 'file purged from disk'
|
|
499
|
|
500 def sniff( self, filename ):
|
|
501 """
|
|
502 InChI files starts with 'InChI='
|
|
503 """
|
|
504 inchi_lines = get_headers( filename, sep=' ', count=10 )
|
|
505 for inchi in inchi_lines:
|
|
506 if not inchi[0].startswith('InChI='):
|
|
507 return False
|
|
508 return True
|
|
509
|
|
510
|
|
511 class SMILES( Tabular ):
|
|
512 file_ext = "smi"
|
|
513 column_names = [ 'SMILES', 'TITLE' ]
|
|
514 MetadataElement( name="columns", default=2, desc="Number of columns", readonly=True, visible=False )
|
|
515 MetadataElement( name="column_types", default=['str','str'], param=metadata.ColumnTypesParameter, desc="Column types", readonly=True, visible=False )
|
|
516 MetadataElement( name="number_of_molecules", default=0, desc="Number of molecules", readonly=True, visible=True, optional=True, no_value=0 )
|
|
517
|
|
518 def set_meta( self, dataset, **kwd ):
|
|
519 """
|
|
520 Set the number of lines of data in dataset.
|
|
521 """
|
|
522 dataset.metadata.number_of_molecules = self.count_data_lines(dataset)
|
|
523
|
|
524 def set_peek( self, dataset, is_multi_byte=False ):
|
|
525 if not dataset.dataset.purged:
|
|
526 dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
|
|
527 if (dataset.metadata.number_of_molecules == 1):
|
|
528 dataset.blurb = "1 molecule"
|
|
529 else:
|
|
530 dataset.blurb = "%s molecules" % dataset.metadata.number_of_molecules
|
|
531 dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
|
|
532 else:
|
|
533 dataset.peek = 'file does not exist'
|
|
534 dataset.blurb = 'file purged from disk'
|
|
535
|
|
536
|
|
537 '''
|
|
538 def sniff( self, filename ):
|
|
539 """
|
|
540 Its hard or impossible to sniff a SMILES File. We can
|
|
541 try to import the first SMILES and check if it is a molecule, but
|
|
542 currently its not possible to use external libraries from the toolshed
|
|
543 in datatype definition files. TODO
|
|
544 """
|
|
545 self.molecule_number = count_lines( filename, non_empty = True )
|
|
546 word_count = count_lines( filename )
|
|
547
|
|
548 if self.molecule_number != word_count:
|
|
549 return False
|
|
550
|
|
551 if self.molecule_number > 0:
|
|
552 # test first 3 SMILES
|
|
553 smiles_lines = get_headers( filename, sep='\t', count=3 )
|
|
554 for smiles_line in smiles_lines:
|
|
555 if len(smiles_line) > 2:
|
|
556 return False
|
|
557 smiles = smiles_line[0]
|
|
558 try:
|
|
559 # if we have atoms, we have a molecule
|
|
560 if not len( pybel.readstring('smi', smiles).atoms ) > 0:
|
|
561 return False
|
|
562 except:
|
|
563 # if convert fails its not a smiles string
|
|
564 return False
|
|
565 return True
|
|
566 else:
|
|
567 return False
|
|
568 '''
|
|
569
|
|
570
|
4
|
571
|
|
572 class CML( GenericXml ):
|
|
573 """
|
|
574 Chemical Markup Language
|
|
575 http://cml.sourceforge.net/
|
|
576 """
|
|
577 file_ext = "cml"
|
|
578 MetadataElement( name="number_of_molecules", default=0, desc="Number of molecules", readonly=True, visible=True, optional=True, no_value=0 )
|
|
579
|
|
580
|
|
581 def set_meta( self, dataset, **kwd ):
|
|
582 """
|
|
583 Set the number of lines of data in dataset.
|
|
584 """
|
|
585 dataset.metadata.number_of_molecules = count_special_lines( '^\s*<molecule', dataset.file_name )
|
|
586
|
|
587 def set_peek( self, dataset, is_multi_byte=False ):
|
|
588 if not dataset.dataset.purged:
|
|
589 dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
|
|
590 if (dataset.metadata.number_of_molecules == 1):
|
|
591 dataset.blurb = "1 molecule"
|
|
592 else:
|
|
593 dataset.blurb = "%s molecules" % dataset.metadata.number_of_molecules
|
|
594 dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
|
|
595 else:
|
|
596 dataset.peek = 'file does not exist'
|
|
597 dataset.blurb = 'file purged from disk'
|
|
598
|
|
599 def sniff( self, filename ):
|
|
600 """
|
|
601 Try to guess if the file is a CML file.
|
|
602 TODO: add true positive test, need to submit a CML example
|
|
603
|
|
604 >>> fname = get_test_fname( 'interval.interval' )
|
|
605 >>> CML().sniff( fname )
|
|
606 False
|
|
607 """
|
|
608 handle = open(filename)
|
|
609 line = handle.readline()
|
|
610 if line.strip() != '<?xml version="1.0"?>':
|
|
611 handle.close()
|
|
612 return False
|
|
613 line = handle.readline()
|
|
614 if line.strip().find('http://www.xml-cml.org/schema') == -1:
|
|
615 handle.close()
|
|
616 return False
|
|
617 handle.close()
|
|
618 return True
|
|
619
|
|
620
|
|
621 def split( cls, input_datasets, subdir_generator_function, split_params):
|
|
622 """
|
|
623 Split the input files by molecule records.
|
|
624 """
|
|
625 if split_params is None:
|
|
626 return None
|
|
627
|
|
628 if len(input_datasets) > 1:
|
|
629 raise Exception("CML-file splitting does not support multiple files")
|
|
630 input_files = [ds.file_name for ds in input_datasets]
|
|
631
|
|
632 chunk_size = None
|
|
633 if split_params['split_mode'] == 'number_of_parts':
|
|
634 raise Exception('Split mode "%s" is currently not implemented for CML-files.' % split_params['split_mode'])
|
|
635 elif split_params['split_mode'] == 'to_size':
|
|
636 chunk_size = int(split_params['split_size'])
|
|
637 else:
|
|
638 raise Exception('Unsupported split mode %s' % split_params['split_mode'])
|
|
639
|
|
640 def _read_cml_records( filename ):
|
|
641 lines = []
|
|
642 with open(filename) as handle:
|
|
643 for line in handle:
|
|
644 if line.lstrip().startswith('<?xml version="1.0"?>') or \
|
|
645 line.lstrip().startswith('<cml xmlns="http://www.xml-cml.org/schema') or \
|
|
646 line.lstrip().startswith('</cml>'):
|
|
647 continue
|
|
648 lines.append( line )
|
|
649 if line.lstrip().startswith('</molecule>'):
|
|
650 yield lines
|
|
651 lines = []
|
|
652
|
|
653 header_lines = ['<?xml version="1.0"?>\n', '<cml xmlns="http://www.xml-cml.org/schema">\n']
|
|
654 footer_line = ['</cml>\n']
|
|
655 def _write_part_cml_file( accumulated_lines ):
|
|
656 part_dir = subdir_generator_function()
|
|
657 part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
|
|
658 part_file = open(part_path, 'w')
|
|
659 part_file.writelines( header_lines )
|
|
660 part_file.writelines( accumulated_lines )
|
|
661 part_file.writelines( footer_line )
|
|
662 part_file.close()
|
|
663
|
|
664 try:
|
|
665 cml_records = _read_cml_records( input_files[0] )
|
|
666 cml_lines_accumulated = []
|
|
667 for counter, cml_record in enumerate( cml_records, start = 1):
|
|
668 cml_lines_accumulated.extend( cml_record )
|
|
669 if counter % chunk_size == 0:
|
|
670 _write_part_cml_file( cml_lines_accumulated )
|
|
671 cml_lines_accumulated = []
|
|
672 if cml_lines_accumulated:
|
|
673 _write_part_cml_file( cml_lines_accumulated )
|
|
674 except Exception, e:
|
|
675 log.error('Unable to split files: %s' % str(e))
|
|
676 raise
|
|
677 split = classmethod(split)
|
|
678
|
|
679
|
|
680 def merge(split_files, output_file):
|
|
681 """
|
|
682 Merging CML files.
|
|
683 """
|
|
684 if len(split_files) == 1:
|
|
685 #For one file only, use base class method (move/copy)
|
|
686 return Text.merge(split_files, output_file)
|
|
687 if not split_files:
|
|
688 raise ValueError("Given no CML files, %r, to merge into %s" \
|
|
689 % (split_files, output_file))
|
|
690 with open(output_file, "w") as out:
|
|
691 for filename in split_files:
|
|
692 with open( filename ) as handle:
|
|
693 header = handle.readline()
|
|
694 if not header:
|
|
695 raise ValueError("CML file %s was empty" % f)
|
|
696 if not header.lstrip().startswith('<?xml version="1.0"?>'):
|
|
697 out.write(header)
|
|
698 raise ValueError("%s is not a valid XML file!" % f)
|
|
699 line = handle.readline()
|
|
700 header += line
|
|
701 if not line.lstrip().startswith('<cml xmlns="http://www.xml-cml.org/schema'):
|
|
702 out.write(header)
|
|
703 raise ValueError("%s is not a CML file!" % f)
|
|
704 molecule_found = False
|
|
705 for line in handle.readlines():
|
|
706 # we found two required header lines, the next line should start with <molecule >
|
|
707 if line.lstrip().startswith('</cml>'):
|
|
708 continue
|
|
709 if line.lstrip().startswith('<molecule'):
|
|
710 molecule_found = True
|
|
711 if molecule_found:
|
|
712 out.write( line )
|
|
713 out.write("</cml>\n")
|
|
714 merge = staticmethod(merge)
|
|
715
|
|
716
|
0
|
717 if __name__ == '__main__':
|
|
718 """
|
4
|
719 TODO: We need to figure out, how to put example files under /lib/galaxy/datatypes/test/ from a toolshed, so that doctest can work properly.
|
0
|
720 """
|
|
721 inchi = get_test_fname('drugbank_drugs.inchi')
|
|
722 smiles = get_test_fname('drugbank_drugs.smi')
|
|
723 sdf = get_test_fname('drugbank_drugs.sdf')
|
|
724 fps = get_test_fname('50_chemfp_fingerprints_FPS1.fps')
|
|
725 pdb = get_test_fname('2zbz.pdb')
|
4
|
726 cml = get_test_fname('/home/bag/Downloads/approved.cml')
|
0
|
727
|
4
|
728 print 'CML test'
|
|
729 print CML().sniff(cml), 'cml'
|
|
730 print CML().sniff(inchi)
|
|
731 print CML().sniff(pdb)
|
|
732 CML().split()
|
|
733 """
|
0
|
734 print 'SMILES test'
|
|
735 print SMILES().sniff(smiles), 'smi'
|
|
736 print SMILES().sniff(inchi)
|
|
737 print SMILES().sniff(pdb)
|
4
|
738 """
|
0
|
739 print 'InChI test'
|
|
740 print InChI().sniff(smiles)
|
|
741 print InChI().sniff(sdf)
|
|
742 print InChI().sniff(inchi), 'inchi'
|
|
743
|
|
744 print 'FPS test'
|
|
745 print FPS().sniff(smiles)
|
|
746 print FPS().sniff(sdf)
|
|
747 f = FPS()
|
|
748 print f.sniff(fps)
|
|
749
|
|
750 print 'SDF test'
|
|
751 print SDF().sniff(smiles)
|
|
752 print SDF().sniff(sdf), 'sdf'
|
|
753 print SDF().sniff(fps)
|
|
754
|
|
755 print 'PDB test'
|
|
756 print PDB().sniff(smiles)
|
|
757 print PDB().sniff(sdf)
|
|
758 print PDB().sniff(fps)
|
|
759 print PDB().sniff(pdb), 'pdb'
|