annotate molecules.py @ 0:8714f927a6ee draft default tip

Uploaded
author iuc
date Tue, 29 Oct 2013 11:14:04 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
8714f927a6ee Uploaded
iuc
parents:
diff changeset
1 # -*- coding: utf-8 -*-
8714f927a6ee Uploaded
iuc
parents:
diff changeset
2
8714f927a6ee Uploaded
iuc
parents:
diff changeset
3 from galaxy.datatypes import data
8714f927a6ee Uploaded
iuc
parents:
diff changeset
4 import logging
8714f927a6ee Uploaded
iuc
parents:
diff changeset
5 from galaxy.datatypes.sniff import get_headers, get_test_fname
8714f927a6ee Uploaded
iuc
parents:
diff changeset
6 from galaxy.datatypes.data import get_file_peek
8714f927a6ee Uploaded
iuc
parents:
diff changeset
7 from galaxy.datatypes.tabular import Tabular
8714f927a6ee Uploaded
iuc
parents:
diff changeset
8 from galaxy.datatypes.binary import Binary
8714f927a6ee Uploaded
iuc
parents:
diff changeset
9 from galaxy.datatypes.xml import GenericXml
8714f927a6ee Uploaded
iuc
parents:
diff changeset
10 import subprocess
8714f927a6ee Uploaded
iuc
parents:
diff changeset
11 import os
8714f927a6ee Uploaded
iuc
parents:
diff changeset
12 #import pybel
8714f927a6ee Uploaded
iuc
parents:
diff changeset
13 #import openbabel
8714f927a6ee Uploaded
iuc
parents:
diff changeset
14 #openbabel.obErrorLog.StopLogging()
8714f927a6ee Uploaded
iuc
parents:
diff changeset
15
8714f927a6ee Uploaded
iuc
parents:
diff changeset
16 from galaxy.datatypes.metadata import MetadataElement
8714f927a6ee Uploaded
iuc
parents:
diff changeset
17 from galaxy.datatypes import metadata
8714f927a6ee Uploaded
iuc
parents:
diff changeset
18
8714f927a6ee Uploaded
iuc
parents:
diff changeset
19 log = logging.getLogger(__name__)
8714f927a6ee Uploaded
iuc
parents:
diff changeset
20
8714f927a6ee Uploaded
iuc
parents:
diff changeset
def count_special_lines( word, filename, invert=False ):
    """
    Count the lines of 'filename' that match the grep pattern 'word'.

    grep is used to speed up the searching and counting.

    word     -- regular expression handed to grep
    filename -- path of the file to search
    invert   -- if True, count the lines that do NOT match (grep -v)

    The number of hits is returned. 0 is returned when grep cannot be
    started or does not print a parsable count (e.g. unreadable file).
    """
    cmd = ["grep", "-c"]
    if invert:
        cmd.append('-v')
    # '-e' and '--' keep a pattern or a path that starts with '-' from
    # being parsed as a grep option.
    cmd.extend(['-e', word, '--', filename])
    try:
        out = subprocess.Popen(cmd, stdout=subprocess.PIPE)
        return int(out.communicate()[0].split()[0])
    except (OSError, ValueError, IndexError):
        # best effort: grep is missing, or it printed no usable count
        return 0
8714f927a6ee Uploaded
iuc
parents:
diff changeset
37
8714f927a6ee Uploaded
iuc
parents:
diff changeset
def count_lines( filename, non_empty=False ):
    """
    Count the lines of the 'filename' file.

    filename  -- path of the file to count
    non_empty -- if True, lines made of whitespace only are not counted
                 (done with grep); otherwise 'wc -l' counts every line

    The line count is returned. 0 is returned when the external tool
    cannot be started or does not print a parsable count.
    """
    if non_empty:
        # -c count, -v invert, -e pattern: count lines that are not blank
        cmd = ['grep', '-cve', r'^\s*$', '--', filename]
    else:
        cmd = ['wc', '-l', '--', filename]
    try:
        out = subprocess.Popen(cmd, stdout=subprocess.PIPE)
        return int(out.communicate()[0].split()[0])
    except (OSError, ValueError, IndexError):
        # best effort: tool is missing, or it printed no usable count
        return 0
8714f927a6ee Uploaded
iuc
parents:
diff changeset
51
8714f927a6ee Uploaded
iuc
parents:
diff changeset
52
8714f927a6ee Uploaded
iuc
parents:
diff changeset
class GenericMolFile( data.Text ):
    """
    Abstract class for most of the molecule files.

    Declares the 'number_of_molecules' metadata element and derives the
    dataset blurb from it.
    """
    MetadataElement( name="number_of_molecules", default=0, desc="Number of molecules", readonly=True, visible=True, optional=True, no_value=0 )

    def set_peek( self, dataset, is_multi_byte=False ):
        """
        Set dataset.peek from get_file_peek() and dataset.blurb from the
        'number_of_molecules' metadata; purged datasets get fixed texts.
        """
        if not dataset.dataset.purged:
            # the peek is computed a single time, the file is read only once
            dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            if dataset.metadata.number_of_molecules == 1:
                dataset.blurb = "1 molecule"
            else:
                dataset.blurb = "%s molecules" % dataset.metadata.number_of_molecules
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def get_mime( self ):
        """Return the mime type of the datatype."""
        return 'text/plain'
8714f927a6ee Uploaded
iuc
parents:
diff changeset
73
8714f927a6ee Uploaded
iuc
parents:
diff changeset
class MOL( GenericMolFile ):
    """
    Molecule datatype with the file extension 'mol'.
    """
    file_ext = "mol"

    def sniff( self, filename ):
        """
        Return True if exactly one line of 'filename' starts with 'M',
        optional whitespace and 'END'.
        """
        return count_special_lines(r"^M\s*END", filename) == 1

    def set_meta( self, dataset, **kwd ):
        """
        Set the number of molecules; in the case of MOL it is always one.
        """
        dataset.metadata.number_of_molecules = 1
8714f927a6ee Uploaded
iuc
parents:
diff changeset
87
8714f927a6ee Uploaded
iuc
parents:
diff changeset
88
8714f927a6ee Uploaded
iuc
parents:
diff changeset
class SDF( GenericMolFile ):
    """
    Molecule datatype with the file extension 'sdf'; every record is
    terminated by a line starting with '$$$$'.
    """
    file_ext = "sdf"

    def sniff( self, filename ):
        """
        Return True if at least one line of 'filename' starts with '$$$$'.
        """
        return count_special_lines(r"^\$\$\$\$", filename) > 0

    def set_meta( self, dataset, **kwd ):
        """
        Set the number of molecules in dataset (one per '$$$$' line).
        """
        dataset.metadata.number_of_molecules = count_special_lines(r"^\$\$\$\$", dataset.file_name)

    @classmethod
    def split( cls, input_datasets, subdir_generator_function, split_params ):
        """
        Split the input file by molecule records.

        input_datasets            -- list with exactly one dataset; its
                                     'file_name' is the file that is split
        subdir_generator_function -- called once per part, returns the
                                     directory the part is written into
        split_params              -- dict with 'split_mode' and, for the
                                     mode 'to_size', 'split_size' (records
                                     per part); None means no splitting

        Raises an Exception for more than one input file and for every
        split mode other than 'to_size'. Errors while reading or writing
        are logged and re-raised.
        """
        if split_params is None:
            return None

        if len(input_datasets) > 1:
            raise Exception("SD-file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            raise Exception('Split mode "%s" is currently not implemented for SD-files.' % split_params['split_mode'])
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        def _read_sdf_records( filename ):
            # Yield one list of lines per record. Lines that follow the
            # last '$$$$' delimiter are not yielded.
            lines = []
            with open(filename) as handle:
                for line in handle:
                    lines.append( line )
                    if line.startswith("$$$$"):
                        yield lines
                        lines = []

        def _write_part_sdf_file( accumulated_lines ):
            # Every part keeps the base name of the input file and lives
            # in its own freshly generated directory.
            part_dir = subdir_generator_function()
            part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
            with open(part_path, 'w') as part_file:
                part_file.writelines( accumulated_lines )

        try:
            sdf_records = _read_sdf_records( input_files[0] )
            sdf_lines_accumulated = []
            for counter, sdf_record in enumerate( sdf_records, start=1 ):
                sdf_lines_accumulated.extend( sdf_record )
                if counter % chunk_size == 0:
                    _write_part_sdf_file( sdf_lines_accumulated )
                    sdf_lines_accumulated = []
            # remaining records that did not fill a whole chunk
            if sdf_lines_accumulated:
                _write_part_sdf_file( sdf_lines_accumulated )
        except Exception as e:
            log.error('Unable to split files: %s' % str(e))
            raise
8714f927a6ee Uploaded
iuc
parents:
diff changeset
152
8714f927a6ee Uploaded
iuc
parents:
diff changeset
153
8714f927a6ee Uploaded
iuc
parents:
diff changeset
class MOL2( GenericMolFile ):
    """
    Molecule datatype with the file extension 'mol2'; every record
    starts with a line beginning with '@<TRIPOS>MOLECULE'.
    """
    file_ext = "mol2"

    def sniff( self, filename ):
        """
        Return True if 'filename' contains the tag '@<TRIPOS>MOLECULE'.
        """
        # '<' and '>' are deliberately not backslash-escaped: grep reads
        # the escaped forms as word boundaries, which never match here.
        return count_special_lines("@<TRIPOS>MOLECULE", filename) > 0

    def set_meta( self, dataset, **kwd ):
        """
        Set the number of molecules in dataset (one per
        '@<TRIPOS>MOLECULE' line).
        """
        dataset.metadata.number_of_molecules = count_special_lines("@<TRIPOS>MOLECULE", dataset.file_name)

    @classmethod
    def split( cls, input_datasets, subdir_generator_function, split_params ):
        """
        Split the input file by molecule records.

        input_datasets            -- list with exactly one dataset; its
                                     'file_name' is the file that is split
        subdir_generator_function -- called once per part, returns the
                                     directory the part is written into
        split_params              -- dict with 'split_mode' and, for the
                                     mode 'to_size', 'split_size' (records
                                     per part); None means no splitting

        Raises an Exception for more than one input file and for every
        split mode other than 'to_size'. Errors while reading or writing
        are logged and re-raised.
        """
        if split_params is None:
            return None

        if len(input_datasets) > 1:
            raise Exception("MOL2-file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            raise Exception('Split mode "%s" is currently not implemented for MOL2-files.' % split_params['split_mode'])
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        def _read_mol2_records( filename ):
            # Yield one list of lines per record. Lines in front of the
            # first molecule tag become part of the first record.
            lines = []
            start = True
            with open(filename) as handle:
                for line in handle:
                    if line.startswith("@<TRIPOS>MOLECULE"):
                        if start:
                            start = False
                        else:
                            yield lines
                            lines = []
                    lines.append( line )
            # The last record has no following tag that would flush it,
            # so it is yielded here; nothing is yielded for a file
            # without any molecule tag.
            if not start:
                yield lines

        def _write_part_mol2_file( accumulated_lines ):
            # Every part keeps the base name of the input file and lives
            # in its own freshly generated directory.
            part_dir = subdir_generator_function()
            part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
            with open(part_path, 'w') as part_file:
                part_file.writelines( accumulated_lines )

        try:
            mol2_records = _read_mol2_records( input_files[0] )
            mol2_lines_accumulated = []
            for counter, mol2_record in enumerate( mol2_records, start=1 ):
                mol2_lines_accumulated.extend( mol2_record )
                if counter % chunk_size == 0:
                    _write_part_mol2_file( mol2_lines_accumulated )
                    mol2_lines_accumulated = []
            # remaining records that did not fill a whole chunk
            if mol2_lines_accumulated:
                _write_part_mol2_file( mol2_lines_accumulated )
        except Exception as e:
            log.error('Unable to split files: %s' % str(e))
            raise
8714f927a6ee Uploaded
iuc
parents:
diff changeset
221
8714f927a6ee Uploaded
iuc
parents:
diff changeset
222
8714f927a6ee Uploaded
iuc
parents:
diff changeset
223
8714f927a6ee Uploaded
iuc
parents:
diff changeset
class FPS( GenericMolFile ):
    """
    chemfp fingerprint file: http://code.google.com/p/chem-fingerprints/wiki/FPS
    """
    file_ext = "fps"

    def sniff( self, filename ):
        """
        Return True if the first tab-separated field of the first header
        row returned by get_headers() is '#FPS1'.
        """
        header = get_headers( filename, sep='\t', count=1 )
        # an empty file yields no header row; that is simply "not FPS"
        if not header or not header[0]:
            return False
        return header[0][0].strip() == '#FPS1'

    def set_meta( self, dataset, **kwd ):
        """
        Set the number of molecules in dataset: every line that does not
        start with '#' is counted.
        """
        dataset.metadata.number_of_molecules = count_special_lines('^#', dataset.file_name, invert=True)

    @classmethod
    def split( cls, input_datasets, subdir_generator_function, split_params ):
        """
        Split the input file by fingerprint records.

        All lines starting with '#' that were read so far are written at
        the top of every part.

        input_datasets            -- list with exactly one dataset; its
                                     'file_name' is the file that is split
        subdir_generator_function -- called once per part, returns the
                                     directory the part is written into
        split_params              -- dict with 'split_mode' and, for the
                                     mode 'to_size', 'split_size'
                                     (fingerprints per part); None means
                                     no splitting

        Raises an Exception for more than one input file and for every
        split mode other than 'to_size'. Errors while reading or writing
        are logged and re-raised.
        """
        if split_params is None:
            return None

        if len(input_datasets) > 1:
            raise Exception("FPS-file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            raise Exception('Split mode "%s" is currently not implemented for FPS-files.' % split_params['split_mode'])
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        def _write_part_fingerprint_file( accumulated_lines ):
            # Every part keeps the base name of the input file and lives
            # in its own freshly generated directory.
            part_dir = subdir_generator_function()
            part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
            with open(part_path, 'w') as part_file:
                part_file.writelines( accumulated_lines )

        try:
            header_lines = []
            lines_accumulated = []
            fingerprint_counter = 0
            with open( input_files[0] ) as handle:
                for line in handle:
                    if not line.strip():
                        continue
                    if line.startswith('#'):
                        header_lines.append( line )
                        continue
                    fingerprint_counter += 1
                    lines_accumulated.append( line )
                    # checked only right after a fingerprint was added, so
                    # a comment line can never trigger a header-only part
                    if fingerprint_counter % chunk_size == 0:
                        _write_part_fingerprint_file( header_lines + lines_accumulated )
                        lines_accumulated = []
            # remaining fingerprints that did not fill a whole chunk
            if lines_accumulated:
                _write_part_fingerprint_file( header_lines + lines_accumulated )
        except Exception as e:
            log.error('Unable to split files: %s' % str(e))
            raise

    @staticmethod
    def merge( split_files, output_file ):
        """
        Merging fps files requires merging the header manually.
        We take the header from the first file.

        split_files -- list of paths of the fps files to merge
        output_file -- path of the merged file

        A single input file is handed to data.Text.merge(); an empty
        list raises a ValueError.
        """
        if len(split_files) == 1:
            # For one file only, use base class method (move/copy)
            return data.Text.merge(split_files, output_file)
        if not split_files:
            raise ValueError("No fps files given, %r, to merge into %s"
                             % (split_files, output_file))
        with open(output_file, "w") as out:
            first = True
            for filename in split_files:
                with open(filename) as handle:
                    for line in handle:
                        if line.startswith('#'):
                            # header lines are only copied until the first
                            # fingerprint line has been seen
                            if first:
                                out.write(line)
                        else:
                            # the first header is assumed to be written
                            # once a fingerprint line shows up
                            first = False
                            out.write(line)
8714f927a6ee Uploaded
iuc
parents:
diff changeset
318
8714f927a6ee Uploaded
iuc
parents:
diff changeset
319
8714f927a6ee Uploaded
iuc
parents:
diff changeset
320
8714f927a6ee Uploaded
iuc
parents:
diff changeset
class OBFS( Binary ):
    """OpenBabel Fastsearch format (fs)."""
    file_ext = 'fs'
    composite_type = 'basic'
    allow_datatype_change = False

    MetadataElement( name="base_name", default='OpenBabel Fastsearch Index',
        readonly=True, visible=True, optional=True,)

    def __init__( self, **kwd ):
        """
        A Fastsearch Index consists of a binary file with the fingerprints
        and a pointer to the actual molecule file.
        """
        Binary.__init__(self, **kwd)
        self.add_composite_file('molecule.fs', is_binary=True,
            description='OpenBabel Fastsearch Index' )
        # the molecule file is optional and registered once per text format
        for mol_ext in ('sdf', 'smi', 'inchi', 'mol2', 'cml'):
            self.add_composite_file('molecule.%s' % mol_ext, optional=True,
                is_binary=False, description='Molecule File' )

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text."""
        if not dataset.dataset.purged:
            dataset.peek = "OpenBabel Fastsearch Index"
            dataset.blurb = "OpenBabel Fastsearch Index"
        else:
            dataset.peek = "file does not exist"
            dataset.blurb = "file purged from disk"

    def display_peek( self, dataset ):
        """Create HTML content, used for displaying peek."""
        try:
            return dataset.peek
        except Exception:
            # fall back to the fixed text when the peek cannot be read
            return "OpenBabel Fastsearch Index"

    def display_data( self, trans, data, preview=False, filename=None,
                      to_ext=None, size=None, offset=None, **kwd ):
        """Apparently an old display method, but still gets called.

        This allows us to format the data shown in the central pane via the "eye" icon.
        """
        return "This is a OpenBabel Fastsearch format. You can speed up your similarity and substructure search with it."

    def get_mime( self ):
        """Returns the mime type of the datatype (pretend it is text for peek)"""
        return 'text/plain'

    @staticmethod
    def merge( split_files, output_file, extra_merge_args ):
        """Merging Fastsearch indices is not supported."""
        raise NotImplementedError("Merging Fastsearch indices is not supported.")

    @classmethod
    def split( cls, input_datasets, subdir_generator_function, split_params ):
        """Splitting Fastsearch indices is not supported."""
        if split_params is None:
            return None
        raise NotImplementedError("Splitting Fastsearch indices is not possible.")
8714f927a6ee Uploaded
iuc
parents:
diff changeset
386
8714f927a6ee Uploaded
iuc
parents:
diff changeset
387
8714f927a6ee Uploaded
iuc
parents:
diff changeset
388
8714f927a6ee Uploaded
iuc
parents:
diff changeset
class DRF( GenericMolFile ):
    """Molecule datatype registered under the ``drf`` file extension."""
    file_ext = "drf"

    def set_meta( self, dataset, **kwd ):
        """
        Store the molecule count in ``dataset.metadata.number_of_molecules``.

        The count is whatever ``count_special_lines`` reports for the pattern
        ``"ligand id"`` (double quotes included) with ``invert = True``.
        """
        # presumably invert=True counts the lines that do NOT match the
        # pattern -- confirm against count_special_lines.
        molecule_count = count_special_lines(
            '\"ligand id\"', dataset.file_name, invert = True )
        dataset.metadata.number_of_molecules = molecule_count
8714f927a6ee Uploaded
iuc
parents:
diff changeset
397
8714f927a6ee Uploaded
iuc
parents:
diff changeset
398
8714f927a6ee Uploaded
iuc
parents:
diff changeset
class PHAR( GenericMolFile ):
    """
    Pharmacophore database format from silicos-it.
    """
    file_ext = "phar"

    def set_peek( self, dataset, is_multi_byte=False ):
        """Fill in ``dataset.peek`` and ``dataset.blurb``.

        A purged dataset gets placeholder texts; otherwise the peek is read
        from the file and the blurb is the fixed word "pharmacophore".
        """
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
        dataset.blurb = "pharmacophore"
8714f927a6ee Uploaded
iuc
parents:
diff changeset
411
8714f927a6ee Uploaded
iuc
parents:
diff changeset
412
8714f927a6ee Uploaded
iuc
parents:
diff changeset
class PDB( GenericMolFile ):
    """
    Protein Databank format.
    http://www.wwpdb.org/documentation/format33/v3.3.html
    """
    file_ext = "pdb"

    def sniff( self, filename ):
        """
        Decide whether ``filename`` looks like a PDB file.

        The first space-separated field of every line returned by
        ``get_headers( filename, sep=' ', count=300 )`` is inspected; the
        file is accepted only if HEADER, TITLE, COMPND, SOURCE, KEYWDS and
        EXPDTA have all been seen.
        """
        required_sections = set(['HEADER', 'TITLE', 'COMPND', 'SOURCE', 'KEYWDS', 'EXPDTA'])
        found_sections = set()
        for fields in get_headers( filename, sep=' ', count=300 ):
            record_name = fields[0].strip()
            if record_name in required_sections:
                found_sections.add( record_name )
        return found_sections == required_sections

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and a blurb giving the ATOM and HETATM line counts.

        A purged dataset gets placeholder texts instead.
        """
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        n_atoms = count_special_lines("^ATOM", dataset.file_name)
        n_hetatms = count_special_lines("^HETATM", dataset.file_name)
        dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
        dataset.blurb = "%s atoms and %s HET-atoms" % (n_atoms, n_hetatms)
8714f927a6ee Uploaded
iuc
parents:
diff changeset
451
8714f927a6ee Uploaded
iuc
parents:
diff changeset
452
8714f927a6ee Uploaded
iuc
parents:
diff changeset
class grd( data.Text ):
    """Text datatype for docking grids (``grd`` extension)."""
    file_ext = "grd"

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek from the file and the blurb "grids for docking".

        A purged dataset gets placeholder texts instead.
        """
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
        dataset.blurb = "grids for docking"
8714f927a6ee Uploaded
iuc
parents:
diff changeset
462
8714f927a6ee Uploaded
iuc
parents:
diff changeset
463
8714f927a6ee Uploaded
iuc
parents:
diff changeset
class grdtgz( Binary ):
    """Binary datatype for compressed docking grids (``grd.tgz`` extension)."""
    file_ext = "grd.tgz"

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set fixed peek/blurb texts; the file content itself is never read.

        A purged dataset gets the "purged" placeholder texts instead.
        """
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        dataset.peek = 'binary data'
        dataset.blurb = "compressed grids for docking"
8714f927a6ee Uploaded
iuc
parents:
diff changeset
473
8714f927a6ee Uploaded
iuc
parents:
diff changeset
474
8714f927a6ee Uploaded
iuc
parents:
diff changeset
class InChI( Tabular ):
    """
    Tabular datatype for InChI files; a single column named ``InChI``.
    """
    file_ext = "inchi"
    column_names = [ 'InChI' ]
    # NOTE(review): one column name and one column type are declared, yet the
    # "columns" default is 2 -- confirm whether 1 was intended.
    MetadataElement( name="columns", default=2, desc="Number of columns", readonly=True, visible=False )
    MetadataElement( name="column_types", default=['str'], param=metadata.ColumnTypesParameter, desc="Column types", readonly=True, visible=False )
    MetadataElement( name="number_of_molecules", default=0, desc="Number of molecules", readonly=True, visible=True, optional=True, no_value=0 )

    def set_meta( self, dataset, **kwd ):
        """
        Set ``number_of_molecules`` to the number of data lines in dataset.
        """
        dataset.metadata.number_of_molecules = self.count_data_lines(dataset)

    def set_peek( self, dataset, is_multi_byte=False ):
        """
        Set the peek and a blurb stating the number of molecules.

        A purged dataset gets placeholder texts instead.
        """
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        # The peek is computed once; the original read the file a second time
        # through data.get_file_peek, which is the same function.
        dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
        if dataset.metadata.number_of_molecules == 1:
            dataset.blurb = "1 molecule"
        else:
            dataset.blurb = "%s molecules" % dataset.metadata.number_of_molecules

    def sniff( self, filename ):
        """
        InChI files start with 'InChI='.

        Returns True only if ``get_headers`` yielded at least one line and
        every inspected line starts with 'InChI='.  When no lines are
        returned (e.g. an empty file) the result is False, so such a file
        is not claimed as InChI.
        """
        inchi_lines = get_headers( filename, sep=' ', count=10 )
        if not inchi_lines:
            return False
        for inchi in inchi_lines:
            if not inchi[0].startswith('InChI='):
                return False
        return True
8714f927a6ee Uploaded
iuc
parents:
diff changeset
509
8714f927a6ee Uploaded
iuc
parents:
diff changeset
510
8714f927a6ee Uploaded
iuc
parents:
diff changeset
class SMILES( Tabular ):
    """
    Tabular datatype for SMILES files with the columns SMILES and TITLE.
    """
    file_ext = "smi"
    column_names = [ 'SMILES', 'TITLE' ]
    MetadataElement( name="columns", default=2, desc="Number of columns", readonly=True, visible=False )
    MetadataElement( name="column_types", default=['str','str'], param=metadata.ColumnTypesParameter, desc="Column types", readonly=True, visible=False )
    MetadataElement( name="number_of_molecules", default=0, desc="Number of molecules", readonly=True, visible=True, optional=True, no_value=0 )

    def set_meta( self, dataset, **kwd ):
        """
        Set ``number_of_molecules`` to the number of data lines in dataset.
        """
        dataset.metadata.number_of_molecules = self.count_data_lines(dataset)

    def set_peek( self, dataset, is_multi_byte=False ):
        """
        Set the peek and a blurb stating the number of molecules.

        A purged dataset gets placeholder texts instead.
        """
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        # The peek is computed once; the original read the file a second time
        # through data.get_file_peek, which is the same function.
        dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
        if dataset.metadata.number_of_molecules == 1:
            dataset.blurb = "1 molecule"
        else:
            dataset.blurb = "%s molecules" % dataset.metadata.number_of_molecules
8714f927a6ee Uploaded
iuc
parents:
diff changeset
535
8714f927a6ee Uploaded
iuc
parents:
diff changeset
536
8714f927a6ee Uploaded
iuc
parents:
diff changeset
537 '''
8714f927a6ee Uploaded
iuc
parents:
diff changeset
538 def sniff( self, filename ):
8714f927a6ee Uploaded
iuc
parents:
diff changeset
539 """
8714f927a6ee Uploaded
iuc
parents:
diff changeset
540 Its hard or impossible to sniff a SMILES File. We can
8714f927a6ee Uploaded
iuc
parents:
diff changeset
541 try to import the first SMILES and check if it is a molecule, but
8714f927a6ee Uploaded
iuc
parents:
diff changeset
542 currently its not possible to use external libraries from the toolshed
8714f927a6ee Uploaded
iuc
parents:
diff changeset
543 in datatype definition files. TODO
8714f927a6ee Uploaded
iuc
parents:
diff changeset
544 """
8714f927a6ee Uploaded
iuc
parents:
diff changeset
545 self.molecule_number = count_lines( filename, non_empty = True )
8714f927a6ee Uploaded
iuc
parents:
diff changeset
546 word_count = count_lines( filename )
8714f927a6ee Uploaded
iuc
parents:
diff changeset
547
8714f927a6ee Uploaded
iuc
parents:
diff changeset
548 if self.molecule_number != word_count:
8714f927a6ee Uploaded
iuc
parents:
diff changeset
549 return False
8714f927a6ee Uploaded
iuc
parents:
diff changeset
550
8714f927a6ee Uploaded
iuc
parents:
diff changeset
551 if self.molecule_number > 0:
8714f927a6ee Uploaded
iuc
parents:
diff changeset
552 # test first 3 SMILES
8714f927a6ee Uploaded
iuc
parents:
diff changeset
553 smiles_lines = get_headers( filename, sep='\t', count=3 )
8714f927a6ee Uploaded
iuc
parents:
diff changeset
554 for smiles_line in smiles_lines:
8714f927a6ee Uploaded
iuc
parents:
diff changeset
555 if len(smiles_line) > 2:
8714f927a6ee Uploaded
iuc
parents:
diff changeset
556 return False
8714f927a6ee Uploaded
iuc
parents:
diff changeset
557 smiles = smiles_line[0]
8714f927a6ee Uploaded
iuc
parents:
diff changeset
558 try:
8714f927a6ee Uploaded
iuc
parents:
diff changeset
559 # if we have atoms, we have a molecule
8714f927a6ee Uploaded
iuc
parents:
diff changeset
560 if not len( pybel.readstring('smi', smiles).atoms ) > 0:
8714f927a6ee Uploaded
iuc
parents:
diff changeset
561 return False
8714f927a6ee Uploaded
iuc
parents:
diff changeset
562 except:
8714f927a6ee Uploaded
iuc
parents:
diff changeset
563 # if convert fails its not a smiles string
8714f927a6ee Uploaded
iuc
parents:
diff changeset
564 return False
8714f927a6ee Uploaded
iuc
parents:
diff changeset
565 return True
8714f927a6ee Uploaded
iuc
parents:
diff changeset
566 else:
8714f927a6ee Uploaded
iuc
parents:
diff changeset
567 return False
8714f927a6ee Uploaded
iuc
parents:
diff changeset
568 '''
8714f927a6ee Uploaded
iuc
parents:
diff changeset
569
8714f927a6ee Uploaded
iuc
parents:
diff changeset
570
8714f927a6ee Uploaded
iuc
parents:
diff changeset
571
8714f927a6ee Uploaded
iuc
parents:
diff changeset
class CML( GenericXml ):
    """
    Chemical Markup Language
    http://cml.sourceforge.net/
    """
    file_ext = "cml"
    MetadataElement( name="number_of_molecules", default=0, desc="Number of molecules", readonly=True, visible=True, optional=True, no_value=0 )

    def set_meta( self, dataset, **kwd ):
        """
        Set ``number_of_molecules`` from the lines matching ``^\\s*<molecule``.
        """
        dataset.metadata.number_of_molecules = count_special_lines( r'^\s*<molecule', dataset.file_name )

    def set_peek( self, dataset, is_multi_byte=False ):
        """
        Set the peek and a blurb stating the number of molecules.

        A purged dataset gets placeholder texts instead.
        """
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        # The peek is computed once; the original read the file a second time
        # through data.get_file_peek, which is the same function.
        dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
        if dataset.metadata.number_of_molecules == 1:
            dataset.blurb = "1 molecule"
        else:
            dataset.blurb = "%s molecules" % dataset.metadata.number_of_molecules

    def sniff( self, filename ):
        """
        Try to guess if the file is a CML file.

        The first line must be exactly the XML declaration
        ``<?xml version="1.0"?>`` (surrounding whitespace ignored) and the
        second line must contain ``http://www.xml-cml.org/schema``.
        TODO: add true positive test, need to submit a CML example

        >>> fname = get_test_fname( 'interval.interval' )
        >>> CML().sniff( fname )
        False
        """
        # "with" closes the handle on every return path and on errors.
        with open(filename) as handle:
            if handle.readline().strip() != '<?xml version="1.0"?>':
                return False
            return 'http://www.xml-cml.org/schema' in handle.readline()

    @classmethod
    def split( cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input file by molecule records.

        Only the ``to_size`` split mode is implemented: every part file
        receives ``split_params['split_size']`` molecule records (the last
        one possibly fewer), wrapped in a fresh CML header and footer.
        Each part is written into a directory obtained from
        ``subdir_generator_function()`` under the input file's base name.

        Returns None without doing anything when ``split_params`` is None.
        Raises Exception for more than one input dataset or for any other
        split mode; errors during splitting are logged and re-raised.
        """
        if split_params is None:
            return None

        if len(input_datasets) > 1:
            raise Exception("CML-file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            raise Exception('Split mode "%s" is currently not implemented for CML-files.' % split_params['split_mode'])
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        def _read_cml_records( filename ):
            # Yield the lines of one record at a time; a record ends with the
            # line starting with </molecule>.  Header and footer lines of the
            # input are dropped because every part gets its own.
            skipped_prefixes = ('<?xml version="1.0"?>',
                                '<cml xmlns="http://www.xml-cml.org/schema',
                                '</cml>')
            lines = []
            with open(filename) as handle:
                for line in handle:
                    stripped = line.lstrip()
                    if stripped.startswith( skipped_prefixes ):
                        continue
                    lines.append( line )
                    if stripped.startswith('</molecule>'):
                        yield lines
                        lines = []

        header_lines = ['<?xml version="1.0"?>\n', '<cml xmlns="http://www.xml-cml.org/schema">\n']
        footer_line = ['</cml>\n']

        def _write_part_cml_file( accumulated_lines ):
            # Write one complete CML document into a new part directory.
            part_dir = subdir_generator_function()
            part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
            with open(part_path, 'w') as part_file:
                part_file.writelines( header_lines )
                part_file.writelines( accumulated_lines )
                part_file.writelines( footer_line )

        try:
            cml_records = _read_cml_records( input_files[0] )
            cml_lines_accumulated = []
            for counter, cml_record in enumerate( cml_records, start = 1):
                cml_lines_accumulated.extend( cml_record )
                if counter % chunk_size == 0:
                    _write_part_cml_file( cml_lines_accumulated )
                    cml_lines_accumulated = []
            # Remaining records that did not fill a whole chunk.
            if cml_lines_accumulated:
                _write_part_cml_file( cml_lines_accumulated )
        except Exception as e:
            log.error('Unable to split files: %s' % str(e))
            raise

    @staticmethod
    def merge(split_files, output_file):
        """
        Merge several CML files into ``output_file``.

        A single input file is handed to the base class merge (move/copy).
        Otherwise the two header lines (XML declaration and ``<cml ...>``
        opening tag) are taken from the first file, followed by everything
        from the first ``<molecule`` line onwards of every file (``</cml>``
        lines dropped), and one closing ``</cml>`` line.

        Raises ValueError if ``split_files`` is empty, if a file is empty,
        or if a file does not start with the two expected header lines.
        """
        if len(split_files) == 1:
            #For one file only, use base class method (move/copy)
            # Text is not imported by name in this module; it is reached
            # through the imported ``data`` module.
            return data.Text.merge(split_files, output_file)
        if not split_files:
            raise ValueError("Given no CML files, %r, to merge into %s" \
                             % (split_files, output_file))
        with open(output_file, "w") as out:
            for index, filename in enumerate( split_files ):
                with open( filename ) as handle:
                    header = handle.readline()
                    if not header:
                        raise ValueError("CML file %s was empty" % filename)
                    if not header.lstrip().startswith('<?xml version="1.0"?>'):
                        raise ValueError("%s is not a valid XML file!" % filename)
                    line = handle.readline()
                    if not line.lstrip().startswith('<cml xmlns="http://www.xml-cml.org/schema'):
                        raise ValueError("%s is not a CML file!" % filename)
                    if index == 0:
                        # Emit the header exactly once so that the closing
                        # </cml> written below has a matching opening tag.
                        out.write( header + line )
                    molecule_found = False
                    for line in handle:
                        # we found two required header lines, the next line should start with <molecule >
                        if line.lstrip().startswith('</cml>'):
                            continue
                        if line.lstrip().startswith('<molecule'):
                            molecule_found = True
                        if molecule_found:
                            out.write( line )
            out.write("</cml>\n")
8714f927a6ee Uploaded
iuc
parents:
diff changeset
715
8714f927a6ee Uploaded
iuc
parents:
diff changeset
716
8714f927a6ee Uploaded
iuc
parents:
diff changeset
if __name__ == '__main__':
    # Ad-hoc sniffer checks; each print shows the result of one sniff call.
    # Lines carrying a trailing label ('cml', 'inchi', ...) sniff a file of
    # the matching type.
    #
    # TODO: We need to figure out, how to put example files under
    # /lib/galaxy/datatypes/test/ from a toolshed, so that doctest can work
    # properly.
    inchi = get_test_fname('drugbank_drugs.inchi')
    smiles = get_test_fname('drugbank_drugs.smi')
    sdf = get_test_fname('drugbank_drugs.sdf')
    fps = get_test_fname('50_chemfp_fingerprints_FPS1.fps')
    pdb = get_test_fname('2zbz.pdb')
    # NOTE(review): developer-local absolute path; replace it once a CML
    # example file is available in the test directory.
    cml = get_test_fname('/home/bag/Downloads/approved.cml')

    # print(...) with a single argument gives the same output on Python 2
    # and Python 3; labels are merged in with % formatting for that reason.
    print('CML test')
    print('%s cml' % CML().sniff(cml))
    print(CML().sniff(inchi))
    print(CML().sniff(pdb))
    # The former bare "CML().split()" call was dropped: split() requires
    # three arguments, so the call always raised TypeError and none of the
    # checks below were ever reached.

    # SMILES has no sniffer yet, so these checks stay disabled:
    # print('SMILES test')
    # print('%s smi' % SMILES().sniff(smiles))
    # print(SMILES().sniff(inchi))
    # print(SMILES().sniff(pdb))

    print('InChI test')
    print(InChI().sniff(smiles))
    print(InChI().sniff(sdf))
    print('%s inchi' % InChI().sniff(inchi))

    print('FPS test')
    print(FPS().sniff(smiles))
    print(FPS().sniff(sdf))
    f = FPS()
    print(f.sniff(fps))

    print('SDF test')
    print(SDF().sniff(smiles))
    print('%s sdf' % SDF().sniff(sdf))
    print(SDF().sniff(fps))

    print('PDB test')
    print(PDB().sniff(smiles))
    print(PDB().sniff(sdf))
    print(PDB().sniff(fps))
    print('%s pdb' % PDB().sniff(pdb))