Mercurial > repos > bgruening > rdconf
comparison dimorphite_dl.py @ 0:55a2082540a9 draft default tip
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit c1d813d3f0fec60ea6efe8a11e59d98bfdc1636f"
author | bgruening |
---|---|
date | Sat, 04 Dec 2021 16:36:56 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:55a2082540a9 |
---|---|
1 # flake8: noqa | |
2 # Copyright 2018 Jacob D. Durrant | |
3 # | |
4 # Licensed under the Apache License, Version 2.0 (the "License"); | |
5 # you may not use this file except in compliance with the License. | |
6 # You may obtain a copy of the License at | |
7 # | |
8 # http://www.apache.org/licenses/LICENSE-2.0 | |
9 # | |
10 # Unless required by applicable law or agreed to in writing, software | |
11 # distributed under the License is distributed on an "AS IS" BASIS, | |
12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
13 # See the License for the specific language governing permissions and | |
14 # limitations under the License. | |
15 | |
16 """ | |
17 This script identifies and enumerates the possible protonation sites of SMILES | |
18 strings. | |
19 """ | |
20 | |
21 from __future__ import print_function | |
22 | |
23 import argparse | |
24 import os | |
25 import sys | |
26 | |
27 try: | |
28 # Python2 | |
29 from StringIO import StringIO | |
30 except ImportError: | |
31 # Python3 | |
32 from io import StringIO | |
33 | |
34 # Always let the user know a help file is available. | |
35 print("\nFor help, use: python dimorphite_dl.py --help") | |
36 | |
37 # And always report citation information. | |
38 print("\nIf you use Dimorphite-DL in your research, please cite:") | |
39 print("Ropp PJ, Kaminsky JC, Yablonski S, Durrant JD (2019) Dimorphite-DL: An") | |
40 print("open-source program for enumerating the ionization states of drug-like small") | |
41 print("molecules. J Cheminform 11:14. doi:10.1186/s13321-019-0336-9.\n") | |
42 | |
43 try: | |
44 import rdkit | |
45 from rdkit import Chem | |
46 from rdkit.Chem import AllChem | |
47 except Exception: | |
48 msg = "Dimorphite-DL requires RDKit. See https://www.rdkit.org/" | |
49 print(msg) | |
50 raise Exception(msg) | |
51 | |
52 | |
53 def main(params=None): | |
54 """The main definition run when you call the script from the commandline. | |
55 | |
56 :param params: The parameters to use. Entirely optional. If absent, | |
57 defaults to None, in which case argments will be taken from | |
58 those given at the command line. | |
59 :param params: dict, optional | |
60 :return: Returns a list of the SMILES strings return_as_list parameter is | |
61 True. Otherwise, returns None. | |
62 """ | |
63 | |
64 parser = ArgParseFuncs.get_args() | |
65 args = vars(parser.parse_args()) | |
66 | |
67 # Add in any parameters in params. | |
68 if params is not None: | |
69 for k, v in params.items(): | |
70 args[k] = v | |
71 | |
72 # If being run from the command line, print out all parameters. | |
73 if __name__ == "__main__": | |
74 print("\nPARAMETERS:\n") | |
75 for k in sorted(args.keys()): | |
76 print(k.rjust(13) + ": " + str(args[k])) | |
77 print("") | |
78 | |
79 if args["test"]: | |
80 # Run tests. | |
81 TestFuncs.test() | |
82 else: | |
83 # Run protonation | |
84 if "output_file" in args and args["output_file"] is not None: | |
85 # An output file was specified, so write to that. | |
86 with open(args["output_file"], "w") as file: | |
87 for protonated_smi in Protonate(args): | |
88 file.write(protonated_smi + "\n") | |
89 elif "return_as_list" in args and args["return_as_list"]: | |
90 return list(Protonate(args)) | |
91 else: | |
92 # No output file specified. Just print it to the screen. | |
93 for protonated_smi in Protonate(args): | |
94 print(protonated_smi) | |
95 | |
96 | |
97 class MyParser(argparse.ArgumentParser): | |
98 """Overwrite default parse so it displays help file on error. See | |
99 https://stackoverflow.com/questions/4042452/display-help-message-with-python-argparse-when-script-is-called-without-any-argu""" | |
100 | |
101 def error(self, message): | |
102 """Overwrites the default error message. | |
103 | |
104 :param message: The default error message. | |
105 """ | |
106 | |
107 self.print_help() | |
108 msg = "ERROR: %s\n\n" % message | |
109 print(msg) | |
110 raise Exception(msg) | |
111 | |
112 def print_help(self, file=None): | |
113 """Overwrite the default print_help function | |
114 | |
115 :param file: Output file, defaults to None | |
116 """ | |
117 | |
118 print("") | |
119 | |
120 if file is None: | |
121 file = sys.stdout | |
122 self._print_message(self.format_help(), file) | |
123 print( | |
124 """ | |
125 examples: | |
126 python dimorphite_dl.py --smiles_file sample_molecules.smi | |
127 python dimorphite_dl.py --smiles "CCC(=O)O" --min_ph -3.0 --max_ph -2.0 | |
128 python dimorphite_dl.py --smiles "CCCN" --min_ph -3.0 --max_ph -2.0 --output_file output.smi | |
129 python dimorphite_dl.py --smiles_file sample_molecules.smi --pka_precision 2.0 --label_states | |
130 python dimorphite_dl.py --test""" | |
131 ) | |
132 print("") | |
133 | |
134 | |
135 class ArgParseFuncs: | |
136 """A namespace for storing functions that are useful for processing | |
137 command-line arguments. To keep things organized.""" | |
138 | |
139 @staticmethod | |
140 def get_args(): | |
141 """Gets the arguments from the command line. | |
142 | |
143 :return: A parser object. | |
144 """ | |
145 | |
146 parser = MyParser( | |
147 description="Dimorphite 1.2: Creates models of " | |
148 + "appropriately protonated small moleucles. " | |
149 + "Apache 2.0 License. Copyright 2018 Jacob D. " | |
150 + "Durrant." | |
151 ) | |
152 parser.add_argument( | |
153 "--min_ph", | |
154 metavar="MIN", | |
155 type=float, | |
156 default=6.4, | |
157 help="minimum pH to consider (default: 6.4)", | |
158 ) | |
159 parser.add_argument( | |
160 "--max_ph", | |
161 metavar="MAX", | |
162 type=float, | |
163 default=8.4, | |
164 help="maximum pH to consider (default: 8.4)", | |
165 ) | |
166 parser.add_argument( | |
167 "--pka_precision", | |
168 metavar="PRE", | |
169 type=float, | |
170 default=1.0, | |
171 help="pKa precision factor (number of standard devations, default: 1.0)", | |
172 ) | |
173 parser.add_argument( | |
174 "--smiles", metavar="SMI", type=str, help="SMILES string to protonate" | |
175 ) | |
176 parser.add_argument( | |
177 "--smiles_file", | |
178 metavar="FILE", | |
179 type=str, | |
180 help="file that contains SMILES strings to protonate", | |
181 ) | |
182 parser.add_argument( | |
183 "--output_file", | |
184 metavar="FILE", | |
185 type=str, | |
186 help="output file to write protonated SMILES (optional)", | |
187 ) | |
188 parser.add_argument( | |
189 "--label_states", | |
190 action="store_true", | |
191 help="label protonated SMILES with target state " | |
192 + '(i.e., "DEPROTONATED", "PROTONATED", or "BOTH").', | |
193 ) | |
194 parser.add_argument( | |
195 "--test", action="store_true", help="run unit tests (for debugging)" | |
196 ) | |
197 | |
198 return parser | |
199 | |
200 @staticmethod | |
201 def clean_args(args): | |
202 """Cleans and normalizes input parameters | |
203 | |
204 :param args: A dictionary containing the arguments. | |
205 :type args: dict | |
206 :raises Exception: No SMILES in params. | |
207 """ | |
208 | |
209 defaults = { | |
210 "min_ph": 6.4, | |
211 "max_ph": 8.4, | |
212 "pka_precision": 1.0, | |
213 "label_states": False, | |
214 "test": False, | |
215 } | |
216 | |
217 for key in defaults: | |
218 if key not in args: | |
219 args[key] = defaults[key] | |
220 | |
221 keys = list(args.keys()) | |
222 for key in keys: | |
223 if args[key] is None: | |
224 del args[key] | |
225 | |
226 if not "smiles" in args and not "smiles_file" in args: | |
227 msg = "Error: No SMILES in params. Use the -h parameter for help." | |
228 print(msg) | |
229 raise Exception(msg) | |
230 | |
231 # If the user provides a smiles string, turn it into a file-like StringIO | |
232 # object. | |
233 if "smiles" in args: | |
234 if isinstance(args["smiles"], str): | |
235 args["smiles_file"] = StringIO(args["smiles"]) | |
236 | |
237 args["smiles_and_data"] = LoadSMIFile(args["smiles_file"]) | |
238 | |
239 return args | |
240 | |
241 | |
242 class UtilFuncs: | |
243 """A namespace to store functions for manipulating mol objects. To keep | |
244 things organized.""" | |
245 | |
246 @staticmethod | |
247 def neutralize_mol(mol): | |
248 """All molecules should be neuralized to the extent possible. The user | |
249 should not be allowed to specify the valence of the atoms in most cases. | |
250 | |
251 :param rdkit.Chem.rdchem.Mol mol: The rdkit Mol objet to be neutralized. | |
252 :return: The neutralized Mol object. | |
253 """ | |
254 | |
255 # Get the reaction data | |
256 rxn_data = [ | |
257 [ | |
258 "[Ov1-1:1]", | |
259 "[Ov2+0:1]-[H]", | |
260 ], # To handle O- bonded to only one atom (add hydrogen). | |
261 [ | |
262 "[#7v4+1:1]-[H]", | |
263 "[#7v3+0:1]", | |
264 ], # To handle N+ bonded to a hydrogen (remove hydrogen). | |
265 [ | |
266 "[Ov2-:1]", | |
267 "[Ov2+0:1]", | |
268 ], # To handle O- bonded to two atoms. Should not be Negative. | |
269 [ | |
270 "[#7v3+1:1]", | |
271 "[#7v3+0:1]", | |
272 ], # To handle N+ bonded to three atoms. Should not be positive. | |
273 [ | |
274 "[#7v2-1:1]", | |
275 "[#7+0:1]-[H]", | |
276 ], # To handle N- Bonded to two atoms. Add hydrogen. | |
277 # ['[N:1]=[N+0:2]=[N:3]-[H]', '[N:1]=[N+1:2]=[N+0:3]-[H]'], # To | |
278 # handle bad azide. Must be protonated. (Now handled elsewhere, before | |
279 # SMILES converted to Mol object.) | |
280 [ | |
281 "[H]-[N:1]-[N:2]#[N:3]", | |
282 "[N:1]=[N+1:2]=[N:3]-[H]", | |
283 ], # To handle bad azide. R-N-N#N should be R-N=[N+]=N | |
284 ] | |
285 | |
286 # Add substructures and reactions (initially none) | |
287 for i, rxn_datum in enumerate(rxn_data): | |
288 rxn_data[i].append(Chem.MolFromSmarts(rxn_datum[0])) | |
289 rxn_data[i].append(None) | |
290 | |
291 # Add hydrogens (respects valence, so incomplete). | |
292 # Chem.calcImplicitValence(mol) | |
293 mol.UpdatePropertyCache(strict=False) | |
294 mol = Chem.AddHs(mol) | |
295 | |
296 while True: # Keep going until all these issues have been resolved. | |
297 current_rxn = None # The reaction to perform. | |
298 current_rxn_str = None | |
299 | |
300 for i, rxn_datum in enumerate(rxn_data): | |
301 ( | |
302 reactant_smarts, | |
303 product_smarts, | |
304 substruct_match_mol, | |
305 rxn_placeholder, | |
306 ) = rxn_datum | |
307 if mol.HasSubstructMatch(substruct_match_mol): | |
308 if rxn_placeholder is None: | |
309 current_rxn_str = reactant_smarts + ">>" + product_smarts | |
310 current_rxn = AllChem.ReactionFromSmarts(current_rxn_str) | |
311 rxn_data[i][3] = current_rxn # Update the placeholder. | |
312 else: | |
313 current_rxn = rxn_data[i][3] | |
314 break | |
315 | |
316 # Perform the reaction if necessary | |
317 if current_rxn is None: # No reaction left, so break out of while loop. | |
318 break | |
319 else: | |
320 mol = current_rxn.RunReactants((mol,))[0][0] | |
321 mol.UpdatePropertyCache(strict=False) # Update valences | |
322 | |
323 # The mols have been altered from the reactions described above, we need | |
324 # to resanitize them. Make sure aromatic rings are shown as such This | |
325 # catches all RDKit Errors. without the catchError and sanitizeOps the | |
326 # Chem.SanitizeMol can crash the program. | |
327 sanitize_string = Chem.SanitizeMol( | |
328 mol, | |
329 sanitizeOps=rdkit.Chem.rdmolops.SanitizeFlags.SANITIZE_ALL, | |
330 catchErrors=True, | |
331 ) | |
332 | |
333 return mol if sanitize_string.name == "SANITIZE_NONE" else None | |
334 | |
335 @staticmethod | |
336 def convert_smiles_str_to_mol(smiles_str): | |
337 """Given a SMILES string, check that it is actually a string and not a | |
338 None. Then try to convert it to an RDKit Mol Object. | |
339 | |
340 :param string smiles_str: The SMILES string. | |
341 :return: A rdkit.Chem.rdchem.Mol object, or None if it is the wrong type or | |
342 if it fails to convert to a Mol Obj | |
343 """ | |
344 | |
345 # Check that there are no type errors, ie Nones or non-string | |
346 # A non-string type will cause RDKit to hard crash | |
347 if smiles_str is None or type(smiles_str) is not str: | |
348 return None | |
349 | |
350 # Try to fix azides here. They are just tricky to deal with. | |
351 smiles_str = smiles_str.replace("N=N=N", "N=[N+]=N") | |
352 smiles_str = smiles_str.replace("NN#N", "N=[N+]=N") | |
353 | |
354 # Now convert to a mol object. Note the trick that is necessary to | |
355 # capture RDKit error/warning messages. See | |
356 # https://stackoverflow.com/questions/24277488/in-python-how-to-capture-the-stdout-from-a-c-shared-library-to-a-variable | |
357 stderr_fileno = sys.stderr.fileno() | |
358 stderr_save = os.dup(stderr_fileno) | |
359 stderr_pipe = os.pipe() | |
360 os.dup2(stderr_pipe[1], stderr_fileno) | |
361 os.close(stderr_pipe[1]) | |
362 | |
363 mol = Chem.MolFromSmiles(smiles_str) | |
364 | |
365 os.close(stderr_fileno) | |
366 os.close(stderr_pipe[0]) | |
367 os.dup2(stderr_save, stderr_fileno) | |
368 os.close(stderr_save) | |
369 | |
370 # Check that there are None type errors Chem.MolFromSmiles has sanitize on | |
371 # which means if there is even a small error in the SMILES (kekulize, | |
372 # nitrogen charge...) then mol=None. ie. | |
373 # Chem.MolFromSmiles("C[N]=[N]=[N]") = None this is an example of an | |
374 # nitrogen charge error. It is cased in a try statement to be overly | |
375 # cautious. | |
376 | |
377 return None if mol is None else mol | |
378 | |
379 @staticmethod | |
380 def eprint(*args, **kwargs): | |
381 """Error messages should be printed to STDERR. See | |
382 https://stackoverflow.com/questions/5574702/how-to-print-to-stderr-in-python""" | |
383 | |
384 print(*args, file=sys.stderr, **kwargs) | |
385 | |
386 | |
387 class LoadSMIFile(object): | |
388 """A generator class for loading in the SMILES strings from a file, one at | |
389 a time.""" | |
390 | |
391 def __init__(self, filename): | |
392 """Initializes this class. | |
393 | |
394 :param filename: The filename or file object (i.e., StringIO). | |
395 :type filename: str or StringIO | |
396 """ | |
397 | |
398 if type(filename) is str: | |
399 # It's a filename | |
400 self.f = open(filename, "r") | |
401 else: | |
402 # It's a file object (i.e., StringIO) | |
403 self.f = filename | |
404 | |
405 def __iter__(self): | |
406 """Returns this generator object. | |
407 | |
408 :return: This generator object. | |
409 :rtype: LoadSMIFile | |
410 """ | |
411 | |
412 return self | |
413 | |
414 def __next__(self): | |
415 """Ensure Python3 compatibility. | |
416 | |
417 :return: A dict, where the "smiles" key contains the canonical SMILES | |
418 string and the "data" key contains the remaining information | |
419 (e.g., the molecule name). | |
420 :rtype: dict | |
421 """ | |
422 | |
423 return self.next() | |
424 | |
425 def next(self): | |
426 """Get the data associated with the next line. | |
427 | |
428 :raises StopIteration: If there are no more lines left iin the file. | |
429 :return: A dict, where the "smiles" key contains the canonical SMILES | |
430 string and the "data" key contains the remaining information | |
431 (e.g., the molecule name). | |
432 :rtype: dict | |
433 """ | |
434 | |
435 line = self.f.readline() | |
436 | |
437 if line == "": | |
438 # EOF | |
439 self.f.close() | |
440 raise StopIteration() | |
441 return | |
442 | |
443 # Divide line into smi and data | |
444 splits = line.split() | |
445 if len(splits) != 0: | |
446 # Generate mol object | |
447 smiles_str = splits[0] | |
448 | |
449 # Convert from SMILES string to RDKIT Mol. This series of tests is | |
450 # to make sure the SMILES string is properly formed and to get it | |
451 # into a canonical form. Filter if failed. | |
452 mol = UtilFuncs.convert_smiles_str_to_mol(smiles_str) | |
453 if mol is None: | |
454 UtilFuncs.eprint( | |
455 "WARNING: Skipping poorly formed SMILES string: " + line | |
456 ) | |
457 return self.next() | |
458 | |
459 # Handle nuetralizing the molecules. Filter if failed. | |
460 mol = UtilFuncs.neutralize_mol(mol) | |
461 if mol is None: | |
462 UtilFuncs.eprint( | |
463 "WARNING: Skipping poorly formed SMILES string: " + line | |
464 ) | |
465 return self.next() | |
466 | |
467 # Remove the hydrogens. | |
468 try: | |
469 mol = Chem.RemoveHs(mol) | |
470 except Exception: | |
471 UtilFuncs.eprint( | |
472 "WARNING: Skipping poorly formed SMILES string: " + line | |
473 ) | |
474 return self.next() | |
475 | |
476 if mol is None: | |
477 UtilFuncs.eprint( | |
478 "WARNING: Skipping poorly formed SMILES string: " + line | |
479 ) | |
480 return self.next() | |
481 | |
482 # Regenerate the smiles string (to standardize). | |
483 new_mol_string = Chem.MolToSmiles(mol, isomericSmiles=True) | |
484 | |
485 return {"smiles": new_mol_string, "data": splits[1:]} | |
486 else: | |
487 # Blank line? Go to next one. | |
488 return self.next() | |
489 | |
490 | |
491 class Protonate(object): | |
492 """A generator class for protonating SMILES strings, one at a time.""" | |
493 | |
494 def __init__(self, args): | |
495 """Initialize the generator. | |
496 | |
497 :param args: A dictionary containing the arguments. | |
498 :type args: dict | |
499 """ | |
500 | |
501 # Make the args an object variable variable. | |
502 self.args = args | |
503 | |
504 # A list to store the protonated SMILES strings associated with a | |
505 # single input model. | |
506 self.cur_prot_SMI = [] | |
507 | |
508 # Clean and normalize the args | |
509 self.args = ArgParseFuncs.clean_args(args) | |
510 | |
511 # Load the substructures that can be protonated. | |
512 self.subs = ProtSubstructFuncs.load_protonation_substructs_calc_state_for_ph( | |
513 self.args["min_ph"], self.args["max_ph"], self.args["pka_precision"] | |
514 ) | |
515 | |
516 def __iter__(self): | |
517 """Returns this generator object. | |
518 | |
519 :return: This generator object. | |
520 :rtype: Protonate | |
521 """ | |
522 | |
523 return self | |
524 | |
525 def __next__(self): | |
526 """Ensure Python3 compatibility. | |
527 | |
528 :return: A dict, where the "smiles" key contains the canonical SMILES | |
529 string and the "data" key contains the remaining information | |
530 (e.g., the molecule name). | |
531 :rtype: dict | |
532 """ | |
533 | |
534 return self.next() | |
535 | |
536 def next(self): | |
537 """Get the next protonated SMILES string. | |
538 | |
539 :raises StopIteration: If there are no more lines left iin the file. | |
540 :return: TODO A dict, where the "smiles" key contains the canonical SMILES | |
541 string and the "data" key contains the remaining information | |
542 (e.g., the molecule name). | |
543 :rtype: dict | |
544 """ | |
545 | |
546 # If there are any SMILES strings in self.cur_prot_SMI, just return | |
547 # the first one and update the list to include only the remaining. | |
548 if len(self.cur_prot_SMI) > 0: | |
549 first, self.cur_prot_SMI = self.cur_prot_SMI[0], self.cur_prot_SMI[1:] | |
550 return first | |
551 | |
552 # self.cur_prot_SMI is empty, so try to add more to it. | |
553 | |
554 # Get the next SMILES string from the input file. | |
555 try: | |
556 smile_and_datum = self.args["smiles_and_data"].next() | |
557 except StopIteration: | |
558 # There are no more input smiles strings... | |
559 raise StopIteration() | |
560 | |
561 smi = smile_and_datum["smiles"] | |
562 data = smile_and_datum["data"] # Everything on SMILES line but the | |
563 # SMILES string itself (e.g., the | |
564 # molecule name). | |
565 | |
566 # Collect the data associated with this smiles (e.g., the molecule | |
567 # name). | |
568 tag = " ".join(data) | |
569 | |
570 # sites is a list of (atom index, "PROTONATED|DEPROTONATED|BOTH"). | |
571 # Note that the second entry indicates what state the site SHOULD be | |
572 # in (not the one it IS in per the SMILES string). It's calculated | |
573 # based on the probablistic distributions obtained during training. | |
574 sites = ProtSubstructFuncs.get_prot_sites_and_target_states(smi, self.subs) | |
575 | |
576 new_smis = [smi] | |
577 for site in sites: | |
578 # Make a new smiles with the correct protonation state. Note that | |
579 # new_smis is a growing list. This is how multiple protonation | |
580 # sites are handled. | |
581 | |
582 # new_smis_to_perhaps_add = ProtSubstructFuncs.protonate_site(new_smis, site) | |
583 new_smis = ProtSubstructFuncs.protonate_site(new_smis, site) | |
584 # print(site, new_smis) # Good for debugging. | |
585 | |
586 # Only add new smiles if not already in the list. | |
587 # for s in new_smis_to_perhaps_add: | |
588 # if not s in new_smis: | |
589 # new_smis.append(s) | |
590 | |
591 # In some cases, the script might generate redundant molecules. | |
592 # Phosphonates, when the pH is between the two pKa values and the | |
593 # stdev value is big enough, for example, will generate two identical | |
594 # BOTH states. Let's remove this redundancy. | |
595 new_smis = list(set(new_smis)) | |
596 | |
597 # Deprotonating protonated aromatic nitrogen gives [nH-]. Change this | |
598 # to [n-]. This is a hack. | |
599 new_smis = [s.replace("[nH-]", "[n-]") for s in new_smis] | |
600 | |
601 # Sometimes Dimorphite-DL generates molecules that aren't actually | |
602 # possible. Simply convert these to mol objects to eliminate the bad | |
603 # ones (that are None). | |
604 new_smis = [ | |
605 s for s in new_smis if UtilFuncs.convert_smiles_str_to_mol(s) is not None | |
606 ] | |
607 | |
608 # If there are no smi left, return the input one at the very least. | |
609 # All generated forms have apparently been judged | |
610 # inappropriate/mal-formed. | |
611 if len(new_smis) == 0: | |
612 new_smis = [smi] | |
613 | |
614 # If the user wants to see the target states, add those | |
615 # to the ends of each line. | |
616 if self.args["label_states"]: | |
617 states = "\t".join([x[1] for x in sites]) | |
618 new_lines = [x + "\t" + tag + "\t" + states for x in new_smis] | |
619 else: | |
620 new_lines = [x + "\t" + tag for x in new_smis] | |
621 | |
622 self.cur_prot_SMI = new_lines | |
623 | |
624 return self.next() | |
625 | |
626 | |
627 class ProtSubstructFuncs: | |
628 """A namespace to store functions for loading the substructures that can | |
629 be protonated. To keep things organized.""" | |
630 | |
631 @staticmethod | |
632 def load_protonation_substructs_calc_state_for_ph( | |
633 min_ph=6.4, max_ph=8.4, pka_std_range=1 | |
634 ): | |
635 """A pre-calculated list of R-groups with protonation sites, with their | |
636 likely pKa bins. | |
637 | |
638 :param float min_ph: The lower bound on the pH range, defaults to 6.4. | |
639 :param float max_ph: The upper bound on the pH range, defaults to 8.4. | |
640 :param pka_std_range: Basically the precision (stdev from predicted pKa to | |
641 consider), defaults to 1. | |
642 :return: A dict of the protonation substructions for the specified pH | |
643 range. | |
644 """ | |
645 | |
646 subs = [] | |
647 pwd = os.path.dirname(os.path.realpath(__file__)) | |
648 | |
649 site_structures_file = "{}/{}".format(pwd, "site_substructures.smarts") | |
650 with open(site_structures_file, "r") as substruct: | |
651 for line in substruct: | |
652 line = line.strip() | |
653 sub = {} | |
654 if line is not "": | |
655 splits = line.split() | |
656 sub["name"] = splits[0] | |
657 sub["smart"] = splits[1] | |
658 sub["mol"] = Chem.MolFromSmarts(sub["smart"]) | |
659 | |
660 # NEED TO DIVIDE THIS BY 3s | |
661 pka_ranges = [ | |
662 splits[i : i + 3] for i in range(2, len(splits) - 1, 3) | |
663 ] | |
664 | |
665 prot = [] | |
666 for pka_range in pka_ranges: | |
667 site = pka_range[0] | |
668 std = float(pka_range[2]) * pka_std_range | |
669 mean = float(pka_range[1]) | |
670 protonation_state = ProtSubstructFuncs.define_protonation_state( | |
671 mean, std, min_ph, max_ph | |
672 ) | |
673 | |
674 prot.append([site, protonation_state]) | |
675 | |
676 sub["prot_states_for_pH"] = prot | |
677 subs.append(sub) | |
678 return subs | |
679 | |
680 @staticmethod | |
681 def define_protonation_state(mean, std, min_ph, max_ph): | |
682 """Updates the substructure definitions to include the protonation state | |
683 based on the user-given pH range. The size of the pKa range is also based | |
684 on the number of standard deviations to be considered by the user param. | |
685 | |
686 :param float mean: The mean pKa. | |
687 :param float std: The precision (stdev). | |
688 :param float min_ph: The min pH of the range. | |
689 :param float max_ph: The max pH of the range. | |
690 :return: A string describing the protonation state. | |
691 """ | |
692 | |
693 min_pka = mean - std | |
694 max_pka = mean + std | |
695 | |
696 # This needs to be reassigned, and 'ERROR' should never make it past the | |
697 # next set of checks. | |
698 if min_pka <= max_ph and min_ph <= max_pka: | |
699 protonation_state = "BOTH" | |
700 elif mean > max_ph: | |
701 protonation_state = "PROTONATED" | |
702 else: | |
703 protonation_state = "DEPROTONATED" | |
704 | |
705 return protonation_state | |
706 | |
707 @staticmethod | |
708 def get_prot_sites_and_target_states(smi, subs): | |
709 """For a single molecule, find all possible matches in the protonation | |
710 R-group list, subs. Items that are higher on the list will be matched | |
711 first, to the exclusion of later items. | |
712 | |
713 :param string smi: A SMILES string. | |
714 :param list subs: Substructure information. | |
715 :return: A list of protonation sites and their pKa bin. ('PROTONATED', | |
716 'BOTH', or 'DEPROTONATED') | |
717 """ | |
718 | |
719 # Convert the Smiles string (smi) to an RDKit Mol Obj | |
720 mol = UtilFuncs.convert_smiles_str_to_mol(smi) | |
721 | |
722 # Check Conversion worked | |
723 if mol is None: | |
724 UtilFuncs.eprint("ERROR: ", smi) | |
725 return [] | |
726 | |
727 # Try to Add hydrogens. if failed return [] | |
728 try: | |
729 mol = Chem.AddHs(mol) | |
730 except Exception: | |
731 UtilFuncs.eprint("ERROR: ", smi) | |
732 return [] | |
733 | |
734 # Check adding Hs worked | |
735 if mol is None: | |
736 UtilFuncs.eprint("ERROR: ", smi) | |
737 return [] | |
738 | |
739 ProtectUnprotectFuncs.unprotect_molecule(mol) | |
740 protonation_sites = [] | |
741 | |
742 for item in subs: | |
743 smart = item["mol"] | |
744 if mol.HasSubstructMatch(smart): | |
745 matches = ProtectUnprotectFuncs.get_unprotected_matches(mol, smart) | |
746 prot = item["prot_states_for_pH"] | |
747 for match in matches: | |
748 # We want to move the site from being relative to the | |
749 # substructure, to the index on the main molecule. | |
750 for site in prot: | |
751 proton = int(site[0]) | |
752 category = site[1] | |
753 new_site = (match[proton], category, item["name"]) | |
754 | |
755 if not new_site in protonation_sites: | |
756 # Because sites must be unique. | |
757 protonation_sites.append(new_site) | |
758 | |
759 ProtectUnprotectFuncs.protect_molecule(mol, match) | |
760 | |
761 return protonation_sites | |
762 | |
763 @staticmethod | |
764 def protonate_site(smis, site): | |
765 """Given a list of SMILES strings, we protonate the site. | |
766 | |
767 :param list smis: The list of SMILES strings. | |
768 :param tuple site: Information about the protonation site. | |
769 (idx, target_prot_state, prot_site_name) | |
770 :return: A list of the appropriately protonated SMILES. | |
771 """ | |
772 | |
773 # Decouple the atom index and its target protonation state from the site | |
774 # tuple | |
775 idx, target_prot_state, prot_site_name = site | |
776 | |
777 # Initialize the output list | |
778 output_smis = [] | |
779 | |
780 state_to_charge = {"DEPROTONATED": [-1], "PROTONATED": [0], "BOTH": [-1, 0]} | |
781 | |
782 charges = state_to_charge[target_prot_state] | |
783 | |
784 # Now make the actual smiles match the target protonation state. | |
785 output_smis = ProtSubstructFuncs.set_protonation_charge( | |
786 smis, idx, charges, prot_site_name | |
787 ) | |
788 | |
789 return output_smis | |
790 | |
791 @staticmethod | |
792 def set_protonation_charge(smis, idx, charges, prot_site_name): | |
793 """Sets the atomic charge on a particular site for a set of SMILES. | |
794 | |
795 :param list smis: A list of the SMILES strings. | |
796 :param int idx: The index of the atom to consider. | |
797 :param list charges: A list of the charges (ints) to assign at | |
798 this site. | |
799 :param string prot_site_name: The name of the protonation site. | |
800 :return: A list of the processed SMILES strings. | |
801 """ | |
802 | |
803 # Sets up the output list and the Nitrogen charge | |
804 output = [] | |
805 | |
806 for charge in charges: | |
807 # The charge for Nitrogens is 1 higher than others (i.e., protonated | |
808 # state is positively charged). | |
809 nitro_charge = charge + 1 | |
810 | |
811 # But there are a few nitrogen moieties where the acidic group is the | |
812 # neutral one. Amides are a good example. I gave some thought re. how | |
813 # to best flag these. I decided that those nitrogen-containing | |
814 # moieties where the acidic group is neutral (rather than positively | |
815 # charged) will have "*" in the name. | |
816 if "*" in prot_site_name: | |
817 nitro_charge = nitro_charge - 1 # Undo what was done previously. | |
818 | |
819 for smi in smis: | |
820 | |
821 # Convert smilesstring (smi) into a RDKit Mol | |
822 mol = UtilFuncs.convert_smiles_str_to_mol(smi) | |
823 | |
824 # Check that the conversion worked, skip if it fails | |
825 if mol is None: | |
826 continue | |
827 | |
828 atom = mol.GetAtomWithIdx(idx) | |
829 | |
830 # Assign the protonation charge, with special care for Nitrogens | |
831 element = atom.GetAtomicNum() | |
832 if element == 7: | |
833 atom.SetFormalCharge(nitro_charge) | |
834 else: | |
835 atom.SetFormalCharge(charge) | |
836 | |
837 # Convert back to SMILE and add to output | |
838 out_smile = Chem.MolToSmiles(mol, isomericSmiles=True, canonical=True) | |
839 output.append(out_smile) | |
840 | |
841 return output | |
842 | |
843 | |
844 class ProtectUnprotectFuncs: | |
845 """A namespace for storing functions that are useful for protecting and | |
846 unprotecting molecules. To keep things organized. We need to identify and | |
847 mark groups that have been matched with a substructure.""" | |
848 | |
849 @staticmethod | |
850 def unprotect_molecule(mol): | |
851 """Sets the protected property on all atoms to 0. This also creates the | |
852 property for new molecules. | |
853 | |
854 :param rdkit.Chem.rdchem.Mol mol: The rdkit Mol object. | |
855 :type mol: The rdkit Mol object with atoms unprotected. | |
856 """ | |
857 | |
858 for atom in mol.GetAtoms(): | |
859 atom.SetProp("_protected", "0") | |
860 | |
861 @staticmethod | |
862 def protect_molecule(mol, match): | |
863 """Given a 'match', a list of molecules idx's, we set the protected status | |
864 of each atom to 1. This will prevent any matches using that atom in the | |
865 future. | |
866 | |
867 :param rdkit.Chem.rdchem.Mol mol: The rdkit Mol object to protect. | |
868 :param list match: A list of molecule idx's. | |
869 """ | |
870 | |
871 for idx in match: | |
872 atom = mol.GetAtomWithIdx(idx) | |
873 atom.SetProp("_protected", "1") | |
874 | |
875 @staticmethod | |
876 def get_unprotected_matches(mol, substruct): | |
877 """Finds substructure matches with atoms that have not been protected. | |
878 Returns list of matches, each match a list of atom idxs. | |
879 | |
880 :param rdkit.Chem.rdchem.Mol mol: The Mol object to consider. | |
881 :param string substruct: The SMARTS string of the substructure ot match. | |
882 :return: A list of the matches. Each match is itself a list of atom idxs. | |
883 """ | |
884 | |
885 matches = mol.GetSubstructMatches(substruct) | |
886 unprotected_matches = [] | |
887 for match in matches: | |
888 if ProtectUnprotectFuncs.is_match_unprotected(mol, match): | |
889 unprotected_matches.append(match) | |
890 return unprotected_matches | |
891 | |
892 @staticmethod | |
893 def is_match_unprotected(mol, match): | |
894 """Checks a molecule to see if the substructure match contains any | |
895 protected atoms. | |
896 | |
897 :param rdkit.Chem.rdchem.Mol mol: The Mol object to check. | |
898 :param list match: The match to check. | |
899 :return: A boolean, whether the match is present or not. | |
900 """ | |
901 | |
902 for idx in match: | |
903 atom = mol.GetAtomWithIdx(idx) | |
904 protected = atom.GetProp("_protected") | |
905 if protected == "1": | |
906 return False | |
907 return True | |
908 | |
909 | |
910 class TestFuncs: | |
911 """A namespace for storing functions that perform tests on the code. To | |
912 keep things organized.""" | |
913 | |
914 @staticmethod | |
915 def test(): | |
916 """Tests all the 38 groups.""" | |
917 | |
918 smis = [ | |
919 # [input smiles, pka, protonated, deprotonated, category] | |
920 ["C#CCO", "C#CCO", "C#CC[O-]", "Alcohol"], | |
921 ["C(=O)N", "NC=O", "[NH-]C=O", "Amide"], | |
922 [ | |
923 "CC(=O)NOC(C)=O", | |
924 "CC(=O)NOC(C)=O", | |
925 "CC(=O)[N-]OC(C)=O", | |
926 "Amide_electronegative", | |
927 ], | |
928 ["COC(=N)N", "COC(N)=[NH2+]", "COC(=N)N", "AmidineGuanidine2"], | |
929 [ | |
930 "Brc1ccc(C2NCCS2)cc1", | |
931 "Brc1ccc(C2[NH2+]CCS2)cc1", | |
932 "Brc1ccc(C2NCCS2)cc1", | |
933 "Amines_primary_secondary_tertiary", | |
934 ], | |
935 [ | |
936 "CC(=O)[n+]1ccc(N)cc1", | |
937 "CC(=O)[n+]1ccc([NH3+])cc1", | |
938 "CC(=O)[n+]1ccc(N)cc1", | |
939 "Anilines_primary", | |
940 ], | |
941 ["CCNc1ccccc1", "CC[NH2+]c1ccccc1", "CCNc1ccccc1", "Anilines_secondary"], | |
942 [ | |
943 "Cc1ccccc1N(C)C", | |
944 "Cc1ccccc1[NH+](C)C", | |
945 "Cc1ccccc1N(C)C", | |
946 "Anilines_tertiary", | |
947 ], | |
948 [ | |
949 "BrC1=CC2=C(C=C1)NC=C2", | |
950 "Brc1ccc2[nH]ccc2c1", | |
951 "Brc1ccc2[n-]ccc2c1", | |
952 "Indole_pyrrole", | |
953 ], | |
954 [ | |
955 "O=c1cc[nH]cc1", | |
956 "O=c1cc[nH]cc1", | |
957 "O=c1cc[n-]cc1", | |
958 "Aromatic_nitrogen_protonated", | |
959 ], | |
960 ["C-N=[N+]=[N@H]", "CN=[N+]=N", "CN=[N+]=[N-]", "Azide"], | |
961 ["BrC(C(O)=O)CBr", "O=C(O)C(Br)CBr", "O=C([O-])C(Br)CBr", "Carboxyl"], | |
962 ["NC(NN=O)=N", "NC(=[NH2+])NN=O", "N=C(N)NN=O", "AmidineGuanidine1"], | |
963 [ | |
964 "C(F)(F)(F)C(=O)NC(=O)C", | |
965 "CC(=O)NC(=O)C(F)(F)F", | |
966 "CC(=O)[N-]C(=O)C(F)(F)F", | |
967 "Imide", | |
968 ], | |
969 ["O=C(C)NC(C)=O", "CC(=O)NC(C)=O", "CC(=O)[N-]C(C)=O", "Imide2"], | |
970 [ | |
971 "CC(C)(C)C(N(C)O)=O", | |
972 "CN(O)C(=O)C(C)(C)C", | |
973 "CN([O-])C(=O)C(C)(C)C", | |
974 "N-hydroxyamide", | |
975 ], | |
976 ["C[N+](O)=O", "C[N+](=O)O", "C[N+](=O)[O-]", "Nitro"], | |
977 ["O=C1C=C(O)CC1", "O=C1C=C(O)CC1", "O=C1C=C([O-])CC1", "O=C-C=C-OH"], | |
978 ["C1CC1OO", "OOC1CC1", "[O-]OC1CC1", "Peroxide2"], | |
979 ["C(=O)OO", "O=COO", "O=CO[O-]", "Peroxide1"], | |
980 [ | |
981 "Brc1cc(O)cc(Br)c1", | |
982 "Oc1cc(Br)cc(Br)c1", | |
983 "[O-]c1cc(Br)cc(Br)c1", | |
984 "Phenol", | |
985 ], | |
986 [ | |
987 "CC(=O)c1ccc(S)cc1", | |
988 "CC(=O)c1ccc(S)cc1", | |
989 "CC(=O)c1ccc([S-])cc1", | |
990 "Phenyl_Thiol", | |
991 ], | |
992 [ | |
993 "C=CCOc1ccc(C(=O)O)cc1", | |
994 "C=CCOc1ccc(C(=O)O)cc1", | |
995 "C=CCOc1ccc(C(=O)[O-])cc1", | |
996 "Phenyl_carboxyl", | |
997 ], | |
998 ["COP(=O)(O)OC", "COP(=O)(O)OC", "COP(=O)([O-])OC", "Phosphate_diester"], | |
999 ["CP(C)(=O)O", "CP(C)(=O)O", "CP(C)(=O)[O-]", "Phosphinic_acid"], | |
1000 [ | |
1001 "CC(C)OP(C)(=O)O", | |
1002 "CC(C)OP(C)(=O)O", | |
1003 "CC(C)OP(C)(=O)[O-]", | |
1004 "Phosphonate_ester", | |
1005 ], | |
1006 [ | |
1007 "CC1(C)OC(=O)NC1=O", | |
1008 "CC1(C)OC(=O)NC1=O", | |
1009 "CC1(C)OC(=O)[N-]C1=O", | |
1010 "Ringed_imide1", | |
1011 ], | |
1012 ["O=C(N1)C=CC1=O", "O=C1C=CC(=O)N1", "O=C1C=CC(=O)[N-]1", "Ringed_imide2"], | |
1013 ["O=S(OC)(O)=O", "COS(=O)(=O)O", "COS(=O)(=O)[O-]", "Sulfate"], | |
1014 [ | |
1015 "COc1ccc(S(=O)O)cc1", | |
1016 "COc1ccc(S(=O)O)cc1", | |
1017 "COc1ccc(S(=O)[O-])cc1", | |
1018 "Sulfinic_acid", | |
1019 ], | |
1020 ["CS(N)(=O)=O", "CS(N)(=O)=O", "CS([NH-])(=O)=O", "Sulfonamide"], | |
1021 [ | |
1022 "CC(=O)CSCCS(O)(=O)=O", | |
1023 "CC(=O)CSCCS(=O)(=O)O", | |
1024 "CC(=O)CSCCS(=O)(=O)[O-]", | |
1025 "Sulfonate", | |
1026 ], | |
1027 ["CC(=O)S", "CC(=O)S", "CC(=O)[S-]", "Thioic_acid"], | |
1028 ["C(C)(C)(C)(S)", "CC(C)(C)S", "CC(C)(C)[S-]", "Thiol"], | |
1029 [ | |
1030 "Brc1cc[nH+]cc1", | |
1031 "Brc1cc[nH+]cc1", | |
1032 "Brc1ccncc1", | |
1033 "Aromatic_nitrogen_unprotonated", | |
1034 ], | |
1035 [ | |
1036 "C=C(O)c1c(C)cc(C)cc1C", | |
1037 "C=C(O)c1c(C)cc(C)cc1C", | |
1038 "C=C([O-])c1c(C)cc(C)cc1C", | |
1039 "Vinyl_alcohol", | |
1040 ], | |
1041 ["CC(=O)ON", "CC(=O)O[NH3+]", "CC(=O)ON", "Primary_hydroxyl_amine"], | |
1042 ] | |
1043 | |
1044 smis_phos = [ | |
1045 [ | |
1046 "O=P(O)(O)OCCCC", | |
1047 "CCCCOP(=O)(O)O", | |
1048 "CCCCOP(=O)([O-])O", | |
1049 "CCCCOP(=O)([O-])[O-]", | |
1050 "Phosphate", | |
1051 ], | |
1052 [ | |
1053 "CC(P(O)(O)=O)C", | |
1054 "CC(C)P(=O)(O)O", | |
1055 "CC(C)P(=O)([O-])O", | |
1056 "CC(C)P(=O)([O-])[O-]", | |
1057 "Phosphonate", | |
1058 ], | |
1059 ] | |
1060 | |
1061 # Load the average pKa values. | |
1062 average_pkas = { | |
1063 l.split()[0].replace("*", ""): float(l.split()[3]) | |
1064 for l in open("site_substructures.smarts") | |
1065 if l.split()[0] not in ["Phosphate", "Phosphonate"] | |
1066 } | |
1067 average_pkas_phos = { | |
1068 l.split()[0].replace("*", ""): [float(l.split()[3]), float(l.split()[6])] | |
1069 for l in open("site_substructures.smarts") | |
1070 if l.split()[0] in ["Phosphate", "Phosphonate"] | |
1071 } | |
1072 | |
1073 print("Running Tests") | |
1074 print("=============") | |
1075 print("") | |
1076 | |
1077 print("Very Acidic (pH -10000000)") | |
1078 print("--------------------------") | |
1079 print("") | |
1080 | |
1081 args = { | |
1082 "min_ph": -10000000, | |
1083 "max_ph": -10000000, | |
1084 "pka_precision": 0.5, | |
1085 "smiles": "", | |
1086 "label_states": True, | |
1087 } | |
1088 | |
1089 for smi, protonated, deprotonated, category in smis: | |
1090 args["smiles"] = smi | |
1091 TestFuncs.test_check(args, [protonated], ["PROTONATED"]) | |
1092 | |
1093 for smi, protonated, mix, deprotonated, category in smis_phos: | |
1094 args["smiles"] = smi | |
1095 TestFuncs.test_check(args, [protonated], ["PROTONATED"]) | |
1096 | |
1097 args["min_ph"] = 10000000 | |
1098 args["max_ph"] = 10000000 | |
1099 | |
1100 print("") | |
1101 print("Very Basic (pH 10000000)") | |
1102 print("------------------------") | |
1103 print("") | |
1104 | |
1105 for smi, protonated, deprotonated, category in smis: | |
1106 args["smiles"] = smi | |
1107 TestFuncs.test_check(args, [deprotonated], ["DEPROTONATED"]) | |
1108 | |
1109 for smi, protonated, mix, deprotonated, category in smis_phos: | |
1110 args["smiles"] = smi | |
1111 TestFuncs.test_check(args, [deprotonated], ["DEPROTONATED"]) | |
1112 | |
1113 print("") | |
1114 print("pH is Category pKa") | |
1115 print("------------------") | |
1116 print("") | |
1117 | |
1118 for smi, protonated, deprotonated, category in smis: | |
1119 avg_pka = average_pkas[category] | |
1120 | |
1121 args["smiles"] = smi | |
1122 args["min_ph"] = avg_pka | |
1123 args["max_ph"] = avg_pka | |
1124 | |
1125 TestFuncs.test_check(args, [protonated, deprotonated], ["BOTH"]) | |
1126 | |
1127 for smi, protonated, mix, deprotonated, category in smis_phos: | |
1128 args["smiles"] = smi | |
1129 | |
1130 avg_pka = average_pkas_phos[category][0] | |
1131 args["min_ph"] = avg_pka | |
1132 args["max_ph"] = avg_pka | |
1133 | |
1134 TestFuncs.test_check(args, [mix, protonated], ["BOTH"]) | |
1135 | |
1136 avg_pka = average_pkas_phos[category][1] | |
1137 args["min_ph"] = avg_pka | |
1138 args["max_ph"] = avg_pka | |
1139 | |
1140 TestFuncs.test_check( | |
1141 args, [mix, deprotonated], ["DEPROTONATED", "DEPROTONATED"] | |
1142 ) | |
1143 | |
1144 avg_pka = 0.5 * ( | |
1145 average_pkas_phos[category][0] + average_pkas_phos[category][1] | |
1146 ) | |
1147 args["min_ph"] = avg_pka | |
1148 args["max_ph"] = avg_pka | |
1149 args["pka_precision"] = 5 # Should give all three | |
1150 | |
1151 TestFuncs.test_check( | |
1152 args, [mix, deprotonated, protonated], ["BOTH", "BOTH"] | |
1153 ) | |
1154 | |
1155 @staticmethod | |
1156 def test_check(args, expected_output, labels): | |
1157 """Tests most ionizable groups. The ones that can only loose or gain a single proton. | |
1158 | |
1159 :param args: The arguments to pass to protonate() | |
1160 :param expected_output: A list of the expected SMILES-strings output. | |
1161 :param labels: The labels. A list containing combo of BOTH, PROTONATED, | |
1162 DEPROTONATED. | |
1163 :raises Exception: Wrong number of states produced. | |
1164 :raises Exception: Unexpected output SMILES. | |
1165 :raises Exception: Wrong labels. | |
1166 """ | |
1167 | |
1168 output = list(Protonate(args)) | |
1169 output = [o.split() for o in output] | |
1170 | |
1171 num_states = len(expected_output) | |
1172 | |
1173 if len(output) != num_states: | |
1174 msg = ( | |
1175 args["smiles"] | |
1176 + " should have " | |
1177 + str(num_states) | |
1178 + " states at at pH " | |
1179 + str(args["min_ph"]) | |
1180 + ": " | |
1181 + str(output) | |
1182 ) | |
1183 print(msg) | |
1184 raise Exception(msg) | |
1185 | |
1186 if len(set([l[0] for l in output]) - set(expected_output)) != 0: | |
1187 msg = ( | |
1188 args["smiles"] | |
1189 + " is not " | |
1190 + " AND ".join(expected_output) | |
1191 + " at pH " | |
1192 + str(args["min_ph"]) | |
1193 + " - " | |
1194 + str(args["max_ph"]) | |
1195 + "; it is " | |
1196 + " AND ".join([l[0] for l in output]) | |
1197 ) | |
1198 print(msg) | |
1199 raise Exception(msg) | |
1200 | |
1201 if len(set([l[1] for l in output]) - set(labels)) != 0: | |
1202 msg = ( | |
1203 args["smiles"] | |
1204 + " not labeled as " | |
1205 + " AND ".join(labels) | |
1206 + "; it is " | |
1207 + " AND ".join([l[1] for l in output]) | |
1208 ) | |
1209 print(msg) | |
1210 raise Exception(msg) | |
1211 | |
1212 ph_range = sorted(list(set([args["min_ph"], args["max_ph"]]))) | |
1213 ph_range_str = "(" + " - ".join("{0:.2f}".format(n) for n in ph_range) + ")" | |
1214 print( | |
1215 "(CORRECT) " | |
1216 + ph_range_str.ljust(10) | |
1217 + " " | |
1218 + args["smiles"] | |
1219 + " => " | |
1220 + " AND ".join([l[0] for l in output]) | |
1221 ) | |
1222 | |
1223 | |
1224 def run(**kwargs): | |
1225 """A helpful, importable function for those who want to call Dimorphite-DL | |
1226 from another Python script rather than the command line. Note that this | |
1227 function accepts keyword arguments that match the command-line parameters | |
1228 exactly. If you want to pass and return a list of RDKit Mol objects, import | |
1229 run_with_mol_list() instead. | |
1230 | |
1231 :param **kwargs: For a complete description, run dimorphite_dl.py from the | |
1232 command line with the -h option. | |
1233 :type kwargs: dict | |
1234 """ | |
1235 | |
1236 # Run the main function with the specified arguments. | |
1237 main(kwargs) | |
1238 | |
1239 | |
1240 def run_with_mol_list(mol_lst, **kwargs): | |
1241 """A helpful, importable function for those who want to call Dimorphite-DL | |
1242 from another Python script rather than the command line. Note that this | |
1243 function is for passing Dimorphite-DL a list of RDKit Mol objects, together | |
1244 with command-line parameters. If you want to use only the same parameters | |
1245 that you would use from the command line, import run() instead. | |
1246 | |
1247 :param mol_lst: A list of rdkit.Chem.rdchem.Mol objects. | |
1248 :type mol_lst: list | |
1249 :raises Exception: If the **kwargs includes "smiles", "smiles_file", | |
1250 "output_file", or "test" parameters. | |
1251 :return: A list of properly protonated rdkit.Chem.rdchem.Mol objects. | |
1252 :rtype: list | |
1253 """ | |
1254 | |
1255 # Do a quick check to make sure the user input makes sense. | |
1256 for bad_arg in ["smiles", "smiles_file", "output_file", "test"]: | |
1257 if bad_arg in kwargs: | |
1258 msg = ( | |
1259 "You're using Dimorphite-DL's run_with_mol_list(mol_lst, " | |
1260 + '**kwargs) function, but you also passed the "' | |
1261 + bad_arg | |
1262 + '" argument. Did you mean to use the ' | |
1263 + "run(**kwargs) function instead?" | |
1264 ) | |
1265 print(msg) | |
1266 raise Exception(msg) | |
1267 | |
1268 # Set the return_as_list flag so main() will return the protonated smiles | |
1269 # as a list. | |
1270 kwargs["return_as_list"] = True | |
1271 | |
1272 # Having reviewed the code, it will be very difficult to rewrite it so | |
1273 # that a list of Mol objects can be used directly. Intead, convert this | |
1274 # list of mols to smiles and pass that. Not efficient, but it will work. | |
1275 protonated_smiles_and_props = [] | |
1276 for m in mol_lst: | |
1277 props = m.GetPropsAsDict() | |
1278 kwargs["smiles"] = Chem.MolToSmiles(m, isomericSmiles=True) | |
1279 protonated_smiles_and_props.extend( | |
1280 [(s.split("\t")[0], props) for s in main(kwargs)] | |
1281 ) | |
1282 | |
1283 # Now convert the list of protonated smiles strings back to RDKit Mol | |
1284 # objects. Also, add back in the properties from the original mol objects. | |
1285 mols = [] | |
1286 for s, props in protonated_smiles_and_props: | |
1287 m = Chem.MolFromSmiles(s) | |
1288 if m: | |
1289 for prop, val in props.items(): | |
1290 if type(val) is int: | |
1291 m.SetIntProp(prop, val) | |
1292 elif type(val) is float: | |
1293 m.SetDoubleProp(prop, val) | |
1294 elif type(val) is bool: | |
1295 m.SetBoolProp(prop, val) | |
1296 else: | |
1297 m.SetProp(prop, str(val)) | |
1298 mols.append(m) | |
1299 else: | |
1300 UtilFuncs.eprint( | |
1301 "WARNING: Could not process molecule with SMILES string " | |
1302 + s | |
1303 + " and properties " | |
1304 + str(props) | |
1305 ) | |
1306 | |
1307 return mols | |
1308 | |
1309 | |
1310 if __name__ == "__main__": | |
1311 main() |