comparison dimorphite_dl.py @ 0:55a2082540a9 draft default tip

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit c1d813d3f0fec60ea6efe8a11e59d98bfdc1636f"
author bgruening
date Sat, 04 Dec 2021 16:36:56 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:55a2082540a9
1 # flake8: noqa
2 # Copyright 2018 Jacob D. Durrant
3 #
4 # Licensed under the Apache License, Version 2.0 (the "License");
5 # you may not use this file except in compliance with the License.
6 # You may obtain a copy of the License at
7 #
8 # http://www.apache.org/licenses/LICENSE-2.0
9 #
10 # Unless required by applicable law or agreed to in writing, software
11 # distributed under the License is distributed on an "AS IS" BASIS,
12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 # See the License for the specific language governing permissions and
14 # limitations under the License.
15
16 """
17 This script identifies and enumerates the possible protonation sites of SMILES
18 strings.
19 """
20
21 from __future__ import print_function
22
23 import argparse
24 import os
25 import sys
26
27 try:
28 # Python2
29 from StringIO import StringIO
30 except ImportError:
31 # Python3
32 from io import StringIO
33
34 # Always let the user know a help file is available.
35 print("\nFor help, use: python dimorphite_dl.py --help")
36
37 # And always report citation information.
38 print("\nIf you use Dimorphite-DL in your research, please cite:")
39 print("Ropp PJ, Kaminsky JC, Yablonski S, Durrant JD (2019) Dimorphite-DL: An")
40 print("open-source program for enumerating the ionization states of drug-like small")
41 print("molecules. J Cheminform 11:14. doi:10.1186/s13321-019-0336-9.\n")
42
43 try:
44 import rdkit
45 from rdkit import Chem
46 from rdkit.Chem import AllChem
47 except Exception:
48 msg = "Dimorphite-DL requires RDKit. See https://www.rdkit.org/"
49 print(msg)
50 raise Exception(msg)
51
52
53 def main(params=None):
54 """The main definition run when you call the script from the commandline.
55
56 :param params: The parameters to use. Entirely optional. If absent,
57 defaults to None, in which case argments will be taken from
58 those given at the command line.
59 :param params: dict, optional
60 :return: Returns a list of the SMILES strings return_as_list parameter is
61 True. Otherwise, returns None.
62 """
63
64 parser = ArgParseFuncs.get_args()
65 args = vars(parser.parse_args())
66
67 # Add in any parameters in params.
68 if params is not None:
69 for k, v in params.items():
70 args[k] = v
71
72 # If being run from the command line, print out all parameters.
73 if __name__ == "__main__":
74 print("\nPARAMETERS:\n")
75 for k in sorted(args.keys()):
76 print(k.rjust(13) + ": " + str(args[k]))
77 print("")
78
79 if args["test"]:
80 # Run tests.
81 TestFuncs.test()
82 else:
83 # Run protonation
84 if "output_file" in args and args["output_file"] is not None:
85 # An output file was specified, so write to that.
86 with open(args["output_file"], "w") as file:
87 for protonated_smi in Protonate(args):
88 file.write(protonated_smi + "\n")
89 elif "return_as_list" in args and args["return_as_list"]:
90 return list(Protonate(args))
91 else:
92 # No output file specified. Just print it to the screen.
93 for protonated_smi in Protonate(args):
94 print(protonated_smi)
95
96
97 class MyParser(argparse.ArgumentParser):
98 """Overwrite default parse so it displays help file on error. See
99 https://stackoverflow.com/questions/4042452/display-help-message-with-python-argparse-when-script-is-called-without-any-argu"""
100
101 def error(self, message):
102 """Overwrites the default error message.
103
104 :param message: The default error message.
105 """
106
107 self.print_help()
108 msg = "ERROR: %s\n\n" % message
109 print(msg)
110 raise Exception(msg)
111
112 def print_help(self, file=None):
113 """Overwrite the default print_help function
114
115 :param file: Output file, defaults to None
116 """
117
118 print("")
119
120 if file is None:
121 file = sys.stdout
122 self._print_message(self.format_help(), file)
123 print(
124 """
125 examples:
126 python dimorphite_dl.py --smiles_file sample_molecules.smi
127 python dimorphite_dl.py --smiles "CCC(=O)O" --min_ph -3.0 --max_ph -2.0
128 python dimorphite_dl.py --smiles "CCCN" --min_ph -3.0 --max_ph -2.0 --output_file output.smi
129 python dimorphite_dl.py --smiles_file sample_molecules.smi --pka_precision 2.0 --label_states
130 python dimorphite_dl.py --test"""
131 )
132 print("")
133
134
135 class ArgParseFuncs:
136 """A namespace for storing functions that are useful for processing
137 command-line arguments. To keep things organized."""
138
139 @staticmethod
140 def get_args():
141 """Gets the arguments from the command line.
142
143 :return: A parser object.
144 """
145
146 parser = MyParser(
147 description="Dimorphite 1.2: Creates models of "
148 + "appropriately protonated small moleucles. "
149 + "Apache 2.0 License. Copyright 2018 Jacob D. "
150 + "Durrant."
151 )
152 parser.add_argument(
153 "--min_ph",
154 metavar="MIN",
155 type=float,
156 default=6.4,
157 help="minimum pH to consider (default: 6.4)",
158 )
159 parser.add_argument(
160 "--max_ph",
161 metavar="MAX",
162 type=float,
163 default=8.4,
164 help="maximum pH to consider (default: 8.4)",
165 )
166 parser.add_argument(
167 "--pka_precision",
168 metavar="PRE",
169 type=float,
170 default=1.0,
171 help="pKa precision factor (number of standard devations, default: 1.0)",
172 )
173 parser.add_argument(
174 "--smiles", metavar="SMI", type=str, help="SMILES string to protonate"
175 )
176 parser.add_argument(
177 "--smiles_file",
178 metavar="FILE",
179 type=str,
180 help="file that contains SMILES strings to protonate",
181 )
182 parser.add_argument(
183 "--output_file",
184 metavar="FILE",
185 type=str,
186 help="output file to write protonated SMILES (optional)",
187 )
188 parser.add_argument(
189 "--label_states",
190 action="store_true",
191 help="label protonated SMILES with target state "
192 + '(i.e., "DEPROTONATED", "PROTONATED", or "BOTH").',
193 )
194 parser.add_argument(
195 "--test", action="store_true", help="run unit tests (for debugging)"
196 )
197
198 return parser
199
200 @staticmethod
201 def clean_args(args):
202 """Cleans and normalizes input parameters
203
204 :param args: A dictionary containing the arguments.
205 :type args: dict
206 :raises Exception: No SMILES in params.
207 """
208
209 defaults = {
210 "min_ph": 6.4,
211 "max_ph": 8.4,
212 "pka_precision": 1.0,
213 "label_states": False,
214 "test": False,
215 }
216
217 for key in defaults:
218 if key not in args:
219 args[key] = defaults[key]
220
221 keys = list(args.keys())
222 for key in keys:
223 if args[key] is None:
224 del args[key]
225
226 if not "smiles" in args and not "smiles_file" in args:
227 msg = "Error: No SMILES in params. Use the -h parameter for help."
228 print(msg)
229 raise Exception(msg)
230
231 # If the user provides a smiles string, turn it into a file-like StringIO
232 # object.
233 if "smiles" in args:
234 if isinstance(args["smiles"], str):
235 args["smiles_file"] = StringIO(args["smiles"])
236
237 args["smiles_and_data"] = LoadSMIFile(args["smiles_file"])
238
239 return args
240
241
242 class UtilFuncs:
243 """A namespace to store functions for manipulating mol objects. To keep
244 things organized."""
245
246 @staticmethod
247 def neutralize_mol(mol):
248 """All molecules should be neuralized to the extent possible. The user
249 should not be allowed to specify the valence of the atoms in most cases.
250
251 :param rdkit.Chem.rdchem.Mol mol: The rdkit Mol objet to be neutralized.
252 :return: The neutralized Mol object.
253 """
254
255 # Get the reaction data
256 rxn_data = [
257 [
258 "[Ov1-1:1]",
259 "[Ov2+0:1]-[H]",
260 ], # To handle O- bonded to only one atom (add hydrogen).
261 [
262 "[#7v4+1:1]-[H]",
263 "[#7v3+0:1]",
264 ], # To handle N+ bonded to a hydrogen (remove hydrogen).
265 [
266 "[Ov2-:1]",
267 "[Ov2+0:1]",
268 ], # To handle O- bonded to two atoms. Should not be Negative.
269 [
270 "[#7v3+1:1]",
271 "[#7v3+0:1]",
272 ], # To handle N+ bonded to three atoms. Should not be positive.
273 [
274 "[#7v2-1:1]",
275 "[#7+0:1]-[H]",
276 ], # To handle N- Bonded to two atoms. Add hydrogen.
277 # ['[N:1]=[N+0:2]=[N:3]-[H]', '[N:1]=[N+1:2]=[N+0:3]-[H]'], # To
278 # handle bad azide. Must be protonated. (Now handled elsewhere, before
279 # SMILES converted to Mol object.)
280 [
281 "[H]-[N:1]-[N:2]#[N:3]",
282 "[N:1]=[N+1:2]=[N:3]-[H]",
283 ], # To handle bad azide. R-N-N#N should be R-N=[N+]=N
284 ]
285
286 # Add substructures and reactions (initially none)
287 for i, rxn_datum in enumerate(rxn_data):
288 rxn_data[i].append(Chem.MolFromSmarts(rxn_datum[0]))
289 rxn_data[i].append(None)
290
291 # Add hydrogens (respects valence, so incomplete).
292 # Chem.calcImplicitValence(mol)
293 mol.UpdatePropertyCache(strict=False)
294 mol = Chem.AddHs(mol)
295
296 while True: # Keep going until all these issues have been resolved.
297 current_rxn = None # The reaction to perform.
298 current_rxn_str = None
299
300 for i, rxn_datum in enumerate(rxn_data):
301 (
302 reactant_smarts,
303 product_smarts,
304 substruct_match_mol,
305 rxn_placeholder,
306 ) = rxn_datum
307 if mol.HasSubstructMatch(substruct_match_mol):
308 if rxn_placeholder is None:
309 current_rxn_str = reactant_smarts + ">>" + product_smarts
310 current_rxn = AllChem.ReactionFromSmarts(current_rxn_str)
311 rxn_data[i][3] = current_rxn # Update the placeholder.
312 else:
313 current_rxn = rxn_data[i][3]
314 break
315
316 # Perform the reaction if necessary
317 if current_rxn is None: # No reaction left, so break out of while loop.
318 break
319 else:
320 mol = current_rxn.RunReactants((mol,))[0][0]
321 mol.UpdatePropertyCache(strict=False) # Update valences
322
323 # The mols have been altered from the reactions described above, we need
324 # to resanitize them. Make sure aromatic rings are shown as such This
325 # catches all RDKit Errors. without the catchError and sanitizeOps the
326 # Chem.SanitizeMol can crash the program.
327 sanitize_string = Chem.SanitizeMol(
328 mol,
329 sanitizeOps=rdkit.Chem.rdmolops.SanitizeFlags.SANITIZE_ALL,
330 catchErrors=True,
331 )
332
333 return mol if sanitize_string.name == "SANITIZE_NONE" else None
334
335 @staticmethod
336 def convert_smiles_str_to_mol(smiles_str):
337 """Given a SMILES string, check that it is actually a string and not a
338 None. Then try to convert it to an RDKit Mol Object.
339
340 :param string smiles_str: The SMILES string.
341 :return: A rdkit.Chem.rdchem.Mol object, or None if it is the wrong type or
342 if it fails to convert to a Mol Obj
343 """
344
345 # Check that there are no type errors, ie Nones or non-string
346 # A non-string type will cause RDKit to hard crash
347 if smiles_str is None or type(smiles_str) is not str:
348 return None
349
350 # Try to fix azides here. They are just tricky to deal with.
351 smiles_str = smiles_str.replace("N=N=N", "N=[N+]=N")
352 smiles_str = smiles_str.replace("NN#N", "N=[N+]=N")
353
354 # Now convert to a mol object. Note the trick that is necessary to
355 # capture RDKit error/warning messages. See
356 # https://stackoverflow.com/questions/24277488/in-python-how-to-capture-the-stdout-from-a-c-shared-library-to-a-variable
357 stderr_fileno = sys.stderr.fileno()
358 stderr_save = os.dup(stderr_fileno)
359 stderr_pipe = os.pipe()
360 os.dup2(stderr_pipe[1], stderr_fileno)
361 os.close(stderr_pipe[1])
362
363 mol = Chem.MolFromSmiles(smiles_str)
364
365 os.close(stderr_fileno)
366 os.close(stderr_pipe[0])
367 os.dup2(stderr_save, stderr_fileno)
368 os.close(stderr_save)
369
370 # Check that there are None type errors Chem.MolFromSmiles has sanitize on
371 # which means if there is even a small error in the SMILES (kekulize,
372 # nitrogen charge...) then mol=None. ie.
373 # Chem.MolFromSmiles("C[N]=[N]=[N]") = None this is an example of an
374 # nitrogen charge error. It is cased in a try statement to be overly
375 # cautious.
376
377 return None if mol is None else mol
378
379 @staticmethod
380 def eprint(*args, **kwargs):
381 """Error messages should be printed to STDERR. See
382 https://stackoverflow.com/questions/5574702/how-to-print-to-stderr-in-python"""
383
384 print(*args, file=sys.stderr, **kwargs)
385
386
387 class LoadSMIFile(object):
388 """A generator class for loading in the SMILES strings from a file, one at
389 a time."""
390
391 def __init__(self, filename):
392 """Initializes this class.
393
394 :param filename: The filename or file object (i.e., StringIO).
395 :type filename: str or StringIO
396 """
397
398 if type(filename) is str:
399 # It's a filename
400 self.f = open(filename, "r")
401 else:
402 # It's a file object (i.e., StringIO)
403 self.f = filename
404
405 def __iter__(self):
406 """Returns this generator object.
407
408 :return: This generator object.
409 :rtype: LoadSMIFile
410 """
411
412 return self
413
414 def __next__(self):
415 """Ensure Python3 compatibility.
416
417 :return: A dict, where the "smiles" key contains the canonical SMILES
418 string and the "data" key contains the remaining information
419 (e.g., the molecule name).
420 :rtype: dict
421 """
422
423 return self.next()
424
425 def next(self):
426 """Get the data associated with the next line.
427
428 :raises StopIteration: If there are no more lines left iin the file.
429 :return: A dict, where the "smiles" key contains the canonical SMILES
430 string and the "data" key contains the remaining information
431 (e.g., the molecule name).
432 :rtype: dict
433 """
434
435 line = self.f.readline()
436
437 if line == "":
438 # EOF
439 self.f.close()
440 raise StopIteration()
441 return
442
443 # Divide line into smi and data
444 splits = line.split()
445 if len(splits) != 0:
446 # Generate mol object
447 smiles_str = splits[0]
448
449 # Convert from SMILES string to RDKIT Mol. This series of tests is
450 # to make sure the SMILES string is properly formed and to get it
451 # into a canonical form. Filter if failed.
452 mol = UtilFuncs.convert_smiles_str_to_mol(smiles_str)
453 if mol is None:
454 UtilFuncs.eprint(
455 "WARNING: Skipping poorly formed SMILES string: " + line
456 )
457 return self.next()
458
459 # Handle nuetralizing the molecules. Filter if failed.
460 mol = UtilFuncs.neutralize_mol(mol)
461 if mol is None:
462 UtilFuncs.eprint(
463 "WARNING: Skipping poorly formed SMILES string: " + line
464 )
465 return self.next()
466
467 # Remove the hydrogens.
468 try:
469 mol = Chem.RemoveHs(mol)
470 except Exception:
471 UtilFuncs.eprint(
472 "WARNING: Skipping poorly formed SMILES string: " + line
473 )
474 return self.next()
475
476 if mol is None:
477 UtilFuncs.eprint(
478 "WARNING: Skipping poorly formed SMILES string: " + line
479 )
480 return self.next()
481
482 # Regenerate the smiles string (to standardize).
483 new_mol_string = Chem.MolToSmiles(mol, isomericSmiles=True)
484
485 return {"smiles": new_mol_string, "data": splits[1:]}
486 else:
487 # Blank line? Go to next one.
488 return self.next()
489
490
491 class Protonate(object):
492 """A generator class for protonating SMILES strings, one at a time."""
493
494 def __init__(self, args):
495 """Initialize the generator.
496
497 :param args: A dictionary containing the arguments.
498 :type args: dict
499 """
500
501 # Make the args an object variable variable.
502 self.args = args
503
504 # A list to store the protonated SMILES strings associated with a
505 # single input model.
506 self.cur_prot_SMI = []
507
508 # Clean and normalize the args
509 self.args = ArgParseFuncs.clean_args(args)
510
511 # Load the substructures that can be protonated.
512 self.subs = ProtSubstructFuncs.load_protonation_substructs_calc_state_for_ph(
513 self.args["min_ph"], self.args["max_ph"], self.args["pka_precision"]
514 )
515
516 def __iter__(self):
517 """Returns this generator object.
518
519 :return: This generator object.
520 :rtype: Protonate
521 """
522
523 return self
524
525 def __next__(self):
526 """Ensure Python3 compatibility.
527
528 :return: A dict, where the "smiles" key contains the canonical SMILES
529 string and the "data" key contains the remaining information
530 (e.g., the molecule name).
531 :rtype: dict
532 """
533
534 return self.next()
535
536 def next(self):
537 """Get the next protonated SMILES string.
538
539 :raises StopIteration: If there are no more lines left iin the file.
540 :return: TODO A dict, where the "smiles" key contains the canonical SMILES
541 string and the "data" key contains the remaining information
542 (e.g., the molecule name).
543 :rtype: dict
544 """
545
546 # If there are any SMILES strings in self.cur_prot_SMI, just return
547 # the first one and update the list to include only the remaining.
548 if len(self.cur_prot_SMI) > 0:
549 first, self.cur_prot_SMI = self.cur_prot_SMI[0], self.cur_prot_SMI[1:]
550 return first
551
552 # self.cur_prot_SMI is empty, so try to add more to it.
553
554 # Get the next SMILES string from the input file.
555 try:
556 smile_and_datum = self.args["smiles_and_data"].next()
557 except StopIteration:
558 # There are no more input smiles strings...
559 raise StopIteration()
560
561 smi = smile_and_datum["smiles"]
562 data = smile_and_datum["data"] # Everything on SMILES line but the
563 # SMILES string itself (e.g., the
564 # molecule name).
565
566 # Collect the data associated with this smiles (e.g., the molecule
567 # name).
568 tag = " ".join(data)
569
570 # sites is a list of (atom index, "PROTONATED|DEPROTONATED|BOTH").
571 # Note that the second entry indicates what state the site SHOULD be
572 # in (not the one it IS in per the SMILES string). It's calculated
573 # based on the probablistic distributions obtained during training.
574 sites = ProtSubstructFuncs.get_prot_sites_and_target_states(smi, self.subs)
575
576 new_smis = [smi]
577 for site in sites:
578 # Make a new smiles with the correct protonation state. Note that
579 # new_smis is a growing list. This is how multiple protonation
580 # sites are handled.
581
582 # new_smis_to_perhaps_add = ProtSubstructFuncs.protonate_site(new_smis, site)
583 new_smis = ProtSubstructFuncs.protonate_site(new_smis, site)
584 # print(site, new_smis) # Good for debugging.
585
586 # Only add new smiles if not already in the list.
587 # for s in new_smis_to_perhaps_add:
588 # if not s in new_smis:
589 # new_smis.append(s)
590
591 # In some cases, the script might generate redundant molecules.
592 # Phosphonates, when the pH is between the two pKa values and the
593 # stdev value is big enough, for example, will generate two identical
594 # BOTH states. Let's remove this redundancy.
595 new_smis = list(set(new_smis))
596
597 # Deprotonating protonated aromatic nitrogen gives [nH-]. Change this
598 # to [n-]. This is a hack.
599 new_smis = [s.replace("[nH-]", "[n-]") for s in new_smis]
600
601 # Sometimes Dimorphite-DL generates molecules that aren't actually
602 # possible. Simply convert these to mol objects to eliminate the bad
603 # ones (that are None).
604 new_smis = [
605 s for s in new_smis if UtilFuncs.convert_smiles_str_to_mol(s) is not None
606 ]
607
608 # If there are no smi left, return the input one at the very least.
609 # All generated forms have apparently been judged
610 # inappropriate/mal-formed.
611 if len(new_smis) == 0:
612 new_smis = [smi]
613
614 # If the user wants to see the target states, add those
615 # to the ends of each line.
616 if self.args["label_states"]:
617 states = "\t".join([x[1] for x in sites])
618 new_lines = [x + "\t" + tag + "\t" + states for x in new_smis]
619 else:
620 new_lines = [x + "\t" + tag for x in new_smis]
621
622 self.cur_prot_SMI = new_lines
623
624 return self.next()
625
626
627 class ProtSubstructFuncs:
628 """A namespace to store functions for loading the substructures that can
629 be protonated. To keep things organized."""
630
631 @staticmethod
632 def load_protonation_substructs_calc_state_for_ph(
633 min_ph=6.4, max_ph=8.4, pka_std_range=1
634 ):
635 """A pre-calculated list of R-groups with protonation sites, with their
636 likely pKa bins.
637
638 :param float min_ph: The lower bound on the pH range, defaults to 6.4.
639 :param float max_ph: The upper bound on the pH range, defaults to 8.4.
640 :param pka_std_range: Basically the precision (stdev from predicted pKa to
641 consider), defaults to 1.
642 :return: A dict of the protonation substructions for the specified pH
643 range.
644 """
645
646 subs = []
647 pwd = os.path.dirname(os.path.realpath(__file__))
648
649 site_structures_file = "{}/{}".format(pwd, "site_substructures.smarts")
650 with open(site_structures_file, "r") as substruct:
651 for line in substruct:
652 line = line.strip()
653 sub = {}
654 if line is not "":
655 splits = line.split()
656 sub["name"] = splits[0]
657 sub["smart"] = splits[1]
658 sub["mol"] = Chem.MolFromSmarts(sub["smart"])
659
660 # NEED TO DIVIDE THIS BY 3s
661 pka_ranges = [
662 splits[i : i + 3] for i in range(2, len(splits) - 1, 3)
663 ]
664
665 prot = []
666 for pka_range in pka_ranges:
667 site = pka_range[0]
668 std = float(pka_range[2]) * pka_std_range
669 mean = float(pka_range[1])
670 protonation_state = ProtSubstructFuncs.define_protonation_state(
671 mean, std, min_ph, max_ph
672 )
673
674 prot.append([site, protonation_state])
675
676 sub["prot_states_for_pH"] = prot
677 subs.append(sub)
678 return subs
679
680 @staticmethod
681 def define_protonation_state(mean, std, min_ph, max_ph):
682 """Updates the substructure definitions to include the protonation state
683 based on the user-given pH range. The size of the pKa range is also based
684 on the number of standard deviations to be considered by the user param.
685
686 :param float mean: The mean pKa.
687 :param float std: The precision (stdev).
688 :param float min_ph: The min pH of the range.
689 :param float max_ph: The max pH of the range.
690 :return: A string describing the protonation state.
691 """
692
693 min_pka = mean - std
694 max_pka = mean + std
695
696 # This needs to be reassigned, and 'ERROR' should never make it past the
697 # next set of checks.
698 if min_pka <= max_ph and min_ph <= max_pka:
699 protonation_state = "BOTH"
700 elif mean > max_ph:
701 protonation_state = "PROTONATED"
702 else:
703 protonation_state = "DEPROTONATED"
704
705 return protonation_state
706
707 @staticmethod
708 def get_prot_sites_and_target_states(smi, subs):
709 """For a single molecule, find all possible matches in the protonation
710 R-group list, subs. Items that are higher on the list will be matched
711 first, to the exclusion of later items.
712
713 :param string smi: A SMILES string.
714 :param list subs: Substructure information.
715 :return: A list of protonation sites and their pKa bin. ('PROTONATED',
716 'BOTH', or 'DEPROTONATED')
717 """
718
719 # Convert the Smiles string (smi) to an RDKit Mol Obj
720 mol = UtilFuncs.convert_smiles_str_to_mol(smi)
721
722 # Check Conversion worked
723 if mol is None:
724 UtilFuncs.eprint("ERROR: ", smi)
725 return []
726
727 # Try to Add hydrogens. if failed return []
728 try:
729 mol = Chem.AddHs(mol)
730 except Exception:
731 UtilFuncs.eprint("ERROR: ", smi)
732 return []
733
734 # Check adding Hs worked
735 if mol is None:
736 UtilFuncs.eprint("ERROR: ", smi)
737 return []
738
739 ProtectUnprotectFuncs.unprotect_molecule(mol)
740 protonation_sites = []
741
742 for item in subs:
743 smart = item["mol"]
744 if mol.HasSubstructMatch(smart):
745 matches = ProtectUnprotectFuncs.get_unprotected_matches(mol, smart)
746 prot = item["prot_states_for_pH"]
747 for match in matches:
748 # We want to move the site from being relative to the
749 # substructure, to the index on the main molecule.
750 for site in prot:
751 proton = int(site[0])
752 category = site[1]
753 new_site = (match[proton], category, item["name"])
754
755 if not new_site in protonation_sites:
756 # Because sites must be unique.
757 protonation_sites.append(new_site)
758
759 ProtectUnprotectFuncs.protect_molecule(mol, match)
760
761 return protonation_sites
762
763 @staticmethod
764 def protonate_site(smis, site):
765 """Given a list of SMILES strings, we protonate the site.
766
767 :param list smis: The list of SMILES strings.
768 :param tuple site: Information about the protonation site.
769 (idx, target_prot_state, prot_site_name)
770 :return: A list of the appropriately protonated SMILES.
771 """
772
773 # Decouple the atom index and its target protonation state from the site
774 # tuple
775 idx, target_prot_state, prot_site_name = site
776
777 # Initialize the output list
778 output_smis = []
779
780 state_to_charge = {"DEPROTONATED": [-1], "PROTONATED": [0], "BOTH": [-1, 0]}
781
782 charges = state_to_charge[target_prot_state]
783
784 # Now make the actual smiles match the target protonation state.
785 output_smis = ProtSubstructFuncs.set_protonation_charge(
786 smis, idx, charges, prot_site_name
787 )
788
789 return output_smis
790
791 @staticmethod
792 def set_protonation_charge(smis, idx, charges, prot_site_name):
793 """Sets the atomic charge on a particular site for a set of SMILES.
794
795 :param list smis: A list of the SMILES strings.
796 :param int idx: The index of the atom to consider.
797 :param list charges: A list of the charges (ints) to assign at
798 this site.
799 :param string prot_site_name: The name of the protonation site.
800 :return: A list of the processed SMILES strings.
801 """
802
803 # Sets up the output list and the Nitrogen charge
804 output = []
805
806 for charge in charges:
807 # The charge for Nitrogens is 1 higher than others (i.e., protonated
808 # state is positively charged).
809 nitro_charge = charge + 1
810
811 # But there are a few nitrogen moieties where the acidic group is the
812 # neutral one. Amides are a good example. I gave some thought re. how
813 # to best flag these. I decided that those nitrogen-containing
814 # moieties where the acidic group is neutral (rather than positively
815 # charged) will have "*" in the name.
816 if "*" in prot_site_name:
817 nitro_charge = nitro_charge - 1 # Undo what was done previously.
818
819 for smi in smis:
820
821 # Convert smilesstring (smi) into a RDKit Mol
822 mol = UtilFuncs.convert_smiles_str_to_mol(smi)
823
824 # Check that the conversion worked, skip if it fails
825 if mol is None:
826 continue
827
828 atom = mol.GetAtomWithIdx(idx)
829
830 # Assign the protonation charge, with special care for Nitrogens
831 element = atom.GetAtomicNum()
832 if element == 7:
833 atom.SetFormalCharge(nitro_charge)
834 else:
835 atom.SetFormalCharge(charge)
836
837 # Convert back to SMILE and add to output
838 out_smile = Chem.MolToSmiles(mol, isomericSmiles=True, canonical=True)
839 output.append(out_smile)
840
841 return output
842
843
844 class ProtectUnprotectFuncs:
845 """A namespace for storing functions that are useful for protecting and
846 unprotecting molecules. To keep things organized. We need to identify and
847 mark groups that have been matched with a substructure."""
848
849 @staticmethod
850 def unprotect_molecule(mol):
851 """Sets the protected property on all atoms to 0. This also creates the
852 property for new molecules.
853
854 :param rdkit.Chem.rdchem.Mol mol: The rdkit Mol object.
855 :type mol: The rdkit Mol object with atoms unprotected.
856 """
857
858 for atom in mol.GetAtoms():
859 atom.SetProp("_protected", "0")
860
861 @staticmethod
862 def protect_molecule(mol, match):
863 """Given a 'match', a list of molecules idx's, we set the protected status
864 of each atom to 1. This will prevent any matches using that atom in the
865 future.
866
867 :param rdkit.Chem.rdchem.Mol mol: The rdkit Mol object to protect.
868 :param list match: A list of molecule idx's.
869 """
870
871 for idx in match:
872 atom = mol.GetAtomWithIdx(idx)
873 atom.SetProp("_protected", "1")
874
875 @staticmethod
876 def get_unprotected_matches(mol, substruct):
877 """Finds substructure matches with atoms that have not been protected.
878 Returns list of matches, each match a list of atom idxs.
879
880 :param rdkit.Chem.rdchem.Mol mol: The Mol object to consider.
881 :param string substruct: The SMARTS string of the substructure ot match.
882 :return: A list of the matches. Each match is itself a list of atom idxs.
883 """
884
885 matches = mol.GetSubstructMatches(substruct)
886 unprotected_matches = []
887 for match in matches:
888 if ProtectUnprotectFuncs.is_match_unprotected(mol, match):
889 unprotected_matches.append(match)
890 return unprotected_matches
891
892 @staticmethod
893 def is_match_unprotected(mol, match):
894 """Checks a molecule to see if the substructure match contains any
895 protected atoms.
896
897 :param rdkit.Chem.rdchem.Mol mol: The Mol object to check.
898 :param list match: The match to check.
899 :return: A boolean, whether the match is present or not.
900 """
901
902 for idx in match:
903 atom = mol.GetAtomWithIdx(idx)
904 protected = atom.GetProp("_protected")
905 if protected == "1":
906 return False
907 return True
908
909
910 class TestFuncs:
911 """A namespace for storing functions that perform tests on the code. To
912 keep things organized."""
913
914 @staticmethod
915 def test():
916 """Tests all the 38 groups."""
917
918 smis = [
919 # [input smiles, pka, protonated, deprotonated, category]
920 ["C#CCO", "C#CCO", "C#CC[O-]", "Alcohol"],
921 ["C(=O)N", "NC=O", "[NH-]C=O", "Amide"],
922 [
923 "CC(=O)NOC(C)=O",
924 "CC(=O)NOC(C)=O",
925 "CC(=O)[N-]OC(C)=O",
926 "Amide_electronegative",
927 ],
928 ["COC(=N)N", "COC(N)=[NH2+]", "COC(=N)N", "AmidineGuanidine2"],
929 [
930 "Brc1ccc(C2NCCS2)cc1",
931 "Brc1ccc(C2[NH2+]CCS2)cc1",
932 "Brc1ccc(C2NCCS2)cc1",
933 "Amines_primary_secondary_tertiary",
934 ],
935 [
936 "CC(=O)[n+]1ccc(N)cc1",
937 "CC(=O)[n+]1ccc([NH3+])cc1",
938 "CC(=O)[n+]1ccc(N)cc1",
939 "Anilines_primary",
940 ],
941 ["CCNc1ccccc1", "CC[NH2+]c1ccccc1", "CCNc1ccccc1", "Anilines_secondary"],
942 [
943 "Cc1ccccc1N(C)C",
944 "Cc1ccccc1[NH+](C)C",
945 "Cc1ccccc1N(C)C",
946 "Anilines_tertiary",
947 ],
948 [
949 "BrC1=CC2=C(C=C1)NC=C2",
950 "Brc1ccc2[nH]ccc2c1",
951 "Brc1ccc2[n-]ccc2c1",
952 "Indole_pyrrole",
953 ],
954 [
955 "O=c1cc[nH]cc1",
956 "O=c1cc[nH]cc1",
957 "O=c1cc[n-]cc1",
958 "Aromatic_nitrogen_protonated",
959 ],
960 ["C-N=[N+]=[N@H]", "CN=[N+]=N", "CN=[N+]=[N-]", "Azide"],
961 ["BrC(C(O)=O)CBr", "O=C(O)C(Br)CBr", "O=C([O-])C(Br)CBr", "Carboxyl"],
962 ["NC(NN=O)=N", "NC(=[NH2+])NN=O", "N=C(N)NN=O", "AmidineGuanidine1"],
963 [
964 "C(F)(F)(F)C(=O)NC(=O)C",
965 "CC(=O)NC(=O)C(F)(F)F",
966 "CC(=O)[N-]C(=O)C(F)(F)F",
967 "Imide",
968 ],
969 ["O=C(C)NC(C)=O", "CC(=O)NC(C)=O", "CC(=O)[N-]C(C)=O", "Imide2"],
970 [
971 "CC(C)(C)C(N(C)O)=O",
972 "CN(O)C(=O)C(C)(C)C",
973 "CN([O-])C(=O)C(C)(C)C",
974 "N-hydroxyamide",
975 ],
976 ["C[N+](O)=O", "C[N+](=O)O", "C[N+](=O)[O-]", "Nitro"],
977 ["O=C1C=C(O)CC1", "O=C1C=C(O)CC1", "O=C1C=C([O-])CC1", "O=C-C=C-OH"],
978 ["C1CC1OO", "OOC1CC1", "[O-]OC1CC1", "Peroxide2"],
979 ["C(=O)OO", "O=COO", "O=CO[O-]", "Peroxide1"],
980 [
981 "Brc1cc(O)cc(Br)c1",
982 "Oc1cc(Br)cc(Br)c1",
983 "[O-]c1cc(Br)cc(Br)c1",
984 "Phenol",
985 ],
986 [
987 "CC(=O)c1ccc(S)cc1",
988 "CC(=O)c1ccc(S)cc1",
989 "CC(=O)c1ccc([S-])cc1",
990 "Phenyl_Thiol",
991 ],
992 [
993 "C=CCOc1ccc(C(=O)O)cc1",
994 "C=CCOc1ccc(C(=O)O)cc1",
995 "C=CCOc1ccc(C(=O)[O-])cc1",
996 "Phenyl_carboxyl",
997 ],
998 ["COP(=O)(O)OC", "COP(=O)(O)OC", "COP(=O)([O-])OC", "Phosphate_diester"],
999 ["CP(C)(=O)O", "CP(C)(=O)O", "CP(C)(=O)[O-]", "Phosphinic_acid"],
1000 [
1001 "CC(C)OP(C)(=O)O",
1002 "CC(C)OP(C)(=O)O",
1003 "CC(C)OP(C)(=O)[O-]",
1004 "Phosphonate_ester",
1005 ],
1006 [
1007 "CC1(C)OC(=O)NC1=O",
1008 "CC1(C)OC(=O)NC1=O",
1009 "CC1(C)OC(=O)[N-]C1=O",
1010 "Ringed_imide1",
1011 ],
1012 ["O=C(N1)C=CC1=O", "O=C1C=CC(=O)N1", "O=C1C=CC(=O)[N-]1", "Ringed_imide2"],
1013 ["O=S(OC)(O)=O", "COS(=O)(=O)O", "COS(=O)(=O)[O-]", "Sulfate"],
1014 [
1015 "COc1ccc(S(=O)O)cc1",
1016 "COc1ccc(S(=O)O)cc1",
1017 "COc1ccc(S(=O)[O-])cc1",
1018 "Sulfinic_acid",
1019 ],
1020 ["CS(N)(=O)=O", "CS(N)(=O)=O", "CS([NH-])(=O)=O", "Sulfonamide"],
1021 [
1022 "CC(=O)CSCCS(O)(=O)=O",
1023 "CC(=O)CSCCS(=O)(=O)O",
1024 "CC(=O)CSCCS(=O)(=O)[O-]",
1025 "Sulfonate",
1026 ],
1027 ["CC(=O)S", "CC(=O)S", "CC(=O)[S-]", "Thioic_acid"],
1028 ["C(C)(C)(C)(S)", "CC(C)(C)S", "CC(C)(C)[S-]", "Thiol"],
1029 [
1030 "Brc1cc[nH+]cc1",
1031 "Brc1cc[nH+]cc1",
1032 "Brc1ccncc1",
1033 "Aromatic_nitrogen_unprotonated",
1034 ],
1035 [
1036 "C=C(O)c1c(C)cc(C)cc1C",
1037 "C=C(O)c1c(C)cc(C)cc1C",
1038 "C=C([O-])c1c(C)cc(C)cc1C",
1039 "Vinyl_alcohol",
1040 ],
1041 ["CC(=O)ON", "CC(=O)O[NH3+]", "CC(=O)ON", "Primary_hydroxyl_amine"],
1042 ]
1043
1044 smis_phos = [
1045 [
1046 "O=P(O)(O)OCCCC",
1047 "CCCCOP(=O)(O)O",
1048 "CCCCOP(=O)([O-])O",
1049 "CCCCOP(=O)([O-])[O-]",
1050 "Phosphate",
1051 ],
1052 [
1053 "CC(P(O)(O)=O)C",
1054 "CC(C)P(=O)(O)O",
1055 "CC(C)P(=O)([O-])O",
1056 "CC(C)P(=O)([O-])[O-]",
1057 "Phosphonate",
1058 ],
1059 ]
1060
1061 # Load the average pKa values.
1062 average_pkas = {
1063 l.split()[0].replace("*", ""): float(l.split()[3])
1064 for l in open("site_substructures.smarts")
1065 if l.split()[0] not in ["Phosphate", "Phosphonate"]
1066 }
1067 average_pkas_phos = {
1068 l.split()[0].replace("*", ""): [float(l.split()[3]), float(l.split()[6])]
1069 for l in open("site_substructures.smarts")
1070 if l.split()[0] in ["Phosphate", "Phosphonate"]
1071 }
1072
1073 print("Running Tests")
1074 print("=============")
1075 print("")
1076
1077 print("Very Acidic (pH -10000000)")
1078 print("--------------------------")
1079 print("")
1080
1081 args = {
1082 "min_ph": -10000000,
1083 "max_ph": -10000000,
1084 "pka_precision": 0.5,
1085 "smiles": "",
1086 "label_states": True,
1087 }
1088
1089 for smi, protonated, deprotonated, category in smis:
1090 args["smiles"] = smi
1091 TestFuncs.test_check(args, [protonated], ["PROTONATED"])
1092
1093 for smi, protonated, mix, deprotonated, category in smis_phos:
1094 args["smiles"] = smi
1095 TestFuncs.test_check(args, [protonated], ["PROTONATED"])
1096
1097 args["min_ph"] = 10000000
1098 args["max_ph"] = 10000000
1099
1100 print("")
1101 print("Very Basic (pH 10000000)")
1102 print("------------------------")
1103 print("")
1104
1105 for smi, protonated, deprotonated, category in smis:
1106 args["smiles"] = smi
1107 TestFuncs.test_check(args, [deprotonated], ["DEPROTONATED"])
1108
1109 for smi, protonated, mix, deprotonated, category in smis_phos:
1110 args["smiles"] = smi
1111 TestFuncs.test_check(args, [deprotonated], ["DEPROTONATED"])
1112
1113 print("")
1114 print("pH is Category pKa")
1115 print("------------------")
1116 print("")
1117
1118 for smi, protonated, deprotonated, category in smis:
1119 avg_pka = average_pkas[category]
1120
1121 args["smiles"] = smi
1122 args["min_ph"] = avg_pka
1123 args["max_ph"] = avg_pka
1124
1125 TestFuncs.test_check(args, [protonated, deprotonated], ["BOTH"])
1126
1127 for smi, protonated, mix, deprotonated, category in smis_phos:
1128 args["smiles"] = smi
1129
1130 avg_pka = average_pkas_phos[category][0]
1131 args["min_ph"] = avg_pka
1132 args["max_ph"] = avg_pka
1133
1134 TestFuncs.test_check(args, [mix, protonated], ["BOTH"])
1135
1136 avg_pka = average_pkas_phos[category][1]
1137 args["min_ph"] = avg_pka
1138 args["max_ph"] = avg_pka
1139
1140 TestFuncs.test_check(
1141 args, [mix, deprotonated], ["DEPROTONATED", "DEPROTONATED"]
1142 )
1143
1144 avg_pka = 0.5 * (
1145 average_pkas_phos[category][0] + average_pkas_phos[category][1]
1146 )
1147 args["min_ph"] = avg_pka
1148 args["max_ph"] = avg_pka
1149 args["pka_precision"] = 5 # Should give all three
1150
1151 TestFuncs.test_check(
1152 args, [mix, deprotonated, protonated], ["BOTH", "BOTH"]
1153 )
1154
1155 @staticmethod
1156 def test_check(args, expected_output, labels):
1157 """Tests most ionizable groups. The ones that can only loose or gain a single proton.
1158
1159 :param args: The arguments to pass to protonate()
1160 :param expected_output: A list of the expected SMILES-strings output.
1161 :param labels: The labels. A list containing combo of BOTH, PROTONATED,
1162 DEPROTONATED.
1163 :raises Exception: Wrong number of states produced.
1164 :raises Exception: Unexpected output SMILES.
1165 :raises Exception: Wrong labels.
1166 """
1167
1168 output = list(Protonate(args))
1169 output = [o.split() for o in output]
1170
1171 num_states = len(expected_output)
1172
1173 if len(output) != num_states:
1174 msg = (
1175 args["smiles"]
1176 + " should have "
1177 + str(num_states)
1178 + " states at at pH "
1179 + str(args["min_ph"])
1180 + ": "
1181 + str(output)
1182 )
1183 print(msg)
1184 raise Exception(msg)
1185
1186 if len(set([l[0] for l in output]) - set(expected_output)) != 0:
1187 msg = (
1188 args["smiles"]
1189 + " is not "
1190 + " AND ".join(expected_output)
1191 + " at pH "
1192 + str(args["min_ph"])
1193 + " - "
1194 + str(args["max_ph"])
1195 + "; it is "
1196 + " AND ".join([l[0] for l in output])
1197 )
1198 print(msg)
1199 raise Exception(msg)
1200
1201 if len(set([l[1] for l in output]) - set(labels)) != 0:
1202 msg = (
1203 args["smiles"]
1204 + " not labeled as "
1205 + " AND ".join(labels)
1206 + "; it is "
1207 + " AND ".join([l[1] for l in output])
1208 )
1209 print(msg)
1210 raise Exception(msg)
1211
1212 ph_range = sorted(list(set([args["min_ph"], args["max_ph"]])))
1213 ph_range_str = "(" + " - ".join("{0:.2f}".format(n) for n in ph_range) + ")"
1214 print(
1215 "(CORRECT) "
1216 + ph_range_str.ljust(10)
1217 + " "
1218 + args["smiles"]
1219 + " => "
1220 + " AND ".join([l[0] for l in output])
1221 )
1222
1223
1224 def run(**kwargs):
1225 """A helpful, importable function for those who want to call Dimorphite-DL
1226 from another Python script rather than the command line. Note that this
1227 function accepts keyword arguments that match the command-line parameters
1228 exactly. If you want to pass and return a list of RDKit Mol objects, import
1229 run_with_mol_list() instead.
1230
1231 :param **kwargs: For a complete description, run dimorphite_dl.py from the
1232 command line with the -h option.
1233 :type kwargs: dict
1234 """
1235
1236 # Run the main function with the specified arguments.
1237 main(kwargs)
1238
1239
1240 def run_with_mol_list(mol_lst, **kwargs):
1241 """A helpful, importable function for those who want to call Dimorphite-DL
1242 from another Python script rather than the command line. Note that this
1243 function is for passing Dimorphite-DL a list of RDKit Mol objects, together
1244 with command-line parameters. If you want to use only the same parameters
1245 that you would use from the command line, import run() instead.
1246
1247 :param mol_lst: A list of rdkit.Chem.rdchem.Mol objects.
1248 :type mol_lst: list
1249 :raises Exception: If the **kwargs includes "smiles", "smiles_file",
1250 "output_file", or "test" parameters.
1251 :return: A list of properly protonated rdkit.Chem.rdchem.Mol objects.
1252 :rtype: list
1253 """
1254
1255 # Do a quick check to make sure the user input makes sense.
1256 for bad_arg in ["smiles", "smiles_file", "output_file", "test"]:
1257 if bad_arg in kwargs:
1258 msg = (
1259 "You're using Dimorphite-DL's run_with_mol_list(mol_lst, "
1260 + '**kwargs) function, but you also passed the "'
1261 + bad_arg
1262 + '" argument. Did you mean to use the '
1263 + "run(**kwargs) function instead?"
1264 )
1265 print(msg)
1266 raise Exception(msg)
1267
1268 # Set the return_as_list flag so main() will return the protonated smiles
1269 # as a list.
1270 kwargs["return_as_list"] = True
1271
1272 # Having reviewed the code, it will be very difficult to rewrite it so
1273 # that a list of Mol objects can be used directly. Intead, convert this
1274 # list of mols to smiles and pass that. Not efficient, but it will work.
1275 protonated_smiles_and_props = []
1276 for m in mol_lst:
1277 props = m.GetPropsAsDict()
1278 kwargs["smiles"] = Chem.MolToSmiles(m, isomericSmiles=True)
1279 protonated_smiles_and_props.extend(
1280 [(s.split("\t")[0], props) for s in main(kwargs)]
1281 )
1282
1283 # Now convert the list of protonated smiles strings back to RDKit Mol
1284 # objects. Also, add back in the properties from the original mol objects.
1285 mols = []
1286 for s, props in protonated_smiles_and_props:
1287 m = Chem.MolFromSmiles(s)
1288 if m:
1289 for prop, val in props.items():
1290 if type(val) is int:
1291 m.SetIntProp(prop, val)
1292 elif type(val) is float:
1293 m.SetDoubleProp(prop, val)
1294 elif type(val) is bool:
1295 m.SetBoolProp(prop, val)
1296 else:
1297 m.SetProp(prop, str(val))
1298 mols.append(m)
1299 else:
1300 UtilFuncs.eprint(
1301 "WARNING: Could not process molecule with SMILES string "
1302 + s
1303 + " and properties "
1304 + str(props)
1305 )
1306
1307 return mols
1308
1309
1310 if __name__ == "__main__":
1311 main()