comparison lib/Fingerprints/PathLengthFingerprints.pm @ 0:4816e4a8ae95 draft default tip

Uploaded
author deepakjadmin
date Wed, 20 Jan 2016 09:23:18 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4816e4a8ae95
1 package Fingerprints::PathLengthFingerprints;
2 #
3 # $RCSfile: PathLengthFingerprints.pm,v $
4 # $Date: 2015/02/28 20:48:54 $
5 # $Revision: 1.39 $
6 #
7 # Author: Manish Sud <msud@san.rr.com>
8 #
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
10 #
11 # This file is part of MayaChemTools.
12 #
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
14 # the terms of the GNU Lesser General Public License as published by the Free
15 # Software Foundation; either version 3 of the License, or (at your option) any
16 # later version.
17 #
18 # MayaChemTools is distributed in the hope that it will be useful, but without
19 # any warranty; without even the implied warranty of merchantability of fitness
20 # for a particular purpose. See the GNU Lesser General Public License for more
21 # details.
22 #
23 # You should have received a copy of the GNU Lesser General Public License
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
26 # Boston, MA, 02111-1307, USA.
27 #
28
29 use strict;
30 use Carp;
31 use Exporter;
32 use TextUtil ();
33 use MathUtil ();
34 use Fingerprints::Fingerprints;
35 use Molecule;
36 use AtomTypes::AtomicInvariantsAtomTypes;
37 use AtomTypes::DREIDINGAtomTypes;
38 use AtomTypes::EStateAtomTypes;
39 use AtomTypes::FunctionalClassAtomTypes;
40 use AtomTypes::MMFF94AtomTypes;
41 use AtomTypes::SLogPAtomTypes;
42 use AtomTypes::SYBYLAtomTypes;
43 use AtomTypes::TPSAAtomTypes;
44 use AtomTypes::UFFAtomTypes;
45
# Package variables used for inheritance and symbol export...
use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);

# Inherit from the fingerprints base class and Exporter; no symbols are exported...
@ISA = ('Fingerprints::Fingerprints', 'Exporter');
@EXPORT = ();
@EXPORT_OK = ();

%EXPORT_TAGS = ('all' => [@EXPORT, @EXPORT_OK]);

# File scoped class name; its value is assigned by _InitializeClass...
my $ClassName;
_InitializeClass();

# Use the named method whenever an object is used as a string...
use overload '""' => 'StringifyPathLengthFingerprints';
60
# Class constructor...
#
# Creates the object using the parent class constructor, assigns default values
# to object data, and applies the specified names and values. Returns the new
# object.
#
sub new {
  my($Class, %NamesAndValues) = @_;

  # Create the base object and bless it into the requested class...
  my $Object = $Class->SUPER::new();
  my $Package = ref($Class) || $Class;
  bless $Object, $Package;

  # Defaults go in first; specified names and values are applied afterwards...
  $Object->_InitializePathLengthFingerprints();
  $Object->_InitializePathLengthFingerprintsProperties(%NamesAndValues);

  return $Object;
}
74
# Initialize object data...
#
# Assigns a default value to each object data item used during generation of
# path length fingerprints.
#
sub _InitializePathLengthFingerprints {
  my($This) = @_;

  my %ScalarDefaults = (
    # Type of fingerprint to generate:
    #
    #   PathLengthBits - A bit vector indicating presence/absence of atom paths
    #   PathLengthCount - A vector containing count of atom paths
    #
    'Type' => '',

    # Type of vector: FingerprintsBitVector or FingerprintsVector
    'VectorType' => '',

    # Default, minimum, and maximum size. Although any arbitrary size can be
    # specified, the bit vector used to store bits works on a vector size which
    # is a power of 2; additional bits are automatically added and cleared.
    'Size' => 1024,
    'MinSize' => 32,
    'MaxSize' => 2**32,

    # Minimum and maximum path lengths to use for fingerprints generation...
    'MinLength' => 1,
    'MaxLength' => 8,

    # Number of bits to set for each atom path for FingerprintsBitVector...
    'NumOfBitsToSetPerPath' => 1,

    # Atom identifier type to use for path atoms during fingerprints generation.
    #
    # Currently supported values are: AtomicInvariantsAtomTypes, DREIDINGAtomTypes,
    # EStateAtomTypes, FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes,
    # SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes
    'AtomIdentifierType' => '',

    # For molecules containing rings, atom paths starting from each atom can be
    # traversed in four different ways:
    #
    #   . Atom paths without any rings and sharing of bonds in traversed paths.
    #   . Atom paths containing rings and without any sharing of bonds in traversed paths
    #   . All possible atom paths without any rings and sharing of bonds in traversed paths
    #   . All possible atom paths containing rings and with sharing of bonds in traversed paths.
    #
    # Atom path traversal is terminated at the last ring atom. For molecules
    # containing no rings, the first two and the last two types are equivalent.
    #
    # AllowSharedBonds and AllowRings allow generation of different types of
    # paths to be used for fingerprints generation.
    #
    # In addition to atom symbols, bond symbols are also used to generate a
    # string for atom paths. These atom path strings are hashed to a 32 bit
    # integer key which in turn is used as a seed for random number generation
    # in the range of 1 to fingerprint size for setting the corresponding bit in
    # the bit vector. UseBondSymbols controls whether bond symbols take part in
    # atom path strings and consequently in fingerprints.
    #
    # Combination of AllowSharedBonds, AllowRings, and UseBondSymbols allows
    # generation of 8 different types of path length fingerprints:
    #
    #   AllowSharedBonds AllowRings UseBondSymbols PathLengthFingerprintsType
    #
    #   No               No         Yes            AtomPathsNoCyclesWithBondSymbols
    #   No               Yes        Yes            AtomPathsWithCyclesWithBondSymbols
    #
    #   Yes              No         Yes            AllAtomPathsNoCyclesWithBondSymbols
    #   Yes              Yes        Yes            AllAtomPathsWithCyclesWithBondSymbols [ DEFAULT ]
    #
    #   No               No         No             AtomPathsNoCyclesNoBondSymbols
    #   No               Yes        No             AtomPathsWithCyclesNoBondSymbols
    #
    #   Yes              No         No             AllAtomPathsNoCyclesNoBondSymbols
    #   Yes              Yes        No             AllAtomPathsWithCyclesNoBondSymbols
    #
    # By default: bonds already traversed may be shared, rings are included in
    # paths, and bond symbols are included in atom path strings.
    'AllowSharedBonds' => 1,
    'AllowRings' => 1,
    'UseBondSymbols' => 1,

    # By default only structurally unique atom paths are used for generation of
    # atom path strings...
    'UseUniquePaths' => 1,

    # Random number generator to use during generation of fingerprints
    # bit-vector string: Perl CORE::rand or MayaChemTools MathUtil::random.
    #
    # The random number generator implemented in MayaChemTools is a variant of
    # linear congruential generator (LCG) as described by Miller et al. [ Ref 120 ].
    # It is also referred to as Lehmer random number generator or Park-Miller
    # random number generator.
    #
    # Unlike Perl's core function rand, MathUtil::random generates consistent
    # random values across different platforms for a specific random seed and
    # leads to generation of portable fingerprints bit-vector strings.
    'UsePerlCoreRandom' => 1,

    # Reference to all the atom paths up to the specified path length...
    'AtomPathsRef' => '',
  );
  @{$This}{keys %ScalarDefaults} = values %ScalarDefaults;

  # Hash valued data which starts out empty:
  #
  #   AssignedAtomTypes - Atom types assigned to atoms
  #   BondSymbols - Bond symbols map for bonded atom IDs used in atom path strings
  #   UniqueLinearAtomPathsIDs, UniqueCyclicAtomPathsIDs - Path atom IDs used to
  #     remove duplicate paths
  #
  for my $Name ('AssignedAtomTypes', 'BondSymbols', 'UniqueLinearAtomPathsIDs', 'UniqueCyclicAtomPathsIDs') {
    %{$This->{$Name}} = ();
  }

  # Bond symbols to use during generation of atom path strings...
  %{$This->{BondOrderToSymbol}} = ('1' => '', '1.5' => ':', '2' => '=', '3' => '#');

  # Atom paths strings created using specified atom types and bond symbols...
  %{$This->{AtomPathsStrings}} = ();
}
202
# Initialize class variables...
sub _InitializeClass {
  # Keep the package name around for use in error and warning messages...
  return $ClassName = __PACKAGE__;
}
208
# Initialize object properties...
#
# Applies each specified name and value pair by calling the corresponding
# Set<Name> method, makes sure that Molecule, Type and AtomIdentifierType were
# specified and that any specified Size is a power of 2, and then sets up the
# vector corresponding to the fingerprints type. Returns the object.
#
sub _InitializePathLengthFingerprintsProperties {
  my($This, %NamesAndValues) = @_;

  while (my($Name, $Value) = each %NamesAndValues) {
    my $SetterName = "Set${Name}";
    $This->$SetterName($Value);
  }

  # Mandatory names...
  croak "Error: ${ClassName}->New: Object can't be instantiated without specifying molecule..." if !exists $NamesAndValues{Molecule};
  croak "Error: ${ClassName}->New: Object can't be instantiated without specifying Type..." if !exists $NamesAndValues{Type};
  croak "Error: ${ClassName}->New: Object can't be instantiated without specifying AtomIdentifierType..." if !exists $NamesAndValues{AtomIdentifierType};

  # Size is optional; when specified, it must be power of 2...
  if (exists($NamesAndValues{Size}) && !TextUtil::IsNumberPowerOfNumber($NamesAndValues{Size}, 2)) {
    croak "Error: ${ClassName}->New: Specified size value, $NamesAndValues{Size}, must be power of 2...";
  }

  # Set up the vector which goes along with the type...
  if ($This->{Type} =~ /^PathLengthBits$/i) {
    $This->_InitializePathLengthBits();
    return $This;
  }
  if ($This->{Type} =~ /^PathLengthCount$/i) {
    $This->_InitializePathLengthCount();
    return $This;
  }

  croak "Error: ${ClassName}->_InitializePathLengthFingerprintsProperties: Unknown PathLength type: $This->{Type}; Supported PathLength type : PathLengthBits or PathLengthCount......";
}
251
# Initialize PathLength bits...
#
# Marks the vector type as a fingerprints bit vector and initializes that
# vector. Returns the object.
#
sub _InitializePathLengthBits {
  my $This = shift;

  $This->{VectorType} = 'FingerprintsBitVector';
  $This->_InitializeFingerprintsBitVector();

  return $This;
}
264
# Initialize PathLength key count...
#
# Marks the vector type as a fingerprints vector holding numerical values and
# initializes that vector. Returns the object.
#
sub _InitializePathLengthCount {
  my $This = shift;

  # Vector type along with the type of its values...
  @{$This}{'VectorType', 'FingerprintsVectorType'} = ('FingerprintsVector', 'NumericalValues');
  $This->_InitializeFingerprintsVector();

  return $This;
}
278
# Set type...
#
# The type may be set only once. Supported values, matched irrespective of the
# case, are PathLengthBits and PathLengthCount; the stored value always uses
# the spelling shown here. Returns the object.
#
sub SetType {
  my($This, $Type) = @_;

  croak "Error: ${ClassName}->SetType: Can't change type: It's already set..." if $This->{Type};

  for my $SupportedType ('PathLengthBits', 'PathLengthCount') {
    if ($Type =~ /^$SupportedType$/i) {
      $This->{Type} = $SupportedType;
      return $This;
    }
  }

  croak "Error: ${ClassName}->SetType: Unknown PathLength keys: $Type; Supported PathLength types: PathLengthBits or PathLengthCount...";
}
299
# Disable vector type change...
#
# Always croaks: this class doesn't let callers set the vector type.
#
sub SetVectorType {
  my($This, $Type) = @_;

  croak "Error: ${ClassName}->SetVectorType: Can't change vector type...";
}
309
# Disable fingerprints vector type change...
#
# Always croaks: this class doesn't let callers set the fingerprints vector type.
#
sub SetFingerprintsVectorType {
  my($This, $Type) = @_;

  croak "Error: ${ClassName}->SetFingerprintsVectorType: Can't change fingerprints vector type...";
}
319
# Set atom identifier type to use for path length atom identifiers...
#
# The value must be one of the supported atom types, matched irrespective of
# the case, and it may be set only once. The value is stored as specified and
# atom identifier type information is initialized afterwards. Returns the object.
#
sub SetAtomIdentifierType {
  my($This, $IdentifierType) = @_;

  # Validity of the value is checked before the "already set" check...
  unless ($IdentifierType =~ /^(AtomicInvariantsAtomTypes|DREIDINGAtomTypes|EStateAtomTypes|FunctionalClassAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i) {
    croak "Error: ${ClassName}->SetAtomIdentifierType: Specified value, $IdentifierType, for AtomIdentifierType is not vaild. Supported types in current release of MayaChemTools: AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, and UFFAtomTypes.";
  }

  croak "Error: ${ClassName}->SetAtomIdentifierType: Can't change atom identifier type: It's already set..." if $This->{AtomIdentifierType};

  $This->{AtomIdentifierType} = $IdentifierType;
  $This->_InitializeAtomIdentifierTypeInformation();

  return $This;
}
340
# Set minimum path length...
#
# Croaks unless the value is a positive integer. Returns the object.
#
sub SetMinLength {
  my($This, $Value) = @_;

  unless (TextUtil::IsPositiveInteger($Value)) {
    croak "Error: ${ClassName}->SetMinLength: MinLength value, $Value, is not valid: It must be a positive integer...";
  }

  $This->{MinLength} = $Value;
  return $This;
}
353
# Set maximum path length...
#
# Croaks unless the value is a positive integer. Returns the object.
#
sub SetMaxLength {
  my($This, $Value) = @_;

  unless (TextUtil::IsPositiveInteger($Value)) {
    croak "Error: ${ClassName}->SetMaxLength: MaxLength value, $Value, is not valid: It must be a positive integer...";
  }

  $This->{MaxLength} = $Value;
  return $This;
}
366
# Set number of bits to set for each path...
#
# Croaks unless the value is a positive integer. Returns the object.
#
sub SetNumOfBitsToSetPerPath {
  my($This, $Value) = @_;

  unless (TextUtil::IsPositiveInteger($Value)) {
    croak "Error: ${ClassName}->SetNumOfBitsToSetPerPath: NumOfBitsToSetPerPath value, $Value, is not valid: It must be a positive integer...";
  }

  $This->{NumOfBitsToSetPerPath} = $Value;
  return $This;
}
379
# Get fingerprints description...
#
# Returns the explicitly set description when there is one. Otherwise, the
# description is put together from the type, atom identifier type, and minimum
# and maximum path lengths.
#
sub GetDescription {
  my($This) = @_;

  # An explicitly set description takes precedence...
  return $This->{Description} if exists $This->{Description};

  my $Description = "$This->{Type}:$This->{AtomIdentifierType}:MinLength$This->{MinLength}:MaxLength$This->{MaxLength}";

  return $Description;
}
394
# Generate path length fingerprints...
#
# Steps:
#
#   . Make sure MinLength doesn't exceed MaxLength; croak otherwise
#   . Cache molecule data
#   . Assign atom types to all atoms; on failure, warn and stop
#   . Set up the bond symbols map when bond symbols are in use
#   . Generate atom paths up to MaxLength and their atom path strings
#   . Set final fingerprints
#   . Clear cached molecule data
#
# The cached molecule data is cleared on every path which sets it up, including
# the early return taken when atom types assignment doesn't succeed; otherwise
# the cache would be left in the object after an unsuccessful call.
#
# Returns the object in all non-fatal cases.
#
sub GenerateFingerprints {
  my($This) = @_;

  if ($This->{MinLength} > $This->{MaxLength}) {
    croak "Error: ${ClassName}->GenerateFingerprints: No fingerpritns generated: MinLength, $This->{MinLength}, must be <= MaxLength, $This->{MaxLength}...";
  }

  # Cache appropriate molecule data...
  $This->_SetupMoleculeDataCache();

  # Assign atom types to all atoms...
  if (!$This->_AssignAtomTypes()) {
    carp "Warning: ${ClassName}->GenerateFingerprints: $This->{AtomIdentifierType} fingerprints generation didn't succeed: Couldn't assign valid $This->{AtomIdentifierType} to all atoms...";

    # Don't leave cached molecule data behind on failure...
    $This->_ClearMoleculeDataCache();

    return $This;
  }

  # Setup bond symbol map...
  if ($This->{UseBondSymbols}) {
    $This->_InitializeBondSymbols();
  }

  # Generate appropriate atom paths...
  $This->_GenerateAtomPathsUpToMaxLength();

  # Initialize atom path strings...
  $This->_InitializeAtomPathsStrings();

  # Generate appropriate atom path strings for unique atom paths...
  $This->_GenerateAtomPathsStrings();

  # Set final fingerprints...
  $This->_SetFinalFingerprints();

  # Clear cached molecule data...
  $This->_ClearMoleculeDataCache();

  return $This;
}
435
# Assign appropriate atom types to all atoms...
#
# An atom types object corresponding to AtomIdentifierType is created for the
# molecule and asked to assign atom types. The atom type of each cached atom is
# then collected into AssignedAtomTypes, keyed by atom ID.
#
# Returns the object on success, and undef when the atom types object reports
# that the assignment wasn't successful. Croaks for an unknown atom identifier
# type.
#
sub _AssignAtomTypes {
  my($This) = @_;

  %{$This->{AssignedAtomTypes}} = ();

  my $IgnoreHydrogens = 0;

  # Constructor names and values used, in addition to Molecule, by each
  # supported atom types class...
  my %AdditionalNamesAndValues = (
    'AtomicInvariantsAtomTypes' => ['IgnoreHydrogens' => $IgnoreHydrogens, 'AtomicInvariantsToUse' => $This->{AtomicInvariantsToUse}],
    'DREIDINGAtomTypes' => ['IgnoreHydrogens' => $IgnoreHydrogens],
    'EStateAtomTypes' => ['IgnoreHydrogens' => $IgnoreHydrogens],
    'FunctionalClassAtomTypes' => ['IgnoreHydrogens' => $IgnoreHydrogens, 'FunctionalClassesToUse' => $This->{FunctionalClassesToUse}],
    'MMFF94AtomTypes' => ['IgnoreHydrogens' => $IgnoreHydrogens],
    'SLogPAtomTypes' => ['IgnoreHydrogens' => $IgnoreHydrogens],
    'SYBYLAtomTypes' => ['IgnoreHydrogens' => $IgnoreHydrogens],
    'TPSAAtomTypes' => ['IgnorePhosphorus' => 0, 'IgnoreSulfur' => 0],
    'UFFAtomTypes' => ['IgnoreHydrogens' => $IgnoreHydrogens],
  );

  # Find the supported type matching the specified type irrespective of the case...
  my $MatchedTypeName;
  TYPENAME: for my $TypeName (sort keys %AdditionalNamesAndValues) {
    if ($This->{AtomIdentifierType} =~ /^$TypeName$/i) {
      $MatchedTypeName = $TypeName;
      last TYPENAME;
    }
  }
  if (!defined $MatchedTypeName) {
    croak "Error: ${ClassName}->_AssignAtomTypes: Unknown atom indentifier type $This->{AtomIdentifierType}...";
  }

  my $AtomTypesClass = "AtomTypes::${MatchedTypeName}";
  my $SpecifiedAtomTypes = $AtomTypesClass->new('Molecule' => $This->{Molecule}, @{$AdditionalNamesAndValues{$MatchedTypeName}});

  # Assign atom types and make sure the assignment is successful...
  $SpecifiedAtomTypes->AssignAtomTypes();
  if (!$SpecifiedAtomTypes->IsAtomTypesAssignmentSuccessful()) {
    return undef;
  }

  # Collect assigned atom types...
  for my $Atom (@{$This->{Atoms}}) {
    my $AtomID = $Atom->GetID();
    $This->{AssignedAtomTypes}{$AtomID} = $SpecifiedAtomTypes->GetAtomType($Atom);
  }

  return $This;
}
511
# Setup bond symbol map for atoms to speed up generation of path length identifiers
# during fingerprints generation...
#
# The map is cleared first and left empty when bond symbols are not in use.
# Otherwise, for each bond in the molecule, its symbol is stored under
# BondSymbols{SmallerAtomID}{LargerAtomID}. The symbol is ':' for aromatic
# bonds, the BondOrderToSymbol entry for the bond order when there is one, and
# the bond order itself as the last resort. Returns the object.
#
sub _InitializeBondSymbols {
  my($This) = @_;

  %{$This->{BondSymbols}} = ();

  return $This unless $This->{UseBondSymbols};

  for my $Bond ($This->{Molecule}->GetBonds()) {
    my $BondOrder = $Bond->GetBondOrder();

    my $BondSymbol;
    if ($Bond->IsAromatic()) {
      $BondSymbol = ':';
    }
    elsif (exists $This->{BondOrderToSymbol}{$BondOrder}) {
      $BondSymbol = $This->{BondOrderToSymbol}{$BondOrder};
    }
    else {
      $BondSymbol = $BondOrder;
    }

    # Key the symbol by the smaller atom ID followed by the larger one...
    my($FirstAtom, $SecondAtom) = $Bond->GetAtoms();
    my @BondAtomIDs = ($FirstAtom->GetID(), $SecondAtom->GetID());
    @BondAtomIDs = reverse @BondAtomIDs if $BondAtomIDs[0] > $BondAtomIDs[1];

    $This->{BondSymbols}{$BondAtomIDs[0]}{$BondAtomIDs[1]} = $BondSymbol;
  }

  return $This;
}
541
# Get appropriate atom paths with length up to MaxLength...
#
# The molecule is asked for all atom paths when sharing of bonds is allowed,
# and for atom paths otherwise; MaxLength and AllowRings are passed along in
# both cases. The result is stored in AtomPathsRef. Returns the object.
#
sub _GenerateAtomPathsUpToMaxLength {
  my($This) = @_;

  my $MethodName = $This->{AllowSharedBonds} ? 'GetAllAtomPathsWithLengthUpto' : 'GetAtomPathsWithLengthUpto';

  $This->{AtomPathsRef} = $This->{Molecule}->$MethodName($This->{MaxLength}, $This->{AllowRings});

  return $This;
}
562
# Initialize atom paths strings at various path length levels...
#
# AtomPathsStrings is cleared and given an empty hash for each path length from
# MinLength to MaxLength. Returns the object.
#
sub _InitializeAtomPathsStrings {
  my($This) = @_;

  %{$This->{AtomPathsStrings}} = ();

  my @PathLengths = ($This->{MinLength} .. $This->{MaxLength});
  for my $PathLength (@PathLengths) {
    $This->{AtomPathsStrings}{$PathLength} = {};
  }

  return $This;
}
577
# Generate appropriate atom path strings for atom paths...
#
# Atom paths shorter than MinLength are skipped. The remaining ones are handed
# over to the unique path method when UseUniquePaths is set and directly to the
# atom path string method otherwise. Returns the object.
#
# The second parameter is accepted but not used: atom paths always come from
# AtomPathsRef.
#
sub _GenerateAtomPathsStrings {
  my($This, $PathAtomsRef) = @_;

  my $MinPathLength = $This->{MinLength};
  my $MethodName = $This->{UseUniquePaths} ? '_GenerateAtomPathStringUsingUniquePath' : '_GenerateAtomPathString';

  ATOMPATH: for my $AtomPathRef (@{$This->{AtomPathsRef}}) {
    next ATOMPATH if scalar(@{$AtomPathRef}) < $MinPathLength;

    $This->$MethodName($AtomPathRef);
  }

  return $This;
}
601
# Generate atom path string using unique path...
#
# Paths containing a cycle are handled separately from linear paths; a path is
# checked for a cycle only when rings are allowed. Returns the object.
#
sub _GenerateAtomPathStringUsingUniquePath {
  my($This, $PathAtomsRef) = @_;

  my $ContainsCycle = $This->{AllowRings} && $This->_DoesAtomPathContainsCycle($PathAtomsRef);

  my $MethodName = $ContainsCycle ? '_GenerateAtomPathStringUsingUniquePathContainingCycle' : '_GenerateAtomPathStringUsingUniqueLinearPath';
  $This->$MethodName($PathAtomsRef);

  return $This;
}
615
# Generate atom path string for specified path containing no cycle...
#
# Nothing is generated for a path which isn't a structurally unique linear
# path. Returns the object.
#
sub _GenerateAtomPathStringUsingUniqueLinearPath {
  my($This, $PathAtomsRef) = @_;

  if ($This->_IsUniqueLinearAtomPath($PathAtomsRef)) {
    $This->_GenerateAtomPathString($PathAtomsRef);
  }

  return $This;
}
630
# Is it a structurally unique linear path?
#
# For a path to be structurally unique, its set of atom IDs must be different
# from the atom IDs of any earlier path. In order to generate an atom path ID
# which is invariant of the atom order in the molecule, atom IDs are sorted
# numerically before generating the path ID.
#
# Notes:
#   . Atom path ID doesn't reflect the order of atoms in the atom path.
#   . A path ID seen for the first time is recorded in UniqueLinearAtomPathsIDs.
#
# Returns 1 for a unique path and 0 otherwise.
#
sub _IsUniqueLinearAtomPath {
  my($This, $PathAtomsRef) = @_;

  my $AtomPathID = join '-', sort { $a <=> $b } map { $_->GetID() } @{$PathAtomsRef};

  return 0 if exists $This->{UniqueLinearAtomPathsIDs}{$AtomPathID};

  # First time around for this path ID...
  $This->{UniqueLinearAtomPathsIDs}{$AtomPathID} = 1;

  return 1;
}
657
# Generate atom path string for specified path containing a cycle...
#
# Nothing is generated for a path which isn't a structurally unique path
# containing a cycle. A cycle closed at the first path atom is treated as a
# path cycle; otherwise, the path is treated as a path containing a cycle.
# Returns the object.
#
sub _GenerateAtomPathStringUsingUniquePathContainingCycle {
  my($This, $PathAtomsRef) = @_;

  return $This unless $This->_IsUniqueAtomPathContainingCycle($PathAtomsRef);

  my($ClosingAtomIndex) = $This->_GetAtomPathCycleClosingAtomIndex($PathAtomsRef);

  if ($ClosingAtomIndex != 0) {
    $This->_GenerateUniqueAtomPathStringForPathContainingCycle($PathAtomsRef, $ClosingAtomIndex);
  }
  else {
    $This->_GenerateUniqueAtomPathStringForPathCycle($PathAtomsRef);
  }

  return $This;
}
680
# Generate a unique atom path string for a cyclic path by generating atom path
# strings for all possible paths in the cycle and keeping the lexicographically
# smallest one.
#
# Although all the paths enumerated during atom path string generation are also
# present in the initial paths list, the structural uniqueness check is expected
# to detect them earlier, so this method ends up being invoked only once, for
# the first cyclic path.
#
# For atom paths containing the same atom types and bond symbols, atom path
# strings would be the same for all the paths; the atom path string is
# generated directly for the specified path in that case.
#
sub _GenerateUniqueAtomPathStringForPathCycle {
  my($This, $PathAtomsRef) = @_;

  if ($This->_AreAllPathAtomsSymbolsSame($PathAtomsRef) && $This->_AreAllPathBondSymbolsSame($PathAtomsRef)) {
    return $This->_GenerateAtomPathString($PathAtomsRef);
  }

  my $PathLength = scalar @{$PathAtomsRef};

  # The last path atom repeats the first one; it's left out during rotation...
  my $LastCycleAtomIndex = $PathLength - 2;

  my $SmallestAtomPathString;
  for my $StartIndex (0 .. $LastCycleAtomIndex) {
    # Start the cycle at the current atom, wrap around to the atoms before it,
    # and complete the cycle by adding the start atom as the last atom...
    my @CyclePathAtoms = (
      @{$PathAtomsRef}[$StartIndex .. $LastCycleAtomIndex],
      @{$PathAtomsRef}[0 .. ($StartIndex - 1)],
      $PathAtomsRef->[$StartIndex],
    );

    my $AtomPathSymbolsRef = $This->_GenerateAtomPathSymbols(\@CyclePathAtoms);

    # Pick the smaller of forward and reverse atom path strings...
    my $ForwardString = join '', @{$AtomPathSymbolsRef};
    my $ReverseString = join '', reverse @{$AtomPathSymbolsRef};
    my $CandidateString = ($ReverseString le $ForwardString) ? $ReverseString : $ForwardString;

    if (!defined($SmallestAtomPathString) || $CandidateString le $SmallestAtomPathString) {
      $SmallestAtomPathString = $CandidateString;
    }
  }

  # No cycle start positions at all: fall back to an empty atom path string...
  $SmallestAtomPathString = '' unless defined $SmallestAtomPathString;

  # Count the final atom path string at its path length level...
  $This->{AtomPathsStrings}{$PathLength}{$SmallestAtomPathString} += 1;

  return $This;
}
771
#
# Generate a unique atom path string for paths containing a cycle closed by
# the specified atom index and the last atom index.
#
# The following methodology is used to generate an atom path string which is
# independent of initial atom ordering:
#   . Collect path atoms from the first atom to the atom before the first cycle
#     closing atom.
#   . Generate atom path string for atoms from first cycle closing atom index to
#     the last path atom in both forward and reverse order, and select the order
#     corresponding to the lexicographically smaller atom path string.
#   . Combine atoms from the first step with atoms from the second step to
#     generate the final atom path string.
#
sub _GenerateUniqueAtomPathStringForPathContainingCycle {
  my($This, $PathAtomsRef, $CycleClosingAtomIndex) = @_;

  my $PathLength = scalar @{$PathAtomsRef};
  my $LastIndex = $PathLength - 1;

  # Split path atoms into linear and cyclic parts...
  my @LinearPartPathAtoms = @{$PathAtomsRef}[0 .. ($CycleClosingAtomIndex - 1)];
  my @CyclicPartPathAtoms = @{$PathAtomsRef}[$CycleClosingAtomIndex .. $LastIndex];

  # Figure out which direction of the cyclic part is lexicographically smaller...
  my $CyclicPartSymbolsRef = $This->_GenerateAtomPathSymbols(\@CyclicPartPathAtoms);
  my $ForwardCyclicPartString = join '', @{$CyclicPartSymbolsRef};
  my $ReverseCyclicPartString = join '', reverse @{$CyclicPartSymbolsRef};

  my @PathAtoms = @LinearPartPathAtoms;
  if ($ReverseCyclicPartString le $ForwardCyclicPartString) {
    push @PathAtoms, reverse @CyclicPartPathAtoms;
  }
  else {
    push @PathAtoms, @CyclicPartPathAtoms;
  }

  # Generate the final atom path string and count it at its path length level...
  my $AtomPathString = join '', @{$This->_GenerateAtomPathSymbols(\@PathAtoms)};

  $This->{AtomPathsStrings}{$PathLength}{$AtomPathString} += 1;

  return $This;
}
839
# Does atom path contain a cycle?
#
# For an atom path to contain a cycle, it must satisfy the following conditions:
#   . Path length >= 3
#   . Last atom ID is equal to the atom ID of some earlier atom in the path
#
# Returns 1 when the path contains a cycle and 0 otherwise.
#
sub _DoesAtomPathContainsCycle {
  my($This, $PathAtomsRef) = @_;

  my @PathAtoms = @{$PathAtomsRef};
  return 0 if scalar(@PathAtoms) <= 2;

  my $LastAtom = pop @PathAtoms;
  my $LastAtomID = $LastAtom->GetID();

  # Look for an earlier atom with the same ID as the last atom...
  for my $Atom (@PathAtoms) {
    return 1 if $Atom->GetID() == $LastAtomID;
  }

  return 0;
}
873
# Get atom path cycle closing atom index...
#
# Returns the index of the first path atom, other than the last one, whose ID
# is the same as the ID of the last path atom; undef is returned when there is
# no such atom.
#
sub _GetAtomPathCycleClosingAtomIndex {
  my($This, $PathAtomsRef) = @_;

  my $LastAtomIndex = $#{$PathAtomsRef};
  my $LastAtomID = $PathAtomsRef->[$LastAtomIndex]->GetID();

  # Walk the path from its start looking for the ID of the last atom...
  my $AtomIndex = 0;
  while ($AtomIndex < $LastAtomIndex) {
    if ($PathAtomsRef->[$AtomIndex]->GetID() == $LastAtomID) {
      return $AtomIndex;
    }
    $AtomIndex++;
  }

  return undef;
}
896
# Is it a structurally unique path containing a cycle?
#
# For atom paths containing cycles, the last atom ID is either equal to the
# first atom ID or to some other atom ID besides itself.
#
# In order to determine whether a path is structurally unique independent of
# initial atom ordering, the following methodology is used:
#
#   . For paths with the same first and last atom IDs:
#       . Remove the last atom ID from atom path
#       . Sort atom IDs in the path
#       . Add first atom ID from the sorted list to the end of list to complete the cycle
#       . Generate an atom path ID
#       . Use final path ID to track uniqueness of path containing cycle.
#
#   . For paths with last atom ID equal to some other atom ID besides itself:
#       . Sort atom IDs in atom path
#       . Generate atom path ID and use it to track uniqueness of atom paths.
#
# A path ID seen for the first time is recorded in UniqueCyclicAtomPathsIDs.
# Returns 1 for a unique path and 0 otherwise.
#
sub _IsUniqueAtomPathContainingCycle {
  my($This, $PathAtomsRef) = @_;

  my @PathAtomIDs = map { $_->GetID() } @{$PathAtomsRef};

  my $FirstAtomID = $PathAtomsRef->[0]->GetID();
  my $LastAtomID = $PathAtomsRef->[scalar(@{$PathAtomsRef}) - 1]->GetID();
  my $StartsAndEndsAtSameAtom = ($FirstAtomID == $LastAtomID);

  # Leave out the repeated last atom ID before sorting and complete the cycle
  # again afterwards using the smallest atom ID...
  pop @PathAtomIDs if $StartsAndEndsAtSameAtom;
  my @SortedPathAtomIDs = sort { $a <=> $b } @PathAtomIDs;
  push @SortedPathAtomIDs, $SortedPathAtomIDs[0] if $StartsAndEndsAtSameAtom;

  my $AtomPathID = join '-', @SortedPathAtomIDs;

  return 0 if exists $This->{UniqueCyclicAtomPathsIDs}{$AtomPathID};

  # First time around for this path ID...
  $This->{UniqueCyclicAtomPathsIDs}{$AtomPathID} = 1;

  return 1;
}
951
# Generate atom path string for specified atom path and count it...
#
# The path symbols obtained from _GenerateAtomPathSymbols are joined in forward
# and in reverse order. When either string is already tracked in AtomPathsStrings
# for this path length (forward string checked first), its count goes up by one.
# Otherwise the lexicographically smaller of the two strings is stored with a
# count of 1. Returns the object.
#
sub _GenerateAtomPathString {
  my($This, $PathAtomsRef) = @_;

  my $NumOfPathAtoms = scalar @{$PathAtomsRef};
  my $SymbolsRef = $This->_GenerateAtomPathSymbols($PathAtomsRef);

  my $ForwardString = join '', @{$SymbolsRef};
  my $BackwardString = join '', reverse @{$SymbolsRef};

  # Bump the count of an already known path, trying forward direction first...
  for my $KnownCandidate ($ForwardString, $BackwardString) {
    if (exists $This->{AtomPathsStrings}{$NumOfPathAtoms}{$KnownCandidate}) {
      $This->{AtomPathsStrings}{$NumOfPathAtoms}{$KnownCandidate} += 1;
      return $This;
    }
  }

  # New path: track it under the lexicographically smaller string...
  my $PathID = ($ForwardString le $BackwardString) ? $ForwardString : $BackwardString;
  $This->{AtomPathsStrings}{$NumOfPathAtoms}{$PathID} = 1;

  return $This;
}
989
# Are atom types for all path atoms same?
#
# Compares, as strings, the assigned atom type of every atom in the path with
# the assigned atom type of the first atom. Returns 1 when all of them match
# and 0 as soon as a different atom type is found.
#
sub _AreAllPathAtomsSymbolsSame {
  my($This, $PathAtomsRef) = @_;

  my($LeadAtom, @RemainingAtoms) = @{$PathAtomsRef};
  my $ReferenceAtomType = $This->{AssignedAtomTypes}{$LeadAtom->GetID()};

  for my $PathAtom (@RemainingAtoms) {
    return 0 unless $This->{AssignedAtomTypes}{$PathAtom->GetID()} eq $ReferenceAtomType;
  }

  return 1;
}
1009
# Are bond symbols for all path bonds same?
#
# Parameters:
#   $This         - fingerprints object; UseBondSymbols and the BondSymbols
#                   lookup, keyed as {LowerAtomID}{HigherAtomID}, are read.
#   $PathAtomsRef - reference to the ordered list of atom objects in the path.
#
# Returns 1 when bond symbols are not in use, when the path has fewer than two
# atoms and consequently contains no bonds, or when every bond along the path
# has the same symbol as the first bond. Returns 0 otherwise.
#
sub _AreAllPathBondSymbolsSame {
  my($This, $PathAtomsRef) = @_;
  my($Index, $AtomID, $BondedAtomID, $BondAtomID1, $BondAtomID2, $FirstBondSymbol, $BondSymbol);

  # During no usage of bond symbols, just ignore them and assume they are same...
  if (!$This->{UseBondSymbols}) {
    return 1;
  }

  # A path without at least two atoms has no bonds to compare; without this
  # check the lookup of the second atom would call GetID on an undefined value...
  if (@{$PathAtomsRef} < 2) {
    return 1;
  }

  for $Index (0 .. ($#{$PathAtomsRef} - 1)) {
    $AtomID = $PathAtomsRef->[$Index]->GetID();
    $BondedAtomID = $PathAtomsRef->[$Index + 1]->GetID();

    # Bond symbols are stored with the smaller atom ID as the first key...
    ($BondAtomID1, $BondAtomID2) = ($AtomID < $BondedAtomID) ? ($AtomID, $BondedAtomID) : ($BondedAtomID, $AtomID);
    $BondSymbol = $This->{BondSymbols}{$BondAtomID1}{$BondAtomID2};

    if ($Index == 0) {
      # First bond is the reference for all other bonds...
      $FirstBondSymbol = $BondSymbol;
      next;
    }

    if ($BondSymbol ne $FirstBondSymbol) {
      return 0;
    }
  }
  return 1;
}
1040
# Generate atom path symbols...
#
# Returns a reference to a list of symbols describing the path. For a path with
# exactly one atom, or when UseBondSymbols is off, the list holds only the
# assigned atom types in path order. Otherwise it alternates atom types and
# bond symbols: first atom type, then for each following atom the symbol of
# the bond leading to it and its atom type. Bond symbols are looked up in
# BondSymbols with the smaller atom ID as the first key.
#
sub _GenerateAtomPathSymbols {
  my($This, $PathAtomsRef) = @_;

  # Atom types alone are enough for single atom paths or during no usage of bond symbols...
  if (@{$PathAtomsRef} == 1 || !$This->{UseBondSymbols}) {
    return [ map { $This->{AssignedAtomTypes}{$_->GetID()} } @{$PathAtomsRef} ];
  }

  # Start with atom type of first atom in path...
  my $PreviousAtomID = $PathAtomsRef->[0]->GetID();
  my @PathSymbols = ($This->{AssignedAtomTypes}{$PreviousAtomID});

  for my $Position (1 .. $#{$PathAtomsRef}) {
    my $CurrentAtomID = $PathAtomsRef->[$Position]->GetID();

    my($LowAtomID, $HighAtomID) = ($PreviousAtomID < $CurrentAtomID)
      ? ($PreviousAtomID, $CurrentAtomID)
      : ($CurrentAtomID, $PreviousAtomID);

    # Bond symbol followed by atom type of the atom it leads to...
    push @PathSymbols, $This->{BondSymbols}{$LowAtomID}{$HighAtomID};
    push @PathSymbols, $This->{AssignedAtomTypes}{$CurrentAtomID};

    $PreviousAtomID = $CurrentAtomID;
  }

  return \@PathSymbols;
}
1083
# Set final fingerprints...
#
# Flags the fingerprints as generated and then fills in the bit vector for
# PathLengthBits or the vector for PathLengthCount value of Type (matched
# case-insensitively). Any other Type value only sets the flag. Returns the object.
#
sub _SetFinalFingerprints {
  my($This) = @_;

  # Mark successful generation of fingerprints...
  $This->{FingerprintsGenerated} = 1;

  my $FingerprintsType = $This->{Type};

  if ($FingerprintsType =~ /^PathLengthCount$/i) {
    $This->_SetFinalFingerprintsVector();
  }
  elsif ($FingerprintsType =~ /^PathLengthBits$/i) {
    $This->_SetFinalFingerprintsBitVector();
  }

  return $This;
}
1101
# Set final fingerprints bit vector...
#
# For every tracked atom path string, the hash code of the string seeds the
# random number generator and NumOfBitsToSetPerPath random bit positions,
# drawn using Size, are set in FingerprintsBitVector. Perl's core srand/rand
# are used when UsePerlCoreRandom is set; MathUtil::srandom/random otherwise.
# Returns the object.
#
sub _SetFinalFingerprintsBitVector {
  my($This) = @_;

  my $BitVector = $This->{FingerprintsBitVector};
  my $BitVectorSize = $This->{Size};
  my $BitsPerPath = $This->{NumOfBitsToSetPerPath};

  # SetBit is always invoked with the flag to skip bit position check...
  my $SkipPosCheck = 1;

  for my $LengthKey (keys %{$This->{AtomPathsStrings}}) {
    for my $PathString (keys %{$This->{AtomPathsStrings}{$LengthKey}}) {
      my $Seed = TextUtil::HashCode($PathString);

      # Seed the generator so that a path string always maps to the same bits...
      if ($This->{UsePerlCoreRandom}) {
        CORE::srand($Seed);
      }
      else {
        MathUtil::srandom($Seed);
      }

      for my $BitNum (1 .. $BitsPerPath) {
        my $BitPos;
        if ($This->{UsePerlCoreRandom}) {
          $BitPos = int(CORE::rand($BitVectorSize));
        }
        else {
          $BitPos = int(MathUtil::random($BitVectorSize));
        }
        $BitVector->SetBit($BitPos, $SkipPosCheck);
      }
    }
  }

  return $This;
}
1135
# Set final fingerprints vector...
#
# Walks the tracked atom path strings by numerically ascending path length and,
# within a path length, in default string sort order. The path strings are
# added to FingerprintsVector as value IDs and their counts as values.
# Returns the object.
#
sub _SetFinalFingerprintsVector {
  my($This) = @_;

  my @PathStringIDs = ();
  my @PathCounts = ();

  for my $LengthKey (sort { $a <=> $b } keys %{$This->{AtomPathsStrings}}) {
    my $CountsRef = $This->{AtomPathsStrings}{$LengthKey};
    my @OrderedPathStrings = sort keys %{$CountsRef};

    push @PathStringIDs, @OrderedPathStrings;
    push @PathCounts, @{$CountsRef}{@OrderedPathStrings};
  }

  # Add path string IDs and counts to fingerprint vector...
  $This->{FingerprintsVector}->AddValueIDs(\@PathStringIDs);
  $This->{FingerprintsVector}->AddValues(\@PathCounts);

  return $This;
}
1160
# Cache appropriate molecule data...
#
# Stores the list returned by GetAtoms of the object's molecule in Atoms and
# returns the object.
#
sub _SetupMoleculeDataCache {
  my($This) = @_;

  # Get all atoms...
  my $Molecule = $This->GetMolecule();
  my @MoleculeAtoms = $Molecule->GetAtoms();

  @{$This->{Atoms}} = @MoleculeAtoms;

  return $This;
}
1171
# Clear cached molecule data...
#
# Resets AtomPathsRef to an empty string, empties the Atoms list in place and
# returns the object.
#
sub _ClearMoleculeDataCache {
  my($This) = @_;

  # Clear path atoms...
  $This->{AtomPathsRef} = '';

  # Clear atoms by truncating the existing list...
  $#{$This->{Atoms}} = -1;

  return $This;
}
1185
# Set atomic invariants to use for atom identifiers...
#
# Accepts either a reference to an array of atomic invariants or the atomic
# invariants themselves as a list. Warns and returns nothing when no values
# are specified; croaks when a specified atomic invariant is not available.
# Otherwise replaces the contents of AtomicInvariantsToUse and returns the object.
#
sub SetAtomicInvariantsToUse {
  my($This, @Values) = @_;

  unless (@Values) {
    carp "Warning: ${ClassName}->SetAtomicInvariantsToUse: No values specified...";
    return;
  }

  # A leading array reference carries the complete specification...
  my @RequestedAtomicInvariants = (ref($Values[0]) =~ /^ARRAY/) ? @{$Values[0]} : @Values;

  # Make sure specified AtomicInvariants are valid...
  my @ValidatedAtomicInvariants = ();
  for my $SpecifiedAtomicInvariant (@RequestedAtomicInvariants) {
    unless (AtomTypes::AtomicInvariantsAtomTypes::IsAtomicInvariantAvailable($SpecifiedAtomicInvariant)) {
      croak "Error: ${ClassName}->SetAtomicInvariantsToUse: Specified atomic invariant, $SpecifiedAtomicInvariant, is not supported...\n ";
    }
    push @ValidatedAtomicInvariants, $SpecifiedAtomicInvariant;
  }

  # Set atomic invariants to use...
  @{$This->{AtomicInvariantsToUse}} = @ValidatedAtomicInvariants;

  return $This;
}
1225
# Set functional classes to use for atom identifiers...
#
# Accepts either a reference to an array of functional classes or the
# functional classes themselves as a list. Warns and returns nothing when no
# values are specified or when AtomIdentifierType is not FunctionalClassAtomTypes;
# croaks when a specified functional class is not available. Otherwise replaces
# the contents of FunctionalClassesToUse and returns the object.
#
sub SetFunctionalClassesToUse {
  my($This, @Values) = @_;

  unless (@Values) {
    carp "Warning: ${ClassName}->SetFunctionalClassesToUse: No values specified...";
    return;
  }

  unless ($This->{AtomIdentifierType} =~ /^FunctionalClassAtomTypes$/i) {
    carp "Warning: ${ClassName}->SetFunctionalClassesToUse: FunctionalClassesToUse can't be set for InitialAtomIdentifierType of $This->{AtomIdentifierType}...";
    return;
  }

  # A leading array reference carries the complete specification...
  my @RequestedFunctionalClasses = (ref($Values[0]) =~ /^ARRAY/) ? @{$Values[0]} : @Values;

  # Make sure specified FunctionalClasses are valid...
  my @ValidatedFunctionalClasses = ();
  for my $SpecifiedFunctionalClass (@RequestedFunctionalClasses) {
    unless (AtomTypes::FunctionalClassAtomTypes::IsFunctionalClassAvailable($SpecifiedFunctionalClass)) {
      croak "Error: ${ClassName}->SetFunctionalClassesToUse: Specified functional class, $SpecifiedFunctionalClass, is not supported...\n ";
    }
    push @ValidatedFunctionalClasses, $SpecifiedFunctionalClass;
  }

  # Set functional classes to use...
  @{$This->{FunctionalClassesToUse}} = @ValidatedFunctionalClasses;

  return $This;
}
1269
# Initialize atom identifier type information...
#
# Current supported values:
#
# AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, FunctionalClassAtomTypes,
# MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes
#
# Only AtomicInvariantsAtomTypes and FunctionalClassAtomTypes trigger further
# initialization; any unsupported value causes a croak. Returns the object.
#
sub _InitializeAtomIdentifierTypeInformation {
  my($This) = @_;

  if ($This->{AtomIdentifierType} =~ /^AtomicInvariantsAtomTypes$/i) {
    $This->_InitializeAtomicInvariantsAtomTypesInformation();
    return $This;
  }

  if ($This->{AtomIdentifierType} =~ /^FunctionalClassAtomTypes$/i) {
    $This->_InitializeFunctionalClassAtomTypesInformation();
    return $This;
  }

  if ($This->{AtomIdentifierType} =~ /^(DREIDINGAtomTypes|EStateAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i) {
    # Nothing to do for now...
    return $This;
  }

  croak "Error: ${ClassName}->_InitializeAtomIdentifierTypeInformation: Unknown atom indentifier type $This->{AtomIdentifierType}...";

  return $This;
}
1295
# Initialize atomic invariants atom types to use for generating atom identifiers...
#
# Abbreviations:
#
#   AS        = Atom symbol corresponding to element symbol
#
#   X<n>      = Number of non-hydrogen atom neighbors or heavy atoms attached to atom
#   BO<n>     = Sum of bond orders to non-hydrogen atom neighbors or heavy atoms attached to atom
#   LBO<n>    = Largest bond order of non-hydrogen atom neighbors or heavy atoms attached to atom
#   SB<n>     = Number of single bonds to non-hydrogen atom neighbors or heavy atoms attached to atom
#   DB<n>     = Number of double bonds to non-hydrogen atom neighbors or heavy atoms attached to atom
#   TB<n>     = Number of triple bonds to non-hydrogen atom neighbors or heavy atoms attached to atom
#   H<n>      = Number of implicit and explicit hydrogens for atom
#   Ar        = Aromatic annotation indicating whether atom is aromatic
#   RA        = Ring atom annotation indicating whether atom is in a ring
#   FC<+n/-n> = Formal charge assigned to atom
#   MN<n>     = Mass number indicating isotope other than most abundant isotope
#   SM<n>     = Spin multiplicity of atom. Possible values: 1 (singlet), 2 (doublet) or 3 (triplet)
#
# Atom type generated by AtomTypes::AtomicInvariantsAtomTypes class corresponds to:
#
#   AS.X<n>.BO<n>.LBO<n>.<SB><n>.<DB><n>.<TB><n>.H<n>.Ar.RA.FC<+n/-n>.MN<n>.SM<n>
#
# AS is the only required atomic invariant in atom types; all other atomic
# invariants are optional. Default atomic invariants used for AtomID are:
# AS, X<n>, BO<n>, H<n>, FC<+n/-n>. AtomID specification doesn't include atomic
# invariants with zero or undefined values.
#
sub _InitializeAtomicInvariantsAtomTypesInformation {
  my($This) = @_;

  # Default atomic invariants to use for generating atom IDs: AS, X, BO, H, FC
  #
  @{$This->{AtomicInvariantsToUse}} = qw(AS X BO H FC);

  return $This;
}
1334
# Initialize functional class atom types, generated by AtomTypes::FunctionalClassAtomTypes
# class, to use for generating atom identifiers...
#
# Abbreviations:
#
#   HBD: HydrogenBondDonor
#   HBA: HydrogenBondAcceptor
#   PI : PositivelyIonizable
#   NI : NegativelyIonizable
#   Ar : Aromatic
#   Hal : Halogen
#   H : Hydrophobic
#   RA : RingAtom
#   CA : ChainAtom
#
# Functional class atom type specification for an atom corresponds to:
#
#   Ar.CA.H.HBA.HBD.Hal.NI.PI.RA
#
# Default functional classes used are: HBD, HBA, PI, NI, Ar, Hal
#
# FunctionalAtomTypes are assigned using the following definitions [ Ref 60-61, Ref 65-66 ]:
#
#   HydrogenBondDonor: NH, NH2, OH
#   HydrogenBondAcceptor: N[!H], O
#   PositivelyIonizable: +, NH2
#   NegativelyIonizable: -, C(=O)OH, S(=O)OH, P(=O)OH
#
sub _InitializeFunctionalClassAtomTypesInformation {
  my($This) = @_;

  # Default functional class atom types to use for generating atom identifiers
  # are: HBD, HBA, PI, NI, Ar, Hal
  #
  @{$This->{FunctionalClassesToUse}} = qw(HBD HBA PI NI Ar Hal);

  return $This;
}
1375
# Return a string containing data for PathLengthFingerprints object...
#
# The string is a '; ' separated list of 'Name: Value' entries covering the
# fingerprint type, atom identifier type, path length limits and the path
# generation controls (UseUniquePaths, AllowSharedBonds, AllowRings and
# UseBondSymbols, each reported as Yes or No). Atomic invariants or functional
# classes information is appended for the corresponding atom identifier types.
# For PathLengthBits the size information, bit counts and the bit vector are
# appended; for PathLengthCount the fingerprints vector is appended.
#
sub StringifyPathLengthFingerprints {
  my($This) = @_;
  my($PathLengthsFingerprintsString);

  # Type of fingerprint...
  $PathLengthsFingerprintsString = "Fingerprint type: $This->{Type}; AtomIdentifierType: $This->{AtomIdentifierType}";

  # Path length...
  $PathLengthsFingerprintsString .= "; MinPathLength: $This->{MinLength}; MaxPathLength: $This->{MaxLength}";

  # Fingerprint generation control...
  my($AllowSharedBonds, $AllowRings, $UseBondSymbols, $UseUniquePaths);

  $AllowSharedBonds = $This->{AllowSharedBonds} ? "Yes" : "No";
  $AllowRings = $This->{AllowRings} ? "Yes" : "No";
  $UseBondSymbols = $This->{UseBondSymbols} ? "Yes" : "No";

  # Report the UseUniquePaths setting itself and not the UseBondSymbols setting...
  $UseUniquePaths = $This->{UseUniquePaths} ? "Yes" : "No";

  $PathLengthsFingerprintsString .= "; UseUniquePaths: $UseUniquePaths; AllowSharedBonds: $AllowSharedBonds; AllowRings: $AllowRings; UseBondSymbols: $UseBondSymbols";

  if ($This->{AtomIdentifierType} =~ /^AtomicInvariantsAtomTypes$/i) {
    my($AtomicInvariant, @AtomicInvariants, @AtomicInvariantsOrder, %AvailableAtomicInvariants);

    @AtomicInvariantsOrder = AtomTypes::AtomicInvariantsAtomTypes::GetAtomicInvariantsOrder();
    %AvailableAtomicInvariants = AtomTypes::AtomicInvariantsAtomTypes::GetAvailableAtomicInvariants();

    # Pair each atomic invariant with its entry in the available atomic invariants hash...
    for $AtomicInvariant (@AtomicInvariantsOrder) {
      push @AtomicInvariants, "$AtomicInvariant: $AvailableAtomicInvariants{$AtomicInvariant}";
    }

    $PathLengthsFingerprintsString .= "; AtomicInvariantsToUse: <" . TextUtil::JoinWords(\@{$This->{AtomicInvariantsToUse}}, ", ", 0) . ">";
    $PathLengthsFingerprintsString .= "; AtomicInvariantsOrder: <" . TextUtil::JoinWords(\@AtomicInvariantsOrder, ", ", 0) . ">";
    $PathLengthsFingerprintsString .= "; AvailableAtomicInvariants: <" . TextUtil::JoinWords(\@AtomicInvariants, ", ", 0) . ">";
  }
  elsif ($This->{AtomIdentifierType} =~ /^FunctionalClassAtomTypes$/i) {
    my($FunctionalClass, @FunctionalClasses, @FunctionalClassesOrder, %AvailableFunctionalClasses);

    @FunctionalClassesOrder = AtomTypes::FunctionalClassAtomTypes::GetFunctionalClassesOrder();
    %AvailableFunctionalClasses = AtomTypes::FunctionalClassAtomTypes::GetAvailableFunctionalClasses();

    # Pair each functional class with its entry in the available functional classes hash...
    for $FunctionalClass (@FunctionalClassesOrder) {
      push @FunctionalClasses, "$FunctionalClass: $AvailableFunctionalClasses{$FunctionalClass}";
    }

    $PathLengthsFingerprintsString .= "; FunctionalClassesToUse: <" . TextUtil::JoinWords(\@{$This->{FunctionalClassesToUse}}, ", ", 0) . ">";
    $PathLengthsFingerprintsString .= "; FunctionalClassesOrder: <" . TextUtil::JoinWords(\@FunctionalClassesOrder, ", ", 0) . ">";
    $PathLengthsFingerprintsString .= "; AvailableFunctionalClasses: <" . TextUtil::JoinWords(\@FunctionalClasses, ", ", 0) . ">";
  }

  if ($This->{Type} =~ /^PathLengthBits$/i) {
    # Size...
    $PathLengthsFingerprintsString .= "; Size: $This->{Size}; MinSize: $This->{MinSize}; MaxSize: $This->{MaxSize}";

    # NumOfBitsToSetPerPath...
    $PathLengthsFingerprintsString .= "; NumOfBitsToSetPerPath: $This->{NumOfBitsToSetPerPath}";

    # Fingerprint bit density and num of bits set...
    my($NumOfSetBits, $BitDensity);
    $NumOfSetBits = $This->{FingerprintsBitVector}->GetNumOfSetBits();
    $BitDensity = $This->{FingerprintsBitVector}->GetFingerprintsBitDensity();
    $PathLengthsFingerprintsString .= "; NumOfOnBits: $NumOfSetBits; BitDensity: $BitDensity";

    $PathLengthsFingerprintsString .= "; FingerprintsBitVector: < $This->{FingerprintsBitVector} >";
  }
  elsif ($This->{Type} =~ /^PathLengthCount$/i) {
    $PathLengthsFingerprintsString .= "; FingerprintsVector: < $This->{FingerprintsVector} >";
  }

  return $PathLengthsFingerprintsString;
}
1448
1449 1;
1450
1451 __END__
1452
1453 =head1 NAME
1454
1455 PathLengthFingerprints
1456
1457 =head1 SYNOPSIS
1458
1459 use Fingerprints::PathLengthFingerprints;
1460
1461 use Fingerprints::PathLengthFingerprints qw(:all);
1462
1463 =head1 DESCRIPTION
1464
1465 B<PathLengthFingerprints> class provides the following methods:
1466
1467 new, GenerateFingerprints, GetDescription, SetAtomIdentifierType,
1468 SetAtomicInvariantsToUse, SetFunctionalClassesToUse, SetMaxLength,
1469 SetMinLength, SetNumOfBitsToSetPerPath, SetType,
1470 StringifyPathLengthFingerprints
1471
1472 B<PathLengthFingerprints> is derived from B<Fingerprints> class which in turn
1473 is derived from B<ObjectProperty> base class that provides methods not explicitly defined
1474 in B<PathLengthFingerprints>, B<Fingerprints> or B<ObjectProperty> classes using Perl's
1475 AUTOLOAD functionality. These methods are generated on-the-fly for a specified object property:
1476
1477 Set<PropertyName>(<PropertyValue>);
1478 $PropertyValue = Get<PropertyName>();
1479 Delete<PropertyName>();
1480
1481 The current release of MayaChemTools supports generation of B<PathLengthFingerprints>
1482 corresponding to the following B<AtomIdentifierTypes>:
1483
1484 AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes,
1485 FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes,
1486 SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes
1487
1488 Based on the values specified for B<Type>, B<AtomIdentifierType>, B<MinPathLength> and
1489 B<MaxPathLength>, all appropriate atom paths are generated for each atom in the molecule
1490 and collected in a list and the list is filtered to remove any structurally duplicate paths as
1491 indicated by the value of B<UseUniquePaths>.
1492
1493 For molecules containing rings, atom paths starting from each atom can be traversed in four
1494 different ways:
1495
1496 o Atom paths without any rings and without sharing of bonds in traversed paths.
1497 o Atom paths containing rings and without any sharing of bonds in
1498 traversed paths.
1499 o All possible atom paths without any rings and with sharing of bonds in
1500 traversed paths.
1501 o All possible atom paths containing rings and with sharing of bonds in
1502 traversed paths.
1503
1504 Atom path traversal is terminated at the last ring atom. For molecules containing no rings,
1505 first two and last two types described above are equivalent.
1506
1507 B<AllowSharedBonds> and B<AllowRings> allow generation of different types of paths
1508 to be used for fingerprints generation.
1509
1510 The combination of B<AllowSharedBonds>, B<AllowRings>, and B<UseBondSymbols> allows generation of
1511 8 different types of path length fingerprints:
1512
1513 AllowSharedBonds AllowRings UseBondSymbols
1514
1515 0 0 1 - AtomPathsNoCyclesWithBondSymbols
1516 0 1 1 - AtomPathsWithCyclesWithBondSymbols
1517
1518 1 0 1 - AllAtomPathsNoCyclesWithBondSymbols
1519 1 1 1 - AllAtomPathsWithCyclesWithBondSymbols
1520 [ DEFAULT ]
1521
1522 0 0 0 - AtomPathsNoCyclesNoBondSymbols
1523 0 1 0 - AtomPathsWithCyclesNoBondSymbols
1524
1525 1 0 0 - AllAtomPathsNoCyclesNoBondSymbols
1526 1 1 0 - AllAtomPathsWithCyclesNoBondSymbols
1527
1528 Additionally, the value specified for B<AtomIdentifierType>, in conjunction with the corresponding
1529 specified values for B<AtomicInvariantsToUse> and B<FunctionalClassesToUse>, changes the nature
1530 of atom path length strings and the fingerprints.
1531
1532 For each atom path in the filtered atom paths list, an atom path string is created using value of
1533 B<AtomIdentifierType> and specified values to use for a particular atom identifier type.
1534 Value of B<UseBondSymbols> controls whether bond order symbols are used during generation
1535 of atom path string. Atom symbol corresponds to element symbol and characters used to represent
1536 bond order are: I<1 - None; 2 - '='; 3 - '#'; 1.5 or aromatic - ':'; others: bond order value>. By default,
1537 bond symbols are included in atom path strings. Exclusion of bond symbols in atom path strings
1538 results in fingerprints which correspond purely to atom paths without considering bonds.
1539
1540 B<UseUniquePaths> controls whether structurally duplicate atom path strings are removed
1541 from the list.
1542
1543 For I<PathLengthBits> value of B<Type>, each atom path is hashed to a 32 bit unsigned
1544 integer key using B<TextUtil::HashCode> function. Using the hash key as a seed for a random number
1545 generator, a random integer value between 0 and B<Size> is used to set corresponding bits
1546 in the fingerprint bit-vector string. Value of B<NumOfBitsToSetPerPath> option controls the number
1547 of times a random number is generated to set corresponding bits.
1548
1549 For I<PathLengthCount> value of B<Type>, the number of times an atom path appears
1550 is tracked and a fingerprints count-string corresponding to count of atom paths is generated.
1551
1552 The current release of MayaChemTools generates the following types of path length
1553 fingerprints bit-vector and vector strings:
1554
1555 FingerprintsBitVector;PathLengthBits:AtomicInvariantsAtomTypes:MinLeng
1556 th1:MaxLength8;1024;BinaryString;Ascending;001000010011010101011000110
1557 0100010101011000101001011100110001000010001001101000001001001001001000
1558 0010110100000111001001000001001010100100100000000011000000101001011100
1559 0010000001000101010100000100111100110111011011011000000010110111001101
1560 0101100011000000010001000011000010100011101100001000001000100000000...
1561
1562 FingerprintsBitVector;PathLengthBits:AtomicInvariantsAtomTypes:MinLeng
1563 th1:MaxLength8;1024;HexadecimalString;Ascending;48caa1315d82d91122b029
1564 42861c9409a4208182d12015509767bd0867653604481a8b1288000056090583603078
1565 9cedae54e26596889ab121309800900490515224208421502120a0dd9200509723ae89
1566 00024181b86c0122821d4e4880c38620dab280824b455404009f082003d52c212b4e6d
1567 6ea05280140069c780290c43
1568
1569 FingerprintsVector;PathLengthCount:AtomicInvariantsAtomTypes:MinLength
1570 1:MaxLength8;432;NumericalValues;IDsAndValuesPairsString;C.X1.BO1.H3 2
1571 C.X2.BO2.H2 4 C.X2.BO3.H1 14 C.X3.BO3.H1 3 C.X3.BO4 10 F.X1.BO1 1 N.X
1572 2.BO2.H1 1 N.X3.BO3 1 O.X1.BO1.H1 3 O.X1.BO2 2 C.X1.BO1.H3C.X3.BO3.H1
1573 2 C.X2.BO2.H2C.X2.BO2.H2 1 C.X2.BO2.H2C.X3.BO3.H1 4 C.X2.BO2.H2C.X3.BO
1574 4 1 C.X2.BO2.H2N.X3.BO3 1 C.X2.BO3.H1:C.X2.BO3.H1 10 C.X2.BO3.H1:C....
1575
1576 FingerprintsVector;PathLengthCount:DREIDINGAtomTypes:MinLength1:MaxLen
1577 gth8;410;NumericalValues;IDsAndValuesPairsString;C_2 2 C_3 9 C_R 22 F_
1578 1 N_3 1 N_R 1 O_2 2 O_3 3 C_2=O_2 2 C_2C_3 1 C_2C_R 1 C_2N_3 1 C_2O_3
1579 1 C_3C_3 7 C_3C_R 1 C_3N_R 1 C_3O_3 2 C_R:C_R 21 C_R:N_R 2 C_RC_R 2 C
1580 _RF_ 1 C_RN_3 1 C_2C_3C_3 1 C_2C_R:C_R 2 C_2N_3C_R 1 C_3C_2=O_2 1 C_3C
1581 _2O_3 1 C_3C_3C_3 5 C_3C_3C_R 2 C_3C_3N_R 1 C_3C_3O_3 4 C_3C_R:C_R ...
1582
1583 FingerprintsVector;PathLengthCount:EStateAtomTypes:MinLength1:MaxLengt
1584 h8;454;NumericalValues;IDsAndValuesPairsString;aaCH 14 aasC 8 aasN 1 d
1585 O 2 dssC 2 sCH3 2 sF 1 sOH 3 ssCH2 4 ssNH 1 sssCH 3 aaCH:aaCH 10 aaCH:
1586 aasC 8 aasC:aasC 3 aasC:aasN 2 aasCaasC 2 aasCdssC 1 aasCsF 1 aasCssNH
1587 1 aasCsssCH 1 aasNssCH2 1 dO=dssC 2 dssCsOH 1 dssCssCH2 1 dssCssNH 1
1588 sCH3sssCH 2 sOHsssCH 2 ssCH2ssCH2 1 ssCH2sssCH 4 aaCH:aaCH:aaCH 6 a...
1589
1590 FingerprintsVector;PathLengthCount:FunctionalClassAtomTypes:MinLength1
1591 :MaxLength8;404;NumericalValues;IDsAndValuesPairsString;Ar 22 Ar.HBA 1
1592 HBA 2 HBA.HBD 3 HBD 1 Hal 1 NI 1 None 10 Ar.HBA:Ar 2 Ar.HBANone 1 Ar:
1593 Ar 21 ArAr 2 ArHBD 1 ArHal 1 ArNone 2 HBA.HBDNI 1 HBA.HBDNone 2 HBA=NI
1594 1 HBA=None 1 HBDNone 1 NINone 1 NoneNone 7 Ar.HBA:Ar:Ar 2 Ar.HBA:ArAr
1595 1 Ar.HBA:ArNone 1 Ar.HBANoneNone 1 Ar:Ar.HBA:Ar 1 Ar:Ar.HBANone 2 ...
1596
1597 FingerprintsVector;PathLengthCount:MMFF94AtomTypes:MinLength1:MaxLengt
1598 h8;463;NumericalValues;IDsAndValuesPairsString;C5A 2 C5B 2 C=ON 1 CB 1
1599 8 COO 1 CR 9 F 1 N5 1 NC=O 1 O=CN 1 O=CO 1 OC=O 1 OR 2 C5A:C5B 2 C5A:N
1600 5 2 C5ACB 1 C5ACR 1 C5B:C5B 1 C5BC=ON 1 C5BCB 1 C=ON=O=CN 1 C=ONNC=O 1
1601 CB:CB 18 CBF 1 CBNC=O 1 COO=O=CO 1 COOCR 1 COOOC=O 1 CRCR 7 CRN5 1 CR
1602 OR 2 C5A:C5B:C5B 2 C5A:C5BC=ON 1 C5A:C5BCB 1 C5A:N5:C5A 1 C5A:N5CR ...
1603
1604 FingerprintsVector;PathLengthCount:SLogPAtomTypes:MinLength1:MaxLength
1605 8;518;NumericalValues;IDsAndValuesPairsString;C1 5 C10 1 C11 1 C14 1 C
1606 18 14 C20 4 C21 2 C22 1 C5 2 CS 2 F 1 N11 1 N4 1 O10 1 O2 3 O9 1 C10C1
1607 1 C10N11 1 C11C1 2 C11C21 1 C14:C18 2 C14F 1 C18:C18 10 C18:C20 4 C18
1608 :C22 2 C1C5 1 C1CS 4 C20:C20 1 C20:C21 1 C20:N11 1 C20C20 2 C21:C21 1
1609 C21:N11 1 C21C5 1 C22N4 1 C5=O10 1 C5=O9 1 C5N4 1 C5O2 1 CSO2 2 C10...
1610
1611 FingerprintsVector;PathLengthCount:SYBYLAtomTypes:MinLength1:MaxLength
1612 8;412;NumericalValues;IDsAndValuesPairsString;C.2 2 C.3 9 C.ar 22 F 1
1613 N.am 1 N.ar 1 O.2 1 O.3 2 O.co2 2 C.2=O.2 1 C.2=O.co2 1 C.2C.3 1 C.2C.
1614 ar 1 C.2N.am 1 C.2O.co2 1 C.3C.3 7 C.3C.ar 1 C.3N.ar 1 C.3O.3 2 C.ar:C
1615 .ar 21 C.ar:N.ar 2 C.arC.ar 2 C.arF 1 C.arN.am 1 C.2C.3C.3 1 C.2C.ar:C
1616 .ar 2 C.2N.amC.ar 1 C.3C.2=O.co2 1 C.3C.2O.co2 1 C.3C.3C.3 5 C.3C.3...
1617
1618 FingerprintsVector;PathLengthCount:TPSAAtomTypes:MinLength1:MaxLength8
1619 ;331;NumericalValues;IDsAndValuesPairsString;N21 1 N7 1 None 34 O3 2 O
1620 4 3 N21:None 2 N21None 1 N7None 2 None:None 21 None=O3 2 NoneNone 13 N
1621 oneO4 3 N21:None:None 2 N21:NoneNone 2 N21NoneNone 1 N7None:None 2 N7N
1622 one=O3 1 N7NoneNone 1 None:N21:None 1 None:N21None 2 None:None:None 20
1623 None:NoneNone 12 NoneN7None 1 NoneNone=O3 2 NoneNoneNone 8 NoneNon...
1624
1625 FingerprintsVector;PathLengthCount:UFFAtomTypes:MinLength1:MaxLength8;
1626 410;NumericalValues;IDsAndValuesPairsString;C_2 2 C_3 9 C_R 22 F_ 1 N_
1627 3 1 N_R 1 O_2 2 O_3 3 C_2=O_2 2 C_2C_3 1 C_2C_R 1 C_2N_3 1 C_2O_3 1 C_
1628 3C_3 7 C_3C_R 1 C_3N_R 1 C_3O_3 2 C_R:C_R 21 C_R:N_R 2 C_RC_R 2 C_RF_
1629 1 C_RN_3 1 C_2C_3C_3 1 C_2C_R:C_R 2 C_2N_3C_R 1 C_3C_2=O_2 1 C_3C_2O_3
1630 1 C_3C_3C_3 5 C_3C_3C_R 2 C_3C_3N_R 1 C_3C_3O_3 4 C_3C_R:C_R 1 C_3...
1631
1632 =head2 METHODS
1633
1634 =over 4
1635
1636 =item B<new>
1637
1638 $NewPathLengthFingerprints = new PathLengthFingerprints(
1639 %NamesAndValues);
1640
1641 Using specified I<PathLengthFingerprints> property names and values hash, B<new> method creates a new object
1642 and returns a reference to newly created B<PathLengthFingerprints> object. By default, the following properties are
1643 initialized:
1644
1645 Molecule = '';
1646 Type = ''
1647 Size = 1024
1648 MinSize = 32
1649 MaxSize = 2**32
1650 NumOfBitsToSetPerPath = 1
1651 MinLength = 1
1652 MaxLength = 8
1653 AllowSharedBonds = 1
1654 AllowRings = 1
1655 UseBondSymbols = 1
1656 UseUniquePaths = ''
1657 AtomIdentifierType = ''
1658 AtomicInvariantsToUse = ['AS', 'X', 'BO', 'H', 'FC']
1659 FunctionalClassesToUse = ['HBD', 'HBA', 'PI', 'NI', 'Ar', 'Hal']
1660
1661 Examples:
1662
1663 $PathLengthFingerprints = new PathLengthFingerprints(
1664 'Molecule' => $Molecule,
1665 'Type' => 'PathLengthBits',
1666 'AtomIdentifierType' =>
1667 'AtomicInvariantsAtomTypes');
1668
1669 $PathLengthFingerprints = new PathLengthFingerprints(
1670 'Molecule' => $Molecule,
1671 'Type' => 'PathLengthBits',
1672 'Size' => 1024,
1673 'MinLength' => 1,
1674 'MaxLength' => 8,
1675 'AllowRings' => 1,
1676 'AllowSharedBonds' => 1,
1677 'UseBondSymbols' => 1,
1678 'UseUniquePaths' => 1,
1679 'AtomIdentifierType' =>
1680 'AtomicInvariantsAtomTypes',
1681 'AtomicInvariantsToUse' => ['AS']);
1682
1683 $PathLengthFingerprints = new PathLengthFingerprints(
1684 'Molecule' => $Molecule,
1685 'Type' => 'PathLengthCount',
1686 'MinLength' => 1,
1687 'MaxLength' => 8,
1688 'AllowRings' => 1,
1689 'AllowSharedBonds' => 1,
1690 'UseBondSymbols' => 1,
1691 'UseUniquePaths' => 1,
1692 'AtomIdentifierType' =>
1693 'AtomicInvariantsAtomTypes',
1694 'AtomicInvariantsToUse' => ['AS']);
1695
1696 $PathLengthFingerprints = new PathLengthFingerprints(
1697 'Molecule' => $Molecule,
1698 'Type' => 'PathLengthBits',
1699 'AtomIdentifierType' =>
1700 'SLogPAtomTypes');
1701
1702 $PathLengthFingerprints = new PathLengthFingerprints(
1703 'Molecule' => $Molecule,
1704 'Type' => 'PathLengthCount',
1705 'AtomIdentifierType' =>
1706 'SYBYLAtomTypes');
1707
1708 $PathLengthFingerprints = new PathLengthFingerprints(
1709 'Molecule' => $Molecule,
1710 'Type' => 'PathLengthBits',
1711 'AtomIdentifierType' =>
1712 'FunctionalClassAtomTypes',
1713 'FunctionalClassesToUse' => ['HBD', 'HBA', 'Ar']);
1714
1715 $PathLengthFingerprints->GenerateFingerprints();
1716 print "$PathLengthFingerprints\n";
1717
1718 =item B<GetDescription>
1719
1720 $Description = $PathLengthFingerprints->GetDescription();
1721
1722 Returns a string containing description of path length fingerprints.
1723
1724 =item B<GenerateFingerprints>
1725
1726 $PathLengthFingerprints->GenerateFingerprints();
1727
1728 Generates path length fingerprints and returns I<PathLengthFingerprints>.
1729
1730 =item B<SetMaxLength>
1731
1732 $PathLengthFingerprints->SetMaxLength($Length);
1733
1734 Sets maximum value of atom path length to be used during atom path length fingerprints
1735 generation and returns I<PathLengthFingerprints>.
1736
1737 =item B<SetAtomIdentifierType>
1738
1739 $PathLengthFingerprints->SetAtomIdentifierType($IdentifierType);
1740
1741 Sets atom I<IdentifierType> to use during path length fingerprints generation and
1742 returns I<PathLengthFingerprints>.
1743
1744 Possible values: I<AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes,
1745 FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes,
1746 TPSAAtomTypes, UFFAtomTypes>.
1747
1748 =item B<SetAtomicInvariantsToUse>
1749
1750 $PathLengthFingerprints->SetAtomicInvariantsToUse($ValuesRef);
1751 $PathLengthFingerprints->SetAtomicInvariantsToUse(@Values);
1752
1753 Sets atomic invariants to use during I<AtomicInvariantsAtomTypes> value of I<AtomIdentifierType>
1754 for path length fingerprints generation and returns I<PathLengthFingerprints>.
1755
1756 Possible values for atomic invariants are: I<AS, X, BO, LBO, SB, DB, TB,
1757 H, Ar, RA, FC, MN, SM>. Default value: I<AS>.
1758
1759 The atomic invariants abbreviations correspond to:
1760
1761 AS = Atom symbol corresponding to element symbol
1762
1763 X<n> = Number of non-hydrogen atom neighbors or heavy atoms
1764 BO<n> = Sum of bond orders to non-hydrogen atom neighbors or heavy atoms
1765 LBO<n> = Largest bond order of non-hydrogen atom neighbors or heavy atoms
1766 SB<n> = Number of single bonds to non-hydrogen atom neighbors or heavy atoms
1767 DB<n> = Number of double bonds to non-hydrogen atom neighbors or heavy atoms
1768 TB<n> = Number of triple bonds to non-hydrogen atom neighbors or heavy atoms
1769 H<n> = Number of implicit and explicit hydrogens for atom
1770 Ar = Aromatic annotation indicating whether atom is aromatic
1771 RA = Ring atom annotation indicating whether atom is in a ring
1772 FC<+n/-n> = Formal charge assigned to atom
1773 MN<n> = Mass number indicating isotope other than most abundant isotope
1774 SM<n> = Spin multiplicity of atom. Possible values: 1 (singlet), 2 (doublet) or
1775 3 (triplet)
1776
1777 Atom type generated by AtomTypes::AtomicInvariantsAtomTypes class corresponds to:
1778
1779 AS.X<n>.BO<n>.LBO<n>.SB<n>.DB<n>.TB<n>.H<n>.Ar.RA.FC<+n/-n>.MN<n>.SM<n>
1780
1781 Except for AS which is a required atomic invariant in atom types, all other atomic invariants are
1782 optional. Atom type specification doesn't include atomic invariants with zero or undefined values.
1783
1784 In addition to usage of abbreviations for specifying atomic invariants, the following descriptive words
1785 are also allowed:
1786
1787 X : NumOfNonHydrogenAtomNeighbors or NumOfHeavyAtomNeighbors
1788 BO : SumOfBondOrdersToNonHydrogenAtoms or SumOfBondOrdersToHeavyAtoms
1789 LBO : LargestBondOrderToNonHydrogenAtoms or LargestBondOrderToHeavyAtoms
1790 SB : NumOfSingleBondsToNonHydrogenAtoms or NumOfSingleBondsToHeavyAtoms
1791 DB : NumOfDoubleBondsToNonHydrogenAtoms or NumOfDoubleBondsToHeavyAtoms
1792 TB : NumOfTripleBondsToNonHydrogenAtoms or NumOfTripleBondsToHeavyAtoms
1793 H : NumOfImplicitAndExplicitHydrogens
1794 Ar : Aromatic
1795 RA : RingAtom
1796 FC : FormalCharge
1797 MN : MassNumber
1798 SM : SpinMultiplicity
1799
1800 I<AtomTypes::AtomicInvariantsAtomTypes> module is used to assign atomic invariant
1801 atom types.
1802
1803 =item B<SetFunctionalClassesToUse>
1804
1805 $PathLengthFingerprints->SetFunctionalClassesToUse($ValuesRef);
1806 $PathLengthFingerprints->SetFunctionalClassesToUse(@Values);
1807
1808 Sets functional classes to use during I<FunctionalClassAtomTypes> value of I<AtomIdentifierType>
1809 for path length fingerprints generation and returns I<PathLengthFingerprints>.
1810
1811 Possible values for atom functional classes are: I<Ar, CA, H, HBA, HBD, Hal, NI, PI, RA>.
1812 Default value [ Ref 24 ]: I<HBD,HBA,PI,NI,Ar,Hal>.
1813
1814 The functional class abbreviations correspond to:
1815
1816 HBD: HydrogenBondDonor
1817 HBA: HydrogenBondAcceptor
1818 PI : PositivelyIonizable
1819 NI : NegativelyIonizable
1820 Ar : Aromatic
1821 Hal : Halogen
1822 H : Hydrophobic
1823 RA : RingAtom
1824 CA : ChainAtom
1825
1826 Functional class atom type specification for an atom corresponds to:
1827
1828 Ar.CA.H.HBA.HBD.Hal.NI.PI.RA or None
1829
1830 I<AtomTypes::FunctionalClassAtomTypes> module is used to assign functional class atom
1831 types. It uses the following definitions [ Ref 60-61, Ref 65-66 ]:
1832
1833 HydrogenBondDonor: NH, NH2, OH
1834 HydrogenBondAcceptor: N[!H], O
1835 PositivelyIonizable: +, NH2
1836 NegativelyIonizable: -, C(=O)OH, S(=O)OH, P(=O)OH
1837
1838 =item B<SetMinLength>
1839
1840 $PathLengthFingerprints->SetMinLength($Length);
1841
1842 Sets minimum value of atom path length to be used during atom path length fingerprints
1843 generation and returns I<PathLengthFingerprints>.
1844
1845 =item B<SetMaxLength>
1846
1847 $PathLengthFingerprints->SetMaxLength($Length);
1848
1849 Sets maximum value of atom path length to be used during atom path length fingerprints
1850 generation and returns I<PathLengthFingerprints>.
1851
1852 =item B<SetNumOfBitsToSetPerPath>
1853
1854 $PathLengthFingerprints->SetNumOfBitsToSetPerPath($NumOfBits);
1855
1856 Sets number of bits to set for each path for I<PathLengthBits> B<Type> during path length fingerprints
1857 generation and returns I<PathLengthFingerprints>.
1858
1859 =item B<SetType>
1860
1861 $PathLengthFingerprints->SetType($Type);
1862
1863 Sets type of path length fingerprints and returns I<PathLengthFingerprints>. Possible values:
1864 I<PathLengthBits or PathLengthCount>.
1865
1866 =item B<StringifyPathLengthFingerprints>
1867
1868 $String = $PathLengthFingerprints->StringifyPathLengthFingerprints();
1869
1870 Returns a string containing information about I<PathLengthFingerprints> object.
1871
1872 =back
1873
1874 =head1 AUTHOR
1875
1876 Manish Sud <msud@san.rr.com>
1877
1878 =head1 SEE ALSO
1879
1880 Fingerprints.pm, FingerprintsStringUtil.pm, AtomNeighborhoodsFingerprints.pm,
1881 AtomTypesFingerprints.pm, EStateIndiciesFingerprints.pm, ExtendedConnectivityFingerprints.pm,
1882 MACCSKeys.pm, TopologicalAtomPairsFingerprints.pm, TopologicalAtomTripletsFingerprints.pm,
1883 TopologicalAtomTorsionsFingerprints.pm, TopologicalPharmacophoreAtomPairsFingerprints.pm,
1884 TopologicalPharmacophoreAtomTripletsFingerprints.pm
1885
1886 =head1 COPYRIGHT
1887
1888 Copyright (C) 2015 Manish Sud. All rights reserved.
1889
1890 This file is part of MayaChemTools.
1891
1892 MayaChemTools is free software; you can redistribute it and/or modify it under
1893 the terms of the GNU Lesser General Public License as published by the Free
1894 Software Foundation; either version 3 of the License, or (at your option)
1895 any later version.
1896
1897 =cut