MayaChemTools

   1 package Fingerprints::PathLengthFingerprints;
   2 #
   3 # $RCSfile: PathLengthFingerprints.pm,v $
   4 # $Date: 2015/02/28 20:48:54 $
   5 # $Revision: 1.39 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2015 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use strict;
  30 use Carp;
  31 use Exporter;
  32 use TextUtil ();
  33 use MathUtil ();
  34 use Fingerprints::Fingerprints;
  35 use Molecule;
  36 use AtomTypes::AtomicInvariantsAtomTypes;
  37 use AtomTypes::DREIDINGAtomTypes;
  38 use AtomTypes::EStateAtomTypes;
  39 use AtomTypes::FunctionalClassAtomTypes;
  40 use AtomTypes::MMFF94AtomTypes;
  41 use AtomTypes::SLogPAtomTypes;
  42 use AtomTypes::SYBYLAtomTypes;
  43 use AtomTypes::TPSAAtomTypes;
  44 use AtomTypes::UFFAtomTypes;
  45 
  46 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  47 
  48 @ISA = qw(Fingerprints::Fingerprints Exporter);
  49 @EXPORT = qw();
  50 @EXPORT_OK = qw();
  51 
  52 %EXPORT_TAGS = (all  => [@EXPORT, @EXPORT_OK]);
  53 
# Setup class variables: $ClassName is filled in by _InitializeClass and is
# interpolated into the error and warning messages raised by this class...
my($ClassName);
_InitializeClass();

# Overload Perl functions: string conversion of an object is delegated to
# the method named StringifyPathLengthFingerprints...
use overload '""' => 'StringifyPathLengthFingerprints';
  60 
  61 # Class constructor...
  62 sub new {
  63   my($Class, %NamesAndValues) = @_;
  64 
  65   # Initialize object...
  66   my $This = $Class->SUPER::new();
  67   bless $This, ref($Class) || $Class;
  68   $This->_InitializePathLengthFingerprints();
  69 
  70   $This->_InitializePathLengthFingerprintsProperties(%NamesAndValues);
  71 
  72   return $This;
  73 }
  74 
  75 # Initialize object data...
  76 #
  77 sub _InitializePathLengthFingerprints {
  78   my($This) = @_;
  79 
  80   # Type of fingerprint to generate...
  81   #
  82   # PathLengthBits - A bit vector indicating presence/absence of atom paths
  83   # PathLengthCount - A vector containing count of atom paths
  84   #
  85   $This->{Type} = '';
  86 
  87   # Type of vector: FingerprintsBitVector or FingerprintsVector
  88   $This->{VectorType} = '';
  89 
  90   # Set default mininum, maximum, and default size. Although any arbitrary size can
  91   # be specified, bit vector used to store bits work on a vector size which is
  92   # power of 2 and additonal bits are automatically added and cleared.
  93   #
  94   $This->{Size} = 1024;
  95 
  96   $This->{MinSize} = 32;
  97   $This->{MaxSize} = 2**32;
  98 
  99   # Minimum and maximum path lengths to use for fingerprints generation...
 100   $This->{MinLength} = 1;
 101   $This->{MaxLength} = 8;
 102 
 103   # Numner of bits to set for each atom path for FingerprintsBitVector...
 104   $This->{NumOfBitsToSetPerPath} = 1;
 105 
 106   # Atom identifier type to use for path atoms during fingerprints generation...
 107   #
 108   # Currently supported values are: AtomicInvariantsAtomTypes, DREIDINGAtomTypes,
 109   # EStateAtomTypes, FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes,
 110   # SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes
 111   #
 112   $This->{AtomIdentifierType} = '';
 113 
 114   # Atom types assigned to atoms...
 115   %{$This->{AssignedAtomTypes}} = ();
 116 
 117   # For molecules containing rings, atom paths starting from each atom can be traversed in four
 118   # different ways:
 119   #
 120   # . Atom paths without any rings and sharing of bonds in traversed paths.
 121   # . Atom paths containing rings and without any sharing of bonds in traversed paths
 122   # . All possible atom paths without any rings and sharing of bonds in traversed paths
 123   # . All possible atom paths containing rings and with sharing of bonds in traversed paths.
 124   #
 125   # Atom path traversal is terminated at the last ring atom. For molecules containing no rings,
 126   # first two and last two types described above are equivalent.
 127   #
 128   # AllowSharedBonds and AllowRings variables allow generation of differen types of paths
 129   # to be used for fingerprints generation.
 130   #
 131   # In addition to atom symbols, bond symbols are also used to generate a string
 132   # for atom paths. These atom paths strings are hased to a 32 bit integer key which
 133   # in turn is used as a seed for a random number generation in range of 1 to fingerprint
 134   # size for setting corresponding bit in bit vector.
 135   #
 136   # UseBondSymbols variable allow generation of atom path strings and consequently fingerprints.
 137   #
 138   # Combination of AllowSharedBonds, AllowRings, and UseBondSymbols allow generation of
 139   # 8 different types of path length fingerprints:
 140   #
 141   # AllowSharedBonds    AllowRings    UseBondSymbols    PathLengthFingerprintsType
 142   #
 143   # No                  No            Yes                AtomPathsNoCyclesWithBondSymbols
 144   # No                  Yes           Yes                AtomPathsWithCyclesWithBondSymbols
 145   #
 146   # Yes                 No            Yes                AllAtomPathsNoCyclesWithBondSymbols
 147   # Yes                 Yes           Yes                AllAtomPathsWithCyclesWithBondSymbols [ DEFAULT ]
 148   #
 149   # No                  No            No                 AtomPathsNoCyclesNoBondSymbols
 150   # No                  Yes           No                 AtomPathsWithCyclesNoBondSymbols
 151   #
 152   # Yes                 No            No                 AllAtomPathsNoCyclesNoBondSymbols
 153   # Yes                 Yes           No                 AllAtomPathsWithCyclesNoWithBondSymbols
 154   #
 155   #
 156 
 157   # By default, atom paths starting from atoms are allowed to share bonds already traversed...
 158   $This->{AllowSharedBonds} = 1;
 159 
 160   # By default rings are included in paths...
 161   $This->{AllowRings} = 1;
 162 
 163   # By default bond symbols are included in atom path strings...
 164   $This->{UseBondSymbols} = 1;
 165 
 166   # By default only structurally unique atom paths are used for generation
 167   # atom path strings...
 168   $This->{UseUniquePaths} = 1;
 169 
 170   # Random number generator to use during generation of fingerprints bit-vector
 171   # string: Perl CORE::rand or MayaChemTools MathUtil::random function.
 172   #
 173   # The random number generator implemented in MayaChemTools is a variant of
 174   # linear congruential generator (LCG) as described by Miller et al. [ Ref 120 ].
 175   # It is also referred to as Lehmer random number generator or Park-Miller
 176   # random number generator.
 177   #
 178   # Unlike Perl's core random number generator function rand, the random number
 179   # generator implemented in MayaChemTools, MathUtil::random,  generates consistent
 180   # random values across different platformsfor a specific random seed and leads
 181   # to generation of portable fingerprints bit-vector strings.
 182   #
 183   $This->{UsePerlCoreRandom} = 1;
 184 
 185   # Bond symbols to use during generation of atom path strings...
 186   %{$This->{BondOrderToSymbol}} = ();
 187   %{$This->{BondOrderToSymbol}} = ('1' => '', '1.5' => ':', '2' => '=', '3' => '#');
 188 
 189   # BondSymbols map to use for bonded atom IDs to use during atom path strings...
 190   %{$This->{BondSymbols}} = ();
 191 
 192   # Path atom IDs to remove duplicate paths...
 193   %{$This->{UniqueLinearAtomPathsIDs}} = ();
 194   %{$This->{UniqueCyclicAtomPathsIDs}} = ();
 195 
 196   # Reference to all the atom paths upto specified path length...
 197   $This->{AtomPathsRef} = '';
 198 
 199   # Atom paths strings created using specified atom types and bond symbols...
 200   %{$This->{AtomPathsStrings}} = ();
 201 }
 202 
# Initialize class: store the name of this package in the file level
# $ClassName variable, which is interpolated into error and warning
# messages...
sub _InitializeClass {
  #Class name...
  $ClassName = __PACKAGE__;
}
 208 
 209 # Initialize object properties....
 210 sub _InitializePathLengthFingerprintsProperties {
 211   my($This, %NamesAndValues) = @_;
 212 
 213   my($Name, $Value, $MethodName);
 214   while (($Name, $Value) = each  %NamesAndValues) {
 215     $MethodName = "Set${Name}";
 216     $This->$MethodName($Value);
 217   }
 218 
 219   # Make sure molecule object was specified...
 220   if (!exists $NamesAndValues{Molecule}) {
 221     croak "Error: ${ClassName}->New: Object can't be instantiated without specifying molecule...";
 222   }
 223 
 224   if (!exists $NamesAndValues{Type}) {
 225     croak "Error: ${ClassName}->New: Object can't be instantiated without specifying Type...";
 226   }
 227 
 228   if (!exists $NamesAndValues{AtomIdentifierType}) {
 229     croak "Error: ${ClassName}->New: Object can't be instantiated without specifying AtomIdentifierType...";
 230   }
 231 
 232   # Make sure it's power of 2...
 233   if (exists $NamesAndValues{Size}) {
 234     if (!TextUtil::IsNumberPowerOfNumber($NamesAndValues{Size}, 2)) {
 235       croak "Error: ${ClassName}->New: Specified size value, $NamesAndValues{Size}, must be power of 2...";
 236     }
 237   }
 238 
 239   if ($This->{Type} =~ /^PathLengthBits$/i) {
 240     $This->_InitializePathLengthBits();
 241   }
 242   elsif ($This->{Type} =~ /^PathLengthCount$/i) {
 243     $This->_InitializePathLengthCount();
 244   }
 245   else {
 246     croak "Error: ${ClassName}->_InitializePathLengthFingerprintsProperties: Unknown PathLength type: $This->{Type}; Supported PathLength type : PathLengthBits or PathLengthCount......";
 247   }
 248 
 249   return $This;
 250 }
 251 
 252 # Initialize PathLength bits...
 253 #
 254 sub _InitializePathLengthBits {
 255   my($This) = @_;
 256 
 257   # Vector type...
 258   $This->{VectorType} = 'FingerprintsBitVector';
 259 
 260   $This->_InitializeFingerprintsBitVector();
 261 
 262   return $This;
 263 }
 264 
 265 # Initialize PathLength key count...
 266 #
 267 sub _InitializePathLengthCount {
 268   my($This) = @_;
 269 
 270   # Vector type and type of values...
 271   $This->{VectorType} = 'FingerprintsVector';
 272   $This->{FingerprintsVectorType} = 'NumericalValues';
 273 
 274   $This->_InitializeFingerprintsVector();
 275 
 276   return $This;
 277 }
 278 
 279 # Set type...
 280 #
 281 sub SetType {
 282   my($This, $Type) = @_;
 283 
 284   if ($This->{Type}) {
 285     croak "Error: ${ClassName}->SetType: Can't change type:  It's already set...";
 286   }
 287 
 288   if ($Type =~ /^PathLengthBits$/i) {
 289     $This->{Type} = 'PathLengthBits';;
 290   }
 291   elsif ($Type =~ /^PathLengthCount$/i) {
 292     $This->{Type} = 'PathLengthCount';;
 293   }
 294   else {
 295     croak "Error: ${ClassName}->SetType: Unknown PathLength keys: $Type; Supported PathLength types: PathLengthBits or PathLengthCount...";
 296   }
 297   return $This;
 298 }
 299 
 300 # Disable vector type change...
 301 #
 302 sub SetVectorType {
 303   my($This, $Type) = @_;
 304 
 305   croak "Error: ${ClassName}->SetVectorType: Can't change vector type...";
 306 
 307   return $This;
 308 }
 309 
 310 # Disable vector type change...
 311 #
 312 sub SetFingerprintsVectorType {
 313   my($This, $Type) = @_;
 314 
 315   croak "Error: ${ClassName}->SetFingerprintsVectorType: Can't change fingerprints vector type...";
 316 
 317   return $This;
 318 }
 319 
 320 # Set atom identifier type to use for path length atom identifiers...
 321 #
 322 sub SetAtomIdentifierType {
 323   my($This, $IdentifierType) = @_;
 324 
 325   if ($IdentifierType !~ /^(AtomicInvariantsAtomTypes|DREIDINGAtomTypes|EStateAtomTypes|FunctionalClassAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i) {
 326     croak "Error: ${ClassName}->SetAtomIdentifierType: Specified value, $IdentifierType, for AtomIdentifierType is not vaild. Supported types in current release of MayaChemTools: AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, and UFFAtomTypes.";
 327   }
 328 
 329   if ($This->{AtomIdentifierType}) {
 330     croak "Error: ${ClassName}->SetAtomIdentifierType: Can't change atom identifier type:  It's already set...";
 331   }
 332 
 333   $This->{AtomIdentifierType} = $IdentifierType;
 334 
 335   # Initialize atom identifier type information...
 336   $This->_InitializeAtomIdentifierTypeInformation();
 337 
 338   return $This;
 339 }
 340 
 341 # Set minimum path length...
 342 #
 343 sub SetMinLength {
 344   my($This, $Value) = @_;
 345 
 346   if (!TextUtil::IsPositiveInteger($Value)) {
 347     croak "Error: ${ClassName}->SetMinLength: MinLength value, $Value, is not valid:  It must be a positive integer...";
 348   }
 349   $This->{MinLength} = $Value;
 350 
 351   return $This;
 352 }
 353 
 354 # Set maximum path length...
 355 #
 356 sub SetMaxLength {
 357   my($This, $Value) = @_;
 358 
 359   if (!TextUtil::IsPositiveInteger($Value)) {
 360     croak "Error: ${ClassName}->SetMaxLength: MaxLength value, $Value, is not valid:  It must be a positive integer...";
 361   }
 362   $This->{MaxLength} = $Value;
 363 
 364   return $This;
 365 }
 366 
 367 # Set number of bits to set for each path...
 368 #
 369 sub SetNumOfBitsToSetPerPath {
 370   my($This, $Value) = @_;
 371 
 372   if (!TextUtil::IsPositiveInteger($Value)) {
 373     croak "Error: ${ClassName}->SetNumOfBitsToSetPerPath: NumOfBitsToSetPerPath value, $Value, is not valid:  It must be a positive integer...";
 374   }
 375   $This->{NumOfBitsToSetPerPath} = $Value;
 376 
 377   return $This;
 378 }
 379 
 380 # Generate fingerprints description...
 381 #
 382 sub GetDescription {
 383   my($This) = @_;
 384 
 385   # Is description explicity set?
 386   if (exists $This->{Description}) {
 387     return $This->{Description};
 388   }
 389 
 390   # Generate fingerprints description...
 391 
 392   return "$This->{Type}:$This->{AtomIdentifierType}:MinLength$This->{MinLength}:MaxLength$This->{MaxLength}";
 393 }
 394 
 395 # Generate path length fingerprints...
 396 #
 397 sub GenerateFingerprints {
 398   my($This) = @_;
 399 
 400   if ($This->{MinLength} > $This->{MaxLength}) {
 401     croak "Error: ${ClassName}->GenerateFingerprints: No fingerpritns generated: MinLength, $This->{MinLength}, must be <= MaxLength, $This->{MaxLength}...";
 402   }
 403 
 404   # Cache appropriate molecule data...
 405   $This->_SetupMoleculeDataCache();
 406 
 407   # Assign atom types to all atoms...
 408   if (!$This->_AssignAtomTypes()) {
 409     carp "Warning: ${ClassName}->GenerateFingerprints: $This->{AtomIdentifierType} fingerprints generation didn't succeed: Couldn't assign valid $This->{AtomIdentifierType} to all atoms...";
 410     return $This;
 411   }
 412 
 413   # Setup bond symbol map...
 414   if ($This->{UseBondSymbols}) {
 415     $This->_InitializeBondSymbols();
 416   }
 417 
 418   # Generate appropriate atom paths...
 419   $This->_GenerateAtomPathsUpToMaxLength();
 420 
 421   # Initialize atom path strings...
 422   $This->_InitializeAtomPathsStrings();
 423 
 424   # Generate appropriate atom path strings for unique atom paths...
 425   $This->_GenerateAtomPathsStrings();
 426 
 427   # Set final fingerprints...
 428   $This->_SetFinalFingerprints();
 429 
 430   # Clear cached molecule data...
 431   $This->_ClearMoleculeDataCache();
 432 
 433   return $This;
 434 }
 435 
 436 # Assign appropriate atom types to all atoms...
 437 #
 438 sub _AssignAtomTypes {
 439   my($This) = @_;
 440   my($SpecifiedAtomTypes, $Atom, $AtomID, $IgnoreHydrogens);
 441 
 442   %{$This->{AssignedAtomTypes}} = ();
 443   $IgnoreHydrogens = 0;
 444 
 445   $SpecifiedAtomTypes = undef;
 446 
 447   IDENTIFIERTYPE: {
 448     if ($This->{AtomIdentifierType} =~ /^AtomicInvariantsAtomTypes$/i) {
 449       $SpecifiedAtomTypes = new AtomTypes::AtomicInvariantsAtomTypes('Molecule' => $This->{Molecule}, 'IgnoreHydrogens' => $IgnoreHydrogens, 'AtomicInvariantsToUse' => $This->{AtomicInvariantsToUse});
 450       last IDENTIFIERTYPE;
 451     }
 452 
 453     if ($This->{AtomIdentifierType} =~ /^DREIDINGAtomTypes$/i) {
 454       $SpecifiedAtomTypes = new AtomTypes::DREIDINGAtomTypes('Molecule' => $This->{Molecule}, 'IgnoreHydrogens' => $IgnoreHydrogens);
 455       last IDENTIFIERTYPE;
 456     }
 457 
 458     if ($This->{AtomIdentifierType} =~ /^EStateAtomTypes$/i) {
 459       $SpecifiedAtomTypes = new AtomTypes::EStateAtomTypes('Molecule' => $This->{Molecule}, 'IgnoreHydrogens' => $IgnoreHydrogens);
 460       last IDENTIFIERTYPE;
 461     }
 462 
 463     if ($This->{AtomIdentifierType} =~ /^FunctionalClassAtomTypes$/i) {
 464       $SpecifiedAtomTypes = new AtomTypes::FunctionalClassAtomTypes('Molecule' => $This->{Molecule}, 'IgnoreHydrogens' => $IgnoreHydrogens, 'FunctionalClassesToUse' => $This->{FunctionalClassesToUse});
 465       last IDENTIFIERTYPE;
 466     }
 467 
 468     if ($This->{AtomIdentifierType} =~ /^MMFF94AtomTypes$/i) {
 469       $SpecifiedAtomTypes = new AtomTypes::MMFF94AtomTypes('Molecule' => $This->{Molecule}, 'IgnoreHydrogens' => $IgnoreHydrogens);
 470       last IDENTIFIERTYPE;
 471     }
 472 
 473     if ($This->{AtomIdentifierType} =~ /^SLogPAtomTypes$/i) {
 474       $SpecifiedAtomTypes = new AtomTypes::SLogPAtomTypes('Molecule' => $This->{Molecule}, 'IgnoreHydrogens' => $IgnoreHydrogens);
 475       last IDENTIFIERTYPE;
 476     }
 477     if ($This->{AtomIdentifierType} =~ /^SYBYLAtomTypes$/i) {
 478       $SpecifiedAtomTypes = new AtomTypes::SYBYLAtomTypes('Molecule' => $This->{Molecule}, 'IgnoreHydrogens' => $IgnoreHydrogens);
 479       last IDENTIFIERTYPE;
 480     }
 481 
 482     if ($This->{AtomIdentifierType} =~ /^TPSAAtomTypes$/i) {
 483       $SpecifiedAtomTypes = new AtomTypes::TPSAAtomTypes('Molecule' => $This->{Molecule}, 'IgnorePhosphorus' => 0, 'IgnoreSulfur' => 0);
 484       last IDENTIFIERTYPE;
 485     }
 486 
 487     if ($This->{AtomIdentifierType} =~ /^UFFAtomTypes$/i) {
 488       $SpecifiedAtomTypes = new AtomTypes::UFFAtomTypes('Molecule' => $This->{Molecule}, 'IgnoreHydrogens' => $IgnoreHydrogens);
 489       last IDENTIFIERTYPE;
 490     }
 491 
 492     croak "Error: ${ClassName}->_AssignAtomTypes: Unknown atom indentifier type $This->{AtomIdentifierType}...";
 493   }
 494 
 495   # Assign atom types...
 496   $SpecifiedAtomTypes->AssignAtomTypes();
 497 
 498   # Make sure atom types assignment is successful...
 499   if (!$SpecifiedAtomTypes->IsAtomTypesAssignmentSuccessful()) {
 500     return undef;
 501   }
 502 
 503   # Collect assigned atom types...
 504   ATOM: for $Atom (@{$This->{Atoms}}) {
 505     $AtomID = $Atom->GetID();
 506     $This->{AssignedAtomTypes}{$AtomID} = $SpecifiedAtomTypes->GetAtomType($Atom);
 507   }
 508 
 509   return $This;
 510 }
 511 
 512 # Setup bond symbol map for atoms to speed up generation of path length identifiers
 513 # during fingerprints generation...
 514 #
 515 sub _InitializeBondSymbols {
 516   my($This) = @_;
 517   my($Atom1, $Atom2, $AtomID1, $AtomID2, $Bond, $BondSymbol, $BondOrder);
 518 
 519   %{$This->{BondSymbols}} = ();
 520 
 521   if (!$This->{UseBondSymbols}) {
 522     return $This;
 523   }
 524 
 525   for $Bond ($This->{Molecule}->GetBonds()) {
 526     $BondOrder = $Bond->GetBondOrder();
 527     $BondSymbol = $Bond->IsAromatic() ? ':' : (exists($This->{BondOrderToSymbol}{$BondOrder}) ? $This->{BondOrderToSymbol}{$BondOrder} : $BondOrder);
 528     ($Atom1, $Atom2) = $Bond->GetAtoms();
 529     $AtomID1 = $Atom1->GetID(); $AtomID2 = $Atom2->GetID();
 530     if ($AtomID1 > $AtomID2) {
 531       ($AtomID1, $AtomID2) =  ($AtomID2, $AtomID1);
 532     }
 533 
 534     if (!exists $This->{BondSymbols}{$AtomID1}) {
 535       %{$This->{BondSymbols}{$AtomID1}} = ();
 536     }
 537     $This->{BondSymbols}{$AtomID1}{$AtomID2} = $BondSymbol;
 538   }
 539   return $This;
 540 }
 541 
 542 # Get appropriate atom paths with length up to MaxLength...
 543 #
 544 sub _GenerateAtomPathsUpToMaxLength {
 545   my($This) = @_;
 546   my($PathLength, $AllowRings, $Molecule, $AtomPathsRef);
 547 
 548   $PathLength = $This->{MaxLength};
 549   $AllowRings = $This->{AllowRings};
 550   $Molecule = $This->{Molecule};
 551 
 552   if ($This->{AllowSharedBonds}) {
 553     $AtomPathsRef =  $Molecule->GetAllAtomPathsWithLengthUpto($PathLength, $AllowRings);
 554   }
 555   else {
 556     $AtomPathsRef = $Molecule->GetAtomPathsWithLengthUpto($PathLength, $AllowRings);
 557   }
 558   $This->{AtomPathsRef} = $AtomPathsRef;
 559 
 560   return $This;
 561 }
 562 
 563 # Initialize atom paths strings at various pathlength levels...
 564 #
 565 sub _InitializeAtomPathsStrings {
 566   my($This) = @_;
 567   my($PathLength);
 568 
 569   %{$This->{AtomPathsStrings}} = ();
 570 
 571   for $PathLength ($This->{MinLength} .. $This->{MaxLength}) {
 572     %{$This->{AtomPathsStrings}{$PathLength}} = ();
 573   }
 574 
 575   return $This;
 576 }
 577 
 578 # Generate appropriate atom path strings for unique atom paths...
 579 #
 580 sub _GenerateAtomPathsStrings {
 581   my($This, $PathAtomsRef) = @_;
 582   my($PathLength, $MinPathLength, $UseUniquePaths);
 583 
 584   $MinPathLength = $This->{MinLength};
 585   $UseUniquePaths = $This->{UseUniquePaths};
 586 
 587   PATHATOMS: for $PathAtomsRef (@{$This->{AtomPathsRef}}) {
 588     $PathLength = scalar @{$PathAtomsRef};
 589     if ($PathLength < $MinPathLength) {
 590       next PATHATOMS;
 591     }
 592     if ($UseUniquePaths) {
 593       $This->_GenerateAtomPathStringUsingUniquePath($PathAtomsRef);
 594     }
 595     else {
 596       $This->_GenerateAtomPathString($PathAtomsRef);
 597     }
 598   }
 599   return $This;
 600 }
 601 
 602 # Generate atom path string using unique path...
 603 #
 604 sub _GenerateAtomPathStringUsingUniquePath {
 605   my($This, $PathAtomsRef) = @_;
 606 
 607   if ($This->{AllowRings} && $This->_DoesAtomPathContainsCycle($PathAtomsRef)) {
 608     $This->_GenerateAtomPathStringUsingUniquePathContainingCycle($PathAtomsRef);
 609   }
 610   else {
 611     $This->_GenerateAtomPathStringUsingUniqueLinearPath($PathAtomsRef);
 612   }
 613   return $This;
 614 }
 615 
 616 # Generate atom path string for specified path containing no cycle...
 617 #
 618 sub _GenerateAtomPathStringUsingUniqueLinearPath {
 619   my($This, $PathAtomsRef) = @_;
 620 
 621   # Is it a unique linear atom path?
 622   #
 623   if (!$This->_IsUniqueLinearAtomPath($PathAtomsRef)) {
 624     return $This;
 625   }
 626   $This->_GenerateAtomPathString($PathAtomsRef);
 627 
 628   return $This;
 629 }
 630 
 631 # Is it a structurally unique linear path?
 632 #
 633 # For a path to be structurally unique, all of its atom IDs must be diffferent from any
 634 # earlier path atom IDs. In order to generate atom path atom ID invariant of the atom
 635 # order in the molecule, atom IDs are sorted numerically before generating the path ID.
 636 #
 637 # Notes:
 638 #   . Atom path ID doesn't reflect the order of atoms in the atom path.
 639 #
 640 sub _IsUniqueLinearAtomPath {
 641   my($This, $PathAtomsRef) = @_;
 642   my($AtomPathID, $PathLength, @PathAtomIDs);
 643 
 644   @PathAtomIDs = ();
 645   @PathAtomIDs = map { $_->GetID(); } @{$PathAtomsRef};
 646 
 647   $AtomPathID = join '-', sort { $a <=> $b } @PathAtomIDs;
 648   if (exists $This->{UniqueLinearAtomPathsIDs}{$AtomPathID}) {
 649     return 0;
 650   }
 651 
 652   # It's a unique atom path...
 653   $This->{UniqueLinearAtomPathsIDs}{$AtomPathID} = 1;
 654 
 655   return 1;
 656 }
 657 
 658 # Generate atom path string for specified path containing a cycle...
 659 #
 660 sub _GenerateAtomPathStringUsingUniquePathContainingCycle {
 661   my($This, $PathAtomsRef) = @_;
 662 
 663   # Is it a unique atom path containing a cycle?
 664   #
 665   if (!$This->_IsUniqueAtomPathContainingCycle($PathAtomsRef)) {
 666     return $This;
 667   }
 668 
 669   my($CycleClosingPathAtomIndex);
 670   ($CycleClosingPathAtomIndex) = $This->_GetAtomPathCycleClosingAtomIndex($PathAtomsRef);
 671 
 672   if ($CycleClosingPathAtomIndex == 0) {
 673     $This->_GenerateUniqueAtomPathStringForPathCycle($PathAtomsRef);
 674   }
 675   else {
 676     $This->_GenerateUniqueAtomPathStringForPathContainingCycle($PathAtomsRef, $CycleClosingPathAtomIndex);
 677   }
 678   return $This;
 679 }
 680 
 681 # Generate a unique atom path string for a cyclic path by generating atom path
 682 # strings for all possible paths in the cycle and keeping the lexicographically smallest
 683 # one.
 684 #
 685 # Although all the paths enumerated during atom path string generation are also
 686 # present in the intial paths list, but structural uniqueness check would detect
 687 # 'em earlier and this method ends being invoked only once for the first cyclic path.
 688 #
 689 # For atom paths containg same atom types and bond symbols, atom path strings
 690 # would be same for the paths.
 691 #
 692 sub _GenerateUniqueAtomPathStringForPathCycle {
 693   my($This, $PathAtomsRef) = @_;
 694 
 695   if ($This->_AreAllPathAtomsSymbolsSame($PathAtomsRef) && $This->_AreAllPathBondSymbolsSame($PathAtomsRef)) {
 696     return $This->_GenerateAtomPathString($PathAtomsRef);
 697   }
 698 
 699   # Generate all possible atom path strings and select the lexicographically smallest one...
 700   my($Index, $PathLength, $FinalAtomPathString, $FirstAtomPathString, $LastIndex, $FirstPartIndex, $FirstPartStartIndex, $FirstPartEndIndex, $SecondPartIndex, $SecondPartStartIndex, $SecondPartEndIndex, $AtomPathSymbolsRef, $AtomPathString, $ReverseAtomPathString, @FirstPartPathAtoms, @SecondPartPathAtoms, @PathAtoms);
 701 
 702   $PathLength = scalar @{$PathAtomsRef};
 703   $LastIndex = $PathLength - 1;
 704 
 705   $FinalAtomPathString = '';
 706   $FirstAtomPathString = 1;
 707 
 708   @FirstPartPathAtoms = (); @SecondPartPathAtoms = (); @PathAtoms = ();
 709 
 710   for $Index (0 .. ($LastIndex - 1)) {
 711     @FirstPartPathAtoms = (); @SecondPartPathAtoms = (); @PathAtoms = ();
 712 
 713     $FirstPartStartIndex = 0; $FirstPartEndIndex = $Index - 1;
 714     $SecondPartStartIndex = $Index; $SecondPartEndIndex = $LastIndex - 1;
 715 
 716     # Get first part atoms...
 717     for $FirstPartIndex ($FirstPartStartIndex .. $FirstPartEndIndex) {
 718       push @FirstPartPathAtoms, $PathAtomsRef->[$FirstPartIndex];
 719     }
 720 
 721     # Get second part atoms...
 722     for $SecondPartIndex ($SecondPartStartIndex .. $SecondPartEndIndex) {
 723       push @SecondPartPathAtoms, $PathAtomsRef->[$SecondPartIndex];
 724     }
 725 
 726     # Get final list of path atoms...
 727     if (@SecondPartPathAtoms) {
 728       push @PathAtoms, @SecondPartPathAtoms;
 729     }
 730     if (@FirstPartPathAtoms) {
 731       push @PathAtoms, @FirstPartPathAtoms;
 732     }
 733 
 734     # Complete the cycle by adding first atom as the last atom...
 735     push @PathAtoms, $PathAtomsRef->[$SecondPartStartIndex];
 736 
 737     # Generate atom path string...
 738     $AtomPathSymbolsRef = $This->_GenerateAtomPathSymbols(\@PathAtoms);
 739 
 740     $AtomPathString = join '', @{$AtomPathSymbolsRef};
 741     $ReverseAtomPathString = join '', reverse @{$AtomPathSymbolsRef};
 742 
 743     if ($ReverseAtomPathString le $AtomPathString) {
 744       $AtomPathString = $ReverseAtomPathString;
 745     }
 746 
 747     # Update final atom path string...
 748 
 749     if ($FirstAtomPathString) {
 750       $FirstAtomPathString = 0;
 751       $FinalAtomPathString = $AtomPathString;
 752     }
 753     else {
 754       if ($AtomPathString le $FinalAtomPathString) {
 755         $FinalAtomPathString = $AtomPathString;
 756       }
 757     }
 758   }
 759 
 760   # Set final atom path string...
 761   #
 762   if (exists $This->{AtomPathsStrings}{$PathLength}{$FinalAtomPathString}) {
 763     $This->{AtomPathsStrings}{$PathLength}{$FinalAtomPathString} += 1;
 764   }
 765   else {
 766     $This->{AtomPathsStrings}{$PathLength}{$FinalAtomPathString} = 1;
 767   }
 768 
 769   return $This;
 770 }
 771 
 772 #
 773 # Generate a unique atom path string for paths containing a cycle closed by
 774 # the specified atom index and the last atom index.
 775 #
 776 # The following methodology is used to generate atom path string which is
 777 # independemt of initial atom ordering:
 778 #   . Generate atom paths string from first atom to the atom before the first cycle
 779 #     closing atom.
 780 #   . Generate atom path string from atoms from first cycle closing atom index to
 781 #     the last path atom in both forward and reverse order. And select the lexicographically
 782 #     smallest atom path string.
 783 #   . Combine atom path string generated in first step with second step to generate
 784 #     final atom path string.
 785 #
 786 sub _GenerateUniqueAtomPathStringForPathContainingCycle {
 787   my($This, $PathAtomsRef, $CycleClosingAtomIndex) = @_;
 788   my($Index, $PathLength, $LastIndex, $LinearPartStartIndex, $LinearPartEndIndex, $CyclicPartStartIndex, $CyclicPartEndIndex, $CyclicPartAtomPathSymbolsRef, $CyclicPartAtomPathString, $ReverseCyclicPartAtomPathString, $AtomPathString, $AtomPathSymbolsRef, @CyclicPartPathAtoms, @PathAtoms);
 789 
 790   $PathLength = scalar @{$PathAtomsRef};
 791   $LastIndex = $PathLength - 1;
 792 
 793   @PathAtoms = ();
 794 
 795   # Get path atoms corresponding to linear  part of the path...
 796   $LinearPartStartIndex = 0; $LinearPartEndIndex = $CycleClosingAtomIndex - 1;
 797 
 798   for $Index ($LinearPartStartIndex .. $LinearPartEndIndex) {
 799     push @PathAtoms, $PathAtomsRef->[$Index];
 800   }
 801 
 802   # Get atoms correcponding to cyclic part of the path...
 803   @CyclicPartPathAtoms = ();
 804   $CyclicPartStartIndex = $CycleClosingAtomIndex; $CyclicPartEndIndex = $LastIndex;
 805 
 806   for $Index ($CyclicPartStartIndex .. $CyclicPartEndIndex) {
 807     push @CyclicPartPathAtoms, $PathAtomsRef->[$Index];
 808   }
 809 
 810   # Setup a lexicographically smaller atom path string for cyclic part...
 811 
 812   $CyclicPartAtomPathSymbolsRef = $This->_GenerateAtomPathSymbols(\@CyclicPartPathAtoms);
 813   $CyclicPartAtomPathString = join '', @{$CyclicPartAtomPathSymbolsRef};
 814   $ReverseCyclicPartAtomPathString = join '', reverse @{$CyclicPartAtomPathSymbolsRef};
 815 
 816   # Setup atom path corresponding to linear part and lexigraphicall smaller cyclic part...
 817 
 818   if ($ReverseCyclicPartAtomPathString le $CyclicPartAtomPathString) {
 819     push @PathAtoms, reverse @CyclicPartPathAtoms;
 820   }
 821   else {
 822     push @PathAtoms, @CyclicPartPathAtoms;
 823   }
 824 
 825   # Setup final atom path string...
 826 
 827   $AtomPathSymbolsRef = $This->_GenerateAtomPathSymbols(\@PathAtoms);
 828   $AtomPathString = join '', @{$AtomPathSymbolsRef};
 829 
 830   if (exists $This->{AtomPathsStrings}{$PathLength}{$AtomPathString}) {
 831     $This->{AtomPathsStrings}{$PathLength}{$AtomPathString} += 1;
 832   }
 833   else {
 834     $This->{AtomPathsStrings}{$PathLength}{$AtomPathString} = 1;
 835   }
 836 
 837   return $This;
 838 }
 839 
 840 # Does atom path contain a cycle?
 841 #
 842 # For an atom path to contain cycle, it must satisfy the following conditions:
 843 #   . Pathlength >= 3
 844 #   . Last atom ID is equal to first atom ID or some other atom ID besides itself
 845 #
 846 sub _DoesAtomPathContainsCycle {
 847   my($This, $PathAtomsRef) = @_;
 848   my($PathLength);
 849 
 850   $PathLength = scalar @{$PathAtomsRef};
 851   if ($PathLength <= 2) {
 852     return 0;
 853   }
 854 
 855   my($AtomIndex, $LastAtomIndex, $Atom, $AtomID, $LastAtom, $LastAtomID);
 856 
 857   $LastAtomIndex = $PathLength - 1;
 858   $LastAtom = $PathAtomsRef->[$LastAtomIndex];
 859   $LastAtomID = $LastAtom->GetID();
 860 
 861   # Look for atomID similar to last atom ID...
 862   for $AtomIndex (0 .. ($LastAtomIndex - 1)) {
 863     $Atom =  $PathAtomsRef->[$AtomIndex];
 864     $AtomID = $Atom->GetID();
 865 
 866     if ($AtomID == $LastAtomID) {
 867       # It's a cycle...
 868       return 1;
 869     }
 870   }
 871   return 0;
 872 }
 873 
 874 # Get atom path cycle closing atom index...
 875 #
 876 sub _GetAtomPathCycleClosingAtomIndex {
 877   my($This, $PathAtomsRef) = @_;
 878   my($AtomIndex, $LastAtomIndex, $Atom, $AtomID, $LastAtom, $LastAtomID, $PathLength);
 879 
 880   $PathLength = scalar @{$PathAtomsRef};
 881 
 882   $LastAtomIndex = $PathLength - 1;
 883   $LastAtom = $PathAtomsRef->[$LastAtomIndex]; $LastAtomID = $LastAtom->GetID();
 884 
 885   # Look for atomID similar to last atom ID...
 886   for $AtomIndex (0 .. ($LastAtomIndex - 1)) {
 887     $Atom =  $PathAtomsRef->[$AtomIndex]; $AtomID = $Atom->GetID();
 888 
 889     if ($AtomID == $LastAtomID) {
 890       # It's a cycle closing atom...
 891       return $AtomIndex;
 892     }
 893   }
 894   return undef;
 895 }
 896 
 897 # Is it a structurally unique path containing a cycle?
 898 #
 899 # For atom paths containing cycles, last atom ID is either equal to first atom ID or
 900 # some other atom ID besides itself.
 901 #
 902 # In order to determine its structurally unqiue independent of initial atom ordering,
 903 # the following methodolgy is used:
 904 #
 905 #   . For paths with same first and atom IDs:
 906 #      . Remove the last atom ID from atom path
 907 #      . Sort atom IDs in the path
 908 #      . Add first atom ID from the sorted list to the end of list to complete the cycle
 909 #      . Generate a atom path ID
 910 #      . Use final path ID to track uniqueness of path containing cycle.
 911 #
 912 #   . For paths with last atom ID equal to some other atom ID besidies itself:
 913 #      . Sort atom IDs in atom path
 914 #      . Generate atom path ID and use it to track unqiueness of atom paths.
 915 #
 916 sub _IsUniqueAtomPathContainingCycle {
 917   my($This, $PathAtomsRef) = @_;
 918   my($PathLength, $AtomPathID, $FirstAtom, $LastAtom, $FirstAtomID, $LastAtomID, @PathAtomIDs, @SortedPathAtomIDs);
 919 
 920   @PathAtomIDs = ();
 921   @PathAtomIDs = map { $_->GetID(); } @{$PathAtomsRef};
 922 
 923   $PathLength = scalar @{$PathAtomsRef};
 924 
 925   $FirstAtom = $PathAtomsRef->[0]; $FirstAtomID = $FirstAtom->GetID();
 926   $LastAtom = $PathAtomsRef->[$PathLength - 1]; $LastAtomID = $LastAtom->GetID();
 927 
 928   if ($FirstAtomID == $LastAtomID) {
 929     pop @PathAtomIDs;
 930 
 931     @SortedPathAtomIDs = ();
 932     @SortedPathAtomIDs = sort { $a <=> $b } @PathAtomIDs;
 933 
 934     push @SortedPathAtomIDs, $SortedPathAtomIDs[0];
 935 
 936     $AtomPathID = join '-', @SortedPathAtomIDs;
 937   }
 938   else {
 939     $AtomPathID = join '-', sort { $a <=> $b } @PathAtomIDs;
 940   }
 941 
 942   if (exists $This->{UniqueCyclicAtomPathsIDs}{$AtomPathID}) {
 943     return 0;
 944   }
 945 
 946   # It's a unique atom path containing a cycle...
 947   $This->{UniqueCyclicAtomPathsIDs}{$AtomPathID} = 1;
 948 
 949   return 1;
 950 }
 951 
 952 # Generate atom path string for specified atom path...
 953 #
 954 sub _GenerateAtomPathString {
 955   my($This, $PathAtomsRef) = @_;
 956   my($PathLength, $AtomPathString, $ReverseAtomPathString, $AtomPathSymbolsRef);
 957 
 958   $PathLength = scalar @{$PathAtomsRef};
 959 
 960   # Generate path atom and bond symbols...
 961   #
 962   $AtomPathSymbolsRef = $This->_GenerateAtomPathSymbols($PathAtomsRef);
 963 
 964   # Check presence of path using path ID created by atom path symbols...
 965   $AtomPathString = join '', @{$AtomPathSymbolsRef};
 966   if (exists $This->{AtomPathsStrings}{$PathLength}{$AtomPathString}) {
 967     $This->{AtomPathsStrings}{$PathLength}{$AtomPathString} += 1;
 968     return $This;
 969   }
 970 
 971   # Check presence of reverse path using path ID created by atom path symbols...
 972   #
 973   $ReverseAtomPathString = join '', reverse @{$AtomPathSymbolsRef};
 974   if (exists $This->{AtomPathsStrings}{$PathLength}{$ReverseAtomPathString}) {
 975     $This->{AtomPathsStrings}{$PathLength}{$ReverseAtomPathString} += 1;
 976     return $This;
 977   }
 978 
 979   # Use lexicographically smaller atom path string as PathID...
 980   #
 981   if ($AtomPathString le $ReverseAtomPathString) {
 982     $This->{AtomPathsStrings}{$PathLength}{$AtomPathString} = 1;
 983   }
 984   else {
 985     $This->{AtomPathsStrings}{$PathLength}{$ReverseAtomPathString} = 1;
 986   }
 987   return $This;
 988 }
 989 
 990 #  Are atom types for all path atoms same?
 991 #
 992 sub _AreAllPathAtomsSymbolsSame {
 993   my($This, $PathAtomsRef) = @_;
 994   my($Index, $Atom, $AtomID, $AtomType, $FirstAtomType);
 995 
 996   $Atom = $PathAtomsRef->[0]; $AtomID = $Atom->GetID();
 997   $FirstAtomType = $This->{AssignedAtomTypes}{$AtomID};
 998 
 999   for $Index (1 .. $#{$PathAtomsRef}) {
1000     $Atom = $PathAtomsRef->[$Index]; $AtomID = $Atom->GetID();
1001     $AtomType = $This->{AssignedAtomTypes}{$AtomID};
1002 
1003     if ($AtomType ne $FirstAtomType) {
1004       return 0;
1005     }
1006   }
1007   return 1;
1008 }
1009 
# Are bond symbols for all path bonds same?
#
# Bond symbols are ignored and assumed to be same when UseBondSymbols is not
# set. Otherwise, the symbol for the bond between the first two path atoms is
# compared as a string against the symbols for bonds between all other successive
# pairs of path atoms. Bond symbols are retrieved from BondSymbols using the
# smaller atom ID as the first key and the larger atom ID as the second key.
#
# Returns 1 when all bond symbols match; otherwise, returns 0.
#
sub _AreAllPathBondSymbolsSame {
  my($This, $PathAtomsRef) = @_;

  # No usage of bond symbols: nothing to compare...
  return 1 unless ($This->{UseBondSymbols});

  # Retrieve bond symbol using atom IDs ordered as smaller ID followed by larger ID...
  my $LookupBondSymbol = sub {
    my($AtomID1, $AtomID2) = @_;

    if ($AtomID1 >= $AtomID2) {
      ($AtomID1, $AtomID2) = ($AtomID2, $AtomID1);
    }
    return $This->{BondSymbols}{$AtomID1}{$AtomID2};
  };

  my $ReferenceBondSymbol = $LookupBondSymbol->($PathAtomsRef->[0]->GetID(), $PathAtomsRef->[1]->GetID());

  # Check bonds starting at second path atom...
  my($StartIndex, $CurrentBondSymbol);
  for $StartIndex (1 .. ($#{$PathAtomsRef} - 1)) {
    $CurrentBondSymbol = $LookupBondSymbol->($PathAtomsRef->[$StartIndex]->GetID(), $PathAtomsRef->[$StartIndex + 1]->GetID());

    return 0 if ($CurrentBondSymbol ne $ReferenceBondSymbol);
  }

  return 1;
}
1040 
# Generate symbols for an atom path...
#
# For paths containing a single atom, or during no usage of bond symbols, the
# list of symbols contains assigned atom types of path atoms in path order.
#
# Otherwise, atom types and bond symbols alternate in path order:
#
#   AtomType1, BondSymbol12, AtomType2, BondSymbol23, AtomType3...
#
# Atom types are retrieved from AssignedAtomTypes using atom ID. Bond symbols are
# retrieved from BondSymbols using the smaller atom ID as the first key and the
# larger atom ID as the second key.
#
# Returns a reference to the list of symbols.
#
sub _GenerateAtomPathSymbols {
  my($This, $PathAtomsRef) = @_;
  my(@PathSymbols);

  # Atom types only...
  if (@{$PathAtomsRef} == 1 || !$This->{UseBondSymbols}) {
    @PathSymbols = map { $This->{AssignedAtomTypes}{$_->GetID()}; } @{$PathAtomsRef};
    return \@PathSymbols;
  }

  # Atom types along with bond symbols: start with the first atom and walk
  # along the path adding a bond symbol and an atom type during each step...
  my $PreviousAtomID = $PathAtomsRef->[0]->GetID();
  push @PathSymbols, $This->{AssignedAtomTypes}{$PreviousAtomID};

  my($NextIndex, $NextAtomID, $SmallerAtomID, $LargerAtomID);
  for $NextIndex (1 .. $#{$PathAtomsRef}) {
    $NextAtomID = $PathAtomsRef->[$NextIndex]->GetID();

    if ($PreviousAtomID < $NextAtomID) {
      ($SmallerAtomID, $LargerAtomID) = ($PreviousAtomID, $NextAtomID);
    }
    else {
      ($SmallerAtomID, $LargerAtomID) = ($NextAtomID, $PreviousAtomID);
    }

    push @PathSymbols, $This->{BondSymbols}{$SmallerAtomID}{$LargerAtomID};
    push @PathSymbols, $This->{AssignedAtomTypes}{$NextAtomID};

    $PreviousAtomID = $NextAtomID;
  }

  return \@PathSymbols;
}
1083 
# Set final fingerprints...
#
# Fingerprints are marked as generated and path strings are turned into final
# fingerprints according to fingerprints type, matched without regard to case:
#
#   . PathLengthBits: fingerprints bit vector
#   . PathLengthCount: fingerprints vector
#
# Nothing else is done for any other type. Returns the object itself.
#
sub _SetFinalFingerprints {
  my($This) = @_;

  # Mark successful generation of fingerprints...
  $This->{FingerprintsGenerated} = 1;

  my $FingerprintsType = $This->{Type};

  if ($FingerprintsType =~ /^PathLengthBits$/i) {
    $This->_SetFinalFingerprintsBitVector();
    return $This;
  }

  if ($FingerprintsType =~ /^PathLengthCount$/i) {
    $This->_SetFinalFingerprintsVector();
  }

  return $This;
}
1101 
# Set final fingerprints bit vector...
#
# For each atom path string in AtomPathsStrings, irrespective of its count:
#
#   . Hash code generated by TextUtil::HashCode is used to seed the random
#     number generator: Perl core generator when UsePerlCoreRandom is set;
#     otherwise, the generator available in MathUtil
#   . NumOfBitsToSetPerPath random numbers are drawn from the seeded generator
#     using Size as argument and truncated to integers to get bit positions
#   . Bits at those positions are set in FingerprintsBitVector
#
# Returns the object itself.
#
sub _SetFinalFingerprintsBitVector {
  my($This) = @_;

  my $BitVector = $This->{FingerprintsBitVector};
  my $NumOfBits = $This->{Size};
  my $NumOfBitsPerPath = $This->{NumOfBitsToSetPerPath};
  my $UseCoreRandom = $This->{UsePerlCoreRandom};

  # Passed along as second argument to SetBit; presumably it turns off
  # validation of bit position by the bit vector - TODO confirm.
  my $SkipBitPosCheck = 1;

  my($PathStringsRef, $PathString, $HashCode, $Pass, $BitPos);

  for $PathStringsRef (values %{$This->{AtomPathsStrings}}) {
    for $PathString (keys %{$PathStringsRef}) {
      # Same path string always leads to same seed and bit positions...
      $HashCode = TextUtil::HashCode($PathString);

      if ($UseCoreRandom) {
        CORE::srand($HashCode);
      }
      else {
        MathUtil::srandom($HashCode);
      }

      for $Pass (1 .. $NumOfBitsPerPath) {
        if ($UseCoreRandom) {
          $BitPos = int(CORE::rand($NumOfBits));
        }
        else {
          $BitPos = int(MathUtil::random($NumOfBits));
        }
        $BitVector->SetBit($BitPos, $SkipBitPosCheck);
      }
    }
  }

  return $This;
}
1135 
# Set final fingerprints vector...
#
# Atom path strings in AtomPathsStrings are processed in numerically ascending
# order of path length and, for each path length, in default string sort order.
# Atom path strings are added as value IDs and their counts as values to
# FingerprintsVector.
#
# Returns the object itself.
#
sub _SetFinalFingerprintsVector {
  my($This) = @_;
  my(@PathCounts, @PathIDs);

  my($Length, $PathCountsRef, @SortedPathStrings);
  for $Length (sort { $a <=> $b } keys %{$This->{AtomPathsStrings}}) {
    $PathCountsRef = $This->{AtomPathsStrings}{$Length};
    @SortedPathStrings = sort keys %{$PathCountsRef};

    # IDs and corresponding counts in same order...
    push @PathIDs, @SortedPathStrings;
    push @PathCounts, @{$PathCountsRef}{@SortedPathStrings};
  }

  # Add path IDs followed by counts to fingerprints vector...
  my $Vector = $This->{FingerprintsVector};
  $Vector->AddValueIDs(\@PathIDs);
  $Vector->AddValues(\@PathCounts);

  return $This;
}
1160 
# Cache molecule data used during generation of fingerprints...
#
# List of atoms retrieved from the molecule is stored in Atoms. Returns the
# object itself.
#
sub _SetupMoleculeDataCache {
  my($This) = @_;

  my $Molecule = $This->GetMolecule();

  # All atoms...
  @{$This->{Atoms}} = $Molecule->GetAtoms();

  return $This;
}
1171 
# Clear cached molecule data...
#
# List of atoms in Atoms is emptied in place and AtomPathsRef is reset to an
# empty string. Returns the object itself.
#
sub _ClearMoleculeDataCache {
  my($This) = @_;

  # Path atoms...
  $This->{AtomPathsRef} = '';

  # Atoms...
  $#{$This->{Atoms}} = -1;

  return $This;
}
1185 
# Set atomic invariants to use for atom identifiers...
#
# Atomic invariants may be specified either as a list of values or as a
# reference to an array containing the values. Each specified value is checked
# using AtomTypes::AtomicInvariantsAtomTypes::IsAtomicInvariantAvailable.
#
# A warning is issued and nothing is returned during no specification of values.
# An unsupported atomic invariant raises an error. Otherwise, AtomicInvariantsToUse
# is replaced by the specified values and the object itself is returned.
#
sub SetAtomicInvariantsToUse {
  my($This, @Values) = @_;

  unless (@Values) {
    carp "Warning: ${ClassName}->SetAtomicInvariantsToUse: No values specified...";
    return;
  }

  # Array reference as first value or a plain list of values?
  my(@RequestedAtomicInvariants);
  if (ref($Values[0]) =~ /^ARRAY/) {
    @RequestedAtomicInvariants = @{$Values[0]};
  }
  else {
    @RequestedAtomicInvariants = @Values;
  }

  # Validate all values before changing the object...
  my($SpecifiedAtomicInvariant, @ValidatedAtomicInvariants);
  for $SpecifiedAtomicInvariant (@RequestedAtomicInvariants) {
    AtomTypes::AtomicInvariantsAtomTypes::IsAtomicInvariantAvailable($SpecifiedAtomicInvariant)
      or croak "Error: ${ClassName}->SetAtomicInvariantsToUse: Specified atomic invariant, $SpecifiedAtomicInvariant, is not supported...\n ";

    push @ValidatedAtomicInvariants, $SpecifiedAtomicInvariant;
  }

  @{$This->{AtomicInvariantsToUse}} = @ValidatedAtomicInvariants;

  return $This;
}
1225 
# Set functional classes to use for atom identifiers...
#
# Functional classes may be specified either as a list of values or as a
# reference to an array containing the values. Each specified value is checked
# using AtomTypes::FunctionalClassAtomTypes::IsFunctionalClassAvailable.
#
# A warning is issued and nothing is returned during no specification of values
# or for AtomIdentifierType other than FunctionalClassAtomTypes. An unsupported
# functional class raises an error. Otherwise, FunctionalClassesToUse is replaced
# by the specified values and the object itself is returned.
#
sub SetFunctionalClassesToUse {
  my($This, @Values) = @_;

  unless (@Values) {
    carp "Warning: ${ClassName}->SetFunctionalClassesToUse: No values specified...";
    return;
  }

  unless ($This->{AtomIdentifierType} =~ /^FunctionalClassAtomTypes$/i) {
    carp "Warning: ${ClassName}->SetFunctionalClassesToUse: FunctionalClassesToUse can't be set for InitialAtomIdentifierType of $This->{AtomIdentifierType}...";
    return;
  }

  # Array reference as first value or a plain list of values?
  my(@RequestedFunctionalClasses);
  if (ref($Values[0]) =~ /^ARRAY/) {
    @RequestedFunctionalClasses = @{$Values[0]};
  }
  else {
    @RequestedFunctionalClasses = @Values;
  }

  # Validate all values before changing the object...
  my($SpecifiedFunctionalClass, @ValidatedFunctionalClasses);
  for $SpecifiedFunctionalClass (@RequestedFunctionalClasses) {
    AtomTypes::FunctionalClassAtomTypes::IsFunctionalClassAvailable($SpecifiedFunctionalClass)
      or croak "Error: ${ClassName}->SetFunctionalClassesToUse: Specified functional class, $SpecifiedFunctionalClass, is not supported...\n ";

    push @ValidatedFunctionalClasses, $SpecifiedFunctionalClass;
  }

  @{$This->{FunctionalClassesToUse}} = @ValidatedFunctionalClasses;

  return $This;
}
1269 
# Initialize atom identifier type information...
#
# Current supported values, matched without regard to case:
#
# AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, FunctionalClassAtomTypes,
# MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes
#
# Default atomic invariants or functional classes to use are set up for
# AtomicInvariantsAtomTypes and FunctionalClassAtomTypes; no initialization is
# needed for other supported types. An unknown type raises an error.
#
# Returns the object itself.
#
sub _InitializeAtomIdentifierTypeInformation {
  my($This) = @_;

  my $IdentifierType = $This->{AtomIdentifierType};

  if ($IdentifierType =~ /^AtomicInvariantsAtomTypes$/i) {
    $This->_InitializeAtomicInvariantsAtomTypesInformation();
    return $This;
  }

  if ($IdentifierType =~ /^FunctionalClassAtomTypes$/i) {
    $This->_InitializeFunctionalClassAtomTypesInformation();
    return $This;
  }

  if ($IdentifierType =~ /^(DREIDINGAtomTypes|EStateAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i) {
    # Nothing to do for now...
    return $This;
  }

  croak "Error: ${ClassName}->_InitializeAtomIdentifierTypeInformation: Unknown atom indentifier type $This->{AtomIdentifierType}...";
}
1295 
# Initialize atomic invariants to use for generating atom identifiers...
#
# Let:
#   AS = Atom symbol corresponding to element symbol
#
#   X<n>   = Number of non-hydrogen atom neighbors or heavy atoms attached to atom
#   BO<n> = Sum of bond orders to non-hydrogen atom neighbors or heavy atoms attached to atom
#   LBO<n> = Largest bond order of non-hydrogen atom neighbors or heavy atoms attached to atom
#   SB<n> = Number of single bonds to non-hydrogen atom neighbors or heavy atoms attached to atom
#   DB<n> = Number of double bonds to non-hydrogen atom neighbors or heavy atoms attached to atom
#   TB<n> = Number of triple bonds to non-hydrogen atom neighbors or heavy atoms attached to atom
#   H<n>   = Number of implicit and explicit hydrogens for atom
#   Ar     = Aromatic annotation indicating whether atom is aromatic
#   RA     = Ring atom annotation indicating whether atom is in a ring
#   FC<+n/-n> = Formal charge assigned to atom
#   MN<n> = Mass number indicating isotope other than most abundant isotope
#   SM<n> = Spin multiplicity of atom. Possible values: 1 (singlet), 2 (doublet) or 3 (triplet)
#
# Then:
#
#   Atom type generated by AtomTypes::AtomicInvariantsAtomTypes class corresponds to:
#
#     AS.X<n>.BO<n>.LBO<n>.<SB><n>.<DB><n>.<TB><n>.H<n>.Ar.RA.FC<+n/-n>.MN<n>.SM<n>
#
# Except for AS, which is a required atomic invariant in atom types, all other atomic
# invariants are optional. Atomic invariants with zero or undefined values are not
# included in the AtomID specification.
#
# AtomicInvariantsToUse is replaced by the default atomic invariants: AS, X, BO, H, FC.
# Returns the object itself.
#
sub _InitializeAtomicInvariantsAtomTypesInformation {
  my($This) = @_;

  # Default atomic invariants to use for generating atom IDs...
  my @DefaultAtomicInvariants = qw(AS X BO H FC);

  @{$This->{AtomicInvariantsToUse}} = @DefaultAtomicInvariants;

  return $This;
}
1334 
# Initialize functional classes, corresponding to atom types generated by
# AtomTypes::FunctionalClassAtomTypes class, to use for generating atom identifiers...
#
# Let:
#   HBD: HydrogenBondDonor
#   HBA: HydrogenBondAcceptor
#   PI : PositivelyIonizable
#   NI : NegativelyIonizable
#   Ar : Aromatic
#   Hal : Halogen
#   H : Hydrophobic
#   RA : RingAtom
#   CA : ChainAtom
#
# Then:
#
#   Functional class atom type specification for an atom corresponds to:
#
#     Ar.CA.H.HBA.HBD.Hal.NI.PI.RA
#
#   FunctionalAtomTypes are assigned using the following definitions [ Ref 60-61, Ref 65-66 ]:
#
#     HydrogenBondDonor: NH, NH2, OH
#     HydrogenBondAcceptor: N[!H], O
#     PositivelyIonizable: +, NH2
#     NegativelyIonizable: -, C(=O)OH, S(=O)OH, P(=O)OH
#
# FunctionalClassesToUse is replaced by the default functional classes: HBD, HBA, PI,
# NI, Ar, Hal. Returns the object itself.
#
sub _InitializeFunctionalClassAtomTypesInformation {
  my($This) = @_;

  # Default functional classes to use for generating atom identifiers...
  my @DefaultFunctionalClasses = qw(HBD HBA PI NI Ar Hal);

  @{$This->{FunctionalClassesToUse}} = @DefaultFunctionalClasses;

  return $This;
}
1375 
# Return a string containing data for PathLengthFingerprints object...
#
# The string is a list of "Name: Value" pairs delimited by "; " and contains,
# in order:
#
#   . Fingerprint type and AtomIdentifierType
#   . MinPathLength and MaxPathLength
#   . UseUniquePaths, AllowSharedBonds, AllowRings and UseBondSymbols as Yes or No
#   . For AtomicInvariantsAtomTypes: atomic invariants to use along with order and
#     description of available atomic invariants
#   . For FunctionalClassAtomTypes: functional classes to use along with order and
#     description of available functional classes
#   . For PathLengthBits: Size, MinSize, MaxSize, NumOfBitsToSetPerPath, number of
#     on bits, bit density and fingerprints bit vector
#   . For PathLengthCount: fingerprints vector
#
sub StringifyPathLengthFingerprints {
  my($This) = @_;
  my($PathLengthsFingerprintsString);

  # Type of fingerprint...
  $PathLengthsFingerprintsString = "Fingerprint type: $This->{Type}; AtomIdentifierType: $This->{AtomIdentifierType}";

  # Path length...
  $PathLengthsFingerprintsString .= "; MinPathLength: $This->{MinLength}; MaxPathLength: $This->{MaxLength}";

  # Fingerprint generation control...
  my($AllowSharedBonds, $AllowRings, $UseBondSymbols, $UseUniquePaths);

  $AllowSharedBonds = $This->{AllowSharedBonds} ? "Yes" : "No";
  $AllowRings = $This->{AllowRings} ? "Yes" : "No";
  $UseBondSymbols = $This->{UseBondSymbols} ? "Yes" : "No";

  # Report the UseUniquePaths setting itself; it used to mirror the value of
  # UseBondSymbols irrespective of how unique paths were being handled.
  $UseUniquePaths = $This->{UseUniquePaths} ? "Yes" : "No";

  $PathLengthsFingerprintsString .= "; UseUniquePaths: $UseUniquePaths; AllowSharedBonds: $AllowSharedBonds; AllowRings: $AllowRings; UseBondSymbols: $UseBondSymbols";

  if ($This->{AtomIdentifierType} =~ /^AtomicInvariantsAtomTypes$/i) {
    my($AtomicInvariant, @AtomicInvariants, @AtomicInvariantsOrder, %AvailableAtomicInvariants);

    @AtomicInvariantsOrder = AtomTypes::AtomicInvariantsAtomTypes::GetAtomicInvariantsOrder();
    %AvailableAtomicInvariants = AtomTypes::AtomicInvariantsAtomTypes::GetAvailableAtomicInvariants();

    # Pair each available atomic invariant with its description...
    for $AtomicInvariant (@AtomicInvariantsOrder) {
      push @AtomicInvariants, "$AtomicInvariant: $AvailableAtomicInvariants{$AtomicInvariant}";
    }

    $PathLengthsFingerprintsString .= "; AtomicInvariantsToUse: <" . TextUtil::JoinWords(\@{$This->{AtomicInvariantsToUse}}, ", ", 0) . ">";
    $PathLengthsFingerprintsString .= "; AtomicInvariantsOrder: <" . TextUtil::JoinWords(\@AtomicInvariantsOrder, ", ", 0) . ">";
    $PathLengthsFingerprintsString .= "; AvailableAtomicInvariants: <" . TextUtil::JoinWords(\@AtomicInvariants, ", ", 0) . ">";
  }
  elsif ($This->{AtomIdentifierType} =~ /^FunctionalClassAtomTypes$/i) {
    my($FunctionalClass, @FunctionalClasses, @FunctionalClassesOrder, %AvailableFunctionalClasses);

    @FunctionalClassesOrder = AtomTypes::FunctionalClassAtomTypes::GetFunctionalClassesOrder();
    %AvailableFunctionalClasses = AtomTypes::FunctionalClassAtomTypes::GetAvailableFunctionalClasses();

    # Pair each available functional class with its description...
    for $FunctionalClass (@FunctionalClassesOrder) {
      push @FunctionalClasses, "$FunctionalClass: $AvailableFunctionalClasses{$FunctionalClass}";
    }

    $PathLengthsFingerprintsString .= "; FunctionalClassesToUse: <" . TextUtil::JoinWords(\@{$This->{FunctionalClassesToUse}}, ", ", 0) . ">";
    $PathLengthsFingerprintsString .= "; FunctionalClassesOrder: <" . TextUtil::JoinWords(\@FunctionalClassesOrder, ", ", 0) . ">";
    $PathLengthsFingerprintsString .= "; AvailableFunctionalClasses: <" . TextUtil::JoinWords(\@FunctionalClasses, ", ", 0) . ">";
  }

  if ($This->{Type} =~ /^PathLengthBits$/i) {
    # Size...
    $PathLengthsFingerprintsString .= "; Size: $This->{Size}; MinSize: $This->{MinSize}; MaxSize: $This->{MaxSize}";

    # NumOfBitsToSetPerPath...
    $PathLengthsFingerprintsString .= "; NumOfBitsToSetPerPath: $This->{NumOfBitsToSetPerPath}";

    # Fingerprint bit density and num of bits set...
    my($NumOfSetBits, $BitDensity);
    $NumOfSetBits = $This->{FingerprintsBitVector}->GetNumOfSetBits();
    $BitDensity = $This->{FingerprintsBitVector}->GetFingerprintsBitDensity();
    $PathLengthsFingerprintsString .= "; NumOfOnBits: $NumOfSetBits; BitDensity: $BitDensity";

    # Bit vector object is interpolated directly into the string...
    $PathLengthsFingerprintsString .= "; FingerprintsBitVector: < $This->{FingerprintsBitVector} >";
  }
  elsif ($This->{Type} =~ /^PathLengthCount$/i) {
    # Vector object is interpolated directly into the string...
    $PathLengthsFingerprintsString .= "; FingerprintsVector: < $This->{FingerprintsVector} >";
  }

  return $PathLengthsFingerprintsString;
}
1448