1 package Fingerprints::PathLengthFingerprints; 2 # 3 # $RCSfile: PathLengthFingerprints.pm,v $ 4 # $Date: 2015/02/28 20:48:54 $ 5 # $Revision: 1.39 $ 6 # 7 # Author: Manish Sud <msud@san.rr.com> 8 # 9 # Copyright (C) 2015 Manish Sud. All rights reserved. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 use strict; 30 use Carp; 31 use Exporter; 32 use TextUtil (); 33 use MathUtil (); 34 use Fingerprints::Fingerprints; 35 use Molecule; 36 use AtomTypes::AtomicInvariantsAtomTypes; 37 use AtomTypes::DREIDINGAtomTypes; 38 use AtomTypes::EStateAtomTypes; 39 use AtomTypes::FunctionalClassAtomTypes; 40 use AtomTypes::MMFF94AtomTypes; 41 use AtomTypes::SLogPAtomTypes; 42 use AtomTypes::SYBYLAtomTypes; 43 use AtomTypes::TPSAAtomTypes; 44 use AtomTypes::UFFAtomTypes; 45 46 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); 47 48 @ISA = qw(Fingerprints::Fingerprints Exporter); 49 @EXPORT = qw(); 50 @EXPORT_OK = qw(); 51 52 %EXPORT_TAGS = (all => [@EXPORT, @EXPORT_OK]); 53 54 # Setup class variables... 55 my($ClassName); 56 _InitializeClass(); 57 58 # Overload Perl functions... 59 use overload '""' => 'StringifyPathLengthFingerprints'; 60 61 # Class constructor... 
sub new {
  my($Class, %NamesAndValues) = @_;

  # Build the base object through the parent class and re-bless it into this
  # class (or into the class of the invoking object)...
  my $This = bless $Class->SUPER::new(), ref($Class) || $Class;

  # Install defaults first; caller supplied names and values are applied on top
  # of them through the corresponding Set<Name> methods...
  $This->_InitializePathLengthFingerprints();
  $This->_InitializePathLengthFingerprintsProperties(%NamesAndValues);

  return $This;
}

# Initialize object data to its default values...
#
# For molecules containing rings, atom paths starting from each atom can be
# traversed in four different ways:
#
#   . Atom paths without any rings and without sharing of bonds in traversed paths
#   . Atom paths containing rings and without sharing of bonds in traversed paths
#   . All possible atom paths without any rings and with sharing of bonds in traversed paths
#   . All possible atom paths containing rings and with sharing of bonds in traversed paths
#
# Atom path traversal is terminated at the last ring atom. For molecules
# containing no rings, the first two and the last two types are equivalent.
#
# In addition to atom symbols, bond symbols are also used to generate a string
# for atom paths. These atom path strings are hashed to a 32 bit integer key which
# in turn is used as a seed for random number generation in the range of 1 to
# fingerprint size for setting the corresponding bit in the bit vector.
#
# The combination of AllowSharedBonds, AllowRings, and UseBondSymbols allows
# generation of 8 different types of path length fingerprints:
#
#   AllowSharedBonds  AllowRings  UseBondSymbols  PathLengthFingerprintsType
#
#   No                No          Yes             AtomPathsNoCyclesWithBondSymbols
#   No                Yes         Yes             AtomPathsWithCyclesWithBondSymbols
#
#   Yes               No          Yes             AllAtomPathsNoCyclesWithBondSymbols
#   Yes               Yes         Yes             AllAtomPathsWithCyclesWithBondSymbols [ DEFAULT ]
#
#   No                No          No              AtomPathsNoCyclesNoBondSymbols
#   No                Yes         No              AtomPathsWithCyclesNoBondSymbols
#
#   Yes               No          No              AllAtomPathsNoCyclesNoBondSymbols
#   Yes               Yes         No              AllAtomPathsWithCyclesNoBondSymbols
#
sub _InitializePathLengthFingerprints {
  my($This) = @_;
  my($Name, %ScalarDefaults);

  %ScalarDefaults = (
    # Type of fingerprint to generate: PathLengthBits (bit vector indicating
    # presence/absence of atom paths) or PathLengthCount (vector containing
    # count of atom paths). Empty until explicitly set...
    'Type' => '',

    # Type of vector: FingerprintsBitVector or FingerprintsVector...
    'VectorType' => '',

    # Default, minimum, and maximum size. Although any arbitrary size can be
    # specified, the bit vector used to store bits works on a size which is a
    # power of 2; additional bits are automatically added and cleared...
    'Size' => 1024,
    'MinSize' => 32,
    'MaxSize' => 2**32,

    # Minimum and maximum path lengths to use for fingerprints generation...
    'MinLength' => 1,
    'MaxLength' => 8,

    # Number of bits to set for each atom path for FingerprintsBitVector...
    'NumOfBitsToSetPerPath' => 1,

    # Atom identifier type to use for path atoms. Supported values:
    # AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes,
    # FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes,
    # SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes...
    'AtomIdentifierType' => '',

    # Atom paths are allowed to share already traversed bonds...
    'AllowSharedBonds' => 1,

    # Rings are included in paths...
    'AllowRings' => 1,

    # Bond symbols are included in atom path strings...
    'UseBondSymbols' => 1,

    # Only structurally unique atom paths are used to generate atom path strings...
    'UseUniquePaths' => 1,

    # Random number generator to use during generation of fingerprints
    # bit-vector string: Perl CORE::rand or MayaChemTools MathUtil::random.
    #
    # MathUtil::random is a variant of the linear congruential generator (LCG)
    # described by Miller et al. [ Ref 120 ], also referred to as Lehmer or
    # Park-Miller random number generator. Unlike CORE::rand, it generates
    # consistent values across platforms for a specific seed and leads to
    # portable fingerprints bit-vector strings...
    'UsePerlCoreRandom' => 1,

    # Reference to all the atom paths up to the specified path length...
    'AtomPathsRef' => '',
  );
  @{$This}{keys %ScalarDefaults} = values %ScalarDefaults;

  # Bond symbols to use during generation of atom path strings...
  %{$This->{BondOrderToSymbol}} = ('1' => '', '1.5' => ':', '2' => '=', '3' => '#');

  # Empty lookup tables: atom types assigned to atoms, bond symbols for bonded
  # atom IDs, and atom path IDs used to remove duplicate linear and cyclic paths...
  for $Name (qw(AssignedAtomTypes BondSymbols UniqueLinearAtomPathsIDs UniqueCyclicAtomPathsIDs)) {
    %{$This->{$Name}} = ();
  }

  # Atom path strings created using specified atom types and bond symbols...
  %{$This->{AtomPathsStrings}} = ();
}

# Initialize class data: remember the package name used as prefix in error
# and warning messages...
sub _InitializeClass {
  $ClassName = __PACKAGE__;
}

# Initialize object properties....
sub _InitializePathLengthFingerprintsProperties {
  my($This, %NamesAndValues) = @_;
  my($Name, $MethodName);

  # Fail fast on missing mandatory parameters, before any setter has touched
  # the object; setters are dispatched in (random) hash order...
  if (!exists $NamesAndValues{Molecule}) {
    croak "Error: ${ClassName}->New: Object can't be instantiated without specifying molecule...";
  }
  if (!exists $NamesAndValues{Type}) {
    croak "Error: ${ClassName}->New: Object can't be instantiated without specifying Type...";
  }
  if (!exists $NamesAndValues{AtomIdentifierType}) {
    croak "Error: ${ClassName}->New: Object can't be instantiated without specifying AtomIdentifierType...";
  }

  # Apply each name and value pair through its Set<Name> method...
  for $Name (keys %NamesAndValues) {
    $MethodName = "Set${Name}";
    $This->$MethodName($NamesAndValues{$Name});
  }

  # Make sure an explicitly specified size is a power of 2...
  if (exists $NamesAndValues{Size} && !TextUtil::IsNumberPowerOfNumber($NamesAndValues{Size}, 2)) {
    croak "Error: ${ClassName}->New: Specified size value, $NamesAndValues{Size}, must be power of 2...";
  }

  # Set up the vector matching the fingerprints type...
  if ($This->{Type} =~ /^PathLengthBits$/i) {
    $This->_InitializePathLengthBits();
  }
  elsif ($This->{Type} =~ /^PathLengthCount$/i) {
    $This->_InitializePathLengthCount();
  }
  else {
    croak "Error: ${ClassName}->_InitializePathLengthFingerprintsProperties: Unknown PathLength type: $This->{Type}; Supported PathLength type : PathLengthBits or PathLengthCount......";
  }

  return $This;
}

# Initialize PathLength bits: mark the object as using a fingerprints bit
# vector and let _InitializeFingerprintsBitVector (not defined in this part of
# the file; presumably inherited) set it up...
#
sub _InitializePathLengthBits {
  my($This) = @_;

  $This->{VectorType} = 'FingerprintsBitVector';
  $This->_InitializeFingerprintsBitVector();

  return $This;
}

# Initialize PathLength count: mark the object as using a fingerprints vector
# holding numerical values and let _InitializeFingerprintsVector (not defined
# in this part of the file; presumably inherited) set it up...
#
sub _InitializePathLengthCount {
  my($This) = @_;

  $This->{VectorType} = 'FingerprintsVector';
  $This->{FingerprintsVectorType} = 'NumericalValues';
  $This->_InitializeFingerprintsVector();

  return $This;
}

# Set type of fingerprints: PathLengthBits or PathLengthCount (matched case
# insensitively and stored in canonical spelling). Type can be set only once;
# croaks on a second attempt or on an unknown type...
#
sub SetType {
  my($This, $Type) = @_;

  if ($This->{Type}) {
    croak "Error: ${ClassName}->SetType: Can't change type: It's already set...";
  }

  if ($Type =~ /^PathLengthBits$/i) {
    $This->{Type} = 'PathLengthBits';
  }
  elsif ($Type =~ /^PathLengthCount$/i) {
    $This->{Type} = 'PathLengthCount';
  }
  else {
    croak "Error: ${ClassName}->SetType: Unknown PathLength keys: $Type; Supported PathLength types: PathLengthBits or PathLengthCount...";
  }
  return $This;
}

# Disable vector type change: the vector type is derived from Type, so this
# method always croaks...
#
sub SetVectorType {
  my($This, $Type) = @_;

  croak "Error: ${ClassName}->SetVectorType: Can't change vector type...";
}

# Disable fingerprints vector type change: it's derived from Type, so this
# method always croaks...
#
sub SetFingerprintsVectorType {
  my($This, $Type) = @_;

  croak "Error: ${ClassName}->SetFingerprintsVectorType: Can't change fingerprints vector type...";
}

# Set atom identifier type to use for path length atom identifiers...
#
# The value is validated case insensitively against the supported atom types
# and stored as specified. It can be set only once; croaks on an unsupported
# value or on a second attempt.
#
sub SetAtomIdentifierType {
  my($This, $IdentifierType) = @_;

  if ($IdentifierType !~ /^(AtomicInvariantsAtomTypes|DREIDINGAtomTypes|EStateAtomTypes|FunctionalClassAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i) {
    croak "Error: ${ClassName}->SetAtomIdentifierType: Specified value, $IdentifierType, for AtomIdentifierType is not vaild. Supported types in current release of MayaChemTools: AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, and UFFAtomTypes.";
  }

  if ($This->{AtomIdentifierType}) {
    croak "Error: ${ClassName}->SetAtomIdentifierType: Can't change atom identifier type: It's already set...";
  }

  $This->{AtomIdentifierType} = $IdentifierType;

  # Initialize atom identifier type information (method not defined in this
  # part of the file)...
  $This->_InitializeAtomIdentifierTypeInformation();

  return $This;
}

# Set minimum path length; croaks unless the value is a positive integer...
#
sub SetMinLength {
  my($This, $Value) = @_;

  if (!TextUtil::IsPositiveInteger($Value)) {
    croak "Error: ${ClassName}->SetMinLength: MinLength value, $Value, is not valid: It must be a positive integer...";
  }
  $This->{MinLength} = $Value;

  return $This;
}

# Set maximum path length; croaks unless the value is a positive integer...
#
sub SetMaxLength {
  my($This, $Value) = @_;

  if (!TextUtil::IsPositiveInteger($Value)) {
    croak "Error: ${ClassName}->SetMaxLength: MaxLength value, $Value, is not valid: It must be a positive integer...";
  }
  $This->{MaxLength} = $Value;

  return $This;
}

# Set number of bits to set for each path; croaks unless the value is a
# positive integer...
#
sub SetNumOfBitsToSetPerPath {
  my($This, $Value) = @_;

  if (!TextUtil::IsPositiveInteger($Value)) {
    croak "Error: ${ClassName}->SetNumOfBitsToSetPerPath: NumOfBitsToSetPerPath value, $Value, is not valid: It must be a positive integer...";
  }
  $This->{NumOfBitsToSetPerPath} = $Value;

  return $This;
}

# Generate fingerprints description: an explicitly set Description wins;
# otherwise it's composed from type, atom identifier type, and path lengths...
#
sub GetDescription {
  my($This) = @_;

  # Is description explicitly set?
  if (exists $This->{Description}) {
    return $This->{Description};
  }

  return "$This->{Type}:$This->{AtomIdentifierType}:MinLength$This->{MinLength}:MaxLength$This->{MaxLength}";
}

# Generate path length fingerprints...
#
# Croaks when MinLength exceeds MaxLength. When valid atom types can't be
# assigned to all atoms, a warning is issued and no fingerprints are generated.
# Returns $This in both the successful and the warning case.
#
# NOTE(review): UniqueLinearAtomPathsIDs and UniqueCyclicAtomPathsIDs are only
# emptied at object initialization, so a second call on the same object appears
# to treat every path as a duplicate - confirm whether repeated calls are
# supported.
#
sub GenerateFingerprints {
  my($This) = @_;

  if ($This->{MinLength} > $This->{MaxLength}) {
    croak "Error: ${ClassName}->GenerateFingerprints: No fingerpritns generated: MinLength, $This->{MinLength}, must be <= MaxLength, $This->{MaxLength}...";
  }

  # Cache appropriate molecule data...
  $This->_SetupMoleculeDataCache();

  # Assign atom types to all atoms...
  if (!$This->_AssignAtomTypes()) {
    carp "Warning: ${ClassName}->GenerateFingerprints: $This->{AtomIdentifierType} fingerprints generation didn't succeed: Couldn't assign valid $This->{AtomIdentifierType} to all atoms...";

    # Release cached molecule data on the failure path as well; otherwise the
    # cache set up above outlives this call...
    $This->_ClearMoleculeDataCache();

    return $This;
  }

  # Setup bond symbol map...
  if ($This->{UseBondSymbols}) {
    $This->_InitializeBondSymbols();
  }

  # Generate appropriate atom paths...
  $This->_GenerateAtomPathsUpToMaxLength();

  # Initialize atom path strings...
  $This->_InitializeAtomPathsStrings();

  # Generate appropriate atom path strings for unique atom paths...
  $This->_GenerateAtomPathsStrings();

  # Set final fingerprints...
  $This->_SetFinalFingerprints();

  # Clear cached molecule data...
  $This->_ClearMoleculeDataCache();

  return $This;
}

# Assign appropriate atom types to all atoms...
#
# Creates the atom types object matching AtomIdentifierType (compared case
# insensitively), lets it assign atom types for the molecule, and collects the
# type of each atom in $This->{Atoms} into AssignedAtomTypes keyed by atom ID.
# $This->{Atoms} isn't populated in this part of the file; presumably it's
# filled in by _SetupMoleculeDataCache.
#
# Returns $This on success and undef when atom types assignment didn't
# succeed; croaks for an unknown AtomIdentifierType.
#
sub _AssignAtomTypes {
  my($This) = @_;
  my($SpecifiedAtomTypes, $Atom, $IgnoreHydrogens, $TypeSpecRef, $TypeName, $TypeClass, @TypeParameters, @SupportedTypeSpecs);

  %{$This->{AssignedAtomTypes}} = ();
  $IgnoreHydrogens = 0;

  # Atom types class name (without AtomTypes:: prefix) followed by constructor
  # parameters to pass in addition to Molecule...
  @SupportedTypeSpecs = (
    ['AtomicInvariantsAtomTypes', 'IgnoreHydrogens' => $IgnoreHydrogens, 'AtomicInvariantsToUse' => $This->{AtomicInvariantsToUse}],
    ['DREIDINGAtomTypes', 'IgnoreHydrogens' => $IgnoreHydrogens],
    ['EStateAtomTypes', 'IgnoreHydrogens' => $IgnoreHydrogens],
    ['FunctionalClassAtomTypes', 'IgnoreHydrogens' => $IgnoreHydrogens, 'FunctionalClassesToUse' => $This->{FunctionalClassesToUse}],
    ['MMFF94AtomTypes', 'IgnoreHydrogens' => $IgnoreHydrogens],
    ['SLogPAtomTypes', 'IgnoreHydrogens' => $IgnoreHydrogens],
    ['SYBYLAtomTypes', 'IgnoreHydrogens' => $IgnoreHydrogens],
    ['TPSAAtomTypes', 'IgnorePhosphorus' => 0, 'IgnoreSulfur' => 0],
    ['UFFAtomTypes', 'IgnoreHydrogens' => $IgnoreHydrogens],
  );

  $TypeClass = '';
  TYPESPEC: for $TypeSpecRef (@SupportedTypeSpecs) {
    ($TypeName, @TypeParameters) = @{$TypeSpecRef};
    if ($This->{AtomIdentifierType} =~ /^${TypeName}$/i) {
      $TypeClass = "AtomTypes::${TypeName}";
      last TYPESPEC;
    }
  }
  if (!$TypeClass) {
    croak "Error: ${ClassName}->_AssignAtomTypes: Unknown atom indentifier type $This->{AtomIdentifierType}...";
  }

  # Explicit class method call instead of indirect object syntax...
  $SpecifiedAtomTypes = $TypeClass->new('Molecule' => $This->{Molecule}, @TypeParameters);

  # Assign atom types...
  $SpecifiedAtomTypes->AssignAtomTypes();

  # Make sure atom types assignment is successful...
  if (!$SpecifiedAtomTypes->IsAtomTypesAssignmentSuccessful()) {
    return undef;
  }

  # Collect assigned atom types...
  for $Atom (@{$This->{Atoms}}) {
    $This->{AssignedAtomTypes}{$Atom->GetID()} = $SpecifiedAtomTypes->GetAtomType($Atom);
  }

  return $This;
}

# Setup bond symbol map for atoms to speed up generation of path length identifiers
# during fingerprints generation...
#
# BondSymbols is emptied and, when UseBondSymbols is set, refilled as a two
# level map: smaller atom ID => larger atom ID => bond symbol. Aromatic bonds
# always use ':'; other bonds use the BondOrderToSymbol entry for their bond
# order, falling back to the bond order itself when no entry exists.
#
sub _InitializeBondSymbols {
  my($This) = @_;
  my($Bond, $BondOrder, $BondSymbol, $SmallerAtomID, $LargerAtomID);

  %{$This->{BondSymbols}} = ();

  if (!$This->{UseBondSymbols}) {
    return $This;
  }

  for $Bond ($This->{Molecule}->GetBonds()) {
    $BondOrder = $Bond->GetBondOrder();

    if ($Bond->IsAromatic()) {
      $BondSymbol = ':';
    }
    elsif (exists $This->{BondOrderToSymbol}{$BondOrder}) {
      $BondSymbol = $This->{BondOrderToSymbol}{$BondOrder};
    }
    else {
      $BondSymbol = $BondOrder;
    }

    # Key by (smaller ID, larger ID) so lookups don't depend on bond direction...
    ($SmallerAtomID, $LargerAtomID) = sort { $a <=> $b } map { $_->GetID() } $Bond->GetAtoms();

    $This->{BondSymbols}{$SmallerAtomID}{$LargerAtomID} = $BondSymbol;
  }
  return $This;
}

# Get appropriate atom paths with length up to MaxLength...
#
# With AllowSharedBonds set, all atom paths are retrieved from the molecule;
# otherwise only atom paths without sharing of bonds. AllowRings is passed
# through to the molecule method. The result is stored in AtomPathsRef.
#
sub _GenerateAtomPathsUpToMaxLength {
  my($This) = @_;
  my($MethodName);

  $MethodName = $This->{AllowSharedBonds} ? 'GetAllAtomPathsWithLengthUpto' : 'GetAtomPathsWithLengthUpto';
  $This->{AtomPathsRef} = $This->{Molecule}->$MethodName($This->{MaxLength}, $This->{AllowRings});

  return $This;
}

# Initialize atom paths strings at various pathlength levels: one empty map
# for each path length from MinLength to MaxLength...
#
sub _InitializeAtomPathsStrings {
  my($This) = @_;
  my($PathLength);

  %{$This->{AtomPathsStrings}} = ();

  for $PathLength ($This->{MinLength} .. $This->{MaxLength}) {
    %{$This->{AtomPathsStrings}{$PathLength}} = ();
  }

  return $This;
}

# Generate appropriate atom path strings for unique atom paths...
#
# Every atom path in AtomPathsRef with at least MinLength atoms is turned into
# an atom path string; shorter paths are skipped. With UseUniquePaths set,
# paths are first filtered for structural uniqueness.
#
sub _GenerateAtomPathsStrings {
  my($This) = @_;
  my($MinPathLength, $MethodName);

  $MinPathLength = $This->{MinLength};
  $MethodName = $This->{UseUniquePaths} ? '_GenerateAtomPathStringUsingUniquePath' : '_GenerateAtomPathString';

  for my $PathAtomsRef (grep { scalar(@{$_}) >= $MinPathLength } @{$This->{AtomPathsRef}}) {
    $This->$MethodName($PathAtomsRef);
  }
  return $This;
}

# Generate atom path string using unique path: paths containing a cycle are
# handled separately from linear paths, but only when rings are allowed...
#
sub _GenerateAtomPathStringUsingUniquePath {
  my($This, $PathAtomsRef) = @_;
  my($IsCyclicPath);

  $IsCyclicPath = $This->{AllowRings} && $This->_DoesAtomPathContainsCycle($PathAtomsRef);

  if ($IsCyclicPath) {
    $This->_GenerateAtomPathStringUsingUniquePathContainingCycle($PathAtomsRef);
  }
  else {
    $This->_GenerateAtomPathStringUsingUniqueLinearPath($PathAtomsRef);
  }
  return $This;
}

# Generate atom path string for specified path containing no cycle; nothing is
# generated for a path which isn't structurally unique...
#
sub _GenerateAtomPathStringUsingUniqueLinearPath {
  my($This, $PathAtomsRef) = @_;

  if ($This->_IsUniqueLinearAtomPath($PathAtomsRef)) {
    $This->_GenerateAtomPathString($PathAtomsRef);
  }

  return $This;
}

# Is it a structurally unique linear path?
#
# For a path to be structurally unique, its set of atom IDs must be different from
# the atom IDs of any earlier path. In order to generate an atom path ID invariant of
# the atom order in the molecule, atom IDs are sorted numerically before generating
# the path ID.
#
# Notes:
#   . Atom path ID doesn't reflect the order of atoms in the atom path.
#
# Returns 1 the first time a given set of atom IDs is seen, recording its path
# ID in UniqueLinearAtomPathsIDs, and 0 for every later path with the same IDs.
#
sub _IsUniqueLinearAtomPath {
  my($This, $PathAtomsRef) = @_;
  my($AtomPathID);

  # Numerically sorted atom IDs make up the path ID...
  $AtomPathID = join '-', sort { $a <=> $b } map { $_->GetID() } @{$PathAtomsRef};

  # Already seen?
  return 0 if exists $This->{UniqueLinearAtomPathsIDs}{$AtomPathID};

  # It's a unique atom path...
  $This->{UniqueLinearAtomPathsIDs}{$AtomPathID} = 1;

  return 1;
}

# Generate atom path string for specified path containing a cycle...
#
# Nothing is generated for a path which isn't structurally unique. A path whose
# cycle is closed by its first atom is a complete cycle; any other path consists
# of a linear part followed by a cyclic part.
#
sub _GenerateAtomPathStringUsingUniquePathContainingCycle {
  my($This, $PathAtomsRef) = @_;
  my($CycleClosingPathAtomIndex);

  if ($This->_IsUniqueAtomPathContainingCycle($PathAtomsRef)) {
    ($CycleClosingPathAtomIndex) = $This->_GetAtomPathCycleClosingAtomIndex($PathAtomsRef);

    if ($CycleClosingPathAtomIndex != 0) {
      $This->_GenerateUniqueAtomPathStringForPathContainingCycle($PathAtomsRef, $CycleClosingPathAtomIndex);
    }
    else {
      $This->_GenerateUniqueAtomPathStringForPathCycle($PathAtomsRef);
    }
  }
  return $This;
}

# Generate a unique atom path string for a cyclic path by generating atom path
# strings for all possible paths in the cycle and keeping the lexicographically smallest
# one.
#
# Although all the paths enumerated during atom path string generation are also
# present in the initial paths list, the structural uniqueness check detects
# 'em earlier and this method ends up being invoked only once for the first cyclic path.
#
# For atom paths containing same atom types and bond symbols, atom path strings
# would be same for the paths.
#
# The specified path is expected to be a complete cycle: its last atom repeats
# its first atom. The count for the selected atom path string is tracked in
# AtomPathsStrings under the length of the specified path.
#
sub _GenerateUniqueAtomPathStringForPathCycle {
  my($This, $PathAtomsRef) = @_;

  # A cycle with the same atom type and bond symbol throughout reads the same
  # from every start atom; no need to enumerate...
  if ($This->_AreAllPathAtomsSymbolsSame($PathAtomsRef) && $This->_AreAllPathBondSymbolsSame($PathAtomsRef)) {
    return $This->_GenerateAtomPathString($PathAtomsRef);
  }

  my($StartIndex, $PathLength, $RingSize, $SmallestAtomPathString, $SymbolsRef, $ForwardString, $BackwardString, $CandidateString, @RingAtoms, @RotatedAtoms);

  $PathLength = scalar @{$PathAtomsRef};

  # Ring atoms without the repeated cycle closing atom at the end...
  $RingSize = $PathLength - 1;
  @RingAtoms = @{$PathAtomsRef}[0 .. ($RingSize - 1)];

  $SmallestAtomPathString = undef;

  for $StartIndex (0 .. ($RingSize - 1)) {
    # Walk the ring starting from the current atom and close the cycle by
    # repeating the start atom as the last atom...
    @RotatedAtoms = (@RingAtoms[$StartIndex .. ($RingSize - 1)], @RingAtoms[0 .. ($StartIndex - 1)], $RingAtoms[$StartIndex]);

    $SymbolsRef = $This->_GenerateAtomPathSymbols(\@RotatedAtoms);

    # Pick the smaller one of the two traversal directions...
    $ForwardString = join '', @{$SymbolsRef};
    $BackwardString = join '', reverse @{$SymbolsRef};
    $CandidateString = ($BackwardString le $ForwardString) ? $BackwardString : $ForwardString;

    # Track the smallest atom path string over all start atoms...
    if (!defined($SmallestAtomPathString) || $CandidateString le $SmallestAtomPathString) {
      $SmallestAtomPathString = $CandidateString;
    }
  }

  # No start atom to enumerate: fall back to an empty atom path string...
  if (!defined $SmallestAtomPathString) {
    $SmallestAtomPathString = '';
  }

  # Count the final atom path string...
  $This->{AtomPathsStrings}{$PathLength}{$SmallestAtomPathString} += 1;

  return $This;
}

#
# Generate a unique atom path string for paths containing a cycle closed by
# the specified atom index and the last atom index.
#
# The following methodology is used to generate atom path string which is
# independent of initial atom ordering:
#   . Generate atom paths string from first atom to the atom before the first cycle
#     closing atom.
#   . Generate atom path string from atoms from first cycle closing atom index to
#     the last path atom in both forward and reverse order. And select the lexicographically
#     smallest atom path string.
#   . Combine atom path string generated in first step with second step to generate
#     final atom path string.
785 # 786 sub _GenerateUniqueAtomPathStringForPathContainingCycle { 787 my($This, $PathAtomsRef, $CycleClosingAtomIndex) = @_; 788 my($Index, $PathLength, $LastIndex, $LinearPartStartIndex, $LinearPartEndIndex, $CyclicPartStartIndex, $CyclicPartEndIndex, $CyclicPartAtomPathSymbolsRef, $CyclicPartAtomPathString, $ReverseCyclicPartAtomPathString, $AtomPathString, $AtomPathSymbolsRef, @CyclicPartPathAtoms, @PathAtoms); 789 790 $PathLength = scalar @{$PathAtomsRef}; 791 $LastIndex = $PathLength - 1; 792 793 @PathAtoms = (); 794 795 # Get path atoms corresponding to linear part of the path... 796 $LinearPartStartIndex = 0; $LinearPartEndIndex = $CycleClosingAtomIndex - 1; 797 798 for $Index ($LinearPartStartIndex .. $LinearPartEndIndex) { 799 push @PathAtoms, $PathAtomsRef->[$Index]; 800 } 801 802 # Get atoms correcponding to cyclic part of the path... 803 @CyclicPartPathAtoms = (); 804 $CyclicPartStartIndex = $CycleClosingAtomIndex; $CyclicPartEndIndex = $LastIndex; 805 806 for $Index ($CyclicPartStartIndex .. $CyclicPartEndIndex) { 807 push @CyclicPartPathAtoms, $PathAtomsRef->[$Index]; 808 } 809 810 # Setup a lexicographically smaller atom path string for cyclic part... 811 812 $CyclicPartAtomPathSymbolsRef = $This->_GenerateAtomPathSymbols(\@CyclicPartPathAtoms); 813 $CyclicPartAtomPathString = join '', @{$CyclicPartAtomPathSymbolsRef}; 814 $ReverseCyclicPartAtomPathString = join '', reverse @{$CyclicPartAtomPathSymbolsRef}; 815 816 # Setup atom path corresponding to linear part and lexigraphicall smaller cyclic part... 817 818 if ($ReverseCyclicPartAtomPathString le $CyclicPartAtomPathString) { 819 push @PathAtoms, reverse @CyclicPartPathAtoms; 820 } 821 else { 822 push @PathAtoms, @CyclicPartPathAtoms; 823 } 824 825 # Setup final atom path string... 
# Record the atom path string for the current path, counting repeated occurrences...
  $AtomPathSymbolsRef = $This->_GenerateAtomPathSymbols(\@PathAtoms);
  $AtomPathString = join '', @{$AtomPathSymbolsRef};

  if (exists $This->{AtomPathsStrings}{$PathLength}{$AtomPathString}) {
    $This->{AtomPathsStrings}{$PathLength}{$AtomPathString} += 1;
  }
  else {
    $This->{AtomPathsStrings}{$PathLength}{$AtomPathString} = 1;
  }

  return $This;
}

# Does atom path contain a cycle?
#
# For an atom path to contain a cycle, it must satisfy the following conditions:
#   . Path length >= 3
#   . Last atom ID is equal to first atom ID or some other atom ID besides itself
#
# Parameters:
#   $PathAtomsRef - reference to an array of atom objects providing GetID()
#
# Returns 1 when the ID of the last path atom also occurs earlier in the path;
# otherwise 0.
#
sub _DoesAtomPathContainsCycle {
  my($This, $PathAtomsRef) = @_;

  # Paths with fewer than three atoms are never reported as cyclic...
  return 0 if (@{$PathAtomsRef} <= 2);

  my $LastAtomIndex = $#{$PathAtomsRef};
  my $LastAtomID = $PathAtomsRef->[$LastAtomIndex]->GetID();

  # Look for an earlier atom carrying the ID of the last atom...
  for my $AtomIndex (0 .. ($LastAtomIndex - 1)) {
    return 1 if ($PathAtomsRef->[$AtomIndex]->GetID() == $LastAtomID);
  }
  return 0;
}

# Get atom path cycle closing atom index...
#
# Parameters:
#   $PathAtomsRef - reference to an array of atom objects providing GetID()
#
# Returns the index of the first atom in the path whose ID equals the ID of
# the last path atom, or undef when there is no such atom. An empty path has
# no last atom and also yields undef instead of failing on a method call on
# an undefined value.
#
sub _GetAtomPathCycleClosingAtomIndex {
  my($This, $PathAtomsRef) = @_;

  # Explicit undef is kept on purpose: callers always receive exactly one value...
  return undef if (!@{$PathAtomsRef});

  my $LastAtomIndex = $#{$PathAtomsRef};
  my $LastAtomID = $PathAtomsRef->[$LastAtomIndex]->GetID();

  # Look for an earlier atom carrying the ID of the last atom...
  for my $AtomIndex (0 .. ($LastAtomIndex - 1)) {
    if ($PathAtomsRef->[$AtomIndex]->GetID() == $LastAtomID) {
      # It's a cycle closing atom...
      return $AtomIndex;
    }
  }
  return undef;
}

# Is it a structurally unique path containing a cycle?
#
# For atom paths containing cycles, last atom ID is either equal to first atom ID or
# some other atom ID besides itself.
#
# In order to determine whether it is structurally unique independent of initial atom
# ordering, the following methodology is used:
#
#   . For paths with same first and last atom IDs:
#      . Remove the last atom ID from atom path
#      . Sort atom IDs in the path
#      . Add first atom ID from the sorted list to the end of list to complete the cycle
#      . Generate an atom path ID
#      . Use final path ID to track uniqueness of path containing cycle.
#
#   . For paths with last atom ID equal to some other atom ID besides itself:
#      . Sort atom IDs in atom path
#      . Generate atom path ID and use it to track uniqueness of atom paths.
#
# Returns 1 the first time a given path ID is seen and records it in
# $This->{UniqueCyclicAtomPathsIDs}; returns 0 for every later occurrence.
#
sub _IsUniqueAtomPathContainingCycle {
  my($This, $PathAtomsRef) = @_;
  my($AtomPathID, @PathAtomIDs);

  @PathAtomIDs = map { $_->GetID() } @{$PathAtomsRef};

  if ($PathAtomIDs[0] == $PathAtomIDs[-1]) {
    # Cycle closed on the starting atom: drop the duplicated ID, sort and re-close...
    pop @PathAtomIDs;

    my @SortedPathAtomIDs = sort { $a <=> $b } @PathAtomIDs;
    $AtomPathID = join '-', @SortedPathAtomIDs, $SortedPathAtomIDs[0];
  }
  else {
    $AtomPathID = join '-', sort { $a <=> $b } @PathAtomIDs;
  }

  return 0 if (exists $This->{UniqueCyclicAtomPathsIDs}{$AtomPathID});

  # It's a unique atom path containing a cycle...
  $This->{UniqueCyclicAtomPathsIDs}{$AtomPathID} = 1;

  return 1;
}

# Generate atom path string for specified atom path...
#
# The path is counted in $This->{AtomPathsStrings}{<number of path atoms>}. A path
# and its reverse share one entry: when either string is already present its count
# is incremented; otherwise the lexicographically smaller of the two strings
# becomes the key with a count of 1.
#
# Returns $This.
#
sub _GenerateAtomPathString {
  my($This, $PathAtomsRef) = @_;

  my $PathLength = scalar @{$PathAtomsRef};

  # Generate path atom and bond symbols...
  my $AtomPathSymbolsRef = $This->_GenerateAtomPathSymbols($PathAtomsRef);

  my $AtomPathString = join '', @{$AtomPathSymbolsRef};
  my $ReverseAtomPathString = join '', reverse @{$AtomPathSymbolsRef};

  my $PathStringsRef = ($This->{AtomPathsStrings}{$PathLength} ||= {});

  # Forward string is checked before the reverse string...
  for my $KnownAtomPathString ($AtomPathString, $ReverseAtomPathString) {
    if (exists $PathStringsRef->{$KnownAtomPathString}) {
      $PathStringsRef->{$KnownAtomPathString} += 1;
      return $This;
    }
  }

  # Use lexicographically smaller atom path string as PathID...
  my $PathID = ($AtomPathString le $ReverseAtomPathString) ? $AtomPathString : $ReverseAtomPathString;
  $PathStringsRef->{$PathID} = 1;

  return $This;
}

# Are atom types for all path atoms same?
#
# Compares the type assigned to each path atom in $This->{AssignedAtomTypes}
# against the type of the first path atom.
#
# Returns 1 when all assigned atom types are equal, including for an empty
# path which has nothing to compare; otherwise 0.
#
sub _AreAllPathAtomsSymbolsSame {
  my($This, $PathAtomsRef) = @_;

  # Nothing to compare for an empty path...
  return 1 if (!@{$PathAtomsRef});

  my($FirstAtom, @OtherAtoms) = @{$PathAtomsRef};
  my $FirstAtomType = $This->{AssignedAtomTypes}{$FirstAtom->GetID()};

  for my $Atom (@OtherAtoms) {
    return 0 if ($This->{AssignedAtomTypes}{$Atom->GetID()} ne $FirstAtomType);
  }
  return 1;
}

# Are bond symbols for all path bonds same?
#
# Bond symbols are looked up in $This->{BondSymbols}{ID1}{ID2} with the smaller
# atom ID used as the first key.
#
# Returns 1 when bond symbols are not in use, when the path has fewer than two
# atoms and hence no bonds, or when every bond along the path has the same symbol;
# otherwise 0.
#
sub _AreAllPathBondSymbolsSame {
  my($This, $PathAtomsRef) = @_;

  # During no usage of bond symbols, just ignore them and assume they are same...
  return 1 if (!$This->{UseBondSymbols});

  # A path without at least two atoms has no bond to compare...
  return 1 if (@{$PathAtomsRef} < 2);

  my $FirstBondSymbol;

  for my $Index (0 .. ($#{$PathAtomsRef} - 1)) {
    my $AtomID = $PathAtomsRef->[$Index]->GetID();
    my $BondedAtomID = $PathAtomsRef->[$Index + 1]->GetID();

    my($BondAtomID1, $BondAtomID2) = ($AtomID < $BondedAtomID) ? ($AtomID, $BondedAtomID) : ($BondedAtomID, $AtomID);
    my $BondSymbol = $This->{BondSymbols}{$BondAtomID1}{$BondAtomID2};

    if ($Index == 0) {
      # First bond sets the reference symbol...
      $FirstBondSymbol = $BondSymbol;
      next;
    }
    return 0 if ($BondSymbol ne $FirstBondSymbol);
  }
  return 1;
}

# Generate atom path symbols...
#
# Returns a reference to an array holding the assigned atom type of each path atom
# in path order. When $This->{UseBondSymbols} is set, the symbol of the bond between
# two consecutive atoms is placed between their atom types. An empty path yields
# an empty array.
#
sub _GenerateAtomPathSymbols {
  my($This, $PathAtomsRef) = @_;
  my(@AtomPathSymbols);

  @AtomPathSymbols = ();

  for my $Index (0 .. $#{$PathAtomsRef}) {
    my $AtomID = $PathAtomsRef->[$Index]->GetID();

    if ($This->{UseBondSymbols} && $Index > 0) {
      # Bond to the previous atom comes before the atom type of this atom...
      my $PreviousAtomID = $PathAtomsRef->[$Index - 1]->GetID();
      my($BondAtomID1, $BondAtomID2) = ($PreviousAtomID < $AtomID) ? ($PreviousAtomID, $AtomID) : ($AtomID, $PreviousAtomID);

      push @AtomPathSymbols, $This->{BondSymbols}{$BondAtomID1}{$BondAtomID2};
    }
    push @AtomPathSymbols, $This->{AssignedAtomTypes}{$AtomID};
  }
  return \@AtomPathSymbols;
}

# Set final fingerprints...
#
# Marks fingerprints as generated and fills in the bit vector or the count vector
# according to $This->{Type}. Any other type leaves both vectors untouched.
#
# Returns $This.
#
sub _SetFinalFingerprints {
  my($This) = @_;

  # Mark successful generation of fingerprints...
  $This->{FingerprintsGenerated} = 1;

  if ($This->{Type} =~ /^PathLengthBits$/i) {
    $This->_SetFinalFingerprintsBitVector();
  }
  elsif ($This->{Type} =~ /^PathLengthCount$/i) {
    $This->_SetFinalFingerprintsVector();
  }

  return $This;
}

# Set final fingerprints bit vector...
#
# For every atom path string recorded in $This->{AtomPathsStrings}, the string is
# turned into a hash code by TextUtil::HashCode and the hash code seeds the selected
# random number generator: Perl core srand/rand when $This->{UsePerlCoreRandom} is
# set, MathUtil::srandom/random otherwise. $This->{NumOfBitsToSetPerPath} positions
# are then drawn using $This->{Size} as the upper limit and set in the bit vector.
#
# Returns $This.
#
sub _SetFinalFingerprintsBitVector {
  my($This) = @_;

  my $BitVector = $This->{FingerprintsBitVector};
  my $VectorSize = $This->{Size};
  my $BitsPerPath = $This->{NumOfBitsToSetPerPath};

  # Bit positions are passed to SetBit along with the skip position check flag...
  my $SkipBitPosCheck = 1;

  for my $PathLength (keys %{$This->{AtomPathsStrings}}) {
    for my $AtomPathString (keys %{$This->{AtomPathsStrings}{$PathLength}}) {
      my $Seed = TextUtil::HashCode($AtomPathString);

      # Set random number seed...
      if ($This->{UsePerlCoreRandom}) {
        CORE::srand($Seed);
      }
      else {
        MathUtil::srandom($Seed);
      }

      for my $SetBitNum (1 .. $BitsPerPath) {
        my $BitPos;
        if ($This->{UsePerlCoreRandom}) {
          $BitPos = int(CORE::rand($VectorSize));
        }
        else {
          $BitPos = int(MathUtil::random($VectorSize));
        }
        $BitVector->SetBit($BitPos, $SkipBitPosCheck);
      }
    }
  }
  return $This;
}

# Set final fingerprints vector...
#
# Path strings are ordered by numerically ascending path length and, within a path
# length, by default string sort order. The ordered path strings are added to
# $This->{FingerprintsVector} as value IDs and their counts as values.
#
# Returns $This.
#
sub _SetFinalFingerprintsVector {
  my($This) = @_;
  my(@ValueIDs, @Values);

  for my $PathLength (sort { $a <=> $b } keys %{$This->{AtomPathsStrings}}) {
    my $PathCountsRef = $This->{AtomPathsStrings}{$PathLength};
    my @SortedAtomPathStrings = sort keys %{$PathCountsRef};

    push @ValueIDs, @SortedAtomPathStrings;
    push @Values, @{$PathCountsRef}{@SortedAtomPathStrings};
  }

  # Add PathLengthIDs and values to fingerprint vector...
  $This->{FingerprintsVector}->AddValueIDs(\@ValueIDs);
  $This->{FingerprintsVector}->AddValues(\@Values);

  return $This;
}

# Cache appropriate molecule data...
#
# Stores the atoms returned by GetAtoms() of the current molecule in
# $This->{Atoms}.
#
# Returns $This.
#
sub _SetupMoleculeDataCache {
  my($This) = @_;

  # Get all atoms...
  my @MoleculeAtoms = $This->GetMolecule()->GetAtoms();
  @{$This->{Atoms}} = @MoleculeAtoms;

  return $This;
}

# Clear cached molecule data...
#
# Empties $This->{Atoms} and resets $This->{AtomPathsRef} to an empty string.
#
# Returns $This.
#
sub _ClearMoleculeDataCache {
  my($This) = @_;

  # Clear path atoms...
  $This->{AtomPathsRef} = '';

  # Clear atoms...
  @{$This->{Atoms}} = ();

  return $This;
}

# Set atomic invariants to use for atom identifiers...
#
# Parameters:
#   @Values - atomic invariants specified either as a list or as a reference to
#             an array passed as the first value
#
# Warns and returns nothing when no values are specified. Croaks on the first
# atomic invariant rejected by
# AtomTypes::AtomicInvariantsAtomTypes::IsAtomicInvariantAvailable. Otherwise
# replaces the contents of $This->{AtomicInvariantsToUse} and returns $This.
#
sub SetAtomicInvariantsToUse {
  my($This, @Values) = @_;

  if (!@Values) {
    carp "Warning: ${ClassName}->SetAtomicInvariantsToUse: No values specified...";
    return;
  }

  # Accept a reference to an array as well as a plain list...
  my @SpecifiedAtomicInvariants = (ref($Values[0]) =~ /^ARRAY/) ? @{$Values[0]} : @Values;

  # Make sure specified AtomicInvariants are valid...
  my @AtomicInvariantsToUse = ();
  for my $SpecifiedAtomicInvariant (@SpecifiedAtomicInvariants) {
    if (!AtomTypes::AtomicInvariantsAtomTypes::IsAtomicInvariantAvailable($SpecifiedAtomicInvariant)) {
      croak "Error: ${ClassName}->SetAtomicInvariantsToUse: Specified atomic invariant, $SpecifiedAtomicInvariant, is not supported...\n ";
    }
    push @AtomicInvariantsToUse, $SpecifiedAtomicInvariant;
  }

  # Set atomic invariants to use...
  @{$This->{AtomicInvariantsToUse}} = @AtomicInvariantsToUse;

  return $This;
}

# Set functional classes to use for atom identifiers...
#
# Parameters:
#   @Values - functional classes specified either as a list or as a reference to
#             an array passed as the first value
#
# Warns and returns nothing when no values are specified or when
# $This->{AtomIdentifierType} is not FunctionalClassAtomTypes. Croaks on the first
# functional class rejected by
# AtomTypes::FunctionalClassAtomTypes::IsFunctionalClassAvailable. Otherwise
# replaces the contents of $This->{FunctionalClassesToUse} and returns $This.
#
sub SetFunctionalClassesToUse {
  my($This, @Values) = @_;

  if (!@Values) {
    carp "Warning: ${ClassName}->SetFunctionalClassesToUse: No values specified...";
    return;
  }

  if ($This->{AtomIdentifierType} !~ /^FunctionalClassAtomTypes$/i) {
    carp "Warning: ${ClassName}->SetFunctionalClassesToUse: FunctionalClassesToUse can't be set for InitialAtomIdentifierType of $This->{AtomIdentifierType}...";
    return;
  }

  # Accept a reference to an array as well as a plain list...
  my @SpecifiedFunctionalClasses = (ref($Values[0]) =~ /^ARRAY/) ? @{$Values[0]} : @Values;

  # Make sure specified FunctionalClasses are valid...
  my @FunctionalClassesToUse = ();
  for my $SpecifiedFunctionalClass (@SpecifiedFunctionalClasses) {
    if (!AtomTypes::FunctionalClassAtomTypes::IsFunctionalClassAvailable($SpecifiedFunctionalClass)) {
      croak "Error: ${ClassName}->SetFunctionalClassesToUse: Specified functional class, $SpecifiedFunctionalClass, is not supported...\n ";
    }
    push @FunctionalClassesToUse, $SpecifiedFunctionalClass;
  }

  # Set functional classes to use...
  @{$This->{FunctionalClassesToUse}} = @FunctionalClassesToUse;

  return $This;
}

# Initialize atom identifier type information...
#
# Current supported values:
#
#   AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, FunctionalClassAtomTypes,
#   MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes
#
# AtomicInvariantsAtomTypes and FunctionalClassAtomTypes get their default
# settings initialized here; the other supported types need no initialization.
# Any other value of $This->{AtomIdentifierType} causes a croak.
#
# Returns $This.
#
sub _InitializeAtomIdentifierTypeInformation {
  my($This) = @_;

  my $AtomIdentifierType = $This->{AtomIdentifierType};

  if ($AtomIdentifierType =~ /^AtomicInvariantsAtomTypes$/i) {
    $This->_InitializeAtomicInvariantsAtomTypesInformation();
    return $This;
  }

  if ($AtomIdentifierType =~ /^FunctionalClassAtomTypes$/i) {
    $This->_InitializeFunctionalClassAtomTypesInformation();
    return $This;
  }

  if ($AtomIdentifierType =~ /^(DREIDINGAtomTypes|EStateAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i) {
    # Nothing to do for now...
    return $This;
  }

  croak "Error: ${ClassName}->_InitializeAtomIdentifierTypeInformation: Unknown atom indentifier type $This->{AtomIdentifierType}...";
}

# Initialize atomic invariants atom types to use for generating atom identifiers...
#
# Let:
#   AS = Atom symbol corresponding to element symbol
#
#   X<n>   = Number of non-hydrogen atom neighbors or heavy atoms attached to atom
#   BO<n> = Sum of bond orders to non-hydrogen atom neighbors or heavy atoms attached to atom
#   LBO<n> = Largest bond order of non-hydrogen atom neighbors or heavy atoms attached to atom
#   SB<n> = Number of single bonds to non-hydrogen atom neighbors or heavy atoms attached to atom
#   DB<n> = Number of double bonds to non-hydrogen atom neighbors or heavy atoms attached to atom
#   TB<n> = Number of triple bonds to non-hydrogen atom neighbors or heavy atoms attached to atom
#   H<n>   = Number of implicit and explicit hydrogens for atom
#   Ar     = Aromatic annotation indicating whether atom is aromatic
#   RA     = Ring atom annotation indicating whether atom is in a ring
#   FC<+n/-n> = Formal charge assigned to atom
#   MN<n> = Mass number indicating isotope other than most abundant isotope
#   SM<n> = Spin multiplicity of atom. Possible values: 1 (singlet), 2 (doublet) or 3 (triplet)
#
# Then:
#
#   Atom type generated by AtomTypes::AtomicInvariantsAtomTypes class corresponds to:
#
#     AS.X<n>.BO<n>.LBO<n>.SB<n>.DB<n>.TB<n>.H<n>.Ar.RA.FC<+n/-n>.MN<n>.SM<n>
#
# Except for AS which is a required atomic invariant in atom types, all other atomic invariants are
# optional. Default atomic invariants used for AtomID are: AS, X<n>, BO<n>, H<n>, FC<+n/-n>.
# AtomID specification doesn't include atomic invariants with zero or undefined values.
#
# Resets $This->{AtomicInvariantsToUse} to the default list and returns $This.
#
sub _InitializeAtomicInvariantsAtomTypesInformation {
  my($This) = @_;

  # Default atomic invariants to use for generating atom neighborhood atom IDs: AS, X, BO, H, FC
  #
  @{$This->{AtomicInvariantsToUse}} = ();
  @{$This->{AtomicInvariantsToUse}} = ('AS', 'X', 'BO', 'H', 'FC');

  return $This;
}

# Initialize functional class atom types, generated by AtomTypes::FunctionalClassAtomTypes
# class, to use for generating atom identifiers...
#
# Let:
#   HBD: HydrogenBondDonor
#   HBA: HydrogenBondAcceptor
#   PI :  PositivelyIonizable
#   NI : NegativelyIonizable
#   Ar : Aromatic
#   Hal : Halogen
#   H : Hydrophobic
#   RA : RingAtom
#   CA : ChainAtom
#
# Then:
#
#   Functional class atom type specification for an atom corresponds to:
#
#     Ar.CA.H.HBA.HBD.Hal.NI.PI.RA
#
#   Default functional classes used are: HBD, HBA, PI, NI, Ar, Hal
#
#   FunctionalAtomTypes are assigned using the following definitions [ Ref 60-61, Ref 65-66 ]:
#
#     HydrogenBondDonor: NH, NH2, OH
#     HydrogenBondAcceptor: N[!H], O
#     PositivelyIonizable: +, NH2
#     NegativelyIonizable: -, C(=O)OH, S(=O)OH, P(=O)OH
#
# Resets $This->{FunctionalClassesToUse} to the default list and returns $This.
#
sub _InitializeFunctionalClassAtomTypesInformation {
  my($This) = @_;

  # Default functional class atom types to use for generating atom identifiers
  # are: HBD, HBA, PI, NI, Ar, Hal
  #
  @{$This->{FunctionalClassesToUse}} = ();
  @{$This->{FunctionalClassesToUse}} = ('HBD', 'HBA', 'PI', 'NI', 'Ar', 'Hal');

  return $This;
}

# Return a string containing data for PathLengthFingerprints object...
#
# The string lists, separated by "; ", the fingerprint type, atom identifier type,
# path length limits and path generation flags. Atomic invariants or functional
# classes information is appended for the corresponding atom identifier types,
# followed by the bit vector data for PathLengthBits or the vector data for
# PathLengthCount fingerprints.
#
# Each Yes/No flag is derived from its own object key; in particular UseUniquePaths
# reports $This->{UseUniquePaths} and not the UseBondSymbols setting.
#
sub StringifyPathLengthFingerprints {
  my($This) = @_;
  my($PathLengthsFingerprintsString);

  # Type of fingerprint...
  $PathLengthsFingerprintsString = "Fingerprint type: $This->{Type}; AtomIdentifierType: $This->{AtomIdentifierType}";

  # Path length...
  $PathLengthsFingerprintsString .= "; MinPathLength: $This->{MinLength}; MaxPathLength: $This->{MaxLength}";

  # Fingerprint generation control...
  my($AllowSharedBonds, $AllowRings, $UseBondSymbols, $UseUniquePaths);

  $AllowSharedBonds = $This->{AllowSharedBonds} ? "Yes" : "No";
  $AllowRings = $This->{AllowRings} ? "Yes" : "No";
  $UseBondSymbols = $This->{UseBondSymbols} ? "Yes" : "No";
  $UseUniquePaths = $This->{UseUniquePaths} ? "Yes" : "No";

  $PathLengthsFingerprintsString .= "; UseUniquePaths: $UseUniquePaths; AllowSharedBonds: $AllowSharedBonds; AllowRings: $AllowRings; UseBondSymbols: $UseBondSymbols";

  if ($This->{AtomIdentifierType} =~ /^AtomicInvariantsAtomTypes$/i) {
    my($AtomicInvariant, @AtomicInvariants, @AtomicInvariantsOrder, %AvailableAtomicInvariants);

    @AtomicInvariantsOrder = AtomTypes::AtomicInvariantsAtomTypes::GetAtomicInvariantsOrder();
    %AvailableAtomicInvariants = AtomTypes::AtomicInvariantsAtomTypes::GetAvailableAtomicInvariants();

    # Pair each atomic invariant with its entry in the available atomic invariants...
    for $AtomicInvariant (@AtomicInvariantsOrder) {
      push @AtomicInvariants, "$AtomicInvariant: $AvailableAtomicInvariants{$AtomicInvariant}";
    }

    $PathLengthsFingerprintsString .= "; AtomicInvariantsToUse: <" . TextUtil::JoinWords(\@{$This->{AtomicInvariantsToUse}}, ", ", 0) . ">";
    $PathLengthsFingerprintsString .= "; AtomicInvariantsOrder: <" . TextUtil::JoinWords(\@AtomicInvariantsOrder, ", ", 0) . ">";
    $PathLengthsFingerprintsString .= "; AvailableAtomicInvariants: <" . TextUtil::JoinWords(\@AtomicInvariants, ", ", 0) . ">";
  }
  elsif ($This->{AtomIdentifierType} =~ /^FunctionalClassAtomTypes$/i) {
    my($FunctionalClass, @FunctionalClasses, @FunctionalClassesOrder, %AvailableFunctionalClasses);

    @FunctionalClassesOrder = AtomTypes::FunctionalClassAtomTypes::GetFunctionalClassesOrder();
    %AvailableFunctionalClasses = AtomTypes::FunctionalClassAtomTypes::GetAvailableFunctionalClasses();

    # Pair each functional class with its entry in the available functional classes...
    for $FunctionalClass (@FunctionalClassesOrder) {
      push @FunctionalClasses, "$FunctionalClass: $AvailableFunctionalClasses{$FunctionalClass}";
    }

    $PathLengthsFingerprintsString .= "; FunctionalClassesToUse: <" . TextUtil::JoinWords(\@{$This->{FunctionalClassesToUse}}, ", ", 0) . ">";
    $PathLengthsFingerprintsString .= "; FunctionalClassesOrder: <" . TextUtil::JoinWords(\@FunctionalClassesOrder, ", ", 0) . ">";
    $PathLengthsFingerprintsString .= "; AvailableFunctionalClasses: <" . TextUtil::JoinWords(\@FunctionalClasses, ", ", 0) . ">";
  }

  if ($This->{Type} =~ /^PathLengthBits$/i) {
    # Size...
    $PathLengthsFingerprintsString .= "; Size: $This->{Size}; MinSize: $This->{MinSize}; MaxSize: $This->{MaxSize}";

    # NumOfBitsToSetPerPath...
    $PathLengthsFingerprintsString .= "; NumOfBitsToSetPerPath: $This->{NumOfBitsToSetPerPath}";

    # Fingerprint bit density and num of bits set...
    my($NumOfSetBits, $BitDensity);
    $NumOfSetBits = $This->{FingerprintsBitVector}->GetNumOfSetBits();
    $BitDensity = $This->{FingerprintsBitVector}->GetFingerprintsBitDensity();
    $PathLengthsFingerprintsString .= "; NumOfOnBits: $NumOfSetBits; BitDensity: $BitDensity";

    $PathLengthsFingerprintsString .= "; FingerprintsBitVector: < $This->{FingerprintsBitVector} >";
  }
  elsif ($This->{Type} =~ /^PathLengthCount$/i) {
    $PathLengthsFingerprintsString .= "; FingerprintsVector: < $This->{FingerprintsVector} >";
  }

  return $PathLengthsFingerprintsString;
}