1 package Fingerprints::FingerprintsVector; 2 # 3 # $RCSfile: FingerprintsVector.pm,v $ 4 # $Date: 2015/02/28 20:48:54 $ 5 # $Revision: 1.31 $ 6 # 7 # Author: Manish Sud <msud@san.rr.com> 8 # 9 # Copyright (C) 2015 Manish Sud. All rights reserved. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 use strict; 30 use Carp; 31 use Exporter; 32 use Scalar::Util (); 33 use MathUtil (); 34 use TextUtil (); 35 use StatisticsUtil (); 36 use BitVector; 37 use Vector; 38 39 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); 40 41 @ISA = qw(Exporter); 42 43 # Distance coefficients 44 my(@DistanceCoefficients) = qw(CityBlockDistanceCoefficient EuclideanDistanceCoefficient HammingDistanceCoefficient ManhattanDistanceCoefficient SoergelDistanceCoefficient); 45 46 # Similarity coefficients... 47 my(@SimilarityCoefficients) = qw(CosineSimilarityCoefficient CzekanowskiSimilarityCoefficient DiceSimilarityCoefficient OchiaiSimilarityCoefficient JaccardSimilarityCoefficient SorensonSimilarityCoefficient TanimotoSimilarityCoefficient); 48 49 # New from string... 
my(@NewFromString) = qw(
  NewFromValuesString
  NewFromValuesAndIDsString
  NewFromIDsAndValuesString
  NewFromValuesAndIDsPairsString
  NewFromIDsAndValuesPairsString
);

# Names exported by default and on request...
@EXPORT = qw(IsFingerprintsVector);
@EXPORT_OK = qw(
  GetSupportedDistanceCoefficients
  GetSupportedSimilarityCoefficients
  GetSupportedDistanceAndSimilarityCoefficients
  @DistanceCoefficients
  @SimilarityCoefficients
);

# Export tags grouping the constructor and coefficient function names...
%EXPORT_TAGS = (
  new                    => [@NewFromString],
  distancecoefficients   => [@DistanceCoefficients],
  similaritycoefficients => [@SimilarityCoefficients],
  all                    => [@EXPORT, @EXPORT_OK]
);

# Setup class variables...
my($ClassName);
_InitializeClass();

# Overload Perl functions: stringification is delegated to StringifyFingerprintsVector...
use overload '""' => 'StringifyFingerprintsVector';

# Class constructor.
#
# Takes a list of property name and value pairs; each pair is handed to the
# corresponding Set<Name> method. Returns the newly blessed object. It may be
# invoked on a class name or on an existing object.
#
sub new {
  my($Class, %NamesAndValues) = @_;

  # Create the object in the class of the invocant...
  my $This = bless {}, ref($Class) || $Class;

  # Start with empty data and then apply the specified properties...
  $This->_InitializeFingerprintsVector();
  $This->_InitializeFingerprintsVectorProperties(%NamesAndValues);

  return $This;
}

# Initialize object data: no type, no values and no value IDs...
#
sub _InitializeFingerprintsVector {
  my($This) = @_;

  $This->{Type} = '';

  # Empty the value and value ID lists...
  @{$This->{Values}} = ();
  @{$This->{ValueIDs}} = ();

  return $This;
}

# Initialize class: remember the package name for use in messages...
sub _InitializeClass {
  $ClassName = __PACKAGE__;
}

# Initialize object properties....
sub _InitializeFingerprintsVectorProperties {
  my($This, %NamesAndValues) = @_;

  # Hand each specified value to its Set<Name> method...
  for my $Name (keys %NamesAndValues) {
    my $SetterName = "Set${Name}";
    $This->$SetterName($NamesAndValues{$Name});
  }

  # A fingerprints vector without a type is not allowed...
  unless (exists $NamesAndValues{Type}) {
    croak "Error: ${ClassName}->New: Object can't be instantiated without specifying type...";
  }
  return $This;
}

# Create a new fingerprints vector from a type and a space delimited values string.
# An undefined or empty string, or the string None, yields a vector without values.
#
# This functionality can be either invoked as a class function or an object method:
# with exactly three arguments the first one is taken to be the invocant and ignored.
#
sub NewFromValuesString ($$;$) {
  my($Type, $ValuesString) = (@_ == 3) ? @_[1, 2] : @_[0, 1];

  my $ValuesSpecified = defined($ValuesString) && length($ValuesString) && $ValuesString !~ /^None$/i;
  my @Values = $ValuesSpecified ? split(' ', $ValuesString) : ();

  return Fingerprints::FingerprintsVector->new('Type' => $Type, 'Values' => \@Values);
}

# Create a new fingerprints vector using values and IDs string containing semicolon
# delimited value string and value IDs strings. The values within value and value IDs
# string are delimited by spaces.
#
# This functionality can be either invoked as a class function or an object method.
#
# Parameters: type of the vector and a string of the form "<values>;<value IDs>".
# Either part may be the string None to indicate that it is not available.
#
# Returns the new fingerprints vector. When value IDs are specified and their
# number differs from the number of values, a warning is issued and undef is
# returned.
#
# A string without value IDs, such as "1 2 3;None", is accepted and yields a
# vector containing only values: this is the string GetValuesAndIDsString
# generates for a vector without value IDs, so it must be possible to read it back.
#
sub NewFromValuesAndIDsString ($$;$) {
  my($FirstParameter, $SecondParameter, $ThirdParamater) = @_;
  my($Type, $ValuesAndIDsString);

  # With three arguments, the first one is the invocant and is ignored...
  if (@_ == 3) {
    ($Type, $ValuesAndIDsString) = ($SecondParameter, $ThirdParamater);
  }
  else {
    ($Type, $ValuesAndIDsString) = ($FirstParameter, $SecondParameter);
  }
  my($ValuesString, $ValueIDsString, @Values, @ValueIDs);

  ($ValuesString, $ValueIDsString) = split(';', $ValuesAndIDsString);

  @Values = ();
  if (defined($ValuesString) && length($ValuesString) && $ValuesString !~ /^None$/i) {
    @Values = split(' ', $ValuesString);
  }
  @ValueIDs = ();
  if (defined($ValueIDsString) && length($ValueIDsString) && $ValueIDsString !~ /^None$/i) {
    @ValueIDs = split(' ', $ValueIDsString);
  }

  # Value IDs are optional; when present, there must be exactly one per value...
  if (@ValueIDs && @Values != @ValueIDs) {
    carp "Warning: ${ClassName}->NewFromValuesAndIDsString: Object can't be instantiated: Number specified values, " . scalar @Values . ", must be equal to number of specified value IDs, " . scalar @ValueIDs . "...";
    return undef;
  }

  # Explicit class method call instead of the ambiguous indirect object syntax...
  return Fingerprints::FingerprintsVector->new('Type' => $Type, 'Values' => \@Values, 'ValueIDs' => \@ValueIDs);
}

# Create a new fingerprints vector using IDs and values string containing semicolon
# delimited value IDs string and values strings. The values within value and value IDs
# string are delimited by spaces.
#
# This functionality can be either invoked as a class function or an object method.
#
# Parameters: type of the vector and a string of the form "<value IDs>;<values>".
# Either part may be the string None to indicate that it is not available.
#
# Returns the new fingerprints vector. When value IDs are specified and their
# number differs from the number of values, a warning is issued and undef is
# returned.
#
# A string without value IDs, such as "None;1 2 3", is accepted and yields a
# vector containing only values: this is the string GetIDsAndValuesString
# generates for a vector without value IDs, so it must be possible to read it back.
#
sub NewFromIDsAndValuesString ($$;$) {
  my($FirstParameter, $SecondParameter, $ThirdParamater) = @_;
  my($Type, $IDsAndValuesString);

  # With three arguments, the first one is the invocant and is ignored...
  if (@_ == 3) {
    ($Type, $IDsAndValuesString) = ($SecondParameter, $ThirdParamater);
  }
  else {
    ($Type, $IDsAndValuesString) = ($FirstParameter, $SecondParameter);
  }
  my($ValuesString, $ValueIDsString, @Values, @ValueIDs);

  ($ValueIDsString, $ValuesString) = split(';', $IDsAndValuesString);

  @Values = ();
  if (defined($ValuesString) && length($ValuesString) && $ValuesString !~ /^None$/i) {
    @Values = split(' ', $ValuesString);
  }
  @ValueIDs = ();
  if (defined($ValueIDsString) && length($ValueIDsString) && $ValueIDsString !~ /^None$/i) {
    @ValueIDs = split(' ', $ValueIDsString);
  }

  # Value IDs are optional; when present, there must be exactly one per value...
  if (@ValueIDs && @Values != @ValueIDs) {
    carp "Warning: ${ClassName}->NewFromIDsAndValuesString: Object can't be instantiated: Number specified values, " . scalar @Values . ", must be equal to number of specified value IDs, " . scalar @ValueIDs . "...";
    return undef;
  }

  # Explicit class method call instead of the ambiguous indirect object syntax...
  return Fingerprints::FingerprintsVector->new('Type' => $Type, 'Values' => \@Values, 'ValueIDs' => \@ValueIDs);
}

# Create a new fingerprints vector using values and IDs pairs string containing space
# value and value IDs pairs.
#
# This functionality can be either invoked as a class function or an object method.
#
# Parameters: type of the vector and a space delimited string of alternating
# values and value IDs: "<value> <ID> <value> <ID> ...". The single pair
# "None None" stands for a vector without any values.
#
# Returns the new fingerprints vector; for a string containing an odd number of
# items, a warning is issued and undef is returned.
#
sub NewFromValuesAndIDsPairsString ($$;$) {
  # With three arguments, the first one is the invocant and is ignored...
  my($Type, $PairsString) = (@_ == 3) ? @_[1, 2] : @_[0, 1];

  my @Items = split(' ', $PairsString);
  if (@Items % 2) {
    carp "Warning: ${ClassName}->NewFromValuesAndIDsPairsString: No fingerprint vector created: Invalid values and IDs pairs data: Input list must contain even number of values and IDs pairs...";
    return undef;
  }

  my(@Values, @ValueIDs);
  my $NoData = (@Items == 2 && $Items[0] =~ /^None$/i && $Items[1] =~ /^None$/i);
  unless ($NoData) {
    # Peel off one value and value ID pair at a time...
    while (@Items) {
      my($Value, $ValueID) = splice(@Items, 0, 2);
      push @Values, $Value;
      push @ValueIDs, $ValueID;
    }
  }

  return Fingerprints::FingerprintsVector->new('Type' => $Type, 'Values' => \@Values, 'ValueIDs' => \@ValueIDs);
}

# Create a new fingerprints vector using IDs and values pairs string containing space
# value IDs and valus pairs.
#
# This functionality can be either invoked as a class function or an object method.
#
# Parameters: type of the vector and a space delimited string of alternating
# value IDs and values: "<ID> <value> <ID> <value> ...". The single pair
# "None None" stands for a vector without any values.
#
# Returns the new fingerprints vector. Dies, via croak, for a string containing
# an odd number of items. Note that NewFromValuesAndIDsPairsString only warns and
# returns undef in the same situation; the stricter behavior is retained here as
# callers may rely on it.
#
sub NewFromIDsAndValuesPairsString ($$;$) {
  my($FirstParameter, $SecondParameter, $ThirdParamater) = @_;
  my($Type, $IDsAndValuesPairsString);

  # With three arguments, the first one is the invocant and is ignored...
  if (@_ == 3) {
    ($Type, $IDsAndValuesPairsString) = ($SecondParameter, $ThirdParamater);
  }
  else {
    ($Type, $IDsAndValuesPairsString) = ($FirstParameter, $SecondParameter);
  }
  my(@Values, @ValueIDs, @IDsAndValuesPairs);

  @IDsAndValuesPairs = split(' ', $IDsAndValuesPairsString);
  if (@IDsAndValuesPairs % 2) {
    # croak never returns: no return statement is needed after it...
    croak "Error: ${ClassName}->NewFromIDsAndValuesPairsString: No fingerprint vector created: Invalid values and IDs pairs data: Input list must contain even number of values and IDs pairs...";
  }

  @Values = (); @ValueIDs = ();
  if (!(@IDsAndValuesPairs == 2 && $IDsAndValuesPairs[0] =~ /^None$/i && $IDsAndValuesPairs[1] =~ /^None$/i)) {
    # Even positions hold value IDs; odd positions hold the corresponding values...
    for my $Index (grep { $_ % 2 == 0 } 0 .. $#IDsAndValuesPairs) {
      push @ValueIDs, $IDsAndValuesPairs[$Index];
      push @Values, $IDsAndValuesPairs[$Index + 1];
    }
  }

  # Explicit class method call instead of the ambiguous indirect object syntax...
  return Fingerprints::FingerprintsVector->new('Type' => $Type, 'Values' => \@Values, 'ValueIDs' => \@ValueIDs);
}

# Set type of fingerprint vector. Supported types are: OrderedNumericalValues, NumericalValues, and
# AlphaNumericalValues
#
#   . For OrderedNumericalValues type, both vectors must be of the same size and contain similar
#     types of numerical values in the same order.
#
#   . For NumericalValues type, vector value IDs for both vectors must be specified; however, their
#     size and order of IDs and numerical values may be different. For each vector, value IDs must
#     correspond to vector values.
#
#   . For AlphaNumericalValues type, vectors may contain both numerical and alphanumerical values
#     and their sizes may be different.
#
# The type name is matched case insensitively and may be set only once: any
# attempt to set it again, or to set an unsupported type, is a fatal error.
#
sub SetType {
  my($This, $Type) = @_;

  unless ($Type =~ /^(OrderedNumericalValues|NumericalValues|AlphaNumericalValues)$/i) {
    croak "Error: ${ClassName}->SetType: Specified value, $Type, for Type is not vaild. Supported types in current release of MayaChemTools: OrderedNumericalValues, NumericalValues or AlphaNumericalValues";
  }

  croak "Error: ${ClassName}->SetType: Can't change intial fingerprints vector type: It's already set..." if $This->{Type};

  $This->{Type} = $Type;

  return $This;
}

# Get fingerprints vector type...
#
sub GetType {
  my($This) = @_;

  return $This->{Type};
}

# Set ID and return the object...
sub SetID {
  my($This, $Value) = @_;

  $This->{ID} = $Value;

  return $This;
}

# Get ID, or the string None when no ID has been set...
sub GetID {
  my($This) = @_;

  if (exists $This->{ID}) {
    return $This->{ID};
  }
  return 'None';
}

# Set description and return the object...
sub SetDescription {
  my($This, $Value) = @_;

  $This->{Description} = $Value;

  return $This;
}

# Get description, or a default text when no description has been set...
sub GetDescription {
  my($This) = @_;

  if (exists $This->{Description}) {
    return $This->{Description};
  }
  return 'No description available';
}

# Set vector type and return the object...
sub SetVectorType {
  my($This, $Value) = @_;

  $This->{VectorType} = $Value;

  return $This;
}

# Get vector type, or FingerprintsVector when no vector type has been set...
sub GetVectorType {
  my($This) = @_;

  if (exists $This->{VectorType}) {
    return $This->{VectorType};
  }
  return 'FingerprintsVector';
}

# Set values of a fingerprint vector using a vector, reference to an array or an array.
# Any existing values are replaced. Returns the object.
#
sub SetValues {
  my($This, @NewValues) = @_;

  $This->_SetOrAddValuesOrValueIDs("SetValues", @NewValues);

  return $This;
}

# Set value IDs of a fingerprint vector using a vector, reference to an array or an array...
#
# Any existing value IDs are replaced. Returns the object.
#
sub SetValueIDs {
  my($This, @NewValueIDs) = @_;

  $This->_SetOrAddValuesOrValueIDs("SetValueIDs", @NewValueIDs);

  return $This;
}

# Add values to a fingerprint vector using a vector, reference to an array or an array.
# The new values are appended to the existing ones. Returns the object.
#
sub AddValues {
  my($This, @NewValues) = @_;

  $This->_SetOrAddValuesOrValueIDs("AddValues", @NewValues);

  return $This;
}

# Add value IDs to a fingerprint vector using a vector, reference to an array or an array.
# The new value IDs are appended to the existing ones. Returns the object.
#
sub AddValueIDs {
  my($This, @NewValueIDs) = @_;

  $This->_SetOrAddValuesOrValueIDs("AddValueIDs", @NewValueIDs);

  return $This;
}

# Set or add values or value IDs using:
#
#    o List of values or ValueIDs
#    o Reference to a list of values or ValueIDs
#    o A vector containing values or ValueIDs
#
# Mode is one of SetValues, SetValueIDs, AddValues or AddValueIDs, matched case
# insensitively. Nothing is done, and nothing is returned, for an empty list;
# otherwise the object is returned. Only the first specified item decides how
# the data is interpreted.
#
sub _SetOrAddValuesOrValueIDs {
  my($This, $Mode, @Values) = @_;

  return if !@Values;

  # Figure out the format of the specified data from its first item...
  my $First = $Values[0];
  my $FirstRefType = ref $First;
  if ($FirstRefType =~ /^(SCALAR|HASH|CODE|REF|GLOB)/) {
    croak "Error: ${ClassName}-> _SetOrAddValuesOrValueIDs: Trying to add values to vector object with a reference to unsupported value format...";
  }

  # Get a reference to the data: from a vector, an array reference or the list itself...
  my $DataRef = Vector::IsVector($First) ? $First->GetValues()
              : ($FirstRefType =~ /^ARRAY/) ? $First
              : \@Values;

  # Set modes empty the target list before appending; add modes just append...
  if ($Mode =~ /^SetValues$/i) {
    @{$This->{Values}} = ();
    push @{$This->{Values}}, @{$DataRef};
  }
  elsif ($Mode =~ /^SetValueIDs$/i) {
    @{$This->{ValueIDs}} = ();
    push @{$This->{ValueIDs}}, @{$DataRef};
  }
  elsif ($Mode =~ /^AddValues$/i) {
    push @{$This->{Values}}, @{$DataRef};
  }
  elsif ($Mode =~ /^AddValueIDs$/i) {
    push @{$This->{ValueIDs}}, @{$DataRef};
  }
  else {
    croak "Error: ${ClassName}-> _SetOrAddValuesOrValueIDs: Unknown mode $Mode...";
  }
  return $This;
}

# Set a specific value in fingerprint vector with indices starting from 0.
#
# The index is validated against the current number of values unless a true
# SkipCheck is specified. Returns the object.
#
sub SetValue {
  my($This, $Index, $Value, $SkipCheck) = @_;

  if (!$SkipCheck) {
    croak "Error: ${ClassName}->SetValue: Index value must be a positive number..." if $Index < 0;
    croak "Error: ${ClassName}->SetValue: Index vaue must be less than number of values..." if $Index >= $This->GetNumOfValues();
  }

  return $This->_SetValue($Index, $Value);
}

# Set a fingerprint vector value without any index validation...
#
sub _SetValue {
  my($This, $Index, $Value) = @_;

  $This->{Values}[$Index] = $Value;

  return $This;
}

# Get a specific value from fingerprint vector with indices starting from 0.
# It's a fatal error to specify an index outside the range of available values.
#
sub GetValue {
  my($This, $Index) = @_;

  croak "Error: ${ClassName}->GetValue: Index value must be a positive number..." if $Index < 0;
  croak "Error: ${ClassName}->GetValue: Index value must be less than number of values..." if $Index >= $This->GetNumOfValues();

  return $This->_GetValue($Index);
}

# Get a fingerprint vector value without any index validation...
sub _GetValue {
  my($This, $Index) = @_;

  return $This->{Values}[$Index];
}

# Return vector values as an array or reference to an array...
#
# In list context, a copy of the values is returned; in scalar context, a
# reference to the array of values held by the object is returned.
#
sub GetValues {
  my($This) = @_;

  if (wantarray) {
    return @{$This->{Values}};
  }
  return \@{$This->{Values}};
}

# Set a specific value ID in fingerprint vector with indices starting from 0.
#
# The index is validated against the current number of value IDs unless a true
# SkipCheck is specified. Returns the object.
#
sub SetValueID {
  my($This, $Index, $Value, $SkipCheck) = @_;

  if (!$SkipCheck) {
    croak "Error: ${ClassName}->SetValueID: Index value must be a positive number..." if $Index < 0;
    croak "Error: ${ClassName}->SetValueID: Index vaue must be less than number of value IDs..." if $Index >= $This->GetNumOfValueIDs();
  }

  return $This->_SetValueID($Index, $Value);
}

# Set a fingerprint vector value ID without any index validation...
#
sub _SetValueID {
  my($This, $Index, $Value) = @_;

  $This->{ValueIDs}[$Index] = $Value;

  return $This;
}

# Get a specific value ID from fingerprint vector with indices starting from 0.
# It's a fatal error to specify an index outside the range of available value IDs.
#
sub GetValueID {
  my($This, $Index) = @_;

  croak "Error: ${ClassName}->GetValueID: Index value must be a positive number..." if $Index < 0;
  croak "Error: ${ClassName}->GetValueID: Index value must be less than number of value IDs..." if $Index >= $This->GetNumOfValueIDs();

  return $This->_GetValueID($Index);
}

# Get a fingerprint vector value ID without any index validation...
#
sub _GetValueID {
  my($This, $Index) = @_;

  return $This->{ValueIDs}[$Index];
}

# Return vector value IDs as an array or reference to an array: a copy of the
# value IDs in list context; a reference to the array held by the object in
# scalar context.
#
sub GetValueIDs {
  my($This) = @_;

  if (wantarray) {
    return @{$This->{ValueIDs}};
  }
  return \@{$This->{ValueIDs}};
}

# Get fingerprints vector string containing values and/or IDs string in a specifed format...
#
# Supported formats, matched case insensitively and with or without the String
# suffix: IDsAndValuesString, IDsAndValuesPairsString, ValuesAndIDsString,
# ValuesAndIDsPairsString, ValueIDsString and ValuesString.
#
# Returns the string generated by the corresponding Get<Format>String method.
# It's a fatal error to specify any other format.
#
sub GetFingerprintsVectorString {
  my($This, $Format) = @_;

  if ($Format =~ /^(IDsAndValuesString|IDsAndValues)$/i) {
    return $This->GetIDsAndValuesString();
  }
  if ($Format =~ /^(IDsAndValuesPairsString|IDsAndValuesPairs)$/i) {
    return $This->GetIDsAndValuesPairsString();
  }
  if ($Format =~ /^(ValuesAndIDsString|ValuesAndIDs)$/i) {
    return $This->GetValuesAndIDsString();
  }
  if ($Format =~ /^(ValuesAndIDsPairsString|ValuesAndIDsPairs)$/i) {
    return $This->GetValuesAndIDsPairsString();
  }
  if ($Format =~ /^(ValueIDsString|ValueIDs)$/i) {
    return $This->GetValueIDsString();
  }
  if ($Format =~ /^(ValuesString|Values)$/i) {
    return $This->GetValuesString();
  }

  # None of the supported formats matched; croak doesn't return...
  croak "Error: ${ClassName}->GetFingerprintsVectorString: Specified vector string format, $Format, is not supported. Valid values: IDsAndValuesString, IDsAndValues, IDsAndValuesPairsString, IDsAndValuesPairs, ValuesAndIDsString, ValuesAndIDs, ValuesAndIDsPairsString, ValuesAndIDsPairs, ValueIDsString, ValueIDs, ValuesString, Values...";
}

# Get vector value IDs and values string as space delimited ASCII string separated
# by semicolon. None is used in place of unavailable value IDs or values.
#
sub GetIDsAndValuesString {
  my($This) = @_;

  if (@{$This->{ValueIDs}} && @{$This->{Values}}) {
    # Both IDs and values are available...
    return join(' ', @{$This->{ValueIDs}}) . ";" . join(' ', @{$This->{Values}});
  }
  elsif (@{$This->{Values}}) {
    # Only values are available...
    return "None;" . join(' ', @{$This->{Values}});
  }
  else {
    # Values are not available...
    return "None;None";
  }
}

# Get vector value IDs and value pairs string as space delimited ASCII string.
# Each value is preceded by its value ID, or by None when the vector doesn't
# have any value IDs. A vector without values yields "None None".
#
sub GetIDsAndValuesPairsString {
  my($This) = @_;
  my($Index, $ValueIDsPresent, @IDsAndValuesPairs);

  if (!@{$This->{Values}}) {
    # Values are unavailable...
    return "None None";
  }

  $ValueIDsPresent = @{$This->{ValueIDs}} ? 1 : 0;

  @IDsAndValuesPairs = ();
  for $Index (0 .. $#{$This->{Values}}) {
    if ($ValueIDsPresent) {
      push @IDsAndValuesPairs, ($This->{ValueIDs}->[$Index], $This->{Values}->[$Index]);
    }
    else {
      push @IDsAndValuesPairs, ('None', $This->{Values}->[$Index]);
    }
  }
  return join(' ', @IDsAndValuesPairs);
}

# Get vector value and value IDs string as space delimited ASCII string separated
# by semicolon. None is used in place of unavailable values or value IDs.
#
sub GetValuesAndIDsString {
  my($This) = @_;

  if (@{$This->{ValueIDs}} && @{$This->{Values}}) {
    # Both IDs and values are available...
    return join(' ', @{$This->{Values}}) . ";" . join(' ', @{$This->{ValueIDs}});
  }
  elsif (@{$This->{Values}}) {
    # Only values are available...
    return join(' ', @{$This->{Values}}) . ";None";
  }
  else {
    # Values are not available...
    return "None;None";
  }
}

# Get vector value and value ID pairs string as space delimited ASCII string.
# Each value is followed by its value ID, or by None when the vector doesn't
# have any value IDs. A vector without values yields "None None".
#
sub GetValuesAndIDsPairsString {
  my($This) = @_;
  my($Index, $ValueIDsPresent, @ValuesAndIDsPairs);

  if (!@{$This->{Values}}) {
    # Values are unavailable...
    return "None None";
  }

  $ValueIDsPresent = @{$This->{ValueIDs}} ? 1 : 0;

  @ValuesAndIDsPairs = ();
  for $Index (0 .. $#{$This->{Values}}) {
    if ($ValueIDsPresent) {
      push @ValuesAndIDsPairs, ($This->{Values}->[$Index], $This->{ValueIDs}->[$Index]);
    }
    else {
      push @ValuesAndIDsPairs, ($This->{Values}->[$Index], 'None');
    }
  }
  return join(' ', @ValuesAndIDsPairs);
}

# Get vector value IDs string as space delimited ASCII string, or None when
# value IDs are not available...
#
sub GetValueIDsString {
  my($This) = @_;

  return @{$This->{ValueIDs}} ? join(' ', @{$This->{ValueIDs}}) : 'None';
}

# Get vector value string as space delimited ASCII string...
#
# The string None is returned for a vector without any values.
#
sub GetValuesString {
  my($This) = @_;

  if (@{$This->{Values}}) {
    return join(' ', @{$This->{Values}});
  }
  return 'None';
}

# Get number of values...
sub GetNumOfValues {
  my($This) = @_;

  my $NumOfValues = @{$This->{Values}};

  return $NumOfValues;
}

# Get number of non-zero values: values are compared numerically against zero...
sub GetNumOfNonZeroValues {
  my($This) = @_;

  my $LastIndex = $This->GetNumOfValues() - 1;
  my $NonZeroCount = grep { $This->{Values}[$_] != 0 } 0 .. $LastIndex;

  return $NonZeroCount;
}

# Get number of value IDs...
sub GetNumOfValueIDs {
  my($This) = @_;

  my $NumOfValueIDs = @{$This->{ValueIDs}};

  return $NumOfValueIDs;
}

# FingerprintsVector class provides methods to calculate similarity between vectors
# containing three different types of values:
#
# Type I: OrderedNumericalValues
#
#   . Size of two vectors are same
#   . Vectors contain real values in a specific order. For example: MACCS keys count, Topological
#     pharmacophore atom pairs and so on.
#   . Option to calculate similarity value using continuous values or binary values
#
# Type II: UnorderedNumericalValues
#
#   . Size of two vectors might not be same
#   . Vectors contain unordered real value identified by value IDs. For example: Topological atom pairs,
#     Topological atom torsions and so on
#   . Option to calculate similarity value using continuous values or binary values
#
# Type III: AlphaNumericalValues
#
#   . Size of two vectors might not be same
#   . Vectors contain unordered alphanumerical values. For example: Extended connectivity fingerprints,
#     atom neighborhood fingerprints.
#   . The vector values are treated as keys or bit indices and similarity value is calculated accordingly.
#
# Before performing similarity or distance calculations between vectors containing UnorderedNumericalValues
# or AlphaNumericalValues, the vectors are transformed into vectors containing unique OrderedNumericalValues
# using value IDs for UnorderedNumericalValues and the values themselves for AlphaNumericalValues.
#
# Three forms of similarity or distance calculation between two vectors are supported: AlgebraicForm,
# BinaryForm or SetTheoreticForm.
#
# The value of an extra parameter, CalculationMode, passed to each similarity or distance function
# controls the calculation. Supported values for CalculationMode: AlgebraicForm, BinaryForm and
# SetTheoreticForm. Default: AlgebraicForm.
#
# For BinaryForm CalculationMode, the ordered list of processed final vector values containing the value or
# count of each unique value type is simply converted into a binary vector containing 1s and 0s
# corresponding to presence or absence of values before calculating similarity or distance between
# two vectors.
778 # 779 # For two fingerprint vectors A and B of same size containing OrderedNumericalValues, let: 780 # 781 # N = Number values in A or B 782 # 783 # Xa = Values of vector A 784 # Xb = Values of vector B 785 # 786 # Xai = Value of ith element in A 787 # Xbi = Value of ith element in B 788 # 789 # SUM = Sum of i over N values 790 # 791 # For SetTheoreticForm of calculation between two vectors, let: 792 # 793 # SetIntersectionXaXb = SUM ( MIN ( Xai, Xbi ) ) 794 # SetDifferenceXaXb = SUM ( Xai ) + SUM ( Xbi ) - SUM ( MIN ( Xai, Xbi ) ) 795 # 796 # For BinaryForm of calculation between two vectors, let: 797 # 798 # Na = Number of bits set to "1" in A = SUM ( Xai ) 799 # Nb = Number of bits set to "1" in B = SUM ( Xbi ) 800 # Nc = Number of bits set to "1" in both A and B = SUM ( Xai * Xbi ) 801 # Nd = Number of bits set to "0" in both A and B = SUM ( 1 - Xai - Xbi + Xai * Xbi) 802 # 803 # N = Number of bits set to "1" or "0" in A or B = Size of A or B = Na + Nb - Nc + Nd 804 # 805 # Additionally, for BinaryForm various values also correspond to: 806 # 807 # Na = | Xa | 808 # Nb = | Xb | 809 # Nc = | SetIntersectionXaXb | 810 # Nd = N - | SetDifferenceXaXb | 811 # 812 # | SetDifferenceXaXb | = N - Nd = Na + Nb - Nc + Nd - Nd = Na + Nb - Nc 813 # = | Xa | + | Xb | - | SetIntersectionXaXb | 814 # 815 # Various distance coefficients and similarity coefficients [ Ref 40, Ref 62, Ref 64 ] for a pair vectors A and B 816 # in AlgebraicForm and BinaryForm are defined as follows: 817 # 818 # . CityBlockDistanceCoefficient: ( same as HammingDistanceCoefficient and ManhattanDistanceCoefficient) 819 # 820 # . AlgebraicForm: SUM ( ABS ( Xai - Xbi ) ) 821 # 822 # . BinaryForm: ( Na - Nc ) + ( Nb - Nc ) = Na + Nb - 2 * Nc 823 # 824 # . SetTheoreticForm: | SetDifferenceXaXb | - | SetIntersectionXaXb | 825 # = SUM ( Xai ) + SUM ( Xbi ) - 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) 826 # 827 # . CosineSimilarityCoefficient: ( same as OchiaiSimilarityCoefficient) 828 # 829 # . 
AlgebraicForm: SUM ( Xai * Xbi ) / SQRT ( SUM ( Xai ** 2) * SUM ( Xbi ** 2) ) 830 # 831 # . BinaryForm: Nc / SQRT ( Na * Nb) 832 # 833 # . SetTheoreticForm: | SetIntersectionXaXb | / SQRT ( |Xa| * |Xb| ) 834 # = SUM ( MIN ( Xai, Xbi ) ) / SQRT ( SUM ( Xai ) * SUM ( Xbi ) ) 835 # 836 # . CzekanowskiSimilarityCoefficient: ( same as DiceSimilarityCoefficient and SorensonSimilarityCoefficient) 837 # 838 # . AlgebraicForm: ( 2 * ( SUM ( Xai * Xbi ) ) ) / ( SUM ( Xai ** 2) + SUM ( Xbi **2 ) ) 839 # 840 # . BinaryForm: 2 * Nc / ( Na + Nb ) 841 # 842 # . SetTheoreticForm: 2 * | SetIntersectionXaXb | / ( |Xa| + |Xb| ) 843 # = 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) / ( SUM ( Xai ) + SUM ( Xbi ) ) 844 # 845 # . DiceSimilarityCoefficient: ( same as CzekanowskiSimilarityCoefficient and SorensonSimilarityCoefficient) 846 # 847 # . AlgebraicForm: ( 2 * ( SUM ( Xai * Xbi ) ) ) / ( SUM ( Xai ** 2) + SUM ( Xbi **2 ) ) 848 # 849 # . BinaryForm: 2 * Nc / ( Na + Nb ) 850 # 851 # . SetTheoreticForm: 2 * | SetIntersectionXaXb | / ( |Xa| + |Xb| ) 852 # = 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) / ( SUM ( Xai ) + SUM ( Xbi ) ) 853 # 854 # . EuclideanDistanceCoefficient: 855 # 856 # . AlgebraicForm: SQRT ( SUM ( ( ( Xai - Xbi ) ** 2 ) ) ) 857 # 858 # . BinaryForm: SQRT ( ( Na - Nc ) + ( Nb - Nc ) ) = SQRT ( Na + Nb - 2 * Nc ) 859 # 860 # . SetTheoreticForm: SQRT ( | SetDifferenceXaXb | - | SetIntersectionXaXb | ) 861 # = SQRT ( SUM ( Xai ) + SUM ( Xbi ) - 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) ) 862 # 863 # . HammingDistanceCoefficient: ( same as CityBlockDistanceCoefficient and ManhattanDistanceCoefficient) 864 # 865 # . AlgebraicForm: SUM ( ABS ( Xai - Xbi ) ) 866 # 867 # . BinaryForm: ( Na - Nc ) + ( Nb - Nc ) = Na + Nb - 2 * Nc 868 # 869 # . SetTheoreticForm: | SetDifferenceXaXb | - | SetIntersectionXaXb | 870 # = SUM ( Xai ) + SUM ( Xbi ) - 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) 871 # 872 # . JaccardSimilarityCoefficient: ( same as TanimotoSimilarityCoefficient) 873 # 874 # . 
AlgebraicForm: SUM ( Xai * Xbi ) / ( SUM ( Xai ** 2 ) + SUM ( Xbi ** 2 ) - SUM ( Xai * Xbi ) ) 875 # 876 # . BinaryForm: Nc / ( ( Na - Nc ) + ( Nb - Nc ) + Nc ) = Nc / ( Na + Nb - Nc ) 877 # 878 # . SetTheoreticForm: | SetIntersectionXaXb | / | SetDifferenceXaXb | 879 # = SUM ( MIN ( Xai, Xbi ) ) / ( SUM ( Xai ) + SUM ( Xbi ) - SUM ( MIN ( Xai, Xbi ) ) ) 880 # 881 # . ManhattanDistanceCoefficient: ( same as CityBlockDistanceCoefficient and HammingDistanceCoefficient) 882 # 883 # . AlgebraicForm: SUM ( ABS ( Xai - Xbi ) ) 884 # 885 # . BinaryForm: ( Na - Nc ) + ( Nb - Nc ) = Na + Nb - 2 * Nc 886 # 887 # . SetTheoreticForm: | SetDifferenceXaXb | - | SetIntersectionXaXb | 888 # = SUM ( Xai ) + SUM ( Xbi ) - 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) 889 # 890 # . OchiaiSimilarityCoefficient: ( same as CosineSimilarityCoefficient) 891 # 892 # . AlgebraicForm: SUM ( Xai * Xbi ) / SQRT ( SUM ( Xai ** 2) * SUM ( Xbi ** 2) ) 893 # 894 # . BinaryForm: Nc / SQRT ( Na * Nb) 895 # 896 # . SetTheoreticForm: | SetIntersectionXaXb | / SQRT ( |Xa| * |Xb| ) 897 # = SUM ( MIN ( Xai, Xbi ) ) / SQRT ( SUM ( Xai ) * SUM ( Xbi ) ) 898 # 899 # . SorensonSimilarityCoefficient: ( same as CzekanowskiSimilarityCoefficient and DiceSimilarityCoefficient) 900 # 901 # . AlgebraicForm: ( 2 * ( SUM ( Xai * Xbi ) ) ) / ( SUM ( Xai ** 2) + SUM ( Xbi **2 ) ) 902 # 903 # . BinaryForm: 2 * Nc / ( Na + Nb ) 904 # 905 # . SetTheoreticForm: 2 * | SetIntersectionXaXb | / ( |Xa| + |Xb| ) 906 # = 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) / ( SUM ( Xai ) + SUM ( Xbi ) ) 907 # 908 # . SoergelDistanceCoefficient: 909 # 910 # . AlgebraicForm: SUM ( ABS ( Xai - Xbi ) ) / SUM ( MAX ( Xai, Xbi ) ) 911 # 912 # . BinaryForm: 1 - Nc / ( Na + Nb - Nc ) = ( Na + Nb - 2 * Nc ) / ( Na + Nb - Nc ) 913 # 914 # . 
#       SetTheoreticForm: ( | SetDifferenceXaXb | - | SetIntersectionXaXb | ) / | SetDifferenceXaXb |
#                         = ( SUM ( Xai ) + SUM ( Xbi ) - 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) ) / ( SUM ( Xai ) + SUM ( Xbi ) - SUM ( MIN ( Xai, Xbi ) ) )
#
#   . TanimotoSimilarityCoefficient: ( same as JaccardSimilarityCoefficient)
#
#     . AlgebraicForm: SUM ( Xai * Xbi ) / ( SUM ( Xai ** 2 ) + SUM ( Xbi ** 2 ) - SUM ( Xai * Xbi ) )
#
#     . BinaryForm: Nc / ( ( Na - Nc ) + ( Nb - Nc ) + Nc ) = Nc / ( Na + Nb - Nc )
#
#     . SetTheoreticForm: | SetIntersectionXaXb | / | SetDifferenceXaXb |
#                         = SUM ( MIN ( Xai, Xbi ) ) / ( SUM ( Xai ) + SUM ( Xbi ) - SUM ( MIN ( Xai, Xbi ) ) )
#
#

# Calculate Hamming distance coefficient between two fingerprint vectors: it's
# the same as the CityBlock distance coefficient.
#
# This functionality can be either invoked as a class function or an object method.
#
sub HammingDistanceCoefficient ($$;$$) {
  my($VectorA, $VectorB, $Mode, $SkipCheck) = @_;

  return CityBlockDistanceCoefficient($VectorA, $VectorB, $Mode, $SkipCheck);
}

# Calculate Manhattan distance coefficient between two fingerprint vectors: it's
# the same as the CityBlock distance coefficient.
#
# This functionality can be either invoked as a class function or an object method.
#
sub ManhattanDistanceCoefficient ($$;$$) {
  my($VectorA, $VectorB, $Mode, $SkipCheck) = @_;

  return CityBlockDistanceCoefficient($VectorA, $VectorB, $Mode, $SkipCheck);
}

# Calculate CityBlock distance coefficient between two fingerprint vectors.
#
# CalculationMode is AlgebraicForm, BinaryForm or SetTheoreticForm, matched case
# insensitively, with AlgebraicForm as the default; SkipValuesCheck defaults to 0.
# Both are passed on to the validation function before the calculation. undef is
# returned for any other calculation mode.
#
# This functionality can be either invoked as a class function or an object method.
#
sub CityBlockDistanceCoefficient ($$;$$) {
  my($FingerprintsVectorA, $FingerprintsVectorB, $CalculationMode, $SkipValuesCheck) = @_;

  # Fill in defaults for the optional parameters...
  $CalculationMode = 'AlgebraicForm' unless defined $CalculationMode;
  $SkipValuesCheck = 0 unless defined $SkipValuesCheck;

  # Validate and process fingerprints vectors for similarity calculations...
  #
  _ValidateAndProcessFingerprintsVectorsForSimilarityCalculation("CityBlockDistanceCoefficient: Calculation failed", $FingerprintsVectorA, $FingerprintsVectorB, $CalculationMode, $SkipValuesCheck);

  # Perform the calculation using the specified form...
  if ($CalculationMode =~ /^AlgebraicForm$/i) {
    return _CityBlockDistanceCoefficientUsingAlgebraicForm($FingerprintsVectorA, $FingerprintsVectorB);
  }
  if ($CalculationMode =~ /^BinaryForm$/i) {
    return _CityBlockDistanceCoefficientUsingBinaryForm($FingerprintsVectorA, $FingerprintsVectorB);
  }
  if ($CalculationMode =~ /^SetTheoreticForm$/i) {
    return _CityBlockDistanceCoefficientUsingSetTheoreticForm($FingerprintsVectorA, $FingerprintsVectorB);
  }
  return undef;
}

# Calculate CityBlock distance coefficient using algebraic form: SUM ( ABS ( Xai - Xbi ) )
#
sub _CityBlockDistanceCoefficientUsingAlgebraicForm {
  my($VectorA, $VectorB) = @_;

  my $SumOfAbsoluteDifferences = _GetSumOfAbsoluteValueOfSubtractionOfFingerprintsOrderedValues($VectorA, $VectorB);

  return $SumOfAbsoluteDifferences;
}

# Calculate CityBlock distance coefficient using binary form: Na + Nb - 2 * Nc
#
sub _CityBlockDistanceCoefficientUsingBinaryForm {
  my($VectorA, $VectorB) = @_;

  my($NumOfSetBitsInA, $NumOfSetBitsInB, $NumOfCommonSetBits) = _GetNumOfIndividualAndCommonSetBits($VectorA, $VectorB);

  return ($NumOfSetBitsInA + $NumOfSetBitsInB - 2 * $NumOfCommonSetBits);
}

# Calculate CityBlock distance coefficient using set theoretic form...
#
# SetTheoreticForm: SUM ( Xai ) + SUM ( Xbi ) - 2 * ( SUM ( MIN ( Xai, Xbi ) ) )
#
sub _CityBlockDistanceCoefficientUsingSetTheoreticForm {
  my($VectorA, $VectorB) = @_;

  my $TotalA = _GetSumOfFingerprintsOrderedValues($VectorA);
  my $TotalB = _GetSumOfFingerprintsOrderedValues($VectorB);
  my $TotalOfMinimums = _GetSumOfMinimumOfFingerprintsOrderdedValues($VectorA, $VectorB);

  return $TotalA + $TotalB - 2 * $TotalOfMinimums;
}

# Calculate Ochiai similarity coefficient between two fingerprint vectors.
#
# This functionality can be either invoked as a class function or an object method.
#
# Ochiai similarity is an alias here: all four arguments are handed unchanged to
# CosineSimilarityCoefficient and its result is returned.
#
sub OchiaiSimilarityCoefficient ($$;$$) {
  my($VectorA, $VectorB, $Mode, $SkipCheck) = @_;

  return CosineSimilarityCoefficient($VectorA, $VectorB, $Mode, $SkipCheck);
}

# Calculate Cosine similarity coefficient between two fingerprint vectors.
#
# This functionality can be either invoked as a class function or an object method.
#
# Arguments:
#   . First and second fingerprints vectors
#   . Calculation mode: AlgebraicForm, BinaryForm or SetTheoreticForm, matched
#     case insensitively; default: AlgebraicForm
#   . SkipValuesCheck flag; default: 0
#
# Returns the value calculated by the helper for the selected mode, or undef
# for a calculation mode which matches none of the three supported names.
#
sub CosineSimilarityCoefficient ($$;$$) {
  my($VectorA, $VectorB, $Mode, $SkipCheck) = @_;

  # Use default values for optional arguments which were not supplied...
  $Mode = 'AlgebraicForm' unless defined $Mode;
  $SkipCheck = 0 unless defined $SkipCheck;

  # Validate and process fingerprints vectors for similarity calculations...
  #
  _ValidateAndProcessFingerprintsVectorsForSimilarityCalculation("CosineSimilarityCoefficient: Calculation failed", $VectorA, $VectorB, $Mode, $SkipCheck);

  # Perform the calculation using the helper for the requested mode...
  return _CosineSimilarityCoefficientUsingAlgebraicForm($VectorA, $VectorB) if $Mode =~ /^AlgebraicForm$/i;
  return _CosineSimilarityCoefficientUsingBinaryForm($VectorA, $VectorB) if $Mode =~ /^BinaryForm$/i;
  return _CosineSimilarityCoefficientUsingSetTheoreticForm($VectorA, $VectorB) if $Mode =~ /^SetTheoreticForm$/i;

  # Unrecognized calculation mode...
  return undef;
}

# Calculate Cosine similarity coefficient using algebraic form...
#
# AlgebraicForm: SUM ( Xai * Xbi ) / SQRT ( SUM ( Xai ** 2) * SUM ( Xbi ** 2) )
#
# A zero denominator yields 0 instead of a division.
#
sub _CosineSimilarityCoefficientUsingAlgebraicForm {
  my($VectorA, $VectorB) = @_;

  my $SquaresTotalA = _GetSumOfSquaresOfFingerprintsOrderedValues($VectorA);
  my $SquaresTotalB = _GetSumOfSquaresOfFingerprintsOrderedValues($VectorB);
  my $ProductsTotal = _GetSumOfProductOfFingerprintsOrderedValues($VectorA, $VectorB);

  my $Divisor = sqrt($SquaresTotalA * $SquaresTotalB);
  if ($Divisor) {
    return $ProductsTotal/$Divisor;
  }
  return 0;
}

# Calculate Cosine similarity coefficient using binary form...
#
# BinaryForm: Nc / SQRT ( Na * Nb)
#
# A zero denominator yields 0 instead of a division.
#
sub _CosineSimilarityCoefficientUsingBinaryForm {
  my($VectorA, $VectorB) = @_;

  my($SetBitsA, $SetBitsB, $SetBitsCommon) = _GetNumOfIndividualAndCommonSetBits($VectorA, $VectorB);

  my $Divisor = sqrt($SetBitsA * $SetBitsB);
  if ($Divisor) {
    return $SetBitsCommon/$Divisor;
  }
  return 0;
}

# Calculate Cosine similarity coefficient using set theoretic form...
#
# SetTheoreticForm: SUM ( MIN ( Xai, Xbi ) ) / SQRT ( SUM ( Xai ) * SUM ( Xbi ) )
#
# A zero denominator yields 0 instead of a division.
#
sub _CosineSimilarityCoefficientUsingSetTheoreticForm {
  my($VectorA, $VectorB) = @_;

  my $TotalA = _GetSumOfFingerprintsOrderedValues($VectorA);
  my $TotalB = _GetSumOfFingerprintsOrderedValues($VectorB);
  my $TotalOfMinimums = _GetSumOfMinimumOfFingerprintsOrderdedValues($VectorA, $VectorB);

  my $Divisor = sqrt($TotalA * $TotalB);
  if ($Divisor) {
    return $TotalOfMinimums/$Divisor;
  }
  return 0;
}

# Calculate Czekanowski similarity coefficient between two fingerprint vectors.
#
# This functionality can be either invoked as a class function or an object method.
#
# Czekanowski similarity is an alias here: all four arguments are handed unchanged
# to DiceSimilarityCoefficient and its result is returned.
#
sub CzekanowskiSimilarityCoefficient ($$;$$) {
  my($VectorA, $VectorB, $Mode, $SkipCheck) = @_;

  return DiceSimilarityCoefficient($VectorA, $VectorB, $Mode, $SkipCheck);
}

# Calculate Sorenson similarity coefficient between two fingerprint vectors.
#
# This functionality can be either invoked as a class function or an object method.
#
# Sorenson similarity is an alias here: all four arguments are handed unchanged
# to DiceSimilarityCoefficient and its result is returned.
#
sub SorensonSimilarityCoefficient ($$;$$) {
  my($VectorA, $VectorB, $Mode, $SkipCheck) = @_;

  return DiceSimilarityCoefficient($VectorA, $VectorB, $Mode, $SkipCheck);
}

# Calculate Dice similarity coefficient between two fingerprint vectors.
#
# This functionality can be either invoked as a class function or an object method.
#
# Arguments:
#   . First and second fingerprints vectors
#   . Calculation mode: AlgebraicForm, BinaryForm or SetTheoreticForm, matched
#     case insensitively; default: AlgebraicForm
#   . SkipValuesCheck flag; default: 0
#
# Returns the value calculated by the helper for the selected mode, or undef
# for a calculation mode which matches none of the three supported names.
#
sub DiceSimilarityCoefficient ($$;$$) {
  my($VectorA, $VectorB, $Mode, $SkipCheck) = @_;

  # Use default values for optional arguments which were not supplied...
  $Mode = 'AlgebraicForm' unless defined $Mode;
  $SkipCheck = 0 unless defined $SkipCheck;

  # Validate and process fingerprints vectors for similarity calculations...
  #
  _ValidateAndProcessFingerprintsVectorsForSimilarityCalculation("DiceSimilarityCoefficient: Calculation failed", $VectorA, $VectorB, $Mode, $SkipCheck);

  # Perform the calculation using the helper for the requested mode...
  return _DiceSimilarityCoefficientUsingAlgebraicForm($VectorA, $VectorB) if $Mode =~ /^AlgebraicForm$/i;
  return _DiceSimilarityCoefficientUsingBinaryForm($VectorA, $VectorB) if $Mode =~ /^BinaryForm$/i;
  return _DiceSimilarityCoefficientUsingSetTheoreticForm($VectorA, $VectorB) if $Mode =~ /^SetTheoreticForm$/i;

  # Unrecognized calculation mode...
  return undef;
}

# Calculate Dice similarity coefficient using algebraic form...
#
# AlgebraicForm: ( 2 * ( SUM ( Xai * Xbi ) ) ) / ( SUM ( Xai ** 2) + SUM ( Xbi **2 ) )
#
# A zero denominator yields 0 instead of a division.
#
sub _DiceSimilarityCoefficientUsingAlgebraicForm {
  my($VectorA, $VectorB) = @_;

  my $SquaresTotalA = _GetSumOfSquaresOfFingerprintsOrderedValues($VectorA);
  my $SquaresTotalB = _GetSumOfSquaresOfFingerprintsOrderedValues($VectorB);
  my $ProductsTotal = _GetSumOfProductOfFingerprintsOrderedValues($VectorA, $VectorB);

  my $Dividend = 2 * $ProductsTotal;
  my $Divisor = $SquaresTotalA + $SquaresTotalB;
  if ($Divisor) {
    return $Dividend/$Divisor;
  }
  return 0;
}

# Calculate Dice similarity coefficient using binary form...
#
# BinaryForm: 2 * Nc / ( Na + Nb )
#
# A zero denominator yields 0 instead of a division.
#
sub _DiceSimilarityCoefficientUsingBinaryForm {
  my($VectorA, $VectorB) = @_;

  my($SetBitsA, $SetBitsB, $SetBitsCommon) = _GetNumOfIndividualAndCommonSetBits($VectorA, $VectorB);

  my $Dividend = 2 * $SetBitsCommon;
  my $Divisor = $SetBitsA + $SetBitsB;
  if ($Divisor) {
    return $Dividend/$Divisor;
  }
  return 0;
}

# Calculate Dice similarity coefficient using set theoretic form...
#
# SetTheoreticForm: 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) / ( SUM ( Xai ) + SUM ( Xbi ) )
#
# A zero denominator yields 0 instead of a division.
#
sub _DiceSimilarityCoefficientUsingSetTheoreticForm {
  my($VectorA, $VectorB) = @_;

  my $TotalA = _GetSumOfFingerprintsOrderedValues($VectorA);
  my $TotalB = _GetSumOfFingerprintsOrderedValues($VectorB);
  my $TotalOfMinimums = _GetSumOfMinimumOfFingerprintsOrderdedValues($VectorA, $VectorB);

  my $Dividend = 2 * $TotalOfMinimums;
  my $Divisor = $TotalA + $TotalB;
  if ($Divisor) {
    return $Dividend/$Divisor;
  }
  return 0;
}


# Calculate Euclidean distance coefficient between two fingerprint vectors.
#
# This functionality can be either invoked as a class function or an object method.
#
# Arguments:
#   . First and second fingerprints vectors
#   . Calculation mode: AlgebraicForm, BinaryForm or SetTheoreticForm, matched
#     case insensitively; default: AlgebraicForm
#   . SkipValuesCheck flag; default: 0
#
# Returns the value calculated by the helper for the selected mode, or undef
# for a calculation mode which matches none of the three supported names.
#
sub EuclideanDistanceCoefficient ($$;$$) {
  my($VectorA, $VectorB, $Mode, $SkipCheck) = @_;

  # Use default values for optional arguments which were not supplied...
  $Mode = 'AlgebraicForm' unless defined $Mode;
  $SkipCheck = 0 unless defined $SkipCheck;

  # Validate and process fingerprints vectors for similarity calculations...
  #
  _ValidateAndProcessFingerprintsVectorsForSimilarityCalculation("EuclideanDistanceCoefficient: Calculation failed", $VectorA, $VectorB, $Mode, $SkipCheck);

  # Perform the calculation using the helper for the requested mode...
  return _EuclideanDistanceCoefficientUsingAlgebraicForm($VectorA, $VectorB) if $Mode =~ /^AlgebraicForm$/i;
  return _EuclideanDistanceCoefficientUsingBinaryForm($VectorA, $VectorB) if $Mode =~ /^BinaryForm$/i;
  return _EuclideanDistanceCoefficientUsingSetTheoreticForm($VectorA, $VectorB) if $Mode =~ /^SetTheoreticForm$/i;

  # Unrecognized calculation mode...
  return undef;
}

# Calculate Euclidean distance coefficient using algebraic form...
#
# Square root of the sum of squared differences of corresponding ordered values.
#
sub _EuclideanDistanceCoefficientUsingAlgebraicForm {
  my($VectorA, $VectorB) = @_;

  return sqrt(_GetSumOfSquaresOfSubtractionOfFingerprintsOrderedValues($VectorA, $VectorB));
}

# Calculate Euclidean distance coefficient using binary form...
#
# Square root of ( Na + Nb - 2 * Nc ).
#
sub _EuclideanDistanceCoefficientUsingBinaryForm {
  my($VectorA, $VectorB) = @_;

  my($SetBitsA, $SetBitsB, $SetBitsCommon) = _GetNumOfIndividualAndCommonSetBits($VectorA, $VectorB);

  return sqrt($SetBitsA + $SetBitsB - 2 * $SetBitsCommon);
}

# Calculate Euclidean distance coefficient using set theoretic form...
#
# Square root of ( SUM ( Xai ) + SUM ( Xbi ) - 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) ).
#
sub _EuclideanDistanceCoefficientUsingSetTheoreticForm {
  my($VectorA, $VectorB) = @_;

  my $TotalA = _GetSumOfFingerprintsOrderedValues($VectorA);
  my $TotalB = _GetSumOfFingerprintsOrderedValues($VectorB);
  my $TotalOfMinimums = _GetSumOfMinimumOfFingerprintsOrderdedValues($VectorA, $VectorB);

  return sqrt($TotalA + $TotalB - 2 * $TotalOfMinimums);
}

# Calculate Jaccard similarity coefficient between two fingerprint vectors.
#
# This functionality can be either invoked as a class function or an object method.
#
# Jaccard similarity is an alias here: all four arguments are handed unchanged
# to TanimotoSimilarityCoefficient and its result is returned.
#
sub JaccardSimilarityCoefficient ($$;$$) {
  my($VectorA, $VectorB, $Mode, $SkipCheck) = @_;

  return TanimotoSimilarityCoefficient($VectorA, $VectorB, $Mode, $SkipCheck);
}

# Calculate Tanimoto similarity coefficient between two fingerprint vectors.
#
# This functionality can be either invoked as a class function or an object method.
#
# Arguments:
#   . First and second fingerprints vectors
#   . Calculation mode: AlgebraicForm, BinaryForm or SetTheoreticForm, matched
#     case insensitively; default: AlgebraicForm
#   . SkipValuesCheck flag; default: 0
#
# Returns the value calculated by the helper for the selected mode, or undef
# for a calculation mode which matches none of the three supported names.
#
sub TanimotoSimilarityCoefficient ($$;$$) {
  my($VectorA, $VectorB, $Mode, $SkipCheck) = @_;

  # Use default values for optional arguments which were not supplied...
  $Mode = 'AlgebraicForm' unless defined $Mode;
  $SkipCheck = 0 unless defined $SkipCheck;

  # Validate and process fingerprints vectors for similarity calculations...
  #
  _ValidateAndProcessFingerprintsVectorsForSimilarityCalculation("TanimotoSimilarityCoefficient: Calculation failed", $VectorA, $VectorB, $Mode, $SkipCheck);

  # Perform the calculation using the helper for the requested mode...
  return _TanimotoSimilarityCoefficientUsingAlgebraicForm($VectorA, $VectorB) if $Mode =~ /^AlgebraicForm$/i;
  return _TanimotoSimilarityCoefficientUsingBinaryForm($VectorA, $VectorB) if $Mode =~ /^BinaryForm$/i;
  return _TanimotoSimilarityCoefficientUsingSetTheoreticForm($VectorA, $VectorB) if $Mode =~ /^SetTheoreticForm$/i;

  # Unrecognized calculation mode...
  return undef;
}

# Calculate Tanimoto similarity coefficient using algebraic form...
#
# AlgebraicForm: SUM ( Xai * Xbi ) / ( SUM ( Xai ** 2 ) + SUM ( Xbi ** 2 ) - SUM ( Xai * Xbi ) )
#
# A zero denominator yields 0 instead of a division.
#
sub _TanimotoSimilarityCoefficientUsingAlgebraicForm {
  my($VectorA, $VectorB) = @_;

  my $SquaresTotalA = _GetSumOfSquaresOfFingerprintsOrderedValues($VectorA);
  my $SquaresTotalB = _GetSumOfSquaresOfFingerprintsOrderedValues($VectorB);
  my $ProductsTotal = _GetSumOfProductOfFingerprintsOrderedValues($VectorA, $VectorB);

  my $Divisor = $SquaresTotalA + $SquaresTotalB - $ProductsTotal;
  if ($Divisor) {
    return $ProductsTotal/$Divisor;
  }
  return 0;
}

# Calculate Tanimoto similarity coefficient using binary form...
#
# BinaryForm: Nc / ( Na + Nb - Nc )
#
# A zero denominator yields 0 instead of a division.
#
sub _TanimotoSimilarityCoefficientUsingBinaryForm {
  my($VectorA, $VectorB) = @_;

  my($SetBitsA, $SetBitsB, $SetBitsCommon) = _GetNumOfIndividualAndCommonSetBits($VectorA, $VectorB);

  my $Divisor = $SetBitsA + $SetBitsB - $SetBitsCommon;
  if ($Divisor) {
    return $SetBitsCommon/$Divisor;
  }
  return 0;
}

# Calculate Tanimoto similarity coefficient using set theoretic form...
#
# SetTheoreticForm: SUM ( MIN ( Xai, Xbi ) ) / ( SUM ( Xai ) + SUM ( Xbi ) - SUM ( MIN ( Xai, Xbi ) ) )
#
# A zero denominator yields 0 instead of a division.
#
sub _TanimotoSimilarityCoefficientUsingSetTheoreticForm {
  my($VectorA, $VectorB) = @_;

  my $TotalA = _GetSumOfFingerprintsOrderedValues($VectorA);
  my $TotalB = _GetSumOfFingerprintsOrderedValues($VectorB);
  my $TotalOfMinimums = _GetSumOfMinimumOfFingerprintsOrderdedValues($VectorA, $VectorB);

  my $Divisor = $TotalA + $TotalB - $TotalOfMinimums;
  if ($Divisor) {
    return $TotalOfMinimums/$Divisor;
  }
  return 0;
}


# Calculate Soergel distance coefficient between two fingerprint vectors.
#
# This functionality can be either invoked as a class function or an object method.
#
# Arguments:
#   . First and second fingerprints vectors
#   . Calculation mode: AlgebraicForm, BinaryForm or SetTheoreticForm, matched
#     case insensitively; default: AlgebraicForm
#   . SkipValuesCheck flag; default: 0
#
# Returns the value calculated by the helper for the selected mode, or undef
# for a calculation mode which matches none of the three supported names.
#
sub SoergelDistanceCoefficient ($$;$$) {
  my($VectorA, $VectorB, $Mode, $SkipCheck) = @_;

  # Use default values for optional arguments which were not supplied...
  $Mode = 'AlgebraicForm' unless defined $Mode;
  $SkipCheck = 0 unless defined $SkipCheck;

  # Validate and process fingerprints vectors for similarity calculations...
  #
  _ValidateAndProcessFingerprintsVectorsForSimilarityCalculation("SoergelDistanceCoefficient: Calculation failed", $VectorA, $VectorB, $Mode, $SkipCheck);

  # Perform the calculation using the helper for the requested mode...
  return _SoergelDistanceCoefficientUsingAlgebraicForm($VectorA, $VectorB) if $Mode =~ /^AlgebraicForm$/i;
  return _SoergelDistanceCoefficientUsingBinaryForm($VectorA, $VectorB) if $Mode =~ /^BinaryForm$/i;
  return _SoergelDistanceCoefficientUsingSetTheoreticForm($VectorA, $VectorB) if $Mode =~ /^SetTheoreticForm$/i;

  # Unrecognized calculation mode...
  return undef;
}

# Calculate Soergel distance coefficient using algebraic form...
#
# AlgebraicForm: SUM ( ABS ( Xai - Xbi ) ) / SUM ( MAX ( Xai, Xbi ) )
#
# A zero denominator yields 0 instead of a division.
#
sub _SoergelDistanceCoefficientUsingAlgebraicForm {
  my($VectorA, $VectorB) = @_;

  my $TotalOfAbsDifferences = _GetSumOfAbsoluteValueOfSubtractionOfFingerprintsOrderedValues($VectorA, $VectorB);
  my $TotalOfMaximums = _GetSumOfMaximumOfFingerprintsOrderdedValues($VectorA, $VectorB);

  if ($TotalOfMaximums) {
    return $TotalOfAbsDifferences/$TotalOfMaximums;
  }
  return 0;
}

# Calculate Soergel distance coefficient using binary form...
#
# BinaryForm: ( Na + Nb - 2 * Nc ) / ( Na + Nb - Nc )
#
# A zero denominator yields 0 instead of a division.
#
sub _SoergelDistanceCoefficientUsingBinaryForm {
  my($VectorA, $VectorB) = @_;

  my($SetBitsA, $SetBitsB, $SetBitsCommon) = _GetNumOfIndividualAndCommonSetBits($VectorA, $VectorB);

  my $Dividend = $SetBitsA + $SetBitsB - 2 * $SetBitsCommon;
  my $Divisor = $SetBitsA + $SetBitsB - $SetBitsCommon;
  if ($Divisor) {
    return $Dividend/$Divisor;
  }
  return 0;
}

# Calculate SoergelDistanceCoefficient using set theoretic form...
#
# SetTheoreticForm: ( SUM ( Xai ) + SUM ( Xbi ) - 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) ) / ( SUM ( Xai ) + SUM ( Xbi ) - SUM ( MIN ( Xai, Xbi ) ) )
#
# A zero denominator yields 0 instead of a division.
#
sub _SoergelDistanceCoefficientUsingSetTheoreticForm {
  my($FingerprintsVectorA, $FingerprintsVectorB) = @_;
  my($SumMinXaiXbi, $SumXai, $SumXbi, $Numerator, $Denominator);

  $SumXai = _GetSumOfFingerprintsOrderedValues($FingerprintsVectorA);
  $SumXbi = _GetSumOfFingerprintsOrderedValues($FingerprintsVectorB);
  $SumMinXaiXbi = _GetSumOfMinimumOfFingerprintsOrderdedValues($FingerprintsVectorA, $FingerprintsVectorB);

  $Numerator = $SumXai + $SumXbi - 2 * $SumMinXaiXbi;
  $Denominator = $SumXai + $SumXbi - $SumMinXaiXbi;

  return $Denominator ? ($Numerator/$Denominator) : 0;
}

# Validate and process fingerprints vectors for similarity calculations...
#
# Arguments:
#   . ErrorMsg: text inserted into any error message raised during validation
#   . First and second fingerprints vectors
#   . Calculation mode; default: AlgebraicForm
#   . SkipValuesCheck flag; default: 0. A true value bypasses validation and
#     goes straight to processing.
#
sub _ValidateAndProcessFingerprintsVectorsForSimilarityCalculation {
  my($ErrorMsg, $FingerprintsVectorA, $FingerprintsVectorB, $CalculationMode, $SkipValuesCheck) = @_;

  $CalculationMode = defined $CalculationMode ? $CalculationMode : 'AlgebraicForm';
  $SkipValuesCheck = defined $SkipValuesCheck ? $SkipValuesCheck : 0;

  if (!$SkipValuesCheck) {
    _ValidateFingerprintsVectorsForSimilarityCalculation($ErrorMsg, $FingerprintsVectorA, $FingerprintsVectorB, $CalculationMode);
  }
  _ProcessFingerprintsVectorsForSimilarityCalculation($ErrorMsg, $FingerprintsVectorA, $FingerprintsVectorB, $CalculationMode);
}

# Make sure fingerprint vectors are good for performing similarity/distance calculation...
#
# Croaks, using ErrorMsg as part of the message, when any of these checks fail:
#   . Both arguments are fingerprints vectors
#   . Both vectors have the same type
#   . Calculation mode is AlgebraicForm, BinaryForm or SetTheoreticForm
#   . Both vectors contain at least one value
#   . OrderedNumericalValues: both vectors have the same number of values
#   . NumericalValues: each vector has value IDs, one for each of its values
#   . AlphaNumericalValues: neither vector has value IDs
#   . Vector type is one of the three types listed above
#
sub _ValidateFingerprintsVectorsForSimilarityCalculation {
  my($ErrorMsg, $FingerprintsVectorA, $FingerprintsVectorB, $CalculationMode) = @_;

  # Make sure both are fingerprint vectors..
  if (!(IsFingerprintsVector($FingerprintsVectorA) && IsFingerprintsVector($FingerprintsVectorB))) {
    croak "Error: ${ClassName}->${ErrorMsg}: Both objects must be fingerprint vectors...";
  }

  # Check types...
  if ($FingerprintsVectorA->{Type} ne $FingerprintsVectorB->{Type}) {
    croak "Error: ${ClassName}->${ErrorMsg}: Type of first fingerprint vector, $FingerprintsVectorA->{Type}, must be same as type of second fingerprint vector, $FingerprintsVectorB->{Type}...";
  }

  # Check calculation mode...
  if ($CalculationMode !~ /^(AlgebraicForm|BinaryForm|SetTheoreticForm)$/i) {
    croak "Error: ${ClassName}->${ErrorMsg}: Specified similarity calculation mode, $CalculationMode, is not valid. Supported values: AlgebraicForm, BinaryForm, and SetTheoreticForm...";
  }

  # Check values and value IDs...
  my($Na, $Nb, $NIDa, $NIDb);
  $Na = $FingerprintsVectorA->GetNumOfValues(); $Nb = $FingerprintsVectorB->GetNumOfValues();
  $NIDa = $FingerprintsVectorA->GetNumOfValueIDs(); $NIDb = $FingerprintsVectorB->GetNumOfValueIDs();

  if ($Na == 0) {
    croak "Error: ${ClassName}->${ErrorMsg}: Number of values in first fingerprint vector, $Na, must be > 0 for fingerprint vector type $FingerprintsVectorA->{Type} ...";
  }
  if ($Nb == 0) {
    croak "Error: ${ClassName}->${ErrorMsg}: Number of values in second fingerprint vector, $Nb, must be > 0 for fingerprint vector type $FingerprintsVectorB->{Type} ...";
  }

  # Types of both vectors are identical at this point; the first one drives the checks...
  if ($FingerprintsVectorA->{Type} =~ /^OrderedNumericalValues$/i) {
    if ($Na != $Nb) {
      croak "Error: ${ClassName}->${ErrorMsg}: Number of values in first fingerprint vector, $Na, must be equal to number of values, $Nb, in second fingerprint vector for fingerprint vector types $FingerprintsVectorA->{Type} ...";
    }
  }
  elsif ($FingerprintsVectorA->{Type} =~ /^NumericalValues$/i) {
    if ($NIDa == 0) {
      croak "Error: ${ClassName}->${ErrorMsg}: Number of value IDs in first fingerprint vector, $NIDa, must be > 0 for fingerprint vector type $FingerprintsVectorA->{Type} ...";
    }
    if ($NIDb == 0) {
      # This check is about the second vector and the message says so...
      croak "Error: ${ClassName}->${ErrorMsg}: Number of value IDs in second fingerprint vector, $NIDb, must be > 0 for fingerprint vector type $FingerprintsVectorB->{Type} ...";
    }

    if ($NIDa != $Na) {
      croak "Error: ${ClassName}->${ErrorMsg}: Number of value IDs in first fingerprint vector, $NIDa, must be equal to its number of values, $Na, for fingerprint vector type $FingerprintsVectorA->{Type} ...";
    }
    if ($NIDb != $Nb) {
      croak "Error: ${ClassName}->${ErrorMsg}: Number of value IDs in second fingerprint vector, $NIDb, must be equal to its number of values, $Nb, for fingerprint vector type $FingerprintsVectorA->{Type} ...";
    }
  }
  elsif ($FingerprintsVectorA->{Type} =~ /^AlphaNumericalValues$/i) {
    if ($NIDa || $NIDb) {
      croak "Error: ${ClassName}->${ErrorMsg}: ValueIDs cann't be specified for fingerprint vector types $FingerprintsVectorA->{Type} ...";
    }
  }
  else {
    croak "Error: ${ClassName}->${ErrorMsg}: Fingerprint vector types $FingerprintsVectorA->{Type} is not valid...";
  }
}

# Process fingerprints vectors for similarity calculation by generating vectors
# containing ordered list of values...
#
# Any previously stored OrderedValuesRef and BitVector entries of both vectors
# are reset to undef first. The ordered values are then generated by the helper
# matching the type of the first vector; an unsupported type croaks. For
# BinaryForm calculation mode, the ordered values are additionally transformed
# into bit vectors.
#
sub _ProcessFingerprintsVectorsForSimilarityCalculation {
  my($ErrorMsg, $VectorA, $VectorB, $CalculationMode) = @_;

  # Discard results left over from any earlier calculation...
  for my $Vector ($VectorA, $VectorB) {
    $Vector->{OrderedValuesRef} = undef;
  }
  for my $Vector ($VectorA, $VectorB) {
    $Vector->{BitVector} = undef;
  }

  # Pick the helper which knows how to order values for this vector type...
  my $Type = $VectorA->{Type};
  my $ProcessorRef = ($Type =~ /^OrderedNumericalValues$/i) ? \&_ProcessOrderedNumericalValuesFingerprintsVectorsForSimilarityCalculation
                   : ($Type =~ /^NumericalValues$/i)        ? \&_ProcessNumericalValuesFingerprintsVectorsForSimilarityCalculation
                   : ($Type =~ /^AlphaNumericalValues$/i)   ? \&_ProcessAlphaNumericalValuesFingerprintsVectorsForSimilarityCalculation
                   :                                          undef;

  if (!defined $ProcessorRef) {
    croak "Error: ${ClassName}->${ErrorMsg}: Fingerprint vector types $VectorA->{Type} is not valid...";
  }
  $ProcessorRef->($VectorA, $VectorB);

  # Binary form works on bit vectors rather than on ordered values...
  if ($CalculationMode =~ /^BinaryForm$/i) {
    _TransformFinalOrderedValuesIntoBitVectorsForSimilarityCalculation($VectorA, $VectorB);
  }
}

# Process fingerprints vectors with ordered numerical values for similarity calculations...
#
# The ordered values of each vector are simply a reference to its own Values array.
#
sub _ProcessOrderedNumericalValuesFingerprintsVectorsForSimilarityCalculation {
  my($VectorA, $VectorB) = @_;

  for my $Vector ($VectorA, $VectorB) {
    $Vector->{OrderedValuesRef} = \@{$Vector->{Values}};
  }
}

# Process fingerprints vectors with numerical values for similarity calculations...
#
# Values sharing a value ID within a vector are added up. The union of value IDs
# from both vectors, sorted as strings, defines the positions of the ordered
# values; a value ID missing from a vector contributes 0 for that vector. The
# results are stored as OrderedValuesRef in each vector.
#
sub _ProcessNumericalValuesFingerprintsVectorsForSimilarityCalculation {
  my($FingerprintsVectorA, $FingerprintsVectorB) = @_;

  # Set up unique IDs and values map for each fingerprint vector...
  my(%ValueOfIDInA, %ValueOfIDInB, %AllValueIDs);

  for my $VectorAndMap ([$FingerprintsVectorA, \%ValueOfIDInA], [$FingerprintsVectorB, \%ValueOfIDInB]) {
    my($Vector, $ValueOfIDRef) = @{$VectorAndMap};

    for my $Index (0 .. $#{$Vector->{ValueIDs}}) {
      my $ValueID = $Vector->{ValueIDs}[$Index];
      my $Value = $Vector->{Values}[$Index];

      # First occurrence stores the value as is; later ones are added to it...
      if (exists $ValueOfIDRef->{$ValueID}) {
        $ValueOfIDRef->{$ValueID} += $Value;
      }
      else {
        $ValueOfIDRef->{$ValueID} = $Value;
      }
      $AllValueIDs{$ValueID} = 1;
    }
  }

  # Setup ordered values...
  my @UniqueOrderedValueIDs = sort keys %AllValueIDs;

  my @OrderedValuesA = map { exists $ValueOfIDInA{$_} ? $ValueOfIDInA{$_} : 0 } @UniqueOrderedValueIDs;
  my @OrderedValuesB = map { exists $ValueOfIDInB{$_} ? $ValueOfIDInB{$_} : 0 } @UniqueOrderedValueIDs;

  $FingerprintsVectorA->{OrderedValuesRef} = \@OrderedValuesA;
  $FingerprintsVectorB->{OrderedValuesRef} = \@OrderedValuesB;
}

# Process fingerprints vectors with alpha numerical values for similarity calculations...
#
# Each distinct value is counted within each vector. The union of distinct values
# from both vectors, sorted as strings, defines the positions of the ordered
# values; a value missing from a vector contributes a count of 0 for that vector.
# The results are stored as OrderedValuesRef in each vector.
#
sub _ProcessAlphaNumericalValuesFingerprintsVectorsForSimilarityCalculation {
  my($FingerprintsVectorA, $FingerprintsVectorB) = @_;

  # Set up unique values and their count for each vector...
  my(%CountOfValueInA, %CountOfValueInB, %AllValues);

  for my $VectorAndMap ([$FingerprintsVectorA, \%CountOfValueInA], [$FingerprintsVectorB, \%CountOfValueInB]) {
    my($Vector, $CountOfValueRef) = @{$VectorAndMap};

    for my $Value (@{$Vector->{Values}}) {
      $CountOfValueRef->{$Value} += 1;
      $AllValues{$Value} = 1;
    }
  }

  # Setup ordered values...
  my @UniqueOrderedValueIDs = sort keys %AllValues;

  my @OrderedValuesA = map { exists $CountOfValueInA{$_} ? $CountOfValueInA{$_} : 0 } @UniqueOrderedValueIDs;
  my @OrderedValuesB = map { exists $CountOfValueInB{$_} ? $CountOfValueInB{$_} : 0 } @UniqueOrderedValueIDs;

  $FingerprintsVectorA->{OrderedValuesRef} = \@OrderedValuesA;
  $FingerprintsVectorB->{OrderedValuesRef} = \@OrderedValuesB;

}

# Transform final ordered values array into a BitVector for similarity calculation...
#
# Both bit vectors are created with a size equal to the number of ordered values
# in the first vector. A bit is set at each position whose ordered value is true
# in Perl's boolean sense. The results are stored as BitVector in each vector.
#
sub _TransformFinalOrderedValuesIntoBitVectorsForSimilarityCalculation {
  my($FingerprintsVectorA, $FingerprintsVectorB) = @_;

  # Create bit vectors...
  my $Size = scalar @{$FingerprintsVectorA->{OrderedValuesRef}};

  # Direct method call syntax: this package defines its own new(), which makes
  # the indirect form "new BitVector(...)" depend on parser heuristics...
  $FingerprintsVectorA->{BitVector} = BitVector->new($Size);
  $FingerprintsVectorB->{BitVector} = BitVector->new($Size);

  # Set bits...
  my $SkipCheck = 1;
  for my $Index (0 .. ($Size - 1)) {
    if ($FingerprintsVectorA->{OrderedValuesRef}[$Index]) {
      $FingerprintsVectorA->{BitVector}->SetBit($Index, $SkipCheck);
    }
    if ($FingerprintsVectorB->{OrderedValuesRef}[$Index]) {
      $FingerprintsVectorB->{BitVector}->SetBit($Index, $SkipCheck);
    }
  }
}

# Return sum of ordered vector values...
#
# The reference to ordered values is handed over to StatisticsUtil::Sum and its
# result is returned.
#
sub _GetSumOfFingerprintsOrderedValues {
  my($FingerprintVector) = @_;

  return StatisticsUtil::Sum($FingerprintVector->{OrderedValuesRef});
}

# Return sum of squared ordered vector values...
#
# Passes the array reference stored under OrderedValuesRef to
# StatisticsUtil::SumOfSquares and returns its result.
#
sub _GetSumOfSquaresOfFingerprintsOrderedValues {
  my($FingerprintVector) = @_;
  my $OrderedValuesRef = $FingerprintVector->{OrderedValuesRef};

  return StatisticsUtil::SumOfSquares($OrderedValuesRef);
}

# Return sum of product of corresponding ordered vector values...
#
# Walks the positions of the first vector's ordered values and accumulates the
# product of the values found at the same position in both vectors.
#
sub _GetSumOfProductOfFingerprintsOrderedValues {
  my($FingerprintsVectorA, $FingerprintsVectorB) = @_;

  my $ValuesARef = $FingerprintsVectorA->{OrderedValuesRef};
  my $ValuesBRef = $FingerprintsVectorB->{OrderedValuesRef};

  my $Total = 0;
  for my $Position (0 .. $#{$ValuesARef}) {
    $Total += $ValuesARef->[$Position] * $ValuesBRef->[$Position];
  }

  return $Total;
}

# Return sum of absolute value of subtraction of corresponding ordered vector values...
#
# Walks the positions of the first vector's ordered values and accumulates the
# absolute difference of the values found at the same position in both vectors.
#
sub _GetSumOfAbsoluteValueOfSubtractionOfFingerprintsOrderedValues {
  my($FingerprintsVectorA, $FingerprintsVectorB) = @_;

  my $ValuesARef = $FingerprintsVectorA->{OrderedValuesRef};
  my $ValuesBRef = $FingerprintsVectorB->{OrderedValuesRef};

  my $Total = 0;
  for my $Position (0 .. $#{$ValuesARef}) {
    my $Difference = $ValuesARef->[$Position] - $ValuesBRef->[$Position];
    $Total += abs($Difference);
  }

  return $Total;
}

# Return sum of squares of subtraction of corresponding ordered vector values...
#
# Walks the positions of the first vector's ordered values and accumulates the
# squared difference of the values found at the same position in both vectors.
#
sub _GetSumOfSquaresOfSubtractionOfFingerprintsOrderedValues {
  my($FingerprintsVectorA, $FingerprintsVectorB) = @_;

  my $ValuesARef = $FingerprintsVectorA->{OrderedValuesRef};
  my $ValuesBRef = $FingerprintsVectorB->{OrderedValuesRef};

  my $Total = 0;
  for my $Position (0 .. $#{$ValuesARef}) {
    my $Difference = $ValuesARef->[$Position] - $ValuesBRef->[$Position];
    $Total += $Difference ** 2;
  }

  return $Total;
}

# Return sum of minimum of corresponding ordered vector values...
#
# Walks the positions of the first vector's ordered values and accumulates
# MathUtil::min of the values found at the same position in both vectors.
#
sub _GetSumOfMinimumOfFingerprintsOrderdedValues {
  my($FingerprintsVectorA, $FingerprintsVectorB) = @_;

  my $ValuesARef = $FingerprintsVectorA->{OrderedValuesRef};
  my $ValuesBRef = $FingerprintsVectorB->{OrderedValuesRef};

  my $Total = 0;
  for my $Position (0 .. $#{$ValuesARef}) {
    $Total += MathUtil::min($ValuesARef->[$Position], $ValuesBRef->[$Position]);
  }

  return $Total;
}

# Return sum of maximum of corresponding ordered vector values...
#
# Walks the positions of the first vector's ordered values and accumulates
# MathUtil::max of the values found at the same position in both vectors.
#
sub _GetSumOfMaximumOfFingerprintsOrderdedValues {
  my($FingerprintsVectorA, $FingerprintsVectorB) = @_;

  my $ValuesARef = $FingerprintsVectorA->{OrderedValuesRef};
  my $ValuesBRef = $FingerprintsVectorB->{OrderedValuesRef};

  my $Total = 0;
  for my $Position (0 .. $#{$ValuesARef}) {
    $Total += MathUtil::max($ValuesARef->[$Position], $ValuesBRef->[$Position]);
  }

  return $Total;
}

# Get number of Na, Nb and Nc bits in vector A and B for BinaryForm calculation...
#
# Reads the object stored under the BitVector key of each fingerprints vector and
# returns a three element list: the set-bit count of A, the set-bit count of B, and
# the set-bit count of the result of applying the & operator to the two bit vectors.
#
sub _GetNumOfIndividualAndCommonSetBits ($$) {
  my($FingerprintsVectorA, $FingerprintsVectorB) = @_;

  my $BitsOfA = $FingerprintsVectorA->{BitVector};
  my $BitsOfB = $FingerprintsVectorB->{BitVector};

  # Number of bits set to "1" in A, then in B...
  my $NumSetInA = $BitsOfA->GetNumOfSetBits();
  my $NumSetInB = $BitsOfB->GetNumOfSetBits();

  # Number of bits set to "1" in both A and B...
  my $BitsInCommon = $BitsOfA & $BitsOfB;
  my $NumSetInBoth = $BitsInCommon->GetNumOfSetBits();

  return ($NumSetInA, $NumSetInB, $NumSetInBoth);
}

# Return a list of supported distance coefficients...
#
# Takes no arguments and returns the contents of the file-level
# @DistanceCoefficients list.
#
sub GetSupportedDistanceCoefficients () {

  return @DistanceCoefficients;
}

# Return a list of supported similarity coefficients...
#
# Takes no arguments and returns the contents of the file-level
# @SimilarityCoefficients list.
#
sub GetSupportedSimilarityCoefficients () {

  return @SimilarityCoefficients;
}

# Return a list of supported distance and similarity coefficients...
#
# The combined names are sorted into an array before being returned: the value of
# sort in scalar context is undefined in Perl, whereas returning an array yields the
# sorted names in list context and their count in scalar context.
#
sub GetSupportedDistanceAndSimilarityCoefficients () {
  my @DistanceAndSimilarityCoefficients = sort (@DistanceCoefficients, @SimilarityCoefficients);

  return @DistanceAndSimilarityCoefficients;
}

# Is it a fingerprints vector object?
#
# Exported wrapper around _IsFingerprintsVector; returns 1 or 0.
#
sub IsFingerprintsVector ($) {
  my($Object) = @_;

  return _IsFingerprintsVector($Object);
}

# Is it a fingerprints vector object?
#
# Returns 1 when the argument is a blessed reference whose class is, or inherits
# from, the class named in $ClassName; returns 0 otherwise.
#
sub _IsFingerprintsVector {
  my($Object) = @_;

  return (Scalar::Util::blessed($Object) && $Object->isa($ClassName)) ? 1 : 0;
}

# Return a string containing vector values...
#
# The string reports the vector type, the number of values, the number of non-zero
# values (only for types matching OrderedNumericalValues or NumericalValues), the
# values, the number of value IDs and the value IDs. A list holding
# $MaxValuesToStringify or more entries is cut to its first $MaxValuesToStringify
# entries and marked as truncated; an empty list is shown as 'None'.
#
# The truncation marker is decided separately for values and value IDs, so a long
# vector without any value IDs is no longer reported as having truncated value IDs.
#
sub StringifyFingerprintsVector {
  my($This) = @_;
  my $MaxValuesToStringify = 500;

  # Returns (LabelSuffix, JoinedEntries) for a list with the specified reported count...
  my $FormatEntries = sub {
    my($NumOfEntries, $EntriesRef) = @_;

    if ($NumOfEntries < $MaxValuesToStringify) {
      return ('', $NumOfEntries ? join(' ', @{$EntriesRef}) : 'None');
    }
    return (" (Truncated after $MaxValuesToStringify)", join(' ', @{$EntriesRef}[0 .. ($MaxValuesToStringify - 1)]) . " ...");
  };

  my $NumOfValues = $This->GetNumOfValues();
  my($ValuesLabel, $ValuesString) = $FormatEntries->($NumOfValues, $This->{Values});

  my $NumOfValueIDs = $This->GetNumOfValueIDs();
  my($ValueIDsLabel, $ValueIDsString) = $FormatEntries->($NumOfValueIDs, $This->{ValueIDs});

  my $FingerprintsVectorString = "Type: $This->{Type}; NumOfValues: $NumOfValues";
  if ($This->{Type} =~ /^(OrderedNumericalValues|NumericalValues)$/i) {
    my $NumOfNonZeroValues = $This->GetNumOfNonZeroValues();
    $FingerprintsVectorString .= "; NumOfNonZeroValues: $NumOfNonZeroValues";
  }

  # Append all the values and value IDs...
  $FingerprintsVectorString .= "; Values${ValuesLabel}: <$ValuesString>; NumOfValueIDs: $NumOfValueIDs; ValueIDs${ValueIDsLabel}: <$ValueIDsString>";

  return $FingerprintsVectorString;
}