MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: SimilaritySearchingFingerprints.pl,v $
   4 # $Date: 2015/02/28 20:46:21 $
   5 # $Revision: 1.18 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2015 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use strict;
  30 use FindBin; use lib "$FindBin::Bin/../lib";
  31 use Getopt::Long;
  32 use File::Basename;
  33 use Text::ParseWords;
  34 use Benchmark;
  35 use FileUtil;
  36 use TextUtil;
  37 use SDFileUtil;
  38 use StatisticsUtil;
  39 use PseudoHeap;
  40 use Fingerprints::FingerprintsFileUtil;
  41 use Fingerprints::FingerprintsBitVector;
  42 use Fingerprints::FingerprintsVector;
  43 
  # File-scoped state shared with the subroutines defined further down in this script...
  my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  my(@FingerprintsFilesList, %OptionsInfo);
  my(%FingerprintsFilesInfo, %OutputFilesInfo, %SimilaritySearchInfo);
  my(%SimilaritySearchResults, %DatabaseFingerprintsFileData);

  # Autoflush the currently selected output handle so progress messages appear right away...
  $| = 1;

  # Announce the start of the run and start the clock...
  $ScriptName = basename($0);
  print "\n$ScriptName: Starting...\n\n";
  $StartTime = Benchmark->new;

  # Set up script usage; print usage and quit when help is requested or when the
  # number of remaining command line arguments is not exactly two...
  SetupScriptUsage();
  die GetUsageFromPod("$FindBin::Bin/$ScriptName") if ($Options{help} || @ARGV != 2);

  # Process reference and database file names...
  ProcessFingerprintsFileNames();

  # Process command line options...
  print "Processing options...\n";
  ProcessOptions();

  # Set up information about fingerprints input files and SD/text output files...
  print "Checking and retrieving information from reference and database fingerprints files...\n";
  RetrieveFingerprintsFilesInfo();

  # Run the similarity search and write out the results...
  print "Performing similarity search...\n";
  PerformSimilaritySearch();

  print "\n$ScriptName:Done...\n\n";

  # Report the total run time...
  $EndTime = Benchmark->new;
  $TotalTime = timediff($EndTime, $StartTime);
  print "Total time: ", timestr($TotalTime), "\n";
  84 
  85 ###############################################################################
  86 
  # Drive the complete similarity search: read reference fingerprints, set up empty
  # result containers, compare database fingerprints against the references and
  # write out the result files...
  #
  sub PerformSimilaritySearch {
    print "\nProcessing fingerprints data for reference molecules...\n";

    # Reference data must be available before result containers are set up, as
    # one container is created for each reference compound ID in individual mode...
    ReadReferenceFingerprintsData();
    InitializeSimilaritySearchResults();

    # Compare, collect and report...
    GenerateSimilaritySearchResults();
    WriteSimilaritySearchResultFiles();
  }
  98 
  # Find similar molecules from database molecules for individual or multiple reference molecules...
  #
  # The database fingerprints file is read entry by entry. Entries with invalid fingerprints
  # data are counted and skipped. Every valid database entry is compared against each
  # reference fingerprints object:
  #
  #   . IndividualReference mode: each defined comparison value is collected under its
  #     reference compound ID.
  #   . MultipleReferences mode: the defined comparison values for the database entry are
  #     fused into one value, which is collected when it is defined.
  #
  # A summary of total, processed and ignored entries is printed at the end.
  #
  sub GenerateSimilaritySearchResults {
    my($DatabaseFingerprintsFileIO, $FingerprintsCount, $IgnoredFingerprintsCount, $DatabaseFingerprintsObject, $DatabaseCmpdID, $ReferenceFingerprintsObject, $ReferenceCmpdID, $ComparisonValue, $FusedComparisonValue, @ComparisonValues);

    print "Processing fingerprints data for database molecules...\n";

    ($FingerprintsCount, $IgnoredFingerprintsCount) = (0) x 2;

    $DatabaseFingerprintsFileIO = Fingerprints::FingerprintsFileUtil::NewFingerprintsFileIO(%{$FingerprintsFilesInfo{Database}{FingerprintsFileIOParameters}});
    $DatabaseFingerprintsFileIO->Open();

    @ComparisonValues = ();

    DATABASEFP: while ($DatabaseFingerprintsFileIO->Read()) {
      $FingerprintsCount++;

      if (!$DatabaseFingerprintsFileIO->IsFingerprintsDataValid()) {
        $IgnoredFingerprintsCount++;
        next DATABASEFP;
      }
      $DatabaseFingerprintsObject = $DatabaseFingerprintsFileIO->GetFingerprints();
      $DatabaseCmpdID = $DatabaseFingerprintsFileIO->GetCompoundID();

      # Comparison values are fused per database entry; start with a clean list...
      if ($SimilaritySearchInfo{MultipleReferencesMode}) {
        @ComparisonValues = ();
      }

      REFERENCEFP: for my $ReferenceIndex (0 .. $#{$SimilaritySearchInfo{ReferenceCmpdIDsRef}}) {
        $ReferenceCmpdID = $SimilaritySearchInfo{ReferenceCmpdIDsRef}->[$ReferenceIndex];
        $ReferenceFingerprintsObject = $SimilaritySearchInfo{ReferenceFingerprintsObjectsRef}->[$ReferenceIndex];

        # An undefined value means the comparison failed or didn't pass the cutoff...
        $ComparisonValue = CompareReferenceAndDatabaseFingerprintsPair($ReferenceFingerprintsObject, $DatabaseFingerprintsObject);
        if (!defined $ComparisonValue) {
          next REFERENCEFP;
        }

        if ($SimilaritySearchInfo{IndividualReferenceMode}) {
          CollectSimilaritySearchResults($DatabaseFingerprintsFileIO, $DatabaseCmpdID, $ComparisonValue, $ReferenceCmpdID);
        }
        elsif ($SimilaritySearchInfo{MultipleReferencesMode}) {
          push @ComparisonValues, $ComparisonValue;
        }
      }

      if ($SimilaritySearchInfo{MultipleReferencesMode}) {
        # No fused value is available when none of the comparisons produced a value...
        $FusedComparisonValue = CalculateGroupFusionComparisonValue(\@ComparisonValues);
        if (!defined $FusedComparisonValue) {
          next DATABASEFP;
        }
        CollectSimilaritySearchResults($DatabaseFingerprintsFileIO, $DatabaseCmpdID, $FusedComparisonValue);
      }
    }
    $DatabaseFingerprintsFileIO->Close();

    print "Number of fingerprints data entries in database fingerprints file: $FingerprintsCount\n";
    print "Number of fingerprints data entries processed successfully: ", ($FingerprintsCount - $IgnoredFingerprintsCount)  , "\n";
    print "Number of fingerprints data entries ignored due to missing/invalid data: $IgnoredFingerprintsCount\n\n";
  }
 158 
 # Compare one reference fingerprints object against one database fingerprints object
 # using the configured comparison method and its parameters...
 #
 # Returns the comparison value formatted to the configured precision. Returns undef,
 # after a warning, when the comparison method yields no value; also returns undef when
 # a comparison cutoff is in effect and the formatted value doesn't satisfy it.
 #
 sub CompareReferenceAndDatabaseFingerprintsPair {
   my($ReferenceFingerprintsObject, $DatabaseFingerprintsObject) = @_;

   # The comparison method is invoked by name on the reference object...
   my $MethodName = $SimilaritySearchInfo{ComparisonMethod};
   my $Value = $ReferenceFingerprintsObject->$MethodName($DatabaseFingerprintsObject, @{$SimilaritySearchInfo{ComparisonMethodParameters}});

   unless (defined $Value) {
     warn "Warning: Ignoring fingerprints data for reference compound ID ",  $ReferenceFingerprintsObject->GetID(), ": Its comparison with database compound ID, ", $DatabaseFingerprintsObject->GetID(), ", failed.\n";
     return undef;
   }

   # The cutoff is applied to the value as it would be written out...
   $Value = sprintf("%.$OptionsInfo{Precision}f", $Value);

   if (!$SimilaritySearchInfo{ApplyComparisonCutoff}) {
     return $Value;
   }

   # KeepTop: value must be at or above the cutoff; otherwise at or below it...
   my $Cutoff = $SimilaritySearchInfo{ComparisonCutoff};
   my $PassesCutoff = $SimilaritySearchInfo{KeepTop} ? ($Value >= $Cutoff) : ($Value <= $Cutoff);

   return $PassesCutoff ? $Value : undef;
 }
 184 
 # Fuse the comparison values of one database entry against multiple references into
 # a single value using the configured group fusion method...
 #
 # Returns undef for an empty list of values. When sorting is enabled, values are
 # ordered descending for KeepTop and ascending otherwise, and optionally truncated
 # to the first kNN values before fusion.
 #
 sub CalculateGroupFusionComparisonValue {
   my($ComparisonValuesRef) = @_;

   return undef unless @{$ComparisonValuesRef};

   if ($SimilaritySearchInfo{SortComparisonValues}) {
     my @SortedValues = $SimilaritySearchInfo{KeepTop}
       ? (sort { $b <=> $a } @{$ComparisonValuesRef})
       : (sort { $a <=> $b } @{$ComparisonValuesRef});

     # Keep only top kNN values for group fusion...
     if ($SimilaritySearchInfo{UsekNN} && ($OptionsInfo{kNN} < scalar @SortedValues)) {
       splice @SortedValues, $OptionsInfo{kNN};
     }
     $ComparisonValuesRef = \@SortedValues;
   }

   my $FusedValue = $SimilaritySearchInfo{GroupFusionMethodRef}->($ComparisonValuesRef);

   # Only fusion rules flagged for it get their result formatted to the configured precision...
   if ($SimilaritySearchInfo{ApplyPrecisionDuringFusion}) {
     $FusedValue = sprintf("%.$OptionsInfo{Precision}f", $FusedValue);
   }

   return $FusedValue;
 }
 211 
 # Record a comparison value for a database compound in the appropriate results heap...
 #
 # With a reference compound ID, the heap stored under that ID is used; without one,
 # the shared heap stored under ResultsPseudoHeap is used. Database file data for the
 # compound is collected as well when that has been requested.
 #
 sub CollectSimilaritySearchResults {
   my($DatabaseFingerprintsFileIO, $DatabaseCmpdID, $ComparisonValue, $ReferenceCmpdID) = @_;

   my $ResultsKey = defined $ReferenceCmpdID ? $ReferenceCmpdID : 'ResultsPseudoHeap';
   $SimilaritySearchResults{$ResultsKey}->AddKeyValuePair($ComparisonValue, $DatabaseCmpdID);

   CollectDatabaseFileData($DatabaseCmpdID, $DatabaseFingerprintsFileIO) if $FingerprintsFilesInfo{Database}{CollectInputFileData};
 }
 228 
 # Reset result containers before a search...
 #
 # One PseudoHeap is created per reference compound ID in individual reference mode;
 # a single PseudoHeap stored under ResultsPseudoHeap is created in multiple references
 # mode. Any previously collected database file data is discarded.
 #
 sub InitializeSimilaritySearchResults {
   # All heaps share the same configuration...
   my @HeapParameters = (
     'Type' => ($SimilaritySearchInfo{KeepTop} ? 'KeepTopN' : 'KeepBottomN'),
     'KeyType' => 'Numeric',
     'MaxSize' => $OptionsInfo{MaxSimilarMolecules},
   );

   %SimilaritySearchResults = ();
   %DatabaseFingerprintsFileData = ();

   if ($SimilaritySearchInfo{IndividualReferenceMode}) {
     for my $ReferenceCmpdID (@{$SimilaritySearchInfo{ReferenceCmpdIDsRef}}) {
       $SimilaritySearchResults{$ReferenceCmpdID} = PseudoHeap->new(@HeapParameters);
     }
   }
   elsif ($SimilaritySearchInfo{MultipleReferencesMode}) {
     $SimilaritySearchResults{ResultsPseudoHeap} = PseudoHeap->new(@HeapParameters);
   }
 }
 247 
 # Write collected search results to the SD and/or text output files...
 #
 # Results are written heap by heap: one heap per reference compound ID, in reference
 # order, for individual reference mode, or the single shared heap for multiple
 # references mode. Within a heap, keys are visited in the order returned by
 # GetSortedKeys and all database compound IDs stored under a key are written.
 #
 sub WriteSimilaritySearchResultFiles {
   my($NewSDFileRef, $NewTextFileRef) = SetupAndOpenOutputFiles();

   # Each entry pairs a results heap with the reference compound ID it belongs to, if any...
   my @ResultSets = ();
   if ($SimilaritySearchInfo{IndividualReferenceMode}) {
     @ResultSets = map { [$SimilaritySearchResults{$_}, $_] } @{$SimilaritySearchInfo{ReferenceCmpdIDsRef}};
   }
   elsif ($SimilaritySearchInfo{MultipleReferencesMode}) {
     @ResultSets = ([$SimilaritySearchResults{ResultsPseudoHeap}]);
   }

   for my $ResultSet (@ResultSets) {
     my($ResultsHeap, $ReferenceCmpdID) = @{$ResultSet};
     my @ReferenceArgs = defined $ReferenceCmpdID ? ($ReferenceCmpdID) : ();

     for my $ComparisonValue ($ResultsHeap->GetSortedKeys()) {
       for my $DatabaseCmpdID ($ResultsHeap->GetKeyValues($ComparisonValue)) {
         WriteDataToOutputFiles($NewSDFileRef, $NewTextFileRef, $ComparisonValue, $DatabaseCmpdID, @ReferenceArgs);
       }
     }
   }

   close $NewSDFileRef if $NewSDFileRef;
   close $NewTextFileRef if $NewTextFileRef;
 }
 279 
 # Write one search result to whichever output files are open...
 #
 # SD output gets the molecule block, an optional ReferenceCmpdID field, DatabaseCmpdID
 # and ComparisonValue fields, any database data fields and the record terminator. Text
 # output gets one delimited line: optional reference compound ID, database compound ID,
 # comparison value and any database data values.
 #
 sub WriteDataToOutputFiles {
   my($NewSDFileRef, $NewTextFileRef, $ComparisonValue, $DatabaseCmpdID, $ReferenceCmpdID) = @_;
   my $HasReferenceCmpdID = defined $ReferenceCmpdID;

   if ($NewSDFileRef) {
     WriteMolStringDataToSDOutputFile($DatabaseCmpdID, $NewSDFileRef);

     print $NewSDFileRef  ">  <ReferenceCmpdID>\n$ReferenceCmpdID\n\n" if $HasReferenceCmpdID;
     print $NewSDFileRef  ">  <DatabaseCmpdID>\n$DatabaseCmpdID\n\n>  <ComparisonValue>\n$ComparisonValue\n\n";

     WriteDatabaseDataToSDOutputFile($DatabaseCmpdID, $NewSDFileRef);
     print $NewSDFileRef "\$\$\$\$\n";
   }

   if ($NewTextFileRef) {
     my @LineWords = $HasReferenceCmpdID ? ($ReferenceCmpdID) : ();
     push @LineWords, ($DatabaseCmpdID, $ComparisonValue);

     # Append database data only when data fields or data columns are to be written...
     my $OutputDatabaseData = $FingerprintsFilesInfo{Database}{OutputDataFields} || $FingerprintsFilesInfo{Database}{OutputDataCols};
     push @LineWords, RetrieveDatabaseDataForTextOutputFile($DatabaseCmpdID) if $OutputDatabaseData;

     print $NewTextFileRef JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}), "\n";
   }
 }
 310 
 # Open the requested output files for writing...
 #
 # Returns a list of two file handle references, (SD file, text file); the entry for
 # an output type which has not been requested is undef. Column labels are written to
 # the text file right after it's opened. Dies when a file can't be opened.
 #
 # Three-argument open is used so that the output file name is always treated as a
 # plain file name, whatever characters or surrounding whitespace it contains.
 #
 sub SetupAndOpenOutputFiles {
   my($NewSDFileRef, $NewTextFileRef, $NewSDFile, $NewTextFile);

   ($NewSDFileRef, $NewTextFileRef) = (undef) x 2;

   if ($OptionsInfo{SDOutput}) {
     $NewSDFile = $OutputFilesInfo{SDOutFileName};
     print "Generating SD file $NewSDFile...\n";
     open NEWSDFILE, '>', $NewSDFile or die "Error: Couldn't open $NewSDFile: $! \n";
     $NewSDFileRef = \*NEWSDFILE;
   }

   if ($OptionsInfo{TextOutput}) {
     $NewTextFile = $OutputFilesInfo{TextOutFileName};
     print "Generating text file $NewTextFile...\n";
     open NEWTEXTFILE, '>', $NewTextFile or die "Error: Couldn't open $NewTextFile: $! \n";
     $NewTextFileRef = \*NEWTEXTFILE;

     WriteTextFileCoulmnLabels(\*NEWTEXTFILE);
   }

   return ($NewSDFileRef, $NewTextFileRef);
 }
 336 
 # Write the column labels line to the text output file...
 #
 # The labels start with the search result columns for the current search mode and
 # are followed by the labels of any database data fields or data columns which are
 # written along with the results.
 #
 sub WriteTextFileCoulmnLabels {
   my($NewTextFileRef) = @_;
   my @Labels = ();

   # Search result columns...
   if ($SimilaritySearchInfo{IndividualReferenceMode}) {
     @Labels = qw(ReferenceCompoundID DatabaseCompoundID ComparisonValue);
   }
   elsif ($SimilaritySearchInfo{MultipleReferencesMode}) {
     @Labels = qw(DatabaseCompoundID ComparisonValue);
   }

   # Database data columns; data fields take precedence over data columns...
   my $DatabaseInfoRef = $FingerprintsFilesInfo{Database};
   if ($DatabaseInfoRef->{OutputDataFields}) {
     push @Labels, @{$DatabaseInfoRef->{DataFieldsToOutput}};
   }
   elsif ($DatabaseInfoRef->{OutputDataCols}) {
     push @Labels, @{$DatabaseInfoRef->{DataColLabelsToOutput}};
   }

   my $Line = JoinWords(\@Labels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
   print $NewTextFileRef "$Line\n";
 }
 363 
 # Write the molecule block for a database compound to the SD output file...
 #
 # When compound strings have been collected, everything in front of the first
 # "M  END" in the stored compound string is written, followed by a "M  END" line.
 # Otherwise, the lines generated by SDFileUtil::GenerateEmptyCtabBlockLines are written.
 #
 sub WriteMolStringDataToSDOutputFile {
   my($DatabaseCmpdID, $NewSDFileRef) = @_;

   if (!$FingerprintsFilesInfo{Database}{CollectCmpdStringData}) {
     # No structure data is available; write out the generated empty block instead...
     print $NewSDFileRef SDFileUtil::GenerateEmptyCtabBlockLines(), "\n";
     return;
   }

   my($MolString) = split /M  END/, $DatabaseFingerprintsFileData{$DatabaseCmpdID};
   print $NewSDFileRef "$MolString\nM  END\n";
 }
 380 
 # Write database data for a compound as data fields to the SD output file...
 #
 # For data fields output, the stored compound string is parsed for data field labels
 # and values; either the compound's own labels or the configured list of labels is
 # written, with an empty value for any label not present. For data columns output,
 # the stored data line words are written under the labels mapped to the column numbers.
 #
 sub WriteDatabaseDataToSDOutputFile {
   my($DatabaseCmpdID, $NewSDFileRef) = @_;
   my $DatabaseInfoRef = $FingerprintsFilesInfo{Database};

   if ($DatabaseInfoRef->{OutputDataFields}) {
     my @CmpdLines = split /\n/, $DatabaseFingerprintsFileData{$DatabaseCmpdID};
     my %ValueOfLabel = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);

     my @LabelsToWrite = $DatabaseInfoRef->{OutputCurrentDataFields} ? GetCmpdDataHeaderLabels(\@CmpdLines) : @{$DatabaseInfoRef->{DataFieldsToOutput}};

     for my $Label (@LabelsToWrite) {
       my $Value = exists $ValueOfLabel{$Label} ? $ValueOfLabel{$Label} : '';
       print $NewSDFileRef  ">  <$Label>\n$Value\n\n";
     }
   }
   elsif ($DatabaseInfoRef->{OutputDataCols}) {
     for my $ColNum (@{$DatabaseInfoRef->{DataColNumsToOutput}}) {
       my $Label = $DatabaseInfoRef->{DataColNumToLabelMap}{$ColNum};
       my $Value = $DatabaseFingerprintsFileData{$DatabaseCmpdID}->[$ColNum];
       print $NewSDFileRef  ">  <$Label>\n$Value\n\n";
     }
   }
 }
 407 
 # Retrieve database data from SD or Text database file for text output file...
 #
 # Returns a list of values to append to a text output line for the specified
 # database compound ID:
 #
 #   . Data fields output: one value for each label in DataFieldsToOutput, taken from
 #     the stored compound string; an empty string is used for a missing label.
 #   . Data columns output: one value for each column number in DataColNumsToOutput,
 #     taken from the stored data line words. The column numbers themselves are used
 #     as indices, which matches how the same data is written to the SD output file
 #     and keeps values lined up with the column labels. An empty string is used for
 #     any missing value, and for every column when no data line has been stored.
 #
 # An empty list is returned when neither kind of output has been requested.
 #
 sub RetrieveDatabaseDataForTextOutputFile {
   my($DatabaseCmpdID) = @_;

   if ($FingerprintsFilesInfo{Database}{OutputDataFields}) {
     my(@CmpdLines, %DataFieldLabelAndValues);

     @CmpdLines = split /\n/, $DatabaseFingerprintsFileData{$DatabaseCmpdID};
     %DataFieldLabelAndValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);

     return map { exists $DataFieldLabelAndValues{$_} ? $DataFieldLabelAndValues{$_} : ''} @{$FingerprintsFilesInfo{Database}{DataFieldsToOutput}};
   }
   elsif ($FingerprintsFilesInfo{Database}{OutputDataCols}) {
     my(@DataColNums);

     @DataColNums = @{$FingerprintsFilesInfo{Database}{DataColNumsToOutput}};

     if (exists $DatabaseFingerprintsFileData{$DatabaseCmpdID}) {
       my($DataLineWordsRef);

       $DataLineWordsRef = $DatabaseFingerprintsFileData{$DatabaseCmpdID};
       return map { defined $DataLineWordsRef->[$_] ? $DataLineWordsRef->[$_] : '' } @DataColNums;
     }
     else {
       # One empty value per output column keeps the line aligned with the column labels...
       return ('') x scalar(@DataColNums);
     }
   }

   return ();
 }
 430 
 # Store database file data needed later to generate result files for a compound...
 #
 # Nothing is done for a compound ID which already has stored data. Depending on what
 # is to be collected, the stored data is the compound string or a reference to the
 # list of data line words retrieved from the fingerprints file IO object.
 #
 sub CollectDatabaseFileData {
   my($DatabaseCmpdID, $DatabaseFingerprintsFileIO) = @_;

   # Data is stored only once per compound ID...
   return if exists $DatabaseFingerprintsFileData{$DatabaseCmpdID};

   my $DatabaseInfoRef = $FingerprintsFilesInfo{Database};

   if ($DatabaseInfoRef->{CollectCmpdStringData}) {
     $DatabaseFingerprintsFileData{$DatabaseCmpdID} = $DatabaseFingerprintsFileIO->GetCompoundString();
   }

   if ($DatabaseInfoRef->{CollectDataLine}) {
     $DatabaseFingerprintsFileData{$DatabaseCmpdID} = [ $DatabaseFingerprintsFileIO->GetDataLineWords() ];
   }
 }
 452 
 # Read all fingerprints data from the reference fingerprints file...
 #
 # The two values returned by ReadAndProcessFingerpritsData are stored in
 # %SimilaritySearchInfo under ReferenceCmpdIDsRef and ReferenceFingerprintsObjectsRef.
 #
 sub ReadReferenceFingerprintsData {
   my $ReferenceFileIO = Fingerprints::FingerprintsFileUtil::NewFingerprintsFileIO(%{$FingerprintsFilesInfo{Reference}{FingerprintsFileIOParameters}});

   @SimilaritySearchInfo{qw(ReferenceCmpdIDsRef ReferenceFingerprintsObjectsRef)} = Fingerprints::FingerprintsFileUtil::ReadAndProcessFingerpritsData($ReferenceFileIO);
 }
 462 
 # Reset and fill in information about input fingerprints files, comparison setup
 # and output files...
 #
 # The first and second entries of @FingerprintsFilesList are taken as the reference
 # and database fingerprints file names.
 #
 sub RetrieveFingerprintsFilesInfo {
   # Start from a clean slate...
   %OutputFilesInfo = ();
   %SimilaritySearchInfo = ();
   %FingerprintsFilesInfo = (
     Reference => { FileName => $FingerprintsFilesList[0] },
     Database => { FileName => $FingerprintsFilesList[1] },
   );

   # Information about the two input files...
   RetrieveReferenceFingerprintsFileInfo();
   RetrieveDatabaseFingerprintsFileInfo();

   # Comparison method and its parameters; relies on the input files information...
   SetupReferenceAndDatabaseFingerprintsComparisonInfo();

   # Information for output files...
   RetrieveOutputFilesInfo();
 }
 488 
 # Setup reference and database fingerprints comparison method and associated method parameters...
 #
 # Makes sure the first fingerprints strings in reference and database files are of the
 # same type and then fills in %SimilaritySearchInfo with:
 #
 #   . IndividualReferenceMode / MultipleReferencesMode - search mode flags
 #   . ComparisonMethod / ComparisonMethodParameters - method name and extra arguments
 #   . ComparisonCutoff / KeepTop / ApplyComparisonCutoff - cutoff handling
 #   . GroupFusionMethodRef / ApplyPrecisionDuringFusion - group fusion rule
 #   . UsekNN / SortComparisonValues - handling of values before group fusion
 #
 # Fingerprints string types and descriptions are compared as plain strings, ignoring
 # case, so that any regular expression metacharacters in them are taken literally.
 #
 # Dies when fingerprints string types or modes don't match, when the fingerprints
 # string mode is not supported, or when search mode or group fusion rule is not valid.
 # A mismatch in fingerprints string descriptions only generates a warning.
 #
 sub SetupReferenceAndDatabaseFingerprintsComparisonInfo {

   # Make sure reference and database fingerprints string match...
   if ((lc($FingerprintsFilesInfo{Reference}{FirstFingerprintsStringType}) ne lc($FingerprintsFilesInfo{Database}{FirstFingerprintsStringType})) ||
      ($FingerprintsFilesInfo{Reference}{FingerprintsBitVectorStringMode} != $FingerprintsFilesInfo{Database}{FingerprintsBitVectorStringMode}) ||
      ($FingerprintsFilesInfo{Reference}{FingerprintsVectorStringMode} != $FingerprintsFilesInfo{Database}{FingerprintsVectorStringMode}) ) {
     die "Error: First reference fingerprints string type, $FingerprintsFilesInfo{Reference}{FirstFingerprintsStringType}, must match first database fingerprints type, $FingerprintsFilesInfo{Database}{FirstFingerprintsStringType}.\n";
   }

   if (lc($FingerprintsFilesInfo{Reference}{FirstFingerprintsStringDescription}) ne lc($FingerprintsFilesInfo{Database}{FirstFingerprintsStringDescription})) {
     warn "Warning: First reference fingerprints string description, $FingerprintsFilesInfo{Reference}{FirstFingerprintsStringDescription}, doesn't match first database fingerprints string description, $FingerprintsFilesInfo{Database}{FirstFingerprintsStringDescription}.\n";
   }

   # Setup individual reference and multiple references search mode...
   $SimilaritySearchInfo{IndividualReferenceMode} = undef;
   $SimilaritySearchInfo{MultipleReferencesMode} = undef;

   if ($OptionsInfo{Mode} =~ /^IndividualReference$/i) {
     $SimilaritySearchInfo{IndividualReferenceMode} = 1;
   }
   elsif ($OptionsInfo{Mode} =~ /^MultipleReferences$/i) {
     $SimilaritySearchInfo{MultipleReferencesMode} = 1;
   }
   else {
     die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: IndividualReference, MultipleReferences\n";
   }

   # Set up reference and database fingerprints similarity search method and parameters...
   my($ComparisonMeasure, $ComparisonMethod, $ApplyComparisonCutoff, $ComparisonCutoff, $KeepTop, @ComparisonMethodParameters);

   $SimilaritySearchInfo{ComparisonMethod} = '';
   @{$SimilaritySearchInfo{ComparisonMethodParameters}} = ();

   $SimilaritySearchInfo{ComparisonCutoff} = '';
   $SimilaritySearchInfo{KeepTop} = '';

   $ComparisonMeasure = ''; $ComparisonMethod = '';
   @ComparisonMethodParameters = ();

   FINGERPRINTSTYPE: {
     if ($FingerprintsFilesInfo{Reference}{FingerprintsBitVectorStringMode}) {
       $ComparisonMeasure = $OptionsInfo{SpecifiedBitVectorComparisonMeasure};
       $ComparisonMethod = $OptionsInfo{SpecifiedBitVectorComparisonMeasureMethod};

       # Some bit-vector measures take additional weighting parameters...
       if ($ComparisonMeasure =~ /^TverskySimilarity$/i) {
         push @ComparisonMethodParameters, $OptionsInfo{Alpha};
       }
       elsif ($ComparisonMeasure =~ /^WeightedTverskySimilarity$/i) {
         push @ComparisonMethodParameters, $OptionsInfo{Alpha};
         push @ComparisonMethodParameters, $OptionsInfo{Beta};
       }
       elsif ($ComparisonMeasure =~ /^WeightedTanimotoSimilarity$/i) {
         push @ComparisonMethodParameters, $OptionsInfo{Beta};
       }

       last FINGERPRINTSTYPE;
     }
     if ($FingerprintsFilesInfo{Reference}{FingerprintsVectorStringMode}) {
       my($SkipValuesCheck);

       $ComparisonMeasure = $OptionsInfo{SpecifiedVectorComparisonMeasure};
       $ComparisonMethod = $OptionsInfo{SpecifiedVectorComparisonMeasuresMethod};

       # Vector comparison methods take the comparison mode and a flag to skip values check...
       push @ComparisonMethodParameters, $OptionsInfo{SpecifiedVectorComparisonMode};

       $SkipValuesCheck = $OptionsInfo{Fast} ? 1 : 0;
       push @ComparisonMethodParameters, $SkipValuesCheck;

       last FINGERPRINTSTYPE;
     }
     die "Error: Unknown fingerprints string type. Supported values: FingerprintsBitVectorString or FingerprintsVectorString.\n";
   }

   # Cutoff is always applied in individual reference mode; it's optional during group fusion...
   $ApplyComparisonCutoff = $SimilaritySearchInfo{IndividualReferenceMode} ? 1 : (($SimilaritySearchInfo{MultipleReferencesMode} && $OptionsInfo{GroupFusionApplyCutoff}) ? 1 : 0);

   # For distance methods, similar molecules have the smallest values; for similarity
   # methods, they have the largest values. Any other search mode reverses the choice...
   $ComparisonCutoff = ''; $KeepTop = '';
   if ($ComparisonMethod =~ /Distance/i) {
     $ComparisonCutoff = $OptionsInfo{DistanceCutoff};
     $KeepTop = ($OptionsInfo{SearchMode} =~ /^SimilaritySearch$/i) ? 0 : 1;
   }
   else {
     $ComparisonCutoff = $OptionsInfo{SimilarityCutoff};
     $KeepTop = ($OptionsInfo{SearchMode} =~ /^SimilaritySearch$/i) ? 1 : 0;
   }

   $SimilaritySearchInfo{ComparisonMethod} = $ComparisonMethod;
   @{$SimilaritySearchInfo{ComparisonMethodParameters}} = @ComparisonMethodParameters;

   $SimilaritySearchInfo{ComparisonCutoff} = $ComparisonCutoff;
   $SimilaritySearchInfo{KeepTop} = $KeepTop;
   $SimilaritySearchInfo{ApplyComparisonCutoff} = $ApplyComparisonCutoff;

   # Setup references to group fusion methods...
   $SimilaritySearchInfo{GroupFusionMethodRef} = undef;
   $SimilaritySearchInfo{ApplyPrecisionDuringFusion} = undef;

   FUSIONRULE: {
     if ($OptionsInfo{GroupFusionRule} =~ /^Max$/i) {
       # It's always the first value in the appropriately sorted list using value of KeepTop...
       $SimilaritySearchInfo{GroupFusionMethodRef} = sub { my($ComparisonValuesRef) = @_; return $ComparisonValuesRef->[0]; };
       last FUSIONRULE;
     }
     if ($OptionsInfo{GroupFusionRule} =~ /^Min$/i) {
       # It's always the last value in the appropriately sorted list using value of KeepTop...
       $SimilaritySearchInfo{GroupFusionMethodRef} = sub { my($ComparisonValuesRef) = @_; return $ComparisonValuesRef->[$#{$ComparisonValuesRef}]; };
       last FUSIONRULE;
     }
     if ($OptionsInfo{GroupFusionRule} =~ /^Mean$/i) {
       $SimilaritySearchInfo{GroupFusionMethodRef} = \&StatisticsUtil::Mean;
       $SimilaritySearchInfo{ApplyPrecisionDuringFusion} = 1;
       last FUSIONRULE;
     }
     if ($OptionsInfo{GroupFusionRule} =~ /^Median$/i) {
       $SimilaritySearchInfo{GroupFusionMethodRef} = \&StatisticsUtil::Median;
       $SimilaritySearchInfo{ApplyPrecisionDuringFusion} = 1;
       last FUSIONRULE;
     }
     if ($OptionsInfo{GroupFusionRule} =~ /^Sum$/i) {
       $SimilaritySearchInfo{GroupFusionMethodRef} = \&StatisticsUtil::Sum;
       $SimilaritySearchInfo{ApplyPrecisionDuringFusion} = 1;
       last FUSIONRULE;
     }
     if ($OptionsInfo{GroupFusionRule} =~ /^Euclidean$/i) {
       $SimilaritySearchInfo{GroupFusionMethodRef} = \&StatisticsUtil::Euclidean;
       $SimilaritySearchInfo{ApplyPrecisionDuringFusion} = 1;
       last FUSIONRULE;
     }
     die "Error: The value specified, $Options{groupfusionrule}, for option \"-g, --GroupFusionRule\" is not valid. Allowed values: Max, Min, Mean, Median, Sum, Euclidean\n";
   }

   # Values need to be sorted for Max and Min rules and for picking the first kNN values...
   $SimilaritySearchInfo{UsekNN} = ($OptionsInfo{kNN} !~ /^All$/i) ? 1 : 0;
   $SimilaritySearchInfo{SortComparisonValues} = (($OptionsInfo{GroupFusionRule} =~ /^(Max|Min)$/i) || $SimilaritySearchInfo{UsekNN}) ? 1 : 0;
 }
 624 
 # Collect information about the reference fingerprints file...
 #
 # Stores file type, input delimiter, FingerprintsFileIO parameters and information
 # about fingerprints strings in the file under $FingerprintsFilesInfo{Reference}.
 #
 sub RetrieveReferenceFingerprintsFileInfo {
   my $FingerprintsFile = $FingerprintsFilesInfo{Reference}{FileName};
   my($FileType, $InDelim) = RetrieveFingerprintsFileInfo($FingerprintsFile);

   @{$FingerprintsFilesInfo{Reference}}{qw(FileType InDelim)} = ($FileType, $InDelim);

   # Parameters used later on to create FingerprintsFileIO object for the reference file...
   %{$FingerprintsFilesInfo{Reference}{FingerprintsFileIOParameters}} = RetrieveFingerprintsFileIOParameters('Reference', $FileType, $FingerprintsFile);

   # Fingerprints string mode, type and description information, in the order returned...
   @{$FingerprintsFilesInfo{Reference}}{qw(FingerprintsStringMode FingerprintsBitVectorStringMode FingerprintsVectorStringMode FirstFingerprintsStringType FirstFingerprintsStringDescription)} = RetrieveFingerprintsFileFingerprintsStringInfo('Reference', $FingerprintsFile);
 }
 648 
 # Collect information about the database fingerprints file...
 #
 # Stores file type, input delimiter, FingerprintsFileIO parameters and information
 # about fingerprints strings in the file under $FingerprintsFilesInfo{Database}. It
 # also sets up data fields and data columns output information, decides which
 # database file data needs to be collected for result files, and sets the maximum
 # number of similar molecules to retrieve.
 #
 sub RetrieveDatabaseFingerprintsFileInfo {
   my $FingerprintsFile = $FingerprintsFilesInfo{Database}{FileName};
   my($FileType, $InDelim) = RetrieveFingerprintsFileInfo($FingerprintsFile);

   @{$FingerprintsFilesInfo{Database}}{qw(FileType InDelim)} = ($FileType, $InDelim);

   # Parameters used later on to create FingerprintsFileIO object for the database file...
   %{$FingerprintsFilesInfo{Database}{FingerprintsFileIOParameters}} = RetrieveFingerprintsFileIOParameters('Database', $FileType, $FingerprintsFile);

   # Fingerprints string mode, type and description information, in the order returned...
   @{$FingerprintsFilesInfo{Database}}{qw(FingerprintsStringMode FingerprintsBitVectorStringMode FingerprintsVectorStringMode FirstFingerprintsStringType FirstFingerprintsStringDescription)} = RetrieveFingerprintsFileFingerprintsStringInfo('Database', $FingerprintsFile);

   # Data fields and text file data columns information for output files...
   RetrieveDatabaseFingerprintsDataFieldsInfo($FingerprintsFile, $FileType, $InDelim);
   RetrieveDatabaseFingerprintsDataColsInfo($FingerprintsFile, $FileType, $InDelim);

   # Compound strings are collected for SD files; data lines are collected for text
   # files only when all or specified data columns are to be written out...
   my $CollectCmpdStringData = ($FileType =~ /^SD$/i) ? 1 : 0;
   my $CollectDataLine = ($FileType =~ /^Text$/i && $OptionsInfo{DatabaseDataColsMode} =~ /^(All|Specify)$/i) ? 1 : 0;

   $FingerprintsFilesInfo{Database}{CollectCmpdStringData} = $CollectCmpdStringData;
   $FingerprintsFilesInfo{Database}{CollectDataLine} = $CollectDataLine;
   $FingerprintsFilesInfo{Database}{CollectInputFileData} = ($CollectCmpdStringData || $CollectDataLine) ? 1 : 0;

   # Maximum number of similar compounds to find for an individual reference or a set
   # of multiple reference compounds...
   SetMaximumSimilarMoleculesToRetrieve($FingerprintsFile, $FileType, $InDelim);
 }
 689 
 690 # Retrieve database fingerprints data field information...
 691 #
 692 sub RetrieveDatabaseFingerprintsDataFieldsInfo {
 693   my($FingerprintsFile, $FileType, $InDelim) = @_;
 694   my($CollectDataFields, $CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef, @DataFieldsToOutput);
 695 
 696   $FingerprintsFilesInfo{Database}{OutputDataFields} = 0;
 697   @{$FingerprintsFilesInfo{Database}{DataFieldsToOutput}} = ();
 698 
 699   $FingerprintsFilesInfo{Database}{OutputCurrentDataFields} = 0;
 700 
 701   @{$FingerprintsFilesInfo{Database}{AllDataFields}} = ();
 702   @{$FingerprintsFilesInfo{Database}{CommonDataFields}} = ();
 703   @{$FingerprintsFilesInfo{Database}{SpecifiedDatabaseDataFields}} = ();
 704 
 705   if ($FileType !~ /^SD$/i) {
 706     return;
 707   }
 708 
 709   # No need to go over SD file and collect data fields for SD file during All DatabaseDataFieldsMode as
 710   # they would be retrieved from database SD file compound string during generation of output files...
 711   #
 712   $CollectDataFields = (($OptionsInfo{TextOutput} && $OptionsInfo{DatabaseDataFieldsMode} =~ /^(All|Common)$/i) || ($OptionsInfo{SDOutput} && $OptionsInfo{DatabaseDataFieldsMode} =~ /^Common$/i)) ? 1 : 0;
 713 
 714   ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = (undef) x 2;
 715 
 716   if ($CollectDataFields) {
 717     open SDFILE, "$FingerprintsFile" or die "Error: Couldn't open $FingerprintsFile: $! \n";
 718     ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = GetAllAndCommonCmpdDataHeaderLabels(\*SDFILE);
 719     close SDFILE;
 720   }
 721 
 722   @DataFieldsToOutput = ();
 723   if ($OptionsInfo{DatabaseDataFieldsMode} =~ /^All$/i) {
 724     if (defined $AllDataFieldsRef) {
 725       push @DataFieldsToOutput, @{$AllDataFieldsRef};
 726       push @{$FingerprintsFilesInfo{Database}{AllDataFields}}, @{$AllDataFieldsRef};
 727     }
 728     else {
 729       # Retrieve and output data fields and values dynamically...
 730       $FingerprintsFilesInfo{Database}{OutputCurrentDataFields} = 1;
 731     }
 732   }
 733   elsif ($OptionsInfo{DatabaseDataFieldsMode} =~ /^Common$/i) {
 734     if (defined $CommonDataFieldsRef) {
 735       push @DataFieldsToOutput, @{$CommonDataFieldsRef};
 736       push @{$FingerprintsFilesInfo{Database}{CommonDataFields}}, @{$CommonDataFieldsRef};
 737     }
 738   }
 739   elsif ($OptionsInfo{DatabaseDataFieldsMode} =~ /^Specify$/i) {
 740     push @DataFieldsToOutput, @{$OptionsInfo{SpecifiedDatabaseDataFields}};
 741     push @{$FingerprintsFilesInfo{Database}{SpecifiedDatabaseDataFields}}, @{$OptionsInfo{SpecifiedDatabaseDataFields}};
 742   }
 743 
 744   if ($OptionsInfo{DatabaseDataFieldsMode} !~ /^CompoundID$/i) {
 745     $FingerprintsFilesInfo{Database}{OutputDataFields} = 1;
 746   }
 747 
 748   push @{$FingerprintsFilesInfo{Database}{DataFieldsToOutput}}, @DataFieldsToOutput;
 749 
 750 }
 751 
 752 # Retrieve database fingerprints data columns information...
 753 #
 754 sub RetrieveDatabaseFingerprintsDataColsInfo {
 755   my($FingerprintsFile, $FileType, $InDelim) = @_;
 756   my($Line, $ColNum, $ColLabel, $NumOfCols, @DataColLabels, @DataColLabelsToOutput, @DataColNumsToOutput, %DataColLabelToNumMap, %DataColNumToLabelMap);
 757 
 758   $FingerprintsFilesInfo{Database}{OutputDataCols} = 0;
 759 
 760   @{$FingerprintsFilesInfo{Database}{DataColLabels}} = ();
 761   %{$FingerprintsFilesInfo{Database}{DataColLabelToNumMap}} = ();
 762   %{$FingerprintsFilesInfo{Database}{DataColNumToLabelMap}} = ();
 763 
 764   @{$FingerprintsFilesInfo{Database}{DataColNumsToOutput}} = ();
 765   @{$FingerprintsFilesInfo{Database}{DataColLabelsToOutput}} = ();
 766 
 767   if ($FileType !~ /^Text$/i) {
 768     return;
 769   }
 770 
 771   @DataColLabels = ();
 772   @DataColLabelsToOutput = ();
 773   @DataColNumsToOutput = ();
 774 
 775   %DataColLabelToNumMap = ();
 776   %DataColNumToLabelMap = ();
 777 
 778   # Get column label line...
 779   open TEXTFILE, "$FingerprintsFile" or die "Error: Couldn't open $FingerprintsFile: $! \n";
 780   $Line = TextUtil::GetTextLine(\*TEXTFILE);
 781   close TEXTFILE;
 782 
 783   $InDelim = ($InDelim =~ /^Tab$/i) ? "\t" : ($InDelim =~ /semicolon/i ? "\;" : "\,");
 784 
 785   @DataColLabels = TextUtil::SplitWords($Line, $InDelim);
 786   $NumOfCols = scalar @DataColLabels;
 787 
 788   for $ColNum (0 .. $#DataColLabels) {
 789     $ColLabel = $DataColLabels[$ColNum];
 790     $DataColLabelToNumMap{$ColLabel} = $ColNum;
 791     $DataColNumToLabelMap{$ColNum} = $ColLabel;
 792   }
 793 
 794   if ($OptionsInfo{DatabaseDataColsMode} =~ /^Specify$/i) {
 795     if ($OptionsInfo{DatabaseColMode} =~ /^ColNum$/i) {
 796       for $ColNum (@{$OptionsInfo{SpecifiedDatabaseDataCols}}) {
 797         if ($ColNum > $NumOfCols) {
 798           die "Error: Column number, $ColNum, specified using \"--DatabaseDataCols\" is not valid: It must be <= $NumOfCols\n";
 799         }
 800         push @DataColNumsToOutput, ($ColNum - 1);
 801       }
 802     }
 803     elsif ($OptionsInfo{DatabaseColMode} =~ /^ColLabel$/i) {
 804       for $ColLabel (@{$OptionsInfo{SpecifiedDatabaseDataCols}}) {
 805         if (!exists $DataColLabelToNumMap{$ColLabel}) {
 806           die "Error: Column label, $ColLabel, specified using \"--DatabaseDataCols\" is not valid: It doesn't exist\n";
 807         }
 808         push @DataColNumsToOutput, $DataColLabelToNumMap{$ColLabel};
 809       }
 810     }
 811   }
 812   elsif ($OptionsInfo{DatabaseDataColsMode} =~ /^All$/i) {
 813     @DataColNumsToOutput = map { $_ } (0 .. $#DataColLabels);
 814   }
 815 
 816   # Setup data column labels to output...
 817   if (scalar @DataColNumsToOutput) {
 818     @DataColLabelsToOutput = map { $DataColNumToLabelMap{$_} } (0 .. $#DataColNumsToOutput);
 819   }
 820 
 821   $FingerprintsFilesInfo{Database}{OutputDataCols} = scalar @DataColNumsToOutput ? 1 : 0;
 822 
 823   @{$FingerprintsFilesInfo{Database}{DataColLabels}} = @DataColLabels;
 824   %{$FingerprintsFilesInfo{Database}{DataColLabelToNumMap}} = %DataColLabelToNumMap;
 825   %{$FingerprintsFilesInfo{Database}{DataColNumToLabelMap}} = %DataColNumToLabelMap;
 826 
 827   @{$FingerprintsFilesInfo{Database}{DataColNumsToOutput}} = @DataColNumsToOutput;
 828   @{$FingerprintsFilesInfo{Database}{DataColLabelsToOutput}} = @DataColLabelsToOutput;
 829 }
 830 
 831 # Set maximum number of similar compounds to find for individual reference or set of multiple
 832 # reference compounds...
 833 #
 834 sub SetMaximumSimilarMoleculesToRetrieve {
 835   my($FingerprintsFile, $FileType, $InDelim) = @_;
 836   my($MaxSimilarMolecules, $NumOfDatabaseMolecules, $PercentSimilarMolecules, $Line);
 837 
 838   if ($OptionsInfo{SimilarCountMode} !~ /^PercentSimilar$/i) {
 839     return;
 840   }
 841 
 842   $PercentSimilarMolecules = $OptionsInfo{PercentSimilarMolecules};
 843 
 844   # Count database entries to figure out MaxSimilarMolecules using PercentSimilarMolecules
 845   # value...
 846   $NumOfDatabaseMolecules = 0;
 847   if ($FileType =~ /^SD$/i && exists($FingerprintsFilesInfo{Database}{NumOfDatabaseMolecules})) {
 848     # It might already be counted for SD file...
 849     $NumOfDatabaseMolecules = $FingerprintsFilesInfo{Database}{NumOfDatabaseMolecules};
 850   }
 851   else {
 852     print "Calculating maximum number of similar molecules to retrieve for \"PercentSimilar\" value of \"--SimilarCountMode\" option by counting number of molecules in database fingerprints file...\n";
 853     open FINGERPRINTSFILE, "$FingerprintsFile" or die "Error: Couldn't open $FingerprintsFile: $! \n";
 854     FILETYPE: {
 855       if ($FileType =~ /^SD$/i) {
 856         while ($Line = TextUtil::GetTextLine(\*FINGERPRINTSFILE)) {
 857           if ($Line =~ /^\$\$\$\$/) {
 858             $NumOfDatabaseMolecules++;
 859           }
 860         }
 861         last FILETYPE;
 862       }
 863       if ($FileType =~ /^Text$/i) {
 864         # Ignore column label line...
 865         $Line = TextUtil::GetTextLine(\*FINGERPRINTSFILE);
 866         while ($Line = TextUtil::GetTextLine(\*FINGERPRINTSFILE)) {
 867           $NumOfDatabaseMolecules++;
 868         }
 869         last FILETYPE;
 870       }
 871       if ($FileType =~ /^FP$/i) {
 872         while ($Line = TextUtil::GetTextLine(\*FINGERPRINTSFILE)) {
 873           if ($Line !~ /^#/) {
 874             $NumOfDatabaseMolecules++;
 875           }
 876         }
 877         last FILETYPE;
 878       }
 879       $NumOfDatabaseMolecules = 0;
 880     }
 881     close FINGERPRINTSFILE;
 882     $FingerprintsFilesInfo{Database}{NumOfDatabaseMolecules} = $NumOfDatabaseMolecules;
 883   }
 884 
 885   $MaxSimilarMolecules = int (($NumOfDatabaseMolecules * $PercentSimilarMolecules)/100);
 886   if ($MaxSimilarMolecules < 1) {
 887     $MaxSimilarMolecules = 1;
 888   }
 889 
 890   $OptionsInfo{MaxSimilarMolecules} = $MaxSimilarMolecules;
 891 }
 892 
 893 # Retrieve information about fingerprints file...
 894 #
 895 sub RetrieveFingerprintsFileInfo {
 896   my($FingerprintsFile) = @_;
 897   my($FileType, $InDelim, $FileDir, $FileExt, $FileName);
 898 
 899   if (!(-e $FingerprintsFile)) {
 900     die "Error: Input fingerprints file, $FingerprintsFile, doesn't exist.\n";
 901   }
 902 
 903   $FileType = Fingerprints::FingerprintsFileUtil::GetFingerprintsFileType($FingerprintsFile);
 904   if (IsEmpty($FileType)) {
 905     die "Error: Input file, $FingerprintsFile, is not a fingerprints file.\n";
 906   }
 907 
 908   $InDelim = '';
 909   if ($FileType =~ /^Text$/i) {
 910     $FileDir = ""; $FileName = ""; $FileExt = "";
 911     ($FileDir, $FileName, $FileExt) = ParseFileName($FingerprintsFile);
 912     $InDelim = ($FileExt =~ /^tsv$/i) ? 'Tab' : $OptionsInfo{InDelim};
 913   }
 914 
 915   return ($FileType, $InDelim);
 916 }
 917 
 918 # Retrieve fingerprints file IO parameters...
 919 #
 920 sub RetrieveFingerprintsFileIOParameters {
 921   my($FingerprintsFileMode, $FileType, $FingerprintsFile) = @_;
 922   my(%FingerprintsFileIOParams);
 923 
 924   if ($FingerprintsFileMode !~ /^(Reference|Database)$/) {
 925     die "Error: Unknown fingerprints file mode: $FingerprintsFileMode. Supported values: Reference or Database\n";
 926   }
 927 
 928   %FingerprintsFileIOParams = ();
 929 
 930   FILETYPE: {
 931     if ($FileType =~ /^SD$/i) {
 932       %FingerprintsFileIOParams = ('Name' => $FingerprintsFile, 'Mode' => 'Read', 'FingerprintsStringMode' => $OptionsInfo{FingerprintsMode}, 'ValidateData' => $OptionsInfo{ValidateData}, 'DetailLevel' =>  $OptionsInfo{Detail}, 'FingerprintsFieldLabel' => $OptionsInfo{"${FingerprintsFileMode}FingerprintsField"}, 'CompoundIDMode' => $OptionsInfo{"${FingerprintsFileMode}CompoundIDMode"}, 'CompoundIDFieldLabel' => $OptionsInfo{"${FingerprintsFileMode}CompoundIDField"}, 'CompoundIDPrefix' => $OptionsInfo{"${FingerprintsFileMode}CompoundIDPrefix"});
 933       last FILETYPE;
 934     }
 935     if ($FileType =~ /^FP$/i) {
 936       %FingerprintsFileIOParams = ('Name' => $FingerprintsFile, 'Mode' => 'Read', 'FingerprintsStringMode' => $OptionsInfo{FingerprintsMode}, 'ValidateData' => $OptionsInfo{ValidateData}, 'DetailLevel' =>  $OptionsInfo{Detail});
 937       last FILETYPE;
 938     }
 939     if ($FileType =~ /^Text$/i) {
 940       %FingerprintsFileIOParams = ('Name' => $FingerprintsFile, 'Mode' => 'Read', 'FingerprintsStringMode' => $OptionsInfo{FingerprintsMode}, 'ValidateData' => $OptionsInfo{ValidateData}, 'DetailLevel' =>  $OptionsInfo{Detail}, 'FingerprintsCol' => $OptionsInfo{"${FingerprintsFileMode}FingerprintsCol"}, 'ColMode' => $OptionsInfo{"${FingerprintsFileMode}ColMode"}, 'CompoundIDCol' => $OptionsInfo{"${FingerprintsFileMode}CompoundIDCol"}, 'CompoundIDPrefix' => $OptionsInfo{"${FingerprintsFileMode}CompoundIDPrefix"}, 'InDelim' => $FingerprintsFilesInfo{$FingerprintsFileMode}{InDelim});
 941       last FILETYPE;
 942     }
 943     die "Error: Fingerprints file type, $FileType, is not valid. Supported file types: SD, FP or Text\n";
 944   }
 945 
 946   return %FingerprintsFileIOParams;
 947 }
 948 
 949 # Make sure fingerprints data file contains valid data and retrieve fingerprints string mode information...
 950 #
 951 sub RetrieveFingerprintsFileFingerprintsStringInfo {
 952   my($FingerprintsFileMode, $FingerprintsFile) = @_;
 953   my($FingerprintsFileIO, $FingerprintsStringMode, $FingerprintsBitVectorStringMode, $FingerprintsVectorStringMode, $FirstFingerprintsStringType, $FirstFingerprintsStringDescription);
 954 
 955   $FingerprintsFileIO = Fingerprints::FingerprintsFileUtil::NewFingerprintsFileIO(%{$FingerprintsFilesInfo{$FingerprintsFileMode}{FingerprintsFileIOParameters}});
 956   if (!$FingerprintsFileIO) {
 957     die "Error: Reference fingerprints file, $FingerprintsFile, contains invalid fingerprints data.\n";
 958   }
 959   if (!$FingerprintsFileIO->IsFingerprintsFileDataValid()) {
 960     die "Error: Reference fingerprints file, $FingerprintsFile, contains invalid fingerprints data.\n";
 961   }
 962 
 963   $FingerprintsStringMode = $FingerprintsFileIO->GetFingerprintsStringMode();
 964   $FingerprintsBitVectorStringMode = $FingerprintsFileIO->GetFingerprintsBitVectorStringMode();
 965   $FingerprintsVectorStringMode = $FingerprintsFileIO->GetFingerprintsVectorStringMode();
 966 
 967   $FirstFingerprintsStringType = $FingerprintsFileIO->GetFirstFingerprintsStringType();
 968   $FirstFingerprintsStringDescription = $FingerprintsFileIO->GetFirstFingerprintsStringDescription();
 969 
 970   $FingerprintsFileIO->Close();
 971 
 972   return ($FingerprintsStringMode, $FingerprintsBitVectorStringMode, $FingerprintsVectorStringMode, $FirstFingerprintsStringType, $FirstFingerprintsStringDescription);
 973 }
 974 
 975 # Retrieve output files names using reference fingerprints file name...
 976 #
 977 sub RetrieveOutputFilesInfo {
 978   my($FingerprintsFile, $FileDir, $FileExt, $FileName, $OutFileRoot, $SDOutFileName, $TextOutFileName, $SDOutFileExt, $TextOutFileExt, $ReferenceFileName, $DatabaseFileName);
 979 
 980   $OutputFilesInfo{OutFileRoot} = '';
 981   $OutputFilesInfo{SDOutFileName} = '';
 982   $OutputFilesInfo{TextOutFileName} = '';
 983 
 984   $FingerprintsFile = $FingerprintsFilesInfo{Reference}{FileName};
 985 
 986   $FileDir = ""; $FileName = ""; $FileExt = "";
 987   ($FileDir, $FileName, $FileExt) = ParseFileName($FingerprintsFile);
 988 
 989   $SDOutFileExt = "sdf";
 990   $TextOutFileExt = ($Options{outdelim} =~ /^tab$/i) ? "tsv" : "csv";
 991 
 992   if ($OptionsInfo{OutFileRoot}) {
 993     my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
 994     if ($RootFileName && $RootFileExt) {
 995       $FileName = $RootFileName;
 996     }
 997     else {
 998       $FileName = $OptionsInfo{OutFileRoot};
 999     }
1000     $OutFileRoot = $FileName;
1001   }
1002   else {
1003     $OutFileRoot = "${FileName}SimilaritySearching";
1004   }
1005 
1006   $SDOutFileName = "${OutFileRoot}.${SDOutFileExt}";
1007   $TextOutFileName = "${OutFileRoot}.${TextOutFileExt}";
1008 
1009   $ReferenceFileName = $FingerprintsFilesInfo{Reference}{FileName};
1010   $DatabaseFileName = $FingerprintsFilesInfo{Database}{FileName};
1011 
1012   if ($OptionsInfo{SDOutput}) {
1013     if ($SDOutFileName =~ /^$ReferenceFileName$/i) {
1014       die "Error: Same output, $SDOutFileName, and reference input file names.\nSpecify a different name using \"-r --root\" option or use default name.\n";
1015     }
1016     if ($SDOutFileName =~ /^$DatabaseFileName$/i) {
1017       die "Error: Same output, $SDOutFileName, and database input file names.\nSpecify a different name using \"-r --root\" option or use default name.\n";
1018     }
1019   }
1020 
1021   if ($OptionsInfo{TextOutput}) {
1022     if ($TextOutFileName =~ /^$ReferenceFileName$/i) {
1023       die "Error: Same output, $TextOutFileName, and reference input file names.\nSpecify a different name using \"-r --root\" option or use default name.\n";
1024     }
1025     if ($TextOutFileName =~ /^$DatabaseFileName$/i) {
1026       die "Error: Same output, $TextOutFileName, and database input file names.\nSpecify a different name using \"-r --root\" option or use default name.\n";
1027     }
1028   }
1029 
1030   if (!$OptionsInfo{OverwriteFiles}) {
1031     if ($OptionsInfo{SDOutput}) {
1032       if (-e $SDOutFileName) {
1033         die "Error: The output file $SDOutFileName already exists.\n";
1034       }
1035     }
1036     if ($OptionsInfo{TextOutput}) {
1037       if (-e $TextOutFileName) {
1038         die "Error: The output file $TextOutFileName already exists.\n";
1039       }
1040     }
1041   }
1042 
1043   $OutputFilesInfo{OutFileRoot} = $OutFileRoot;
1044   $OutputFilesInfo{SDOutFileName} = $SDOutFileName;
1045   $OutputFilesInfo{TextOutFileName} = $TextOutFileName;
1046 
1047 }
1048 
1049 # Process input fingerprints file names...
1050 #
sub ProcessFingerprintsFileNames {
  # Fills @FingerprintsFilesList with the two command line file names: reference
  # fingerprints file followed by database fingerprints file. Dies with the usage
  # message unless exactly two command line arguments are available.
  #
  @FingerprintsFilesList = ();

  unless (@ARGV == 2) {
    die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  }

  my($ReferenceFingerprintsFile, $DatabaseFingerprintsFile) = @ARGV;

  push @FingerprintsFilesList, $ReferenceFingerprintsFile, $DatabaseFingerprintsFile;
}
1065 
1066 # Process option values...
sub ProcessOptions {
  # Rebuilds %OptionsInfo from the command line values in %Options: values are copied
  # or converted to flags and delimiter characters here, and groups of related options
  # are handled by the Process...Options functions called below. Dies for an invalid
  # kNN value.
  #
  %OptionsInfo = ();

  @OptionsInfo{qw(Mode FingerprintsMode SearchMode)} = @Options{qw(mode fingerprintsmode searchmode)};

  ProcessBitVectorComparisonOptions();
  ProcessVectorComparisonOptions();

  $OptionsInfo{GroupFusionRule} = $Options{groupfusionrule};
  $OptionsInfo{GroupFusionApplyCutoff} = ($Options{groupfusionapplycutoff} =~ /^Yes$/i) ? 1 : 0;

  @OptionsInfo{qw(SimilarCountMode NumOfSimilarMolecules PercentSimilarMolecules)} = @Options{qw(similarcountmode numofsimilarmolecules percentsimilarmolecules)};

  # MaxSimilarMolecules starts out as NumOfSimilarMolecules. For PercentSimilar value of
  # SimilarCountMode, SetMaximumSimilarMoleculesToRetrieve overwrites it using number of
  # entries in database fingerprints file and value of PercentSimilarMolecules...
  #
  $OptionsInfo{MaxSimilarMolecules} = $OptionsInfo{NumOfSimilarMolecules};

  @OptionsInfo{qw(SimilarityCutoff DistanceCutoff kNN)} = @Options{qw(similaritycutoff distancecutoff knn)};

  # kNN is either All or a positive integer...
  if ($Options{knn} !~ /^All$/i && !IsPositiveInteger($Options{knn})) {
    die "Error: The value specified, $Options{knn}, for option \"-k, --KNN\" is not valid. Allowed values: > 0 \n";
  }

  ProcessReferenceFingerprintsDataOptions();
  ProcessDatabaseFingerprintsDataOptions();

  @OptionsInfo{qw(Detail InDelim)} = @Options{qw(detail indelim)};

  # Output delimiter is stored as the actual character and not its name...
  if ($Options{outdelim} =~ /tab/i ) {
    $OptionsInfo{OutDelim} = "\t";
  }
  elsif ($Options{outdelim} =~ /semicolon/i) {
    $OptionsInfo{OutDelim} = "\;";
  }
  else {
    $OptionsInfo{OutDelim} = "\,";
  }
  $OptionsInfo{OutQuote} = ($Options{quote} =~ /^Yes$/i) ? 1 : 0;

  # Types of output files to generate...
  $OptionsInfo{Output} = $Options{output};
  $OptionsInfo{SDOutput} = ($Options{output} =~ /^(SD|Both)$/i) ? 1 : 0;
  $OptionsInfo{TextOutput} = ($Options{output} =~ /^(Text|Both)$/i) ? 1 : 0;

  $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
  $OptionsInfo{OutFileRoot} = $Options{root} || 0;

  # Fast mode turns off validation of data...
  my $FastMode = $Options{fast} ? 1 : 0;
  $OptionsInfo{Fast} = $FastMode;
  $OptionsInfo{ValidateData} = $FastMode ? 0 : 1;

  $OptionsInfo{Precision} = $Options{precision};
}
1121 
1122 # Process options related to comparison of bit vector strings...
1123 #
sub ProcessBitVectorComparisonOptions {
  # Validates the bit vector comparison measure specified in %Options along with alpha
  # and beta parameters needed by some of the measures, and stores the results in
  # %OptionsInfo as BitVectorComparisonMode, SpecifiedBitVectorComparisonMeasure,
  # SpecifiedBitVectorComparisonMeasureName, SpecifiedBitVectorComparisonMeasureMethod,
  # Alpha and Beta. Dies for an unsupported measure or a missing or invalid parameter.
  #
  my(@MeasureNames, %MeasureNameMap, %MeasureMethodMap);

  # Names of supported similarity coefficient functions/methods end with "Coefficient".
  # Measure names are the same names without it; both are tracked using lower case
  # measure names as keys...
  for my $MethodName (Fingerprints::FingerprintsBitVector::GetSupportedSimilarityCoefficients()) {
    (my $MeasureName = $MethodName) =~ s/Coefficient$//;

    push @MeasureNames, $MeasureName;
    $MeasureNameMap{lc($MeasureName)} = $MeasureName;
    $MeasureMethodMap{lc($MeasureName)} = $MethodName;
  }

  # Make sure specified measure is supported...
  my $SpecifiedMeasure = $Options{bitvectorcomparisonmode};
  my $MeasureKey = lc($SpecifiedMeasure);

  unless (exists $MeasureMethodMap{$MeasureKey}) {
      die "Error: The value specified, $SpecifiedMeasure, for option \"-b --BitVectorComparisonMode\" is not valid.\nAllowed values:", JoinWords(\@MeasureNames, ", ", 0), "\n";
  }

  $OptionsInfo{BitVectorComparisonMode} = $Options{bitvectorcomparisonmode};

  $OptionsInfo{SpecifiedBitVectorComparisonMeasure} = $SpecifiedMeasure;
  $OptionsInfo{SpecifiedBitVectorComparisonMeasureName} = $MeasureNameMap{$MeasureKey};
  $OptionsInfo{SpecifiedBitVectorComparisonMeasureMethod} = $MeasureMethodMap{$MeasureKey};

  # Alpha is needed only by Tversky measures...
  $OptionsInfo{Alpha} = '';
  if ($SpecifiedMeasure =~ /^(TverskySimilarity|WeightedTverskySimilarity)$/i) {
    my $AlphaValue = $Options{alpha};
    if (IsEmpty($AlphaValue)) {
      die "Error: You must specify a value for \"-a, --alpha\" option in \"TverskySimilarity or WeightedTverskySimilarity\" \"-m --mode\". \n";
    }
    unless (IsFloat($AlphaValue) && $AlphaValue >=0 && $AlphaValue <= 1) {
      die "Error: The value specified, $Options{alpha}, for option \"-a, --alpha\" is not valid. Allowed values: >= 0 and <= 1\n";
    }
    $OptionsInfo{Alpha} = $AlphaValue;
  }

  # Beta is needed only by weighted Tanimoto and Tversky measures...
  # NOTE(review): messages below mention "-b, --beta" while "-b" is also reported above
  # as the short name for BitVectorComparisonMode - confirm against option definitions.
  $OptionsInfo{Beta} = '';
  if ($SpecifiedMeasure =~ /^(WeightedTverskySimilarity|WeightedTanimotoSimilarity)$/i) {
    my $BetaValue = $Options{beta};
    if (IsEmpty($BetaValue)) {
      die "Error: You must specify a value for \"-b, --beta\" option in \"WeightedTverskySimilarity or WeightedTanimotoSimilarity\" \"-m --mode\". \n";
    }
    unless (IsFloat($BetaValue) && $BetaValue >=0 && $BetaValue <= 1) {
      die "Error: The value specified, $Options{beta}, for option \"-b, --beta\" is not valid. Allowed values: >= 0 and <= 1\n";
    }
    $OptionsInfo{Beta} = $BetaValue;
  }
}
1193 
1194 # Process options related to comparison of vector strings...
1195 #
sub ProcessVectorComparisonOptions {
  # Validates the vector comparison measure and formulism specified in %Options and
  # stores the results in %OptionsInfo as VectorComparisonMode,
  # SpecifiedVectorComparisonMeasure, SpecifiedVectorComparisonMeasuresName,
  # SpecifiedVectorComparisonMeasuresMethod, VectorComparisonFormulism and
  # SpecifiedVectorComparisonMode. Dies for an unsupported measure or formulism.
  #
  my(@MeasureNames, %MeasureNameMap, %MeasureMethodMap);

  # Names of some supported distance and similarity coefficient functions/methods end
  # with "Coefficient". Measure names are the same names without it; both are tracked
  # using lower case measure names as keys...
  for my $MethodName (Fingerprints::FingerprintsVector::GetSupportedDistanceAndSimilarityCoefficients()) {
    (my $MeasureName = $MethodName) =~ s/Coefficient$//i;

    push @MeasureNames, $MeasureName;
    $MeasureNameMap{lc($MeasureName)} = $MeasureName;
    $MeasureMethodMap{lc($MeasureName)} = $MethodName;
  }

  # Make sure specified measure, with any spaces taken out, is supported...
  my $SpecifiedMeasure = $Options{vectorcomparisonmode};
  $SpecifiedMeasure =~ s/ //g;

  my $MeasureKey = lc($SpecifiedMeasure);
  unless (exists($MeasureMethodMap{$MeasureKey})) {
    die "Error: The value specified, $SpecifiedMeasure, for option \"-v --VectorComparisonMode\" is not valid.\nAllowed values:", JoinWords(\@MeasureNames, ", ", 0), "\n";
  }

  $OptionsInfo{VectorComparisonMode} = $Options{vectorcomparisonmode};

  $OptionsInfo{SpecifiedVectorComparisonMeasure} = $SpecifiedMeasure;
  $OptionsInfo{SpecifiedVectorComparisonMeasuresName} = $MeasureNameMap{$MeasureKey};
  $OptionsInfo{SpecifiedVectorComparisonMeasuresMethod} = $MeasureMethodMap{$MeasureKey};

  # Make sure specified formulism, with any spaces taken out, is one of the known forms...
  my $SpecifiedFormulism = $Options{vectorcomparisonformulism};
  $SpecifiedFormulism =~ s/ //g;

  unless ($SpecifiedFormulism =~ /^(AlgebraicForm|BinaryForm|SetTheoreticForm)$/i) {
    die "Error: The value specified, $SpecifiedFormulism, for option \"--VectorComparisonFormulism\" is not valid. Allowed values: AlgebraicForm, BinaryForm or SetTheoreticForm\n";
  }

  $OptionsInfo{VectorComparisonFormulism} = $Options{vectorcomparisonformulism};
  $OptionsInfo{SpecifiedVectorComparisonMode} = $SpecifiedFormulism;

}
1250 
1251 # Process options related to data retrieval from reference fingerprints SD and CSV/TSV
1252 # text files...
1253 #
sub ProcessReferenceFingerprintsDataOptions {
  # Validates options used for retrieving compound IDs and fingerprints from reference
  # fingerprints text and SD files, and stores them in %OptionsInfo as
  # ReferenceCompoundIDPrefix, ReferenceColMode, ReferenceCompoundIDCol,
  # ReferenceFingerprintsCol, ReferenceCompoundIDMode, ReferenceCompoundIDField and
  # ReferenceFingerprintsField. Dies for invalid or conflicting values.
  #
  my($ColOptionName, $ColOptionKey, $CompoundIDCol, $FingerprintsCol, $CompoundIDField, $FingerprintsField);

  $OptionsInfo{ReferenceCompoundIDPrefix} = $Options{referencecompoundidprefix} || 'Cmpd';

  # Compound ID and fingerprints column options for text files: a specified value must
  # be a positive integer during ColNum mode; an unspecified value means AutoDetect...

  $OptionsInfo{ReferenceColMode} = $Options{referencecolmode};

  for $ColOptionName ('ReferenceCompoundIDCol', 'ReferenceFingerprintsCol') {
    # Keys in %Options are lower case option names...
    $ColOptionKey = lc($ColOptionName);

    if (IsNotEmpty($Options{$ColOptionKey})) {
      if ($Options{referencecolmode} =~ /^ColNum$/i && !IsPositiveInteger($Options{$ColOptionKey})) {
        die "Error: Column value, $Options{$ColOptionKey}, specified using \"--$ColOptionName\" is not valid: Allowed integer values: > 0\n";
      }
      $OptionsInfo{$ColOptionName} = $Options{$ColOptionKey};
    }
    else {
      $OptionsInfo{$ColOptionName} = 'AutoDetect';
    }
  }

  # Compound ID and fingerprints can't come from the same column: values are compared as
  # numbers when both are positive integers and as strings otherwise...
  ($CompoundIDCol, $FingerprintsCol) = @Options{qw(referencecompoundidcol referencefingerprintscol)};
  if (IsNotEmpty($CompoundIDCol) && IsNotEmpty($FingerprintsCol)) {
    my $SameCol;
    if (IsPositiveInteger($CompoundIDCol) && IsPositiveInteger($FingerprintsCol)) {
      $SameCol = ($CompoundIDCol == $FingerprintsCol);
    }
    else {
      $SameCol = ($CompoundIDCol eq $FingerprintsCol);
    }
    if ($SameCol) {
      die "Error: Values specified using \"--ReferenceCompoundIDCol\" and \"--ReferenceFingerprintsCol\", $CompoundIDCol, must be different.\n";
    }
  }

  # Compound ID and fingerprints field options for SD files...

  $OptionsInfo{ReferenceCompoundIDMode} = $Options{referencecompoundidmode};
  $OptionsInfo{ReferenceCompoundIDField} = '';

  $CompoundIDField = $Options{referencecompoundidfield};
  if ($Options{referencecompoundidmode} =~ /^DataField$/i && !$CompoundIDField) {
    die "Error: You must specify a value for \"--ReferenceCompoundIDField\" option in \"DataField\" \"--ReferenceCompoundIDMode\". \n";
  }
  if ($CompoundIDField) {
    $OptionsInfo{ReferenceCompoundIDField} = $CompoundIDField;
  }

  # An unspecified fingerprints field means AutoDetect...
  $FingerprintsField = $Options{referencefingerprintsfield};
  $OptionsInfo{ReferenceFingerprintsField} = IsNotEmpty($FingerprintsField) ? $FingerprintsField : 'AutoDetect';

  # Compound ID and fingerprints can't come from the same data field...
  if ($CompoundIDField && IsNotEmpty($FingerprintsField) && $CompoundIDField eq $FingerprintsField) {
    die "Error: Values specified using \"--ReferenceCompoundIDField\" and \"--ReferenceFingerprintsfield\", $CompoundIDField, must be different.\n";
  }

}
1325 
# Process options related to data retrieval from database fingerprints SD and CSV/TSV
# text files...
#
# Reads the database related command line values from %Options, validates them and
# stores the processed values in %OptionsInfo. Takes no arguments and returns nothing
# useful; any missing or invalid value terminates the script via die.
#
# Keys set in %OptionsInfo:
#   DatabaseCompoundIDPrefix, DatabaseColMode, DatabaseCompoundIDCol,
#   DatabaseFingerprintsCol, DatabaseDataColsMode, DatabaseDataCols,
#   SpecifiedDatabaseDataCols, DatabaseCompoundIDMode, DatabaseCompoundIDField,
#   DatabaseFingerprintsField, DatabaseDataFieldsMode, DatabaseDataFields,
#   SpecifiedDatabaseDataFields
#
sub ProcessDatabaseFingerprintsDataOptions {
  my($IsColNumMode);

  $OptionsInfo{DatabaseCompoundIDPrefix} = $Options{databasecompoundidprefix} ? $Options{databasecompoundidprefix} : 'Cmpd';

  # Compound ID and fingerprints column options for text files...

  $OptionsInfo{DatabaseColMode} = $Options{databasecolmode};
  $IsColNumMode = ($Options{databasecolmode} =~ /^ColNum$/i) ? 1 : 0;

  # Both column options follow the same rules: an empty value selects auto detection
  # and, in ColNum mode, a specified value must be a positive integer. The key used
  # in %OptionsInfo is also the option name reported to the user; its lower case
  # form is the key used in %Options.
  for my $InfoKey ('DatabaseCompoundIDCol', 'DatabaseFingerprintsCol') {
    my($ColValue);

    $ColValue = $Options{lc($InfoKey)};
    if (!IsNotEmpty($ColValue)) {
      $OptionsInfo{$InfoKey} = 'AutoDetect';
      next;
    }
    if ($IsColNumMode && !IsPositiveInteger($ColValue)) {
      die "Error: Column value, $ColValue, specified using \"--$InfoKey\" is not valid: Allowed integer values: > 0\n";
    }
    $OptionsInfo{$InfoKey} = $ColValue;
  }

  if (IsNotEmpty($Options{databasecompoundidcol}) && IsNotEmpty($Options{databasefingerprintscol})) {
    my($SameColumn);

    # The comparison is selected by column mode and not by how the values look:
    # column numbers, already validated above, are compared numerically; column
    # labels are always compared as strings, even when they look like numbers.
    if ($IsColNumMode) {
      $SameColumn = ($Options{databasecompoundidcol} == $Options{databasefingerprintscol}) ? 1 : 0;
    }
    else {
      $SameColumn = ($Options{databasecompoundidcol} eq $Options{databasefingerprintscol}) ? 1 : 0;
    }
    if ($SameColumn) {
      die "Error: Values specified using \"--DatabaseCompoundIDCol\" and \"--DatabaseFingerprintsCol\", $Options{databasecompoundidcol}, must be different.\n";
    }
  }

  # Database data column options for text files...

  $OptionsInfo{DatabaseDataColsMode} = $Options{databasedatacolsmode};
  $OptionsInfo{DatabaseDataCols} = '';
  @{$OptionsInfo{SpecifiedDatabaseDataCols}} = ();

  if ($Options{databasedatacolsmode} =~ /^Specify$/i) {
    my($DatabaseDataCols, $DatabaseColNum, @SpecifiedDataCols);

    if (!$Options{databasedatacols}) {
      die "Error: You must specify a value for \"--DatabaseDataCols\" option in \"Specify\" \"--DatabaseDataColsMode\". \n";
    }
    $DatabaseDataCols = $Options{databasedatacols};

    # Spaces are removed only from a list of column numbers; a list of column
    # labels is split exactly as specified.
    if ($IsColNumMode) {
      $DatabaseDataCols =~ s/ //g;
    }
    @SpecifiedDataCols = split /\,/, $DatabaseDataCols;

    if ($IsColNumMode) {
      for $DatabaseColNum (@SpecifiedDataCols) {
        if (!IsPositiveInteger($DatabaseColNum)) {
          die "Error: Column value, $DatabaseColNum, specified using \"--DatabaseDataCols\" is not valid: Allowed integer values: > 0\n";
        }
      }
    }
    $OptionsInfo{DatabaseDataCols} = $DatabaseDataCols;
    push @{$OptionsInfo{SpecifiedDatabaseDataCols}}, @SpecifiedDataCols;
  }
  elsif ($Options{databasedatacolsmode} =~ /^All$/i) {
    $OptionsInfo{DatabaseDataCols} = 'All';
  }

  # A value made up only of spaces passes the check above but is empty after the
  # spaces are removed in ColNum mode; it is rejected here.
  if ($OptionsInfo{DatabaseDataColsMode} =~ /^Specify$/i && !$OptionsInfo{DatabaseDataCols}) {
    die "Error: You must specify a value for \"--DatabaseDataCols\" option in \"Specify\" \"--DatabaseDataColsMode\". \n";
  }

  # Compound ID and fingerprints field options for SD files...

  $OptionsInfo{DatabaseCompoundIDMode} = $Options{databasecompoundidmode};

  if ($Options{databasecompoundidmode} =~ /^DataField$/i && !$Options{databasecompoundidfield}) {
    die "Error: You must specify a value for \"--DatabaseCompoundIDField\" option in \"DataField\" \"--DatabaseCompoundIDMode\". \n";
  }
  $OptionsInfo{DatabaseCompoundIDField} = $Options{databasecompoundidfield} ? $Options{databasecompoundidfield} : '';

  if (IsNotEmpty($Options{databasefingerprintsfield})) {
    $OptionsInfo{DatabaseFingerprintsField} = $Options{databasefingerprintsfield};
  }
  else {
    $OptionsInfo{DatabaseFingerprintsField} = 'AutoDetect';
  }

  if ($Options{databasecompoundidfield} && IsNotEmpty($Options{databasefingerprintsfield})) {
    if (($Options{databasecompoundidfield} eq $Options{databasefingerprintsfield})) {
      die "Error: Values specified using \"--DatabaseCompoundIDField\" and \"--DatabaseFingerprintsfield\", $Options{databasecompoundidfield}, must be different.\n";
    }
  }

  # Database data field options for SD files...

  $OptionsInfo{DatabaseDataFieldsMode} = $Options{databasedatafieldsmode};
  $OptionsInfo{DatabaseDataFields} = '';
  @{$OptionsInfo{SpecifiedDatabaseDataFields}} = ();

  if ($Options{databasedatafieldsmode} =~ /^Specify$/i && !$Options{databasedatafields}) {
    die "Error: You must specify a value for \"--DatabaseDataFields\" option in \"Specify\" \"--DatabaseDataFieldsMode\". \n";
  }
  if ($Options{databasedatafields}) {
    my(@SpecifiedDataFields);

    # The field names are stored both as specified and as a list split on commas.
    $OptionsInfo{DatabaseDataFields} = $Options{databasedatafields};

    @SpecifiedDataFields = split /\,/, $Options{databasedatafields};
    push @{$OptionsInfo{SpecifiedDatabaseDataFields}}, @SpecifiedDataFields;
  }
}
1454 
# Setup script usage and retrieve command line arguments specified using various options...
#
# Resets %Options to the default option values, lets GetOptions override them with
# the values found in @ARGV and validates the options checked below. Takes no
# arguments and returns nothing useful.
#
# Side effects: GetOptions processes @ARGV, and the current directory is changed
# when "-w, --workingdir" is specified. Any unparsable command line, invalid
# working directory or invalid option value terminates the script via die.
#
sub SetupScriptUsage {
  my(@OptionSpecs);

  # Retrieve all the options...
  %Options = (
    alpha => 0.5,
    beta => 1,

    bitvectorcomparisonmode => "TanimotoSimilarity",

    databasecolmode => 'colnum',

    databasecompoundidprefix => 'Cmpd',
    databasecompoundidmode => 'LabelPrefix',

    databasedatacolsmode => 'CompoundID',
    databasedatafieldsmode => 'CompoundID',

    distancecutoff => 10,

    referencecolmode => 'colnum',

    referencecompoundidprefix => 'Cmpd',
    referencecompoundidmode => 'LabelPrefix',

    detail => 1,

    fingerprintsmode => 'AutoDetect',
    groupfusionrule => 'Max',
    groupfusionapplycutoff => 'Yes',

    knn => 'All',

    mode => 'MultipleReferences',

    numofsimilarmolecules => 10,
    percentsimilarmolecules => 1,

    indelim => 'comma',
    outdelim => 'comma',
    quote => 'yes',

    output => 'text',

    precision => 2,

    searchmode => 'SimilaritySearch',

    similarcountmode => 'NumOfSimilar',

    similaritycutoff => 0.75,

    vectorcomparisonmode => 'TanimotoSimilarity',
    vectorcomparisonformulism => 'AlgebraicForm',
  );

  @OptionSpecs = (
    "alpha=f", "beta=f", "bitvectorcomparisonmode|b=s",
    "databasecolmode=s", "databasecompoundidcol=s", "databasecompoundidprefix=s",
    "databasecompoundidfield=s", "databasecompoundidmode=s",
    "databasedatacols=s", "databasedatacolsmode=s",
    "databasedatafields=s", "databasedatafieldsmode=s",
    "databasefingerprintscol=s", "databasefingerprintsfield=s",
    "distancecutoff=f", "detail|d=i", "fast|f", "fingerprintsmode=s",
    "groupfusionrule|g=s", "groupfusionapplycutoff=s",
    "help|h", "indelim=s", "knn|k=s", "mode|m=s",
    "numofsimilarmolecules|n=i", "outdelim=s", "output=s", "overwrite|o",
    "percentsimilarmolecules|p=f", "precision=s", "quote|q=s",
    "referencecolmode=s", "referencecompoundidcol=s", "referencecompoundidprefix=s",
    "referencecompoundidfield=s", "referencecompoundidmode=s",
    "referencefingerprintscol=s", "referencefingerprintsfield=s",
    "root|r=s", "searchmode|s=s", "similarcountmode=s", "similaritycutoff=f",
    "vectorcomparisonmode|v=s", "vectorcomparisonformulism=s", "workingdir|w=s",
  );

  if (!GetOptions(\%Options, @OptionSpecs)) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }

  # Switch to the working directory before the remaining options are checked...
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  # Validate option values; the allowed keywords are matched ignoring case...
  if ($Options{databasecolmode} !~ /^(ColNum|ColLabel)$/i) {
    die "Error: The value specified, $Options{databasecolmode}, for option \"--DatabaseColMode\" is not valid. Allowed values: ColNum, or ColLabel\n";
  }
  if ($Options{databasecompoundidmode} !~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i) {
    die "Error: The value specified, $Options{databasecompoundidmode}, for option \"--DatabaseCompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n";
  }
  if ($Options{databasedatacolsmode} !~ /^(All|Specify|CompoundID)$/i) {
    die "Error: The value specified, $Options{databasedatacolsmode}, for option \"--DatabaseDataColsMode\" is not valid. Allowed values: All, Specify, or CompoundID\n";
  }
  if ($Options{databasedatafieldsmode} !~ /^(All|Common|Specify|CompoundID)$/i) {
    die "Error: The value specified, $Options{databasedatafieldsmode}, for option \"--DatabaseDataFieldsMode\" is not valid. Allowed values: All, Common, Specify, or CompoundID\n";
  }
  if (!IsPositiveInteger($Options{detail})) {
    die "Error: The value specified, $Options{detail}, for option \"-d, --detail\" is not valid. Allowed values: > 0 \n";
  }
  if ($Options{fingerprintsmode} !~ /^(AutoDetect|FingerprintsBitVectorString|FingerprintsVectorString)$/i) {
    die "Error: The value specified, $Options{fingerprintsmode}, for option \"--FingerprintsMode\" is not valid. Allowed values: AutoDetect, FingerprintsBitVectorString or FingerprintsVectorString \n";
  }
  if ($Options{groupfusionrule} !~ /^(Max|Min|Mean|Median|Sum|Euclidean)$/i) {
    die "Error: The value specified, $Options{groupfusionrule}, for option \"-g, --GroupFusionRule\" is not valid. Allowed values: Max, Min, Mean, Median, Sum, Euclidean\n";
  }
  if ($Options{groupfusionapplycutoff} !~ /^(Yes|No)$/i) {
    # Report the rejected value of this option and not the value of "-q, --quote".
    die "Error: The value specified, $Options{groupfusionapplycutoff}, for option \"--GroupFusionApplyCutoff\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
    die "Error: The value specified, $Options{indelim}, for option \"--InDelim\" is not valid. Allowed values: comma, or semicolon\n";
  }
  if ($Options{mode} !~ /^(IndividualReference|MultipleReferences)$/i) {
    die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: IndividualReference, MultipleReferences\n";
  }
  if (!IsPositiveInteger($Options{numofsimilarmolecules})) {
    die "Error: The value specified, $Options{numofsimilarmolecules}, for option \"-n, --NumOfSimilarMolecules\" is not valid. Allowed values: > 0 \n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--OutDelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{output} !~ /^(SD|text|both)$/i) {
    die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, text, or both\n";
  }
  if (!(IsFloat($Options{percentsimilarmolecules}) && $Options{percentsimilarmolecules} > 0 && $Options{percentsimilarmolecules} <= 100)) {
    die "Error: The value specified, $Options{percentsimilarmolecules}, for option \"-p, --PercentSimilarMolecules\" is not valid. Allowed values: > 0 and <= 100 \n";
  }
  if ($Options{quote} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n";
  }
  if (!IsPositiveInteger($Options{precision})) {
    die "Error: The value specified, $Options{precision}, for option \"--precision\" is not valid. Allowed values: > 0 \n";
  }
  if ($Options{referencecolmode} !~ /^(ColNum|ColLabel)$/i) {
    die "Error: The value specified, $Options{referencecolmode}, for option \"--ReferenceColMode\" is not valid. Allowed values: ColNum, or ColLabel\n";
  }
  if ($Options{referencecompoundidmode} !~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i) {
    die "Error: The value specified, $Options{referencecompoundidmode}, for option \"--ReferenceCompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n";
  }
  if ($Options{searchmode} !~ /^(SimilaritySearch|DissimilaritySearch)$/i) {
    die "Error: The value specified, $Options{searchmode}, for option \"-s, --SearchMode\" is not valid. Allowed values: SimilaritySearch, DissimilaritySearch \n";
  }
  if ($Options{similarcountmode} !~ /^(NumOfSimilar|PercentSimilar)$/i) {
    die "Error: The value specified, $Options{similarcountmode}, for option \"--SimilarCountMode\" is not valid. Allowed values: NumOfSimilar, PercentSimilar \n";
  }
}
1581