#!/usr/bin/perl -w
#
# $RCSfile: SimilaritySearchingFingerprints.pl,v $
# $Date: 2015/02/28 20:46:21 $
# $Revision: 1.18 $
#
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2015 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use FileUtil;
use TextUtil;
use SDFileUtil;
use StatisticsUtil;
use PseudoHeap;
use Fingerprints::FingerprintsFileUtil;
use Fingerprints::FingerprintsBitVector;
use Fingerprints::FingerprintsVector;

# File-level state shared by the top-level flow and the subroutines defined later
# in this file...
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";

# Direct method call instead of the indirect object form "new Benchmark", whose
# parsing depends on which subroutine names are known at compile time...
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();

# Exactly two command line arguments are expected; usage is shown otherwise or on help...
if ($Options{help} || @ARGV != 2) {
    die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

# Process reference and database file names...
my(@FingerprintsFilesList);
ProcessFingerprintsFileNames();

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Setup information about fingerprints input and SD/text output files...
my(%FingerprintsFilesInfo, %OutputFilesInfo, %SimilaritySearchInfo);
print "Checking and retrieving information from reference and database fingerprints files...\n";
RetrieveFingerprintsFilesInfo();

# Perform similarity search...
print "Performing similarity search...\n";
my(%SimilaritySearchResults, %DatabaseFingerprintsFileData);
PerformSimilaritySearch();

print "\n$ScriptName:Done...\n\n";

# Direct method call instead of the indirect object form "new Benchmark"...
$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Perform similarity search using fingerprints data in reference and database text files...
#
# Runs the four stages in order: read the reference fingerprints, initialize the
# results containers, compare database entries against the references and write
# out the results files.
#
sub PerformSimilaritySearch {

    print "\nProcessing fingerprints data for reference molecules...\n";
    ReadReferenceFingerprintsData();

    InitializeSimilaritySearchResults();
    GenerateSimilaritySearchResults();
    WriteSimilaritySearchResultFiles();
}

# Find similar molecules from database molecules for individual or multiple reference molecules...
#
# Goes through the database fingerprints file one entry at a time and compares each
# valid entry against all the reference fingerprints objects:
#
#   . IndividualReference mode: every defined comparison value is collected under the
#     ID of its reference compound.
#   . MultipleReferences mode: the defined comparison values for a database entry are
#     fused into a single value which, when defined, is collected for that entry.
#
# Entries reported as invalid by the file reader are counted and skipped. A summary of
# total, processed and ignored entries is printed after the database file is closed.
#
sub GenerateSimilaritySearchResults {
    my($DatabaseFingerprintsFileIO, $FingerprintsCount, $IgnoredFingerprintsCount, $ReferenceCmpdIDsRef, $ReferenceFingerprintsObjectsRef);

    print "Processing fingerprints data for database molecules...\n";

    ($FingerprintsCount, $IgnoredFingerprintsCount) = (0) x 2;

    $DatabaseFingerprintsFileIO = Fingerprints::FingerprintsFileUtil::NewFingerprintsFileIO(%{$FingerprintsFilesInfo{Database}{FingerprintsFileIOParameters}});
    $DatabaseFingerprintsFileIO->Open();

    $ReferenceCmpdIDsRef = $SimilaritySearchInfo{ReferenceCmpdIDsRef};
    $ReferenceFingerprintsObjectsRef = $SimilaritySearchInfo{ReferenceFingerprintsObjectsRef};

    DATABASEFP: while ($DatabaseFingerprintsFileIO->Read()) {
        $FingerprintsCount++;

        if (!$DatabaseFingerprintsFileIO->IsFingerprintsDataValid()) {
            $IgnoredFingerprintsCount++;
            next DATABASEFP;
        }

        my $DatabaseFingerprintsObject = $DatabaseFingerprintsFileIO->GetFingerprints();
        my $DatabaseCmpdID = $DatabaseFingerprintsFileIO->GetCompoundID();

        # Values to fuse are gathered afresh for each database entry; the list is only
        # filled during MultipleReferences mode...
        my @ComparisonValues = ();

        REFERENCEFP: for my $ReferenceIndex (0 .. $#{$ReferenceCmpdIDsRef}) {
            my $ReferenceCmpdID = $ReferenceCmpdIDsRef->[$ReferenceIndex];
            my $ReferenceFingerprintsObject = $ReferenceFingerprintsObjectsRef->[$ReferenceIndex];

            # An undefined value means a failed comparison or a value rejected by the cutoff...
            my $ComparisonValue = CompareReferenceAndDatabaseFingerprintsPair($ReferenceFingerprintsObject, $DatabaseFingerprintsObject);
            if (!defined $ComparisonValue) {
                next REFERENCEFP;
            }

            if ($SimilaritySearchInfo{IndividualReferenceMode}) {
                CollectSimilaritySearchResults($DatabaseFingerprintsFileIO, $DatabaseCmpdID, $ComparisonValue, $ReferenceCmpdID);
            }
            elsif ($SimilaritySearchInfo{MultipleReferencesMode}) {
                push @ComparisonValues, $ComparisonValue;
            }
        }

        if ($SimilaritySearchInfo{MultipleReferencesMode}) {
            my $FusedComparisonValue = CalculateGroupFusionComparisonValue(\@ComparisonValues);
            if (!defined $FusedComparisonValue) {
                next DATABASEFP;
            }
            CollectSimilaritySearchResults($DatabaseFingerprintsFileIO, $DatabaseCmpdID, $FusedComparisonValue);
        }
    }
    $DatabaseFingerprintsFileIO->Close();

    print "Number of fingerprints data entries in database fingerprints file: $FingerprintsCount\n";
    print "Number of fingerprints data entries processed successfully: ", ($FingerprintsCount - $IgnoredFingerprintsCount) , "\n";
    print "Number of fingerprints data entries ignored due to missing/invalid data: $IgnoredFingerprintsCount\n\n";
}

# Compare a pair of reference and database fingerprints objects corresponding to bit-vector or
# vectors using specified comparison method and comparison cutoff...
#
# Returns the comparison value formatted to the configured precision, or undef when the
# comparison fails or the value does not satisfy an active cutoff.
#
sub CompareReferenceAndDatabaseFingerprintsPair {
    my($ReferenceFingerprintsObject, $DatabaseFingerprintsObject) = @_;

    # The comparison method is invoked by name on the reference fingerprints object...
    my $MethodName = $SimilaritySearchInfo{ComparisonMethod};
    my $Value = $ReferenceFingerprintsObject->$MethodName($DatabaseFingerprintsObject, @{$SimilaritySearchInfo{ComparisonMethodParameters}});

    unless (defined $Value) {
        warn "Warning: Ignoring fingerprints data for reference compound ID ", $ReferenceFingerprintsObject->GetID(), ": Its comparison with database compound ID, ", $DatabaseFingerprintsObject->GetID(), ", failed.\n";
        return undef;
    }

    $Value = sprintf("%.$OptionsInfo{Precision}f", $Value);

    # Nothing else to do without a cutoff...
    return $Value unless $SimilaritySearchInfo{ApplyComparisonCutoff};

    # KeepTop keeps values at or above the cutoff; otherwise values at or below it are kept...
    my $Cutoff = $SimilaritySearchInfo{ComparisonCutoff};
    my $PassesCutoff = $SimilaritySearchInfo{KeepTop} ? ($Value >= $Cutoff) : ($Value <= $Cutoff);

    return $PassesCutoff ? $Value : undef;
}

# Calculate group fusion comparison value...
#
# Takes a reference to a list of comparison values and returns the fused value, or undef
# for an empty list. Values are sorted, and optionally truncated to the first kNN values,
# before the configured fusion method is applied.
#
sub CalculateGroupFusionComparisonValue {
    my($ComparisonValuesRef) = @_;

    return undef unless @{$ComparisonValuesRef};

    if ($SimilaritySearchInfo{SortComparisonValues}) {
        # Descending order for KeepTop and ascending order otherwise...
        my @OrderedValues;
        if ($SimilaritySearchInfo{KeepTop}) {
            @OrderedValues = sort { $b <=> $a } @{$ComparisonValuesRef};
        }
        else {
            @OrderedValues = sort { $a <=> $b } @{$ComparisonValuesRef};
        }

        # Keep only top kNN values for group fusion...
        if ($SimilaritySearchInfo{UsekNN} && ($OptionsInfo{kNN} < scalar @{$ComparisonValuesRef})) {
            splice @OrderedValues, $OptionsInfo{kNN};
        }
        $ComparisonValuesRef = \@OrderedValues;
    }

    my $FusedValue = $SimilaritySearchInfo{GroupFusionMethodRef}->($ComparisonValuesRef);

    return $SimilaritySearchInfo{ApplyPrecisionDuringFusion}
        ? sprintf("%.$OptionsInfo{Precision}f", $FusedValue)
        : $FusedValue;
}

# Collect similarity results for individual reference and multiple references search...
#
# Results go into the pseudo heap of the reference compound when its ID is passed and into
# the shared results pseudo heap otherwise. Database file data for the compound is
# collected as well when it is needed for the output files.
#
sub CollectSimilaritySearchResults {
    my($DatabaseFingerprintsFileIO, $DatabaseCmpdID, $ComparisonValue, $ReferenceCmpdID) = @_;

    my $ResultsKey = defined $ReferenceCmpdID ? $ReferenceCmpdID : 'ResultsPseudoHeap';
    $SimilaritySearchResults{$ResultsKey}->AddKeyValuePair($ComparisonValue, $DatabaseCmpdID);

    CollectDatabaseFileData($DatabaseCmpdID, $DatabaseFingerprintsFileIO) if $FingerprintsFilesInfo{Database}{CollectInputFileData};
}

# Initialize similarity results for individual or multiple reference molecules...
#
# Creates one pseudo heap per reference compound ID during individual reference mode and
# a single pseudo heap during multiple references mode; any collected database file data
# is cleared as well.
#
sub InitializeSimilaritySearchResults {
    %SimilaritySearchResults = ();

    my $HeapType = $SimilaritySearchInfo{KeepTop} ? 'KeepTopN' : 'KeepBottomN';

    my @ResultsKeys = ();
    if ($SimilaritySearchInfo{IndividualReferenceMode}) {
        @ResultsKeys = @{$SimilaritySearchInfo{ReferenceCmpdIDsRef}};
    }
    elsif ($SimilaritySearchInfo{MultipleReferencesMode}) {
        @ResultsKeys = ('ResultsPseudoHeap');
    }

    for my $ResultsKey (@ResultsKeys) {
        $SimilaritySearchResults{$ResultsKey} = PseudoHeap->new('Type' => $HeapType, 'KeyType' => 'Numeric', 'MaxSize' => $OptionsInfo{MaxSimilarMolecules});
    }

    %DatabaseFingerprintsFileData = ();
}

# Write out results SD and/or CSV/TSV text files for individual or multiple reference molecules...
#
# Opens the output files, writes one record for each database compound ID stored under
# each sorted comparison value and closes the output files.
#
sub WriteSimilaritySearchResultFiles {
    my($NewSDFileRef, $NewTextFileRef) = SetupAndOpenOutputFiles();

    if ($SimilaritySearchInfo{IndividualReferenceMode}) {
        for my $ReferenceCmpdID (@{$SimilaritySearchInfo{ReferenceCmpdIDsRef}}) {
            my $ResultsHeap = $SimilaritySearchResults{$ReferenceCmpdID};

            for my $ComparisonValue ($ResultsHeap->GetSortedKeys()) {
                for my $DatabaseCmpdID ($ResultsHeap->GetKeyValues($ComparisonValue)) {
                    WriteDataToOutputFiles($NewSDFileRef, $NewTextFileRef, $ComparisonValue, $DatabaseCmpdID, $ReferenceCmpdID);
                }
            }
        }
    }
    elsif ($SimilaritySearchInfo{MultipleReferencesMode}) {
        my $ResultsHeap = $SimilaritySearchResults{ResultsPseudoHeap};

        for my $ComparisonValue ($ResultsHeap->GetSortedKeys()) {
            for my $DatabaseCmpdID ($ResultsHeap->GetKeyValues($ComparisonValue)) {
                WriteDataToOutputFiles($NewSDFileRef, $NewTextFileRef, $ComparisonValue, $DatabaseCmpdID);
            }
        }
    }

    close $NewSDFileRef if $NewSDFileRef;
    close $NewTextFileRef if $NewTextFileRef;
}

# Write individual reference or multiple references similarity results along with any other data to output files...
#
# Arguments:
#   $NewSDFileRef    - SD output filehandle; false when no SD file is being written
#   $NewTextFileRef  - text output filehandle; false when no text file is being written
#   $ComparisonValue - comparison value to report for the database compound
#   $DatabaseCmpdID  - database compound ID
#   $ReferenceCmpdID - reference compound ID; written only when defined
#
sub WriteDataToOutputFiles {
    my($NewSDFileRef, $NewTextFileRef, $ComparisonValue, $DatabaseCmpdID, $ReferenceCmpdID) = @_;

    if ($NewSDFileRef) {
        # Molecule block, comparison data fields, database data fields and record terminator...
        WriteMolStringDataToSDOutputFile($DatabaseCmpdID, $NewSDFileRef);
        if (defined $ReferenceCmpdID) {
            print $NewSDFileRef "> <ReferenceCmpdID>\n$ReferenceCmpdID\n\n";
        }
        print $NewSDFileRef "> <DatabaseCmpdID>\n$DatabaseCmpdID\n\n> <ComparisonValue>\n$ComparisonValue\n\n";
        WriteDatabaseDataToSDOutputFile($DatabaseCmpdID, $NewSDFileRef);
        print $NewSDFileRef "\$\$\$\$\n";
    }

    if ($NewTextFileRef) {
        my(@LineWords);

        @LineWords = ();
        if (defined $ReferenceCmpdID) {
            push @LineWords, $ReferenceCmpdID;
        }
        push @LineWords, ($DatabaseCmpdID, $ComparisonValue);

        if ($FingerprintsFilesInfo{Database}{OutputDataFields} || $FingerprintsFilesInfo{Database}{OutputDataCols}) {
            push @LineWords, RetrieveDatabaseDataForTextOutputFile($DatabaseCmpdID);
        }
        print $NewTextFileRef JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}), "\n";
    }
}

# Open output files...
#
# Returns a two element list, ($NewSDFileRef, $NewTextFileRef); an element is undef when
# the corresponding output type has not been requested. Column labels are written to the
# text file as soon as it is opened.
#
# Files are opened using three-argument open and lexical filehandles: the file name is
# never interpreted as part of the open mode and no global bareword handles are involved.
#
sub SetupAndOpenOutputFiles {
    my($NewSDFileRef, $NewTextFileRef, $NewSDFile, $NewTextFile);

    ($NewSDFileRef, $NewTextFileRef) = (undef) x 2;

    if ($OptionsInfo{SDOutput}) {
        $NewSDFile = $OutputFilesInfo{SDOutFileName};
        print "Generating SD file $NewSDFile...\n";
        open $NewSDFileRef, '>', $NewSDFile or die "Error: Couldn't open $NewSDFile: $! \n";
    }

    if ($OptionsInfo{TextOutput}) {
        $NewTextFile = $OutputFilesInfo{TextOutFileName};
        print "Generating text file $NewTextFile...\n";
        open $NewTextFileRef, '>', $NewTextFile or die "Error: Couldn't open $NewTextFile: $! \n";

        WriteTextFileCoulmnLabels($NewTextFileRef);
    }

    return ($NewSDFileRef, $NewTextFileRef);
}

# Write out appropriate column labels to text file...
#
# The first columns depend on the search mode; labels for any database data fields or
# data columns selected for output follow them.
#
sub WriteTextFileCoulmnLabels {
    my($NewTextFileRef) = @_;
    my($Line, @LineWords);

    @LineWords = ();

    if ($SimilaritySearchInfo{IndividualReferenceMode}) {
        push @LineWords, qw(ReferenceCompoundID DatabaseCompoundID ComparisonValue);
    }
    elsif ($SimilaritySearchInfo{MultipleReferencesMode}) {
        push @LineWords, qw(DatabaseCompoundID ComparisonValue);
    }

    # Add columns for other database fingerprints file data to be written to output file...
    if ($FingerprintsFilesInfo{Database}{OutputDataFields}) {
        push @LineWords, @{$FingerprintsFilesInfo{Database}{DataFieldsToOutput}};
    }
    elsif ($FingerprintsFilesInfo{Database}{OutputDataCols}) {
        push @LineWords, @{$FingerprintsFilesInfo{Database}{DataColLabelsToOutput}};
    }

    $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileRef "$Line\n";
}

# Write molecule string data to SD output file...
#
# The collected compound string is cut at the end of its molecule block and written out
# followed by the block terminator. An empty molecule block is written when no compound
# string data has been collected.
#
sub WriteMolStringDataToSDOutputFile {
    my($DatabaseCmpdID, $NewSDFileRef) = @_;

    if ($FingerprintsFilesInfo{Database}{CollectCmpdStringData}) {
        my($MolString);

        # The molfile terminator is "M", two spaces and "END"; any run of spaces is
        # accepted while splitting and the two-space form is always written...
        ($MolString) = split /M +END/, $DatabaseFingerprintsFileData{$DatabaseCmpdID};
        print $NewSDFileRef "$MolString\nM  END\n";
    }
    else {
        # Just write out an empty molecule data string...
        print $NewSDFileRef SDFileUtil::GenerateEmptyCtabBlockLines(), "\n";
    }
}

# Write database data from SD or Text database file to SD output file...
#
# Data fields come from the collected compound string of a SD database file; data columns
# come from the collected data line words of a text database file. Nothing is written
# when neither data fields nor data columns are to be output.
#
sub WriteDatabaseDataToSDOutputFile {
    my($DatabaseCmpdID, $NewSDFileRef) = @_;

    if ($FingerprintsFilesInfo{Database}{OutputDataFields}) {
        my($DataFieldLabel, $DataFieldValue, @CmpdLines, %DataFieldLabelAndValues);

        @CmpdLines = split /\n/, $DatabaseFingerprintsFileData{$DatabaseCmpdID};
        %DataFieldLabelAndValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);

        # Use the labels present in the current compound or the preset list of labels; a
        # label missing from the compound gets an empty value...
        for $DataFieldLabel ($FingerprintsFilesInfo{Database}{OutputCurrentDataFields} ? GetCmpdDataHeaderLabels(\@CmpdLines) : @{$FingerprintsFilesInfo{Database}{DataFieldsToOutput}}) {
            $DataFieldValue = exists $DataFieldLabelAndValues{$DataFieldLabel} ? $DataFieldLabelAndValues{$DataFieldLabel} : '';
            print $NewSDFileRef "> <$DataFieldLabel>\n$DataFieldValue\n\n";
        }
    }
    elsif ($FingerprintsFilesInfo{Database}{OutputDataCols}) {
        my($DataColNum, $DataFieldLabel, $DataFieldValue);

        for $DataColNum (@{$FingerprintsFilesInfo{Database}{DataColNumsToOutput}}) {
            $DataFieldLabel = $FingerprintsFilesInfo{Database}{DataColNumToLabelMap}{$DataColNum};
            $DataFieldValue = $DatabaseFingerprintsFileData{$DatabaseCmpdID}->[$DataColNum];

            # A data line with fewer words than the label line has no value for this column...
            if (!defined $DataFieldValue) {
                $DataFieldValue = '';
            }
            print $NewSDFileRef "> <$DataFieldLabel>\n$DataFieldValue\n\n";
        }
    }
}

# Retrieve database data from SD or Text database file for text output file...
#
# Returns a list of values to append to a text output line: one value per data field to
# output for a SD database file, or one value per data column to output for a text
# database file. An empty list is returned when there is nothing to output.
#
sub RetrieveDatabaseDataForTextOutputFile {
    my($DatabaseCmpdID) = @_;

    if ($FingerprintsFilesInfo{Database}{OutputDataFields}) {
        my(@CmpdLines, %DataFieldLabelAndValues);

        @CmpdLines = split /\n/, $DatabaseFingerprintsFileData{$DatabaseCmpdID};
        %DataFieldLabelAndValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);

        return map { exists $DataFieldLabelAndValues{$_} ? $DataFieldLabelAndValues{$_} : ''} @{$FingerprintsFilesInfo{Database}{DataFieldsToOutput}};
    }
    elsif ($FingerprintsFilesInfo{Database}{OutputDataCols}) {
        # NOTE(review): the values below are taken from positions 0 .. N-1 of the data
        # line instead of the column numbers stored in DataColNumsToOutput, which is what
        # WriteDatabaseDataToSDOutputFile uses. RetrieveDatabaseFingerprintsDataColsInfo
        # builds DataColLabelsToOutput from positions 0 .. N-1 as well, so the text file
        # header and values agree with each other but may not correspond to the columns
        # specified by the user. Both places need to be changed together.
        #
        if (exists $DatabaseFingerprintsFileData{$DatabaseCmpdID}) {
            return map { $DatabaseFingerprintsFileData{$DatabaseCmpdID}->[$_] } (0 .. $#{$FingerprintsFilesInfo{Database}{DataColNumsToOutput}});
        }
        else {
            # One empty value per output column: the element count and not the last index,
            # which would leave the line one column short...
            return ('') x scalar @{$FingerprintsFilesInfo{Database}{DataColNumsToOutput}};
        }
    }

    return ();
}

# Collect database file SD compound string or CSV/TSV data line for generating results
# files..
#
# Data is stored under the database compound ID the first time the ID is seen; later
# calls for the same ID return without doing anything.
#
sub CollectDatabaseFileData {
    my($DatabaseCmpdID, $DatabaseFingerprintsFileIO) = @_;

    if (exists $DatabaseFingerprintsFileData{$DatabaseCmpdID}) {
        return;
    }

    if ($FingerprintsFilesInfo{Database}{CollectCmpdStringData}) {
        $DatabaseFingerprintsFileData{$DatabaseCmpdID} = $DatabaseFingerprintsFileIO->GetCompoundString();
    }

    if ($FingerprintsFilesInfo{Database}{CollectDataLine}) {
        my(@DataLineWords);
        @DataLineWords = $DatabaseFingerprintsFileIO->GetDataLineWords();
        $DatabaseFingerprintsFileData{$DatabaseCmpdID} = \@DataLineWords;
    }

}

# Read fingerprints data from reference fingerprints file...
#
# Stores references to the lists of reference compound IDs and reference fingerprints
# objects in %SimilaritySearchInfo.
#
sub ReadReferenceFingerprintsData {
    my($FingerprintsFileIO);

    $FingerprintsFileIO = Fingerprints::FingerprintsFileUtil::NewFingerprintsFileIO(%{$FingerprintsFilesInfo{Reference}{FingerprintsFileIOParameters}});
    ($SimilaritySearchInfo{ReferenceCmpdIDsRef}, $SimilaritySearchInfo{ReferenceFingerprintsObjectsRef}) = Fingerprints::FingerprintsFileUtil::ReadAndProcessFingerpritsData($FingerprintsFileIO);

}

# Retrieve information about fingerprints files...
#
# Resets the file level information hashes, records the reference and database file
# names and then collects information about both the files, the comparison setup and the
# output files.
#
sub RetrieveFingerprintsFilesInfo {

    %FingerprintsFilesInfo = ();
    %OutputFilesInfo = ();
    %SimilaritySearchInfo = ();

    %{$FingerprintsFilesInfo{Reference}} = ();
    %{$FingerprintsFilesInfo{Database}} = ();

    # Set up reference and database file names...
    $FingerprintsFilesInfo{Reference}{FileName} = $FingerprintsFilesList[0];
    $FingerprintsFilesInfo{Database}{FileName} = $FingerprintsFilesList[1];

    # Retrieve information about reference and database fingerprints file...
    RetrieveReferenceFingerprintsFileInfo();
    RetrieveDatabaseFingerprintsFileInfo();

    # Setup fingerprints comparison method and associated method parameters...
    SetupReferenceAndDatabaseFingerprintsComparisonInfo();

    # Retrieve information for output files...
    RetrieveOutputFilesInfo();
}

# Setup reference and database fingerprints comparison method and associated method parameters...
#
# Sets the following keys in %SimilaritySearchInfo: IndividualReferenceMode,
# MultipleReferencesMode, ComparisonMethod, ComparisonMethodParameters, ComparisonCutoff,
# KeepTop, ApplyComparisonCutoff, GroupFusionMethodRef, ApplyPrecisionDuringFusion, UsekNN
# and SortComparisonValues.
#
# Dies when the reference and database fingerprints types don't match or when the search
# mode, fingerprints string type or group fusion rule is not recognized.
#
sub SetupReferenceAndDatabaseFingerprintsComparisonInfo {
    my($ReferenceInfoRef, $DatabaseInfoRef);

    $ReferenceInfoRef = $FingerprintsFilesInfo{Reference};
    $DatabaseInfoRef = $FingerprintsFilesInfo{Database};

    # Make sure reference and database fingerprints string match. The database values are
    # quoted using \Q...\E to compare them literally: any regular expression
    # metacharacters present in a type or description must not be treated as a pattern...
    if (($ReferenceInfoRef->{FirstFingerprintsStringType} !~ /^\Q$DatabaseInfoRef->{FirstFingerprintsStringType}\E$/i) ||
        ($ReferenceInfoRef->{FingerprintsBitVectorStringMode} != $DatabaseInfoRef->{FingerprintsBitVectorStringMode}) ||
        ($ReferenceInfoRef->{FingerprintsVectorStringMode} != $DatabaseInfoRef->{FingerprintsVectorStringMode}) ) {
        die "Error: First reference fingerprints string type, $FingerprintsFilesInfo{Reference}{FirstFingerprintsStringType}, must match first database fingerprints type, $FingerprintsFilesInfo{Database}{FirstFingerprintsStringType}.\n";
    }

    if ($ReferenceInfoRef->{FirstFingerprintsStringDescription} !~ /^\Q$DatabaseInfoRef->{FirstFingerprintsStringDescription}\E$/i) {
        warn "Warning: First reference fingerprints string description, $FingerprintsFilesInfo{Reference}{FirstFingerprintsStringDescription}, doesn't match first database fingerprints string description, $FingerprintsFilesInfo{Database}{FirstFingerprintsStringDescription}.\n";
    }

    # Setup individual reference and multiple references search mode...
    $SimilaritySearchInfo{IndividualReferenceMode} = undef;
    $SimilaritySearchInfo{MultipleReferencesMode} = undef;

    if ($OptionsInfo{Mode} =~ /^IndividualReference$/i) {
        $SimilaritySearchInfo{IndividualReferenceMode} = 1;
    }
    elsif ($OptionsInfo{Mode} =~ /^MultipleReferences$/i) {
        $SimilaritySearchInfo{MultipleReferencesMode} = 1;
    }
    else {
        die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: IndividualReference, MultipleReferences\n";
    }

    # Set up reference and database fingerprints similarity search method and parameters...
    my($ComparisonMeasure, $ComparisonMethod, $ApplyComparisonCutoff, $ComparisonCutoff, $KeepTop, @ComparisonMethodParameters);

    $ComparisonMeasure = ''; $ComparisonMethod = '';
    @ComparisonMethodParameters = ();

    if ($ReferenceInfoRef->{FingerprintsBitVectorStringMode}) {
        $ComparisonMeasure = $OptionsInfo{SpecifiedBitVectorComparisonMeasure};
        $ComparisonMethod = $OptionsInfo{SpecifiedBitVectorComparisonMeasureMethod};

        # Only these bit-vector measures take additional parameters...
        if ($ComparisonMeasure =~ /^TverskySimilarity$/i) {
            push @ComparisonMethodParameters, $OptionsInfo{Alpha};
        }
        elsif ($ComparisonMeasure =~ /^WeightedTverskySimilarity$/i) {
            push @ComparisonMethodParameters, $OptionsInfo{Alpha};
            push @ComparisonMethodParameters, $OptionsInfo{Beta};
        }
        elsif ($ComparisonMeasure =~ /^WeightedTanimotoSimilarity$/i) {
            push @ComparisonMethodParameters, $OptionsInfo{Beta};
        }
    }
    elsif ($ReferenceInfoRef->{FingerprintsVectorStringMode}) {
        my($SkipValuesCheck);

        $ComparisonMeasure = $OptionsInfo{SpecifiedVectorComparisonMeasure};
        $ComparisonMethod = $OptionsInfo{SpecifiedVectorComparisonMeasuresMethod};

        push @ComparisonMethodParameters, $OptionsInfo{SpecifiedVectorComparisonMode};

        $SkipValuesCheck = $OptionsInfo{Fast} ? 1 : 0;
        push @ComparisonMethodParameters, $SkipValuesCheck;
    }
    else {
        die "Error: Unknown fingerprints string type. Supported values: FingerprintsBitVectorString or FingerprintsVectorString.\n";
    }

    # The cutoff is always applied during individual reference mode; during multiple
    # references mode, it's applied only when requested for group fusion...
    $ApplyComparisonCutoff = $SimilaritySearchInfo{IndividualReferenceMode} ? 1 : (($SimilaritySearchInfo{MultipleReferencesMode} && $OptionsInfo{GroupFusionApplyCutoff}) ? 1 : 0);

    # For distance methods, a similarity search keeps the lowest values; for all other
    # methods, it keeps the highest values. Any other search mode reverses the choice...
    $ComparisonCutoff = ''; $KeepTop = '';
    if ($ComparisonMethod =~ /Distance/i) {
        $ComparisonCutoff = $OptionsInfo{DistanceCutoff};
        $KeepTop = ($OptionsInfo{SearchMode} =~ /^SimilaritySearch$/i) ? 0 : 1;
    }
    else {
        $ComparisonCutoff = $OptionsInfo{SimilarityCutoff};
        $KeepTop = ($OptionsInfo{SearchMode} =~ /^SimilaritySearch$/i) ? 1 : 0;
    }

    $SimilaritySearchInfo{ComparisonMethod} = $ComparisonMethod;
    @{$SimilaritySearchInfo{ComparisonMethodParameters}} = @ComparisonMethodParameters;

    $SimilaritySearchInfo{ComparisonCutoff} = $ComparisonCutoff;
    $SimilaritySearchInfo{KeepTop} = $KeepTop;
    $SimilaritySearchInfo{ApplyComparisonCutoff} = $ApplyComparisonCutoff;

    # Setup references to group fusion methods. Each rule lists the pattern to match
    # against the specified rule, the method to use for fusion and whether the fused value
    # is formatted using the specified precision. Max and Min rely on the list of values
    # being sorted using value of KeepTop: Max is its first value and Min is its last value...
    my($FusionRuleRef, @GroupFusionRules);

    @GroupFusionRules = (
        [qr/^Max$/i, sub { my($ComparisonValuesRef) = @_; return $ComparisonValuesRef->[0]; }, undef],
        [qr/^Min$/i, sub { my($ComparisonValuesRef) = @_; return $ComparisonValuesRef->[$#{$ComparisonValuesRef}]; }, undef],
        [qr/^Mean$/i, \&StatisticsUtil::Mean, 1],
        [qr/^Median$/i, \&StatisticsUtil::Median, 1],
        [qr/^Sum$/i, \&StatisticsUtil::Sum, 1],
        [qr/^Euclidean$/i, \&StatisticsUtil::Euclidean, 1],
    );

    $SimilaritySearchInfo{GroupFusionMethodRef} = undef;
    $SimilaritySearchInfo{ApplyPrecisionDuringFusion} = undef;

    FUSIONRULE: for $FusionRuleRef (@GroupFusionRules) {
        my($RulePattern, $MethodRef, $ApplyPrecision) = @{$FusionRuleRef};

        if ($OptionsInfo{GroupFusionRule} =~ $RulePattern) {
            $SimilaritySearchInfo{GroupFusionMethodRef} = $MethodRef;
            $SimilaritySearchInfo{ApplyPrecisionDuringFusion} = $ApplyPrecision;
            last FUSIONRULE;
        }
    }
    if (!defined $SimilaritySearchInfo{GroupFusionMethodRef}) {
        die "Error: The value specified, $Options{groupfusionrule}, for option \"-g, --GroupFusionRule\" is not valid. Allowed values: Max, Min, Mean, Median, Sum, Euclidean\n";
    }

    # Values need to be sorted for Max and Min rules and for picking the first kNN values...
    $SimilaritySearchInfo{UsekNN} = ($OptionsInfo{kNN} !~ /^All$/i) ? 1 : 0;
    $SimilaritySearchInfo{SortComparisonValues} = (($OptionsInfo{GroupFusionRule} =~ /^(Max|Min)$/i) || $SimilaritySearchInfo{UsekNN}) ? 1 : 0;
}

# Retrieve information about reference fingerprints file...
#
# Fills in FileType, InDelim, FingerprintsFileIOParameters and the fingerprints string
# mode, type and description entries for the reference file in %FingerprintsFilesInfo.
#
sub RetrieveReferenceFingerprintsFileInfo {
    my $ReferenceFile = $FingerprintsFilesInfo{Reference}{FileName};

    my($FileType, $InDelim) = RetrieveFingerprintsFileInfo($ReferenceFile);
    $FingerprintsFilesInfo{Reference}{FileType} = $FileType;
    $FingerprintsFilesInfo{Reference}{InDelim} = $InDelim;

    # Setup reference FingerprintsFileIO parameters...
    %{$FingerprintsFilesInfo{Reference}{FingerprintsFileIOParameters}} = RetrieveFingerprintsFileIOParameters('Reference', $FileType, $ReferenceFile);

    # Make sure reference fingerprints data file contains valid data and retrieve
    # fingerprints string mode information. The returned values are stored, in order,
    # under the keys listed below...
    my @StringInfoKeys = qw(
        FingerprintsStringMode
        FingerprintsBitVectorStringMode
        FingerprintsVectorStringMode
        FirstFingerprintsStringType
        FirstFingerprintsStringDescription
    );
    @{$FingerprintsFilesInfo{Reference}}{@StringInfoKeys} = RetrieveFingerprintsFileFingerprintsStringInfo('Reference', $ReferenceFile);
}

# Retrieve information about database fingerprints file...
#
# Fills in FileType, InDelim, FingerprintsFileIOParameters and the fingerprints string
# mode, type and description entries for the database file in %FingerprintsFilesInfo.
# It also sets up the data fields and data columns to output, the flags for collecting
# database file data and the maximum number of similar molecules to retrieve.
#
sub RetrieveDatabaseFingerprintsFileInfo {
    my($FingerprintsFile, $FileType, $InDelim, $FingerprintsStringMode, $FingerprintsBitVectorStringMode, $FingerprintsVectorStringMode, $FirstFingerprintsStringType, $FirstFingerprintsStringDescription);

    $FingerprintsFile = $FingerprintsFilesInfo{Database}{FileName};
    ($FileType, $InDelim) = RetrieveFingerprintsFileInfo($FingerprintsFile);

    $FingerprintsFilesInfo{Database}{FileType} = $FileType;
    $FingerprintsFilesInfo{Database}{InDelim} = $InDelim;

    # Setup database FingerprintsFileIO parameters...
    %{$FingerprintsFilesInfo{Database}{FingerprintsFileIOParameters}} = RetrieveFingerprintsFileIOParameters('Database', $FileType, $FingerprintsFile);

    # Make sure database fingerprints data file contains valid data and retrieve fingerprints string mode information...
    ($FingerprintsStringMode, $FingerprintsBitVectorStringMode, $FingerprintsVectorStringMode, $FirstFingerprintsStringType, $FirstFingerprintsStringDescription) = RetrieveFingerprintsFileFingerprintsStringInfo('Database', $FingerprintsFile);
    $FingerprintsFilesInfo{Database}{FingerprintsStringMode} = $FingerprintsStringMode;
    $FingerprintsFilesInfo{Database}{FingerprintsBitVectorStringMode} = $FingerprintsBitVectorStringMode;
    $FingerprintsFilesInfo{Database}{FingerprintsVectorStringMode} = $FingerprintsVectorStringMode;
    $FingerprintsFilesInfo{Database}{FirstFingerprintsStringType} = $FirstFingerprintsStringType;
    $FingerprintsFilesInfo{Database}{FirstFingerprintsStringDescription} = $FirstFingerprintsStringDescription;

    # Retrieve database fingerprints data field information for output file...
    #
    RetrieveDatabaseFingerprintsDataFieldsInfo($FingerprintsFile, $FileType, $InDelim);

    # Retrieve database fingerprints text file data columns information for output file...
    #
    RetrieveDatabaseFingerprintsDataColsInfo($FingerprintsFile, $FileType, $InDelim);

    # Any need to collect database compound string or data line for generation of results files...
    $FingerprintsFilesInfo{Database}{CollectCmpdStringData} = ($FileType =~ /^SD$/i) ? 1 : 0;
    $FingerprintsFilesInfo{Database}{CollectDataLine} = ($FileType =~ /^Text$/i && $OptionsInfo{DatabaseDataColsMode} =~ /^(All|Specify)$/i) ? 1 : 0;
    $FingerprintsFilesInfo{Database}{CollectInputFileData} = ($FingerprintsFilesInfo{Database}{CollectCmpdStringData} || $FingerprintsFilesInfo{Database}{CollectDataLine}) ? 1 : 0;

    # Set maximum number of similar compounds to find for individual reference or set of multiple
    # reference compounds...
    #
    SetMaximumSimilarMoleculesToRetrieve($FingerprintsFile, $FileType, $InDelim);
}

# Retrieve database fingerprints data field information...
#
# Resets the data fields entries for the database file in %FingerprintsFilesInfo and, for
# a SD database file, sets up OutputDataFields, OutputCurrentDataFields and the list of
# data fields to output using the specified database data fields mode. It returns right
# after the reset for any other file type.
#
sub RetrieveDatabaseFingerprintsDataFieldsInfo {
    my($FingerprintsFile, $FileType, $InDelim) = @_;
    my($CollectDataFields, $CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef, @DataFieldsToOutput);

    $FingerprintsFilesInfo{Database}{OutputDataFields} = 0;
    @{$FingerprintsFilesInfo{Database}{DataFieldsToOutput}} = ();

    $FingerprintsFilesInfo{Database}{OutputCurrentDataFields} = 0;

    @{$FingerprintsFilesInfo{Database}{AllDataFields}} = ();
    @{$FingerprintsFilesInfo{Database}{CommonDataFields}} = ();
    @{$FingerprintsFilesInfo{Database}{SpecifiedDatabaseDataFields}} = ();

    if ($FileType !~ /^SD$/i) {
        return;
    }

    # No need to go over SD file and collect data fields for SD file during All DatabaseDataFieldsMode as
    # they would be retrieved from database SD file compound string during generation of output files...
    #
    $CollectDataFields = (($OptionsInfo{TextOutput} && $OptionsInfo{DatabaseDataFieldsMode} =~ /^(All|Common)$/i) || ($OptionsInfo{SDOutput} && $OptionsInfo{DatabaseDataFieldsMode} =~ /^Common$/i)) ? 1 : 0;

    ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = (undef) x 3;

    if ($CollectDataFields) {
        # Three-argument open and a lexical filehandle: the file name is never interpreted
        # as part of the open mode and no global bareword handle is involved...
        open my $SDFileRef, '<', $FingerprintsFile or die "Error: Couldn't open $FingerprintsFile: $! \n";
        ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = GetAllAndCommonCmpdDataHeaderLabels($SDFileRef);
        close $SDFileRef;
    }

    @DataFieldsToOutput = ();
    if ($OptionsInfo{DatabaseDataFieldsMode} =~ /^All$/i) {
        if (defined $AllDataFieldsRef) {
            push @DataFieldsToOutput, @{$AllDataFieldsRef};
            push @{$FingerprintsFilesInfo{Database}{AllDataFields}}, @{$AllDataFieldsRef};
        }
        else {
            # Retrieve and output data fields and values dynamically...
            $FingerprintsFilesInfo{Database}{OutputCurrentDataFields} = 1;
        }
    }
    elsif ($OptionsInfo{DatabaseDataFieldsMode} =~ /^Common$/i) {
        if (defined $CommonDataFieldsRef) {
            push @DataFieldsToOutput, @{$CommonDataFieldsRef};
            push @{$FingerprintsFilesInfo{Database}{CommonDataFields}}, @{$CommonDataFieldsRef};
        }
    }
    elsif ($OptionsInfo{DatabaseDataFieldsMode} =~ /^Specify$/i) {
        push @DataFieldsToOutput, @{$OptionsInfo{SpecifiedDatabaseDataFields}};
        push @{$FingerprintsFilesInfo{Database}{SpecifiedDatabaseDataFields}}, @{$OptionsInfo{SpecifiedDatabaseDataFields}};
    }

    # Data fields are written out for all modes except CompoundID...
    if ($OptionsInfo{DatabaseDataFieldsMode} !~ /^CompoundID$/i) {
        $FingerprintsFilesInfo{Database}{OutputDataFields} = 1;
    }

    push @{$FingerprintsFilesInfo{Database}{DataFieldsToOutput}}, @DataFieldsToOutput;

}

# Retrieve database fingerprints data columns information...
#
# For text files, reads the column label line of the database fingerprints file,
# builds label <-> zero-based column number maps, and works out which columns are
# written to the output based on DatabaseDataColsMode and DatabaseColMode.
#
# Parameters:
#   $FingerprintsFile - name of the database fingerprints file
#   $FileType         - file type; anything other than Text returns right after the reset
#   $InDelim          - delimiter name (Tab, semicolon, or anything else for comma)
#
# Sets in $FingerprintsFilesInfo{Database}: OutputDataCols, DataColLabels,
# DataColLabelToNumMap, DataColNumToLabelMap, DataColNumsToOutput and
# DataColLabelsToOutput. Dies on an invalid column number or label.
#
sub RetrieveDatabaseFingerprintsDataColsInfo {
  my($FingerprintsFile, $FileType, $InDelim) = @_;
  my($Line, $ColNum, $ColLabel, $NumOfCols, @DataColLabels, @DataColLabelsToOutput, @DataColNumsToOutput, %DataColLabelToNumMap, %DataColNumToLabelMap);

  $FingerprintsFilesInfo{Database}{OutputDataCols} = 0;

  @{$FingerprintsFilesInfo{Database}{DataColLabels}} = ();
  %{$FingerprintsFilesInfo{Database}{DataColLabelToNumMap}} = ();
  %{$FingerprintsFilesInfo{Database}{DataColNumToLabelMap}} = ();

  @{$FingerprintsFilesInfo{Database}{DataColNumsToOutput}} = ();
  @{$FingerprintsFilesInfo{Database}{DataColLabelsToOutput}} = ();

  if ($FileType !~ /^Text$/i) {
    return;
  }

  @DataColLabels = ();
  @DataColLabelsToOutput = ();
  @DataColNumsToOutput = ();

  %DataColLabelToNumMap = ();
  %DataColNumToLabelMap = ();

  # Get column label line. Three-argument open with a lexical handle keeps a file
  # name supplied on the command line from being interpreted as an open mode...
  open(my $TextFile, '<', $FingerprintsFile) or die "Error: Couldn't open $FingerprintsFile: $! \n";
  $Line = TextUtil::GetTextLine($TextFile);
  close $TextFile;

  $InDelim = ($InDelim =~ /^Tab$/i) ? "\t" : ($InDelim =~ /semicolon/i ? "\;" : "\,");

  @DataColLabels = TextUtil::SplitWords($Line, $InDelim);
  $NumOfCols = scalar @DataColLabels;

  for $ColNum (0 .. $#DataColLabels) {
    $ColLabel = $DataColLabels[$ColNum];
    $DataColLabelToNumMap{$ColLabel} = $ColNum;
    $DataColNumToLabelMap{$ColNum} = $ColLabel;
  }

  if ($OptionsInfo{DatabaseDataColsMode} =~ /^Specify$/i) {
    if ($OptionsInfo{DatabaseColMode} =~ /^ColNum$/i) {
      # Column numbers on the command line start at 1; they are stored zero-based...
      for $ColNum (@{$OptionsInfo{SpecifiedDatabaseDataCols}}) {
        if ($ColNum > $NumOfCols) {
          die "Error: Column number, $ColNum, specified using \"--DatabaseDataCols\" is not valid: It must be <= $NumOfCols\n";
        }
        push @DataColNumsToOutput, ($ColNum - 1);
      }
    }
    elsif ($OptionsInfo{DatabaseColMode} =~ /^ColLabel$/i) {
      for $ColLabel (@{$OptionsInfo{SpecifiedDatabaseDataCols}}) {
        if (!exists $DataColLabelToNumMap{$ColLabel}) {
          die "Error: Column label, $ColLabel, specified using \"--DatabaseDataCols\" is not valid: It doesn't exist\n";
        }
        push @DataColNumsToOutput, $DataColLabelToNumMap{$ColLabel};
      }
    }
  }
  elsif ($OptionsInfo{DatabaseDataColsMode} =~ /^All$/i) {
    @DataColNumsToOutput = (0 .. $#DataColLabels);
  }

  # Setup data column labels to output. The labels must be looked up by the selected
  # column numbers themselves, not by their positions in the selection list; otherwise
  # specified columns would be labeled with the names of the first N columns...
  if (scalar @DataColNumsToOutput) {
    @DataColLabelsToOutput = map { $DataColNumToLabelMap{$_} } @DataColNumsToOutput;
  }

  $FingerprintsFilesInfo{Database}{OutputDataCols} = scalar @DataColNumsToOutput ? 1 : 0;

  @{$FingerprintsFilesInfo{Database}{DataColLabels}} = @DataColLabels;
  %{$FingerprintsFilesInfo{Database}{DataColLabelToNumMap}} = %DataColLabelToNumMap;
  %{$FingerprintsFilesInfo{Database}{DataColNumToLabelMap}} = %DataColNumToLabelMap;

  @{$FingerprintsFilesInfo{Database}{DataColNumsToOutput}} = @DataColNumsToOutput;
  @{$FingerprintsFilesInfo{Database}{DataColLabelsToOutput}} = @DataColLabelsToOutput;
}

# Set maximum number of similar compounds to find for individual reference of set of multiple
# reference compounds...
#
# Only acts for the PercentSimilar value of SimilarCountMode: counts the entries in
# the database fingerprints file (unless a count is already stored for an SD file) and
# sets $OptionsInfo{MaxSimilarMolecules} to PercentSimilarMolecules percent of that
# count, truncated to an integer and never less than 1.
#
# Parameters:
#   $FingerprintsFile - name of the database fingerprints file
#   $FileType         - SD, Text or FP; any other type yields a count of 0
#   $InDelim          - accepted for a uniform call signature; not used here
#
sub SetMaximumSimilarMoleculesToRetrieve {
  my($FingerprintsFile, $FileType, $InDelim) = @_;
  my($MaxSimilarMolecules, $NumOfDatabaseMolecules, $PercentSimilarMolecules, $Line);

  if ($OptionsInfo{SimilarCountMode} !~ /^PercentSimilar$/i) {
    return;
  }

  $PercentSimilarMolecules = $OptionsInfo{PercentSimilarMolecules};

  # Count database entries to figure out MaxSimilarMolecules using PercentSimilarMolecules
  # value...
  $NumOfDatabaseMolecules = 0;
  if ($FileType =~ /^SD$/i && exists($FingerprintsFilesInfo{Database}{NumOfDatabaseMolecules})) {
    # It might already be counted for SD file...
    $NumOfDatabaseMolecules = $FingerprintsFilesInfo{Database}{NumOfDatabaseMolecules};
  }
  else {
    print "Calculating maximum number of similar molecules to retrieve for \"PercentSimilar\" value of \"--SimilarCountMode\" option by counting number of molecules in database fingerprints file...\n";

    # Three-argument open with a lexical handle keeps a file name supplied on the
    # command line from being interpreted as an open mode or a pipe...
    open(my $FingerprintsFileHandle, '<', $FingerprintsFile) or die "Error: Couldn't open $FingerprintsFile: $! \n";

    # NOTE(review): each loop stops at the first false value returned by
    # TextUtil::GetTextLine, which is defined outside this file - confirm it does not
    # return an empty string for blank lines in the middle of a file.
    if ($FileType =~ /^SD$/i) {
      # One "$$$$" delimiter line per compound...
      while ($Line = TextUtil::GetTextLine($FingerprintsFileHandle)) {
        if ($Line =~ /^\$\$\$\$/) {
          $NumOfDatabaseMolecules++;
        }
      }
    }
    elsif ($FileType =~ /^Text$/i) {
      # Ignore column label line...
      $Line = TextUtil::GetTextLine($FingerprintsFileHandle);
      while ($Line = TextUtil::GetTextLine($FingerprintsFileHandle)) {
        $NumOfDatabaseMolecules++;
      }
    }
    elsif ($FileType =~ /^FP$/i) {
      # Lines starting with "#" are not counted...
      while ($Line = TextUtil::GetTextLine($FingerprintsFileHandle)) {
        if ($Line !~ /^#/) {
          $NumOfDatabaseMolecules++;
        }
      }
    }
    else {
      $NumOfDatabaseMolecules = 0;
    }
    close $FingerprintsFileHandle;

    $FingerprintsFilesInfo{Database}{NumOfDatabaseMolecules} = $NumOfDatabaseMolecules;
  }

  $MaxSimilarMolecules = int (($NumOfDatabaseMolecules * $PercentSimilarMolecules)/100);
  if ($MaxSimilarMolecules < 1) {
    $MaxSimilarMolecules = 1;
  }

  $OptionsInfo{MaxSimilarMolecules} = $MaxSimilarMolecules;
}

# Retrieve information about fingerprints file...
#
# Parameters:
#   $FingerprintsFile - name of a fingerprints file
#
# Returns the list ($FileType, $InDelim). $InDelim is an empty string unless the file
# type is Text; for text files it is 'Tab' for a tsv extension and the value of
# $OptionsInfo{InDelim} otherwise. Dies when the file doesn't exist or its type can't
# be determined.
#
sub RetrieveFingerprintsFileInfo {
  my($FingerprintsFile) = @_;
  my($FileType, $InDelim, $FileDir, $FileExt, $FileName);

  if (!(-e $FingerprintsFile)) {
    die "Error: Input fingerprints file, $FingerprintsFile, doesn't exist.\n";
  }

  $FileType = Fingerprints::FingerprintsFileUtil::GetFingerprintsFileType($FingerprintsFile);
  if (IsEmpty($FileType)) {
    die "Error: Input file, $FingerprintsFile, is not a fingerprints file.\n";
  }

  $InDelim = '';
  if ($FileType =~ /^Text$/i) {
    $FileDir = ""; $FileName = ""; $FileExt = "";
    ($FileDir, $FileName, $FileExt) = ParseFileName($FingerprintsFile);
    $InDelim = ($FileExt =~ /^tsv$/i) ? 'Tab' : $OptionsInfo{InDelim};
  }

  return ($FileType, $InDelim);
}

# Retrieve fingerprints file IO parameters...
#
# Parameters:
#   $FingerprintsFileMode - Reference or Database (matched case-sensitively); selects
#                           which set of "<Mode>..." options is used
#   $FileType             - SD, FP or Text
#   $FingerprintsFile     - name of the fingerprints file
#
# Returns a hash of named parameters for reading the file; these are later passed to
# Fingerprints::FingerprintsFileUtil::NewFingerprintsFileIO. Dies for an unknown mode
# or file type.
#
sub RetrieveFingerprintsFileIOParameters {
  my($FingerprintsFileMode, $FileType, $FingerprintsFile) = @_;
  my(%FingerprintsFileIOParams);

  if ($FingerprintsFileMode !~ /^(Reference|Database)$/) {
    die "Error: Unknown fingerprints file mode: $FingerprintsFileMode. Supported values: Reference or Database\n";
  }

  # Parameters shared by all supported file types...
  %FingerprintsFileIOParams = ('Name' => $FingerprintsFile, 'Mode' => 'Read', 'FingerprintsStringMode' => $OptionsInfo{FingerprintsMode}, 'ValidateData' => $OptionsInfo{ValidateData}, 'DetailLevel' => $OptionsInfo{Detail});

  if ($FileType =~ /^SD$/i) {
    # SD files: fingerprints and compound IDs come from data fields...
    $FingerprintsFileIOParams{FingerprintsFieldLabel} = $OptionsInfo{"${FingerprintsFileMode}FingerprintsField"};
    $FingerprintsFileIOParams{CompoundIDMode} = $OptionsInfo{"${FingerprintsFileMode}CompoundIDMode"};
    $FingerprintsFileIOParams{CompoundIDFieldLabel} = $OptionsInfo{"${FingerprintsFileMode}CompoundIDField"};
    $FingerprintsFileIOParams{CompoundIDPrefix} = $OptionsInfo{"${FingerprintsFileMode}CompoundIDPrefix"};
  }
  elsif ($FileType =~ /^FP$/i) {
    # FP files need nothing beyond the shared parameters...
  }
  elsif ($FileType =~ /^Text$/i) {
    # Text files: fingerprints and compound IDs come from columns...
    $FingerprintsFileIOParams{FingerprintsCol} = $OptionsInfo{"${FingerprintsFileMode}FingerprintsCol"};
    $FingerprintsFileIOParams{ColMode} = $OptionsInfo{"${FingerprintsFileMode}ColMode"};
    $FingerprintsFileIOParams{CompoundIDCol} = $OptionsInfo{"${FingerprintsFileMode}CompoundIDCol"};
    $FingerprintsFileIOParams{CompoundIDPrefix} = $OptionsInfo{"${FingerprintsFileMode}CompoundIDPrefix"};
    $FingerprintsFileIOParams{InDelim} = $FingerprintsFilesInfo{$FingerprintsFileMode}{InDelim};
  }
  else {
    die "Error: Fingerprints file type, $FileType, is not valid. Supported file types: SD, FP or Text\n";
  }

  return %FingerprintsFileIOParams;
}

# Make sure fingerprints data file contains valid data and retrieve fingerprints string mode information...
#
# Opens the fingerprints file using the IO parameters stored under
# $FingerprintsFilesInfo{$FingerprintsFileMode}{FingerprintsFileIOParameters}, checks
# that its data is valid, and collects the fingerprints string information.
#
# Parameters:
#   $FingerprintsFileMode - key into %FingerprintsFilesInfo (Reference or Database);
#                           also used to name the file in error messages
#   $FingerprintsFile     - file name; used only in error messages
#
# Returns the list ($FingerprintsStringMode, $FingerprintsBitVectorStringMode,
# $FingerprintsVectorStringMode, $FirstFingerprintsStringType,
# $FirstFingerprintsStringDescription). Dies when the file IO object can't be created
# or reports invalid data.
#
sub RetrieveFingerprintsFileFingerprintsStringInfo {
  my($FingerprintsFileMode, $FingerprintsFile) = @_;
  my($FingerprintsFileIO, $FingerprintsStringMode, $FingerprintsBitVectorStringMode, $FingerprintsVectorStringMode, $FirstFingerprintsStringType, $FirstFingerprintsStringDescription);

  $FingerprintsFileIO = Fingerprints::FingerprintsFileUtil::NewFingerprintsFileIO(%{$FingerprintsFilesInfo{$FingerprintsFileMode}{FingerprintsFileIOParameters}});

  # Report the file by its actual role: this routine is shared by reference and
  # database files, so the role must not be hard-coded in the message...
  if (!$FingerprintsFileIO) {
    die "Error: $FingerprintsFileMode fingerprints file, $FingerprintsFile, contains invalid fingerprints data.\n";
  }
  if (!$FingerprintsFileIO->IsFingerprintsFileDataValid()) {
    die "Error: $FingerprintsFileMode fingerprints file, $FingerprintsFile, contains invalid fingerprints data.\n";
  }

  $FingerprintsStringMode = $FingerprintsFileIO->GetFingerprintsStringMode();
  $FingerprintsBitVectorStringMode = $FingerprintsFileIO->GetFingerprintsBitVectorStringMode();
  $FingerprintsVectorStringMode = $FingerprintsFileIO->GetFingerprintsVectorStringMode();

  $FirstFingerprintsStringType = $FingerprintsFileIO->GetFirstFingerprintsStringType();
  $FirstFingerprintsStringDescription = $FingerprintsFileIO->GetFirstFingerprintsStringDescription();

  $FingerprintsFileIO->Close();

  return ($FingerprintsStringMode, $FingerprintsBitVectorStringMode, $FingerprintsVectorStringMode, $FirstFingerprintsStringType, $FirstFingerprintsStringDescription);
}

# Retrieve output files names using reference fingerprints file name...
#
# Builds the SD and text output file names from the "-r --root" value or, when it is
# not specified, from the reference fingerprints file name with "SimilaritySearching"
# appended. Stores OutFileRoot, SDOutFileName and TextOutFileName in %OutputFilesInfo.
#
# Dies when an enabled output file has the same name as the reference or database
# input file, or when it already exists and overwriting is not enabled.
#
sub RetrieveOutputFilesInfo {
  my($FingerprintsFile, $FileDir, $FileExt, $FileName, $OutFileRoot, $SDOutFileName, $TextOutFileName, $SDOutFileExt, $TextOutFileExt, $ReferenceFileName, $DatabaseFileName);

  $OutputFilesInfo{OutFileRoot} = '';
  $OutputFilesInfo{SDOutFileName} = '';
  $OutputFilesInfo{TextOutFileName} = '';

  $FingerprintsFile = $FingerprintsFilesInfo{Reference}{FileName};

  $FileDir = ""; $FileName = ""; $FileExt = "";
  ($FileDir, $FileName, $FileExt) = ParseFileName($FingerprintsFile);

  $SDOutFileExt = "sdf";
  $TextOutFileExt = ($Options{outdelim} =~ /^tab$/i) ? "tsv" : "csv";

  if ($OptionsInfo{OutFileRoot}) {
    # When the specified root parses into a name and an extension, only the name is used...
    my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
    if ($RootFileName && $RootFileExt) {
      $FileName = $RootFileName;
    }
    else {
      $FileName = $OptionsInfo{OutFileRoot};
    }
    $OutFileRoot = $FileName;
  }
  else {
    $OutFileRoot = "${FileName}SimilaritySearching";
  }

  $SDOutFileName = "${OutFileRoot}.${SDOutFileExt}";
  $TextOutFileName = "${OutFileRoot}.${TextOutFileExt}";

  $ReferenceFileName = $FingerprintsFilesInfo{Reference}{FileName};
  $DatabaseFileName = $FingerprintsFilesInfo{Database}{FileName};

  # Names are compared as case-insensitive strings. File names must not be interpolated
  # into a regular expression: characters such as ".", "(", "+" or "[" in a name would
  # either defeat the check or abort with a pattern compilation error...
  if ($OptionsInfo{SDOutput}) {
    if (lc($SDOutFileName) eq lc($ReferenceFileName)) {
      die "Error: Same output, $SDOutFileName, and reference input file names.\nSpecify a different name using \"-r --root\" option or use default name.\n";
    }
    if (lc($SDOutFileName) eq lc($DatabaseFileName)) {
      die "Error: Same output, $SDOutFileName, and database input file names.\nSpecify a different name using \"-r --root\" option or use default name.\n";
    }
  }

  if ($OptionsInfo{TextOutput}) {
    if (lc($TextOutFileName) eq lc($ReferenceFileName)) {
      die "Error: Same output, $TextOutFileName, and reference input file names.\nSpecify a different name using \"-r --root\" option or use default name.\n";
    }
    if (lc($TextOutFileName) eq lc($DatabaseFileName)) {
      die "Error: Same output, $TextOutFileName, and database input file names.\nSpecify a different name using \"-r --root\" option or use default name.\n";
    }
  }

  if (!$OptionsInfo{OverwriteFiles}) {
    if ($OptionsInfo{SDOutput}) {
      if (-e $SDOutFileName) {
        die "Error: The output file $SDOutFileName already exists.\n";
      }
    }
    if ($OptionsInfo{TextOutput}) {
      if (-e $TextOutFileName) {
        die "Error: The output file $TextOutFileName already exists.\n";
      }
    }
  }

  $OutputFilesInfo{OutFileRoot} = $OutFileRoot;
  $OutputFilesInfo{SDOutFileName} = $SDOutFileName;
  $OutputFilesInfo{TextOutFileName} = $TextOutFileName;

}

# Process input fingerprints file names...
#
# Fills @FingerprintsFilesList with the reference file name followed by the database
# file name, taken from the two command line arguments. Dies with the usage text when
# the number of arguments is not exactly two.
#
sub ProcessFingerprintsFileNames {
  @FingerprintsFilesList = ();

  if (@ARGV != 2) {
    die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  }

  # Reference fingerprints file name...
  push @FingerprintsFilesList, $ARGV[0];

  # Database fingerprints file name...
  push @FingerprintsFilesList, $ARGV[1];

}

# Process option values...
#
# Copies the command line option values from %Options into %OptionsInfo, converting
# Yes/No and keyword values into flags and delimiter characters, and delegates
# comparison and reference/database data options to their own routines. Dies when the
# kNN value is neither "All" nor a positive integer.
#
sub ProcessOptions {
  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};
  $OptionsInfo{FingerprintsMode} = $Options{fingerprintsmode};

  $OptionsInfo{SearchMode} = $Options{searchmode};

  ProcessBitVectorComparisonOptions();
  ProcessVectorComparisonOptions();

  $OptionsInfo{GroupFusionRule} = $Options{groupfusionrule};
  $OptionsInfo{GroupFusionApplyCutoff} = ($Options{groupfusionapplycutoff} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{SimilarCountMode} = $Options{similarcountmode};
  $OptionsInfo{NumOfSimilarMolecules} = $Options{numofsimilarmolecules};
  $OptionsInfo{PercentSimilarMolecules} = $Options{percentsimilarmolecules};

  # Set MaxSimilarMolecules to NumOfSimilarMolecules. For PercentSimilar value of SimilarCountMode,
  # it'll be overwritten using number of entries in database fingerprints file and value of PercentSimilarMolecules...
  #
  $OptionsInfo{MaxSimilarMolecules} = $OptionsInfo{NumOfSimilarMolecules};

  $OptionsInfo{SimilarityCutoff} = $Options{similaritycutoff};
  $OptionsInfo{DistanceCutoff} = $Options{distancecutoff};

  $OptionsInfo{kNN} = $Options{knn};
  if ($Options{knn} !~ /^All$/i) {
    if (!IsPositiveInteger($Options{knn})) {
      die "Error: The value specified, $Options{knn}, for option \"-k, --KNN\" is not valid. Allowed values: > 0 \n";
    }
  }

  ProcessReferenceFingerprintsDataOptions();
  ProcessDatabaseFingerprintsDataOptions();

  $OptionsInfo{Detail} = $Options{detail};

  # InDelim stays a keyword; OutDelim is converted to the delimiter character...
  $OptionsInfo{InDelim} = $Options{indelim};
  $OptionsInfo{OutDelim} = ($Options{outdelim} =~ /tab/i ) ? "\t" : (($Options{outdelim} =~ /semicolon/i) ? "\;" : "\,");
  $OptionsInfo{OutQuote} = ($Options{quote} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{Output} = $Options{output};
  $OptionsInfo{SDOutput} = ($Options{output} =~ /^(SD|Both)$/i) ? 1 : 0;
  $OptionsInfo{TextOutput} = ($Options{output} =~ /^(Text|Both)$/i) ? 1 : 0;

  $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
  $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;

  # Fast mode turns off validation of fingerprints data...
  $OptionsInfo{Fast} = $Options{fast} ? 1 : 0;
  $OptionsInfo{ValidateData} = $Options{fast} ? 0 : 1;

  $OptionsInfo{Precision} = $Options{precision};
}

# Process options related to comparison of bit vector strings...
#
# Validates the "-b, --BitVectorComparisonMode" value against the similarity
# coefficients reported by Fingerprints::FingerprintsBitVector and stores the measure,
# its display name and its method name in %OptionsInfo. Also validates and stores the
# alpha and beta parameters for the Tversky and weighted measures. Dies on any invalid
# value.
#
sub ProcessBitVectorComparisonOptions {
  # Setup supported bit vector similarity coefficients for bit vector strings...
  my($ComparisonMeasure, $SupportedComparisonMeasure, @SupportedComparisonMeasures, %SupportedComparisonMeasuresNameMap, %SupportedComparisonMeasuresMethodMap);

  @SupportedComparisonMeasures = ();
  %SupportedComparisonMeasuresNameMap = ();
  %SupportedComparisonMeasuresMethodMap = ();

  for $SupportedComparisonMeasure (Fingerprints::FingerprintsBitVector::GetSupportedSimilarityCoefficients()) {
    # Similarity coefficient function/method names contain "Coefficient" in their names.
    # So take 'em out and setup a map to original function/method name...
    $ComparisonMeasure = $SupportedComparisonMeasure;
    $ComparisonMeasure =~ s/Coefficient$//;

    push @SupportedComparisonMeasures, $ComparisonMeasure;
    $SupportedComparisonMeasuresNameMap{lc($ComparisonMeasure)} = $ComparisonMeasure;
    $SupportedComparisonMeasuresMethodMap{lc($ComparisonMeasure)} = $SupportedComparisonMeasure;
  }

  # Setup similarity coefficient to use for calculating similarity matrices for bit vector strings...
  my($SpecifiedMeasure, $SpecifiedComparisonMeasureName, $SpecifiedComparisonMeasureMethod);

  $SpecifiedComparisonMeasureName = '';
  $SpecifiedComparisonMeasureMethod = '';

  $SpecifiedMeasure = $Options{bitvectorcomparisonmode};

  if (! exists $SupportedComparisonMeasuresMethodMap{lc($SpecifiedMeasure)} ) {
    die "Error: The value specified, $SpecifiedMeasure, for option \"-b --BitVectorComparisonMode\" is not valid.\nAllowed values:", JoinWords(\@SupportedComparisonMeasures, ", ", 0), "\n";
  }

  $SpecifiedComparisonMeasureMethod = $SupportedComparisonMeasuresMethodMap{lc($SpecifiedMeasure)};
  $SpecifiedComparisonMeasureName = $SupportedComparisonMeasuresNameMap{lc($SpecifiedMeasure)};

  $OptionsInfo{BitVectorComparisonMode} = $Options{bitvectorcomparisonmode};

  $OptionsInfo{SpecifiedBitVectorComparisonMeasure} = $SpecifiedMeasure;
  $OptionsInfo{SpecifiedBitVectorComparisonMeasureName} = $SpecifiedComparisonMeasureName;
  $OptionsInfo{SpecifiedBitVectorComparisonMeasureMethod} = $SpecifiedComparisonMeasureMethod;

  # Make sure valid alpha parameter is specified for Tversky calculation. The measure is
  # selected using "-b, --BitVectorComparisonMode", so that is the option named in the
  # error messages...
  $OptionsInfo{Alpha} = '';
  if ($SpecifiedMeasure =~ /^(TverskySimilarity|WeightedTverskySimilarity)$/i) {
    if (IsEmpty($Options{alpha})) {
      die "Error: You must specify a value for \"-a, --alpha\" option in \"TverskySimilarity or WeightedTverskySimilarity\" \"-b, --BitVectorComparisonMode\". \n";
    }
    my($Alpha);
    $Alpha = $Options{alpha};
    if (!(IsFloat($Alpha) && $Alpha >=0 && $Alpha <= 1)) {
      die "Error: The value specified, $Options{alpha}, for option \"-a, --alpha\" is not valid. Allowed values: >= 0 and <= 1\n";
    }
    $OptionsInfo{Alpha} = $Alpha;
  }

  # Make sure valid beta parameter is specified for WeightedTanimoto and WeightedTversky
  # calculations. The beta option is named "--beta" in the messages: "-b" is the short
  # form of "--BitVectorComparisonMode" in this script...
  $OptionsInfo{Beta} = '';
  if ($SpecifiedMeasure =~ /^(WeightedTverskySimilarity|WeightedTanimotoSimilarity)$/i) {
    if (IsEmpty($Options{beta})) {
      die "Error: You must specify a value for \"--beta\" option in \"WeightedTverskySimilarity or WeightedTanimotoSimilarity\" \"-b, --BitVectorComparisonMode\". \n";
    }
    my($Beta);
    $Beta = $Options{beta};
    if (!(IsFloat($Beta) && $Beta >=0 && $Beta <= 1)) {
      die "Error: The value specified, $Options{beta}, for option \"--beta\" is not valid. Allowed values: >= 0 and <= 1\n";
    }
    $OptionsInfo{Beta} = $Beta;
  }
}

# Process options related to comparison of vector strings...
#
# Validates the "-v, --VectorComparisonMode" value against the distance and similarity
# coefficients reported by Fingerprints::FingerprintsVector, validates the
# "--VectorComparisonFormulism" value, and stores both in %OptionsInfo. Spaces in the
# specified values are ignored. Dies on any invalid value.
#
sub ProcessVectorComparisonOptions {
  # Setup specified similarity coefficients for vector strings..
  my($ComparisonMeasure, $SupportedComparisonMeasure, @SupportedComparisonMeasures, %SupportedComparisonMeasuresNameMap, %SupportedComparisonMeasuresMethodMap);

  @SupportedComparisonMeasures = ();
  %SupportedComparisonMeasuresNameMap = ();
  %SupportedComparisonMeasuresMethodMap = ();
  for $SupportedComparisonMeasure (Fingerprints::FingerprintsVector::GetSupportedDistanceAndSimilarityCoefficients()) {
    # Similarity and distance coefficient function/method names contain "Coefficient" in their names.
    # So take 'em out and setup a map to original function/method name...
    $ComparisonMeasure = $SupportedComparisonMeasure;
    if ($ComparisonMeasure =~ /Coefficient$/i) {
      $ComparisonMeasure =~ s/Coefficient$//i;
    }
    push @SupportedComparisonMeasures, $ComparisonMeasure;
    $SupportedComparisonMeasuresNameMap{lc($ComparisonMeasure)} = $ComparisonMeasure;
    $SupportedComparisonMeasuresMethodMap{lc($ComparisonMeasure)} = $SupportedComparisonMeasure;
  }

  # Setup similarity or distance coefficient to use for comparing vector strings...
  my($SpecifiedMeasure, $SpecifiedComparisonMeasureName, $SpecifiedComparisonMeasureMethod);

  $SpecifiedComparisonMeasureName = '';
  $SpecifiedComparisonMeasureMethod = '';

  $SpecifiedMeasure = $Options{vectorcomparisonmode};
  $SpecifiedMeasure =~ s/ //g;

  if (! exists($SupportedComparisonMeasuresMethodMap{lc($SpecifiedMeasure)})) {
    die "Error: The value specified, $SpecifiedMeasure, for option \"-v --VectorComparisonMode\" is not valid.\nAllowed values:", JoinWords(\@SupportedComparisonMeasures, ", ", 0), "\n";
  }

  $SpecifiedComparisonMeasureMethod = $SupportedComparisonMeasuresMethodMap{lc($SpecifiedMeasure)};
  $SpecifiedComparisonMeasureName = $SupportedComparisonMeasuresNameMap{lc($SpecifiedMeasure)};

  $OptionsInfo{VectorComparisonMode} = $Options{vectorcomparisonmode};

  $OptionsInfo{SpecifiedVectorComparisonMeasure} = $SpecifiedMeasure;
  $OptionsInfo{SpecifiedVectorComparisonMeasuresName} = $SpecifiedComparisonMeasureName;
  $OptionsInfo{SpecifiedVectorComparisonMeasuresMethod} = $SpecifiedComparisonMeasureMethod;

  # Setup specified vector comparison calculation modes...
  my($SpecifiedFormulism);

  $SpecifiedFormulism = $Options{vectorcomparisonformulism};
  $SpecifiedFormulism =~ s/ //g;
  if ($SpecifiedFormulism !~ /^(AlgebraicForm|BinaryForm|SetTheoreticForm)$/i) {
    die "Error: The value specified, $SpecifiedFormulism, for option \"--VectorComparisonFormulism\" is not valid. Allowed values: AlgebraicForm, BinaryForm or SetTheoreticForm\n";
  }

  $OptionsInfo{VectorComparisonFormulism} = $Options{vectorcomparisonformulism};
  $OptionsInfo{SpecifiedVectorComparisonMode} = $SpecifiedFormulism;

}

# Process options related to data retrieval from reference fingerprints SD and CSV/TSV
# text files...
#
# Stores the reference compound ID prefix, column mode, compound ID and fingerprints
# columns (text files) and compound ID mode, compound ID and fingerprints fields (SD
# files) in %OptionsInfo. Unspecified columns and the fingerprints field default to
# 'AutoDetect'. Dies on invalid column numbers, on identical compound ID and
# fingerprints columns or fields, and on a missing compound ID field in DataField mode.
#
sub ProcessReferenceFingerprintsDataOptions {

  $OptionsInfo{ReferenceCompoundIDPrefix} = $Options{referencecompoundidprefix} ? $Options{referencecompoundidprefix} : 'Cmpd';

  # Compound ID and fingerprints column options for text files...

  $OptionsInfo{ReferenceColMode} = $Options{referencecolmode};

  if (IsNotEmpty($Options{referencecompoundidcol})) {
    if ($Options{referencecolmode} =~ /^ColNum$/i) {
      if (!IsPositiveInteger($Options{referencecompoundidcol})) {
        die "Error: Column value, $Options{referencecompoundidcol}, specified using \"--ReferenceCompoundIDCol\" is not valid: Allowed integer values: > 0\n";
      }
    }
    $OptionsInfo{ReferenceCompoundIDCol} = $Options{referencecompoundidcol};
  }
  else {
    $OptionsInfo{ReferenceCompoundIDCol} = 'AutoDetect';
  }

  if (IsNotEmpty($Options{referencefingerprintscol})) {
    if ($Options{referencecolmode} =~ /^ColNum$/i) {
      if (!IsPositiveInteger($Options{referencefingerprintscol})) {
        die "Error: Column value, $Options{referencefingerprintscol}, specified using \"--ReferenceFingerprintsCol\" is not valid: Allowed integer values: > 0\n";
      }
    }
    $OptionsInfo{ReferenceFingerprintsCol} = $Options{referencefingerprintscol};
  }
  else {
    $OptionsInfo{ReferenceFingerprintsCol} = 'AutoDetect';
  }

  # Compound ID and fingerprints columns must differ: compared as numbers when both are
  # positive integers and as strings otherwise...
  if (IsNotEmpty($Options{referencecompoundidcol}) && IsNotEmpty($Options{referencefingerprintscol})) {
    if (IsPositiveInteger($Options{referencecompoundidcol}) && IsPositiveInteger($Options{referencefingerprintscol})) {
      if (($Options{referencecompoundidcol} == $Options{referencefingerprintscol})) {
        die "Error: Values specified using \"--ReferenceCompoundIDCol\" and \"--ReferenceFingerprintsCol\", $Options{referencecompoundidcol}, must be different.\n";
      }
    }
    else {
      if (($Options{referencecompoundidcol} eq $Options{referencefingerprintscol})) {
        die "Error: Values specified using \"--ReferenceCompoundIDCol\" and \"--ReferenceFingerprintsCol\", $Options{referencecompoundidcol}, must be different.\n";
      }
    }
  }

  # Compound ID and fingerprints field options for SD files...

  $OptionsInfo{ReferenceCompoundIDMode} = $Options{referencecompoundidmode};
  $OptionsInfo{ReferenceCompoundIDField} = '';

  if ($Options{referencecompoundidmode} =~ /^DataField$/i && !$Options{referencecompoundidfield}) {
    die "Error: You must specify a value for \"--ReferenceCompoundIDField\" option in \"DataField\" \"--ReferenceCompoundIDMode\". \n";
  }
  if ($Options{referencecompoundidfield}) {
    $OptionsInfo{ReferenceCompoundIDField} = $Options{referencecompoundidfield};
  }

  if (IsNotEmpty($Options{referencefingerprintsfield})) {
    $OptionsInfo{ReferenceFingerprintsField} = $Options{referencefingerprintsfield};
  }
  else {
    $OptionsInfo{ReferenceFingerprintsField} = 'AutoDetect';
  }

  if ($Options{referencecompoundidfield} && IsNotEmpty($Options{referencefingerprintsfield})) {
    if (($Options{referencecompoundidfield} eq $Options{referencefingerprintsfield})) {
      die "Error: Values specified using \"--ReferenceCompoundIDField\" and \"--ReferenceFingerprintsfield\", $Options{referencecompoundidfield}, must be different.\n";
    }
  }

}

# Process options related to data retrieval from database fingerprints SD and CSV/TSV
# text files...
#
# Stores the database counterparts of the reference data options in %OptionsInfo and,
# in addition, the data columns (text files) and data fields (SD files) to carry over
# into the output files. Dies on invalid column numbers, on identical compound ID and
# fingerprints columns or fields, and on missing values required by the Specify and
# DataField modes.
#
sub ProcessDatabaseFingerprintsDataOptions {

  $OptionsInfo{DatabaseCompoundIDPrefix} = $Options{databasecompoundidprefix} ? $Options{databasecompoundidprefix} : 'Cmpd';

  # Compound ID and fingerprints column options for text files...

  $OptionsInfo{DatabaseColMode} = $Options{databasecolmode};

  if (IsNotEmpty($Options{databasecompoundidcol})) {
    if ($Options{databasecolmode} =~ /^ColNum$/i) {
      if (!IsPositiveInteger($Options{databasecompoundidcol})) {
        die "Error: Column value, $Options{databasecompoundidcol}, specified using \"--DatabaseCompoundIDCol\" is not valid: Allowed integer values: > 0\n";
      }
    }
    $OptionsInfo{DatabaseCompoundIDCol} = $Options{databasecompoundidcol};
  }
  else {
    $OptionsInfo{DatabaseCompoundIDCol} = 'AutoDetect';
  }

  if (IsNotEmpty($Options{databasefingerprintscol})) {
    if ($Options{databasecolmode} =~ /^ColNum$/i) {
      if (!IsPositiveInteger($Options{databasefingerprintscol})) {
        die "Error: Column value, $Options{databasefingerprintscol}, specified using \"--DatabaseFingerprintsCol\" is not valid: Allowed integer values: > 0\n";
      }
    }
    $OptionsInfo{DatabaseFingerprintsCol} = $Options{databasefingerprintscol};
  }
  else {
    $OptionsInfo{DatabaseFingerprintsCol} = 'AutoDetect';
  }

  # Compound ID and fingerprints columns must differ: compared as numbers when both are
  # positive integers and as strings otherwise...
  if (IsNotEmpty($Options{databasecompoundidcol}) && IsNotEmpty($Options{databasefingerprintscol})) {
    if (IsPositiveInteger($Options{databasecompoundidcol}) && IsPositiveInteger($Options{databasefingerprintscol})) {
      if (($Options{databasecompoundidcol} == $Options{databasefingerprintscol})) {
        die "Error: Values specified using \"--DatabaseCompoundIDCol\" and \"--DatabaseFingerprintsCol\", $Options{databasecompoundidcol}, must be different.\n";
      }
    }
    else {
      if (($Options{databasecompoundidcol} eq $Options{databasefingerprintscol})) {
        die "Error: Values specified using \"--DatabaseCompoundIDCol\" and \"--DatabaseFingerprintsCol\", $Options{databasecompoundidcol}, must be different.\n";
      }
    }
  }

  # Database data column options for text files...

  $OptionsInfo{DatabaseDataColsMode} = $Options{databasedatacolsmode};
  $OptionsInfo{DatabaseDataCols} = '';
  @{$OptionsInfo{SpecifiedDatabaseDataCols}} = ();

  if ($Options{databasedatacolsmode} =~ /^Specify$/i) {
    my($DatabaseDataCols, $DatabaseColNum, @SpecifiedDataCols);

    if (!$Options{databasedatacols}) {
      die "Error: You must specify a value for \"--DatabaseDataCols\" option in \"Specify\" \"--DatabaseDataColsMode\". \n";
    }
    $DatabaseDataCols = $Options{databasedatacols};

    if ($Options{databasecolmode} =~ /^ColNum$/i) {
      # Spaces are dropped only for column numbers; column labels are split as specified...
      $DatabaseDataCols =~ s/ //g;
      @SpecifiedDataCols = split /\,/, $DatabaseDataCols;
      for $DatabaseColNum (@SpecifiedDataCols) {
        if (!IsPositiveInteger($DatabaseColNum)) {
          die "Error: Column value, $DatabaseColNum, specified using \"--DatabaseDataCols\" is not valid: Allowed integer values: > 0\n";
        }
      }
    }
    else {
      @SpecifiedDataCols = split /\,/, $DatabaseDataCols;
    }
    $OptionsInfo{DatabaseDataCols} = $DatabaseDataCols;
    push @{$OptionsInfo{SpecifiedDatabaseDataCols}}, @SpecifiedDataCols;
  }
  elsif ($Options{databasedatacolsmode} =~ /^All$/i) {
    $OptionsInfo{DatabaseDataCols} = 'All';
  }

  if ($OptionsInfo{DatabaseDataColsMode} =~ /^Specify$/i && !$OptionsInfo{DatabaseDataCols}) {
    die "Error: You must specify a value for \"--DatabaseDataCols\" option in \"Specify\" \"--DatabaseDataColsMode\". \n";
  }

  # Compound ID and fingerprints field options for SD files...

  $OptionsInfo{DatabaseCompoundIDMode} = $Options{databasecompoundidmode};
  $OptionsInfo{DatabaseCompoundIDField} = $Options{databasecompoundidfield} ? $Options{databasecompoundidfield} : '';

  if ($Options{databasecompoundidmode} =~ /^DataField$/i) {
    if (!$Options{databasecompoundidfield}) {
      die "Error: You must specify a value for \"--DatabaseCompoundIDField\" option in \"DataField\" \"--DatabaseCompoundIDMode\". \n";
    }
    $OptionsInfo{DatabaseCompoundIDField} = $Options{databasecompoundidfield};
  }


  if (IsNotEmpty($Options{databasefingerprintsfield})) {
    $OptionsInfo{DatabaseFingerprintsField} = $Options{databasefingerprintsfield};
  }
  else {
    $OptionsInfo{DatabaseFingerprintsField} = 'AutoDetect';
  }

  if ($Options{databasecompoundidfield} && IsNotEmpty($Options{databasefingerprintsfield})) {
    if (($Options{databasecompoundidfield} eq $Options{databasefingerprintsfield})) {
      die "Error: Values specified using \"--DatabaseCompoundIDField\" and \"--DatabaseFingerprintsfield\", $Options{databasecompoundidfield}, must be different.\n";
    }
  }

  # Database data field options for SD files...

  $OptionsInfo{DatabaseDataFieldsMode} = $Options{databasedatafieldsmode};
  $OptionsInfo{DatabaseDataFields} = '';
  @{$OptionsInfo{SpecifiedDatabaseDataFields}} = ();

  if ($Options{databasedatafieldsmode} =~ /^Specify$/i && !$Options{databasedatafields}) {
    die "Error: You must specify a value for \"--DatabaseDataFields\" option in \"Specify\" \"--DatabaseDataFieldsMode\". \n";
  }
  if ($Options{databasedatafields}) {
    my(@SpecifiedDataFields);
    $OptionsInfo{DatabaseDataFields} = $Options{databasedatafields};

    @SpecifiedDataFields = split /\,/, $Options{databasedatafields};
    push @{$OptionsInfo{SpecifiedDatabaseDataFields}}, @SpecifiedDataFields;
  }
}

# Setup script usage and retrieve command line arguments specified using various options...
sub SetupScriptUsage {

  # Retrieve all the options...
1459 %Options = (); 1460 1461 $Options{alpha} = 0.5; 1462 $Options{beta} = 1; 1463 1464 $Options{bitvectorcomparisonmode} = "TanimotoSimilarity"; 1465 1466 $Options{databasecolmode} = 'colnum'; 1467 1468 $Options{databasecompoundidprefix} = 'Cmpd'; 1469 $Options{databasecompoundidmode} = 'LabelPrefix'; 1470 1471 $Options{databasedatacolsmode} = 'CompoundID'; 1472 $Options{databasedatafieldsmode} = 'CompoundID'; 1473 1474 $Options{distancecutoff} = 10; 1475 1476 $Options{referencecolmode} = 'colnum'; 1477 1478 $Options{referencecompoundidprefix} = 'Cmpd'; 1479 $Options{referencecompoundidmode} = 'LabelPrefix'; 1480 1481 $Options{detail} = 1; 1482 1483 $Options{fingerprintsmode} = 'AutoDetect'; 1484 $Options{groupfusionrule} = 'Max'; 1485 $Options{groupfusionapplycutoff} = 'Yes'; 1486 1487 $Options{knn} = 'All'; 1488 1489 $Options{mode} = 'MultipleReferences'; 1490 1491 $Options{numofsimilarmolecules} = 10; 1492 $Options{percentsimilarmolecules} = 1; 1493 1494 $Options{indelim} = 'comma'; 1495 $Options{outdelim} = 'comma'; 1496 $Options{quote} = 'yes'; 1497 1498 $Options{output} = 'text'; 1499 1500 $Options{precision} = 2; 1501 1502 $Options{searchmode} = 'SimilaritySearch'; 1503 1504 $Options{similarcountmode} = 'NumOfSimilar'; 1505 1506 $Options{similaritycutoff} = 0.75; 1507 1508 $Options{vectorcomparisonmode} = 'TanimotoSimilarity'; 1509 $Options{vectorcomparisonformulism} = 'AlgebraicForm'; 1510 1511 if (!GetOptions(\%Options, "alpha=f", "beta=f", "bitvectorcomparisonmode|b=s", "databasecolmode=s", "databasecompoundidcol=s", "databasecompoundidprefix=s", "databasecompoundidfield=s", "databasecompoundidmode=s", "databasedatacols=s", "databasedatacolsmode=s", "databasedatafields=s", "databasedatafieldsmode=s", "databasefingerprintscol=s", "databasefingerprintsfield=s", "distancecutoff=f", "detail|d=i", "fast|f", "fingerprintsmode=s", "groupfusionrule|g=s", , "groupfusionapplycutoff=s", "help|h", "indelim=s", "knn|k=s", "mode|m=s", "numofsimilarmolecules|n=i", 
"outdelim=s", "output=s", "overwrite|o", "percentsimilarmolecules|p=f", "precision=s", "quote|q=s", "referencecolmode=s", "referencecompoundidcol=s", "referencecompoundidprefix=s", "referencecompoundidfield=s", "referencecompoundidmode=s", "referencefingerprintscol=s", "referencefingerprintsfield=s", "root|r=s", "searchmode|s=s", "similarcountmode=s", "similaritycutoff=f", "vectorcomparisonmode|v=s", "vectorcomparisonformulism=s", "workingdir|w=s")) { 1512 die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n"; 1513 } 1514 if ($Options{workingdir}) { 1515 if (! -d $Options{workingdir}) { 1516 die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n"; 1517 } 1518 chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n"; 1519 } 1520 if ($Options{databasecolmode} !~ /^(ColNum|ColLabel)$/i) { 1521 die "Error: The value specified, $Options{databasecolmode}, for option \"--DatabaseColMode\" is not valid. Allowed values: ColNum, or ColLabel\n"; 1522 } 1523 if ($Options{databasecompoundidmode} !~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i) { 1524 die "Error: The value specified, $Options{databasecompoundidmode}, for option \"--DatabaseCompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n"; 1525 } 1526 if ($Options{databasedatacolsmode} !~ /^(All|Specify|CompoundID)$/i) { 1527 die "Error: The value specified, $Options{databasedatacolsmode}, for option \"--DatabaseDataColsMode\" is not valid. Allowed values: All, Specify, or CompoundID\n"; 1528 } 1529 if ($Options{databasedatafieldsmode} !~ /^(All|Common|Specify|CompoundID)$/i) { 1530 die "Error: The value specified, $Options{databasedatafieldsmode}, for option \"--DatabaseDataFieldsMode\" is not valid. 
Allowed values: All, Common, Specify, or CompoundID\n"; 1531 } 1532 if (!IsPositiveInteger($Options{detail})) { 1533 die "Error: The value specified, $Options{detail}, for option \"-d, --detail\" is not valid. Allowed values: > 0 \n"; 1534 } 1535 if ($Options{fingerprintsmode} !~ /^(AutoDetect|FingerprintsBitVectorString|FingerprintsVectorString)$/i) { 1536 die "Error: The value specified, $Options{fingerprintsmode}, for option \"--FingerprintsMode\" is not valid. Allowed values: AutoDetect, FingerprintsBitVectorString or FingerprintsVectorString \n"; 1537 } 1538 if ($Options{groupfusionrule} !~ /^(Max|Min|Mean|Median|Sum|Euclidean)$/i) { 1539 die "Error: The value specified, $Options{groupfusionrule}, for option \"-g, --GroupFusionRule\" is not valid. Allowed values: Max, Min, Mean, Median, Sum, Euclidean\n"; 1540 } 1541 if ($Options{groupfusionapplycutoff} !~ /^(Yes|No)$/i) { 1542 die "Error: The value specified, $Options{quote}, for option \"--GroupFusionApplyCutoff\" is not valid. Allowed values: Yes or No\n"; 1543 } 1544 if ($Options{indelim} !~ /^(comma|semicolon)$/i) { 1545 die "Error: The value specified, $Options{indelim}, for option \"--InDelim\" is not valid. Allowed values: comma, or semicolon\n"; 1546 } 1547 if ($Options{mode} !~ /^(IndividualReference|MultipleReferences)$/i) { 1548 die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: IndividualReference, MultipleReferences\n"; 1549 } 1550 if (!IsPositiveInteger($Options{numofsimilarmolecules})) { 1551 die "Error: The value specified, $Options{numofsimilarmolecules}, for option \"-n, --NumOfSimilarMolecules\" is not valid. Allowed values: > 0 \n"; 1552 } 1553 if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) { 1554 die "Error: The value specified, $Options{outdelim}, for option \"--OutDelim\" is not valid. 
Allowed values: comma, tab, or semicolon\n"; 1555 } 1556 if ($Options{output} !~ /^(SD|text|both)$/i) { 1557 die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, text, or both\n"; 1558 } 1559 if (!(IsFloat($Options{percentsimilarmolecules}) && $Options{percentsimilarmolecules} > 0 && $Options{percentsimilarmolecules} <= 100)) { 1560 die "Error: The value specified, $Options{percentsimilarmolecules}, for option \"-p, --PercentSimilarMolecules\" is not valid. Allowed values: > 0 and <= 100 \n"; 1561 } 1562 if ($Options{quote} !~ /^(Yes|No)$/i) { 1563 die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n"; 1564 } 1565 if (!IsPositiveInteger($Options{precision})) { 1566 die "Error: The value specified, $Options{precision}, for option \"--precision\" is not valid. Allowed values: > 0 \n"; 1567 } 1568 if ($Options{referencecolmode} !~ /^(ColNum|ColLabel)$/i) { 1569 die "Error: The value specified, $Options{referencecolmode}, for option \"--ReferenceColMode\" is not valid. Allowed values: ColNum, or ColLabel\n"; 1570 } 1571 if ($Options{referencecompoundidmode} !~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i) { 1572 die "Error: The value specified, $Options{referencecompoundidmode}, for option \"--ReferenceCompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n"; 1573 } 1574 if ($Options{searchmode} !~ /^(SimilaritySearch|DissimilaritySearch)$/i) { 1575 die "Error: The value specified, $Options{searchmode}, for option \"-s, --SearchMode\" is not valid. Allowed values: SimilaritySearch, DissimilaritySearch \n"; 1576 } 1577 if ($Options{similarcountmode} !~ /^(NumOfSimilar|PercentSimilar)$/i) { 1578 die "Error: The value specified, $Options{similarcountmode}, for option \"--SimilarCountMode\" is not valid. Allowed values: NumOfSimilar, PercentSimilar \n"; 1579 } 1580 } 1581