#!/usr/bin/perl -w
#
# $RCSfile: TopologicalPharmacophoreAtomPairsFingerprints.pl,v $
# $Date: 2015/02/28 20:46:23 $
# $Revision: 1.36 $
#
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2015 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose.  See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use FileUtil;
use TextUtil;
use SDFileUtil;
use MoleculeFileIO;
use FileIO::FingerprintsSDFileIO;
use FileIO::FingerprintsTextFileIO;
use FileIO::FingerprintsFPFileIO;
use AtomTypes::FunctionalClassAtomTypes;
use Fingerprints::TopologicalPharmacophoreAtomPairsFingerprints;

# File-scoped state; the subroutines defined further down read and write
# %Options and $ScriptName directly.
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
die GetUsageFromPod("$FindBin::Bin/$ScriptName") if ($Options{help} || @ARGV < 1);

# Input file names from the command line; shared with the subroutines below.
my @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

# Process options...
my(%OptionsInfo);
print "Processing options...\n";
ProcessOptions();

# Setup information about input files...
my(%SDFilesInfo);
print "Checking input SD file(s)...\n";
RetrieveSDFilesInfo();

# Process input files; entries not marked okay by RetrieveSDFilesInfo are skipped.
print "\nProcessing SD files...\n" if (@SDFilesList > 1);

for my $FileIndex (0 .. $#SDFilesList) {
  next unless $SDFilesInfo{FileOkay}[$FileIndex];

  print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  GenerateTopologicalPharmacophoreAtomPairsFingerprints($FileIndex);
}
print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Generate fingerprints for a SD file...
#
# Reads every molecule from the SD file at position $FileIndex in @SDFilesList,
# optionally filters it, generates its fingerprints and writes them to the
# requested output files. Output files are opened only once the first
# fingerprints object has been generated successfully. A summary of processed
# and ignored compound counts is printed at the end.
#
sub GenerateTopologicalPharmacophoreAtomPairsFingerprints {
  my($FileIndex) = @_;

  my $InputSDFile = $SDFilesList[$FileIndex];

  # Output writers stay undefined until the first successful compound.
  my($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
  my $OutputFilesPending = 1;

  my $MoleculeFileIO = MoleculeFileIO->new('Name' => $InputSDFile);
  $MoleculeFileIO->Open();

  my $CmpdCount = 0;
  my $IgnoredCmpdCount = 0;

  COMPOUND: while (my $Molecule = $MoleculeFileIO->ReadMolecule()) {
    $CmpdCount++;

    # Filter compound data before calculating fingerprints...
    if ($OptionsInfo{Filter} && CheckAndFilterCompound($CmpdCount, $Molecule)) {
      $IgnoredCmpdCount++;
      next COMPOUND;
    }

    my $Fingerprints = GenerateMoleculeFingerprints($Molecule);
    unless ($Fingerprints) {
      $IgnoredCmpdCount++;
      ProcessIgnoredCompound('FingerprintsGenerationFailed', $CmpdCount, $Molecule);
      next COMPOUND;
    }

    if ($OutputFilesPending) {
      $OutputFilesPending = 0;
      # The label must be finalized before the writers are created from it.
      SetupFingerprintsLabelValueIDs($Fingerprints);
      ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = SetupAndOpenOutputFiles($FileIndex);
    }

    WriteDataToOutputFiles($FileIndex, $CmpdCount, $Molecule, $Fingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
  }
  $MoleculeFileIO->Close();

  # Close whichever writers were actually opened.
  for my $OutputFileIO ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) {
    $OutputFileIO->Close() if $OutputFileIO;
  }

  WriteFingerprintsGenerationSummaryStatistics($CmpdCount, $IgnoredCmpdCount);
}

# Process compound being ignored due to problems in fingerprints generation...
#
# Emits a warning explaining why compound record number $CmpdCount is being
# skipped. $Mode selects the explanation; any mode other than the two
# "elemental data" modes gets the generic "generation didn't succeed" text.
#
sub ProcessIgnoredCompound {
  my($Mode, $CmpdCount, $Molecule) = @_;

  my $DataFieldLabelAndValuesRef = $Molecule->GetDataFieldLabelAndValues();
  my $CmpdID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);

  if ($Mode =~ /^ContainsNonElementalData$/i) {
    warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains atom data corresponding to non-elemental atom symbol(s)...\n\n";
  }
  elsif ($Mode =~ /^ContainsNoElementalData$/i) {
    warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains no atom data...\n\n";
  }
  else {
    # Covers 'FingerprintsGenerationFailed' as well as any unrecognized mode.
    warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Fingerprints generation didn't succeed...\n\n";
  }
}

# Check and filter compounds....
#
# Returns 1, after warning about the compound, when the molecule reports any
# non-element atoms or no element atoms at all; returns 0 otherwise.
#
sub CheckAndFilterCompound {
  my($CmpdCount, $Molecule) = @_;

  my($ElementCount, $NonElementCount) = $Molecule->GetNumOfElementsAndNonElements();

  # Non-elemental data is reported in preference to missing elemental data.
  my $RejectionMode = $NonElementCount ? 'ContainsNonElementalData'
                    : !$ElementCount   ? 'ContainsNoElementalData'
                    :                    '';
  return 0 unless $RejectionMode;

  ProcessIgnoredCompound($RejectionMode, $CmpdCount, $Molecule);
  return 1;
}

# Write out compounds fingerprints generation summary statistics...
#
# Prints the total, successfully processed and ignored compound counts for
# one input file.
#
sub WriteFingerprintsGenerationSummaryStatistics {
  my($CmpdCount, $IgnoredCmpdCount) = @_;
  my($ProcessedCmpdCount);

  $ProcessedCmpdCount = $CmpdCount - $IgnoredCmpdCount;

  print "\nNumber of compounds: $CmpdCount\n";
  print "Number of compounds processed successfully during fingerprints generation: $ProcessedCmpdCount\n";
  print "Number of compounds ignored during fingerprints generation: $IgnoredCmpdCount\n";
}

# Append atom pair value IDs to fingerprint label...
#
# Does nothing for ArbitrarySize atom pair sets or when the label mode is not
# FingerprintsLabelWithIDs. Otherwise the value IDs string of the supplied
# fingerprints object is appended to the fingerprints label.
#
# This routine is invoked once per input file, so the label is always rebuilt
# from the label as it was before the first call; appending in place would
# repeat the "; Value IDs: ..." suffix for every additional input file.
#
sub SetupFingerprintsLabelValueIDs {
  my($TopologicalPharmacophoreAtomPairsFingerprints) = @_;

  if ($OptionsInfo{AtomPairsSetSizeToUse} =~ /^ArbitrarySize$/i ||
    $OptionsInfo{FingerprintsLabelMode} !~ /^FingerprintsLabelWithIDs$/i) {
    return;
  }

  # Remember the label without any value IDs the first time through.
  if (!defined $OptionsInfo{FingerprintsLabelBase}) {
    $OptionsInfo{FingerprintsLabelBase} = $OptionsInfo{FingerprintsLabel};
  }

  $OptionsInfo{FingerprintsLabel} = $OptionsInfo{FingerprintsLabelBase} . "; Value IDs: " . $TopologicalPharmacophoreAtomPairsFingerprints->GetFingerprintsVector()->GetValueIDsString();
}

# Open output files...
#
# Creates and opens a writer for each output type enabled in %OptionsInfo
# (SD, FP, text), using the file names prepared in %SDFilesInfo for the input
# file at $FileIndex.
#
# Returns the list ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO); entries
# for output types that are not enabled are undef.
#
sub SetupAndOpenOutputFiles {
  my($FileIndex) = @_;
  my($NewFPSDFile, $NewFPFile, $NewFPTextFile, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO, %FingerprintsFileIOParams);

  ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = (undef) x 3;

  # Setup common parameters for fingerprints file IO objects...
  #
  %FingerprintsFileIOParams = ('Mode' => 'Write', 'Overwrite' => $OptionsInfo{OverwriteFiles}, 'FingerprintsStringMode' => 'FingerprintsVectorString', 'VectorStringFormat' => $OptionsInfo{VectorStringFormat});

  if ($OptionsInfo{SDOutput}) {
    $NewFPSDFile = $SDFilesInfo{SDOutFileNames}[$FileIndex];
    print "Generating SD file $NewFPSDFile...\n";
    $NewFPSDFileIO = FileIO::FingerprintsSDFileIO->new('Name' => $NewFPSDFile, %FingerprintsFileIOParams, 'FingerprintsFieldLabel' => $OptionsInfo{FingerprintsLabel});
    $NewFPSDFileIO->Open();
  }

  if ($OptionsInfo{FPOutput}) {
    $NewFPFile = $SDFilesInfo{FPOutFileNames}[$FileIndex];
    print "Generating FP file $NewFPFile...\n";
    $NewFPFileIO = FileIO::FingerprintsFPFileIO->new('Name' => $NewFPFile, %FingerprintsFileIOParams);
    $NewFPFileIO->Open();
  }

  if ($OptionsInfo{TextOutput}) {
    my($ColLabelsRef);

    $NewFPTextFile = $SDFilesInfo{TextOutFileNames}[$FileIndex];
    $ColLabelsRef = SetupFPTextFileCoulmnLabels($FileIndex);

    print "Generating text file $NewFPTextFile...\n";
    $NewFPTextFileIO = FileIO::FingerprintsTextFileIO->new('Name' => $NewFPTextFile, %FingerprintsFileIOParams, 'DataColLabels' => $ColLabelsRef, 'OutDelim' => $OptionsInfo{OutDelim}, 'OutQuote' => $OptionsInfo{OutQuote});
    $NewFPTextFileIO->Open();
  }

  return ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
}

# Write fingerprints and other data to appropriate output files...
#
# Writes the fingerprints of one compound to each writer that is defined.
# SD output receives the original molecule string, text output the column
# values for the active data fields mode, and FP output the compound ID.
#
sub WriteDataToOutputFiles {
  my($FileIndex, $CmpdCount, $Molecule, $TopologicalPharmacophoreAtomPairsFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = @_;
  my($DataFieldLabelAndValuesRef);

  # Data fields are needed only by the text and FP writers.
  $DataFieldLabelAndValuesRef = undef;
  if ($NewFPTextFileIO || $NewFPFileIO) {
    $DataFieldLabelAndValuesRef = $Molecule->GetDataFieldLabelAndValues();
  }

  if ($NewFPSDFileIO) {
    my($CmpdString);

    $CmpdString = $Molecule->GetInputMoleculeString();
    $NewFPSDFileIO->WriteFingerprints($TopologicalPharmacophoreAtomPairsFingerprints, $CmpdString);
  }

  if ($NewFPTextFileIO) {
    my($ColValuesRef);

    $ColValuesRef = SetupFPTextFileCoulmnValues($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
    $NewFPTextFileIO->WriteFingerprints($TopologicalPharmacophoreAtomPairsFingerprints, $ColValuesRef);
  }

  if ($NewFPFileIO) {
    my($CompoundID);

    $CompoundID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
    $NewFPFileIO->WriteFingerprints($TopologicalPharmacophoreAtomPairsFingerprints, $CompoundID);
  }
}

# Generate appropriate column labels for FPText output file...
#
# Returns a reference to the list of data column labels for the active data
# fields mode, followed by the fingerprints label.
#
sub SetupFPTextFileCoulmnLabels {
  my($FileIndex) = @_;
  my(@ColLabels);

  @ColLabels = ();
  if ($OptionsInfo{DataFieldsMode} =~ /^All$/i) {
    push @ColLabels, @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
  }
  elsif ($OptionsInfo{DataFieldsMode} =~ /^Common$/i) {
    push @ColLabels, @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
  }
  elsif ($OptionsInfo{DataFieldsMode} =~ /^Specify$/i) {
    push @ColLabels, @{$OptionsInfo{SpecifiedDataFields}};
  }
  elsif ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) {
    push @ColLabels, $OptionsInfo{CompoundIDLabel};
  }
  # Add fingerprints label...
  push @ColLabels, $OptionsInfo{FingerprintsLabel};

  return \@ColLabels;
}

# Generate column values FPText output file..
#
# Returns a reference to the list of data column values for one compound, in
# the same order as the labels for the active data fields mode. A data field
# missing from the compound yields an empty string.
#
sub SetupFPTextFileCoulmnValues {
  my($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;
  my(@ColValues);

  @ColValues = ();
  if ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) {
    push @ColValues, SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
  }
  elsif ($OptionsInfo{DataFieldsMode} =~ /^All$/i) {
    @ColValues = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
  }
  elsif ($OptionsInfo{DataFieldsMode} =~ /^Common$/i) {
    @ColValues = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
  }
  elsif ($OptionsInfo{DataFieldsMode} =~ /^Specify$/i) {
    @ColValues = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @{$OptionsInfo{SpecifiedDataFields}};
  }

  return \@ColValues;
}

# Generate compound ID for FP and FPText output files..
#
# The ID is derived according to $OptionsInfo{CompoundIDMode}: the molecule
# name, a label prefix followed by the compound number, the molecule name
# with the prefixed number as fallback, or the value of a specified data
# field (empty string when the compound lacks that field).
#
sub SetupCmpdIDForOutputFiles {
  my($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;
  my($CmpdID);

  $CmpdID = '';
  if ($OptionsInfo{CompoundIDMode} =~ /^MolNameOrLabelPrefix$/i) {
    my($MolName);
    $MolName = $Molecule->GetName();
    $CmpdID = $MolName ? $MolName : "$OptionsInfo{CompoundID}${CmpdCount}";
  }
  elsif ($OptionsInfo{CompoundIDMode} =~ /^LabelPrefix$/i) {
    $CmpdID = "$OptionsInfo{CompoundID}${CmpdCount}";
  }
  elsif ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i) {
    my($SpecifiedDataField);
    $SpecifiedDataField = $OptionsInfo{CompoundID};
    $CmpdID = exists $DataFieldLabelAndValuesRef->{$SpecifiedDataField} ? $DataFieldLabelAndValuesRef->{$SpecifiedDataField} : '';
  }
  elsif ($OptionsInfo{CompoundIDMode} =~ /^MolName$/i) {
    $CmpdID = $Molecule->GetName();
  }
  return $CmpdID;
}

# Generate fingerprints for molecule...
#
# Prepares the molecule (optional largest component selection, ring and
# aromaticity detection), builds a fingerprints object from the processed
# options and generates the fingerprints.
#
# Returns the fingerprints object, or undef when ring detection fails or the
# fingerprints object reports that generation was not successful.
#
sub GenerateMoleculeFingerprints {
  my($Molecule) = @_;
  my($TopologicalPharmacophoreAtomPairsFingerprints, @FingerprintsParams);

  if ($OptionsInfo{KeepLargestComponent}) {
    $Molecule->KeepLargestComponent();
  }
  if (!$Molecule->DetectRings()) {
    return undef;
  }
  $Molecule->SetAromaticityModel($OptionsInfo{AromaticityModel});
  $Molecule->DetectAromaticity();

  # Parameters common to fuzzified and plain fingerprints...
  @FingerprintsParams = ('Molecule' => $Molecule, 'AtomPairsSetSizeToUse' => $OptionsInfo{AtomPairsSetSizeToUse}, 'MinDistance' => $OptionsInfo{MinDistance}, 'MaxDistance' => $OptionsInfo{MaxDistance}, 'AtomTypesToUse' => \@{$OptionsInfo{AtomTypesToUse}}, 'NormalizationMethodology' => $OptionsInfo{NormalizationMethodology}, 'ValuesPrecision' => $OptionsInfo{ValuesPrecision});

  # Fuzzification parameters are passed only when fuzzification is requested...
  if ($OptionsInfo{FuzzifyAtomPairsCount}) {
    push @FingerprintsParams, ('FuzzifyAtomPairsCount' => $OptionsInfo{FuzzifyAtomPairsCount}, 'FuzzificationMode' => $OptionsInfo{FuzzificationMode}, 'FuzzificationMethodology' => $OptionsInfo{FuzzificationMethodology}, 'FuzzFactor' => $OptionsInfo{FuzzFactor});
  }

  $TopologicalPharmacophoreAtomPairsFingerprints = Fingerprints::TopologicalPharmacophoreAtomPairsFingerprints->new(@FingerprintsParams);

  # Set atom types weights...
  if ($OptionsInfo{UseAtomTypesWeight}) {
    $TopologicalPharmacophoreAtomPairsFingerprints->SetAtomTypesWeight(%{$OptionsInfo{AtomTypesWeight}});
  }

  # Generate fingerprints...
  $TopologicalPharmacophoreAtomPairsFingerprints->GenerateFingerprints();

  # Make sure fingerprints generation is successful...
  if (!$TopologicalPharmacophoreAtomPairsFingerprints->IsFingerprintsGenerationSuccessful()) {
    return undef;
  }

  return $TopologicalPharmacophoreAtomPairsFingerprints;
}

# Retrieve information about SD files...
#
# Fills %SDFilesInfo with one entry per input file: whether the file can be
# processed (FileOkay), the output file root and output file names, and, when
# text output uses the All or Common data fields mode, references to the data
# field labels collected from the file.
#
# A file is marked as not okay, with a warning, when it doesn't exist, is not
# a SD file, lacks the data field required for compound IDs, would be
# overwritten by its own SD output, or when an output file already exists and
# overwriting has not been requested.
#
sub RetrieveSDFilesInfo {
  my($SDFile, $Index, $FileDir, $FileExt, $FileName, $OutFileRoot, $TextOutFileExt, $SDOutFileExt, $FPOutFileExt, $NewSDFileName, $NewFPFileName, $NewTextFileName, $CheckDataField, $CollectDataFields, $AllDataFieldsRef, $CommonDataFieldsRef);

  %SDFilesInfo = ();
  @{$SDFilesInfo{FileOkay}} = ();
  @{$SDFilesInfo{OutFileRoot}} = ();
  @{$SDFilesInfo{SDOutFileNames}} = ();
  @{$SDFilesInfo{FPOutFileNames}} = ();
  @{$SDFilesInfo{TextOutFileNames}} = ();
  @{$SDFilesInfo{AllDataFieldsRef}} = ();
  @{$SDFilesInfo{CommonDataFieldsRef}} = ();

  $CheckDataField = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) && ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i)) ? 1 : 0;
  $CollectDataFields = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^(All|Common)$/i)) ? 1 : 0;

  FILELIST: for $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];

    # Assume the worst until every check below has passed.
    $SDFilesInfo{FileOkay}[$Index] = 0;
    $SDFilesInfo{OutFileRoot}[$Index] = '';
    $SDFilesInfo{SDOutFileNames}[$Index] = '';
    $SDFilesInfo{FPOutFileNames}[$Index] = '';
    $SDFilesInfo{TextOutFileNames}[$Index] = '';

    if (!(-e $SDFile)) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    if ($CheckDataField) {
      # Make sure data field exists in SD file..
      my($CmpdString, $SpecifiedDataField, @CmpdLines, %DataFieldValues);

      @CmpdLines = ();
      # Three-argument open: the file name is never interpreted as a mode or a pipe.
      open SDFILE, "<", $SDFile or die "Error: Couldn't open $SDFile: $! \n";
      $CmpdString = ReadCmpdString(\*SDFILE);
      close SDFILE;
      @CmpdLines = split "\n", $CmpdString;
      %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      $SpecifiedDataField = $OptionsInfo{CompoundID};
      if (!exists $DataFieldValues{$SpecifiedDataField}) {
        warn "Warning: Ignoring file $SDFile: Data field value, $SpecifiedDataField, using  \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\" doesn't exist\n";
        next FILELIST;
      }
    }

    $AllDataFieldsRef = '';
    $CommonDataFieldsRef = '';
    if ($CollectDataFields) {
      my($CmpdCount);
      open SDFILE, "<", $SDFile or die "Error: Couldn't open $SDFile: $! \n";
      ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = GetAllAndCommonCmpdDataHeaderLabels(\*SDFILE);
      close SDFILE;
    }

    # Setup output file names...
    $FileDir = ""; $FileName = ""; $FileExt = "";
    ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);

    $TextOutFileExt = "csv";
    if ($Options{outdelim} =~ /^tab$/i) {
      $TextOutFileExt = "tsv";
    }
    $SDOutFileExt = $FileExt;
    $FPOutFileExt = "fpf";

    # A user supplied root is honored only for a single input file.
    if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
      if ($RootFileName && $RootFileExt) {
        $FileName = $RootFileName;
      }
      else {
        $FileName = $OptionsInfo{OutFileRoot};
      }
      $OutFileRoot = $FileName;
    }
    else {
      $OutFileRoot = "${FileName}TopologicalPharmacophoreAtomPairsFP";
    }

    $NewSDFileName = "${OutFileRoot}.${SDOutFileExt}";
    $NewFPFileName = "${OutFileRoot}.${FPOutFileExt}";
    $NewTextFileName = "${OutFileRoot}.${TextOutFileExt}";

    if ($OptionsInfo{SDOutput}) {
      # \Q...\E: the output name is matched literally, not as a pattern.
      if ($SDFile =~ /\Q$NewSDFileName\E/i) {
        warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
        print "Specify a different name using \"-r --root\" option or use default name.\n";
        next FILELIST;
      }
    }

    if (!$OptionsInfo{OverwriteFiles}) {
      # Check SD, FP and text output files...
      if ($OptionsInfo{SDOutput}) {
        if (-e $NewSDFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewSDFileName already exists\n";
          next FILELIST;
        }
      }
      if ($OptionsInfo{FPOutput}) {
        if (-e $NewFPFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewFPFileName already exists\n";
          next FILELIST;
        }
      }
      if ($OptionsInfo{TextOutput}) {
        if (-e $NewTextFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewTextFileName already exists\n";
          next FILELIST;
        }
      }
    }

    $SDFilesInfo{FileOkay}[$Index] = 1;

    $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
    $SDFilesInfo{SDOutFileNames}[$Index] = $NewSDFileName;
    $SDFilesInfo{FPOutFileNames}[$Index] = $NewFPFileName;
    $SDFilesInfo{TextOutFileNames}[$Index] = $NewTextFileName;

    $SDFilesInfo{AllDataFieldsRef}[$Index] = $AllDataFieldsRef;
    $SDFilesInfo{CommonDataFieldsRef}[$Index] = $CommonDataFieldsRef;
  }
}

# Process option values...
#
# Translates the raw command line values in %Options into %OptionsInfo, which
# the rest of the script reads. Yes/No style values become 1/0 flags and the
# default vector string format is chosen from the atom pairs set size. Dies
# when a value required by the selected mode is missing or when the vector
# string format is incompatible with the atom pairs set size.
#
sub ProcessOptions {
  %OptionsInfo = ();

  ProcessAtomTypesToUseOption();
  ProcessAtomTypesWeightOption();

  $OptionsInfo{AromaticityModel} = $Options{aromaticitymodel};

  $OptionsInfo{AtomPairsSetSizeToUse} = $Options{atompairssetsizetouse};

  $OptionsInfo{CompoundIDMode} = $Options{compoundidmode};
  $OptionsInfo{CompoundIDLabel} = $Options{compoundidlabel};
  $OptionsInfo{DataFieldsMode} = $Options{datafieldsmode};

  my(@SpecifiedDataFields);
  @SpecifiedDataFields = ();

  @{$OptionsInfo{SpecifiedDataFields}} = ();
  $OptionsInfo{CompoundID} = '';

  if ($Options{datafieldsmode} =~ /^CompoundID$/i) {
    if ($Options{compoundidmode} =~ /^DataField$/i) {
      if (!$Options{compoundid}) {
        die "Error: You must specify a value for \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\". \n";
      }
      $OptionsInfo{CompoundID} = $Options{compoundid};
    }
    elsif ($Options{compoundidmode} =~ /^(LabelPrefix|MolNameOrLabelPrefix)$/i) {
      $OptionsInfo{CompoundID} = $Options{compoundid} ? $Options{compoundid} : 'Cmpd';
    }
  }
  elsif ($Options{datafieldsmode} =~ /^Specify$/i) {
    if (!$Options{datafields}) {
      die "Error: You must specify a value for \"--DataFields\" option in \"Specify\" \"-d, --DataFieldsMode\". \n";
    }
    @SpecifiedDataFields = split /\,/, $Options{datafields};
    push @{$OptionsInfo{SpecifiedDataFields}}, @SpecifiedDataFields;
  }

  $OptionsInfo{Filter} = ($Options{filter} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{FingerprintsLabelMode} = $Options{fingerprintslabelmode};
  $OptionsInfo{FingerprintsLabel} = $Options{fingerprintslabel} ? $Options{fingerprintslabel} : 'TopologicalPharmacophoreAtomPairsFingerprints';

  $OptionsInfo{FuzzifyAtomPairsCount} = ($Options{fuzzifyatompairscount} =~ /^Yes$/i) ? 1 : 0;
  $OptionsInfo{FuzzificationMode} = $Options{fuzzificationmode};
  $OptionsInfo{FuzzificationMethodology} = $Options{fuzzificationmethodology};
  $OptionsInfo{FuzzFactor} = $Options{fuzzfactor};

  $OptionsInfo{KeepLargestComponent} = ($Options{keeplargestcomponent} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{MinDistance} = $Options{mindistance};
  $OptionsInfo{MaxDistance} = $Options{maxdistance};

  $OptionsInfo{NormalizationMethodology} = $Options{normalizationmethodology};

  $OptionsInfo{Output} = $Options{output};
  $OptionsInfo{SDOutput} = ($Options{output} =~ /^(SD|All)$/i) ? 1 : 0;
  $OptionsInfo{FPOutput} = ($Options{output} =~ /^(FP|All)$/i) ? 1 : 0;
  $OptionsInfo{TextOutput} = ($Options{output} =~ /^(Text|All)$/i) ? 1 : 0;

  $OptionsInfo{OutDelim} = $Options{outdelim};
  $OptionsInfo{OutQuote} = ($Options{quote} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
  $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;

  $OptionsInfo{ValuesPrecision} = $Options{valuesprecision};

  # Setup default vector string format...
  my($VectorStringFormat);
  $VectorStringFormat = '';

  if ($Options{vectorstringformat}) {
    $VectorStringFormat = $Options{vectorstringformat};

    if ($Options{atompairssetsizetouse} =~ /^ArbitrarySize$/i && $VectorStringFormat =~ /^ValuesString$/i) {
      die "Error: The value specified, $Options{vectorstringformat}, for option \"-v, --VectorStringFormat\" is not valid for $Options{atompairssetsizetouse} value of \"--AtomPairsSetSizeToUse\" option. Allowed values: IDsAndValuesString, IDsAndValuesPairsString, ValuesAndIDsString or ValuesAndIDsPairsString\n";
    }
  }
  else {
    # Case-insensitive, like every other test of this option value.
    $VectorStringFormat = ($Options{atompairssetsizetouse} =~ /^FixedSize$/i) ? "ValuesString" : "IDsAndValuesString";
  }
  $OptionsInfo{VectorStringFormat} = $VectorStringFormat;
}

# Process atom type to use option...
#
# Splits the comma delimited "-a, --AtomTypesToUse" value, after removing
# spaces, into $OptionsInfo{AtomTypesToUse}. Dies when the value is empty or
# contains an atom type that is not an available functional class.
#
sub ProcessAtomTypesToUseOption {
  my($AtomType, $SpecifiedAtomTypesToUse, @AtomTypesWords);

  @{$OptionsInfo{AtomTypesToUse}} = ();
  if (IsEmpty($Options{atomtypestouse})) {
    die "Error: Atom types value specified using \"-a, --AtomTypesToUse\" option is empty\n";
  }

  $SpecifiedAtomTypesToUse = $Options{atomtypestouse};
  $SpecifiedAtomTypesToUse =~ s/ //g;
  @AtomTypesWords = split /\,/, $SpecifiedAtomTypesToUse;

  for $AtomType (@AtomTypesWords) {
    if (!AtomTypes::FunctionalClassAtomTypes::IsFunctionalClassAvailable($AtomType)) {
      die "Error: Atomic type specified, $AtomType, using \"-a, --AtomTypesToUse\" option is not valid...\n ";
    }
    push @{$OptionsInfo{AtomTypesToUse}}, $AtomType;
  }
}

# Process atom types weight option...
#
# Parses the "--AtomTypesWeight" value into $OptionsInfo{AtomTypesWeight} and
# sets $OptionsInfo{UseAtomTypesWeight}. A value of "None" turns weighting
# off. Any other value must be a comma delimited list of atom type and weight
# pairs; spaces are ignored. Dies when the value is empty, has an odd number
# of entries, names an unavailable functional class, or contains a weight
# that is not a real number >= 0.
#
sub ProcessAtomTypesWeightOption {
  my($Index, $AtomType, $AtomTypeWeight, $SpecifiedAtomTypesWeight, @AtomTypesWeightsPairs);

  %{$OptionsInfo{AtomTypesWeight}} = ();

  if (IsEmpty($Options{atomtypesweight})) {
    die "Error: Atom types weight value specified using \"--AtomTypesWeight\" option is empty\n";
  }
  $OptionsInfo{UseAtomTypesWeight} = ($Options{atomtypesweight} =~ /^None$/i) ? 0 : 1;
  if (!$OptionsInfo{UseAtomTypesWeight}) {
    return;
  }

  # Process specified atom type/weight pairs...
  $SpecifiedAtomTypesWeight = $Options{atomtypesweight};
  $SpecifiedAtomTypesWeight =~ s/ //g;
  @AtomTypesWeightsPairs = split /\,/, $SpecifiedAtomTypesWeight;

  if (@AtomTypesWeightsPairs % 2) {
    die "Error: Invalid number of values specified using \"--AtomTypesWeight\" option: It must contain even number of values.\n";
  }

  # Entries alternate: atom type, weight, atom type, weight, ...
  for ($Index = 0; $Index < @AtomTypesWeightsPairs; $Index += 2) {
    $AtomType = $AtomTypesWeightsPairs[$Index]; $AtomTypeWeight = $AtomTypesWeightsPairs[$Index + 1];
    if (!AtomTypes::FunctionalClassAtomTypes::IsFunctionalClassAvailable($AtomType)) {
      die "Error: Atom type specified, $AtomType, using \"--AtomTypesWeight\" option is not valid\n ";
    }
    if (!(IsFloat($AtomTypeWeight) && $AtomTypeWeight >= 0)) {
      die "Error: Atom type weight specified, $AtomTypeWeight, using option \"--AtomTypesWeight\" is not valid. Allowed values: real numbers >= 0 \n";
    }
    $OptionsInfo{AtomTypesWeight}{$AtomType} = $AtomTypeWeight;
  }
}

# Setup script usage and retrieve command line arguments specified using various options...
#
# Loads the default option values into %Options, overlays the command line
# values via GetOptions, changes to the working directory when one is given,
# and validates each option value. Dies with an explanatory message on the
# first invalid value.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();

  $Options{aromaticitymodel} = 'MayaChemToolsAromaticityModel';

  $Options{atompairssetsizetouse} = 'ArbitrarySize';

  $Options{atomtypestouse} = 'HBD,HBA,PI,NI,H';
  $Options{atomtypesweight} = 'None';

  $Options{compoundidmode} = 'LabelPrefix';
  $Options{compoundidlabel} = 'CompoundID';
  $Options{datafieldsmode} = 'CompoundID';

  $Options{filter} = 'Yes';

  $Options{fingerprintslabelmode} = 'FingerprintsLabelOnly';

  $Options{fuzzifyatompairscount} = 'No';
  $Options{fuzzificationmode} = 'AfterNormalization';
  $Options{fuzzificationmethodology} = 'FuzzyBinning';
  $Options{fuzzfactor} = 0.15;

  $Options{keeplargestcomponent} = 'Yes';

  $Options{mindistance} = 1;
  $Options{maxdistance} = 10;

  $Options{normalizationmethodology} = 'None';

  $Options{output} = 'text';
  $Options{outdelim} = 'comma';
  $Options{quote} = 'yes';

  $Options{valuesprecision} = 2;

  $Options{vectorstringformat} = '';

  if (!GetOptions(\%Options, "aromaticitymodel=s", "atompairssetsizetouse=s", "atomtypestouse|a=s", "atomtypesweight=s", "compoundid=s", "compoundidlabel=s", "compoundidmode=s", "datafields=s", "datafieldsmode|d=s", "filter|f=s", "fingerprintslabelmode=s", "fingerprintslabel=s", "fuzzifyatompairscount=s", "fuzzificationmode=s", "fuzzificationmethodology=s", "fuzzfactor=s", "help|h", "keeplargestcomponent|k=s", "mindistance=s", "maxdistance=s", "normalizationmethodology|n=s", "outdelim=s", "output=s", "overwrite|o", "quote|q=s", "root|r=s", "valuesprecision=s", "vectorstringformat|v=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if (!Molecule::IsSupportedAromaticityModel($Options{aromaticitymodel})) {
    my(@SupportedModels) = Molecule::GetSupportedAromaticityModels();
    die "Error: The value specified, $Options{aromaticitymodel}, for option \"--AromaticityModel\" is not valid. Supported aromaticity models in current release of MayaChemTools: @SupportedModels\n";
  }
  if ($Options{atompairssetsizetouse} !~ /^(ArbitrarySize|FixedSize)$/i) {
    die "Error: The value specified, $Options{atompairssetsizetouse}, for option \"--AtomPairsSetSizeToUse\" is not valid. Allowed values: ArbitrarySize or FixedSize\n";
  }
  if ($Options{compoundidmode} !~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i) {
    die "Error: The value specified, $Options{compoundidmode}, for option \"--CompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n";
  }
  if ($Options{datafieldsmode} !~ /^(All|Common|Specify|CompoundID)$/i) {
    die "Error: The value specified, $Options{datafieldsmode}, for option \"-d, --DataFieldsMode\" is not valid. Allowed values: All, Common, Specify or CompoundID\n";
  }
  if ($Options{filter} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{filter}, for option \"-f, --Filter\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{fingerprintslabelmode} !~ /^(FingerprintsLabelOnly|FingerprintsLabelWithIDs)$/i) {
    die "Error: The value specified, $Options{fingerprintslabelmode}, for option \"--FingerprintsLabelMode\" is not valid. Allowed values: FingerprintsLabelOnly or FingerprintsLabelWithIDs\n";
  }
  if ($Options{fuzzifyatompairscount} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{fuzzifyatompairscount}, for option \"--FuzzifyAtomPairsCount\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{fuzzificationmode} !~ /^(BeforeNormalization|AfterNormalization)$/i) {
    die "Error: The value specified, $Options{fuzzificationmode}, for option \"--FuzzificationMode\" is not valid. Allowed values: BeforeNormalization or AfterNormalization\n";
  }
  if ($Options{fuzzificationmethodology} !~ /^(FuzzyBinning|FuzzyBinSmoothing)$/i) {
    die "Error: The value specified, $Options{fuzzificationmethodology}, for option \"--FuzzificationMethodology\" is not valid. Allowed values: FuzzyBinning or FuzzyBinSmoothing\n";
  }
  if (!IsFloat($Options{fuzzfactor})) {
    die "Error: The value specified, $Options{fuzzfactor}, for option \"--FuzzFactor\" is not valid. Allowed values: real numbers >= 0 \n";
  }
  # The allowed fuzz factor range depends on the selected methodology; each
  # branch must match the methodology it names in its error message.
  if ($Options{fuzzificationmethodology} =~ /^FuzzyBinning$/i) {
    if (!($Options{fuzzfactor} >=0 && $Options{fuzzfactor} <= 1.0)) {
      die "Error: The value specified, $Options{fuzzfactor}, for option \"--FuzzFactor\" during FuzzyBinning \"--FuzzificationMethodology\" is not valid. Allowed values: >= 0 and <= 1 \n";
    }
  }
  elsif ($Options{fuzzificationmethodology} =~ /^FuzzyBinSmoothing$/i) {
    if (!($Options{fuzzfactor} >=0 && $Options{fuzzfactor} <= 0.5)) {
      die "Error: The value specified, $Options{fuzzfactor}, for option \"--FuzzFactor\" during FuzzyBinSmoothing \"--FuzzificationMethodology\" is not valid. Allowed values: >= 0 and <= 0.5 \n";
    }
  }
  if ($Options{keeplargestcomponent} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{keeplargestcomponent}, for option \"-k, --KeepLargestComponent\" is not valid. Allowed values: Yes or No\n";
  }
  # The numeric comparison runs only after the value is known to be an integer.
  if (!IsInteger($Options{mindistance}) || $Options{mindistance} < 0) {
    die "Error: The value specified, $Options{mindistance}, for option \"--MinDistance\" is not valid. Allowed values: >= 0 \n";
  }
  if (!IsPositiveInteger($Options{maxdistance})) {
    die "Error: The value specified, $Options{maxdistance}, for option \"--MaxDistance\" is not valid. Allowed values: > 0 \n";
  }
  if ($Options{mindistance} > $Options{maxdistance}) {
    die "Error: The value specified, $Options{mindistance}, for option \"--MinDistance\" must be less than the value specified, $Options{maxdistance}, for option \"--MaxDistance\" \n";
  }
  if ($Options{normalizationmethodology} !~ /^(None|ByHeavyAtomsCount|ByAtomTypesCount)$/i) {
    die "Error: The value specified, $Options{normalizationmethodology}, for option \"--NormalizationMethodology\" is not valid. Allowed values: None, ByHeavyAtomsCount, or ByAtomTypesCount\n";
  }
  if ($Options{output} !~ /^(SD|FP|text|all)$/i) {
    die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, FP, text, or all\n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{quote} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{outdelim} =~ /semicolon/i && $Options{quote} =~ /^No$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not allowed with, semicolon value of \"--outdelim\" option: Fingerprints string use semicolon as delimiter for various data fields and must be quoted.\n";
  }
  if (!IsPositiveInteger($Options{valuesprecision})) {
    die "Error: The value specified, $Options{valuesprecision}, for option \"--ValuesPrecision\" is not valid. Allowed values: > 0 \n";
  }
  if ($Options{vectorstringformat} && $Options{vectorstringformat} !~ /^(ValuesString|IDsAndValuesString|IDsAndValuesPairsString|ValuesAndIDsString|ValuesAndIDsPairsString)$/i) {
    die "Error: The value specified, $Options{vectorstringformat}, for option \"-v, --VectorStringFormat\" is not valid. Allowed values: ValuesString, IDsAndValuesString, IDsAndValuesPairsString, ValuesAndIDsString or ValuesAndIDsPairsString\n";
  }
}