#!/usr/bin/perl -w
#
# $RCSfile: TopologicalAtomPairsFingerprints.pl,v $
# $Date: 2015/02/28 20:46:22 $
# $Revision: 1.34 $
#
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2015 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use FileUtil;
use TextUtil;
use SDFileUtil;
use MoleculeFileIO;
use FileIO::FingerprintsSDFileIO;
use FileIO::FingerprintsTextFileIO;
use FileIO::FingerprintsFPFileIO;
use AtomTypes::AtomicInvariantsAtomTypes;
use AtomTypes::FunctionalClassAtomTypes;
use Fingerprints::TopologicalAtomPairsFingerprints;

# File-level state shared with the subroutines below: script name used in
# messages, raw command line options, and benchmark time stamps...
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@SDFilesList);
@SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Setup information about input files...
print "Checking input SD file(s)...\n";
my(%SDFilesInfo);
RetrieveSDFilesInfo();

# Process input files; files flagged as not okay by RetrieveSDFilesInfo are
# silently skipped here as a warning has already been issued for them...
if (@SDFilesList > 1) {
  print "\nProcessing SD files...\n";
}
for my $FileIndex (0 .. $#SDFilesList) {
  if ($SDFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $SDFilesList[$FileIndex]...\n";
    GenerateTopologicalAtomPairsFingerprints($FileIndex);
  }
}
print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Generate fingerprints for a SD file...
#
# Reads each molecule from the SD file at position $FileIndex in @SDFilesList,
# optionally filters it, generates its fingerprints and writes them to the
# output files which are enabled. Ignored compounds are counted and a summary
# is printed at the end.
#
sub GenerateTopologicalAtomPairsFingerprints {
  my($FileIndex) = @_;
  my($CmpdCount, $IgnoredCmpdCount, $SDFile, $MoleculeFileIO, $Molecule, $TopologicalAtomPairsFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);

  $SDFile = $SDFilesList[$FileIndex];

  # Setup output files...
  #
  ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = SetupAndOpenOutputFiles($FileIndex);

  $MoleculeFileIO = MoleculeFileIO->new('Name' => $SDFile);
  $MoleculeFileIO->Open();

  $CmpdCount = 0;
  $IgnoredCmpdCount = 0;

  COMPOUND: while ($Molecule = $MoleculeFileIO->ReadMolecule()) {
    $CmpdCount++;

    # Filter compound data before calculating fingerprints...
    if ($OptionsInfo{Filter}) {
      if (CheckAndFilterCompound($CmpdCount, $Molecule)) {
        $IgnoredCmpdCount++;
        next COMPOUND;
      }
    }

    $TopologicalAtomPairsFingerprints = GenerateMoleculeFingerprints($Molecule);
    if (!$TopologicalAtomPairsFingerprints) {
      $IgnoredCmpdCount++;
      ProcessIgnoredCompound('FingerprintsGenerationFailed', $CmpdCount, $Molecule);
      next COMPOUND;
    }

    WriteDataToOutputFiles($FileIndex, $CmpdCount, $Molecule, $TopologicalAtomPairsFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
  }
  $MoleculeFileIO->Close();

  # Only the writers which were actually opened are defined...
  if ($NewFPSDFileIO) {
    $NewFPSDFileIO->Close();
  }
  if ($NewFPTextFileIO) {
    $NewFPTextFileIO->Close();
  }
  if ($NewFPFileIO) {
    $NewFPFileIO->Close();
  }

  WriteFingerprintsGenerationSummaryStatistics($CmpdCount, $IgnoredCmpdCount);
}

# Process compound being ignored due to problems in fingerprints generation...
#
# $Mode selects the reason reported in the warning: ContainsNonElementalData,
# ContainsNoElementalData or FingerprintsGenerationFailed. Any other value is
# reported as a fingerprints generation failure.
#
sub ProcessIgnoredCompound {
  my($Mode, $CmpdCount, $Molecule) = @_;
  my($CmpdID, $DataFieldLabelAndValuesRef, $Reason);

  $DataFieldLabelAndValuesRef = $Molecule->GetDataFieldLabelAndValues();
  $CmpdID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);

  if ($Mode =~ /^ContainsNonElementalData$/i) {
    $Reason = "Compound contains atom data corresponding to non-elemental atom symbol(s)";
  }
  elsif ($Mode =~ /^ContainsNoElementalData$/i) {
    $Reason = "Compound contains no atom data";
  }
  else {
    # FingerprintsGenerationFailed and any unrecognized mode...
    $Reason = "Fingerprints generation didn't succeed";
  }

  warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: $Reason...\n\n";
}

# Check and filter compounds....
#
# Returns 1, after issuing a warning, for a compound which contains any
# non-element atom data or no element atom data at all; otherwise, returns 0.
#
sub CheckAndFilterCompound {
  my($CmpdCount, $Molecule) = @_;
  my($ElementCount, $NonElementCount);

  ($ElementCount, $NonElementCount) = $Molecule->GetNumOfElementsAndNonElements();

  if ($NonElementCount) {
    ProcessIgnoredCompound('ContainsNonElementalData', $CmpdCount, $Molecule);
    return 1;
  }

  if (!$ElementCount) {
    ProcessIgnoredCompound('ContainsNoElementalData', $CmpdCount, $Molecule);
    return 1;
  }

  return 0;
}

# Write out compounds fingerprints generation summary statistics...
#
sub WriteFingerprintsGenerationSummaryStatistics {
  my($CmpdCount, $IgnoredCmpdCount) = @_;
  my($ProcessedCmpdCount);

  $ProcessedCmpdCount = $CmpdCount - $IgnoredCmpdCount;

  print "\nNumber of compounds: $CmpdCount\n";
  print "Number of compounds processed successfully during fingerprints generation: $ProcessedCmpdCount\n";
  print "Number of compounds ignored during fingerprints generation: $IgnoredCmpdCount\n";
}

# Open output files...
#
# Returns a list of three file IO objects, (SD, Text, FP), for the input file at
# $FileIndex. The entry for an output type which is not enabled is undef.
#
sub SetupAndOpenOutputFiles {
  my($FileIndex) = @_;
  my($NewFPSDFile, $NewFPFile, $NewFPTextFile, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO, %FingerprintsFileIOParams);

  ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = (undef) x 3;

  # Setup common parameters for fingerprints file IO objects...
  #
  %FingerprintsFileIOParams = ('Mode' => 'Write', 'Overwrite' => $OptionsInfo{OverwriteFiles}, 'FingerprintsStringMode' => 'FingerprintsVectorString', 'VectorStringFormat' => $OptionsInfo{VectorStringFormat});

  if ($OptionsInfo{SDOutput}) {
    $NewFPSDFile = $SDFilesInfo{SDOutFileNames}[$FileIndex];
    print "Generating SD file $NewFPSDFile...\n";
    $NewFPSDFileIO = FileIO::FingerprintsSDFileIO->new('Name' => $NewFPSDFile, %FingerprintsFileIOParams, 'FingerprintsFieldLabel' => $OptionsInfo{FingerprintsLabel});
    $NewFPSDFileIO->Open();
  }

  if ($OptionsInfo{FPOutput}) {
    $NewFPFile = $SDFilesInfo{FPOutFileNames}[$FileIndex];
    print "Generating FP file $NewFPFile...\n";
    $NewFPFileIO = FileIO::FingerprintsFPFileIO->new('Name' => $NewFPFile, %FingerprintsFileIOParams);
    $NewFPFileIO->Open();
  }

  if ($OptionsInfo{TextOutput}) {
    my($ColLabelsRef);

    $NewFPTextFile = $SDFilesInfo{TextOutFileNames}[$FileIndex];
    $ColLabelsRef = SetupFPTextFileCoulmnLabels($FileIndex);

    print "Generating text file $NewFPTextFile...\n";
    $NewFPTextFileIO = FileIO::FingerprintsTextFileIO->new('Name' => $NewFPTextFile, %FingerprintsFileIOParams, 'DataColLabels' => $ColLabelsRef, 'OutDelim' => $OptionsInfo{OutDelim}, 'OutQuote' => $OptionsInfo{OutQuote});
    $NewFPTextFileIO->Open();
  }

  return ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
}

# Write fingerprints and other data to appropriate output files...
#
# Each of the three file IO arguments is either an opened writer or undef; data
# is written, in SD, text and FP order, only to the writers which are defined.
#
sub WriteDataToOutputFiles {
  my($FileIndex, $CmpdCount, $Molecule, $TopologicalAtomPairsFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = @_;
  my($DataFieldsRef);

  # Data field labels and values are needed only by text and FP writers...
  $DataFieldsRef = undef;
  if ($NewFPTextFileIO || $NewFPFileIO) {
    $DataFieldsRef = $Molecule->GetDataFieldLabelAndValues();
  }

  if ($NewFPSDFileIO) {
    my $InputCmpdString = $Molecule->GetInputMoleculeString();
    $NewFPSDFileIO->WriteFingerprints($TopologicalAtomPairsFingerprints, $InputCmpdString);
  }

  if ($NewFPTextFileIO) {
    my $RowValuesRef = SetupFPTextFileCoulmnValues($FileIndex, $CmpdCount, $Molecule, $DataFieldsRef);
    $NewFPTextFileIO->WriteFingerprints($TopologicalAtomPairsFingerprints, $RowValuesRef);
  }

  if ($NewFPFileIO) {
    my $ID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldsRef);
    $NewFPFileIO->WriteFingerprints($TopologicalAtomPairsFingerprints, $ID);
  }
}

# Generate appropriate column labels for FPText output file...
#
# Returns a reference to a list containing data column labels, selected by the
# data fields mode, followed by the fingerprints label.
#
sub SetupFPTextFileCoulmnLabels {
  my($FileIndex) = @_;
  my($Mode, @Labels);

  $Mode = $OptionsInfo{DataFieldsMode};

  @Labels = ();
  if ($Mode =~ /^All$/i) {
    @Labels = @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
  }
  elsif ($Mode =~ /^Common$/i) {
    @Labels = @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
  }
  elsif ($Mode =~ /^Specify$/i) {
    @Labels = @{$OptionsInfo{SpecifiedDataFields}};
  }
  elsif ($Mode =~ /^CompoundID$/i) {
    @Labels = ($OptionsInfo{CompoundIDLabel});
  }

  # Fingerprints label is always the last column...
  return [@Labels, $OptionsInfo{FingerprintsLabel}];
}

# Generate column values FPText output file..
#
# Returns a reference to a list of data column values for a compound. The
# values line up with the data column labels generated for the same data fields
# mode; a data field missing from the compound gets an empty string.
#
sub SetupFPTextFileCoulmnValues {
  my($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;
  my($DataFieldsMode, $DataFieldLabelsRef, @ColValues);

  @ColValues = ();
  $DataFieldsMode = $OptionsInfo{DataFieldsMode};

  if ($DataFieldsMode =~ /^CompoundID$/i) {
    push @ColValues, SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
    return \@ColValues;
  }

  # Pick the list of data field labels to report for the current mode...
  $DataFieldLabelsRef = undef;
  if ($DataFieldsMode =~ /^All$/i) {
    $DataFieldLabelsRef = $SDFilesInfo{AllDataFieldsRef}[$FileIndex];
  }
  elsif ($DataFieldsMode =~ /^Common$/i) {
    $DataFieldLabelsRef = $SDFilesInfo{CommonDataFieldsRef}[$FileIndex];
  }
  elsif ($DataFieldsMode =~ /^Specify$/i) {
    $DataFieldLabelsRef = $OptionsInfo{SpecifiedDataFields};
  }

  if ($DataFieldLabelsRef) {
    @ColValues = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : '' } @{$DataFieldLabelsRef};
  }

  return \@ColValues;
}

# Generate compound ID for FP and FPText output files..
#
# The ID is derived according to the compound ID mode: molecule name, label
# prefix followed by the compound record number, molecule name with label
# prefix as a fallback for an empty name, or value of a specified data field.
# An empty string is returned for a missing data field or an unknown mode.
#
sub SetupCmpdIDForOutputFiles {
  my($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;
  my($CmpdID);

  $CmpdID = '';
  if ($OptionsInfo{CompoundIDMode} =~ /^MolNameOrLabelPrefix$/i) {
    my($MolName);
    $MolName = $Molecule->GetName();
    $CmpdID = $MolName ? $MolName : "$OptionsInfo{CompoundID}${CmpdCount}";
  }
  elsif ($OptionsInfo{CompoundIDMode} =~ /^LabelPrefix$/i) {
    $CmpdID = "$OptionsInfo{CompoundID}${CmpdCount}";
  }
  elsif ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i) {
    my($SpecifiedDataField);
    $SpecifiedDataField = $OptionsInfo{CompoundID};
    $CmpdID = exists $DataFieldLabelAndValuesRef->{$SpecifiedDataField} ? $DataFieldLabelAndValuesRef->{$SpecifiedDataField} : '';
  }
  elsif ($OptionsInfo{CompoundIDMode} =~ /^MolName$/i) {
    $CmpdID = $Molecule->GetName();
  }
  return $CmpdID;
}

# Generate fingerprints for molecule...
#
# Returns the fingerprints object on success. Nothing (undef in scalar context)
# is returned when ring detection fails or the fingerprints object reports an
# unsuccessful generation.
#
sub GenerateMoleculeFingerprints {
  my($Molecule) = @_;
  my($TopologicalAtomPairsFingerprints);

  if ($OptionsInfo{KeepLargestComponent}) {
    $Molecule->KeepLargestComponent();
  }
  if (!$Molecule->DetectRings()) {
    return;
  }
  $Molecule->SetAromaticityModel($OptionsInfo{AromaticityModel});
  $Molecule->DetectAromaticity();

  $TopologicalAtomPairsFingerprints = Fingerprints::TopologicalAtomPairsFingerprints->new('Molecule' => $Molecule, 'MinDistance' => $OptionsInfo{MinDistance}, 'MaxDistance' => $OptionsInfo{MaxDistance}, 'AtomIdentifierType' => $OptionsInfo{AtomIdentifierType});
  SetAtomIdentifierTypeValuesToUse($TopologicalAtomPairsFingerprints);

  # Generate fingerprints...
  $TopologicalAtomPairsFingerprints->GenerateFingerprints();

  # Make sure fingerprints generation is successful...
  if (!$TopologicalAtomPairsFingerprints->IsFingerprintsGenerationSuccessful()) {
    return;
  }

  return $TopologicalAtomPairsFingerprints;
}

# Set atom identifier type to use for generating fingerprints...
#
# Passes the atomic invariants or functional classes collected during option
# processing to the fingerprints object; other supported atom identifier types
# need no additional setup.
#
sub SetAtomIdentifierTypeValuesToUse {
  my($TopologicalAtomPairsFingerprints) = @_;

  if ($OptionsInfo{AtomIdentifierType} =~ /^AtomicInvariantsAtomTypes$/i) {
    $TopologicalAtomPairsFingerprints->SetAtomicInvariantsToUse(\@{$OptionsInfo{AtomicInvariantsToUse}});
  }
  elsif ($OptionsInfo{AtomIdentifierType} =~ /^FunctionalClassAtomTypes$/i) {
    $TopologicalAtomPairsFingerprints->SetFunctionalClassesToUse(\@{$OptionsInfo{FunctionalClassesToUse}});
  }
  elsif ($OptionsInfo{AtomIdentifierType} =~ /^(DREIDINGAtomTypes|EStateAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i) {
    # Nothing to do for now...
  }
  else {
    die "Error: The value specified, $Options{atomidentifiertype}, for option \"-a, --AtomIdentifierType\" is not valid. Supported atom identifier types in current release of MayaChemTools: AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes\n";
  }
}

# Retrieve information about SD files...
#
# Fills %SDFilesInfo with per input file lists: FileOkay flag, output file root,
# SD/FP/text output file names and references to all/common data field labels.
# A file is flagged as not okay, with a warning, when it doesn't exist, is not a
# SD file, lacks the data field needed for compound IDs, would be overwritten
# by its own SD output, or has existing output files without overwrite option.
#
sub RetrieveSDFilesInfo {
  my($SDFile, $Index, $FileDir, $FileExt, $FileName, $OutFileRoot, $TextOutFileExt, $SDOutFileExt, $FPOutFileExt, $NewSDFileName, $NewFPFileName, $NewTextFileName, $CheckDataField, $CollectDataFields, $AllDataFieldsRef, $CommonDataFieldsRef);

  %SDFilesInfo = ();
  @{$SDFilesInfo{FileOkay}} = ();
  @{$SDFilesInfo{OutFileRoot}} = ();
  @{$SDFilesInfo{SDOutFileNames}} = ();
  @{$SDFilesInfo{FPOutFileNames}} = ();
  @{$SDFilesInfo{TextOutFileNames}} = ();
  @{$SDFilesInfo{AllDataFieldsRef}} = ();
  @{$SDFilesInfo{CommonDataFieldsRef}} = ();

  $CheckDataField = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) && ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i)) ? 1 : 0;
  $CollectDataFields = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^(All|Common)$/i)) ? 1 : 0;

  FILELIST: for $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];

    $SDFilesInfo{FileOkay}[$Index] = 0;
    $SDFilesInfo{OutFileRoot}[$Index] = '';
    $SDFilesInfo{SDOutFileNames}[$Index] = '';
    $SDFilesInfo{FPOutFileNames}[$Index] = '';
    $SDFilesInfo{TextOutFileNames}[$Index] = '';

    if (!(-e $SDFile)) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    if ($CheckDataField) {
      # Make sure data field exists in SD file; only the first compound
      # record is examined...
      my($SDFileHandle, $CmpdString, $SpecifiedDataField, @CmpdLines, %DataFieldValues);

      @CmpdLines = ();
      # Three-argument open with a lexical handle: the file name is never
      # interpreted as an open mode or a pipe...
      open $SDFileHandle, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
      $CmpdString = ReadCmpdString($SDFileHandle);
      close $SDFileHandle;
      @CmpdLines = split "\n", $CmpdString;
      %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      $SpecifiedDataField = $OptionsInfo{CompoundID};
      if (!exists $DataFieldValues{$SpecifiedDataField}) {
        warn "Warning: Ignoring file $SDFile: Data field value, $SpecifiedDataField, using \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\" doesn't exist\n";
        next FILELIST;
      }
    }

    $AllDataFieldsRef = '';
    $CommonDataFieldsRef = '';
    if ($CollectDataFields) {
      my($SDFileHandle, $CmpdCount);
      open $SDFileHandle, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
      ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = GetAllAndCommonCmpdDataHeaderLabels($SDFileHandle);
      close $SDFileHandle;
    }

    # Setup output file names...
    $FileDir = ""; $FileName = ""; $FileExt = "";
    ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);

    $TextOutFileExt = "csv";
    if ($OptionsInfo{OutDelim} =~ /^tab$/i) {
      $TextOutFileExt = "tsv";
    }
    $SDOutFileExt = $FileExt;
    $FPOutFileExt = "fpf";

    # A user supplied root is honored only for a single input file...
    if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
      if ($RootFileName && $RootFileExt) {
        $FileName = $RootFileName;
      }
      else {
        $FileName = $OptionsInfo{OutFileRoot};
      }
      $OutFileRoot = $FileName;
    }
    else {
      $OutFileRoot = "${FileName}TopologicalAtomPairsFP";
    }

    $NewSDFileName = "${OutFileRoot}.${SDOutFileExt}";
    $NewFPFileName = "${OutFileRoot}.${FPOutFileExt}";
    $NewTextFileName = "${OutFileRoot}.${TextOutFileExt}";

    if ($OptionsInfo{SDOutput}) {
      # Quote the file name so that characters such as . + ( [ are matched
      # literally instead of being interpreted as regex metacharacters...
      if ($SDFile =~ /\Q$NewSDFileName\E/i) {
        warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
        print "Specify a different name using \"-r --root\" option or use default name.\n";
        next FILELIST;
      }
    }

    if (!$OptionsInfo{OverwriteFiles}) {
      # Check SD, FP and text output files...
      if ($OptionsInfo{SDOutput}) {
        if (-e $NewSDFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewSDFileName already exists\n";
          next FILELIST;
        }
      }
      if ($OptionsInfo{FPOutput}) {
        if (-e $NewFPFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewFPFileName already exists\n";
          next FILELIST;
        }
      }
      if ($OptionsInfo{TextOutput}) {
        if (-e $NewTextFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewTextFileName already exists\n";
          next FILELIST;
        }
      }
    }

    $SDFilesInfo{FileOkay}[$Index] = 1;

    $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
    $SDFilesInfo{SDOutFileNames}[$Index] = $NewSDFileName;
    $SDFilesInfo{FPOutFileNames}[$Index] = $NewFPFileName;
    $SDFilesInfo{TextOutFileNames}[$Index] = $NewTextFileName;

    $SDFilesInfo{AllDataFieldsRef}[$Index] = $AllDataFieldsRef;
    $SDFilesInfo{CommonDataFieldsRef}[$Index] = $CommonDataFieldsRef;
  }
}

# Process option values...
#
# Copies validated command line option values from %Options into %OptionsInfo,
# converting Yes/No values into 1/0 flags and applying default values.
#
sub ProcessOptions {
  %OptionsInfo = ();

  ProcessAtomIdentifierTypeOptions();

  $OptionsInfo{AromaticityModel} = $Options{aromaticitymodel};

  $OptionsInfo{CompoundIDMode} = $Options{compoundidmode};
  $OptionsInfo{CompoundIDLabel} = $Options{compoundidlabel};
  $OptionsInfo{DataFieldsMode} = $Options{datafieldsmode};

  my(@SpecifiedDataFields);
  @SpecifiedDataFields = ();

  @{$OptionsInfo{SpecifiedDataFields}} = ();
  $OptionsInfo{CompoundID} = '';

  # Compound IDs are also generated for FP output and for warnings about
  # ignored compounds, so the compound ID value is set up for all data fields
  # modes. A missing data field name stays fatal only in CompoundID data fields
  # mode...
  if ($Options{compoundidmode} =~ /^DataField$/i) {
    if ($Options{compoundid}) {
      $OptionsInfo{CompoundID} = $Options{compoundid};
    }
    elsif ($Options{datafieldsmode} =~ /^CompoundID$/i) {
      die "Error: You must specify a value for \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\". \n";
    }
  }
  elsif ($Options{compoundidmode} =~ /^(LabelPrefix|MolNameOrLabelPrefix)$/i) {
    $OptionsInfo{CompoundID} = $Options{compoundid} ? $Options{compoundid} : 'Cmpd';
  }

  if ($Options{datafieldsmode} =~ /^Specify$/i) {
    if (!$Options{datafields}) {
      die "Error: You must specify a value for \"--DataFields\" option in \"Specify\" \"-d, --DataFieldsMode\". \n";
    }
    @SpecifiedDataFields = split /\,/, $Options{datafields};
    push @{$OptionsInfo{SpecifiedDataFields}}, @SpecifiedDataFields;
  }

  $OptionsInfo{Filter} = ($Options{filter} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{FingerprintsLabel} = $Options{fingerprintslabel} ? $Options{fingerprintslabel} : 'TopologicalAtomPairsFingerprints';

  $OptionsInfo{KeepLargestComponent} = ($Options{keeplargestcomponent} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{MinDistance} = $Options{mindistance};
  $OptionsInfo{MaxDistance} = $Options{maxdistance};

  $OptionsInfo{Output} = $Options{output};
  $OptionsInfo{SDOutput} = ($Options{output} =~ /^(SD|All)$/i) ? 1 : 0;
  $OptionsInfo{FPOutput} = ($Options{output} =~ /^(FP|All)$/i) ? 1 : 0;
  $OptionsInfo{TextOutput} = ($Options{output} =~ /^(Text|All)$/i) ? 1 : 0;

  $OptionsInfo{OutDelim} = $Options{outdelim};
  $OptionsInfo{OutQuote} = ($Options{quote} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
  $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;

  $OptionsInfo{VectorStringFormat} = $Options{vectorstringformat};
}

# Process atom identifier type and related options...
#
# Records the atom identifier type in %OptionsInfo and hands off to the option
# processor for the types which take an additional list of values.
#
sub ProcessAtomIdentifierTypeOptions {
  my($AtomIdentifierType);

  $AtomIdentifierType = $Options{atomidentifiertype};
  $OptionsInfo{AtomIdentifierType} = $AtomIdentifierType;

  if ($AtomIdentifierType =~ /^AtomicInvariantsAtomTypes$/i) {
    ProcessAtomicInvariantsToUseOption();
    return;
  }

  if ($AtomIdentifierType =~ /^FunctionalClassAtomTypes$/i) {
    ProcessFunctionalClassesToUse();
    return;
  }

  # Remaining supported types need no further processing for now...
  if ($AtomIdentifierType =~ /^(DREIDINGAtomTypes|EStateAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i) {
    return;
  }

  die "Error: The value specified, $Options{atomidentifiertype}, for option \"-a, --AtomIdentifierType\" is not valid. Supported atom identifier types in current release of MayaChemTools: AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes\n";
}

# Process specified atomic invariants to use...
#
# Splits the comma delimited "--AtomicInvariantsToUse" value, validates each
# atomic invariant and stores the list in %OptionsInfo. The script dies for an
# empty value, an unavailable atomic invariant, or a list without AS/AtomSymbol.
#
sub ProcessAtomicInvariantsToUseOption {
  my($AtomicInvariant, $AtomSymbolSpecified, @AtomicInvariantsWords);

  @{$OptionsInfo{AtomicInvariantsToUse}} = ();
  if (IsEmpty($Options{atomicinvariantstouse})) {
    die "Error: Atomic invariants value specified using \"--AtomicInvariantsToUse\" option is empty\n";
  }
  $AtomSymbolSpecified = 0;
  @AtomicInvariantsWords = split /\,/, $Options{atomicinvariantstouse};
  for $AtomicInvariant (@AtomicInvariantsWords) {
    if (!AtomTypes::AtomicInvariantsAtomTypes::IsAtomicInvariantAvailable($AtomicInvariant)) {
      # Message ends with a newline so that die doesn't append script
      # name and line number to it...
      die "Error: Atomic invariant specified, $AtomicInvariant, using \"--AtomicInvariantsToUse\" option is not valid...\n";
    }
    if ($AtomicInvariant =~ /^(AS|AtomSymbol)$/i) {
      $AtomSymbolSpecified = 1;
    }
    push @{$OptionsInfo{AtomicInvariantsToUse}}, $AtomicInvariant;
  }
  if (!$AtomSymbolSpecified) {
    die "Error: Atomic invariant, AS or AtomSymbol, must be specified as using \"--AtomicInvariantsToUse\" option...\n";
  }
}

# Process specified functional classes invariants to use...
#
# Splits the comma delimited "--FunctionalClassesToUse" value, validates each
# functional class and stores the list in %OptionsInfo. The script dies for an
# empty value or an unavailable functional class.
#
sub ProcessFunctionalClassesToUse {
  my($FunctionalClass, @FunctionalClassesToUseWords);

  @{$OptionsInfo{FunctionalClassesToUse}} = ();
  if (IsEmpty($Options{functionalclassestouse})) {
    die "Error: Functional classes value specified using \"--FunctionalClassesToUse\" option is empty\n";
  }
  @FunctionalClassesToUseWords = split /\,/, $Options{functionalclassestouse};
  for $FunctionalClass (@FunctionalClassesToUseWords) {
    if (!AtomTypes::FunctionalClassAtomTypes::IsFunctionalClassAvailable($FunctionalClass)) {
      die "Error: Functional class specified, $FunctionalClass, using \"--FunctionalClassesToUse\" option is not valid...\n";
    }
    push @{$OptionsInfo{FunctionalClassesToUse}}, $FunctionalClass;
  }
}

# Setup script usage and retrieve command line arguments specified using various options...
#
# Seeds %Options with default values, overlays the values parsed from the
# command line, changes to the working directory when one is specified, and
# dies with an explanatory message for the first invalid option value found.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();

  $Options{aromaticitymodel} = 'MayaChemToolsAromaticityModel';

  $Options{atomidentifiertype} = 'AtomicInvariantsAtomTypes';
  $Options{atomicinvariantstouse} = 'AS,X,BO,H,FC';

  $Options{functionalclassestouse} = 'HBD,HBA,PI,NI,Ar,Hal';

  $Options{compoundidmode} = 'LabelPrefix';
  $Options{compoundidlabel} = 'CompoundID';
  $Options{datafieldsmode} = 'CompoundID';

  $Options{filter} = 'Yes';

  $Options{keeplargestcomponent} = 'Yes';

  $Options{mindistance} = 1;
  $Options{maxdistance} = 10;

  $Options{output} = 'text';
  $Options{outdelim} = 'comma';
  $Options{quote} = 'yes';

  $Options{vectorstringformat} = 'IDsAndValuesString';

  if (!GetOptions(\%Options, "aromaticitymodel=s", "atomidentifiertype|a=s", "atomicinvariantstouse=s", "functionalclassestouse=s", "compoundid=s", "compoundidlabel=s", "compoundidmode=s", "datafields=s", "datafieldsmode|d=s", "filter|f=s", "fingerprintslabel=s", "help|h", "keeplargestcomponent|k=s", "mindistance=s", "maxdistance=s", "outdelim=s", "output=s", "overwrite|o", "quote|q=s", "root|r=s", "vectorstringformat|v=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if (!Molecule::IsSupportedAromaticityModel($Options{aromaticitymodel})) {
    my(@SupportedModels) = Molecule::GetSupportedAromaticityModels();
    die "Error: The value specified, $Options{aromaticitymodel}, for option \"--AromaticityModel\" is not valid. Supported aromaticity models in current release of MayaChemTools: @SupportedModels\n";
  }
  if ($Options{atomidentifiertype} !~ /^(AtomicInvariantsAtomTypes|DREIDINGAtomTypes|EStateAtomTypes|FunctionalClassAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i) {
    die "Error: The value specified, $Options{atomidentifiertype}, for option \"-a, --AtomIdentifierType\" is not valid. Supported atom identifier types in current release of MayaChemTools: AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes\n";
  }
  if ($Options{compoundidmode} !~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i) {
    die "Error: The value specified, $Options{compoundidmode}, for option \"--CompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n";
  }
  if ($Options{datafieldsmode} !~ /^(All|Common|Specify|CompoundID)$/i) {
    die "Error: The value specified, $Options{datafieldsmode}, for option \"-d, --DataFieldsMode\" is not valid. Allowed values: All, Common, Specify or CompoundID\n";
  }
  if ($Options{filter} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{filter}, for option \"-f, --Filter\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{keeplargestcomponent} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{keeplargestcomponent}, for option \"-k, --KeepLargestComponent\" is not valid. Allowed values: Yes or No\n";
  }
  if (!IsPositiveInteger($Options{mindistance})) {
    die "Error: The value specified, $Options{mindistance}, for option \"--MinDistance\" is not valid. Allowed values: > 0 \n";
  }
  if (!IsPositiveInteger($Options{maxdistance})) {
    die "Error: The value specified, $Options{maxdistance}, for option \"--MaxDistance\" is not valid. Allowed values: > 0 \n";
  }
  # Equal minimum and maximum distance values pass this check...
  if ($Options{mindistance} > $Options{maxdistance}) {
    die "Error: The value specified, $Options{mindistance}, for option \"--MinDistance\" must be less than the value specified, $Options{maxdistance}, for option \"--MaxDistance\" \n";
  }
  if ($Options{output} !~ /^(SD|FP|text|all)$/i) {
    die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, FP, text, or all\n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{quote} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{outdelim} =~ /semicolon/i && $Options{quote} =~ /^No$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not allowed with, semicolon value of \"--outdelim\" option: Fingerprints string use semicolon as delimiter for various data fields and must be quoted.\n";
  }
  if ($Options{vectorstringformat} !~ /^(IDsAndValuesString|IDsAndValuesPairsString|ValuesAndIDsString|ValuesAndIDsPairsString)$/i) {
    die "Error: The value specified, $Options{vectorstringformat}, for option \"-v, --VectorStringFormat\" is not valid. Allowed values: IDsAndValuesString, IDsAndValuesPairsString, ValuesAndIDsString or ValuesAndIDsPairsString\n";
  }
}
