#!/usr/bin/perl -w
#
# $RCSfile: TopologicalPharmacophoreAtomTripletsFingerprints.pl,v $
# $Date: 2015/02/28 20:46:23 $
# $Revision: 1.34 $
#
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2015 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use FileUtil;
use TextUtil;
use SDFileUtil;
use MoleculeFileIO;
use FileIO::FingerprintsSDFileIO;
use FileIO::FingerprintsTextFileIO;
use FileIO::FingerprintsFPFileIO;
use AtomTypes::FunctionalClassAtomTypes;
use Fingerprints::TopologicalPharmacophoreAtomTripletsFingerprints;

# File-scoped state shared with the subroutines defined below.
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
die GetUsageFromPod("$FindBin::Bin/$ScriptName") if ($Options{help} || @ARGV < 1);

my @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Setup information about input files...
print "Checking input SD file(s)...\n";
my(%SDFilesInfo);
RetrieveSDFilesInfo();

# Process input files; only the files which passed all checks are handled...
my($FileIndex);
print "\nProcessing SD files...\n" if (@SDFilesList > 1);

for $FileIndex (grep { $SDFilesInfo{FileOkay}[$_] } 0 .. $#SDFilesList) {
  print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  GenerateTopologicalPharmacophoreAtomTripletsFingerprints($FileIndex);
}
print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Generate fingerprints for a SD file...
#
# The argument is the position of the SD file in @SDFilesList. Each compound is
# read, optionally filtered, and converted to fingerprints; compounds for which
# either step fails are counted as ignored. Output files are opened only after
# the first compound has been fingerprinted successfully, and a summary of the
# counts is printed once the whole file has been read.
#
sub GenerateTopologicalPharmacophoreAtomTripletsFingerprints {
  my($FileIndex) = @_;

  my $InputSDFile = $SDFilesList[$FileIndex];

  my $Reader = MoleculeFileIO->new('Name' => $InputSDFile);
  $Reader->Open();

  # Writers stay undefined until the first set of fingerprints is available...
  my($SDWriter, $TextWriter, $FPWriter) = (undef) x 3;
  my $WritersPending = 1;

  my $TotalCmpds = 0;
  my $SkippedCmpds = 0;

  COMPOUND: while (my $Molecule = $Reader->ReadMolecule()) {
    $TotalCmpds++;

    # Filter compound data before calculating fingerprints...
    if ($OptionsInfo{Filter} && CheckAndFilterCompound($TotalCmpds, $Molecule)) {
      $SkippedCmpds++;
      next COMPOUND;
    }

    my $Fingerprints = GenerateMoleculeFingerprints($Molecule);
    unless ($Fingerprints) {
      $SkippedCmpds++;
      ProcessIgnoredCompound('FingerprintsGenerationFailed', $TotalCmpds, $Molecule);
      next COMPOUND;
    }

    if ($WritersPending) {
      $WritersPending = 0;
      SetupFingerprintsLabelValueIDs($Fingerprints);
      ($SDWriter, $TextWriter, $FPWriter) = SetupAndOpenOutputFiles($FileIndex);
    }

    WriteDataToOutputFiles($FileIndex, $TotalCmpds, $Molecule, $Fingerprints, $SDWriter, $TextWriter, $FPWriter);
  }
  $Reader->Close();

  # Close whichever output files were opened: SD, text and then FP...
  for my $Writer ($SDWriter, $TextWriter, $FPWriter) {
    $Writer->Close() if ($Writer);
  }

  WriteFingerprintsGenerationSummaryStatistics($TotalCmpds, $SkippedCmpds);
}

# Process compound being ignored due to problems in fingerprints generation...
#
# A warning naming the compound record number and its ID is written out. The
# mode selects the explanation: ContainsNonElementalData, ContainsNoElementalData
# or FingerprintsGenerationFailed. Any other mode is reported the same way as
# FingerprintsGenerationFailed.
#
sub ProcessIgnoredCompound {
  my($Mode, $CmpdCount, $Molecule) = @_;

  my $FieldValuesRef = $Molecule->GetDataFieldLabelAndValues();
  my $CmpdID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $FieldValuesRef);

  my($Message);
  if ($Mode =~ /^ContainsNonElementalData$/i) {
    $Message = "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains atom data corresponding to non-elemental atom symbol(s)...\n\n";
  }
  elsif ($Mode =~ /^ContainsNoElementalData$/i) {
    $Message = "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains no atom data...\n\n";
  }
  else {
    # FingerprintsGenerationFailed and any unrecognized mode...
    $Message = "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Fingerprints generation didn't succeed...\n\n";
  }

  warn $Message;
}

# Check and filter compounds....
#
# Returns 1, after writing out a warning, for a compound which contains any
# non-elemental atom data or no elemental atom data at all; otherwise, returns 0.
#
sub CheckAndFilterCompound {
  my($CmpdCount, $Molecule) = @_;

  my($NumOfElements, $NumOfNonElements) = $Molecule->GetNumOfElementsAndNonElements();

  # Non-elemental data is checked first, so it decides the mode when both apply...
  my $IgnoreMode = $NumOfNonElements ? 'ContainsNonElementalData'
                 : !$NumOfElements   ? 'ContainsNoElementalData'
                 :                     '';

  return 0 unless ($IgnoreMode);

  ProcessIgnoredCompound($IgnoreMode, $CmpdCount, $Molecule);
  return 1;
}

# Write out compounds fingerprints generation summary statistics...
#
# Arguments are the total number of compounds read from a SD file and the
# number of those compounds which were ignored; the number of compounds
# processed successfully is the difference between the two.
#
sub WriteFingerprintsGenerationSummaryStatistics {
  my($CmpdCount, $IgnoredCmpdCount) = @_;
  my($ProcessedCmpdCount);

  $ProcessedCmpdCount = $CmpdCount - $IgnoredCmpdCount;

  print "\nNumber of compounds: $CmpdCount\n";
  print "Number of compounds processed successfully during fingerprints generation: $ProcessedCmpdCount\n";
  print "Number of compounds ignored during fingerprints generation: $IgnoredCmpdCount\n";
}

# Append atom triplets value IDs to fingerprints label...
#
# Nothing is done for ArbitrarySize atom triplets set or when the label mode is
# not FingerprintsLabelWithIDs.
#
# This function is called once for each input SD file, right before its output
# files are opened. The label is consequently rebuilt from the label which was
# in effect before the first call instead of being appended to; otherwise, the
# value IDs would be repeated in the label for every file after the first one.
#
sub SetupFingerprintsLabelValueIDs {
  my($TopologicalPharmacophoreAtomTripletsFingerprints) = @_;

  if ($OptionsInfo{AtomTripletsSetSizeToUse} =~ /^ArbitrarySize$/i ||
    $OptionsInfo{FingerprintsLabelMode} !~ /^FingerprintsLabelWithIDs$/i) {
    return;
  }

  # Remember the label without any value IDs during the first call...
  if (!exists $OptionsInfo{FingerprintsLabelWithoutValueIDs}) {
    $OptionsInfo{FingerprintsLabelWithoutValueIDs} = $OptionsInfo{FingerprintsLabel};
  }

  $OptionsInfo{FingerprintsLabel} = $OptionsInfo{FingerprintsLabelWithoutValueIDs} . "; Value IDs: " . $TopologicalPharmacophoreAtomTripletsFingerprints->GetFingerprintsVector->GetValueIDsString();
}

# Open output files...
#
# The argument is the position of the input SD file in @SDFilesList; it selects
# the output file names set up earlier in %SDFilesInfo. A fingerprints file IO
# object is created and opened for each output type turned on in %OptionsInfo.
#
# Returns a list containing SD, text and FP file IO objects, in that order; the
# entry for an output type which is not being generated is undef.
#
sub SetupAndOpenOutputFiles {
  my($FileIndex) = @_;
  my($NewFPSDFile, $NewFPFile, $NewFPTextFile, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO, %FingerprintsFileIOParams);

  ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = (undef) x 3;

  # Setup common parameters for fingerprints file IO objects...
  #
  %FingerprintsFileIOParams = ('Mode' => 'Write', 'Overwrite' => $OptionsInfo{OverwriteFiles}, 'FingerprintsStringMode' => 'FingerprintsVectorString', 'VectorStringFormat' => $OptionsInfo{VectorStringFormat});

  if ($OptionsInfo{SDOutput}) {
    $NewFPSDFile = $SDFilesInfo{SDOutFileNames}[$FileIndex];
    print "Generating SD file $NewFPSDFile...\n";
    $NewFPSDFileIO = FileIO::FingerprintsSDFileIO->new('Name' => $NewFPSDFile, %FingerprintsFileIOParams, 'FingerprintsFieldLabel' => $OptionsInfo{FingerprintsLabel});
    $NewFPSDFileIO->Open();
  }

  if ($OptionsInfo{FPOutput}) {
    $NewFPFile = $SDFilesInfo{FPOutFileNames}[$FileIndex];
    print "Generating FP file $NewFPFile...\n";
    $NewFPFileIO = FileIO::FingerprintsFPFileIO->new('Name' => $NewFPFile, %FingerprintsFileIOParams);
    $NewFPFileIO->Open();
  }

  if ($OptionsInfo{TextOutput}) {
    my($ColLabelsRef);

    $NewFPTextFile = $SDFilesInfo{TextOutFileNames}[$FileIndex];
    $ColLabelsRef = SetupFPTextFileCoulmnLabels($FileIndex);

    print "Generating text file $NewFPTextFile...\n";
    $NewFPTextFileIO = FileIO::FingerprintsTextFileIO->new('Name' => $NewFPTextFile, %FingerprintsFileIOParams, 'DataColLabels' => $ColLabelsRef, 'OutDelim' => $OptionsInfo{OutDelim}, 'OutQuote' => $OptionsInfo{OutQuote});
    $NewFPTextFileIO->Open();
  }

  return ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
}

# Write fingerprints and other data to appropriate output files...
#
# The fingerprints are written to each of the SD, text and FP file IO objects
# which is defined. SD output is written along with the input molecule string,
# text output along with the data column values, and FP output along with the
# compound ID.
#
sub WriteDataToOutputFiles {
  my($FileIndex, $CmpdCount, $Molecule, $TopologicalPharmacophoreAtomTripletsFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = @_;

  # Data fields are needed only by text and FP output...
  my $FieldValuesRef = ($NewFPTextFileIO || $NewFPFileIO) ? $Molecule->GetDataFieldLabelAndValues() : undef;

  if ($NewFPSDFileIO) {
    my $MoleculeString = $Molecule->GetInputMoleculeString();
    $NewFPSDFileIO->WriteFingerprints($TopologicalPharmacophoreAtomTripletsFingerprints, $MoleculeString);
  }

  if ($NewFPTextFileIO) {
    my $RowValuesRef = SetupFPTextFileCoulmnValues($FileIndex, $CmpdCount, $Molecule, $FieldValuesRef);
    $NewFPTextFileIO->WriteFingerprints($TopologicalPharmacophoreAtomTripletsFingerprints, $RowValuesRef);
  }

  if ($NewFPFileIO) {
    my $ID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $FieldValuesRef);
    $NewFPFileIO->WriteFingerprints($TopologicalPharmacophoreAtomTripletsFingerprints, $ID);
  }
}

# Generate appropriate column labels for FPText output file...
#
# Returns a reference to an array containing data column labels selected by
# DataFieldsMode followed by the fingerprints label.
#
sub SetupFPTextFileCoulmnLabels {
  my($FileIndex) = @_;

  my $FieldsMode = $OptionsInfo{DataFieldsMode};
  my @Labels = ();

  if ($FieldsMode =~ /^All$/i) {
    @Labels = @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
  }
  elsif ($FieldsMode =~ /^Common$/i) {
    @Labels = @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
  }
  elsif ($FieldsMode =~ /^Specify$/i) {
    @Labels = @{$OptionsInfo{SpecifiedDataFields}};
  }
  elsif ($FieldsMode =~ /^CompoundID$/i) {
    @Labels = ($OptionsInfo{CompoundIDLabel});
  }

  # Fingerprints label is always the last column...
  return [@Labels, $OptionsInfo{FingerprintsLabel}];
}

# Generate column values FPText output file..
#
# Returns a reference to an array containing either the compound ID or values
# of the data fields selected by DataFieldsMode; an empty string is used for a
# data field which is not present in the compound.
#
sub SetupFPTextFileCoulmnValues {
  my($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;

  my $FieldsMode = $OptionsInfo{DataFieldsMode};
  my $ValueOrEmpty = sub {
    my($Label) = @_;
    return exists $DataFieldLabelAndValuesRef->{$Label} ? $DataFieldLabelAndValuesRef->{$Label} : '';
  };

  my @Values = ();
  if ($FieldsMode =~ /^CompoundID$/i) {
    push @Values, SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
  }
  elsif ($FieldsMode =~ /^All$/i) {
    @Values = map { $ValueOrEmpty->($_) } @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
  }
  elsif ($FieldsMode =~ /^Common$/i) {
    @Values = map { $ValueOrEmpty->($_) } @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
  }
  elsif ($FieldsMode =~ /^Specify$/i) {
    @Values = map { $ValueOrEmpty->($_) } @{$OptionsInfo{SpecifiedDataFields}};
  }

  return \@Values;
}

# Generate compound ID for FP and FPText output files..
#
# The ID is derived according to CompoundIDMode: molecule name, a label prefix
# followed by the compound record number, value of a specified data field, or
# molecule name with a fall back to the prefixed record number. An empty string
# is returned for a missing data field or an unrecognized mode.
#
sub SetupCmpdIDForOutputFiles {
  my($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;

  my $IDMode = $OptionsInfo{CompoundIDMode};
  my $ID = '';

  if ($IDMode =~ /^MolNameOrLabelPrefix$/i) {
    my $Name = $Molecule->GetName();
    $ID = "$OptionsInfo{CompoundID}${CmpdCount}";
    $ID = $Name if ($Name);
  }
  elsif ($IDMode =~ /^LabelPrefix$/i) {
    $ID = "$OptionsInfo{CompoundID}${CmpdCount}";
  }
  elsif ($IDMode =~ /^DataField$/i) {
    my $FieldLabel = $OptionsInfo{CompoundID};
    if (exists $DataFieldLabelAndValuesRef->{$FieldLabel}) {
      $ID = $DataFieldLabelAndValuesRef->{$FieldLabel};
    }
  }
  elsif ($IDMode =~ /^MolName$/i) {
    $ID = $Molecule->GetName();
  }

  return $ID;
}

# Generate fingerprints for molecule...
#
# The molecule is prepared by optionally keeping only its largest component,
# detecting rings and detecting aromaticity using the specified aromaticity
# model. Returns the fingerprints object, or undef when ring detection or
# fingerprints generation doesn't succeed.
#
sub GenerateMoleculeFingerprints {
  my($Molecule) = @_;

  $Molecule->KeepLargestComponent() if ($OptionsInfo{KeepLargestComponent});

  return undef unless ($Molecule->DetectRings());

  $Molecule->SetAromaticityModel($OptionsInfo{AromaticityModel});
  $Molecule->DetectAromaticity();

  # An array keeps the constructor parameters in the original order...
  my @FingerprintsParams = (
    'Molecule' => $Molecule,
    'AtomTripletsSetSizeToUse' => $OptionsInfo{AtomTripletsSetSizeToUse},
    'MinDistance' => $OptionsInfo{MinDistance},
    'MaxDistance' => $OptionsInfo{MaxDistance},
    'DistanceBinSize' => $OptionsInfo{DistanceBinSize},
    'UseTriangleInequality' => $OptionsInfo{UseTriangleInequality},
    'AtomTypesToUse' => \@{$OptionsInfo{AtomTypesToUse}},
  );
  my $Fingerprints = Fingerprints::TopologicalPharmacophoreAtomTripletsFingerprints->new(@FingerprintsParams);

  # Generate fingerprints...
  $Fingerprints->GenerateFingerprints();

  # Make sure fingerprints generation is successful...
  return undef unless ($Fingerprints->IsFingerprintsGenerationSuccessful());

  return $Fingerprints;
}

# Retrieve information about SD files...
#
# Each input SD file is checked and its output file names are set up. Results
# are stored in %SDFilesInfo arrays indexed by the position of the file in
# @SDFilesList. A file failing any of the checks is reported using a warning
# and left with FileOkay set to 0.
#
# Checks performed: the file exists and is a SD file; the data field specified
# for compound IDs is present in the first compound record (only for text
# output using CompoundID data fields mode along with DataField compound ID
# mode); the SD output file name doesn't match the input file name; and, unless
# overwriting is allowed, none of the output files to be generated exists.
#
sub RetrieveSDFilesInfo {
  my($SDFile, $Index, $FileDir, $FileExt, $FileName, $OutFileRoot, $TextOutFileExt, $SDOutFileExt, $FPOutFileExt, $NewSDFileName, $NewFPFileName, $NewTextFileName, $CheckDataField, $CollectDataFields, $AllDataFieldsRef, $CommonDataFieldsRef);

  %SDFilesInfo = ();
  @{$SDFilesInfo{FileOkay}} = ();
  @{$SDFilesInfo{OutFileRoot}} = ();
  @{$SDFilesInfo{SDOutFileNames}} = ();
  @{$SDFilesInfo{FPOutFileNames}} = ();
  @{$SDFilesInfo{TextOutFileNames}} = ();
  @{$SDFilesInfo{AllDataFieldsRef}} = ();
  @{$SDFilesInfo{CommonDataFieldsRef}} = ();

  $CheckDataField = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) && ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i)) ? 1 : 0;
  $CollectDataFields = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^(All|Common)$/i)) ? 1 : 0;

  FILELIST: for $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];

    $SDFilesInfo{FileOkay}[$Index] = 0;
    $SDFilesInfo{OutFileRoot}[$Index] = '';
    $SDFilesInfo{SDOutFileNames}[$Index] = '';
    $SDFilesInfo{FPOutFileNames}[$Index] = '';
    $SDFilesInfo{TextOutFileNames}[$Index] = '';

    if (!(-e $SDFile)) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    if ($CheckDataField) {
      # Make sure data field exists in SD file; only the first compound record is examined...
      my($CmpdString, $SpecifiedDataField, $SDFileHandle, @CmpdLines, %DataFieldValues);

      @CmpdLines = ();
      # Three argument open keeps unusual characters in file name from being treated as open mode...
      open $SDFileHandle, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
      $CmpdString = ReadCmpdString($SDFileHandle);
      close $SDFileHandle;
      @CmpdLines = split "\n", $CmpdString;
      %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      $SpecifiedDataField = $OptionsInfo{CompoundID};
      if (!exists $DataFieldValues{$SpecifiedDataField}) {
        warn "Warning: Ignoring file $SDFile: Data field value, $SpecifiedDataField, using \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\" doesn't exist\n";
        next FILELIST;
      }
    }

    $AllDataFieldsRef = '';
    $CommonDataFieldsRef = '';
    if ($CollectDataFields) {
      my($CmpdCount, $SDFileHandle);
      open $SDFileHandle, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
      ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = GetAllAndCommonCmpdDataHeaderLabels($SDFileHandle);
      close $SDFileHandle;
    }

    # Setup output file names...
    $FileDir = ""; $FileName = ""; $FileExt = "";
    ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);

    $TextOutFileExt = "csv";
    if ($Options{outdelim} =~ /^tab$/i) {
      $TextOutFileExt = "tsv";
    }
    $SDOutFileExt = $FileExt;
    $FPOutFileExt = "fpf";

    # The specified root is used only for a single input file...
    if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
      if ($RootFileName && $RootFileExt) {
        $FileName = $RootFileName;
      }
      else {
        $FileName = $OptionsInfo{OutFileRoot};
      }
      $OutFileRoot = $FileName;
    }
    else {
      $OutFileRoot = "${FileName}TopologicalPharmacophoreAtomTripletsFP";
    }

    $NewSDFileName = "${OutFileRoot}.${SDOutFileExt}";
    $NewFPFileName = "${OutFileRoot}.${FPOutFileExt}";
    $NewTextFileName = "${OutFileRoot}.${TextOutFileExt}";

    if ($OptionsInfo{SDOutput}) {
      # Quote the file name so that characters such as . + ( are matched
      # literally instead of being interpreted as regular expression syntax...
      if ($SDFile =~ /\Q$NewSDFileName\E/i) {
        warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
        print "Specify a different name using \"-r --root\" option or use default name.\n";
        next FILELIST;
      }
    }

    if (!$OptionsInfo{OverwriteFiles}) {
      # Check SD, FP and text output files...
      if ($OptionsInfo{SDOutput}) {
        if (-e $NewSDFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewSDFileName already exists\n";
          next FILELIST;
        }
      }
      if ($OptionsInfo{FPOutput}) {
        if (-e $NewFPFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewFPFileName already exists\n";
          next FILELIST;
        }
      }
      if ($OptionsInfo{TextOutput}) {
        if (-e $NewTextFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewTextFileName already exists\n";
          next FILELIST;
        }
      }
    }

    $SDFilesInfo{FileOkay}[$Index] = 1;

    $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
    $SDFilesInfo{SDOutFileNames}[$Index] = $NewSDFileName;
    $SDFilesInfo{FPOutFileNames}[$Index] = $NewFPFileName;
    $SDFilesInfo{TextOutFileNames}[$Index] = $NewTextFileName;

    $SDFilesInfo{AllDataFieldsRef}[$Index] = $AllDataFieldsRef;
    $SDFilesInfo{CommonDataFieldsRef}[$Index] = $CommonDataFieldsRef;
  }
}

# Process option values...
#
# Values in %Options, already validated by SetupScriptUsage, are converted into
# the form used by rest of the script and stored in %OptionsInfo: Yes/No values
# become 1/0 flags, comma delimited lists become arrays, and defaults which
# depend on other options are filled in.
#
sub ProcessOptions {
  %OptionsInfo = ();

  ProcessAtomTypesToUseOption();

  $OptionsInfo{AromaticityModel} = $Options{aromaticitymodel};

  $OptionsInfo{AtomTripletsSetSizeToUse} = $Options{atomtripletssetsizetouse};

  $OptionsInfo{CompoundIDMode} = $Options{compoundidmode};
  $OptionsInfo{CompoundIDLabel} = $Options{compoundidlabel};
  $OptionsInfo{DataFieldsMode} = $Options{datafieldsmode};

  my(@SpecifiedDataFields);
  @SpecifiedDataFields = ();

  @{$OptionsInfo{SpecifiedDataFields}} = ();
  $OptionsInfo{CompoundID} = '';

  if ($Options{datafieldsmode} =~ /^CompoundID$/i) {
    if ($Options{compoundidmode} =~ /^DataField$/i) {
      if (!$Options{compoundid}) {
        die "Error: You must specify a value for \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\". \n";
      }
      $OptionsInfo{CompoundID} = $Options{compoundid};
    }
    elsif ($Options{compoundidmode} =~ /^(LabelPrefix|MolNameOrLabelPrefix)$/i) {
      $OptionsInfo{CompoundID} = $Options{compoundid} ? $Options{compoundid} : 'Cmpd';
    }
  }
  elsif ($Options{datafieldsmode} =~ /^Specify$/i) {
    if (!$Options{datafields}) {
      die "Error: You must specify a value for \"--DataFields\" option in \"Specify\" \"-d, --DataFieldsMode\". \n";
    }
    @SpecifiedDataFields = split /\,/, $Options{datafields};
    push @{$OptionsInfo{SpecifiedDataFields}}, @SpecifiedDataFields;
  }

  $OptionsInfo{Filter} = ($Options{filter} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{FingerprintsLabelMode} = $Options{fingerprintslabelmode};
  $OptionsInfo{FingerprintsLabel} = $Options{fingerprintslabel} ? $Options{fingerprintslabel} : 'TopologicalPharmacophoreAtomTripletsFingerprints';

  $OptionsInfo{KeepLargestComponent} = ($Options{keeplargestcomponent} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{DistanceBinSize} = $Options{distancebinsize};

  $OptionsInfo{MinDistance} = $Options{mindistance};
  $OptionsInfo{MaxDistance} = $Options{maxdistance};

  $OptionsInfo{Output} = $Options{output};
  $OptionsInfo{SDOutput} = ($Options{output} =~ /^(SD|All)$/i) ? 1 : 0;
  $OptionsInfo{FPOutput} = ($Options{output} =~ /^(FP|All)$/i) ? 1 : 0;
  $OptionsInfo{TextOutput} = ($Options{output} =~ /^(Text|All)$/i) ? 1 : 0;

  $OptionsInfo{OutDelim} = $Options{outdelim};
  $OptionsInfo{OutQuote} = ($Options{quote} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
  $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;

  $OptionsInfo{UseTriangleInequality} = ($Options{usetriangleinequality} =~ /^Yes$/i) ? 1 : 0;

  # Setup default vector string format...
  my($VectorStringFormat);
  $VectorStringFormat = '';

  if ($Options{vectorstringformat}) {
    $VectorStringFormat = $Options{vectorstringformat};

    if ($Options{atomtripletssetsizetouse} =~ /^ArbitrarySize$/i && $VectorStringFormat =~ /^ValuesString$/i) {
      die "Error: The value specified, $Options{vectorstringformat}, for option \"-v, --VectorStringFormat\" is not valid for $Options{atomtripletssetsizetouse} value of \"--AtomTripletsSetSizeToUse\" option. Allowed values: IDsAndValuesString, IDsAndValuesPairsString, ValuesAndIDsString or ValuesAndIDsPairsString\n";
    }
  }
  else {
    # Match FixedSize irrespective of case, the same way the option value is
    # validated; otherwise, a value such as fixedsize would get the default
    # format meant for ArbitrarySize...
    $VectorStringFormat = ($Options{atomtripletssetsizetouse} =~ /^FixedSize$/i) ? "ValuesString" : "IDsAndValuesString";
  }
  $OptionsInfo{VectorStringFormat} = $VectorStringFormat;
}

# Process atom type to use option...
#
# The comma delimited value of AtomTypesToUse option is split, after removing
# all spaces, into atom types which are stored in @{$OptionsInfo{AtomTypesToUse}}.
# The script dies for an empty value or an atom type which is not an available
# functional class.
#
sub ProcessAtomTypesToUseOption {
  my($AtomType, $SpecifiedAtomTypesToUse, @AtomTypesWords);

  @{$OptionsInfo{AtomTypesToUse}} = ();
  if (IsEmpty($Options{atomtypestouse})) {
    die "Error: Atom types value specified using \"-a, --AtomTypesToUse\" option is empty\n";
  }

  $SpecifiedAtomTypesToUse = $Options{atomtypestouse};
  $SpecifiedAtomTypesToUse =~ s/ //g;
  @AtomTypesWords = split /\,/, $SpecifiedAtomTypesToUse;

  for $AtomType (@AtomTypesWords) {
    if (!AtomTypes::FunctionalClassAtomTypes::IsFunctionalClassAvailable($AtomType)) {
      # The message must end with a newline to keep die from appending script name and line number...
      die "Error: Atom type specified, $AtomType, using \"-a, --AtomTypesToUse\" option is not valid...\n";
    }
    push @{$OptionsInfo{AtomTypesToUse}}, $AtomType;
  }
}

# Setup script usage and retrieve command line arguments specified using various options...
#
# %Options is initialized with default values, updated using command line
# options, and then validated; the script dies with an error message for the
# first invalid value found. The current directory is changed to the working
# directory, when one is specified, before the values are validated.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();

  $Options{aromaticitymodel} = 'MayaChemToolsAromaticityModel';

  $Options{atomtripletssetsizetouse} = 'ArbitrarySize';

  $Options{atomtypestouse} = 'HBD,HBA,PI,NI,H,Ar';

  $Options{compoundidmode} = 'LabelPrefix';
  $Options{compoundidlabel} = 'CompoundID';
  $Options{datafieldsmode} = 'CompoundID';

  $Options{filter} = 'Yes';

  $Options{fingerprintslabelmode} = 'FingerprintsLabelOnly';

  $Options{keeplargestcomponent} = 'Yes';

  $Options{mindistance} = 1;
  $Options{maxdistance} = 10;

  $Options{distancebinsize} = 2;

  $Options{usetriangleinequality} = 'Yes';

  $Options{output} = 'text';
  $Options{outdelim} = 'comma';
  $Options{quote} = 'yes';

  $Options{vectorstringformat} = '';

  if (!GetOptions(\%Options,
      "aromaticitymodel=s", "atomtripletssetsizetouse=s", "atomtypestouse|a=s",
      "compoundid=s", "compoundidlabel=s", "compoundidmode=s",
      "datafields=s", "datafieldsmode|d=s", "distancebinsize=s",
      "filter|f=s", "fingerprintslabelmode=s", "fingerprintslabel=s",
      "help|h", "keeplargestcomponent|k=s", "mindistance=s", "maxdistance=s",
      "outdelim=s", "output=s", "overwrite|o", "quote|q=s", "root|r=s",
      "usetriangleinequality|u=s", "vectorstringformat|v=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if (!Molecule::IsSupportedAromaticityModel($Options{aromaticitymodel})) {
    my(@SupportedModels) = Molecule::GetSupportedAromaticityModels();
    die "Error: The value specified, $Options{aromaticitymodel}, for option \"--AromaticityModel\" is not valid. Supported aromaticity models in current release of MayaChemTools: @SupportedModels\n";
  }
  if ($Options{atomtripletssetsizetouse} !~ /^(ArbitrarySize|FixedSize)$/i) {
    die "Error: The value specified, $Options{atomtripletssetsizetouse}, for option \"--AtomTripletsSetSizeToUse\" is not valid. Allowed values: ArbitrarySize or FixedSize\n";
  }
  if ($Options{compoundidmode} !~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i) {
    die "Error: The value specified, $Options{compoundidmode}, for option \"--CompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n";
  }
  if ($Options{datafieldsmode} !~ /^(All|Common|Specify|CompoundID)$/i) {
    die "Error: The value specified, $Options{datafieldsmode}, for option \"-d, --DataFieldsMode\" is not valid. Allowed values: All, Common, Specify or CompoundID\n";
  }
  if (!IsPositiveInteger($Options{distancebinsize})) {
    die "Error: The value specified, $Options{distancebinsize}, for option \"--DistanceBinSize\" is not valid. Allowed values: > 0 \n";
  }
  if ($Options{filter} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{filter}, for option \"-f, --Filter\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{fingerprintslabelmode} !~ /^(FingerprintsLabelOnly|FingerprintsLabelWithIDs)$/i) {
    die "Error: The value specified, $Options{fingerprintslabelmode}, for option \"--FingerprintsLabelMode\" is not valid. Allowed values: FingerprintsLabelOnly or FingerprintsLabelWithIDs\n";
  }
  if ($Options{keeplargestcomponent} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{keeplargestcomponent}, for option \"-k, --KeepLargestComponent\" is not valid. Allowed values: Yes or No\n";
  }
  if (!IsPositiveInteger($Options{mindistance})) {
    die "Error: The value specified, $Options{mindistance}, for option \"--MinDistance\" is not valid. Allowed values: > 0 \n";
  }
  if (!IsPositiveInteger($Options{maxdistance})) {
    die "Error: The value specified, $Options{maxdistance}, for option \"--MaxDistance\" is not valid. Allowed values: > 0 \n";
  }
  if ($Options{mindistance} > $Options{maxdistance}) {
    die "Error: The value specified, $Options{mindistance}, for option \"--MinDistance\" must be less than the value specified, $Options{maxdistance}, for option \"--MaxDistance\" \n";
  }
  if ($Options{output} !~ /^(SD|FP|text|all)$/i) {
    die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, FP, text, or all\n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{quote} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{outdelim} =~ /semicolon/i && $Options{quote} =~ /^No$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not allowed with, semicolon value of \"--outdelim\" option: Fingerprints string use semicolon as delimiter for various data fields and must be quoted.\n";
  }
  if ($Options{usetriangleinequality} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{usetriangleinequality}, for option \"-u, --UseTriangleInequality\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{vectorstringformat} && $Options{vectorstringformat} !~ /^(ValuesString|IDsAndValuesString|IDsAndValuesPairsString|ValuesAndIDsString|ValuesAndIDsPairsString)$/i) {
    die "Error: The value specified, $Options{vectorstringformat}, for option \"-v, --VectorStringFormat\" is not valid. Allowed values: ValuesString, IDsAndValuesString, IDsAndValuesPairsString, ValuesAndIDsString or ValuesAndIDsPairsString\n";
  }
}
