#!/usr/bin/perl -w
#
# $RCSfile: CalculatePhysicochemicalProperties.pl,v $
# $Date: 2015/02/28 20:46:19 $
# $Revision: 1.20 $
#
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2015 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use FileUtil;
use TextUtil;
use SDFileUtil;
use MoleculeFileIO;
use Molecule;
use AtomTypes::AtomicInvariantsAtomTypes;
use AtomTypes::FunctionalClassAtomTypes;
use MolecularDescriptors::MolecularDescriptorsGenerator;

# File-level state shared by the main script and the subroutines in this file...
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT so that progress messages show up as they are printed...
$| = 1;

# Starting message; the start time is used for the total time report at the end...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script; show usage when help is requested or no
# input file is specified...
SetupScriptUsage();
die GetUsageFromPod("$FindBin::Bin/$ScriptName") if ($Options{help} || @ARGV < 1);

# Expand command line arguments into a list of SD files...
my @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Setup information about input files...
print "Checking input SD file(s)...\n";
my(%SDFilesInfo);
RetrieveSDFilesInfo();

# Process input files; files which failed the checks in RetrieveSDFilesInfo are skipped...
my($FileIndex);
if (@SDFilesList > 1) {
  print "\nProcessing SD files...\n";
}
for $FileIndex (0 .. $#SDFilesList) {
  next unless $SDFilesInfo{FileOkay}[$FileIndex];

  print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  CalculatePhysicochemicalProperties($FileIndex);
}
print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Calculate physicochemical properties for a SD file...
#
# Parameters:
#   $FileIndex - index of the SD file in @SDFilesList; the same index is used to
#                look up the output file names in %SDFilesInfo.
#
# Reads molecules one at a time from the SD file, optionally filters them,
# calculates the specified properties, writes results to the SD and/or text
# output files and prints summary statistics. No value is returned.
#
sub CalculatePhysicochemicalProperties {
  my($FileIndex) = @_;
  my($CmpdCount, $IgnoredCmpdCount, $RuleOf5ViolationsCount, $RuleOf3ViolationsCount, $SDFile, $MoleculeFileIO, $Molecule, $MolecularDescriptorsGenerator, $PhysicochemicalPropertiesDataRef, $NewSDFileRef, $NewTextFileRef);

  $SDFile = $SDFilesList[$FileIndex];

  # Setup output files; a handle is an empty string when that output type is not requested...
  ($NewSDFileRef, $NewTextFileRef) = SetupAndOpenOutputFiles($FileIndex);

  # Setup molecular descriptor generator to calculate property values for specified
  # property names...
  $MolecularDescriptorsGenerator = SetupMolecularDescriptorsGenerator();

  ($CmpdCount, $IgnoredCmpdCount, $RuleOf5ViolationsCount, $RuleOf3ViolationsCount) = (0) x 4;

  $MoleculeFileIO = MoleculeFileIO->new('Name' => $SDFile);
  $MoleculeFileIO->Open();

  COMPOUND: while ($Molecule = $MoleculeFileIO->ReadMolecule()) {
    $CmpdCount++;

    # Filter compound data before calculating physicochemical properties...
    if ($OptionsInfo{Filter} && CheckAndFilterCompound($CmpdCount, $Molecule)) {
      $IgnoredCmpdCount++;
      next COMPOUND;
    }

    # Calculate properties; an undefined result indicates a failed calculation...
    $PhysicochemicalPropertiesDataRef = CalculateMoleculeProperties($MolecularDescriptorsGenerator, $Molecule);

    if (!defined($PhysicochemicalPropertiesDataRef)) {
      $IgnoredCmpdCount++;
      ProcessIgnoredCompound('PropertiesCalculationFailed', $CmpdCount, $Molecule);
      next COMPOUND;
    }

    # Count compounds with one or more rule violations...
    if ($OptionsInfo{RuleOf5Violations} && $PhysicochemicalPropertiesDataRef->{RuleOf5Violations}) {
      $RuleOf5ViolationsCount++;
    }

    if ($OptionsInfo{RuleOf3Violations} && $PhysicochemicalPropertiesDataRef->{RuleOf3Violations}) {
      $RuleOf3ViolationsCount++;
    }

    # Write out calculated properties...
    WriteDataToOutputFiles($FileIndex, $CmpdCount, $Molecule, $PhysicochemicalPropertiesDataRef, $NewSDFileRef, $NewTextFileRef);
  }
  $MoleculeFileIO->Close();

  # Buffered write errors only surface at close, so report a failed close
  # instead of silently leaving a truncated output file behind...
  if ($OptionsInfo{SDOutput} && $NewSDFileRef) {
    close $NewSDFileRef or warn "Warning: Couldn't close SD output file generated for $SDFile: $!\n";
  }
  if ($OptionsInfo{TextOutput} && $NewTextFileRef) {
    close $NewTextFileRef or warn "Warning: Couldn't close text output file generated for $SDFile: $!\n";
  }

  WriteCalculationSummaryStatistics($CmpdCount, $IgnoredCmpdCount, $RuleOf5ViolationsCount, $RuleOf3ViolationsCount);
}

# Process compound being ignored due to problems in physicochemical properties calculation...
#
# Parameters:
#   $Mode      - reason for ignoring the compound: ContainsNonElementalData,
#                ContainsNoElementalData or PropertiesCalculationFailed.
#   $CmpdCount - record number of the compound in the input file.
#   $Molecule  - molecule object for the compound.
#
# A warning describing the reason is written to STDERR.
#
sub ProcessIgnoredCompound {
  my($Mode, $CmpdCount, $Molecule) = @_;

  my $DataFieldLabelAndValuesRef = $Molecule->GetDataFieldLabelAndValues();
  my $CmpdID = SetupCmpdIDForTextFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);

  if ($Mode =~ /^ContainsNonElementalData$/i) {
    warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains atom data corresponding to non-elemental atom symbol(s)...\n\n";
  }
  elsif ($Mode =~ /^ContainsNoElementalData$/i) {
    warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains no atom data...\n\n";
  }
  else {
    # PropertiesCalculationFailed and any unrecognized mode share the same message...
    warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Physicochemical properties calculation didn't succeed...\n\n";
  }
}

# Check and filter compounds....
#
# Parameters:
#   $CmpdCount - record number of the compound in the input file.
#   $Molecule  - molecule object to check.
#
# Returns 1 after reporting the compound as ignored when it contains any
# non-element atoms or no element atoms at all; otherwise returns 0.
#
sub CheckAndFilterCompound {
  my($CmpdCount, $Molecule) = @_;
  my($ElementCount, $NonElementCount) = $Molecule->GetNumOfElementsAndNonElements();

  # Non-elemental data takes precedence over missing elemental data...
  my $IgnoreReason = '';
  if ($NonElementCount) {
    $IgnoreReason = 'ContainsNonElementalData';
  }
  elsif (!$ElementCount) {
    $IgnoreReason = 'ContainsNoElementalData';
  }

  return 0 unless $IgnoreReason;

  ProcessIgnoredCompound($IgnoreReason, $CmpdCount, $Molecule);
  return 1;
}

# Write out compounds physicochemical properties calculation summary statistics...
#
# Parameters:
#   $CmpdCount              - total number of compound records read.
#   $IgnoredCmpdCount       - number of compounds filtered out or failed.
#   $RuleOf5ViolationsCount - compounds with one or more RuleOf5 violations.
#   $RuleOf3ViolationsCount - compounds with one or more RuleOf3 violations.
#
# Rule violation counts are printed only when the corresponding calculation is enabled.
#
sub WriteCalculationSummaryStatistics {
  my($CmpdCount, $IgnoredCmpdCount, $RuleOf5ViolationsCount, $RuleOf3ViolationsCount) = @_;
  my $ProcessedCmpdCount = $CmpdCount - $IgnoredCmpdCount;

  print "\nNumber of compounds: $CmpdCount\n";
  print "Number of compounds processed successfully during physicochemical properties calculation: $ProcessedCmpdCount\n";
  print "Number of compounds ignored during physicochemical properties calculation: $IgnoredCmpdCount\n";

  print "Number of compounds with one or more RuleOf5 violations: $RuleOf5ViolationsCount\n" if $OptionsInfo{RuleOf5Violations};
  print "Number of compounds with one or more RuleOf3 violations: $RuleOf3ViolationsCount\n" if $OptionsInfo{RuleOf3Violations};
}

# Open output files...
#
# Parameters:
#   $FileIndex - index of the input SD file; used to look up output file names
#                in %SDFilesInfo.
#
# Returns a two element list: (SD output file handle, text output file handle).
# A handle is an empty string when that type of output is not requested. The
# column labels line is written to the text file before it is returned.
# Dies when an output file can't be opened.
#
sub SetupAndOpenOutputFiles {
  my($FileIndex) = @_;
  my($NewSDFile, $NewTextFile, $NewSDFileRef, $NewTextFileRef);

  $NewSDFileRef = '';
  $NewTextFileRef = '';

  # Three-argument open with lexical handles: file names are used exactly as
  # given (no mode characters or surrounding whitespace get interpreted) and
  # no global bareword handles are shared across calls...
  if ($OptionsInfo{SDOutput}) {
    $NewSDFile = $SDFilesInfo{SDOutFileNames}[$FileIndex];
    print "Generating SD file $NewSDFile...\n";
    open my $SDFileHandle, '>', $NewSDFile or die "Error: Couldn't open $NewSDFile: $! \n";
    $NewSDFileRef = $SDFileHandle;
  }
  if ($OptionsInfo{TextOutput}) {
    $NewTextFile = $SDFilesInfo{TextOutFileNames}[$FileIndex];
    print "Generating text file $NewTextFile...\n";
    open my $TextFileHandle, '>', $NewTextFile or die "Error: Couldn't open $NewTextFile: $! \n";
    WriteTextFileCoulmnLabels($FileIndex, $TextFileHandle);
    $NewTextFileRef = $TextFileHandle;
  }
  return ($NewSDFileRef, $NewTextFileRef);
}

# Write calculated physicochemical properties and other data to appropriate output files...
#
# Parameters:
#   $FileIndex                        - index of the input SD file.
#   $CmpdCount                        - record number of the compound.
#   $Molecule                         - molecule object for the compound.
#   $PhysicochemicalPropertiesDataRef - hash reference mapping property names
#                                       (and rule violation names) to values.
#   $NewSDFileRef, $NewTextFileRef    - output file handles.
#
sub WriteDataToOutputFiles {
  my($FileIndex, $CmpdCount, $Molecule, $PhysicochemicalPropertiesDataRef, $NewSDFileRef, $NewTextFileRef) = @_;

  # Names of calculated values in output order: specified properties followed
  # by any requested RuleOf5 and RuleOf3 violation counts...
  my @CalculatedValueNames = @{$OptionsInfo{SpecifiedPropertyNames}};
  push @CalculatedValueNames, 'RuleOf5Violations' if $OptionsInfo{RuleOf5Violations};
  push @CalculatedValueNames, 'RuleOf3Violations' if $OptionsInfo{RuleOf3Violations};

  if ($OptionsInfo{SDOutput}) {
    # Retrieve input compound string used to create molecule and write it out
    # without last line containing a delimiter...
    my $CmpdString = $Molecule->GetInputMoleculeString();
    $CmpdString =~ s/\$\$\$\$$//;
    print $NewSDFileRef "$CmpdString";

    # Write out each calculated value as a SD data field...
    for my $ValueName (@CalculatedValueNames) {
      my $Value = $PhysicochemicalPropertiesDataRef->{$ValueName};
      print $NewSDFileRef "> <$ValueName>\n$Value\n\n";
    }

    # Write out delimiter...
    print $NewSDFileRef "\$\$\$\$\n";
  }

  if ($OptionsInfo{TextOutput}) {
    my $DataFieldLabelAndValuesRef = $Molecule->GetDataFieldLabelAndValues();

    # Map data field labels to their values for this compound; a missing
    # data field is written out as an empty string...
    my $DataFieldValuesFor = sub {
      my($LabelsRef) = @_;
      return map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : '' } @{$LabelsRef};
    };

    my $DataFieldsMode = $OptionsInfo{DataFieldsMode};
    my @LineWords = ();
    if ($DataFieldsMode =~ /^CompoundID$/i) {
      push @LineWords, SetupCmpdIDForTextFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
    }
    elsif ($DataFieldsMode =~ /^All$/i) {
      @LineWords = $DataFieldValuesFor->($SDFilesInfo{AllDataFieldsRef}[$FileIndex]);
    }
    elsif ($DataFieldsMode =~ /^Common$/i) {
      @LineWords = $DataFieldValuesFor->($SDFilesInfo{CommonDataFieldsRef}[$FileIndex]);
    }
    elsif ($DataFieldsMode =~ /^Specify$/i) {
      @LineWords = $DataFieldValuesFor->($OptionsInfo{SpecifiedDataFields});
    }

    # Append calculated values in the same order as the column labels...
    push @LineWords, map { $PhysicochemicalPropertiesDataRef->{$_} } @CalculatedValueNames;

    my $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileRef "$Line\n";
  }
}

# Write out appropriate column labels to text file...
sub WriteTextFileCoulmnLabels {
  my($FileIndex, $NewTextFileRef) = @_;

  # Data field labels come first; which ones depends on the data fields mode...
  my $DataFieldsMode = $OptionsInfo{DataFieldsMode};
  my @ColumnLabels = ();
  if ($DataFieldsMode =~ /^All$/i) {
    push @ColumnLabels, @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
  }
  elsif ($DataFieldsMode =~ /^Common$/i) {
    push @ColumnLabels, @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
  }
  elsif ($DataFieldsMode =~ /^Specify$/i) {
    push @ColumnLabels, @{$OptionsInfo{SpecifiedDataFields}};
  }
  elsif ($DataFieldsMode =~ /^CompoundID$/i) {
    push @ColumnLabels, $OptionsInfo{CompoundIDLabel};
  }

  # Physicochemical properties column labels followed by any rule violation labels...
  push @ColumnLabels, @{$OptionsInfo{SpecifiedPropertyNames}};
  push @ColumnLabels, 'RuleOf5Violations' if $OptionsInfo{RuleOf5Violations};
  push @ColumnLabels, 'RuleOf3Violations' if $OptionsInfo{RuleOf3Violations};

  my $Line = JoinWords(\@ColumnLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print $NewTextFileRef "$Line\n";
}

# Generate compound ID for text files..
#
# Parameters:
#   $CmpdCount                  - record number of the compound; used as a
#                                 suffix for label prefix based IDs.
#   $Molecule                   - molecule object for the compound.
#   $DataFieldLabelAndValuesRef - hash reference of SD data field labels and values.
#
# Returns the compound ID for the compound ID mode in use; an empty string is
# returned for an unrecognized mode or a missing data field.
#
sub SetupCmpdIDForTextFiles {
  my($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;
  my $CompoundIDMode = $OptionsInfo{CompoundIDMode};

  if ($CompoundIDMode =~ /^MolNameOrLabelPrefix$/i) {
    # Fall back to label prefix and record number for an empty molecule name...
    my $MolName = $Molecule->GetName();
    return $MolName ? $MolName : "$OptionsInfo{CompoundID}${CmpdCount}";
  }

  if ($CompoundIDMode =~ /^LabelPrefix$/i) {
    return "$OptionsInfo{CompoundID}${CmpdCount}";
  }

  if ($CompoundIDMode =~ /^DataField$/i) {
    my $SpecifiedDataField = $OptionsInfo{CompoundID};
    return exists $DataFieldLabelAndValuesRef->{$SpecifiedDataField} ? $DataFieldLabelAndValuesRef->{$SpecifiedDataField} : '';
  }

  if ($CompoundIDMode =~ /^MolName$/i) {
    my $MolName = $Molecule->GetName();
    return $MolName;
  }

  return '';
}

# Calculate physicochemical properties for molecule...
#
# Parameters:
#   $MolecularDescriptorsGenerator - descriptors generator object set up for
#                                    the specified property names.
#   $Molecule                      - molecule object; it is modified when
#                                    keeping the largest component or adding
#                                    hydrogens is enabled.
#
# Returns a reference to a hash of calculated property names and values,
# including any requested rule violation counts; returns undef when ring
# detection or descriptors generation doesn't succeed.
#
sub CalculateMoleculeProperties {
  my($MolecularDescriptorsGenerator, $Molecule) = @_;

  $Molecule->KeepLargestComponent() if $OptionsInfo{KeepLargestComponent};

  return undef unless $Molecule->DetectRings();

  $Molecule->SetAromaticityModel($OptionsInfo{AromaticityModel});
  $Molecule->DetectAromaticity();

  $Molecule->AddHydrogens() if $OptionsInfo{AddHydrogens};

  # Calculate physicochemical properties...
  $MolecularDescriptorsGenerator->SetMolecule($Molecule);
  $MolecularDescriptorsGenerator->GenerateDescriptors();

  return undef unless $MolecularDescriptorsGenerator->IsDescriptorsGenerationSuccessful();

  my %CalculatedPhysicochemicalProperties = $MolecularDescriptorsGenerator->GetDescriptorNamesAndValues();

  # Count RuleOf3 and RuleOf5 violations; each count is stored in the properties
  # hash under the name of the rule violation type...
  for my $RuleViolationsType ('RuleOf3Violations', 'RuleOf5Violations') {
    if ($OptionsInfo{$RuleViolationsType}) {
      CalculateRuleViolationsCount($RuleViolationsType, \%CalculatedPhysicochemicalProperties);
    }
  }

  return \%CalculatedPhysicochemicalProperties;
}

# Setup molecular descriptor generator to calculate property values for specified
# property names...
#
# Returns a descriptors generator object created in 'Specify' mode for the
# specified property names. Descriptor class parameters are set only for
# those descriptor classes which calculate at least one specified property.
#
sub SetupMolecularDescriptorsGenerator {
  my $Generator = MolecularDescriptors::MolecularDescriptorsGenerator->new('Mode' => 'Specify', 'DescriptorNames' => \@{$OptionsInfo{SpecifiedPropertyNames}});

  # Each entry: property names triggering the setup, descriptor class name and
  # a sub returning the class parameters. Parameters are retrieved lazily, only
  # for the descriptor classes which are actually needed...
  my @DescriptorClassSetups = (
    [['MolecularWeight', 'ExactMass'], 'WeightAndMassDescriptors', sub { return %{$OptionsInfo{PrecisionParametersMap}}; }],
    [['RotatableBonds'], 'RotatableBondsDescriptors', sub { return %{$OptionsInfo{RotatableBondsParametersMap}}; }],
    [['HydrogenBondDonors', 'HydrogenBondAcceptors'], 'HydrogenBondsDescriptors', sub { return ('HydrogenBondsType' => $OptionsInfo{HydrogenBonds}); }],
    [['TPSA'], 'TPSADescriptors', sub { return %{$OptionsInfo{TPSAParametersMap}}; }],
    [['MolecularComplexity'], 'MolecularComplexityDescriptors', sub { return %{$OptionsInfo{MolecularComplexityParametersMap}}; }],
  );

  # Setup molecular descriptor calculation parameters...
  for my $SetupRef (@DescriptorClassSetups) {
    my($TriggerNamesRef, $DescriptorClassName, $ParametersSub) = @{$SetupRef};

    next unless grep { exists($OptionsInfo{SpecifiedPropertyNamesMap}{lc($_)}) } @{$TriggerNamesRef};

    $Generator->SetDescriptorClassParameters('DescriptorClassName' => $DescriptorClassName, $ParametersSub->());
  }

  return $Generator;
}

# Calculate RuleOf3 or RuleOf5 violations count...
#
# Parameters:
#   $RuleViolationsType         - RuleOf3Violations or RuleOf5Violations.
#   $CalculatedPropertiesMapRef - hash reference of calculated property names
#                                 and values.
#
# Counts the rule properties whose calculated value is greater than the maximum
# allowed value and stores the count in the properties hash using the rule
# violations type as the key. Dies for an unknown rule violations type.
#
sub CalculateRuleViolationsCount {
  my($RuleViolationsType, $CalculatedPropertiesMapRef) = @_;
  my($PropertyNamesRef, $MaxPropertyValuesMapRef);

  if ($RuleViolationsType =~ /^RuleOf3Violations$/i) {
    $PropertyNamesRef = $OptionsInfo{RuleOf3PropertyNames};
    $MaxPropertyValuesMapRef = $OptionsInfo{RuleOf3MaxPropertyValuesMap};
  }
  elsif ($RuleViolationsType =~ /^RuleOf5Violations$/i) {
    $PropertyNamesRef = $OptionsInfo{RuleOf5PropertyNames};
    $MaxPropertyValuesMapRef = $OptionsInfo{RuleOf5MaxPropertyValuesMap};
  }
  else {
    die "Warning: Unknown rule violation type: $RuleViolationsType...";
  }

  my $RuleViolationsCount = grep { $CalculatedPropertiesMapRef->{$_} > $MaxPropertyValuesMapRef->{$_} } @{$PropertyNamesRef};

  # Set rule violation count...
  $CalculatedPropertiesMapRef->{$RuleViolationsType} = $RuleViolationsCount;
}

# Retrieve information about SD files...
#
# Fills in %SDFilesInfo with one entry per input SD file in @SDFilesList:
# FileOkay flag, output file root, SD and text output file names, and
# references to all and common data field labels (collected only for text
# output in All or Common data fields mode). A file which fails any check is
# reported using a warning and left with FileOkay set to 0.
#
sub RetrieveSDFilesInfo {
  my($SDFile, $Index, $FileDir, $FileExt, $FileName, $OutFileRoot, $TextOutFileExt, $SDOutFileExt, $NewSDFileName, $NewTextFileName, $CheckDataField, $CollectDataFields, $AllDataFieldsRef, $CommonDataFieldsRef);

  %SDFilesInfo = ();
  @{$SDFilesInfo{FileOkay}} = ();
  @{$SDFilesInfo{OutFileRoot}} = ();
  @{$SDFilesInfo{SDOutFileNames}} = ();
  @{$SDFilesInfo{TextOutFileNames}} = ();
  @{$SDFilesInfo{AllDataFieldsRef}} = ();
  @{$SDFilesInfo{CommonDataFieldsRef}} = ();

  $CheckDataField = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) && ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i)) ? 1 : 0;
  $CollectDataFields = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^(All|Common)$/i)) ? 1 : 0;

  FILELIST: for $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];

    $SDFilesInfo{FileOkay}[$Index] = 0;
    $SDFilesInfo{OutFileRoot}[$Index] = '';
    $SDFilesInfo{SDOutFileNames}[$Index] = '';
    $SDFilesInfo{TextOutFileNames}[$Index] = '';

    if (!(-e $SDFile)) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    if ($CheckDataField) {
      # Make sure data field exists in SD file; only the first compound record is checked...
      my($CmpdString, $SpecifiedDataField, @CmpdLines, %DataFieldValues);

      @CmpdLines = ();
      open my $SDFileHandle, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
      $CmpdString = ReadCmpdString($SDFileHandle);
      close $SDFileHandle;
      @CmpdLines = split "\n", $CmpdString;
      %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      $SpecifiedDataField = $OptionsInfo{CompoundID};
      if (!exists $DataFieldValues{$SpecifiedDataField}) {
        warn "Warning: Ignoring file $SDFile: Data field value, $SpecifiedDataField, using \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\" doesn't exist\n";
        next FILELIST;
      }
    }

    $AllDataFieldsRef = '';
    $CommonDataFieldsRef = '';
    if ($CollectDataFields) {
      my($CmpdCount);
      open my $SDFileHandle, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
      ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = GetAllAndCommonCmpdDataHeaderLabels($SDFileHandle);
      close $SDFileHandle;
    }

    # Setup output file names...
    $FileDir = ""; $FileName = ""; $FileExt = "";
    ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);

    $TextOutFileExt = "csv";
    if ($Options{outdelim} =~ /^tab$/i) {
      $TextOutFileExt = "tsv";
    }
    $SDOutFileExt = $FileExt;

    # A specified output file root is used only for a single input file...
    if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
      if ($RootFileName && $RootFileExt) {
        $FileName = $RootFileName;
      }
      else {
        $FileName = $OptionsInfo{OutFileRoot};
      }
      $OutFileRoot = $FileName;
    }
    else {
      $OutFileRoot = "${FileName}PhysicochemicalProperties";
    }

    $NewSDFileName = "${OutFileRoot}.${SDOutFileExt}";
    $NewTextFileName = "${OutFileRoot}.${TextOutFileExt}";

    if ($OptionsInfo{SDOutput}) {
      # Compare the output name against the input name or its trailing path
      # components. The name is quoted so that characters such as '.' match
      # literally, and anchored so that a name which is merely a substring of
      # the input file name (ample.sdf vs Sample.sdf) isn't rejected...
      if ($SDFile =~ /(?:^|[\/\\])\Q$NewSDFileName\E\z/i) {
        warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
        print "Specify a different name using \"-r --root\" option or use default name.\n";
        next FILELIST;
      }
    }

    if (!$OptionsInfo{OverwriteFiles}) {
      # Check SD and text output files...
      if ($OptionsInfo{SDOutput}) {
        if (-e $NewSDFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewSDFileName already exists\n";
          next FILELIST;
        }
      }
      if ($OptionsInfo{TextOutput}) {
        if (-e $NewTextFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewTextFileName already exists\n";
          next FILELIST;
        }
      }
    }

    $SDFilesInfo{FileOkay}[$Index] = 1;

    $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
    $SDFilesInfo{SDOutFileNames}[$Index] = $NewSDFileName;
    $SDFilesInfo{TextOutFileNames}[$Index] = $NewTextFileName;

    $SDFilesInfo{AllDataFieldsRef}[$Index] = $AllDataFieldsRef;
    $SDFilesInfo{CommonDataFieldsRef}[$Index] = $CommonDataFieldsRef;
  }
}

# Process option values...
628 sub ProcessOptions { 629 %OptionsInfo = (); 630 631 $OptionsInfo{AromaticityModel} = $Options{aromaticitymodel}; 632 633 # Process property name related options... 634 ProcessPropertyNamesOption(); 635 636 # Setup RuleOf3 and RuleOf5 violation calculations... 637 $OptionsInfo{RuleOf3Violations} = ($Options{ruleof3violations} =~ /^Yes$/i) ? 1 : 0; 638 $OptionsInfo{RuleOf5Violations} = ($Options{ruleof5violations} =~ /^Yes$/i) ? 1 : 0; 639 640 $OptionsInfo{CompoundIDMode} = $Options{compoundidmode}; 641 $OptionsInfo{CompoundIDLabel} = $Options{compoundidlabel}; 642 $OptionsInfo{DataFieldsMode} = $Options{datafieldsmode}; 643 644 my(@SpecifiedDataFields); 645 @SpecifiedDataFields = (); 646 647 @{$OptionsInfo{SpecifiedDataFields}} = (); 648 $OptionsInfo{CompoundID} = ''; 649 650 if ($Options{datafieldsmode} =~ /^CompoundID$/i) { 651 if ($Options{compoundidmode} =~ /^DataField$/i) { 652 if (!$Options{compoundid}) { 653 die "Error: You must specify a value for \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\". \n"; 654 } 655 $OptionsInfo{CompoundID} = $Options{compoundid}; 656 } 657 elsif ($Options{compoundidmode} =~ /^(LabelPrefix|MolNameOrLabelPrefix)$/i) { 658 $OptionsInfo{CompoundID} = $Options{compoundid} ? $Options{compoundid} : 'Cmpd'; 659 } 660 } 661 elsif ($Options{datafieldsmode} =~ /^Specify$/i) { 662 if (!$Options{datafields}) { 663 die "Error: You must specify a value for \"--DataFields\" option in \"Specify\" \"-d, --DataFieldsMode\". \n"; 664 } 665 @SpecifiedDataFields = split /\,/, $Options{datafields}; 666 push @{$OptionsInfo{SpecifiedDataFields}}, @SpecifiedDataFields; 667 } 668 669 # Types of hydrogen bonds... 670 $OptionsInfo{HydrogenBonds} = $Options{hydrogenbonds}; 671 672 # Process precision value parameters... 673 ProcessPrecisionOption(); 674 675 # Process rotatable bonds parameters... 676 ProcessRotatableBondsOption(); 677 678 # Process TPSA parameters... 
679 ProcessTPSAOption(); 680 681 # Process molecular complexity parameters... 682 ProcessMolecularComplexityOption(); 683 684 $OptionsInfo{Filter} = ($Options{filter} =~ /^Yes$/i) ? 1 : 0; 685 686 $OptionsInfo{KeepLargestComponent} = ($Options{keeplargestcomponent} =~ /^Yes$/i) ? 1 : 0; 687 688 $OptionsInfo{Output} = $Options{output}; 689 $OptionsInfo{SDOutput} = ($Options{output} =~ /^(SD|Both)$/i) ? 1 : 0; 690 $OptionsInfo{TextOutput} = ($Options{output} =~ /^(Text|Both)$/i) ? 1 : 0; 691 692 $OptionsInfo{OutDelim} = ($Options{outdelim} =~ /tab/i ) ? "\t" : (($Options{outdelim} =~ /semicolon/i) ? "\;" : "\,"); 693 $OptionsInfo{OutQuote} = ($Options{quote} =~ /^Yes$/i) ? 1 : 0; 694 695 $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0; 696 $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0; 697 } 698 699 # Process property name related options... 700 # 701 sub ProcessPropertyNamesOption { 702 703 # Setup supported physicochemical properties... 704 my($SupportedProperty); 705 706 @{$OptionsInfo{SupportedPropertyNames}} = (); 707 %{$OptionsInfo{SupportedPropertyNamesMap}} = (); 708 709 @{$OptionsInfo{RuleOf5PropertyNames}} = (); 710 %{$OptionsInfo{RuleOf5MaxPropertyValuesMap}} = (); 711 712 @{$OptionsInfo{RuleOf3PropertyNames}} = (); 713 %{$OptionsInfo{RuleOf3MaxPropertyValuesMap}} = (); 714 715 @{$OptionsInfo{DefaultPropertyNames}} = (); 716 717 @{$OptionsInfo{SupportedPropertyNames}} = qw(MolecularWeight ExactMass HeavyAtoms Rings AromaticRings MolecularVolume RotatableBonds HydrogenBondDonors HydrogenBondAcceptors SLogP SMR TPSA Fsp3Carbons Sp3Carbons MolecularComplexity); 718 719 @{$OptionsInfo{RuleOf5PropertyNames}} = qw(MolecularWeight HydrogenBondDonors HydrogenBondAcceptors SLogP); 720 %{$OptionsInfo{RuleOf5MaxPropertyValuesMap}} = ('MolecularWeight' => 500, 'HydrogenBondDonors' => 5, 'HydrogenBondAcceptors' => 10, 'SLogP' => 5); 721 722 @{$OptionsInfo{RuleOf3PropertyNames}} = qw(MolecularWeight RotatableBonds HydrogenBondDonors 
HydrogenBondAcceptors SLogP TPSA); 723 %{$OptionsInfo{RuleOf3MaxPropertyValuesMap}} = ('MolecularWeight' => 300, 'RotatableBonds' => 3, 'HydrogenBondDonors' => 3, 'HydrogenBondAcceptors' => 3, 'SLogP' => 3, 'TPSA' => 60); 724 725 @{$OptionsInfo{DefaultPropertyNames}} = qw(MolecularWeight HeavyAtoms MolecularVolume RotatableBonds HydrogenBondDonors HydrogenBondAcceptors SLogP TPSA); 726 727 for $SupportedProperty (@{$OptionsInfo{SupportedPropertyNames}}) { 728 $OptionsInfo{SupportedPropertyNamesMap}{lc($SupportedProperty)} = $SupportedProperty; 729 } 730 731 # Process specified properties.... 732 my($SpecifiedPropertyName, @SpecifiedPropertyNames, %SpecifiedPropertyNamesMap); 733 734 @SpecifiedPropertyNames = (); 735 %SpecifiedPropertyNamesMap = (); 736 737 @{$OptionsInfo{SpecifiedPropertyNames}} = (); 738 %{$OptionsInfo{SpecifiedPropertyNamesMap}} = (); 739 740 if ($Options{mode} =~ /^All$/i) { 741 @SpecifiedPropertyNames = @{$OptionsInfo{SupportedPropertyNames}}; 742 } 743 elsif ($Options{mode} =~ /^RuleOf5$/i) { 744 @SpecifiedPropertyNames = @{$OptionsInfo{RuleOf5PropertyNames}}; 745 } 746 elsif ($Options{mode} =~ /^RuleOf3$/i) { 747 @SpecifiedPropertyNames = @{$OptionsInfo{RuleOf3PropertyNames}}; 748 } 749 elsif (IsEmpty($Options{mode})) { 750 @SpecifiedPropertyNames = @{$OptionsInfo{DefaultPropertyNames}}; 751 } 752 else { 753 # Comma delimited lisr of specified property names... 
754 my($Mode, $PropertyName, @PropertyNames, @UnsupportedPropertyNames); 755 756 $Mode = $Options{mode}; 757 $Mode =~ s/ //g; 758 759 @PropertyNames = split ",", $Mode; 760 @UnsupportedPropertyNames = (); 761 762 for $PropertyName (@PropertyNames) { 763 if (exists($OptionsInfo{SupportedPropertyNamesMap}{lc($PropertyName)})) { 764 push @SpecifiedPropertyNames, $PropertyName; 765 } 766 else { 767 push @UnsupportedPropertyNames, $PropertyName; 768 } 769 } 770 if (@UnsupportedPropertyNames) { 771 if (@UnsupportedPropertyNames > 1) { 772 warn "Error: The physicochemical property names specified - ", JoinWords(\@UnsupportedPropertyNames, ", ", 0)," - for option \"-m --mode\" are not valid.\n"; 773 } 774 else { 775 warn "Error: The physicochemical property name specified, @UnsupportedPropertyNames , for option \"-m --mode\" is not valid.\n"; 776 } 777 die "Allowed values:", JoinWords(\@{$OptionsInfo{SupportedPropertyNames}}, ", ", 0), "\n"; 778 } 779 if (!@SpecifiedPropertyNames) { 780 die "Error: No valid physicochemical property names specified for option \"-m --mode\".\n"; 781 } 782 } 783 784 # Set up specified property names map... 785 PROPERTY: for $SpecifiedPropertyName (@SpecifiedPropertyNames) { 786 if (exists $SpecifiedPropertyNamesMap{lc($SpecifiedPropertyName)}) { 787 warn "Warning: The physicochemical property name, $SpecifiedPropertyName, is specified multiple times as value of option \"-m --mode\" .\n"; 788 next PROPERTY; 789 } 790 # Canonical specified property name... 791 $SpecifiedPropertyNamesMap{lc($SpecifiedPropertyName)} = $OptionsInfo{SupportedPropertyNamesMap}{lc($SpecifiedPropertyName)}; 792 } 793 794 # Make sure for calculation of RuleOf3Violations, all appropriate property names are specified... 795 if ($Options{ruleof3violations} =~ /^Yes$/i && $Options{mode} =~ /^RuleOf5$/i) { 796 die "Error: The value specified, $Options{ruleof3violations}, for \"--RuleOf3Violations\" option in \"RuleOf5\" \"-m --Mode\" is not valid. 
You must specify RuleOf3 value for \"-m --Mode\" to calculate RuleOf3 violations.\n"; 797 } 798 799 if ($Options{ruleof3violations} =~ /^Yes$/i) { 800 my($RuleOf3PropertyName, @MissingRuleOf3Names); 801 802 @MissingRuleOf3Names = (); 803 PROPERTY: for $RuleOf3PropertyName (@{$OptionsInfo{RuleOf3PropertyNames}}) { 804 if (exists $SpecifiedPropertyNamesMap{lc($RuleOf3PropertyName)}) { 805 next PROPERTY; 806 } 807 push @MissingRuleOf3Names, $RuleOf3PropertyName; 808 809 # Add property name to specified properties names list and map... 810 push @SpecifiedPropertyNames, $RuleOf3PropertyName; 811 $SpecifiedPropertyNamesMap{lc($RuleOf3PropertyName)} = $OptionsInfo{SupportedPropertyNamesMap}{lc($RuleOf3PropertyName)}; 812 } 813 if (@MissingRuleOf3Names) { 814 warn "Warning: The following physicochemical property names not specified in \"-m --Mode\" option are required for calculating RuleOf3Violations and have been added to the list of property names: @MissingRuleOf3Names\n"; 815 } 816 } 817 818 # Make sure for calculation of RuleOf5Violations, all appropriate property names are specified... 819 if ($Options{ruleof5violations} =~ /^Yes$/i && $Options{mode} =~ /^RuleOf3$/i) { 820 die "Error: The value specified, $Options{ruleof5violations}, for \"--RuleOf5Violations\" option in \"RuleOf3\" \"-m --Mode\" is not valid. You must specify RuleOf5 value for \"-m --Mode\" to calculate RuleOf5 violations.\n"; 821 } 822 823 if ($Options{ruleof5violations} =~ /^Yes$/i) { 824 my($RuleOf5PropertyName, @MissingRuleOf5Names); 825 826 @MissingRuleOf5Names = (); 827 PROPERTY: for $RuleOf5PropertyName (@{$OptionsInfo{RuleOf5PropertyNames}}) { 828 if (exists $SpecifiedPropertyNamesMap{lc($RuleOf5PropertyName)}) { 829 next PROPERTY; 830 } 831 push @MissingRuleOf5Names, $RuleOf5PropertyName; 832 833 # Add property name to specified properties names list and map... 
push @SpecifiedPropertyNames, $RuleOf5PropertyName;
      $SpecifiedPropertyNamesMap{lc($RuleOf5PropertyName)} = $OptionsInfo{SupportedPropertyNamesMap}{lc($RuleOf5PropertyName)};
    }
    if (@MissingRuleOf5Names) {
      warn "Warning: The following physicochemical property names not specified in \"-m --Mode\" option are required for calculating RuleOf5Violations and have been added to the list of property names: @MissingRuleOf5Names\n";
    }
  }
  $OptionsInfo{Mode} = $Options{mode};

  # Setup canonical specified property names corresponding to supported names in mixed case...
  my(@SpecifiedCanonicalPropertyNames);

  @SpecifiedCanonicalPropertyNames = ();
  for $SpecifiedPropertyName (@SpecifiedPropertyNames) {
    push @SpecifiedCanonicalPropertyNames, $SpecifiedPropertyNamesMap{lc($SpecifiedPropertyName)};
  }
  @{$OptionsInfo{SpecifiedPropertyNames}} = @SpecifiedCanonicalPropertyNames;
  %{$OptionsInfo{SpecifiedPropertyNamesMap}} = %SpecifiedPropertyNamesMap;

  # Based on specified property names, figure out whether hydrogens need to be added before
  # calculation of properties...
  #
  $OptionsInfo{AddHydrogens} = 0;
  if (exists($SpecifiedPropertyNamesMap{lc('MolecularVolume')}) || exists($SpecifiedPropertyNamesMap{lc('SLogP')}) || exists($SpecifiedPropertyNamesMap{lc('SMR')})) {
    $OptionsInfo{AddHydrogens} = 1;
  }
}

# Process precision option...
#
# The value of "--Precision" is a comma delimited list of property name and
# precision pairs. The property names MolecularWeight and ExactMass (any case)
# set WeightPrecision and MassPrecision, which default to 2 and 4. The result
# is stored in $OptionsInfo{PrecisionParametersMap} and the raw option value
# in $OptionsInfo{Precision}.
#
# Dies on an odd number of values, an unknown property name, or a precision
# rejected by IsPositiveInteger.
#
sub ProcessPrecisionOption {
  my(%PrecisionByParameter, %ParameterNameByProperty);

  %{$OptionsInfo{PrecisionParametersMap}} = ();

  %PrecisionByParameter = ('WeightPrecision' => 2, 'MassPrecision' => 4);
  %ParameterNameByProperty = ('molecularweight' => 'WeightPrecision', 'exactmass' => 'MassPrecision');

  if ($Options{precision}) {
    my($Name, $Value, $Specification, @NameValuePairs);

    # Spaces carry no meaning anywhere in the specification...
    ($Specification = $Options{precision}) =~ s/ //g;
    @NameValuePairs = split ",", $Specification;
    if (@NameValuePairs % 2) {
      die "Error: Invalid number of values specified using \"--Precision\" option: It must contain even number of values.\n";
    }

    # Consume the list one name and value pair at a time...
    while (@NameValuePairs) {
      ($Name, $Value) = splice(@NameValuePairs, 0, 2);

      exists $ParameterNameByProperty{lc($Name)}
        or die "Error: The precision parameter name specified, $Name, for option \"--Precision\" is not valid.\n";
      IsPositiveInteger($Value)
        or die "Error: The parameter value specified, $Value, for parameter name, $Name in option \"--Precision\" is not valid. Allowed values: positive integer. \n";

      $PrecisionByParameter{$ParameterNameByProperty{lc($Name)}} = $Value;
    }
  }

  $OptionsInfo{Precision} = $Options{precision};
  %{$OptionsInfo{PrecisionParametersMap}} = %PrecisionByParameter;
}

# Process rotatable bonds option...
#
# The value of "--RotatableBonds" is a comma delimited list of parameter name
# and Yes/No pairs. Parameter names are matched without regard to case against
# the five Ignore* flags below, all of which default to 1. Yes and No are
# stored as 1 and 0 in $OptionsInfo{RotatableBondsParametersMap}; the raw
# option value is stored in $OptionsInfo{RotatableBonds}.
#
# Dies on an odd number of values, an unknown parameter name, or a value other
# than Yes or No.
#
sub ProcessRotatableBondsOption {
  my(%FlagByParameter, %ParameterNameByLowerCaseName);

  %{$OptionsInfo{RotatableBondsParametersMap}} = ();

  %FlagByParameter = ('IgnoreTerminalBonds' => 1, 'IgnoreBondsToTripleBonds' => 1, 'IgnoreAmideBonds' => 1, 'IgnoreThioamideBonds' => 1, 'IgnoreSulfonamideBonds' => 1);
  %ParameterNameByLowerCaseName = map { (lc($_) => $_) } keys %FlagByParameter;

  if ($Options{rotatablebonds}) {
    my($Name, $Value, $Specification, @NameValuePairs);

    ($Specification = $Options{rotatablebonds}) =~ s/ //g;
    @NameValuePairs = split ",", $Specification;
    if (@NameValuePairs % 2) {
      die "Error: Invalid number of values specified using \"--RotatableBonds\" option: It must contain even number of values.\n";
    }

    while (@NameValuePairs) {
      ($Name, $Value) = splice(@NameValuePairs, 0, 2);

      exists $ParameterNameByLowerCaseName{lc($Name)}
        or die "Error: The rotatable bonds parameter name specified, $Name, for option \"--RotatableBonds\" is not valid.\n";
      $Value =~ /^(Yes|No)$/i
        or die "Error: The parameter value specified, $Value, for parameter name, $Name in option \"--RotatableBonds\" is not valid. Allowed values: Yes or No. \n";

      $FlagByParameter{$ParameterNameByLowerCaseName{lc($Name)}} = ($Value =~ /^Yes$/i) ? 1 : 0;
    }
  }

  $OptionsInfo{RotatableBonds} = $Options{rotatablebonds};
  %{$OptionsInfo{RotatableBondsParametersMap}} = %FlagByParameter;
}

# Process TPSA option...
#
# The value of "--TPSA" is a comma delimited list of parameter name and Yes/No
# pairs. The supported parameters, IgnorePhosphorus and IgnoreSulfur, are
# matched without regard to case and both default to 1. Yes and No are stored
# as 1 and 0 in $OptionsInfo{TPSAParametersMap}; the raw option value is stored
# in $OptionsInfo{TPSA}.
#
# Dies on an odd number of values, an unknown parameter name, or a value other
# than Yes or No.
#
sub ProcessTPSAOption {
  my(%FlagByParameter, %ParameterNameByLowerCaseName);

  %{$OptionsInfo{TPSAParametersMap}} = ();

  %FlagByParameter = ('IgnorePhosphorus' => 1, 'IgnoreSulfur' => 1);
  %ParameterNameByLowerCaseName = map { (lc($_) => $_) } keys %FlagByParameter;

  if ($Options{tpsa}) {
    my($Name, $Value, $Specification, @NameValuePairs);

    ($Specification = $Options{tpsa}) =~ s/ //g;
    @NameValuePairs = split ",", $Specification;
    if (@NameValuePairs % 2) {
      die "Error: Invalid number of values specified using \"--TPSA\" option: It must contain even number of values.\n";
    }

    while (@NameValuePairs) {
      ($Name, $Value) = splice(@NameValuePairs, 0, 2);

      exists $ParameterNameByLowerCaseName{lc($Name)}
        or die "Error: The TPSA parameter name specified, $Name, for option \"--TPSA\" is not valid.\n";
      $Value =~ /^(Yes|No)$/i
        or die "Error: The parameter value specified, $Value, for parameter name, $Name in option \"--TPSA\" is not valid. Allowed values: Yes or No. \n";

      $FlagByParameter{$ParameterNameByLowerCaseName{lc($Name)}} = ($Value =~ /^Yes$/i) ? 1 : 0;
    }
  }

  $OptionsInfo{TPSA} = $Options{tpsa};
  %{$OptionsInfo{TPSAParametersMap}} = %FlagByParameter;
}

# Process molecular complexity parameters...
#
# The value of "--MolecularComplexity" is a comma delimited list of parameter
# name and value pairs. Parameter names are matched without regard to case.
# Values for AtomicInvariantsToUse and FunctionalClassesToUse are space
# delimited lists; Yes and No values are converted to 1 and 0.
#
# Specified values are validated and laid over the defaults below. Defaults
# that depend on MolecularComplexityType (AtomIdentifierType,
# AtomicInvariantsToUse, FunctionalClassesToUse, UseTriangleInequality) are
# resolved after parsing; MolecularComplexityType itself defaults to MACCSKeys.
#
# Results:
#   . $OptionsInfo{MolecularComplexity} holds the raw option value.
#   . $OptionsInfo{MolecularComplexityParametersMap} holds only the parameters
#     relevant to the selected MolecularComplexityType. AtomicInvariantsToUse
#     and FunctionalClassesToUse, when present, are array references.
#
# Dies with an error message on an odd number of values, an unknown parameter
# name, an invalid parameter value, or an invalid combination of values.
#
sub ProcessMolecularComplexityOption {
  my($MolecularComplexityType, $ParameterName, $ParameterValue, @ParameterNames, @ParameterValues, @AtomIdentifierTypeParameters, %ComplexityParametersMap, %ComplexityParameterNamesMap);

  %{$OptionsInfo{MolecularComplexityParametersMap}} = ();

  # Default values; empty ones are filled in once MolecularComplexityType is known...
  %ComplexityParametersMap = ('MolecularComplexityType' => '', 'AtomIdentifierType' => '',
                              'AtomicInvariantsToUse' => '', 'FunctionalClassesToUse' => '',
                              'MACCSKeysSize' => '166', 'NeighborhoodRadius' => '2',
                              'MinPathLength' => '1', 'MaxPathLength' => '8', 'UseBondSymbols' => '1',
                              'MinDistance' => '1', 'MaxDistance' => '10', 'UseTriangleInequality' => '',
                              'DistanceBinSize' => '2', 'NormalizationMethodology' => 'None');

  # Map lower case parameter names to their canonical mixed case names...
  %ComplexityParameterNamesMap = ();
  for $ParameterName (keys %ComplexityParametersMap) {
    $ComplexityParameterNamesMap{lc($ParameterName)} = $ParameterName;
  }

  if ($Options{molecularcomplexity}) {
    # Process specified values...
    my($Index, $SpecifiedComplexity, @SpecifiedComplexityValuePairs);

    $SpecifiedComplexity = $Options{molecularcomplexity};

    # Spaces are not stripped up front: they delimit the values of
    # AtomicInvariantsToUse and FunctionalClassesToUse...
    @SpecifiedComplexityValuePairs = split ",", $SpecifiedComplexity;
    if (@SpecifiedComplexityValuePairs % 2) {
      die "Error: Invalid number of values specified using \"--MolecularComplexity\" option: It must contain even number of values.\n";
    }

    for ($Index = 0; (($Index + 1) < @SpecifiedComplexityValuePairs); $Index += 2 ) {
      $ParameterName = $SpecifiedComplexityValuePairs[$Index];
      $ParameterValue = $SpecifiedComplexityValuePairs[$Index + 1];

      $ParameterName = RemoveLeadingAndTrailingWhiteSpaces($ParameterName);
      $ParameterValue = RemoveLeadingAndTrailingWhiteSpaces($ParameterValue);

      if (!exists $ComplexityParameterNamesMap{lc($ParameterName)}) {
        die "Error: The molecular complexity parameter name specified, $ParameterName, for option \"--MolecularComplexity\" is not valid.\n";
      }
      $ParameterName = $ComplexityParameterNamesMap{lc($ParameterName)};

      # Normalize the specified value...
      if ($ParameterName =~ /^AtomicInvariantsToUse$/i) {
        my($AtomSymbolFound);

        $AtomSymbolFound = 0;
        @ParameterValues = split(' ', $ParameterValue);
        for $ParameterValue (@ParameterValues) {
          if (!AtomTypes::AtomicInvariantsAtomTypes::IsAtomicInvariantAvailable($ParameterValue)) {
            die "Error: The atomic invariant specified, $ParameterValue, for AtomicInvariantsToUse in option \"--MolecularComplexity\" is not valid.\n";
          }
          if ($ParameterValue =~ /^(AS|AtomSymbol)$/i) {
            $AtomSymbolFound = 1;
          }
        }
        if (!$AtomSymbolFound) {
          die "Error: The atomic invariants specified using AtomicInvariantsToUse in option \"--MolecularComplexity\" is not valid: AtomicInvariant atom symbol, AS or AtomSymbol, must be specified.\n";
        }
        $ParameterValue = JoinWords(\@ParameterValues, ",", 0);
      }
      elsif ($ParameterName =~ /^FunctionalClassesToUse$/i) {
        @ParameterValues = split(' ', $ParameterValue);
        for $ParameterValue (@ParameterValues) {
          if (!AtomTypes::FunctionalClassAtomTypes::IsFunctionalClassAvailable($ParameterValue)) {
            die "Error: The functional class specified, $ParameterValue, for FunctionalClassesToUse in option \"--MolecularComplexity\" is not valid.\n";
          }
        }
        $ParameterValue = JoinWords(\@ParameterValues, ",", 0);
      }
      else {
        if ($ParameterValue =~ / /) {
          $ParameterValue =~ s/ //g;
        }
        if ($ParameterValue =~ /^(Yes|No)$/i) {
          $ParameterValue = ($ParameterValue =~ /^Yes$/i) ? 1 : 0;
        }
      }

      # Validate the specified value...
      if ($ParameterName =~ /^MolecularComplexityType$/i) {
        if ($ParameterValue !~ /^(AtomTypesFingerprints|ExtendedConnectivityFingerprints|MACCSKeys|PathLengthFingerprints|TopologicalAtomPairsFingerprints|TopologicalAtomTripletsFingerprints|TopologicalAtomTorsionsFingerprints|TopologicalPharmacophoreAtomPairsFingerprints|TopologicalPharmacophoreAtomTripletsFingerprints)$/i) {
          die "Error: The parameter value specified, $ParameterValue, for parameter name, $ParameterName in option \"--MolecularComplexity\" is not valid. Allowed values: AtomTypesFingerprints, ExtendedConnectivityFingerprints, MACCSKeys, PathLengthFingerprints, TopologicalAtomPairsFingerprints, TopologicalAtomTripletsFingerprints, TopologicalAtomTorsionsFingerprints, TopologicalPharmacophoreAtomPairsFingerprints, or TopologicalPharmacophoreAtomTripletsFingerprints..\n";
        }
      }
      elsif ($ParameterName =~ /^AtomIdentifierType$/i) {
        if ($ParameterValue !~ /^(AtomicInvariantsAtomTypes|FunctionalClassAtomTypes|DREIDINGAtomTypes|EStateAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i) {
          die "Error: The parameter value specified, $ParameterValue, for parameter name, $ParameterName in option \"--MolecularComplexity\" is not valid. Supported atom identifier types in current release of MayaChemTools: AtomicInvariantsAtomTypes, FunctionalClassAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes and UFFAtomTypes.\n";
        }
      }
      elsif ($ParameterName =~ /^(MACCSKeysSize|MinPathLength|MaxPathLength|MinDistance|MaxDistance|DistanceBinSize)$/i) {
        if (!IsPositiveInteger($ParameterValue)) {
          die "Error: The parameter value specified, $ParameterValue, for parameter name, $ParameterName in option \"--MolecularComplexity\" is not valid. Allowed values: positive integer. \n";
        }
      }
      elsif ($ParameterName =~ /^NeighborhoodRadius$/i) {
        if (!(IsInteger($ParameterValue) && $ParameterValue >=0)) {
          die "Error: The parameter value specified, $ParameterValue, for parameter name, $ParameterName in option \"--MolecularComplexity\" is not valid. Allowed values: 0 or positive integer. \n";
        }
      }
      elsif ($ParameterName =~ /^NormalizationMethodology$/i) {
        if ($ParameterValue !~ /^(None|ByHeavyAtomsCount|ByPossibleKeysCount)$/i) {
          die "Error: The parameter value specified, $ParameterValue, for parameter name, $ParameterName in option \"--MolecularComplexity\" is not valid. Allowed values: None, ByHeavyAtomsCount, or ByPossibleKeysCount\n";
        }
      }
      elsif ($ParameterName =~ /^(UseBondSymbols|UseTriangleInequality)$/i) {
        # Yes and No have already been converted to 1 and 0 above; anything else
        # would otherwise be stored as is and silently treated as a flag...
        if ($ParameterValue !~ /^(0|1)$/) {
          die "Error: The parameter value specified, $ParameterValue, for parameter name, $ParameterName in option \"--MolecularComplexity\" is not valid. Allowed values: Yes or No. \n";
        }
      }
      $ComplexityParametersMap{$ParameterName} = $ParameterValue;
    }

    # Checks involving final values, whether specified or default...
    if ($ComplexityParametersMap{MACCSKeysSize} !~ /^(166|322)$/i) {
      die "Error: The parameter value specified, $ComplexityParametersMap{MACCSKeysSize}, for parameter name, MACCSKeysSize in option \"--MolecularComplexity\" is not valid. Allowed values: 166 or 322\n";
    }
    if ($ComplexityParametersMap{MinPathLength} > $ComplexityParametersMap{MaxPathLength}) {
      die "Error: The parameter value specified for MinPathLength, $ComplexityParametersMap{MinPathLength}, must be <= MaxPathLength, $ComplexityParametersMap{MaxPathLength} ...\n";
    }
    if ($ComplexityParametersMap{MinDistance} > $ComplexityParametersMap{MaxDistance}) {
      die "Error: The parameter value specified for MinDistance, $ComplexityParametersMap{MinDistance}, must be <= MaxDistance, $ComplexityParametersMap{MaxDistance} ...\n";
    }
  }

  # Set default parameter values...

  if (IsEmpty($ComplexityParametersMap{MolecularComplexityType})) {
    $ComplexityParametersMap{MolecularComplexityType} = 'MACCSKeys';
  }
  $MolecularComplexityType = $ComplexityParametersMap{MolecularComplexityType};


  if (IsEmpty($ComplexityParametersMap{AtomIdentifierType})) {
    $ComplexityParametersMap{AtomIdentifierType} = ($MolecularComplexityType =~ /^(TopologicalPharmacophoreAtomPairsFingerprints|TopologicalPharmacophoreAtomTripletsFingerprints)$/i) ? "FunctionalClassAtomTypes" : "AtomicInvariantsAtomTypes";
  }

  if (IsEmpty($ComplexityParametersMap{AtomicInvariantsToUse})) {
    my($AtomicInvariantsToUse);

    if ($MolecularComplexityType =~ /^(AtomTypesFingerprints|TopologicalAtomPairsFingerprints|TopologicalAtomTripletsFingerprints|TopologicalAtomTorsionsFingerprints)$/i) {
      $AtomicInvariantsToUse = "AS,X,BO,H,FC";
    }
    elsif ($MolecularComplexityType =~ /^ExtendedConnectivityFingerprints$/i) {
      $AtomicInvariantsToUse = "AS,X,BO,H,FC,MN";
    }
    else {
      $AtomicInvariantsToUse = "AS";
    }
    $ComplexityParametersMap{AtomicInvariantsToUse} = $AtomicInvariantsToUse;
  }

  if (IsEmpty($ComplexityParametersMap{FunctionalClassesToUse})) {
    my($FunctionalClassesToUse);

    if ($MolecularComplexityType =~ /^TopologicalPharmacophoreAtomPairsFingerprints$/i) {
      $FunctionalClassesToUse = "HBD,HBA,PI,NI,H";
    }
    elsif ($MolecularComplexityType =~ /^TopologicalPharmacophoreAtomTripletsFingerprints$/i) {
      $FunctionalClassesToUse = "HBD,HBA,PI,NI,H,Ar";
    }
    else {
      $FunctionalClassesToUse = "HBD,HBA,PI,NI,H,Ar,Hal";
    }
    $ComplexityParametersMap{FunctionalClassesToUse} = $FunctionalClassesToUse;
  }

  # Replace the comma delimited lists by references to arrays of values...
  my(@AtomicInvariantsToUse);
  @AtomicInvariantsToUse = split ',', $ComplexityParametersMap{AtomicInvariantsToUse};
  $ComplexityParametersMap{AtomicInvariantsToUse} = \@AtomicInvariantsToUse;

  my(@FunctionalClassesToUse);
  @FunctionalClassesToUse = split ',', $ComplexityParametersMap{FunctionalClassesToUse};
  $ComplexityParametersMap{FunctionalClassesToUse} = \@FunctionalClassesToUse;

  if (IsEmpty($ComplexityParametersMap{UseTriangleInequality})) {
    $ComplexityParametersMap{UseTriangleInequality} = 0;
    if ($MolecularComplexityType =~ /^TopologicalPharmacophoreAtomTripletsFingerprints$/i) {
      $ComplexityParametersMap{UseTriangleInequality} = 1;
    }
  }

  if ($MolecularComplexityType =~ /^(TopologicalPharmacophoreAtomPairsFingerprints|TopologicalPharmacophoreAtomTripletsFingerprints)$/i) {
    if ($ComplexityParametersMap{AtomIdentifierType} !~ /^FunctionalClassAtomTypes$/i) {
      die "Error: The parameter value specified for AtomIdentifierType, $ComplexityParametersMap{AtomIdentifierType}, in option \"--MolecularComplexity\" is not valid for MolecularComplexityType, $MolecularComplexityType: Allowed value: FunctionalClassAtomTypes...\n";
    }
  }

  # Set up appropriate parameter names for specified molecular complexity...

  @ParameterNames = ();
  push @ParameterNames, 'MolecularComplexityType';

  @AtomIdentifierTypeParameters = ();
  push @AtomIdentifierTypeParameters, 'AtomIdentifierType';
  if ($ComplexityParametersMap{AtomIdentifierType} =~ /^AtomicInvariantsAtomTypes$/i) {
    push @AtomIdentifierTypeParameters, 'AtomicInvariantsToUse';
  }
  elsif ($ComplexityParametersMap{AtomIdentifierType} =~ /^FunctionalClassAtomTypes$/i) {
    push @AtomIdentifierTypeParameters, 'FunctionalClassesToUse';
  }

  COMPLEXITYTYPE: {
    if ($MolecularComplexityType =~ /^AtomTypesFingerprints$/i) {
      push @ParameterNames, @AtomIdentifierTypeParameters;
      last COMPLEXITYTYPE;
    }
    if ($MolecularComplexityType =~ /^ExtendedConnectivityFingerprints$/i) {
      push @ParameterNames, @AtomIdentifierTypeParameters;
      push @ParameterNames, ('NeighborhoodRadius', 'NormalizationMethodology');
      last COMPLEXITYTYPE;
    }
    if ($MolecularComplexityType =~ /^MACCSKeys$/i) {
      push @ParameterNames, 'MACCSKeysSize';
      last COMPLEXITYTYPE;
    }
    if ($MolecularComplexityType =~ /^PathLengthFingerprints$/i) {
      push @ParameterNames, @AtomIdentifierTypeParameters;
      push @ParameterNames, ('MinPathLength', 'MaxPathLength', 'UseBondSymbols');
      last COMPLEXITYTYPE;
    }
    if ($MolecularComplexityType =~ /^TopologicalAtomPairsFingerprints$/i) {
      push @ParameterNames, @AtomIdentifierTypeParameters;
      push @ParameterNames, ('MinDistance', 'MaxDistance');
      last COMPLEXITYTYPE;
    }
    if ($MolecularComplexityType =~ /^TopologicalAtomTripletsFingerprints$/i) {
      push @ParameterNames, @AtomIdentifierTypeParameters;
      push @ParameterNames, ('MinDistance', 'MaxDistance', 'UseTriangleInequality');
      last COMPLEXITYTYPE;
    }
    if ($MolecularComplexityType =~ /^TopologicalAtomTorsionsFingerprints$/i) {
      push @ParameterNames, @AtomIdentifierTypeParameters;
      last COMPLEXITYTYPE;
    }
    if ($MolecularComplexityType =~ /^TopologicalPharmacophoreAtomPairsFingerprints$/i) {
      push @ParameterNames, ('AtomIdentifierType', 'FunctionalClassesToUse', 'MinDistance', 'MaxDistance', 'NormalizationMethodology');
      last COMPLEXITYTYPE;
    }
    if ($MolecularComplexityType =~ /^TopologicalPharmacophoreAtomTripletsFingerprints$/i) {
      push @ParameterNames, ('AtomIdentifierType', 'FunctionalClassesToUse', 'MinDistance', 'MaxDistance', 'UseTriangleInequality', 'NormalizationMethodology', 'DistanceBinSize');
      last COMPLEXITYTYPE;
    }
    # Report the complexity type itself: $ParameterValue is undefined or holds
    # the value of the last parsed pair at this point...
    die "Error: The parameter value specified, $MolecularComplexityType, for parameter name MolecularComplexityType using \"--MolecularComplexity\" is not valid.\n";
  }

  $OptionsInfo{MolecularComplexity} = $Options{molecularcomplexity};

  # Keep only the parameters relevant to the selected complexity type...
  %{$OptionsInfo{MolecularComplexityParametersMap}} = ();
  for $ParameterName (@ParameterNames) {
    $ParameterValue = $ComplexityParametersMap{$ParameterName};
    $OptionsInfo{MolecularComplexityParametersMap}{$ParameterName} = $ParameterValue;
  }
}

# Setup script usage and retrieve command line arguments specified using various options...
#
# Resets %Options to the default option values, overlays the command line
# values retrieved by GetOptions, changes to the working directory when
# "-w --workingdir" is specified, and validates the options that take a fixed
# set of values. Dies with an error message on the first invalid value.
#
# The mode, molecularcomplexity, precision, rotatablebonds and tpsa options
# start out empty here; their default values are set later.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = (
    'aromaticitymodel' => 'MayaChemToolsAromaticityModel',

    'compoundidmode' => 'LabelPrefix',
    'compoundidlabel' => 'CompoundID',
    'datafieldsmode' => 'CompoundID',

    'filter' => 'Yes',

    'hydrogenbonds' => 'HBondsType2',

    'keeplargestcomponent' => 'Yes',

    # Default mode values are set later...
    'mode' => '',

    # Default molecular complexity values are set later...
    'molecularcomplexity' => '',

    # Default precision values are set later...
    'precision' => '',

    'output' => 'text',
    'outdelim' => 'comma',
    'quote' => 'yes',

    # Default rotatable bond parameter values are set later...
    'rotatablebonds' => '',

    'ruleof3violations' => 'No',
    'ruleof5violations' => 'No',

    # Default TPSA parameter values are set later...
    'tpsa' => '',
  );

  if (!GetOptions(\%Options, "aromaticitymodel=s", "compoundid=s", "compoundidlabel=s", "compoundidmode=s", "datafields=s", "datafieldsmode|d=s", "filter|f=s", "help|h", "hydrogenbonds=s", "keeplargestcomponent|k=s", "mode|m=s", "molecularcomplexity=s", "outdelim=s", "output=s", "overwrite|o", "precision=s", "rotatablebonds=s", "ruleof3violations=s", "ruleof5violations=s", "quote|q=s", "root|r=s", "tpsa=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }

  # Switch to the working directory before any further processing...
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  # Validate options taking a fixed set of values...
  if (!Molecule::IsSupportedAromaticityModel($Options{aromaticitymodel})) {
    my(@SupportedModels) = Molecule::GetSupportedAromaticityModels();
    die "Error: The value specified, $Options{aromaticitymodel}, for option \"--AromaticityModel\" is not valid. Supported aromaticity models in current release of MayaChemTools: @SupportedModels\n";
  }
  if ($Options{compoundidmode} !~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i) {
    die "Error: The value specified, $Options{compoundidmode}, for option \"--CompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n";
  }
  if ($Options{datafieldsmode} !~ /^(All|Common|Specify|CompoundID)$/i) {
    die "Error: The value specified, $Options{datafieldsmode}, for option \"-d, --DataFieldsMode\" is not valid. Allowed values: All, Common, Specify or CompoundID\n";
  }
  if ($Options{filter} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{filter}, for option \"-f, --Filter\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{hydrogenbonds} !~ /^(HBondsType1|HydrogenBondsType1|HBondsType2|HydrogenBondsType2)$/i) {
    die "Error: The value specified, $Options{hydrogenbonds}, for option \"--HydrogenBonds\" is not valid. Allowed values: HBondsType1, HydrogenBondsType1, HBondsType2, HydrogenBondsType2\n";
  }
  if ($Options{keeplargestcomponent} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{keeplargestcomponent}, for option \"-k, --KeepLargestComponent\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{output} !~ /^(SD|text|both)$/i) {
    die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, text, or both\n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{quote} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{ruleof3violations} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{ruleof3violations}, for option \"--RuleOf3Violations\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{ruleof5violations} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{ruleof5violations}, for option \"--RuleOf5Violations\" is not valid. Allowed values: Yes or No\n";
  }
}
