MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: TopologicalPharmacophoreAtomPairsFingerprints.pl,v $
   4 # $Date: 2015/02/28 20:46:23 $
   5 # $Revision: 1.36 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2015 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use strict;
  30 use FindBin; use lib "$FindBin::Bin/../lib";
  31 use Getopt::Long;
  32 use File::Basename;
  33 use Text::ParseWords;
  34 use Benchmark;
  35 use FileUtil;
  36 use TextUtil;
  37 use SDFileUtil;
  38 use MoleculeFileIO;
  39 use FileIO::FingerprintsSDFileIO;
  40 use FileIO::FingerprintsTextFileIO;
  41 use FileIO::FingerprintsFPFileIO;
  42 use AtomTypes::FunctionalClassAtomTypes;
  43 use Fingerprints::TopologicalPharmacophoreAtomPairsFingerprints;
  44 
  # File-scoped state shared with the subroutines defined later in this file...
  my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

  # Autoflush STDOUT so that progress messages show up immediately...
  $| = 1;

  # Announce start and begin timing the run...
  $ScriptName = basename($0);
  print "\n$ScriptName: Starting...\n\n";
  $StartTime = Benchmark->new;

  # Retrieve command line options; show usage when help is requested or no
  # input file is specified...
  SetupScriptUsage();
  die GetUsageFromPod("$FindBin::Bin/$ScriptName") if ($Options{help} || @ARGV < 1);

  my @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

  # Convert raw option values into %OptionsInfo...
  print "Processing options...\n";
  my(%OptionsInfo);
  ProcessOptions();

  # Collect per file information into %SDFilesInfo...
  print "Checking input SD file(s)...\n";
  my(%SDFilesInfo);
  RetrieveSDFilesInfo();

  # Generate fingerprints for each input file which passed all the checks...
  my($FileIndex);
  print "\nProcessing SD files...\n" if (@SDFilesList > 1);

  for $FileIndex (0 .. $#SDFilesList) {
    next unless $SDFilesInfo{FileOkay}[$FileIndex];

    print "\nProcessing file $SDFilesList[$FileIndex]...\n";
    GenerateTopologicalPharmacophoreAtomPairsFingerprints($FileIndex);
  }
  print "\n$ScriptName:Done...\n\n";

  # Report elapsed time...
  $EndTime = Benchmark->new;
  $TotalTime = timediff($EndTime, $StartTime);
  print "Total time: ", timestr($TotalTime), "\n";
  90 
  91 ###############################################################################
  92 
  # Generate fingerprints for all the compounds in a SD file...
  #
  # $FileIndex identifies the input file in @SDFilesList. Each molecule is read
  # in turn, optionally filtered, and passed on for fingerprints generation.
  # Output files are set up and opened only after fingerprints have been
  # generated successfully for the first time; compounds which are filtered out
  # or for which generation fails are counted as ignored. Summary statistics
  # are written out at the end.
  #
  sub GenerateTopologicalPharmacophoreAtomPairsFingerprints {
    my($FileIndex) = @_;
    my($Molecule, $Fingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);

    my $SDFile = $SDFilesList[$FileIndex];
    my $OutputFilesNeedSetup = 1;
    my $CmpdCount = 0;
    my $IgnoredCmpdCount = 0;

    my $MoleculeFileIO = MoleculeFileIO->new('Name' => $SDFile);
    $MoleculeFileIO->Open();

    COMPOUND: while ($Molecule = $MoleculeFileIO->ReadMolecule()) {
      $CmpdCount++;

      # Filter compound data before calculating fingerprints...
      if ($OptionsInfo{Filter} && CheckAndFilterCompound($CmpdCount, $Molecule)) {
        $IgnoredCmpdCount++;
        next COMPOUND;
      }

      $Fingerprints = GenerateMoleculeFingerprints($Molecule);
      unless ($Fingerprints) {
        $IgnoredCmpdCount++;
        ProcessIgnoredCompound('FingerprintsGenerationFailed', $CmpdCount, $Molecule);
        next COMPOUND;
      }

      # The fingerprints label may get value IDs appended using the first
      # fingerprints object, so it is finalized before any file is opened...
      if ($OutputFilesNeedSetup) {
        $OutputFilesNeedSetup = 0;
        SetupFingerprintsLabelValueIDs($Fingerprints);
        ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = SetupAndOpenOutputFiles($FileIndex);
      }

      WriteDataToOutputFiles($FileIndex, $CmpdCount, $Molecule, $Fingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
    }
    $MoleculeFileIO->Close();

    # Close whichever output files were opened...
    for my $OutputFileIO ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) {
      $OutputFileIO->Close() if $OutputFileIO;
    }

    WriteFingerprintsGenerationSummaryStatistics($CmpdCount, $IgnoredCmpdCount);
  }
 150 
 # Process compound being ignored due to problems in fingerprints generation...
 #
 # $Mode selects the warning text: ContainsNonElementalData,
 # ContainsNoElementalData or FingerprintsGenerationFailed (matched case
 # insensitively). Any other value gets the same message as
 # FingerprintsGenerationFailed. The compound is identified in the message by
 # its record number and the ID generated by SetupCmpdIDForOutputFiles.
 #
 sub ProcessIgnoredCompound {
   my($Mode, $CmpdCount, $Molecule) = @_;

   # Retrieve data fields through a scalar assignment before passing them on...
   my $DataFieldLabelAndValuesRef = $Molecule->GetDataFieldLabelAndValues();
   my $CmpdID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);

   if ($Mode =~ /^ContainsNonElementalData$/i) {
     warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains atom data corresponding to non-elemental atom symbol(s)...\n\n";
   }
   elsif ($Mode =~ /^ContainsNoElementalData$/i) {
     warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains no atom data...\n\n";
   }
   elsif ($Mode =~ /^FingerprintsGenerationFailed$/i) {
     warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Fingerprints generation didn't succeed...\n\n";
   }
   else {
     # Unrecognized modes are reported as a generation failure...
     warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Fingerprints generation didn't succeed...\n\n";
   }
 }
 178 
 # Check and filter compounds...
 #
 # Returns 1, after reporting the compound through ProcessIgnoredCompound,
 # when the molecule contains any non-elemental atom data or contains no
 # elemental atom data at all; otherwise returns 0.
 #
 sub CheckAndFilterCompound {
   my($CmpdCount, $Molecule) = @_;

   my($NumOfElements, $NumOfNonElements) = $Molecule->GetNumOfElementsAndNonElements();

   # Presence of non-elemental data takes precedence over missing atom data...
   my $IgnoreMode = $NumOfNonElements ? 'ContainsNonElementalData'
     : (!$NumOfElements) ? 'ContainsNoElementalData'
     : '';

   return 0 unless $IgnoreMode;

   ProcessIgnoredCompound($IgnoreMode, $CmpdCount, $Molecule);
   return 1;
 }
 199 
 # Write out compounds fingerprints generation summary statistics...
 #
 # Prints the total number of compounds, the number processed successfully
 # (total minus ignored) and the number ignored to the currently selected
 # output handle.
 #
 sub WriteFingerprintsGenerationSummaryStatistics {
   my($CmpdCount, $IgnoredCmpdCount) = @_;
   my $ProcessedCmpdCount = $CmpdCount - $IgnoredCmpdCount;

   print "\nNumber of compounds: $CmpdCount\n",
     "Number of compounds processed successfully during fingerprints generation: $ProcessedCmpdCount\n",
     "Number of compounds ignored during fingerprints generation: $IgnoredCmpdCount\n";
 }
 212 
 # Append atom pair value IDs to fingerprint label...
 #
 # Nothing is done for ArbitrarySize atom pairs sets or when the fingerprints
 # label mode is anything other than FingerprintsLabelWithIDs. Otherwise, the
 # value IDs string from the fingerprints vector of the specified fingerprints
 # object is appended to $OptionsInfo{FingerprintsLabel}.
 #
 sub SetupFingerprintsLabelValueIDs {
   my($Fingerprints) = @_;

   return if $OptionsInfo{AtomPairsSetSizeToUse} =~ /^ArbitrarySize$/i;
   return if $OptionsInfo{FingerprintsLabelMode} !~ /^FingerprintsLabelWithIDs$/i;

   my $ValueIDsString = $Fingerprints->GetFingerprintsVector->GetValueIDsString();
   $OptionsInfo{FingerprintsLabel} .= "; Value IDs: " . $ValueIDsString;
 }
 225 
 # Open output files...
 #
 # Creates and opens a fingerprints file IO object for each output type
 # enabled in %OptionsInfo (SDOutput, FPOutput, TextOutput), using the file
 # names stored in %SDFilesInfo for $FileIndex.
 #
 # Returns the list ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO); the
 # entry for an output type which is not enabled is undef.
 #
 sub SetupAndOpenOutputFiles {
   my($FileIndex) = @_;
   my($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);

   # Parameters common to all fingerprints file IO objects...
   my %CommonFileIOParams = (
     'Mode' => 'Write',
     'Overwrite' => $OptionsInfo{OverwriteFiles},
     'FingerprintsStringMode' => 'FingerprintsVectorString',
     'VectorStringFormat' => $OptionsInfo{VectorStringFormat},
   );

   if ($OptionsInfo{SDOutput}) {
     my $SDOutFile = $SDFilesInfo{SDOutFileNames}[$FileIndex];
     print "Generating SD file $SDOutFile...\n";

     $NewFPSDFileIO = FileIO::FingerprintsSDFileIO->new('Name' => $SDOutFile, %CommonFileIOParams, 'FingerprintsFieldLabel' => $OptionsInfo{FingerprintsLabel});
     $NewFPSDFileIO->Open();
   }

   if ($OptionsInfo{FPOutput}) {
     my $FPOutFile = $SDFilesInfo{FPOutFileNames}[$FileIndex];
     print "Generating FP file $FPOutFile...\n";

     $NewFPFileIO = FileIO::FingerprintsFPFileIO->new('Name' => $FPOutFile, %CommonFileIOParams);
     $NewFPFileIO->Open();
   }

   if ($OptionsInfo{TextOutput}) {
     my $TextOutFile = $SDFilesInfo{TextOutFileNames}[$FileIndex];
     my $ColLabelsRef = SetupFPTextFileCoulmnLabels($FileIndex);
     print "Generating text file $TextOutFile...\n";

     $NewFPTextFileIO = FileIO::FingerprintsTextFileIO->new('Name' => $TextOutFile, %CommonFileIOParams, 'DataColLabels' => $ColLabelsRef, 'OutDelim' => $OptionsInfo{OutDelim}, 'OutQuote' => $OptionsInfo{OutQuote});
     $NewFPTextFileIO->Open();
   }

   return ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
 }
 265 
 # Write fingerprints and other data to appropriate output files...
 #
 # Each of $NewFPSDFileIO, $NewFPTextFileIO and $NewFPFileIO is either a file
 # IO object or a false value; fingerprints are written only to those which
 # are set. SD output gets the input molecule string, text output gets the
 # column values for the compound, and FP output gets the compound ID.
 #
 sub WriteDataToOutputFiles {
   my($FileIndex, $CmpdCount, $Molecule, $Fingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = @_;

   # Data fields are needed only for text and FP output...
   my $DataFieldLabelAndValuesRef = ($NewFPTextFileIO || $NewFPFileIO) ? $Molecule->GetDataFieldLabelAndValues() : undef;

   if ($NewFPSDFileIO) {
     my $InputCmpdString = $Molecule->GetInputMoleculeString();
     $NewFPSDFileIO->WriteFingerprints($Fingerprints, $InputCmpdString);
   }

   if ($NewFPTextFileIO) {
     my $TextColValuesRef = SetupFPTextFileCoulmnValues($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
     $NewFPTextFileIO->WriteFingerprints($Fingerprints, $TextColValuesRef);
   }

   if ($NewFPFileIO) {
     my $CmpdID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
     $NewFPFileIO->WriteFingerprints($Fingerprints, $CmpdID);
   }
 }
 298 
 # Generate appropriate column labels for FPText output file...
 #
 # The data column labels depend on $OptionsInfo{DataFieldsMode}: all or
 # common data fields collected for the file at $FileIndex, the explicitly
 # specified data fields, or the compound ID label. The fingerprints label is
 # always the last column.
 #
 # Returns a reference to the array of column labels.
 #
 sub SetupFPTextFileCoulmnLabels {
   my($FileIndex) = @_;
   my $DataFieldsMode = $OptionsInfo{DataFieldsMode};

   my @DataColLabels = ($DataFieldsMode =~ /^All$/i) ? @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]}
     : ($DataFieldsMode =~ /^Common$/i) ? @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]}
     : ($DataFieldsMode =~ /^Specify$/i) ? @{$OptionsInfo{SpecifiedDataFields}}
     : ($DataFieldsMode =~ /^CompoundID$/i) ? ($OptionsInfo{CompoundIDLabel})
     : ();

   # Add fingerprints label...
   return [@DataColLabels, $OptionsInfo{FingerprintsLabel}];
 }
 323 
 # Generate column values for FPText output file...
 #
 # In CompoundID data fields mode, the only value is the compound ID. In All,
 # Common and Specify modes, there is one value for each data field label in
 # the corresponding list; a label which is missing from
 # $DataFieldLabelAndValuesRef gets an empty string.
 #
 # Returns a reference to the array of column values.
 #
 sub SetupFPTextFileCoulmnValues {
   my($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;
   my $DataFieldsMode = $OptionsInfo{DataFieldsMode};

   if ($DataFieldsMode =~ /^CompoundID$/i) {
     return [SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef)];
   }

   # Pick the list of data field labels driving the columns...
   my @DataFieldLabels = ();
   if ($DataFieldsMode =~ /^All$/i) {
     @DataFieldLabels = @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
   }
   elsif ($DataFieldsMode =~ /^Common$/i) {
     @DataFieldLabels = @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
   }
   elsif ($DataFieldsMode =~ /^Specify$/i) {
     @DataFieldLabels = @{$OptionsInfo{SpecifiedDataFields}};
   }

   my @ColValues = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : '' } @DataFieldLabels;

   return \@ColValues;
 }
 346 
 # Generate compound ID for FP and FPText output files...
 #
 # The ID depends on $OptionsInfo{CompoundIDMode}:
 #   MolNameOrLabelPrefix - molecule name or, when it is false, label prefix
 #                          followed by compound number
 #   LabelPrefix          - label prefix followed by compound number
 #   DataField            - value of data field named by $OptionsInfo{CompoundID}
 #                          or empty string when the field doesn't exist
 #   MolName              - molecule name
 # Any other mode gives an empty string.
 #
 sub SetupCmpdIDForOutputFiles {
   my($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;
   my $IDMode = $OptionsInfo{CompoundIDMode};

   if ($IDMode =~ /^MolNameOrLabelPrefix$/i) {
     my $MolName = $Molecule->GetName();
     return $MolName if $MolName;
     return "$OptionsInfo{CompoundID}${CmpdCount}";
   }

   if ($IDMode =~ /^LabelPrefix$/i) {
     return "$OptionsInfo{CompoundID}${CmpdCount}";
   }

   if ($IDMode =~ /^DataField$/i) {
     my $IDDataField = $OptionsInfo{CompoundID};
     return exists $DataFieldLabelAndValuesRef->{$IDDataField} ? $DataFieldLabelAndValuesRef->{$IDDataField} : '';
   }

   if ($IDMode =~ /^MolName$/i) {
     # Retrieve name through a scalar assignment before returning it...
     my $MolName = $Molecule->GetName();
     return $MolName;
   }

   return '';
 }
 372 
 # Generate fingerprints for molecule...
 #
 # Optionally keeps only the largest component of the molecule, detects rings
 # and aromaticity using the specified aromaticity model, and then generates
 # topological pharmacophore atom pairs fingerprints using values in
 # %OptionsInfo. Fuzzification parameters are passed to the fingerprints
 # object only when fuzzification of atom pairs count is turned on.
 #
 # Returns the fingerprints object, or undef when ring detection or
 # fingerprints generation doesn't succeed.
 #
 sub GenerateMoleculeFingerprints {
   my($Molecule) = @_;

   $Molecule->KeepLargestComponent() if $OptionsInfo{KeepLargestComponent};

   if (!$Molecule->DetectRings()) {
     return undef;
   }
   $Molecule->SetAromaticityModel($OptionsInfo{AromaticityModel});
   $Molecule->DetectAromaticity();

   # Parameters used irrespective of fuzzification...
   my @FingerprintsParams = (
     'Molecule' => $Molecule,
     'AtomPairsSetSizeToUse' => $OptionsInfo{AtomPairsSetSizeToUse},
     'MinDistance' => $OptionsInfo{MinDistance},
     'MaxDistance' => $OptionsInfo{MaxDistance},
     'AtomTypesToUse' => \@{$OptionsInfo{AtomTypesToUse}},
     'NormalizationMethodology' => $OptionsInfo{NormalizationMethodology},
     'ValuesPrecision' => $OptionsInfo{ValuesPrecision},
   );

   if ($OptionsInfo{FuzzifyAtomPairsCount}) {
     push @FingerprintsParams, (
       'FuzzifyAtomPairsCount' => $OptionsInfo{FuzzifyAtomPairsCount},
       'FuzzificationMode' => $OptionsInfo{FuzzificationMode},
       'FuzzificationMethodology' => $OptionsInfo{FuzzificationMethodology},
       'FuzzFactor' => $OptionsInfo{FuzzFactor},
     );
   }

   my $Fingerprints = Fingerprints::TopologicalPharmacophoreAtomPairsFingerprints->new(@FingerprintsParams);

   # Set atom types weights...
   if ($OptionsInfo{UseAtomTypesWeight}) {
     $Fingerprints->SetAtomTypesWeight(%{$OptionsInfo{AtomTypesWeight}});
   }

   # Generate fingerprints...
   $Fingerprints->GenerateFingerprints();

   # Make sure fingerprints generation is successful...
   if (!$Fingerprints->IsFingerprintsGenerationSuccessful()) {
     return undef;
   }

   return $Fingerprints;
 }
 410 
 # Retrieve information about SD files...
 #
 # Resets %SDFilesInfo and fills in the following arrays, each indexed in
 # parallel with @SDFilesList:
 #
 #   FileOkay                              - 1 when the file passed all checks; otherwise 0
 #   OutFileRoot                           - root used for output file names
 #   SDOutFileNames, FPOutFileNames,
 #   TextOutFileNames                      - output file names
 #   AllDataFieldsRef, CommonDataFieldsRef - data field labels, collected only for
 #                                           text output in All or Common data
 #                                           fields mode; empty string otherwise
 #
 # A file is ignored, with a warning, when: it doesn't exist; it's not a SD
 # file; the data field to be used as compound ID is missing from its first
 # compound; SD output would use the name of the input file; or, unless
 # overwriting is turned on, an output file already exists. Entries for an
 # ignored file keep their initial values (0 or empty string) and its data
 # fields entries are not set.
 #
 # Dies when a SD file can't be opened for reading.
 #
 sub RetrieveSDFilesInfo {
   my($SDFile, $Index, $FileDir, $FileExt, $FileName, $OutFileRoot, $TextOutFileExt, $SDOutFileExt, $FPOutFileExt, $NewSDFileName, $NewFPFileName, $NewTextFileName, $CheckDataField, $CollectDataFields, $AllDataFieldsRef, $CommonDataFieldsRef);

   %SDFilesInfo = ();
   @{$SDFilesInfo{FileOkay}} = ();
   @{$SDFilesInfo{OutFileRoot}} = ();
   @{$SDFilesInfo{SDOutFileNames}} = ();
   @{$SDFilesInfo{FPOutFileNames}} = ();
   @{$SDFilesInfo{TextOutFileNames}} = ();
   @{$SDFilesInfo{AllDataFieldsRef}} = ();
   @{$SDFilesInfo{CommonDataFieldsRef}} = ();

   $CheckDataField = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) && ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i)) ? 1 : 0;
   $CollectDataFields = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^(All|Common)$/i)) ? 1 : 0;

   FILELIST: for $Index (0 .. $#SDFilesList) {
     $SDFile = $SDFilesList[$Index];

     $SDFilesInfo{FileOkay}[$Index] = 0;
     $SDFilesInfo{OutFileRoot}[$Index] = '';
     $SDFilesInfo{SDOutFileNames}[$Index] = '';
     $SDFilesInfo{FPOutFileNames}[$Index] = '';
     $SDFilesInfo{TextOutFileNames}[$Index] = '';

     if (!(-e $SDFile)) {
       warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
       next FILELIST;
     }
     if (!CheckFileType($SDFile, "sd sdf")) {
       warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
       next FILELIST;
     }

     if ($CheckDataField) {
       # Make sure data field exists in the first compound of SD file...
       my($CmpdString, $SpecifiedDataField, @CmpdLines, %DataFieldValues);

       # Three-argument open keeps characters in the file name from being
       # interpreted as an open mode or a pipe...
       open SDFILE, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
       $CmpdString = ReadCmpdString(\*SDFILE);
       close SDFILE;

       @CmpdLines = split "\n", $CmpdString;
       %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
       $SpecifiedDataField = $OptionsInfo{CompoundID};
       if (!exists $DataFieldValues{$SpecifiedDataField}) {
         warn "Warning: Ignoring file $SDFile: Data field value, $SpecifiedDataField, using  \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\" doesn't exist\n";
         next FILELIST;
       }
     }

     $AllDataFieldsRef = '';
     $CommonDataFieldsRef = '';
     if ($CollectDataFields) {
       my($CmpdCount);

       open SDFILE, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
       ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = GetAllAndCommonCmpdDataHeaderLabels(\*SDFILE);
       close SDFILE;
     }

     # Setup output file names...
     $FileDir = ""; $FileName = ""; $FileExt = "";
     ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);

     $TextOutFileExt = ($OptionsInfo{OutDelim} =~ /^tab$/i) ? "tsv" : "csv";
     $SDOutFileExt = $FileExt;
     $FPOutFileExt = "fpf";

     # The specified root is used only for a single input file...
     if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
       if ($RootFileName && $RootFileExt) {
         $FileName = $RootFileName;
       }
       else {
         $FileName = $OptionsInfo{OutFileRoot};
       }
       $OutFileRoot = $FileName;
     }
     else {
       $OutFileRoot = "${FileName}TopologicalPharmacophoreAtomPairsFP";
     }

     $NewSDFileName = "${OutFileRoot}.${SDOutFileExt}";
     $NewFPFileName = "${OutFileRoot}.${FPOutFileExt}";
     $NewTextFileName = "${OutFileRoot}.${TextOutFileExt}";

     if ($OptionsInfo{SDOutput}) {
       # Quote the file name so that any regular expression metacharacters
       # in it, such as "." or "+", are matched literally...
       if ($SDFile =~ /\Q$NewSDFileName\E/i) {
         warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
         print "Specify a different name using \"-r --root\" option or use default name.\n";
         next FILELIST;
       }
     }

     if (!$OptionsInfo{OverwriteFiles}) {
       # Check SD, FP and text output files...
       if ($OptionsInfo{SDOutput}) {
         if (-e $NewSDFileName) {
           warn "Warning: Ignoring file $SDFile: The file $NewSDFileName already exists\n";
           next FILELIST;
         }
       }
       if ($OptionsInfo{FPOutput}) {
         if (-e $NewFPFileName) {
           warn "Warning: Ignoring file $SDFile: The file $NewFPFileName already exists\n";
           next FILELIST;
         }
       }
       if ($OptionsInfo{TextOutput}) {
         if (-e $NewTextFileName) {
           warn "Warning: Ignoring file $SDFile: The file $NewTextFileName already exists\n";
           next FILELIST;
         }
       }
     }

     $SDFilesInfo{FileOkay}[$Index] = 1;

     $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
     $SDFilesInfo{SDOutFileNames}[$Index] = $NewSDFileName;
     $SDFilesInfo{FPOutFileNames}[$Index] = $NewFPFileName;
     $SDFilesInfo{TextOutFileNames}[$Index] = $NewTextFileName;

     $SDFilesInfo{AllDataFieldsRef}[$Index] = $AllDataFieldsRef;
     $SDFilesInfo{CommonDataFieldsRef}[$Index] = $CommonDataFieldsRef;
   }
 }
 543 
 # Process option values...
 #
 # Resets %OptionsInfo and fills it in using raw option values in %Options:
 # atom types to use and their weights are handled by dedicated functions;
 # Yes/No values are converted to 1/0; and a default vector string format is
 # chosen when none is specified.
 #
 # Dies when: no compound ID data field is specified in DataField compound ID
 # mode; no data fields are specified in Specify data fields mode; or
 # ValuesString vector string format is explicitly specified for ArbitrarySize
 # atom pairs set.
 #
 # NOTE(review): $OptionsInfo{CompoundID} is set only when data fields mode is
 # CompoundID; in other data fields modes it stays an empty string - confirm
 # that's intended for compound IDs generated for FP output and warnings.
 #
 sub ProcessOptions {
   %OptionsInfo = ();

   ProcessAtomTypesToUseOption();
   ProcessAtomTypesWeightOption();

   $OptionsInfo{AromaticityModel} = $Options{aromaticitymodel};

   $OptionsInfo{AtomPairsSetSizeToUse} = $Options{atompairssetsizetouse};

   $OptionsInfo{CompoundIDMode} = $Options{compoundidmode};
   $OptionsInfo{CompoundIDLabel} = $Options{compoundidlabel};
   $OptionsInfo{DataFieldsMode} = $Options{datafieldsmode};

   my(@SpecifiedDataFields);
   @SpecifiedDataFields = ();

   @{$OptionsInfo{SpecifiedDataFields}} = ();
   $OptionsInfo{CompoundID} = '';

   if ($Options{datafieldsmode} =~ /^CompoundID$/i) {
     if ($Options{compoundidmode} =~ /^DataField$/i) {
       if (!$Options{compoundid}) {
         die "Error: You must specify a value for \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\". \n";
       }
       $OptionsInfo{CompoundID} = $Options{compoundid};
     }
     elsif ($Options{compoundidmode} =~ /^(LabelPrefix|MolNameOrLabelPrefix)$/i) {
       # Default label prefix...
       $OptionsInfo{CompoundID} = $Options{compoundid} ? $Options{compoundid} : 'Cmpd';
     }
   }
   elsif ($Options{datafieldsmode} =~ /^Specify$/i) {
     if (!$Options{datafields}) {
       die "Error: You must specify a value for \"--DataFields\" option in \"Specify\" \"-d, --DataFieldsMode\". \n";
     }
     @SpecifiedDataFields = split /\,/, $Options{datafields};
     push @{$OptionsInfo{SpecifiedDataFields}}, @SpecifiedDataFields;
   }

   $OptionsInfo{Filter} = ($Options{filter} =~ /^Yes$/i) ? 1 : 0;

   $OptionsInfo{FingerprintsLabelMode} = $Options{fingerprintslabelmode};
   $OptionsInfo{FingerprintsLabel} = $Options{fingerprintslabel} ? $Options{fingerprintslabel} : 'TopologicalPharmacophoreAtomPairsFingerprints';

   $OptionsInfo{FuzzifyAtomPairsCount} = ($Options{fuzzifyatompairscount} =~ /^Yes$/i) ? 1 : 0;
   $OptionsInfo{FuzzificationMode} = $Options{fuzzificationmode};
   $OptionsInfo{FuzzificationMethodology} = $Options{fuzzificationmethodology};
   $OptionsInfo{FuzzFactor} = $Options{fuzzfactor};

   $OptionsInfo{KeepLargestComponent} = ($Options{keeplargestcomponent} =~ /^Yes$/i) ? 1 : 0;

   $OptionsInfo{MinDistance} = $Options{mindistance};
   $OptionsInfo{MaxDistance} = $Options{maxdistance};

   $OptionsInfo{NormalizationMethodology} = $Options{normalizationmethodology};

   $OptionsInfo{Output} = $Options{output};
   $OptionsInfo{SDOutput} = ($Options{output} =~ /^(SD|All)$/i) ? 1 : 0;
   $OptionsInfo{FPOutput} = ($Options{output} =~ /^(FP|All)$/i) ? 1 : 0;
   $OptionsInfo{TextOutput} = ($Options{output} =~ /^(Text|All)$/i) ? 1 : 0;

   $OptionsInfo{OutDelim} = $Options{outdelim};
   $OptionsInfo{OutQuote} = ($Options{quote} =~ /^Yes$/i) ? 1 : 0;

   $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
   $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;

   $OptionsInfo{ValuesPrecision} = $Options{valuesprecision};

   # Setup default vector string format...
   my($VectorStringFormat);
   $VectorStringFormat = '';

   if ($Options{vectorstringformat}) {
     $VectorStringFormat = $Options{vectorstringformat};

     if ($Options{atompairssetsizetouse} =~ /^ArbitrarySize$/i && $VectorStringFormat =~ /^ValuesString$/i) {
       die "Error: The value specified, $Options{vectorstringformat}, for option \"-v, --VectorStringFormat\" is not valid for $Options{atompairssetsizetouse} value of \"--AtomPairsSetSizeToUse\" option. Allowed values: IDsAndValuesString, IDsAndValuesPairsString, ValuesAndIDsString or ValuesAndIDsPairsString\n";
     }
   }
   else {
     # Match case insensitively, as done everywhere else for this option, so
     # that a value such as "fixedsize" gets the FixedSize default as well...
     $VectorStringFormat = ($Options{atompairssetsizetouse} =~ /^FixedSize$/i) ? "ValuesString" : "IDsAndValuesString";
   }
   $OptionsInfo{VectorStringFormat} = $VectorStringFormat;
 }
 630 
 # Process atom types to use option...
 #
 # Removes spaces from the comma delimited value of atomtypestouse option and
 # stores each atom type, in order, in @{$OptionsInfo{AtomTypesToUse}}.
 #
 # Dies when the option value is empty or when an atom type is not an
 # available functional class.
 #
 sub ProcessAtomTypesToUseOption {
   @{$OptionsInfo{AtomTypesToUse}} = ();

   die "Error: Atom types value specified using \"-a, --AtomTypesToUse\" option is empty\n" if IsEmpty($Options{atomtypestouse});

   (my $AtomTypesSpec = $Options{atomtypestouse}) =~ s/ //g;

   for my $AtomType (split /\,/, $AtomTypesSpec) {
     unless (AtomTypes::FunctionalClassAtomTypes::IsFunctionalClassAvailable($AtomType)) {
       die "Error: Atomic type specified, $AtomType, using \"-a, --AtomTypesToUse\" option is not valid...\n ";
     }
     push @{$OptionsInfo{AtomTypesToUse}}, $AtomType;
   }
 }
 652 
 # Process atom types weight option...
 #
 # Sets $OptionsInfo{UseAtomTypesWeight} to 0 when the value of atomtypesweight
 # option is None and to 1 otherwise. In the latter case, the value is treated
 # as a comma delimited list of atom type and weight pairs, with any spaces
 # removed, and the weights are stored in %{$OptionsInfo{AtomTypesWeight}}
 # using atom types as keys.
 #
 # Dies when: the option value is empty; the number of values is odd; an atom
 # type is not an available functional class; or a weight is not a real
 # number >= 0.
 #
 sub ProcessAtomTypesWeightOption {
   %{$OptionsInfo{AtomTypesWeight}} = ();

   die "Error: Atom types weight value specified using \"--AtomTypesWeight\" option is empty\n" if IsEmpty($Options{atomtypesweight});

   $OptionsInfo{UseAtomTypesWeight} = ($Options{atomtypesweight} =~ /^None$/i) ? 0 : 1;
   return unless $OptionsInfo{UseAtomTypesWeight};

   # Process specified atom type/weight pairs...
   (my $WeightsSpec = $Options{atomtypesweight}) =~ s/ //g;
   my @TypesAndWeights = split /\,/, $WeightsSpec;

   if (@TypesAndWeights % 2) {
     die "Error: Invalid number of values specified using \"--AtomTypesWeight\" option: It must contain even number of values.\n";
   }

   # Take the values off the front two at a time...
   while (my($AtomType, $AtomTypeWeight) = splice(@TypesAndWeights, 0, 2)) {
     unless (AtomTypes::FunctionalClassAtomTypes::IsFunctionalClassAvailable($AtomType)) {
       die "Error: Atom type specified, $AtomType, using \"--AtomTypesWeight\" option is not valid\n ";
     }
     unless (IsFloat($AtomTypeWeight) && $AtomTypeWeight >= 0) {
       die "Error: Atom type weight specified, $AtomTypeWeight, using option \"--AtomTypesWeight\" is not valid. Allowed values: real numbers >= 0 \n";
     }
     $OptionsInfo{AtomTypesWeight}{$AtomType} = $AtomTypeWeight;
   }
 }
 688 
 689 # Setup script usage  and retrieve command line arguments specified using various options...
 690 sub SetupScriptUsage {
 691 
 692   # Retrieve all the options...
 693   %Options = ();
 694 
 695   $Options{aromaticitymodel} = 'MayaChemToolsAromaticityModel';
 696 
 697   $Options{atompairssetsizetouse} = 'ArbitrarySize';
 698 
 699   $Options{atomtypestouse} = 'HBD,HBA,PI,NI,H';
 700   $Options{atomtypesweight} = 'None';
 701 
 702   $Options{compoundidmode} = 'LabelPrefix';
 703   $Options{compoundidlabel} = 'CompoundID';
 704   $Options{datafieldsmode} = 'CompoundID';
 705 
 706   $Options{filter} = 'Yes';
 707 
 708   $Options{fingerprintslabelmode} = 'FingerprintsLabelOnly';
 709 
 710   $Options{fuzzifyatompairscount} = 'No';
 711   $Options{fuzzificationmode} = 'AfterNormalization';
 712   $Options{fuzzificationmethodology} = 'FuzzyBinning';
 713   $Options{fuzzfactor} = 0.15;
 714 
 715   $Options{keeplargestcomponent} = 'Yes';
 716 
 717   $Options{mindistance} = 1;
 718   $Options{maxdistance} = 10;
 719 
 720   $Options{normalizationmethodology} = 'None';
 721 
 722   $Options{output} = 'text';
 723   $Options{outdelim} = 'comma';
 724   $Options{quote} = 'yes';
 725 
 726   $Options{valuesprecision} = 2;
 727 
 728   $Options{vectorstringformat} = '';
 729 
 730   if (!GetOptions(\%Options, "aromaticitymodel=s", "atompairssetsizetouse=s", "atomtypestouse|a=s", "atomtypesweight=s", "compoundid=s", "compoundidlabel=s", "compoundidmode=s", "datafields=s", "datafieldsmode|d=s", "filter|f=s", "fingerprintslabelmode=s", "fingerprintslabel=s", "fuzzifyatompairscount=s", "fuzzificationmode=s", "fuzzificationmethodology=s", "fuzzfactor=s", "help|h", "keeplargestcomponent|k=s",  "mindistance=s", "maxdistance=s", "normalizationmethodology|n=s", "outdelim=s", "output=s", "overwrite|o", "quote|q=s", "root|r=s", "valuesprecision=s", "vectorstringformat|v=s", "workingdir|w=s")) {
 731     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 732   }
 733   if ($Options{workingdir}) {
 734     if (! -d $Options{workingdir}) {
 735       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 736     }
 737     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 738   }
 739   if (!Molecule::IsSupportedAromaticityModel($Options{aromaticitymodel})) {
 740     my(@SupportedModels) = Molecule::GetSupportedAromaticityModels();
 741     die "Error: The value specified, $Options{aromaticitymodel}, for option \"--AromaticityModel\" is not valid. Supported aromaticity models in current release of MayaChemTools: @SupportedModels\n";
 742   }
 743   if ($Options{atompairssetsizetouse} !~ /^(ArbitrarySize|FixedSize)$/i) {
 744     die "Error: The value specified, $Options{atompairssetsizetouse}, for option \"--AtomPairsSetSizeToUse\" is not valid. Allowed values: ArbitrarySize or FixedSize\n";
 745   }
 746   if ($Options{compoundidmode} !~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i) {
 747     die "Error: The value specified, $Options{compoundidmode}, for option \"--CompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n";
 748   }
 749   if ($Options{datafieldsmode} !~ /^(All|Common|Specify|CompoundID)$/i) {
 750     die "Error: The value specified, $Options{datafieldsmode}, for option \"-d, --DataFieldsMode\" is not valid. Allowed values: All, Common, Specify or CompoundID\n";
 751   }
 752   if ($Options{filter} !~ /^(Yes|No)$/i) {
 753     die "Error: The value specified, $Options{filter}, for option \"-f, --Filter\" is not valid. Allowed values: Yes or No\n";
 754   }
 755   if ($Options{fingerprintslabelmode} !~ /^(FingerprintsLabelOnly|FingerprintsLabelWithIDs)$/i) {
 756     die "Error: The value specified, $Options{fingerprintslabelmode}, for option \"--FingerprintsLabelMode\" is not valid. Allowed values: FingerprintsLabelOnly or FingerprintsLabelWithIDs\n";
 757   }
 758   if ($Options{fuzzifyatompairscount} !~ /^(Yes|No)$/i) {
 759     die "Error: The value specified, $Options{fuzzifyatompairscount}, for option \"--FuzzifyAtomPairsCount\" is not valid. Allowed values: Yes or No\n";
 760   }
 761   if ($Options{fuzzificationmode} !~ /^(BeforeNormalization|AfterNormalization)$/i) {
 762     die "Error: The value specified, $Options{fuzzificationmode}, for option \"--FuzzificationMode\" is not valid. Allowed values: BeforeNormalization or AfterNormalization\n";
 763   }
 764   if ($Options{fuzzificationmethodology} !~ /^(FuzzyBinning|FuzzyBinSmoothing)$/i) {
 765     die "Error: The value specified, $Options{fuzzificationmethodology}, for option \"--FuzzificationMethodology\" is not valid. Allowed values: FuzzyBinning or FuzzyBinSmoothing\n";
 766   }
 767   if (!IsFloat($Options{fuzzfactor})) {
 768     die "Error: The value specified, $Options{fuzzfactor}, for option \"--FuzzFactor\" is not valid. Allowed values: real numbers >= 0 \n";
 769   }
 770   if ($Options{fuzzificationmethodology} !~ /^FuzzyBinning$/i) {
 771     if (!($Options{fuzzfactor} >=0 && $Options{fuzzfactor} <= 1.0)) {
 772       die "Error: The value specified, $Options{fuzzfactor}, for option \"--FuzzFactor\" during FuzzyBinning \"--FuzzificationMethodology\" is not valid. Allowed values: >= 0 and <= 1 \n";
 773     }
 774   }
 775   elsif ($Options{fuzzificationmethodology} !~ /^FuzzyBinSmoothing$/i) {
 776     if (!($Options{fuzzfactor} >=0 && $Options{fuzzfactor} <= 0.5)) {
 777       die "Error: The value specified, $Options{fuzzfactor}, for option \"--FuzzFactor\" during FuzzyBinSmoothing \"--FuzzificationMethodology\" is not valid. Allowed values: >= 0 and <= 0.5 \n";
 778     }
 779   }
 780   if ($Options{keeplargestcomponent} !~ /^(Yes|No)$/i) {
 781     die "Error: The value specified, $Options{keeplargestcomponent}, for option \"-k, --KeepLargestComponent\" is not valid. Allowed values: Yes or No\n";
 782   }
 783   if (!IsInteger($Options{mindistance})) {
 784     die "Error: The value specified, $Options{mindistance}, for option \"--MinDistance\" is not valid. Allowed values: >= 0 \n";
 785   }
 786   if (!IsPositiveInteger($Options{maxdistance})) {
 787     die "Error: The value specified, $Options{maxdistance}, for option \"--MaxDistance\" is not valid. Allowed values: > 0 \n";
 788   }
 789   if ($Options{mindistance} > $Options{maxdistance}) {
 790     die "Error: The value specified, specified, $Options{mindistance}, for option \"--MinDistance\" must be less than the value specified, $Options{maxdistance}, for option \"--MaxDistance\" \n";
 791   }
 792   if ($Options{normalizationmethodology} !~ /^(None|ByHeavyAtomsCount|ByAtomTypesCount)$/i) {
 793     die "Error: The value specified, $Options{normalizationmethodology}, for option \"--NormalizationMethodology\" is not valid. Allowed values: None, ByHeavyAtomsCount, or ByAtomTypesCount\n";
 794   }
 795   if ($Options{output} !~ /^(SD|FP|text|all)$/i) {
 796     die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, FP, text, or all\n";
 797   }
 798   if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
 799     die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
 800   }
 801   if ($Options{quote} !~ /^(Yes|No)$/i) {
 802     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n";
 803   }
 804   if ($Options{outdelim} =~ /semicolon/i && $Options{quote} =~ /^No$/i) {
 805     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not allowed with, semicolon value of \"--outdelim\" option: Fingerprints string use semicolon as delimiter for various data fields and must be quoted.\n";
 806   }
 807   if (!IsPositiveInteger($Options{valuesprecision})) {
 808     die "Error: The value specified, $Options{valuesprecision}, for option \"--ValuesPrecision\" is not valid. Allowed values: > 0 \n";
 809   }
 810   if ($Options{vectorstringformat} && $Options{vectorstringformat} !~ /^(ValuesString|IDsAndValuesString|IDsAndValuesPairsString|ValuesAndIDsString|ValuesAndIDsPairsString)$/i) {
 811     die "Error: The value specified, $Options{vectorstringformat}, for option \"-v, --VectorStringFormat\" is not valid. Allowed values: ValuesString, IDsAndValuesString, IDsAndValuesPairsString, ValuesAndIDsString or ValuesAndIDsPairsString\n";
 812   }
 813 }
 814