MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: EStateIndiciesFingerprints.pl,v $
   4 # $Date: 2015/02/28 20:46:19 $
   5 # $Revision: 1.23 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2015 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability or fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use strict;
  30 use FindBin; use lib "$FindBin::Bin/../lib";
  31 use Getopt::Long;
  32 use File::Basename;
  33 use Text::ParseWords;
  34 use Benchmark;
  35 use FileUtil;
  36 use TextUtil;
  37 use SDFileUtil;
  38 use MoleculeFileIO;
  39 use FileIO::FingerprintsSDFileIO;
  40 use FileIO::FingerprintsTextFileIO;
  41 use FileIO::FingerprintsFPFileIO;
  42 use AtomTypes::EStateAtomTypes;
  43 use Fingerprints::EStateIndiciesFingerprints;
  44 
  45 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  46 
  47 # Autoflush STDOUT
  48 $| = 1;
  49 
  50 # Starting message...
  51 $ScriptName = basename($0);
  52 print "\n$ScriptName: Starting...\n\n";
  53 $StartTime = new Benchmark;
  54 
  55 # Get the options and setup script...
  56 SetupScriptUsage();
  57 if ($Options{help} || @ARGV < 1) {
  58   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  59 }
  60 
  61 my(@SDFilesList);
  62 @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");
  63 
  64 # Process options...
  65 print "Processing options...\n";
  66 my(%OptionsInfo);
  67 ProcessOptions();
  68 
  69 # Setup information about input files...
  70 print "Checking input SD file(s)...\n";
  71 my(%SDFilesInfo);
  72 RetrieveSDFilesInfo();
  73 
  74 # Process input files..
  75 my($FileIndex);
  76 if (@SDFilesList > 1) {
  77   print "\nProcessing SD files...\n";
  78 }
  79 for $FileIndex (0 .. $#SDFilesList) {
  80   if ($SDFilesInfo{FileOkay}[$FileIndex]) {
  81     print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  82     GenerateEStateIndiciesFingerprints($FileIndex);
  83   }
  84 }
  85 print "\n$ScriptName:Done...\n\n";
  86 
  87 $EndTime = new Benchmark;
  88 $TotalTime = timediff ($EndTime, $StartTime);
  89 print "Total time: ", timestr($TotalTime), "\n";
  90 
  91 ###############################################################################
  92 
  93 # Generate fingerprints for a SD file...
  94 #
  95 sub GenerateEStateIndiciesFingerprints {
  96   my($FileIndex) = @_;
  97   my($CmpdCount, $IgnoredCmpdCount, $SDFile, $MoleculeFileIO, $Molecule, $EStateIndiciesFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
  98 
  99   $SDFile = $SDFilesList[$FileIndex];
 100 
 101   # Setup output files...
 102   #
 103   ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = SetupAndOpenOutputFiles($FileIndex);
 104 
 105   $MoleculeFileIO = new MoleculeFileIO('Name' => $SDFile);
 106   $MoleculeFileIO->Open();
 107 
 108   $CmpdCount = 0;
 109   $IgnoredCmpdCount = 0;
 110 
 111   COMPOUND: while ($Molecule = $MoleculeFileIO->ReadMolecule()) {
 112     $CmpdCount++;
 113 
 114     # Filter compound data before calculating fingerprints...
 115     if ($OptionsInfo{Filter}) {
 116       if (CheckAndFilterCompound($CmpdCount, $Molecule)) {
 117         $IgnoredCmpdCount++;
 118         next COMPOUND;
 119       }
 120     }
 121 
 122     $EStateIndiciesFingerprints = GenerateMoleculeFingerprints($Molecule);
 123     if (!$EStateIndiciesFingerprints) {
 124       $IgnoredCmpdCount++;
 125       ProcessIgnoredCompound('FingerprintsGenerationFailed', $CmpdCount, $Molecule);
 126       next COMPOUND;
 127     }
 128 
 129     WriteDataToOutputFiles($FileIndex, $CmpdCount, $Molecule, $EStateIndiciesFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
 130   }
 131   $MoleculeFileIO->Close();
 132 
 133   if ($NewFPSDFileIO) {
 134     $NewFPSDFileIO->Close();
 135   }
 136   if ($NewFPTextFileIO) {
 137     $NewFPTextFileIO->Close();
 138   }
 139   if ($NewFPFileIO) {
 140     $NewFPFileIO->Close();
 141   }
 142 
 143   WriteFingerprintsGenerationSummaryStatistics($CmpdCount, $IgnoredCmpdCount);
 144 }
 145 
 146 # Process compound being ignored due to problems in fingerprints geneation...
 147 #
 148 sub ProcessIgnoredCompound {
 149   my($Mode, $CmpdCount, $Molecule) = @_;
 150   my($CmpdID, $DataFieldLabelAndValuesRef);
 151 
 152   $DataFieldLabelAndValuesRef = $Molecule->GetDataFieldLabelAndValues();
 153   $CmpdID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
 154 
 155   MODE: {
 156     if ($Mode =~ /^ContainsNonElementalData$/i) {
 157       warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains atom data corresponding to non-elemental atom symbol(s)...\n\n";
 158       next MODE;
 159     }
 160 
 161     if ($Mode =~ /^ContainsNoElementalData$/i) {
 162       warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains no atom data...\n\n";
 163       next MODE;
 164     }
 165 
 166     if ($Mode =~ /^FingerprintsGenerationFailed$/i) {
 167       warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Fingerprints generation didn't succeed...\n\n";
 168       next MODE;
 169     }
 170     warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Fingerprints generation didn't succeed...\n\n";
 171   }
 172 }
 173 
 174 # Check and filter compounds....
 175 #
 176 sub CheckAndFilterCompound {
 177   my($CmpdCount, $Molecule) = @_;
 178   my($ElementCount, $NonElementCount);
 179 
 180   ($ElementCount, $NonElementCount) = $Molecule->GetNumOfElementsAndNonElements();
 181 
 182   if ($NonElementCount) {
 183     ProcessIgnoredCompound('ContainsNonElementalData', $CmpdCount, $Molecule);
 184     return 1;
 185   }
 186 
 187   if (!$ElementCount) {
 188     ProcessIgnoredCompound('ContainsNoElementalData', $CmpdCount, $Molecule);
 189     return 1;
 190   }
 191 
 192   return 0;
 193 }
 194 
 195 # Write out compounds fingerprints generation summary statistics...
 196 #
 197 sub WriteFingerprintsGenerationSummaryStatistics {
 198   my($CmpdCount, $IgnoredCmpdCount) = @_;
 199   my($ProcessedCmpdCount);
 200 
 201   $ProcessedCmpdCount = $CmpdCount - $IgnoredCmpdCount;
 202 
 203   print "\nNumber of compounds: $CmpdCount\n";
 204   print "Number of compounds processed successfully during fingerprints generation: $ProcessedCmpdCount\n";
 205   print "Number of compounds ignored during fingerprints generation: $IgnoredCmpdCount\n";
 206 }
 207 
 208 # Open output files...
 209 #
 210 sub SetupAndOpenOutputFiles {
 211   my($FileIndex) = @_;
 212   my($NewFPSDFile, $NewFPFile, $NewFPTextFile, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO, %FingerprintsFileIOParams);
 213 
 214   ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = (undef) x 3;
 215 
 216   # Setup common parameters for fingerprints file IO objects...
 217   #
 218   %FingerprintsFileIOParams = ();
 219   %FingerprintsFileIOParams = ('Mode' => 'Write', 'Overwrite' => $OptionsInfo{OverwriteFiles}, 'FingerprintsStringMode' => 'FingerprintsVectorString', 'VectorStringFormat' => $OptionsInfo{VectorStringFormat});
 220 
 221   if ($OptionsInfo{SDOutput}) {
 222     $NewFPSDFile = $SDFilesInfo{SDOutFileNames}[$FileIndex];
 223     print "Generating SD file $NewFPSDFile...\n";
 224     $NewFPSDFileIO = new FileIO::FingerprintsSDFileIO('Name' => $NewFPSDFile, %FingerprintsFileIOParams, 'FingerprintsFieldLabel' => $OptionsInfo{FingerprintsLabel});
 225     $NewFPSDFileIO->Open();
 226   }
 227 
 228   if ($OptionsInfo{FPOutput}) {
 229     $NewFPFile = $SDFilesInfo{FPOutFileNames}[$FileIndex];
 230     print "Generating FP file $NewFPFile...\n";
 231     $NewFPFileIO = new FileIO::FingerprintsFPFileIO('Name' => $NewFPFile, %FingerprintsFileIOParams);
 232     $NewFPFileIO->Open();
 233   }
 234 
 235   if ($OptionsInfo{TextOutput}) {
 236     my($ColLabelsRef);
 237 
 238     $NewFPTextFile = $SDFilesInfo{TextOutFileNames}[$FileIndex];
 239     $ColLabelsRef = SetupFPTextFileCoulmnLabels($FileIndex);
 240 
 241     print "Generating text file $NewFPTextFile...\n";
 242     $NewFPTextFileIO = new FileIO::FingerprintsTextFileIO('Name' => $NewFPTextFile, %FingerprintsFileIOParams, 'DataColLabels' => $ColLabelsRef, 'OutDelim' => $OptionsInfo{OutDelim}, 'OutQuote' => $OptionsInfo{OutQuote});
 243     $NewFPTextFileIO->Open();
 244   }
 245 
 246   return ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
 247 }
 248 
 249 # Write fingerpritns and other data to appropriate output files...
 250 #
 251 sub WriteDataToOutputFiles {
 252   my($FileIndex, $CmpdCount, $Molecule, $EStateIndiciesFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = @_;
 253   my($DataFieldLabelAndValuesRef);
 254 
 255   $DataFieldLabelAndValuesRef = undef;
 256   if ($NewFPTextFileIO || $NewFPFileIO) {
 257     $DataFieldLabelAndValuesRef = $Molecule->GetDataFieldLabelAndValues();
 258   }
 259 
 260   if ($NewFPSDFileIO) {
 261     my($CmpdString);
 262 
 263     $CmpdString = $Molecule->GetInputMoleculeString();
 264     $NewFPSDFileIO->WriteFingerprints($EStateIndiciesFingerprints, $CmpdString);
 265   }
 266 
 267   if ($NewFPTextFileIO) {
 268     my($ColValuesRef);
 269 
 270     $ColValuesRef = SetupFPTextFileCoulmnValues($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
 271     $NewFPTextFileIO->WriteFingerprints($EStateIndiciesFingerprints, $ColValuesRef);
 272   }
 273 
 274   if ($NewFPFileIO) {
 275     my($CompoundID);
 276 
 277     $CompoundID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
 278     $NewFPFileIO->WriteFingerprints($EStateIndiciesFingerprints, $CompoundID);
 279   }
 280 
 281 }
 282 
 283 # Generate approriate column labels for FPText output file...
 284 #
 285 sub SetupFPTextFileCoulmnLabels {
 286   my($FileIndex) = @_;
 287   my($Line, @ColLabels);
 288 
 289   @ColLabels = ();
 290   if ($OptionsInfo{DataFieldsMode} =~ /^All$/i) {
 291     push @ColLabels, @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
 292   }
 293   elsif ($OptionsInfo{DataFieldsMode} =~ /^Common$/i) {
 294     push @ColLabels, @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
 295   }
 296   elsif ($OptionsInfo{DataFieldsMode} =~ /^Specify$/i) {
 297     push @ColLabels, @{$OptionsInfo{SpecifiedDataFields}};
 298   }
 299   elsif ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) {
 300     push @ColLabels, $OptionsInfo{CompoundIDLabel};
 301   }
 302   # Add fingerprints label...
 303   push @ColLabels, $OptionsInfo{FingerprintsLabel};
 304 
 305   return \@ColLabels;
 306 }
 307 
 308 # Generate column values FPText output file..
 309 #
 310 sub SetupFPTextFileCoulmnValues {
 311   my($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;
 312   my(@ColValues);
 313 
 314   @ColValues = ();
 315   if ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) {
 316     push @ColValues, SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
 317   }
 318   elsif ($OptionsInfo{DataFieldsMode} =~ /^All$/i) {
 319     @ColValues = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
 320   }
 321   elsif ($OptionsInfo{DataFieldsMode} =~ /^Common$/i) {
 322     @ColValues = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
 323   }
 324   elsif ($OptionsInfo{DataFieldsMode} =~ /^Specify$/i) {
 325     @ColValues = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @{$OptionsInfo{SpecifiedDataFields}};
 326   }
 327 
 328   return \@ColValues;
 329 }
 330 
 331 # Generate compound ID for FP and FPText output files..
 332 #
 333 sub SetupCmpdIDForOutputFiles {
 334   my($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;
 335   my($CmpdID);
 336 
 337   $CmpdID = '';
 338   if ($OptionsInfo{CompoundIDMode} =~ /^MolNameOrLabelPrefix$/i) {
 339     my($MolName);
 340     $MolName = $Molecule->GetName();
 341     $CmpdID = $MolName ? $MolName : "$OptionsInfo{CompoundID}${CmpdCount}";
 342   }
 343   elsif ($OptionsInfo{CompoundIDMode} =~ /^LabelPrefix$/i) {
 344     $CmpdID = "$OptionsInfo{CompoundID}${CmpdCount}";
 345   }
 346   elsif ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i) {
 347     my($SpecifiedDataField);
 348     $SpecifiedDataField = $OptionsInfo{CompoundID};
 349     $CmpdID = exists $DataFieldLabelAndValuesRef->{$SpecifiedDataField} ? $DataFieldLabelAndValuesRef->{$SpecifiedDataField} : '';
 350   }
 351   elsif ($OptionsInfo{CompoundIDMode} =~ /^MolName$/i) {
 352     $CmpdID = $Molecule->GetName();
 353   }
 354   return $CmpdID;
 355 }
 356 
 357 # Generate fingerprints for molecule...
 358 #
 359 sub GenerateMoleculeFingerprints {
 360   my($Molecule) = @_;
 361   my($EStateIndiciesFingerprints);
 362 
 363   if ($OptionsInfo{KeepLargestComponent}) {
 364     $Molecule->KeepLargestComponent();
 365   }
 366   if (!$Molecule->DetectRings()) {
 367     return undef;
 368   }
 369   $Molecule->SetAromaticityModel($OptionsInfo{AromaticityModel});
 370   $Molecule->DetectAromaticity();
 371 
 372   $EStateIndiciesFingerprints = new Fingerprints::EStateIndiciesFingerprints('Molecule' => $Molecule, 'EStateAtomTypesSetToUse' => $OptionsInfo{EStateAtomTypesSetToUse}, 'ValuesPrecision' => $OptionsInfo{ValuesPrecision});
 373 
 374   # Generate E-state indicies fingerprints...
 375   $EStateIndiciesFingerprints->GenerateFingerprints();
 376 
 377   # Make sure E-state indicies fingerprints generation is successful...
 378   if (!$EStateIndiciesFingerprints->IsFingerprintsGenerationSuccessful()) {
 379     return undef;
 380   }
 381 
 382   return $EStateIndiciesFingerprints;
 383 }
 384 
 385 # Retrieve information about SD files...
 386 #
 387 sub RetrieveSDFilesInfo {
 388   my($SDFile, $Index, $FileDir, $FileExt, $FileName, $OutFileRoot, $TextOutFileExt, $SDOutFileExt, $FPOutFileExt, $NewSDFileName, $NewFPFileName, $NewTextFileName, $CheckDataField, $CollectDataFields, $AllDataFieldsRef, $CommonDataFieldsRef);
 389 
 390   %SDFilesInfo = ();
 391   @{$SDFilesInfo{FileOkay}} = ();
 392   @{$SDFilesInfo{OutFileRoot}} = ();
 393   @{$SDFilesInfo{SDOutFileNames}} = ();
 394   @{$SDFilesInfo{FPOutFileNames}} = ();
 395   @{$SDFilesInfo{TextOutFileNames}} = ();
 396   @{$SDFilesInfo{AllDataFieldsRef}} = ();
 397   @{$SDFilesInfo{CommonDataFieldsRef}} = ();
 398 
 399   $CheckDataField = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) && ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i)) ? 1 : 0;
 400   $CollectDataFields = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^(All|Common)$/i)) ? 1 : 0;
 401 
 402   FILELIST: for $Index (0 .. $#SDFilesList) {
 403     $SDFile = $SDFilesList[$Index];
 404 
 405     $SDFilesInfo{FileOkay}[$Index] = 0;
 406     $SDFilesInfo{OutFileRoot}[$Index] = '';
 407     $SDFilesInfo{SDOutFileNames}[$Index] = '';
 408     $SDFilesInfo{FPOutFileNames}[$Index] = '';
 409     $SDFilesInfo{TextOutFileNames}[$Index] = '';
 410 
 411     $SDFile = $SDFilesList[$Index];
 412     if (!(-e $SDFile)) {
 413       warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
 414       next FILELIST;
 415     }
 416     if (!CheckFileType($SDFile, "sd sdf")) {
 417       warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
 418       next FILELIST;
 419     }
 420 
 421     if ($CheckDataField) {
 422       # Make sure data field exists in SD file..
 423       my($CmpdString, $SpecifiedDataField, @CmpdLines, %DataFieldValues);
 424 
 425       @CmpdLines = ();
 426       open SDFILE, "$SDFile" or die "Error: Couldn't open $SDFile: $! \n";
 427       $CmpdString = ReadCmpdString(\*SDFILE);
 428       close SDFILE;
 429       @CmpdLines = split "\n", $CmpdString;
 430       %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 431       $SpecifiedDataField = $OptionsInfo{CompoundID};
 432       if (!exists $DataFieldValues{$SpecifiedDataField}) {
 433         warn "Warning: Ignoring file $SDFile: Data field value, $SpecifiedDataField, using  \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\" doesn't exist\n";
 434         next FILELIST;
 435       }
 436     }
 437 
 438     $AllDataFieldsRef = '';
 439     $CommonDataFieldsRef = '';
 440     if ($CollectDataFields) {
 441       my($CmpdCount);
 442       open SDFILE, "$SDFile" or die "Error: Couldn't open $SDFile: $! \n";
 443       ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = GetAllAndCommonCmpdDataHeaderLabels(\*SDFILE);
 444       close SDFILE;
 445     }
 446 
 447     # Setup output file names...
 448     $FileDir = ""; $FileName = ""; $FileExt = "";
 449     ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
 450 
 451     $TextOutFileExt = "csv";
 452     if ($Options{outdelim} =~ /^tab$/i) {
 453       $TextOutFileExt = "tsv";
 454     }
 455     $SDOutFileExt = $FileExt;
 456     $FPOutFileExt = "fpf";
 457 
 458     if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
 459       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
 460       if ($RootFileName && $RootFileExt) {
 461         $FileName = $RootFileName;
 462       }
 463       else {
 464         $FileName = $OptionsInfo{OutFileRoot};
 465       }
 466       $OutFileRoot = $FileName;
 467     }
 468     else {
 469       $OutFileRoot = "${FileName}EStateIndiciesFP";
 470     }
 471 
 472     $NewSDFileName = "${OutFileRoot}.${SDOutFileExt}";
 473     $NewFPFileName = "${OutFileRoot}.${FPOutFileExt}";
 474     $NewTextFileName = "${OutFileRoot}.${TextOutFileExt}";
 475 
 476     if ($OptionsInfo{SDOutput}) {
 477       if ($SDFile =~ /$NewSDFileName/i) {
 478         warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
 479         print "Specify a different name using \"-r --root\" option or use default name.\n";
 480         next FILELIST;
 481       }
 482     }
 483 
 484     if (!$OptionsInfo{OverwriteFiles}) {
 485       # Check SD and text outout files...
 486       if ($OptionsInfo{SDOutput}) {
 487         if (-e $NewSDFileName) {
 488           warn "Warning: Ignoring file $SDFile: The file $NewSDFileName already exists\n";
 489           next FILELIST;
 490         }
 491       }
 492       if ($OptionsInfo{FPOutput}) {
 493         if (-e $NewFPFileName) {
 494           warn "Warning: Ignoring file $SDFile: The file $NewFPFileName already exists\n";
 495           next FILELIST;
 496         }
 497       }
 498       if ($OptionsInfo{TextOutput}) {
 499         if (-e $NewTextFileName) {
 500           warn "Warning: Ignoring file $SDFile: The file $NewTextFileName already exists\n";
 501           next FILELIST;
 502         }
 503       }
 504     }
 505 
 506     $SDFilesInfo{FileOkay}[$Index] = 1;
 507 
 508     $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
 509     $SDFilesInfo{SDOutFileNames}[$Index] = $NewSDFileName;
 510     $SDFilesInfo{FPOutFileNames}[$Index] = $NewFPFileName;
 511     $SDFilesInfo{TextOutFileNames}[$Index] = $NewTextFileName;
 512 
 513     $SDFilesInfo{AllDataFieldsRef}[$Index] = $AllDataFieldsRef;
 514     $SDFilesInfo{CommonDataFieldsRef}[$Index] = $CommonDataFieldsRef;
 515   }
 516 }
 517 
 518 # Process option values...
 519 sub ProcessOptions {
 520   %OptionsInfo = ();
 521 
 522   $OptionsInfo{AromaticityModel} = $Options{aromaticitymodel};
 523 
 524   $OptionsInfo{EStateAtomTypesSetToUse} = $Options{estateatomtypessettouse} ? $Options{estateatomtypessettouse} : 'ArbitrarySize';
 525 
 526   $OptionsInfo{CompoundIDMode} = $Options{compoundidmode};
 527   $OptionsInfo{CompoundIDLabel} = $Options{compoundidlabel};
 528   $OptionsInfo{DataFieldsMode} = $Options{datafieldsmode};
 529 
 530   my(@SpecifiedDataFields);
 531   @SpecifiedDataFields = ();
 532 
 533   @{$OptionsInfo{SpecifiedDataFields}} = ();
 534   $OptionsInfo{CompoundID} = '';
 535 
 536   if ($Options{datafieldsmode} =~ /^CompoundID$/i) {
 537     if ($Options{compoundidmode} =~ /^DataField$/i) {
 538       if (!$Options{compoundid}) {
 539         die "Error: You must specify a value for \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\". \n";
 540       }
 541       $OptionsInfo{CompoundID} = $Options{compoundid};
 542     }
 543     elsif ($Options{compoundidmode} =~ /^(LabelPrefix|MolNameOrLabelPrefix)$/i) {
 544       $OptionsInfo{CompoundID} = $Options{compoundid} ? $Options{compoundid} : 'Cmpd';
 545     }
 546   }
 547   elsif ($Options{datafieldsmode} =~ /^Specify$/i) {
 548     if (!$Options{datafields}) {
 549       die "Error: You must specify a value for \"--DataFields\" option in \"Specify\" \"-d, --DataFieldsMode\". \n";
 550     }
 551     @SpecifiedDataFields = split /\,/, $Options{datafields};
 552     push @{$OptionsInfo{SpecifiedDataFields}}, @SpecifiedDataFields;
 553   }
 554 
 555   $OptionsInfo{FingerprintsLabel} = $Options{fingerprintslabel} ? $Options{fingerprintslabel} : 'EStateIndiciesFingerprints';
 556 
 557   $OptionsInfo{Filter} = ($Options{filter} =~ /^Yes$/i) ? 1 : 0;
 558 
 559   if ($Options{fingerprintslabelmode} =~ /^FingerprintsLabelWithIDs$/) {
 560     if ($Options{estateatomtypessettouse} =~ /^FixedSize$/i) {
 561       # Append E-state atom types for non-hydrogen atoms to the fingerprints label...
 562       my($AtomType, @IDs);
 563       @IDs = ();
 564       for $AtomType (@{AtomTypes::EStateAtomTypes::GetAllPossibleEStateNonHydrogenAtomTypes()}) {
 565         push @IDs, "S${AtomType}";
 566       }
 567       $OptionsInfo{FingerprintsLabel} .= "; EStateAtomTypes: " . TextUtil::JoinWords(\@IDs, " ", 0);
 568     }
 569   }
 570   $OptionsInfo{FingerprintsLabelMode} = $Options{fingerprintslabelmode};
 571 
 572   $OptionsInfo{KeepLargestComponent} = ($Options{keeplargestcomponent} =~ /^Yes$/i) ? 1 : 0;
 573 
 574   $OptionsInfo{Output} = $Options{output};
 575   $OptionsInfo{SDOutput} = ($Options{output} =~ /^(SD|All)$/i) ? 1 : 0;
 576   $OptionsInfo{FPOutput} = ($Options{output} =~ /^(FP|All)$/i) ? 1 : 0;
 577   $OptionsInfo{TextOutput} = ($Options{output} =~ /^(Text|All)$/i) ? 1 : 0;
 578 
 579   $OptionsInfo{OutDelim} = $Options{outdelim};
 580   $OptionsInfo{OutQuote} = ($Options{quote} =~ /^Yes$/i) ? 1 : 0;
 581 
 582   $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
 583   $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;
 584 
 585   # Precision for E-state indicies...
 586   $OptionsInfo{ValuesPrecision} = $Options{valuesprecision};
 587 
 588   # Setup default vector string format...
 589   my($VectorStringFormat);
 590   $VectorStringFormat = '';
 591   if ($Options{vectorstringformat}) {
 592     $VectorStringFormat = $Options{vectorstringformat};
 593   }
 594   else {
 595     $VectorStringFormat = ($Options{estateatomtypessettouse} =~ /^FixedSize$/) ? "ValuesString" : "IDsAndValuesString";
 596   }
 597   $OptionsInfo{VectorStringFormat} = $VectorStringFormat;
 598 }
 599 
 600 # Setup script usage  and retrieve command line arguments specified using various options...
 601 sub SetupScriptUsage {
 602 
 603   # Retrieve all the options...
 604   %Options = ();
 605 
 606   $Options{aromaticitymodel} = 'MayaChemToolsAromaticityModel';
 607 
 608   $Options{compoundidmode} = 'LabelPrefix';
 609   $Options{compoundidlabel} = 'CompoundID';
 610   $Options{datafieldsmode} = 'CompoundID';
 611 
 612   $Options{filter} = 'Yes';
 613 
 614   $Options{estateatomtypessettouse} = 'ArbitrarySize';
 615 
 616   $Options{fingerprintslabelmode} = 'FingerprintsLabelOnly';
 617   $Options{keeplargestcomponent} = 'Yes';
 618 
 619   $Options{output} = 'text';
 620   $Options{outdelim} = 'comma';
 621   $Options{quote} = 'yes';
 622 
 623   $Options{valuesprecision} = 3;
 624 
 625   $Options{vectorstringformat} = '';
 626 
 627   if (!GetOptions(\%Options, "aromaticitymodel=s", "compoundid=s", "compoundidlabel=s", "compoundidmode=s", "datafields=s", "datafieldsmode|d=s", "estateatomtypessettouse|e=s", "filter|f=s", "fingerprintslabelmode=s", "fingerprintslabel=s",  "help|h", "keeplargestcomponent|k=s", "outdelim=s", "output=s", "overwrite|o", "quote|q=s", "root|r=s", "valuesprecision=s", "vectorstringformat|v=s", "workingdir|w=s")) {
 628     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 629   }
 630   if ($Options{workingdir}) {
 631     if (! -d $Options{workingdir}) {
 632       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 633     }
 634     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 635   }
 636   if (!Molecule::IsSupportedAromaticityModel($Options{aromaticitymodel})) {
 637     my(@SupportedModels) = Molecule::GetSupportedAromaticityModels();
 638     die "Error: The value specified, $Options{aromaticitymodel}, for option \"--AromaticityModel\" is not valid. Supported aromaticity models in current release of MayaChemTools: @SupportedModels\n";
 639   }
 640   if ($Options{compoundidmode} !~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i) {
 641     die "Error: The value specified, $Options{compoundidmode}, for option \"--CompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n";
 642   }
 643   if ($Options{datafieldsmode} !~ /^(All|Common|Specify|CompoundID)$/i) {
 644     die "Error: The value specified, $Options{datafieldsmode}, for option \"-d, --DataFieldsMode\" is not valid. Allowed values: All, Common, Specify or CompoundID\n";
 645   }
 646   if ($Options{estateatomtypessettouse} && $Options{estateatomtypessettouse} !~ /^(ArbitrarySize|FixedSize)$/) {
 647     die "Error: The value specified, $Options{estateatomtypessettouse}, for option \"-e, --EStateAtomTypesSetToUse\" is not valid. Allowed values: ArbitrarySize or FixedSize\n";
 648   }
 649   if ($Options{filter} !~ /^(Yes|No)$/i) {
 650     die "Error: The value specified, $Options{filter}, for option \"-f, --Filter\" is not valid. Allowed values: Yes or No\n";
 651   }
 652   if ($Options{fingerprintslabelmode} !~ /^(FingerprintsLabelOnly|FingerprintsLabelWithIDs)$/i) {
 653     die "Error: The value specified, $Options{fingerprintslabelmode}, for option \"--FingerprintsLabelMode\" is not valid. Allowed values: FingerprintsLabelOnly or FingerprintsLabelWithIDs\n";
 654   }
 655   if ($Options{keeplargestcomponent} !~ /^(Yes|No)$/i) {
 656     die "Error: The value specified, $Options{keeplargestcomponent}, for option \"-k, --KeepLargestComponent\" is not valid. Allowed values: Yes or No\n";
 657   }
 658   if ($Options{output} !~ /^(SD|FP|text|all)$/i) {
 659     die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, FP, text, or all\n";
 660   }
 661   if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
 662     die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
 663   }
 664   if ($Options{quote} !~ /^(Yes|No)$/i) {
 665     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n";
 666   }
 667   if ($Options{outdelim} =~ /semicolon/i && $Options{quote} =~ /^No$/i) {
 668     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not allowed with, semicolon value of \"--outdelim\" option: Fingerprints string use semicolon as delimiter for various data fields and must be quoted.\n";
 669   }
 670   if (!IsPositiveInteger($Options{valuesprecision})) {
 671     die "Error: The value specified, $Options{valuesprecision}, for option \"--ValuesPrecision\" is not valid. Allowed values: > 0 \n";
 672   }
 673   if ($Options{vectorstringformat} && $Options{vectorstringformat} !~ /^(ValuesString|IDsAndValuesString|IDsAndValuesPairsString|ValuesAndIDsString|ValuesAndIDsPairsString)$/i) {
 674     die "Error: The value specified, $Options{vectorstringformat}, for option \"-v, --VectorStringFormat\" is not valid. Allowed values: ValuesString, IDsAndValuesString, IDsAndValuesPairsString, ValuesAndIDsString or ValuesAndIDsPairsString\n";
 675   }
 676 }
 677