MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: MACCSKeysFingerprints.pl,v $
   4 # $Date: 2015/02/28 20:46:20 $
   5 # $Revision: 1.31 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2015 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
   19 # any warranty; without even the implied warranty of merchantability or fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use strict;
  30 use FindBin; use lib "$FindBin::Bin/../lib";
  31 use Getopt::Long;
  32 use File::Basename;
  33 use Text::ParseWords;
  34 use Benchmark;
  35 use FileUtil;
  36 use TextUtil;
  37 use SDFileUtil;
  38 use MoleculeFileIO;
  39 use FileIO::FingerprintsSDFileIO;
  40 use FileIO::FingerprintsTextFileIO;
  41 use FileIO::FingerprintsFPFileIO;
  42 use Fingerprints::MACCSKeys;
  43 
  44 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  45 
  46 # Autoflush STDOUT
  47 $| = 1;
  48 
  49 # Starting message...
  50 $ScriptName = basename($0);
  51 print "\n$ScriptName: Starting...\n\n";
  52 $StartTime = new Benchmark;
  53 
  54 # Get the options and setup script...
  55 SetupScriptUsage();
  56 if ($Options{help} || @ARGV < 1) {
  57   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  58 }
  59 
  60 my(@SDFilesList);
  61 @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");
  62 
  63 # Process options...
  64 print "Processing options...\n";
  65 my(%OptionsInfo);
  66 ProcessOptions();
  67 
  68 # Setup information about input files...
  69 print "Checking input SD file(s)...\n";
  70 my(%SDFilesInfo);
  71 RetrieveSDFilesInfo();
  72 
  73 # Process input files..
  74 my($FileIndex);
  75 if (@SDFilesList > 1) {
  76   print "\nProcessing SD files...\n";
  77 }
  78 for $FileIndex (0 .. $#SDFilesList) {
  79   if ($SDFilesInfo{FileOkay}[$FileIndex]) {
  80     print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  81     GenerateMACCSKeysFingerprints($FileIndex);
  82   }
  83 }
  84 print "\n$ScriptName:Done...\n\n";
  85 
  86 $EndTime = new Benchmark;
  87 $TotalTime = timediff ($EndTime, $StartTime);
  88 print "Total time: ", timestr($TotalTime), "\n";
  89 
  90 ###############################################################################
  91 
  92 # Generate fingerprints for a SD file...
  93 #
  94 sub GenerateMACCSKeysFingerprints {
  95   my($FileIndex) = @_;
  96   my($CmpdCount, $IgnoredCmpdCount, $SDFile, $MoleculeFileIO, $Molecule, $MACCSKeysFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
  97 
  98   $SDFile = $SDFilesList[$FileIndex];
  99 
 100   # Setup output files...
 101   #
 102   ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = SetupAndOpenOutputFiles($FileIndex);
 103 
 104   $MoleculeFileIO = new MoleculeFileIO('Name' => $SDFile);
 105   $MoleculeFileIO->Open();
 106 
 107   $CmpdCount = 0;
 108   $IgnoredCmpdCount = 0;
 109 
 110   COMPOUND: while ($Molecule = $MoleculeFileIO->ReadMolecule()) {
 111     $CmpdCount++;
 112 
 113     # Filter compound data before calculating fingerprints...
 114     if ($OptionsInfo{Filter}) {
 115       if (CheckAndFilterCompound($CmpdCount, $Molecule)) {
 116         $IgnoredCmpdCount++;
 117         next COMPOUND;
 118       }
 119     }
 120 
 121     $MACCSKeysFingerprints = GenerateMoleculeFingerprints($Molecule);
 122     if (!$MACCSKeysFingerprints) {
 123       $IgnoredCmpdCount++;
 124       ProcessIgnoredCompound('FingerprintsGenerationFailed', $CmpdCount, $Molecule);
 125       next COMPOUND;
 126     }
 127 
 128     WriteDataToOutputFiles($FileIndex, $CmpdCount, $Molecule, $MACCSKeysFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
 129   }
 130   $MoleculeFileIO->Close();
 131 
 132   if ($NewFPSDFileIO) {
 133     $NewFPSDFileIO->Close();
 134   }
 135   if ($NewFPTextFileIO) {
 136     $NewFPTextFileIO->Close();
 137   }
 138   if ($NewFPFileIO) {
 139     $NewFPFileIO->Close();
 140   }
 141 
 142   WriteFingerprintsGenerationSummaryStatistics($CmpdCount, $IgnoredCmpdCount);
 143 }
 144 
 145 # Process compound being ignored due to problems in fingerprints geneation...
 146 #
 147 sub ProcessIgnoredCompound {
 148   my($Mode, $CmpdCount, $Molecule) = @_;
 149   my($CmpdID, $DataFieldLabelAndValuesRef);
 150 
 151   $DataFieldLabelAndValuesRef = $Molecule->GetDataFieldLabelAndValues();
 152   $CmpdID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
 153 
 154   MODE: {
 155     if ($Mode =~ /^ContainsNonElementalData$/i) {
 156       warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains atom data corresponding to non-elemental atom symbol(s)...\n\n";
 157       next MODE;
 158     }
 159 
 160     if ($Mode =~ /^ContainsNoElementalData$/i) {
 161       warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains no atom data...\n\n";
 162       next MODE;
 163     }
 164 
 165     if ($Mode =~ /^FingerprintsGenerationFailed$/i) {
 166       warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Fingerprints generation didn't succeed...\n\n";
 167       next MODE;
 168     }
 169     warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Fingerprints generation didn't succeed...\n\n";
 170   }
 171 }
 172 
 173 # Check and filter compounds....
 174 #
 175 sub CheckAndFilterCompound {
 176   my($CmpdCount, $Molecule) = @_;
 177   my($ElementCount, $NonElementCount);
 178 
 179   ($ElementCount, $NonElementCount) = $Molecule->GetNumOfElementsAndNonElements();
 180 
 181   if ($NonElementCount) {
 182     ProcessIgnoredCompound('ContainsNonElementalData', $CmpdCount, $Molecule);
 183     return 1;
 184   }
 185 
 186   if (!$ElementCount) {
 187     ProcessIgnoredCompound('ContainsNoElementalData', $CmpdCount, $Molecule);
 188     return 1;
 189   }
 190 
 191   return 0;
 192 }
 193 
 194 # Write out compounds fingerprints generation summary statistics...
 195 #
 196 sub WriteFingerprintsGenerationSummaryStatistics {
 197   my($CmpdCount, $IgnoredCmpdCount) = @_;
 198   my($ProcessedCmpdCount);
 199 
 200   $ProcessedCmpdCount = $CmpdCount - $IgnoredCmpdCount;
 201 
 202   print "\nNumber of compounds: $CmpdCount\n";
 203   print "Number of compounds processed successfully during fingerprints generation: $ProcessedCmpdCount\n";
 204   print "Number of compounds ignored during fingerprints generation: $IgnoredCmpdCount\n";
 205 }
 206 
 207 # Open output files...
 208 #
 209 sub SetupAndOpenOutputFiles {
 210   my($FileIndex) = @_;
 211   my($NewFPSDFile, $NewFPFile, $NewFPTextFile, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO, %FingerprintsFileIOParams);
 212 
 213   ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = (undef) x 3;
 214 
 215   # Setup common parameters for fingerprints file IO objects...
 216   #
 217   %FingerprintsFileIOParams = ();
 218   if ($OptionsInfo{Mode} =~ /^MACCSKeyBits$/i) {
 219     %FingerprintsFileIOParams = ('Mode' => 'Write', 'Overwrite' => $OptionsInfo{OverwriteFiles}, 'FingerprintsStringMode' => 'FingerprintsBitVectorString', 'BitStringFormat' => $OptionsInfo{BitStringFormat}, 'BitsOrder' => $OptionsInfo{BitsOrder});
 220   }
 221   elsif ($OptionsInfo{Mode} =~ /^MACCSKeyCount$/i) {
 222     %FingerprintsFileIOParams = ('Mode' => 'Write', 'Overwrite' => $OptionsInfo{OverwriteFiles}, 'FingerprintsStringMode' => 'FingerprintsVectorString', 'VectorStringFormat' => $OptionsInfo{VectorStringFormat});
 223   }
 224 
 225   if ($OptionsInfo{SDOutput}) {
 226     $NewFPSDFile = $SDFilesInfo{SDOutFileNames}[$FileIndex];
 227     print "Generating SD file $NewFPSDFile...\n";
 228     $NewFPSDFileIO = new FileIO::FingerprintsSDFileIO('Name' => $NewFPSDFile, %FingerprintsFileIOParams, 'FingerprintsFieldLabel' => $OptionsInfo{FingerprintsLabel});
 229     $NewFPSDFileIO->Open();
 230   }
 231 
 232   if ($OptionsInfo{FPOutput}) {
 233     $NewFPFile = $SDFilesInfo{FPOutFileNames}[$FileIndex];
 234     print "Generating FP file $NewFPFile...\n";
 235     $NewFPFileIO = new FileIO::FingerprintsFPFileIO('Name' => $NewFPFile, %FingerprintsFileIOParams);
 236     $NewFPFileIO->Open();
 237   }
 238 
 239   if ($OptionsInfo{TextOutput}) {
 240     my($ColLabelsRef);
 241 
 242     $NewFPTextFile = $SDFilesInfo{TextOutFileNames}[$FileIndex];
 243     $ColLabelsRef = SetupFPTextFileCoulmnLabels($FileIndex);
 244 
 245     print "Generating text file $NewFPTextFile...\n";
 246     $NewFPTextFileIO = new FileIO::FingerprintsTextFileIO('Name' => $NewFPTextFile, %FingerprintsFileIOParams, 'DataColLabels' => $ColLabelsRef, 'OutDelim' => $OptionsInfo{OutDelim}, 'OutQuote' => $OptionsInfo{OutQuote});
 247     $NewFPTextFileIO->Open();
 248   }
 249 
 250   return ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
 251 }
 252 
 253 # Write fingerpritns and other data to appropriate output files...
 254 #
 255 sub WriteDataToOutputFiles {
 256   my($FileIndex, $CmpdCount, $Molecule, $MACCSKeysFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = @_;
 257   my($DataFieldLabelAndValuesRef);
 258 
 259   $DataFieldLabelAndValuesRef = undef;
 260   if ($NewFPTextFileIO || $NewFPFileIO) {
 261     $DataFieldLabelAndValuesRef = $Molecule->GetDataFieldLabelAndValues();
 262   }
 263 
 264   if ($NewFPSDFileIO) {
 265     my($CmpdString);
 266 
 267     $CmpdString = $Molecule->GetInputMoleculeString();
 268     $NewFPSDFileIO->WriteFingerprints($MACCSKeysFingerprints, $CmpdString);
 269   }
 270 
 271   if ($NewFPTextFileIO) {
 272     my($ColValuesRef);
 273 
 274     $ColValuesRef = SetupFPTextFileCoulmnValues($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
 275     $NewFPTextFileIO->WriteFingerprints($MACCSKeysFingerprints, $ColValuesRef);
 276   }
 277 
 278   if ($NewFPFileIO) {
 279     my($CompoundID);
 280 
 281     $CompoundID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
 282     $NewFPFileIO->WriteFingerprints($MACCSKeysFingerprints, $CompoundID);
 283   }
 284 }
 285 
 286 # Generate approriate column labels for FPText output file...
 287 #
 288 sub SetupFPTextFileCoulmnLabels {
 289   my($FileIndex) = @_;
 290   my($Line, @ColLabels);
 291 
 292   @ColLabels = ();
 293   if ($OptionsInfo{DataFieldsMode} =~ /^All$/i) {
 294     push @ColLabels, @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
 295   }
 296   elsif ($OptionsInfo{DataFieldsMode} =~ /^Common$/i) {
 297     push @ColLabels, @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
 298   }
 299   elsif ($OptionsInfo{DataFieldsMode} =~ /^Specify$/i) {
 300     push @ColLabels, @{$OptionsInfo{SpecifiedDataFields}};
 301   }
 302   elsif ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) {
 303     push @ColLabels, $OptionsInfo{CompoundIDLabel};
 304   }
 305   # Add fingerprints label...
 306   push @ColLabels, $OptionsInfo{FingerprintsLabel};
 307 
 308   return \@ColLabels;
 309 }
 310 
 311 # Generate column values FPText output file..
 312 #
 313 sub SetupFPTextFileCoulmnValues {
 314   my($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;
 315   my(@ColValues);
 316 
 317   @ColValues = ();
 318   if ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) {
 319     push @ColValues, SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
 320   }
 321   elsif ($OptionsInfo{DataFieldsMode} =~ /^All$/i) {
 322     @ColValues = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
 323   }
 324   elsif ($OptionsInfo{DataFieldsMode} =~ /^Common$/i) {
 325     @ColValues = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
 326   }
 327   elsif ($OptionsInfo{DataFieldsMode} =~ /^Specify$/i) {
 328     @ColValues = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @{$OptionsInfo{SpecifiedDataFields}};
 329   }
 330 
 331   return \@ColValues;
 332 }
 333 
 334 # Generate compound ID for FP and FPText output files..
 335 #
 336 sub SetupCmpdIDForOutputFiles {
 337   my($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;
 338   my($CmpdID);
 339 
 340   $CmpdID = '';
 341   if ($OptionsInfo{CompoundIDMode} =~ /^MolNameOrLabelPrefix$/i) {
 342     my($MolName);
 343     $MolName = $Molecule->GetName();
 344     $CmpdID = $MolName ? $MolName : "$OptionsInfo{CompoundID}${CmpdCount}";
 345   }
 346   elsif ($OptionsInfo{CompoundIDMode} =~ /^LabelPrefix$/i) {
 347     $CmpdID = "$OptionsInfo{CompoundID}${CmpdCount}";
 348   }
 349   elsif ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i) {
 350     my($SpecifiedDataField);
 351     $SpecifiedDataField = $OptionsInfo{CompoundID};
 352     $CmpdID = exists $DataFieldLabelAndValuesRef->{$SpecifiedDataField} ? $DataFieldLabelAndValuesRef->{$SpecifiedDataField} : '';
 353   }
 354   elsif ($OptionsInfo{CompoundIDMode} =~ /^MolName$/i) {
 355     $CmpdID = $Molecule->GetName();
 356   }
 357   return $CmpdID;
 358 }
 359 
 360 # Generate fingerprints for molecule...
 361 #
 362 sub GenerateMoleculeFingerprints {
 363   my($Molecule) = @_;
 364   my($MACCSKeysFingerprints);
 365 
 366   if ($OptionsInfo{KeepLargestComponent}) {
 367     $Molecule->KeepLargestComponent();
 368   }
 369   if (!$Molecule->DetectRings()) {
 370     return undef;
 371   }
 372   $Molecule->SetAromaticityModel($OptionsInfo{AromaticityModel});
 373   $Molecule->DetectAromaticity();
 374 
 375   $MACCSKeysFingerprints = undef;
 376   if ($OptionsInfo{Mode} =~ /^MACCSKeyBits$/i) {
 377     $MACCSKeysFingerprints = new Fingerprints::MACCSKeys('Molecule' => $Molecule, 'Type' => 'MACCSKeyBits', 'Size' => $OptionsInfo{Size});
 378   }
 379   elsif ($OptionsInfo{Mode} =~ /^MACCSKeyCount$/i) {
 380     $MACCSKeysFingerprints = new Fingerprints::MACCSKeys('Molecule' => $Molecule, 'Type' => 'MACCSKeyCount', 'Size' => $OptionsInfo{Size});
 381   }
 382   else {
 383     die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: MACCSKeyBits or MACCSKeyCount\n";
 384   }
 385   $MACCSKeysFingerprints->GenerateMACCSKeys();
 386 
 387   return $MACCSKeysFingerprints;
 388 }
 389 
 390 # Retrieve information about SD files...
 391 #
 392 sub RetrieveSDFilesInfo {
 393   my($SDFile, $Index, $FileDir, $FileExt, $FileName, $OutFileRoot, $TextOutFileExt, $SDOutFileExt, $FPOutFileExt, $NewSDFileName, $NewFPFileName, $NewTextFileName, $CheckDataField, $CollectDataFields, $AllDataFieldsRef, $CommonDataFieldsRef);
 394 
 395   %SDFilesInfo = ();
 396   @{$SDFilesInfo{FileOkay}} = ();
 397   @{$SDFilesInfo{OutFileRoot}} = ();
 398   @{$SDFilesInfo{SDOutFileNames}} = ();
 399   @{$SDFilesInfo{FPOutFileNames}} = ();
 400   @{$SDFilesInfo{TextOutFileNames}} = ();
 401   @{$SDFilesInfo{AllDataFieldsRef}} = ();
 402   @{$SDFilesInfo{CommonDataFieldsRef}} = ();
 403 
 404   $CheckDataField = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) && ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i)) ? 1 : 0;
 405   $CollectDataFields = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^(All|Common)$/i)) ? 1 : 0;
 406 
 407   FILELIST: for $Index (0 .. $#SDFilesList) {
 408     $SDFile = $SDFilesList[$Index];
 409 
 410     $SDFilesInfo{FileOkay}[$Index] = 0;
 411     $SDFilesInfo{OutFileRoot}[$Index] = '';
 412     $SDFilesInfo{SDOutFileNames}[$Index] = '';
 413     $SDFilesInfo{FPOutFileNames}[$Index] = '';
 414     $SDFilesInfo{TextOutFileNames}[$Index] = '';
 415 
 416     $SDFile = $SDFilesList[$Index];
 417     if (!(-e $SDFile)) {
 418       warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
 419       next FILELIST;
 420     }
 421     if (!CheckFileType($SDFile, "sd sdf")) {
 422       warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
 423       next FILELIST;
 424     }
 425 
 426     if ($CheckDataField) {
 427       # Make sure data field exists in SD file..
 428       my($CmpdString, $SpecifiedDataField, @CmpdLines, %DataFieldValues);
 429 
 430       @CmpdLines = ();
 431       open SDFILE, "$SDFile" or die "Error: Couldn't open $SDFile: $! \n";
 432       $CmpdString = ReadCmpdString(\*SDFILE);
 433       close SDFILE;
 434       @CmpdLines = split "\n", $CmpdString;
 435       %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 436       $SpecifiedDataField = $OptionsInfo{CompoundID};
 437       if (!exists $DataFieldValues{$SpecifiedDataField}) {
 438         warn "Warning: Ignoring file $SDFile: Data field value, $SpecifiedDataField, using  \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\" doesn't exist\n";
 439         next FILELIST;
 440       }
 441     }
 442 
 443     $AllDataFieldsRef = '';
 444     $CommonDataFieldsRef = '';
 445     if ($CollectDataFields) {
 446       my($CmpdCount);
 447       open SDFILE, "$SDFile" or die "Error: Couldn't open $SDFile: $! \n";
 448       ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = GetAllAndCommonCmpdDataHeaderLabels(\*SDFILE);
 449       close SDFILE;
 450     }
 451 
 452     # Setup output file names...
 453     $FileDir = ""; $FileName = ""; $FileExt = "";
 454     ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
 455 
 456     $TextOutFileExt = "csv";
 457     if ($Options{outdelim} =~ /^tab$/i) {
 458       $TextOutFileExt = "tsv";
 459     }
 460     $SDOutFileExt = $FileExt;
 461     $FPOutFileExt = "fpf";
 462 
 463     if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
 464       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
 465       if ($RootFileName && $RootFileExt) {
 466         $FileName = $RootFileName;
 467       }
 468       else {
 469         $FileName = $OptionsInfo{OutFileRoot};
 470       }
 471       $OutFileRoot = $FileName;
 472     }
 473     else {
 474       $OutFileRoot = "${FileName}MACCSKeysFP";
 475     }
 476 
 477     $NewSDFileName = "${OutFileRoot}.${SDOutFileExt}";
 478     $NewFPFileName = "${OutFileRoot}.${FPOutFileExt}";
 479     $NewTextFileName = "${OutFileRoot}.${TextOutFileExt}";
 480 
 481     if ($OptionsInfo{SDOutput}) {
 482       if ($SDFile =~ /$NewSDFileName/i) {
 483         warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
 484         print "Specify a different name using \"-r --root\" option or use default name.\n";
 485         next FILELIST;
 486       }
 487     }
 488 
 489     if (!$OptionsInfo{OverwriteFiles}) {
 490       # Check SD and text outout files...
 491       if ($OptionsInfo{SDOutput}) {
 492         if (-e $NewSDFileName) {
 493           warn "Warning: Ignoring file $SDFile: The file $NewSDFileName already exists\n";
 494           next FILELIST;
 495         }
 496       }
 497       if ($OptionsInfo{FPOutput}) {
 498         if (-e $NewFPFileName) {
 499           warn "Warning: Ignoring file $SDFile: The file $NewFPFileName already exists\n";
 500           next FILELIST;
 501         }
 502       }
 503       if ($OptionsInfo{TextOutput}) {
 504         if (-e $NewTextFileName) {
 505           warn "Warning: Ignoring file $SDFile: The file $NewTextFileName already exists\n";
 506           next FILELIST;
 507         }
 508       }
 509     }
 510 
 511     $SDFilesInfo{FileOkay}[$Index] = 1;
 512 
 513     $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
 514     $SDFilesInfo{SDOutFileNames}[$Index] = $NewSDFileName;
 515     $SDFilesInfo{FPOutFileNames}[$Index] = $NewFPFileName;
 516     $SDFilesInfo{TextOutFileNames}[$Index] = $NewTextFileName;
 517 
 518     $SDFilesInfo{AllDataFieldsRef}[$Index] = $AllDataFieldsRef;
 519     $SDFilesInfo{CommonDataFieldsRef}[$Index] = $CommonDataFieldsRef;
 520   }
 521 }
 522 
 523 # Process option values...
 524 sub ProcessOptions {
 525   %OptionsInfo = ();
 526 
 527   $OptionsInfo{Mode} = $Options{mode};
 528   $OptionsInfo{AromaticityModel} = $Options{aromaticitymodel};
 529 
 530   $OptionsInfo{BitsOrder} = $Options{bitsorder};
 531   $OptionsInfo{BitStringFormat} = $Options{bitstringformat};
 532 
 533   $OptionsInfo{CompoundIDMode} = $Options{compoundidmode};
 534   $OptionsInfo{CompoundIDLabel} = $Options{compoundidlabel};
 535   $OptionsInfo{DataFieldsMode} = $Options{datafieldsmode};
 536 
 537   $OptionsInfo{Filter} = ($Options{filter} =~ /^Yes$/i) ? 1 : 0;
 538 
 539   my(@SpecifiedDataFields);
 540   @SpecifiedDataFields = ();
 541 
 542   @{$OptionsInfo{SpecifiedDataFields}} = ();
 543   $OptionsInfo{CompoundID} = '';
 544 
 545   if ($Options{datafieldsmode} =~ /^CompoundID$/i) {
 546     if ($Options{compoundidmode} =~ /^DataField$/i) {
 547       if (!$Options{compoundid}) {
 548         die "Error: You must specify a value for \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\". \n";
 549       }
 550       $OptionsInfo{CompoundID} = $Options{compoundid};
 551     }
 552     elsif ($Options{compoundidmode} =~ /^(LabelPrefix|MolNameOrLabelPrefix)$/i) {
 553       $OptionsInfo{CompoundID} = $Options{compoundid} ? $Options{compoundid} : 'Cmpd';
 554     }
 555   }
 556   elsif ($Options{datafieldsmode} =~ /^Specify$/i) {
 557     if (!$Options{datafields}) {
 558       die "Error: You must specify a value for \"--DataFields\" option in \"Specify\" \"-d, --DataFieldsMode\". \n";
 559     }
 560     @SpecifiedDataFields = split /\,/, $Options{datafields};
 561     push @{$OptionsInfo{SpecifiedDataFields}}, @SpecifiedDataFields;
 562   }
 563 
 564   $OptionsInfo{FingerprintsLabel} = $Options{fingerprintslabel} ? $Options{fingerprintslabel} : 'MACCSKeysFingerprints';
 565 
 566   $OptionsInfo{KeepLargestComponent} = ($Options{keeplargestcomponent} =~ /^Yes$/i) ? 1 : 0;
 567 
 568   $OptionsInfo{Output} = $Options{output};
 569   $OptionsInfo{SDOutput} = ($Options{output} =~ /^(SD|All)$/i) ? 1 : 0;
 570   $OptionsInfo{FPOutput} = ($Options{output} =~ /^(FP|All)$/i) ? 1 : 0;
 571   $OptionsInfo{TextOutput} = ($Options{output} =~ /^(Text|All)$/i) ? 1 : 0;
 572 
 573   $OptionsInfo{OutDelim} = $Options{outdelim};
 574   $OptionsInfo{OutQuote} = ($Options{quote} =~ /^Yes$/i) ? 1 : 0;
 575 
 576   $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
 577   $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;
 578 
 579   $OptionsInfo{Size} = $Options{size};
 580 
 581   $OptionsInfo{VectorStringFormat} = $Options{vectorstringformat};
 582 }
 583 
 584 # Setup script usage  and retrieve command line arguments specified using various options...
 585 sub SetupScriptUsage {
 586 
 587   # Retrieve all the options...
 588   %Options = ();
 589 
 590   $Options{aromaticitymodel} = 'MayaChemToolsAromaticityModel';
 591 
 592   $Options{bitsorder} = 'Ascending';
 593   $Options{bitstringformat} = 'BinaryString';
 594 
 595   $Options{compoundidmode} = 'LabelPrefix';
 596   $Options{compoundidlabel} = 'CompoundID';
 597   $Options{datafieldsmode} = 'CompoundID';
 598 
 599   $Options{filter} = 'Yes';
 600 
 601   $Options{keeplargestcomponent} = 'Yes';
 602 
 603   $Options{mode} = 'MACCSKeyBits';
 604 
 605   $Options{output} = 'text';
 606   $Options{outdelim} = 'comma';
 607   $Options{quote} = 'yes';
 608 
 609   $Options{size} = 166;
 610 
 611   $Options{vectorstringformat} = 'ValuesString';
 612 
 613   if (!GetOptions(\%Options, "aromaticitymodel=s", "bitsorder=s", "bitstringformat|b=s", "compoundid=s", "compoundidlabel=s", "compoundidmode=s", "datafields=s", "datafieldsmode|d=s", "filter|f=s", "fingerprintslabel=s",  "help|h", "keeplargestcomponent|k=s", "mode|m=s", "outdelim=s", "output=s", "overwrite|o", "quote|q=s", "root|r=s", "size|s=i", "vectorstringformat|v=s", "workingdir|w=s")) {
 614     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 615   }
 616   if ($Options{workingdir}) {
 617     if (! -d $Options{workingdir}) {
 618       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 619     }
 620     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 621   }
 622   if (!Molecule::IsSupportedAromaticityModel($Options{aromaticitymodel})) {
 623     my(@SupportedModels) = Molecule::GetSupportedAromaticityModels();
 624     die "Error: The value specified, $Options{aromaticitymodel}, for option \"--AromaticityModel\" is not valid. Supported aromaticity models in current release of MayaChemTools: @SupportedModels\n";
 625   }
 626   if ($Options{bitsorder} !~ /^(Ascending|Descending)$/i) {
 627     die "Error: The value specified, $Options{bitsorder}, for option \"--BitsOrder\" is not valid. Allowed values: Ascending or Descending\n";
 628   }
 629   if ($Options{bitstringformat} !~ /^(BinaryString|HexadecimalString)$/i) {
 630     die "Error: The value specified, $Options{bitstringformat}, for option \"-b, --bitstringformat\" is not valid. Allowed values: BinaryString or HexadecimalString\n";
 631   }
 632   if ($Options{compoundidmode} !~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i) {
 633     die "Error: The value specified, $Options{compoundidmode}, for option \"--CompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n";
 634   }
 635   if ($Options{datafieldsmode} !~ /^(All|Common|Specify|CompoundID)$/i) {
 636     die "Error: The value specified, $Options{datafieldsmode}, for option \"-d, --DataFieldsMode\" is not valid. Allowed values: All, Common, Specify or CompoundID\n";
 637   }
 638   if ($Options{filter} !~ /^(Yes|No)$/i) {
 639     die "Error: The value specified, $Options{filter}, for option \"-f, --Filter\" is not valid. Allowed values: Yes or No\n";
 640   }
 641   if ($Options{keeplargestcomponent} !~ /^(Yes|No)$/i) {
 642     die "Error: The value specified, $Options{keeplargestcomponent}, for option \"-k, --KeepLargestComponent\" is not valid. Allowed values: Yes or No\n";
 643   }
 644   if ($Options{mode} !~ /^(MACCSKeyBits|MACCSKeyCount)$/i) {
 645     die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: MACCSKeyBits or MACCSKeyCount\n";
 646   }
 647   if ($Options{output} !~ /^(SD|FP|text|all)$/i) {
 648     die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, FP, text, or all\n";
 649   }
 650   if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
 651     die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
 652   }
 653   if ($Options{quote} !~ /^(Yes|No)$/i) {
 654     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n";
 655   }
 656   if ($Options{outdelim} =~ /semicolon/i && $Options{quote} =~ /^No$/i) {
 657     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not allowed with, semicolon value of \"--outdelim\" option: Fingerprints string use semicolon as delimiter for various data fields and must be quoted.\n";
 658   }
 659   if (!(IsPositiveInteger($Options{size}) && ($Options{size} == 166 || $Options{size} == 322))) {
 660     die "Error: The value specified, $Options{size}, for option \"-s, --size\" is not valid. Allowed values: 166 or 322 \n";
 661   }
 662   if ($Options{vectorstringformat} !~ /^(ValuesString|IDsAndValuesString|IDsAndValuesPairsString|ValuesAndIDsString|ValuesAndIDsPairsString)$/i) {
 663     die "Error: The value specified, $Options{vectorstringformat}, for option \"-v, --VectorStringFormat\" is not valid. Allowed values: ValuesString, IDsAndValuesString, IDsAndValuesPairsString, ValuesAndIDsString or ValuesAndIDsPairsString\n";
 664   }
 665 }
 666