MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: ExtractFromSDFiles.pl,v $
   4 # $Date: 2015/03/22 19:11:27 $
   5 # $Revision: 1.48 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2015 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use strict;
  30 use FindBin; use lib "$FindBin::Bin/../lib";
  31 use Getopt::Long;
  32 use File::Basename;
  33 use Text::ParseWords;
  34 use Benchmark;
  35 use SDFileUtil;
  36 use FileUtil;
  37 use TextUtil;
  38 
  39 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  40 
  41 # Autoflush STDOUT
  42 $| = 1;
  43 
  44 # Starting message...
  45 $ScriptName = basename($0);
  46 print "\n$ScriptName:Starting...\n\n";
  47 $StartTime = new Benchmark;
  48 
  49 # Get the options and setup script...
  50 SetupScriptUsage();
  51 if ($Options{help} || @ARGV < 1) {
  52   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  53 }
  54 
  55 my(@SDFilesList);
  56 @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");
  57 
  58 # Process options...
  59 print "Processing options...\n";
  60 my(%OptionsInfo);
  61 ProcessOptions();
  62 
  63 # Collect information about SD files...
  64 print "Checking input SD file(s)...\n";
  65 my(%SDFilesInfo);
  66 RetrieveSDFilesInfo();
  67 
  68 # Generate output files...
  69 my($FileIndex);
  70 if (@SDFilesList > 1) {
  71   print "\nProcessing SD files...\n";
  72 }
  73 for $FileIndex (0 .. $#SDFilesList) {
  74   if ($SDFilesInfo{FileOkay}[$FileIndex]) {
  75     print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  76     ExtractFromSDFile($FileIndex);
  77   }
  78 }
  79 print "\n$ScriptName:Done...\n\n";
  80 
  81 $EndTime = new Benchmark;
  82 $TotalTime = timediff ($EndTime, $StartTime);
  83 print "Total time: ", timestr($TotalTime), "\n";
  84 
  85 ###############################################################################
  86 
  87 # Extract data from a SD file...
  88 sub ExtractFromSDFile {
  89   my($FileIndex) = @_;
  90 
  91   OpenInputAndOutputFiles($FileIndex);
  92 
  93   MODE: {
  94     if ($OptionsInfo{Mode} =~ /^AllDataFields$/i) {
  95       ExtractAllDataFields($FileIndex);
  96       last MODE;
  97     }
  98     if ($OptionsInfo{Mode} =~ /^CommonDataFields$/i) {
  99       ExtractCommonDataFields($FileIndex);
 100       last MODE;
 101     }
 102     if ($OptionsInfo{Mode} =~ /^DataFields$/i) {
 103       ExtractDataFields($FileIndex);
 104       last MODE;
 105     }
 106     if ($OptionsInfo{Mode} =~ /^(DataFieldByList|DatafieldUniqueByList)$/i) {
 107       ExtractDataFieldByList($FileIndex);
 108       last MODE;
 109     }
 110     if ($OptionsInfo{Mode} =~ /^DataFieldNotByList$/i) {
 111       ExtractDataFieldNotByList($FileIndex);
 112       last MODE;
 113     }
 114     if ($OptionsInfo{Mode} =~ /^DataFieldsByValue$/i) {
 115       ExtractDataFieldsByValue($FileIndex);
 116       last MODE;
 117     }
 118     if ($OptionsInfo{Mode} =~ /^DataFieldsByRegex$/i) {
 119       ExtractDataFieldsByRegex($FileIndex);
 120       last MODE;
 121     }
 122     if ($OptionsInfo{Mode} =~ /^RandomCmpds$/i) {
 123       ExtractRandomCompounds($FileIndex);
 124       last MODE;
 125     }
 126     if ($OptionsInfo{Mode} =~ /^MolNames$/i) {
 127       ExtractMolNames($FileIndex);
 128       last MODE;
 129     }
 130     if ($OptionsInfo{Mode} =~ /^RecordNum$/i) {
 131       ExtractRecordNum($FileIndex);
 132       last MODE;
 133     }
 134     if ($OptionsInfo{Mode} =~ /^RecordNums$/i) {
 135       ExtractRecordNums($FileIndex);
 136       last MODE;
 137     }
 138     if ($OptionsInfo{Mode} =~ /^RecordRange$/i) {
 139       ExtractRecordRange($FileIndex);
 140       last MODE;
 141     }
 142     if ($OptionsInfo{Mode} =~ /^2DCmpdRecords$/i) {
 143       Extract2DCmpdRecords($FileIndex);
 144       last MODE;
 145     }
 146     if ($OptionsInfo{Mode} =~ /^3DCmpdRecords$/i) {
 147       Extract3DCmpdRecords($FileIndex);
 148       last MODE;
 149     }
 150     die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: alldatafields, commondatafields, datafields, datafieldsbyvalue, datafieldbylist, datafielduniquebylist, datafieldnotbylist, molnames, randomcmpds, recordnum, recordnums, recordrange, 2dcmpdrecords, 3dcmpdrecords\n";
 151   }
 152 
 153   CloseInputAndOutputFiles();
 154 }
 155 
 156 # Extract all data fields...
 157 sub ExtractAllDataFields {
 158   my($FileIndex) = @_;
 159   my(@CmpdLines);
 160 
 161   @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
 162   WriteTextFileColLabels();
 163 
 164   while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
 165     @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
 166     %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 167 
 168     SetupDataValues();
 169     WriteTextFileCmpdData();
 170     WriteSDFileCmpdData();
 171   }
 172 }
 173 
 174 # Extract common data fields...
 175 sub ExtractCommonDataFields {
 176   my($FileIndex) = @_;
 177   my(@CmpdLines);
 178 
 179   @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{CommonDataFieldLabels}[$FileIndex]};
 180   WriteTextFileColLabels();
 181 
 182   while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
 183     @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
 184     %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 185 
 186     SetupDataValues();
 187     WriteTextFileCmpdData();
 188     WriteSDFileCmpdData();
 189   }
 190 }
 191 
 192 # Extract specified data fields...
 193 sub ExtractDataFields {
 194   my($FileIndex) = @_;
 195   my(@CmpdLines);
 196 
 197   @{$SDFilesInfo{DataLabels}} = @{$OptionsInfo{SpecifiedDataFieldLabels}};
 198   WriteTextFileColLabels();
 199 
 200   while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
 201     @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
 202     %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 203 
 204     SetupDataValues();
 205     WriteTextFileCmpdData();
 206     WriteSDFileCmpdData();
 207   }
 208 }
 209 
 210 # Extract data fields using a list...
 211 sub ExtractDataFieldByList {
 212   my($FileIndex) = @_;
 213   my($CmpdNum, $Value, $SpecifiedDataFieldValuesFoundCount, $CurrentValue, $SpecifiedDataFieldLabel, @CmpdLines);
 214 
 215   @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
 216   WriteTextFileColLabels();
 217 
 218   for $Value (keys %{$OptionsInfo{SpecifiedDataFieldValues}}) {
 219     $OptionsInfo{SpecifiedDataFieldValues}{$Value} = "NotFound";
 220   }
 221   $SpecifiedDataFieldValuesFoundCount = 0;
 222   $SpecifiedDataFieldLabel = $OptionsInfo{SpecifiedDataFieldLabel};
 223 
 224   CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
 225     $CmpdNum++;
 226 
 227     @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
 228     %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 229 
 230     if (!exists $SDFilesInfo{DataFieldValues}{$SpecifiedDataFieldLabel}) {
 231       next CMPDSTRING;
 232     }
 233 
 234     SetupDataValues();
 235 
 236     $SpecifiedDataFieldLabel = $OptionsInfo{SpecifiedDataFieldLabel};
 237     $CurrentValue = $SDFilesInfo{DataFieldValues}{$SpecifiedDataFieldLabel};
 238 
 239     if (exists $OptionsInfo{SpecifiedDataFieldValues}{$CurrentValue}) {
 240       if ($SpecifiedDataFieldValuesFoundCount < $OptionsInfo{SpecifiedDataFieldValuesCount}) {
 241         if ($OptionsInfo{SpecifiedDataFieldValues}{$CurrentValue} eq "NotFound") {
 242           $SpecifiedDataFieldValuesFoundCount++;
 243           $OptionsInfo{SpecifiedDataFieldValues}{$CurrentValue} = "Found";
 244           if ($OptionsInfo{Mode} =~ /^DataFieldUniqueByList$/i) {
 245             WriteSDFileCmpdString();
 246             WriteTextFileCmpdData();
 247           }
 248         }
 249         if ($OptionsInfo{Mode} =~ /^DataFieldByList$/i) {
 250           WriteSDFileCmpdString();
 251           WriteTextFileCmpdData();
 252         }
 253       }
 254       if ($SpecifiedDataFieldValuesFoundCount >= $OptionsInfo{SpecifiedDataFieldValuesCount}) {
 255         last CMPDSTRING;
 256       }
 257     }
 258   }
 259 }
 260 
 261 # Extract data field whose values are not on the specified list...
 262 sub ExtractDataFieldNotByList {
 263   my($FileIndex) = @_;
 264   my($CurrentValue, $SpecifiedDataFieldLabel, @CmpdLines);
 265 
 266   @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
 267   WriteTextFileColLabels();
 268 
 269   $SpecifiedDataFieldLabel = $OptionsInfo{SpecifiedDataFieldLabel};
 270 
 271   CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
 272     @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
 273     %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 274 
 275     if (!exists $SDFilesInfo{DataFieldValues}{$SpecifiedDataFieldLabel}) {
 276       next CMPDSTRING;
 277     }
 278 
 279     SetupDataValues();
 280 
 281     $CurrentValue = $SDFilesInfo{DataFieldValues}{$SpecifiedDataFieldLabel};
 282 
 283     # Make sure the current value is not empty and is not only specified list of values...
 284     if (IsEmpty($CurrentValue) || exists $OptionsInfo{SpecifiedDataFieldValues}{$CurrentValue}) {
 285       next CMPDSTRING;
 286     }
 287 
 288     WriteSDFileCmpdString();
 289     WriteTextFileCmpdData();
 290   }
 291 }
 292 
 293 # Extract data fields by value...
 294 sub ExtractDataFieldsByValue {
 295   my($FileIndex) = @_;
 296   my($Label, $CurrentValue, $SpecifiedCriterion, $SpecifiedValue, $ViolationCount, $Nothing, @CmpdLines);
 297 
 298   @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
 299   WriteTextFileColLabels();
 300 
 301   CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
 302     @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
 303     %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 304 
 305     SetupDataValues();
 306     $ViolationCount = 0;
 307 
 308     for $Label (@{$OptionsInfo{SpecifiedDataFieldLabels}}) {
 309       if (exists $SDFilesInfo{DataFieldValues}{$Label}) {
 310         $CurrentValue = $SDFilesInfo{DataFieldValues}{$Label};
 311         $SpecifiedCriterion = $OptionsInfo{SpecifiedDataFieldCriteriaMap}{$Label};
 312         $SpecifiedValue = $OptionsInfo{SpecifiedDataFieldValuesMap}{$Label};
 313 
 314         if ($OptionsInfo{NumericalComparison}) {
 315           CRITERION: {
 316               if ($SpecifiedCriterion =~ /^eq$/i) { if ($CurrentValue != $SpecifiedValue) { $ViolationCount++; last CRITERION; } }
 317               if ($SpecifiedCriterion =~ /^le$/i) { if ($CurrentValue > $SpecifiedValue) { $ViolationCount++; last CRITERION; } }
 318               if ($SpecifiedCriterion =~ /^ge$/i) { if ($CurrentValue < $SpecifiedValue) { $ViolationCount++; last CRITERION; } }
 319               $Nothing = 1;
 320             }
 321         }
 322         else {
 323           CRITERION: {
 324               if ($SpecifiedCriterion =~ /^eq$/i) { if ($CurrentValue ne $SpecifiedValue) { $ViolationCount++; last CRITERION; } }
 325               if ($SpecifiedCriterion =~ /^le$/i) { if ($CurrentValue gt $SpecifiedValue) { $ViolationCount++; last CRITERION; } }
 326               if ($SpecifiedCriterion =~ /^ge$/i) { if ($CurrentValue lt $SpecifiedValue) { $ViolationCount++; last CRITERION; } }
 327               $Nothing = 1;
 328             }
 329         }
 330       }
 331     }
 332     if ($ViolationCount <= $OptionsInfo{Violations}) {
 333       WriteSDFileCmpdString();
 334       WriteTextFileCmpdData();
 335     }
 336   }
 337 }
 338 
 339 # Extract data fields by value using regular expression match...
 340 sub ExtractDataFieldsByRegex {
 341   my($FileIndex) = @_;
 342   my($Label, $CurrentValue, $SpecifiedRegexCriterion, $SpecifiedRegex, $ViolationCount, $Nothing, @CmpdLines);
 343 
 344   @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
 345   WriteTextFileColLabels();
 346 
 347   CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
 348     @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
 349     %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 350 
 351     SetupDataValues();
 352     $ViolationCount = 0;
 353 
 354     for $Label (@{$OptionsInfo{SpecifiedDataFieldLabels}}) {
 355       if (exists $SDFilesInfo{DataFieldValues}{$Label}) {
 356         $CurrentValue = $SDFilesInfo{DataFieldValues}{$Label};
 357            $SpecifiedRegexCriterion = $OptionsInfo{SpecifiedDataFieldRegexCriteriaMap}{$Label};
 358            $SpecifiedRegex = $OptionsInfo{SpecifiedDataFieldRegexMap}{$Label};
 359 
 360         if ($OptionsInfo{RegexIgnoreCase}) {
 361           CRITERION: {
 362                  if ($SpecifiedRegexCriterion =~ /^eq$/i) { if ($CurrentValue !~ /$SpecifiedRegex/i) { $ViolationCount++; last CRITERION; } }
 363                  if ($SpecifiedRegexCriterion =~ /^ne$/i) { if ($CurrentValue =~ /$SpecifiedRegex/i) {  $ViolationCount++; last CRITERION; } }
 364               $Nothing = 1;
 365             }
 366         }
 367         else {
 368           CRITERION: {
 369                  if ($SpecifiedRegexCriterion =~ /^eq$/i) { if ($CurrentValue !~ /$SpecifiedRegex/) { $ViolationCount++; last CRITERION; } }
 370                  if ($SpecifiedRegexCriterion =~ /^ne$/i) { if ($CurrentValue =~ /$SpecifiedRegex/) {  $ViolationCount++; last CRITERION; } }
 371               $Nothing = 1;
 372             }
 373         }
 374       }
 375     }
 376     if ($ViolationCount <= $OptionsInfo{Violations}) {
 377       WriteSDFileCmpdString();
 378       WriteTextFileCmpdData();
 379     }
 380   }
 381 }
 382 
 383 # Extract random compounds...
 384 sub ExtractRandomCompounds {
 385   my($FileIndex) = @_;
 386   my($CmpdNum, $CmpdCount, $RandomCycleCount, $RandomIndex, @CmpdLines, %RandomCmpdIndexMap);
 387 
 388   @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
 389   WriteTextFileColLabels();
 390 
 391   $CmpdCount = $SDFilesInfo{CmpdCount}[$FileIndex];
 392   srand($OptionsInfo{Seed});
 393   $RandomCycleCount = 0;
 394 
 395   %RandomCmpdIndexMap = ();
 396   while ($RandomCycleCount <= $CmpdCount && $RandomCycleCount <= $OptionsInfo{NumOfCmpds}) {
 397     $RandomCycleCount++;
 398     $RandomIndex = int (rand $CmpdCount) + 1;
 399     $RandomCmpdIndexMap{$RandomIndex} = $RandomIndex;
 400   }
 401 
 402   $CmpdNum = 0;
 403   CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
 404     $CmpdNum++;
 405     if (!exists $RandomCmpdIndexMap{$CmpdNum}) {
 406       next CMPDSTRING;
 407     }
 408 
 409     @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
 410 
 411     WriteSDFileCmpdString();
 412 
 413     if ($OptionsInfo{OutputTextFile}) {
 414       %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 415       SetupDataValues();
 416       WriteTextFileCmpdData();
 417     }
 418   }
 419 }
 420 
 421 # Extract mol names...
 422 sub ExtractMolNames {
 423   my($FileIndex) = @_;
 424   my($MolName, $NewTextFileRef, @CmpdLines);
 425 
 426   push @{$SDFilesInfo{DataLabels}}, "MolName";
 427   WriteTextFileColLabels();
 428 
 429   $NewTextFileRef = $SDFilesInfo{NewTextFileRef};
 430   while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
 431     @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
 432     $MolName = QuoteAWord(ParseCmpdMolNameLine($CmpdLines[0]), $OptionsInfo{OutQuote});
 433     print $NewTextFileRef "$MolName\n";
 434   }
 435 }
 436 
 437 # Extract a specific compound record...
 438 sub ExtractRecordNum {
 439   my($FileIndex) = @_;
 440   my($CmpdNum, @CmpdLines);
 441 
 442   @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
 443   WriteTextFileColLabels();
 444 
 445   $CmpdNum = 0;
 446 
 447   CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
 448     $CmpdNum++;
 449     if ($CmpdNum != $OptionsInfo{RecordNum}) {
 450       next CMPDSTRING;
 451     }
 452 
 453     @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
 454     WriteSDFileCmpdString();
 455 
 456     if ($OptionsInfo{OutputTextFile}) {
 457       %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 458       SetupDataValues();
 459       WriteTextFileCmpdData();
 460     }
 461     last CMPDSTRING;
 462   }
 463 }
 464 
 465 # Extract a specific compound records...
 466 sub ExtractRecordNums {
 467   my($FileIndex) = @_;
 468   my($CmpdNum, $CmpdCount, @CmpdLines);
 469 
 470   @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
 471   WriteTextFileColLabels();
 472 
 473   $CmpdNum = 0;
 474   $CmpdCount = 0;
 475 
 476   CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
 477     $CmpdNum++;
 478 
 479     if (exists $OptionsInfo{RecordNums}{$CmpdNum}) {
 480       $CmpdCount++;
 481       @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
 482 
 483       WriteSDFileCmpdString();
 484 
 485       if ($OptionsInfo{OutputTextFile}) {
 486         %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 487         SetupDataValues();
 488         WriteTextFileCmpdData();
 489       }
 490     }
 491     elsif ($CmpdNum > $OptionsInfo{RecordNumsMax} || $CmpdCount >= $OptionsInfo{RecordNumsCount}) {
 492       last CMPDSTRING;
 493     }
 494   }
 495 }
 496 
 497 
 498 # Extract compounds in a specific record range...
 499 sub ExtractRecordRange {
 500   my($FileIndex) = @_;
 501   my($CmpdNum, @CmpdLines);
 502 
 503   @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
 504   WriteTextFileColLabels();
 505 
 506   $CmpdNum = 0;
 507   CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
 508     $CmpdNum++;
 509 
 510     if ($CmpdNum >= $OptionsInfo{StartRecordNum} && $CmpdNum <= $OptionsInfo{EndRecordNum}) {
 511       @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
 512 
 513       WriteSDFileCmpdString();
 514 
 515       if ($OptionsInfo{OutputTextFile}) {
 516         %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 517         SetupDataValues();
 518         WriteTextFileCmpdData();
 519       }
 520     }
 521     elsif ($CmpdNum > $OptionsInfo{EndRecordNum}) {
 522       last CMPDSTRING;
 523     }
 524   }
 525 }
 526 
 527 # Extract 2D compound records...
 528 sub Extract2DCmpdRecords {
 529   my($FileIndex) = @_;
 530   my(@CmpdLines);
 531 
 532   @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
 533   WriteTextFileColLabels();
 534 
 535 
 536   CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
 537     @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
 538     if (!IsCmpd2D(\@CmpdLines)) {
 539       next CMPDSTRING;
 540     }
 541 
 542     WriteSDFileCmpdString();
 543 
 544     if ($OptionsInfo{OutputTextFile}) {
 545       %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 546       SetupDataValues();
 547       WriteTextFileCmpdData();
 548     }
 549   }
 550 }
 551 
 552 # Extract 3D compound records...
 553 sub Extract3DCmpdRecords {
 554   my($FileIndex) = @_;
 555   my(@CmpdLines);
 556 
 557   @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
 558   WriteTextFileColLabels();
 559 
 560 
 561   CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
 562     @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
 563     if (!IsCmpd3D(\@CmpdLines)) {
 564       next CMPDSTRING;
 565     }
 566 
 567     WriteSDFileCmpdString();
 568 
 569     if ($OptionsInfo{OutputTextFile}) {
 570       %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 571       SetupDataValues();
 572       WriteTextFileCmpdData();
 573     }
 574   }
 575 }
 576 
 577 
 578 # Open input and output files...
 579 sub OpenInputAndOutputFiles {
 580   my($FileIndex) = @_;
 581 
 582   $SDFilesInfo{NewTextFileRef} = undef;
 583   $SDFilesInfo{NewSDFileRef} = undef;
 584 
 585   if ($OptionsInfo{OutputTextFile} && $OptionsInfo{OutputSDFile}) {
 586     print "Generating files $SDFilesInfo{NewSDFileName}[$FileIndex] and $SDFilesInfo{NewTextFileName}[$FileIndex]...\n";
 587   }
 588   elsif ($OptionsInfo{OutputSDFile}) {
 589     print "Generating file $SDFilesInfo{NewSDFileName}[$FileIndex]...\n";
 590   }
 591   else {
 592     print "Generating file $SDFilesInfo{NewTextFileName}[$FileIndex]...\n";
 593   }
 594 
 595   if ($OptionsInfo{OutputSDFile}) {
 596     open NEWSDFILE, ">$SDFilesInfo{NewSDFileName}[$FileIndex]" or die "Error: Couldn't open $SDFilesInfo{NewSDFileName}[$FileIndex]: $! \n";
 597     $SDFilesInfo{NewSDFileRef} = \*NEWSDFILE;
 598   }
 599   if ($OptionsInfo{OutputTextFile}) {
 600     open NEWTEXTFILE, ">$SDFilesInfo{NewTextFileName}[$FileIndex]" or die "Error: Couldn't open $SDFilesInfo{NewTextFileName}[$FileIndex]: $! \n";
 601     $SDFilesInfo{NewTextFileRef} = \*NEWTEXTFILE;
 602   }
 603 
 604   open SDFILE, "$SDFilesList[$FileIndex]" or die "Error: Couldn't open $SDFilesList[$FileIndex]: $! \n";
 605   $SDFilesInfo{InputSDFileRef} = \*SDFILE;
 606 
 607 }
 608 
 609 # Close open input and output files...
 610 sub CloseInputAndOutputFiles {
 611   if ($SDFilesInfo{NewSDFileRef}) {
 612     close $SDFilesInfo{NewSDFileRef};
 613   }
 614   if ($SDFilesInfo{NewTextFileRef}) {
 615     close $SDFilesInfo{NewTextFileRef};
 616   }
 617 
 618   if ($SDFilesInfo{InputSDFileRef}) {
 619     close $SDFilesInfo{InputSDFileRef};
 620   }
 621 
 622   $SDFilesInfo{NewTextFileRef} = undef;
 623   $SDFilesInfo{NewSDFileRef} = undef;
 624   $SDFilesInfo{InputSDFileRef} = undef;
 625 }
 626 
 627 # Write out column labels for text file...
 628 sub WriteTextFileColLabels {
 629   my($ColLabelsLine, $NewTextFileRef);
 630 
 631   if (!$OptionsInfo{OutputTextFile}) {
 632     return;
 633   }
 634   $NewTextFileRef = $SDFilesInfo{NewTextFileRef};
 635 
 636   if ($OptionsInfo{OutputStrDataString}) {
 637     # Append structure data string label...
 638     my(@DataLabels);
 639 
 640     @DataLabels = ();
 641     push @DataLabels, @{$SDFilesInfo{DataLabels}};
 642     push @DataLabels, "StructureDataString";
 643 
 644     $ColLabelsLine = JoinWords(\@DataLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 645   }
 646   else {
 647     $ColLabelsLine = JoinWords(\@{$SDFilesInfo{DataLabels}}, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 648   }
 649   print $NewTextFileRef "$ColLabelsLine\n";
 650 }
 651 
 652 # Setup values for data fields...
 653 sub SetupDataValues {
 654   @{$SDFilesInfo{DataValues}} = map { exists $SDFilesInfo{DataFieldValues}{$_} ? $SDFilesInfo{DataFieldValues}{$_} : "" } @{$SDFilesInfo{DataLabels}};
 655 }
 656 
 657 # Write out structure data and specific data fields to SD file...
 658 sub WriteSDFileCmpdData {
 659   my($MolString, $Count, $NewSDFileRef);
 660 
 661   if (!$OptionsInfo{OutputSDFile}) {
 662     return;
 663   }
 664 
 665   $NewSDFileRef = $SDFilesInfo{NewSDFileRef};
 666 
 667   ($MolString) = split "M  END", $SDFilesInfo{CmpdString};
 668   $MolString .= "M  END";
 669   print $NewSDFileRef "$MolString\n";
 670 
 671   for $Count (0 .. $#{$SDFilesInfo{DataLabels}}) {
 672     print $NewSDFileRef ">  <$SDFilesInfo{DataLabels}[$Count]>\n$SDFilesInfo{DataValues}[$Count]\n\n";
 673   }
 674   print $NewSDFileRef "\$\$\$\$\n";
 675 }
 676 
 677 # Write out compound string...
 678 sub WriteSDFileCmpdString {
 679   my($NewSDFileRef);
 680 
 681   if (!$OptionsInfo{OutputSDFile}) {
 682     return;
 683   }
 684 
 685   $NewSDFileRef = $SDFilesInfo{NewSDFileRef};
 686   print $NewSDFileRef "$SDFilesInfo{CmpdString}\n";
 687 }
 688 
 689 # Write out data for text file...
 690 sub WriteTextFileCmpdData {
 691   my($DataValuesLine, $NewTextFileRef);
 692 
 693   if (!$OptionsInfo{OutputTextFile}) {
 694     return;
 695   }
 696 
 697   $NewTextFileRef = $SDFilesInfo{NewTextFileRef};
 698   $DataValuesLine = JoinWords(\@{$SDFilesInfo{DataValues}}, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 699 
 700   # Handle multiple lines data values for data fields by joining 'em using semicolons...
 701   if ($DataValuesLine =~ /\n/) {
 702     $DataValuesLine =~ s/\n/;/g;
 703   }
 704 
 705   if ($OptionsInfo{OutputStrDataString}) {
 706     # Append structure data string...
 707     my($StrDataString, $OutQuoteValue, $OutDelim, $StrDataStringDelimiter);
 708 
 709     if ($OptionsInfo{StrDataStringWithFields}) {
 710       $StrDataString = $SDFilesInfo{CmpdString};
 711     }
 712     else {
 713       ($StrDataString) = split "M  END", $SDFilesInfo{CmpdString};
 714       $StrDataString .= "M  END";
 715     }
 716     $StrDataStringDelimiter = $OptionsInfo{StrDataStringDelimiter};
 717     $StrDataString =~ s/\n/$StrDataStringDelimiter/g;
 718 
 719     $OutDelim = $OptionsInfo{OutDelim};
 720     $OutQuoteValue = $OptionsInfo{OutQuote} ? "\"" : "";
 721 
 722     print $NewTextFileRef "$DataValuesLine${OutDelim}${OutQuoteValue}${StrDataString}${OutQuoteValue}\n";
 723   }
 724   else {
 725     print $NewTextFileRef "$DataValuesLine\n";
 726   }
 727 }
 728 
 729 # Retrieve information about input SD files...
 730 sub RetrieveSDFilesInfo {
 731   my($SDFile, $Index, $FileDir, $FileExt, $FileName, $NewFileName, $NewSDFileName, $NewTextFileName, $CmpdCount);
 732 
 733   %SDFilesInfo = ();
 734 
 735   @{$SDFilesInfo{FileOkay}} = ();
 736   @{$SDFilesInfo{CmpdCount}} = ();
 737   @{$SDFilesInfo{NewTextFileName}} = ();
 738   @{$SDFilesInfo{NewSDFileName}} = ();
 739 
 740   @{$SDFilesInfo{AllDataFieldLabels}} = ();
 741   @{$SDFilesInfo{CommonDataFieldLabels}} = ();
 742 
 743   FILELIST: for $Index (0 .. $#SDFilesList) {
 744     $SDFile = $SDFilesList[$Index];
 745 
 746     $SDFilesInfo{FileOkay}[$Index] = 0;
 747 
 748     $SDFilesInfo{CmpdCount}[$Index] = 0;
 749     $SDFilesInfo{NewTextFileName}[$Index] = "";
 750     $SDFilesInfo{NewSDFileName}[$Index] = "";
 751 
 752     @{$SDFilesInfo{AllDataFieldLabels}[$Index]} = ();
 753     @{$SDFilesInfo{CommonDataFieldLabels}[$Index]} = ();
 754 
 755     if (!(-e $SDFile)) {
 756       warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
 757       next FILELIST;
 758     }
 759 
 760     if (!CheckFileType($SDFile, "sd sdf")) {
 761       warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
 762       next FILELIST;
 763     }
 764 
 765     # Generate appropriate name for the new output file.
 766     $FileDir = ""; $FileName = ""; $FileExt = "";
 767     ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
 768     $NewFileName = $FileName;
 769     $NewFileName = $FileName  . $OptionsInfo{FileNameMode};
 770     if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
 771       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
 772       if ($RootFileName && $RootFileExt) {
 773         $NewFileName = $RootFileName;
 774       }
 775       else {
 776         $NewFileName = $OptionsInfo{OutFileRoot};
 777       }
 778     }
 779     $NewSDFileName = $NewFileName . ".$OptionsInfo{SDFileExt}";
 780     $NewTextFileName = $NewFileName . ".$OptionsInfo{TextFileExt}";
 781 
 782     if ($OptionsInfo{OutputSDFile}) {
 783       if (lc($NewSDFileName) eq lc($SDFile)) {
 784         warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
 785         print "Specify a different name using \"-r --root\" option or use default name.\n";
 786         next FILELIST;
 787       }
 788     }
 789 
 790     if (!$OptionsInfo{Overwrite}) {
 791       if ($OptionsInfo{OutputSDFile}) {
 792         if (-e $NewSDFileName) {
 793           warn "Warning: Ignoring file $SDFile: New file, $NewSDFileName, already exists\n";
 794           next FILELIST;
 795         }
 796       }
 797       if ($OptionsInfo{OutputTextFile}) {
 798         if (-e $NewTextFileName) {
 799           warn "Warning: Ignoring file $SDFile: New file, $NewTextFileName, already exists\n";
 800           next FILELIST;
 801         }
 802       }
 803     }
 804 
 805     if (!open SDFILE, "$SDFile") {
 806       warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
 807       next FILELIST;
 808     }
 809 
 810     my($CountCmpds, $CollectDataFields);
 811     my($CmpdString, @CmpdLines, @DataFieldLabels, %DataFieldLabelsMap,@CommonDataFieldLabels);
 812 
 813     $CountCmpds = ($OptionsInfo{Mode} =~ /^randomcmpds$/i) ? 1 : 0;
 814 
 815     $CollectDataFields = (($OptionsInfo{Mode} =~ /^(alldatafields|commondatafields|randomcmpds)$/i && $OptionsInfo{OutputTextFile}) || ($OptionsInfo{Mode} =~ /^(datafieldsbyvalue|datafieldsbyregex)$/i  && $OptionsInfo{OutputTextFile}) || ($OptionsInfo{Mode} =~ /^datafieldbylist$/i  && $OptionsInfo{OutputTextFile}) || ($OptionsInfo{Mode} =~ /^datafielduniquebylist$/i  && $OptionsInfo{OutputTextFile}) || ($OptionsInfo{Mode} =~ /^datafieldnotbylist$/i  && $OptionsInfo{OutputTextFile}) || ($OptionsInfo{Mode} =~ /^recordnum$/i && $OptionsInfo{OutputTextFile}) || ($OptionsInfo{Mode} =~ /^recordnums$/i && $OptionsInfo{OutputTextFile}) || ($OptionsInfo{Mode} =~ /^recordrange$/i && $OptionsInfo{OutputTextFile})) ? 1 : 0;
 816 
 817     $CmpdCount = 0;
 818     if ($CountCmpds || $CollectDataFields) {
 819       @DataFieldLabels = ();
 820       @CommonDataFieldLabels = ();
 821       %DataFieldLabelsMap = ();
 822       CMPDSTRING: while ($CmpdString = ReadCmpdString(\*SDFILE)) {
 823         $CmpdCount++;
 824         if ($OptionsInfo{Mode} =~ /^recordnum$/i) {
 825           if ($CmpdCount == $OptionsInfo{RecordNum}) {
 826             @CmpdLines = split "\n", $CmpdString;
 827             @DataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines);
 828             last CMPDSTRING;
 829           }
 830         }
 831         if ($CollectDataFields) {
 832           my($Label);
 833           @CmpdLines = split "\n", $CmpdString;
 834           # Process compound data header labels and figure out which ones are present for
 835           # all the compounds...
 836           if (@DataFieldLabels) {
 837             my (@CmpdDataFieldLabels) = GetCmpdDataHeaderLabels(\@CmpdLines);
 838             my(%CmpdDataFieldLabelsMap) = ();
 839             # Setup a map for the current labels...
 840             for $Label (@CmpdDataFieldLabels) {
 841               $CmpdDataFieldLabelsMap{$Label} = "PresentInSome";
 842             }
 843             # Check the presence old labels for this compound; otherwise, mark 'em new...
 844             for $Label (@DataFieldLabels) {
 845               if (!$CmpdDataFieldLabelsMap{$Label}) {
 846                 $DataFieldLabelsMap{$Label} = "PresentInSome";
 847               }
 848             }
 849             # Check the presence this compound in the old labels; otherwise, add 'em...
 850             for $Label (@CmpdDataFieldLabels ) {
 851               if (!$DataFieldLabelsMap{$Label}) {
 852                 # It's a new label...
 853                 push @DataFieldLabels, $Label;
 854                 $DataFieldLabelsMap{$Label} = "PresentInSome";
 855               }
 856             }
 857           }
 858           else {
 859             # Get the initial label set and set up a map...
 860             @DataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines);
 861             for $Label (@DataFieldLabels) {
 862               $DataFieldLabelsMap{$Label} = "PresentInAll";
 863             }
 864           }
 865           # Identify the common data field labels...
 866           if ($Options{mode} =~ /^commondatafields$/i) {
 867             @CommonDataFieldLabels = ();
 868             for $Label (@DataFieldLabels) {
 869               if ($DataFieldLabelsMap{$Label} eq "PresentInAll") {
 870                 push @CommonDataFieldLabels, $Label;
 871               }
 872             }
 873           }
 874         }
 875       }
 876     }
 877 
 878     $SDFilesInfo{FileOkay}[$Index] = 1;
 879 
 880     $SDFilesInfo{NewTextFileName}[$Index] = $NewTextFileName;
 881     $SDFilesInfo{NewSDFileName}[$Index] = $NewSDFileName;
 882 
 883     $SDFilesInfo{CmpdCount}[$Index] = $CmpdCount;
 884 
 885     push @{$SDFilesInfo{AllDataFieldLabels}[$Index]}, @DataFieldLabels;
 886     push @{$SDFilesInfo{CommonDataFieldLabels}[$Index]}, @CommonDataFieldLabels;
 887 
 888     close SDFILE;
 889   }
 890 }
 891 
 892 # Process options...
 893 sub ProcessOptions {
 894   %OptionsInfo = ();
 895 
 896   $OptionsInfo{Mode} = $Options{mode};
 897 
 898   $OptionsInfo{InDelim} = "\,";
 899   if ($Options{indelim} =~ /^semicolon$/i) {
 900     $OptionsInfo{InDelim} = "\;";
 901   }
 902   elsif ($Options{indelim} =~ /^tab$/i) {
 903     $OptionsInfo{InDelim} = "\t";
 904   }
 905 
 906   $OptionsInfo{OutDelim} = "\,";
 907   if ($Options{outdelim} =~ /^semicolon$/i) {
 908     $OptionsInfo{OutDelim} = "\;";
 909   }
 910   elsif ($Options{outdelim} =~ /^tab$/i) {
 911     $OptionsInfo{OutDelim} = "\t";
 912   }
 913 
 914   $OptionsInfo{OutQuote} = ($Options{quote} =~ /^yes$/i) ? 1 : 0;
 915 
 916   $OptionsInfo{RegexIgnoreCase} = ($Options{regexignorecase} =~ /^yes$/i) ? 1 : 0;
 917 
 918   $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : undef;
 919   $OptionsInfo{Overwrite} = $Options{overwrite} ? $Options{overwrite} : undef;
 920 
 921   $OptionsInfo{NumOfCmpds} = $Options{numofcmpds};
 922 
 923   $OptionsInfo{ValueComparisonMode} = $Options{valuecomparisonmode};
 924   $OptionsInfo{NumericalComparison} = ($Options{valuecomparisonmode} =~ /^Numeric$/i) ? 1 : 0;
 925 
 926   $OptionsInfo{Violations} = $Options{violations};
 927   $OptionsInfo{Seed} = $Options{seed};
 928 
 929 
 930   if ($Options{mode} =~ /^(datafields|datafieldsbyregex|datafieldsbyvalue|datafieldbylist|datafielduniquebylist|datafieldnotbylist)$/i) {
 931     if ($Options{datafields} || $Options{datafieldsfile}) {
 932       if ($Options{datafields} && $Options{datafieldsfile}) {
 933         die "Error: For \"-m --mode\" option values of datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, datafielduniquebylist, or datafieldnotbylist specify only one of the \"-d --datafields\" or \"--datafieldsfile\" option.\n";
 934       }
 935     }
 936     else {
 937       die "Error: For \"-m --mode\" option values of datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, datafielduniquebylist, or datafieldnotbylist specify one of the \"-d --datafields\" or \"--datafieldsfile\" option.\n";
 938     }
 939   }
 940   $OptionsInfo{DataFields} = $Options{datafields} ? $Options{datafields} : undef;
 941   $OptionsInfo{DataFieldsFile} = $Options{datafieldsfile} ? $Options{datafieldsfile} : undef;
 942 
 943   $OptionsInfo{RecordNum} = 0; $OptionsInfo{StartRecordNum} = 0; $OptionsInfo{EndRecordNum} = 0;
 944 
 945   %{$OptionsInfo{RecordNums}} = ();
 946   $OptionsInfo{RecordNumsMin} = 0; $OptionsInfo{RecordNumsMax} = 0; $OptionsInfo{RecordNumsCount} = 0;
 947 
 948   $OptionsInfo{Record} = $Options{record} ? $Options{record} : undef;
 949 
 950   if ($Options{mode} =~ /^(recordnum|recordnums|recordrange)$/i) {
 951     if ($Options{record}) {
 952       my($Record, @RecordSplit);
 953 
 954       $Record = $Options{record};
 955       $Record =~ s/ //g;
 956 
 957       @RecordSplit = split ",", $Record;
 958 
 959       if ($Options{mode} =~ /^recordnum$/i ) {
 960         if (@RecordSplit == 1) {
 961           $OptionsInfo{RecordNum} = $RecordSplit[0];
 962           if ($OptionsInfo{RecordNum} <= 0) {
 963             die "Error: The value specified, $OptionsInfo{RecordNum},  for option \"--records\" is not valid. Allowed values: > 0 \n";
 964           }
 965         }
 966         else {
 967           die "Error: Invalid number of values, ", scalar(@RecordSplit), ", specified using \"--record\" option: only 1 value is allowed.\n";
 968         }
 969       }
 970       elsif ($Options{mode} =~ /^recordnums$/i ) {
 971         my($RecordNum, $RecordCount, @SortedRecordSplit);
 972 
 973         @SortedRecordSplit = sort { $a <=> $b } @RecordSplit;
 974 
 975         $RecordCount = 0;
 976         RECORDNUM: for $RecordNum (@SortedRecordSplit) {
 977           if (exists $OptionsInfo{RecordNums}{$RecordNum}) {
 978             next RECORDNUM;
 979           }
 980           $RecordCount++;
 981           $OptionsInfo{RecordNums}{$RecordNum} = $RecordNum;
 982         }
 983         $OptionsInfo{RecordNumsCount} = $RecordCount;
 984         $OptionsInfo{RecordNumsMin} = $SortedRecordSplit[0];
 985         $OptionsInfo{RecordNumsMax} = $SortedRecordSplit[$#SortedRecordSplit];
 986       }
 987       else {
 988         if (@RecordSplit == 2) {
 989           $OptionsInfo{StartRecordNum} = $RecordSplit[0];
 990           $OptionsInfo{EndRecordNum} = $RecordSplit[1];
 991           if ($OptionsInfo{StartRecordNum} <= 0 || $OptionsInfo{EndRecordNum} <= 0) {
 992             die "Error: The value pair specified, $Options{record},  for option \"--records\" is not valid. Allowed values: > 0 \n";
 993           }
 994         }
 995         else {
 996           die "Error: Invalid number of values, ", scalar(@RecordSplit), ", specified using \"--record\" option: only 2 values is allowed.\n";
 997         }
 998         if ($OptionsInfo{StartRecordNum} > $OptionsInfo{EndRecordNum}) {
 999           die "Error: Start record number, $OptionsInfo{StartRecordNum}, must be smaller than end record number, $OptionsInfo{EndRecordNum}.\nSpecify different values using \"--record\" option.\n";
1000         }
1001       }
1002     }
1003     else {
1004       die "Error: For \"-m --mode\" option values recordnum, recordnums or recordrange, specify \"--record\" option value.\n";
1005     }
1006   }
1007 
1008   @{$OptionsInfo{SpecifiedDataFieldLabels}} = ();
1009 
1010   my(@Words, $Line, $Value);
1011   if ($Options{mode} =~ /^datafields$/i) {
1012     @{$OptionsInfo{SpecifiedDataFieldLabels}} = ();
1013     if ($Options{datafields}) {
1014       @{$OptionsInfo{SpecifiedDataFieldLabels}} = split $OptionsInfo{InDelim}, $Options{datafields};
1015     }
1016     elsif ($Options{datafieldsfile}) {
1017       open DATAFIELDSFILE, "$Options{datafieldsfile}" or die "Error: Couldn't open $Options{datafieldsfile}: $! \n";
1018       while ($Line = GetTextLine(\*DATAFIELDSFILE)) {
1019         @Words = quotewords($OptionsInfo{InDelim}, 0, $Line);
1020         if (@Words) {
1021           push @{$OptionsInfo{SpecifiedDataFieldLabels}}, @Words;
1022         }
1023       }
1024       close DATAFIELDSFILE;
1025     }
1026   }
1027   elsif ($Options{mode} =~ /^datafieldsbyvalue$/i) {
1028     my(@DataFieldsByValueTriplets);
1029     @DataFieldsByValueTriplets = ();
1030     if ($Options{datafields}) {
1031       @DataFieldsByValueTriplets = split $OptionsInfo{InDelim}, $Options{datafields};
1032     }
1033     elsif ($Options{datafieldsfile}) {
1034       open DATAFIELDSFILE, "$Options{datafieldsfile}" or die "Error: Couldn't open $Options{datafieldsfile}: $! \n";
1035       while ($Line = GetTextLine(\*DATAFIELDSFILE)) {
1036         @Words = quotewords($OptionsInfo{InDelim}, 0, $Line);
1037         if (@Words) {
1038           push @DataFieldsByValueTriplets, @Words;
1039         }
1040       }
1041       close DATAFIELDSFILE;
1042     }
1043     if ((@DataFieldsByValueTriplets % 3)) {
1044       if ($Options{datafields}) {
1045         die "Error: Triplets not found in values specified by \"-d --datafields\" option\n";
1046       }
1047       elsif ($Options{datafieldsfile}) {
1048         die "Error: Triplets not found in values specified by \"--datafieldsfile\" option\n";
1049       }
1050     }
1051     my($Index, $Label, $Value, $Criterion);
1052 
1053     @{$OptionsInfo{SpecifiedDataFieldLabels}} = ();
1054     %{$OptionsInfo{SpecifiedDataFieldValuesMap}} = ();
1055     %{$OptionsInfo{SpecifiedDataFieldCriteriaMap}} = ();
1056 
1057     for ($Index = 0; $Index < @DataFieldsByValueTriplets; $Index = $Index + 3) {
1058       $Label = $DataFieldsByValueTriplets[$Index];
1059       $Value = $DataFieldsByValueTriplets[$Index + 1];
1060       $Criterion = $DataFieldsByValueTriplets[$Index + 2];
1061 
1062       if ($Criterion =~ /^(eq|le|ge)$/i) {
1063         push @{$OptionsInfo{SpecifiedDataFieldLabels}}, $Label;
1064         $OptionsInfo{SpecifiedDataFieldValuesMap}{$Label} = $Value;
1065         $OptionsInfo{SpecifiedDataFieldCriteriaMap}{$Label} = $Criterion;
1066       }
1067       else {
1068         warn "Warning: Ignoring triplet value, $Label $Value $Criterion , specified using \"-d --datafields\" or \"--datafieldsfile\" option: Invalid criterion value: $Criterion\n";
1069       }
1070     }
1071   }
1072   elsif ($Options{mode} =~ /^datafieldsbyregex$/i) {
1073     my(@DataFieldsByRegexTriplets);
1074 
1075     @DataFieldsByRegexTriplets = ();
1076     if ($Options{datafields}) {
1077       @DataFieldsByRegexTriplets = quotewords($OptionsInfo{InDelim}, 0, $Options{datafields});
1078     }
1079     elsif ($Options{datafieldsfile}) {
1080       open DATAFIELDSFILE, "$Options{datafieldsfile}" or die "Error: Couldn't open $Options{datafieldsfile}: $! \n";
1081       while ($Line = GetTextLine(\*DATAFIELDSFILE)) {
1082           @Words = quotewords($OptionsInfo{InDelim}, 0, $Line);
1083           if (@Words) {
1084             push @DataFieldsByRegexTriplets, @Words;
1085           }
1086       }
1087       close DATAFIELDSFILE;
1088     }
1089     if ((@DataFieldsByRegexTriplets % 3)) {
1090       if ($Options{datafields}) {
1091           die "Error: Triplet not found in values specified by \"-d --datafields\" option\n";
1092       }
1093       elsif ($Options{datafieldsfile}) {
1094           die "Error: Triplet not found in values specified by \"--datafieldsfile\" option\n";
1095       }
1096     }
1097 
1098     my($Index, $Label, $Value, $Criterion);
1099 
1100     @{$OptionsInfo{SpecifiedDataFieldLabels}} = ();
1101     %{$OptionsInfo{SpecifiedDataFieldRegexMap}} = ();
1102     %{$OptionsInfo{SpecifiedDataFieldRegexCriteriaMap}} = ();
1103 
1104     for ($Index = 0; $Index < @DataFieldsByRegexTriplets; $Index = $Index + 3) {
1105       $Label = $DataFieldsByRegexTriplets[$Index];
1106       $Value = $DataFieldsByRegexTriplets[$Index + 1];
1107       $Criterion = $DataFieldsByRegexTriplets[$Index + 2];
1108 
1109       if ($Criterion =~ /^(eq|ne)$/i) {
1110           push @{$OptionsInfo{SpecifiedDataFieldLabels}}, $Label;
1111           $OptionsInfo{SpecifiedDataFieldRegexMap}{$Label} = $Value;
1112           $OptionsInfo{SpecifiedDataFieldRegexCriteriaMap}{$Label} = $Criterion;
1113       }
1114       else {
1115           warn "Warning: Ignoring triplet value, $Label $Value $Criterion , specified using \"-d --datafields\" or \"--datafieldsfile\" option: Invalid criterion value: $Criterion; Supported values: eq or ne\n";
1116       }
1117     }
1118   }
1119   elsif ($Options{mode} =~ /^(datafieldbylist|datafielduniquebylist|datafieldnotbylist)$/i) {
1120     my($Index, @DataFieldAndValuesList);
1121     if ($Options{datafields}) {
1122       @DataFieldAndValuesList = split $OptionsInfo{InDelim}, $Options{datafields};
1123     }
1124     elsif ($Options{datafieldsfile}) {
1125       open DATAFIELDSFILE, "$Options{datafieldsfile}" or die "Error: Couldn't open $Options{datafieldsfile}: $! \n";
1126       while ($Line = GetTextLine(\*DATAFIELDSFILE)) {
1127         @Words = quotewords($OptionsInfo{InDelim}, 0, $Line);
1128         if (@Words) {
1129           push @DataFieldAndValuesList, @Words;
1130         }
1131       }
1132       close DATAFIELDSFILE;
1133     }
1134     if (@DataFieldAndValuesList < 2) {
1135       if ($Options{datafields}) {
1136         die "Error: Invalid number of values specified by \"-d --datafields\" option\n";
1137       }
1138       elsif ($Options{datafieldsfile}) {
1139         die "Error: Invalid number values specified by \"--datafieldsfile\" option\n";
1140       }
1141     }
1142 
1143     $OptionsInfo{SpecifiedDataFieldLabel} = $DataFieldAndValuesList[0];
1144     $OptionsInfo{SpecifiedDataFieldValuesCount} = @DataFieldAndValuesList - 1;
1145     %{$OptionsInfo{SpecifiedDataFieldValues}} = ();
1146 
1147     for ($Index = 1; $Index < @DataFieldAndValuesList; $Index++) {
1148       $Value = $DataFieldAndValuesList[$Index];
1149       $OptionsInfo{SpecifiedDataFieldValues}{$Value} = "NotFound";
1150     }
1151   }
1152 
1153   $OptionsInfo{SDFileExt} = "sdf";
1154   $OptionsInfo{TextFileExt} = "csv";
1155 
1156   if ($Options{outdelim} =~ /^tab$/i) {
1157     $OptionsInfo{TextFileExt} = "tsv";
1158   }
1159 
1160   if ($Options{mode} =~ /^(alldatafields|molnames)$/i) {
1161     $OptionsInfo{OutputSDFile} = 0;
1162     $OptionsInfo{OutputTextFile} = 1;
1163   }
1164   else {
1165     $OptionsInfo{OutputSDFile} = ($Options{output} =~ /^(SD|both)$/i) ? 1 : 0;
1166     $OptionsInfo{OutputTextFile} = ($Options{output} =~ /^(text|both)$/i) ? 1 : 0;
1167   }
1168 
1169   $OptionsInfo{StrDataString} = $Options{strdatastring};
1170   $OptionsInfo{OutputStrDataString} = ($Options{strdatastring} =~ /^Yes$/i) ? 1 : 0;
1171 
1172   $OptionsInfo{StrDataStringDelimiter} = $Options{strdatastringdelimiter};
1173 
1174   if (IsEmpty($Options{strdatastringdelimiter})) {
1175     die "Error: No value specified for \"--StrDataStringDelimiter\" option.\n";
1176   }
1177   $OptionsInfo{StrDataStringMode} = $Options{strdatastringmode};
1178   $OptionsInfo{StrDataStringWithFields} = $Options{strdatastringmode} =~ /^StrAndDataFields$/i ? 1 : 0;
1179 
1180   MODE: {
1181     if ($Options{mode} =~ /^alldatafields$/i) { $OptionsInfo{FileNameMode} = "AllDataDields"; last MODE; }
1182     if ($Options{mode} =~ /^commondatafields$/i) { $OptionsInfo{FileNameMode} = "CommonDataDields"; last MODE; }
1183     if ($Options{mode} =~ /^datafields$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataFields"; last MODE; }
1184     if ($Options{mode} =~ /^datafieldsbyvalue$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataFieldsByValue"; last MODE; }
1185     if ($Options{mode} =~ /^datafieldsbyregex$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataFieldsByRegex"; last MODE; }
1186     if ($Options{mode} =~ /^datafieldbylist$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataField"; last MODE; }
1187     if ($Options{mode} =~ /^datafielduniquebylist$/i) { $OptionsInfo{FileNameMode} = "SpecifiedUniqueDataField"; last MODE; }
1188     if ($Options{mode} =~ /^datafieldnotbylist$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataFieldNotByList"; last MODE; }
1189     if ($Options{mode} =~ /^molnames$/i) { $OptionsInfo{FileNameMode} = "MolName"; last MODE; }
1190     if ($Options{mode} =~ /^randomcmpds$/i) { $OptionsInfo{FileNameMode} = "RandomCmpds"; last MODE; }
1191     if ($Options{mode} =~ /^recordnum$/i) { $OptionsInfo{FileNameMode} = "RecordNum$OptionsInfo{RecordNum}"; last MODE; }
1192     if ($Options{mode} =~ /^recordnums$/i) { $OptionsInfo{FileNameMode} = "RecordNums"; last MODE; }
1193     if ($Options{mode} =~ /^recordrange$/i) { $OptionsInfo{FileNameMode} = "RecordNum$OptionsInfo{StartRecordNum}" . "To" . "$OptionsInfo{EndRecordNum}"; last MODE; }
1194     if ($Options{mode} =~ /^2dcmpdrecords$/i) { $OptionsInfo{FileNameMode} = "2DCmpdRecords"; last MODE; }
1195     if ($Options{mode} =~ /^3dcmpdrecords$/i) { $OptionsInfo{FileNameMode} = "3DCmpdRecords"; last MODE; }
1196     die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: alldatafields, commondatafields, datafields, datafieldsbyvalue, datafieldbylist, datafielduniquebylist, , datafieldnotbylist, molnames, randomcmpds, recordnum, recordnums, recordrange, 2dcmpdrecords, 3dcmpdrecords\n";
1197   }
1198 
1199 }
1200 
# Setup script usage  and retrieve command line arguments specified using various options...
#
# Reset %Options to its default values, overlay whatever was specified on the
# command line using GetOptions, change to the working directory when one is
# specified, and validate those option values which have a fixed set or range of
# allowed values. The script dies with an error message at the first invalid value.
#
# Takes no arguments; results are left in %Options.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();
  $Options{numofcmpds} = 1;
  $Options{mode} = "alldatafields";
  $Options{indelim} = "comma";
  $Options{outdelim} = "comma";
  $Options{output} = "SD";
  $Options{quote} = "yes";
  $Options{regexignorecase} = "yes";
  $Options{valuecomparisonmode} = "numeric";
  $Options{violations} = 0;
  $Options{seed} = 123456789;

  $Options{strdatastring} = "no";
  $Options{strdatastringdelimiter} = "|";
  $Options{strdatastringmode} = "StrOnly";

  if (!GetOptions(\%Options, "help|h", "datafields|d=s", "datafieldsfile=s", "indelim=s", "mode|m=s", "numofcmpds|n=i", "outdelim=s", "output=s", "overwrite|o", "quote|q=s", "regexignorecase=s", "record=s", "root|r=s", "seed|s=i", "strdatastring=s", "strdatastringdelimiter=s", "strdatastringmode=s", "valuecomparisonmode=s", "violations|v=i", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }

  # Switch to the working directory before validating the remaining option values...
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  # Validate numerical ranges and enumerated option values...
  if ($Options{numofcmpds} < 1) {
    die "Error: The value specified, $Options{numofcmpds},  for option \"-n --numofcmpds\" is not valid. Allowed values: >= 1 \n";
  }
  if ($Options{valuecomparisonmode} !~ /^(Numeric|Alphanumeric)$/i) {
    die "Error: The value specified, $Options{valuecomparisonmode}, for option \"--ValueComparisonMode\" is not valid. Allowed values: Numeric or Alphanumeric\n";
  }
  if ($Options{violations} < 0) {
    die "Error: The value specified, $Options{violations},  for option \"-v --violations\" is not valid. Allowed values: >= 0 \n";
  }
  # The list of allowed values in the error message mirrors the modes accepted by the regex...
  if ($Options{mode} !~ /^(alldatafields|commondatafields|datafields|datafieldsbyvalue|datafieldsbyregex|datafieldbylist|datafielduniquebylist|datafieldnotbylist|molnames|randomcmpds|recordnum|recordnums|recordrange|2dcmpdrecords|3dcmpdrecords)$/i) {
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: alldatafields, commondatafields, datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, datafielduniquebylist, datafieldnotbylist, molnames, randomcmpds, recordnum, recordnums, recordrange, 2dcmpdrecords, 3dcmpdrecords\n";
  }
  if ($Options{output} !~ /^(SD|text|both)$/i) {
    die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, text, or both\n";
  }
  if ($Options{indelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{quote} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
  }
  if ($Options{regexignorecase} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{regexignorecase}, for option \"--regexignorecase\" is not valid. Allowed values: yes or no\n";
  }
  if ($Options{strdatastring} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{strdatastring}, for option \"--StrDataString\" is not valid. Allowed values: yes or no\n";
  }
  if ($Options{strdatastringmode} !~ /^(StrOnly|StrAndDataFields)$/i) {
    die "Error: The value specified, $Options{strdatastringmode}, for option \"--StrDataStringMode\" is not valid. Allowed values: StrOnly or StrAndDataFields\n";
  }
}
1264