MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: SortSDFiles.pl,v $
   4 # $Date: 2015/02/28 20:46:21 $
   5 # $Revision: 1.26 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2015 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability or fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use strict;
  30 use FindBin; use lib "$FindBin::Bin/../lib";
  31 use Getopt::Long;
  32 use File::Basename;
  33 use Text::ParseWords;
  34 use Benchmark;
  35 use FileUtil;
  36 use SDFileUtil;
  37 use TextUtil;
  38 
  # File-scoped state; $ScriptName and %Options are also read by the
  # subroutines defined further down in this file...
  my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

  # Autoflush STDOUT...
  $| = 1;

  # Starting message...
  $ScriptName = basename($0);
  print "\n$ScriptName: Starting...\n\n";
  # Direct class method call rather than the indirect object form "new Benchmark"...
  $StartTime = Benchmark->new;

  # Get the options and setup script; show usage when help is requested or
  # no input file is specified...
  SetupScriptUsage();
  if ($Options{help} || @ARGV < 1) {
    die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  }

  # Input files named on the command line; shared with the subroutines...
  my(@SDFilesList);
  @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

  print "Processing options...\n";
  my(%OptionsInfo);
  ProcessOptions();

  print "Checking input SD file(s)...\n";
  my(%SDFilesInfo);
  RetrieveSDFilesInfo();

  # Generate output files; files flagged as not okay during checking are skipped...
  my($FileIndex);
  if (@SDFilesList > 1) {
    print "\nProcessing SD files...\n";
  }
  for $FileIndex (0 .. $#SDFilesList) {
    next if !$SDFilesInfo{FileOkay}[$FileIndex];

    print "\nProcessing file $SDFilesList[$FileIndex]...\n";
    SortSDFile($FileIndex);
  }
  print "\n$ScriptName:Done...\n\n";

  # Report elapsed time...
  $EndTime = Benchmark->new;
  $TotalTime = timediff($EndTime, $StartTime);
  print "Total time: ", timestr($TotalTime), "\n";
  82 
  83 ###############################################################################
  84 
  # Sort the compound records of one input SD file by the value of its key
  # data field and write them to the corresponding output SD file.
  #
  # Parameters:
  #   $Index - index into @SDFilesList; the output file name and the key data
  #            field name are taken from %SDFilesInfo at the same index.
  #
  # Records whose key value fails the IsNotEmpty check, contains a newline, or
  # - for numeric key data - fails the IsFloat check are not sorted; they are
  # written, in input order, after the sorted records.
  #
  # Dies when the input or output file can't be opened, or when the output
  # file can't be closed.
  sub SortSDFile {
    my($Index) = @_;
    my($SDFile, $NewSDFile, $KeyDataFieldName);

    $SDFile = $SDFilesList[$Index];
    $NewSDFile = $SDFilesInfo{OutFile}[$Index];
    $KeyDataFieldName = $SDFilesInfo{KeyDataFieldName}[$Index];

    print "Generating new SD file $NewSDFile...\n";
    # Open the input before the output so that an unreadable input doesn't
    # leave a freshly truncated output file behind. Three-argument open keeps
    # characters in the file names from being interpreted as open modes...
    open(SDFILE, "<", $SDFile) or die "Error: Can't open $SDFile: $! \n";
    open(NEWSDFILE, ">", $NewSDFile) or die "Error: Couldn't open $NewSDFile: $! \n";

    my($RequireNumericKey, $AlphanumericKeys, $Ascending);
    $RequireNumericKey = ($OptionsInfo{KeyData} =~ /^numeric$/i) ? 1 : 0;
    $AlphanumericKeys = ($OptionsInfo{KeyData} =~ /^alphanumeric$/i) ? 1 : 0;
    $Ascending = ($OptionsInfo{Sort} =~ /^ascending$/i) ? 1 : 0;

    # Go over all compound records and group 'em by key value; records sharing
    # a key value are concatenated in input order...
    my(%KeyToCompoundRecordsMap, @InvalidCompoundRecords, $CmpdCount, $CmpdString);
    %KeyToCompoundRecordsMap = ();
    @InvalidCompoundRecords = ();
    $CmpdCount = 0;

    COMPOUND: while ($CmpdString = ReadCmpdString(\*SDFILE)) {
      $CmpdCount++;
      my(@CmpdLines) = split "\n", $CmpdString;
      my(%DataFieldValues) = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      my($KeyDataFieldValue) = $DataFieldValues{$KeyDataFieldName};

      # Make sure data field value is okay; $Problem ends up in the message...
      my($Problem) = "";
      if (!(IsNotEmpty($KeyDataFieldValue) && ($KeyDataFieldValue !~ /\n/))) {
        $Problem = "empty";
      }
      elsif ($RequireNumericKey && !IsFloat($KeyDataFieldValue)) {
        $Problem = "non-numerical";
      }
      if ($Problem) {
        push @InvalidCompoundRecords, $CmpdString;
        if ($OptionsInfo{DetailLevel} >= 3) {
          print "Ignoring compound record $CmpdCount: Contains $Problem value for key data field $KeyDataFieldName :\n $CmpdString\n\n";
        }
        elsif ($OptionsInfo{DetailLevel} >= 2) {
          print "Ignoring compound record $CmpdCount: Contains $Problem value for key data field $KeyDataFieldName...\n";
        }
        next COMPOUND;
      }

      if (exists($KeyToCompoundRecordsMap{$KeyDataFieldValue})) {
        # Append to existing compound data...
        $KeyToCompoundRecordsMap{$KeyDataFieldValue} .= "\n" . $CmpdString;
      }
      else {
        $KeyToCompoundRecordsMap{$KeyDataFieldValue} = $CmpdString;
      }
    }
    close SDFILE;

    # Pick the comparison once instead of repeating the output loop for each
    # combination of options. The plain string comparison breaks ties between
    # distinct keys that compare equal (e.g. "1" and "1.0", or "abc" and
    # "ABC"), so the output order no longer depends on hash ordering...
    my($CompareKeys, @SortedKeys);
    if ($AlphanumericKeys) {
      $CompareKeys = sub { (lc($a) cmp lc($b)) || ($a cmp $b) };
    }
    else {
      $CompareKeys = sub { ($a <=> $b) || ($a cmp $b) };
    }
    @SortedKeys = sort $CompareKeys keys %KeyToCompoundRecordsMap;
    if (!$Ascending) {
      # Keys are distinct and totally ordered, so descending is the exact reverse...
      @SortedKeys = reverse @SortedKeys;
    }

    my($Key);
    for $Key (@SortedKeys) {
      print NEWSDFILE "$KeyToCompoundRecordsMap{$Key}\n";
    }

    # Append the records containing data not appropriate for sorting...
    if (@InvalidCompoundRecords) {
      print "Placing ", scalar(@InvalidCompoundRecords)," compound record(s) with invalid data field key data the end...\n";
      for $CmpdString (@InvalidCompoundRecords) {
        print NEWSDFILE "$CmpdString\n";
      }
    }

    # Buffered write errors surface at close, so check it for the output file...
    close(NEWSDFILE) or die "Error: Couldn't close $NewSDFile: $! \n";
  }
 175 
 # Retrieve information about input SD files.
 #
 # Fills %SDFilesInfo with three arrays indexed like @SDFilesList:
 #   FileOkay         - 1 when the file passed all checks, 0 otherwise
 #   OutFile          - name of the output SD file
 #   KeyDataFieldName - name of the data field to sort by
 #
 # A file is skipped, with a warning, when it doesn't exist, fails the
 # CheckFileType test, would be overwritten by its own output, has an existing
 # output file without the overwrite option, can't be opened, or - when no key
 # was specified - yields no data field name from its first compound record.
 sub RetrieveSDFilesInfo {
   my($Index, $SDFile, $FileDir, $FileName, $FileExt, $OutFileRoot, $OutFile, $DataFieldName);

   %SDFilesInfo = ();

   @{$SDFilesInfo{FileOkay}} = ();
   @{$SDFilesInfo{OutFile}} = ();
   @{$SDFilesInfo{KeyDataFieldName}} = ();

   FILELIST: for $Index (0 .. $#SDFilesList) {
     $SDFile = $SDFilesList[$Index];

     # Assume the worst until all the checks have passed...
     $SDFilesInfo{FileOkay}[$Index] = 0;
     $SDFilesInfo{OutFile}[$Index] = "";
     $SDFilesInfo{KeyDataFieldName}[$Index] = "";

     if (!(-e $SDFile)) {
       warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
       next FILELIST;
     }
     if (!CheckFileType($SDFile, "sd sdf")) {
       warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
       next FILELIST;
     }

     # Setup output file name; the root option is used only for a single input file...
     $FileDir = ""; $FileName = ""; $FileExt = "";
     ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
     if ($Options{root} && (@SDFilesList == 1)) {
       my($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
       if ($RootFileName && $RootFileExt) {
         $FileName = $RootFileName;
       }
       else {
         $FileName = $Options{root};
       }
       $OutFileRoot = $FileName;
     }
     else {
       $OutFileRoot = $FileName . "SortedByDataField";
     }

     $OutFile = $OutFileRoot . ".$FileExt";
     if (lc($OutFile) eq lc($SDFile)) {
       warn "Warning: Ignoring file $SDFile:Output file name, $OutFile, is same as input SD file name, $SDFile\n";
       next FILELIST;
     }
     if (!$Options{overwrite}) {
       if (-e $OutFile) {
         warn "Warning: Ignoring file $SDFile: The file $OutFile already exists\n";
         next FILELIST;
       }
     }

     # Setup data field name...
     if ($OptionsInfo{SpecifiedDataFieldName}) {
       $DataFieldName = $OptionsInfo{SpecifiedDataFieldName};
     }
     else {
       # No key specified: use the first data field of the first compound record...
       my($CmpdString, @CmpdLines, @DataFieldNames);
       @DataFieldNames = ();
       # Three-argument open keeps characters in the file name from being
       # interpreted as an open mode...
       if (!open(SDFILE, "<", $SDFile)) {
         warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
         next FILELIST;
       }
       $CmpdString = ReadCmpdString(\*SDFILE);
       close SDFILE;

       if (defined($CmpdString) && length($CmpdString)) {
         @CmpdLines = split "\n", $CmpdString;
         @DataFieldNames = GetCmpdDataHeaderLabels(\@CmpdLines);
       }
       $DataFieldName = @DataFieldNames ? $DataFieldNames[0] : undef;

       # Without a key data field there is nothing to sort by; don't mark the
       # file okay with an undefined key name...
       if (!defined($DataFieldName) || $DataFieldName eq "") {
         warn "Warning: Ignoring file $SDFile: Couldn't find a data field in its first compound record to use as sort key\n";
         next FILELIST;
       }
     }

     $SDFilesInfo{FileOkay}[$Index] = 1;
     $SDFilesInfo{OutFile}[$Index] = "$OutFile";
     $SDFilesInfo{KeyDataFieldName}[$Index] = $DataFieldName;
   }
 }
 251 
 # Copy the command line option values from %Options into %OptionsInfo.
 #
 # Key, Overwrite and Root stay undefined when the corresponding option was
 # not specified; SpecifiedDataFieldName is an empty string in that case.
 sub ProcessOptions {
   # Options which always have a value thanks to their defaults...
   @OptionsInfo{qw(DetailLevel KeyData Sort)} = @Options{qw(detail keydata sort)};

   # Name of the data field to sort by, if any...
   $OptionsInfo{Key} = $Options{key};
   $OptionsInfo{SpecifiedDataFieldName} = defined($Options{key}) ? $Options{key} : "";

   # Output related options...
   $OptionsInfo{Overwrite} = $Options{overwrite};
   $OptionsInfo{Root} = $Options{root};
 }
 268 
 # Set default option values, parse the command line into %Options, and
 # validate the values. When a working directory is specified, change to it.
 # Dies with an error message on any invalid option or value.
 sub SetupScriptUsage {

   # Defaults; values specified on the command line replace them...
   %Options = (
     detail  => 1,
     sort    => "ascending",
     keydata => "numeric",
   );

   # Retrieve all the options...
   my(@OptionSpecs) = ("detail|d=i", "help|h",  "key|k=s", "keydata=s", "overwrite|o", "root|r=s", "sort|s=s", "workingdir|w=s");
   GetOptions(\%Options, @OptionSpecs)
     or die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";

   # Switch to the working directory before anything else touches files...
   if ($Options{workingdir}) {
     -d $Options{workingdir}
       or die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
   }

   # Validate the remaining option values...
   $Options{keydata} =~ /^(numeric|alphanumeric)$/i
     or die "Error: The value specified, $Options{keydata}, for option \"--keydata\" is not valid. Allowed values: numeric or alphanumeric\n";
   $Options{sort} =~ /^(ascending|descending)$/i
     or die "Error: The value specified, $Options{sort}, for option \"-s --sort\" is not valid. Allowed values: ascending or descending\n";
   if (!IsPositiveInteger($Options{detail})) {
     die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n";
   }
 }
 296