MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: ModifySDFilesDataFields.pl,v $
   4 # $Date: 2015/02/28 20:46:20 $
   5 # $Revision: 1.27 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2015 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
   19 # any warranty; without even the implied warranty of merchantability or fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use strict;
  30 use FindBin; use lib "$FindBin::Bin/../lib";
  31 use Getopt::Long;
  32 use File::Basename;
  33 use Text::ParseWords;
  34 use Benchmark;
  35 use FileUtil;
  36 use SDFileUtil;
  37 use TextUtil;
  38 
  39 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  40 
  41 # Autoflush STDOUT
  42 $| = 1;
  43 
  44 # Starting message...
  45 $ScriptName = basename($0);
  46 print "\n$ScriptName: Starting...\n\n";
  47 $StartTime = new Benchmark;
  48 
  49 # Get the options and setup script...
  50 SetupScriptUsage();
  51 if ($Options{help} || @ARGV < 1) {
  52   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  53 }
  54 
  55 my(@SDFilesList);
  56 @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");
  57 
  58 # Process options...
  59 print "Processing options...\n";
  60 my(%OptionsInfo);
  61 ProcessOptions();
  62 
  63 print "Checking input SD file(s)...\n";
  64 my(%SDFilesInfo);
  65 RetrieveSDFilesInfo();
  66 
  67 # Generate output files...
  68 my($FileIndex);
  69 if (@SDFilesList > 1) {
  70   print "\nProcessing SD files...\n";
  71 }
  72 for $FileIndex (0 .. $#SDFilesList) {
  73   if ($SDFilesInfo{FileOkay}[$FileIndex]) {
  74     print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  75     ModifySDFile($FileIndex);
  76   }
  77 }
  78 print "\n$ScriptName:Done...\n\n";
  79 
  80 $EndTime = new Benchmark;
  81 $TotalTime = timediff ($EndTime, $StartTime);
  82 print "Total time: ", timestr($TotalTime), "\n";
  83 
  84 ###############################################################################
  85 
  86 # Modify SD file data fields....
  87 sub ModifySDFile {
  88   my($Index) = @_;
  89   my($SDFile, $NewSDFile);
  90 
  91   $SDFile = $SDFilesList[$Index];
  92   $NewSDFile = $SDFilesInfo{OutFile}[$Index];
  93 
  94   print "Generating new SD file $NewSDFile...\n";
  95   open NEWSDFILE, ">$NewSDFile" or die "Error: Couldn't open $NewSDFile: $! \n";
  96   open SDFILE, "$SDFile" or die "Error: Can't open $SDFile: $! \n";
  97 
  98   my($CmpdCount, $CmpdString, $CmpdData, $MolName, $OldSDField, $NewSDField, $CommonSDField, $Label, $Value, $FieldValues, $MolNameDataField, $URLCmpdIdFieldName, @CmpdLines, %DataFieldAndValues, @DataFieldLabels);
  99   $CmpdCount = 0;
 100 
 101   COMPOUND: while ($CmpdString = ReadCmpdString(\*SDFILE)) {
 102       $CmpdCount++;
 103       @CmpdLines = split "\n", $CmpdString;
 104       if ($OptionsInfo{UseDataFieldForMolName} || $OptionsInfo{ModifyDataFields}) {
 105         %DataFieldAndValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 106       }
 107       if ($OptionsInfo{ModifyMolName}) {
 108         if ($OptionsInfo{AlwaysReplaceMolName} || !IsNotEmpty($CmpdLines[0])) {
 109           $MolNameDataField = $OptionsInfo{MolNameDataField};
 110           if ($OptionsInfo{UseDataFieldForMolName} && exists($DataFieldAndValues{$MolNameDataField})) {
 111             $MolName = $DataFieldAndValues{$MolNameDataField};
 112             if (length($MolName) > 80) {
 113               $MolName = substr($MolName, 0, 80);
 114             }
 115           }
 116           else {
 117             $MolName = "$OptionsInfo{MolNamePrefix}${CmpdCount}";
 118           }
 119           $CmpdLines[0] = $MolName;
 120           $CmpdString = join "\n", @CmpdLines;
 121         }
 122       }
 123       if (!$OptionsInfo{ModifyDataFields}) {
 124         # Just write the data and get the next compound...
 125         print NEWSDFILE "$CmpdString\n";
 126         next COMPOUND;
 127       }
 128       # Write out the structure data now and handle the old data fields later...
 129       ($CmpdData) = split /\n>/, $CmpdString;
 130       print NEWSDFILE "$CmpdData\n";
 131 
 132       # Modify specified data fields...
 133       for $NewSDField (sort keys %{$OptionsInfo{SpecifiedNewToOldSDFieldMap}}) {
 134         $FieldValues = "";
 135         for $OldSDField (@{$OptionsInfo{SpecifiedNewToOldSDFieldMap}{$NewSDField}}) {
 136           if (exists($DataFieldAndValues{$OldSDField}) && length($DataFieldAndValues{$OldSDField})) {
 137             $Value = $DataFieldAndValues{$OldSDField};
 138             $FieldValues .= ($FieldValues) ? "\n$Value" : $Value;
 139           }
 140         }
 141         print NEWSDFILE "> <$NewSDField>\n$FieldValues\n\n";
 142       }
 143       # Add specified common fields...
 144       for $CommonSDField (sort keys %{$OptionsInfo{SpecifiedCommonFieldMap}}) {
 145         $Value = $OptionsInfo{SpecifiedCommonFieldMap}{$CommonSDField};
 146         print NEWSDFILE "> <$CommonSDField>\n$Value\n\n";
 147       }
 148       if ($OptionsInfo{CreateDataFieldURL}) {
 149         $Value = "";
 150         $URLCmpdIdFieldName = $OptionsInfo{URLCmpdIdFieldName};
 151         if (exists($DataFieldAndValues{$URLCmpdIdFieldName}) && length($DataFieldAndValues{$URLCmpdIdFieldName})) {
 152           $Value = $DataFieldAndValues{$URLCmpdIdFieldName};
 153           $Value = "$OptionsInfo{URLCGIScriptName}?$OptionsInfo{URLParamName}=${Value}";
 154         }
 155         print NEWSDFILE "> <$OptionsInfo{URLDataFieldLabel}>\n$Value\n\n";
 156       }
 157 
 158       # Handle old data fields and write 'em in the same order as they appear in the input
 159       # files...
 160       if ($OptionsInfo{KeepAllOldDataFields} || $OptionsInfo{KeepUnMappedOldDataFields}) {
 161         my($KeepLabel);
 162         @DataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines);
 163         LABEL: for $Label (@DataFieldLabels) {
 164           $KeepLabel = $OptionsInfo{KeepAllOldDataFields} ? 1 : ( exists($OptionsInfo{SpecifiedOldToNewSDFieldMap}{$Label}) ? 0 : 1  );
 165           if (!$KeepLabel) {
 166             next LABEL;
 167           }
 168           $Value = $DataFieldAndValues{$Label};
 169           print NEWSDFILE "> <$Label>\n$Value\n\n";
 170         }
 171       }
 172 
 173       print NEWSDFILE "\$\$\$\$\n";
 174   }
 175   close NEWSDFILE;
 176   close SDFILE;
 177 }
 178 
 179 # Process option values...
 180 sub ProcessOptions {
 181   %OptionsInfo = ();
 182 
 183   $OptionsInfo{Mode} = $Options{mode};
 184 
 185   $OptionsInfo{ModifyMolName} = 1; $OptionsInfo{ModifyDataFields} = 0;
 186   if ($Options{mode} =~ /^both$/i) {
 187     $OptionsInfo{ModifyMolName} = 1; $OptionsInfo{ModifyDataFields} = 1;
 188   }
 189   elsif ($Options{mode} =~ /^datafields$/i) {
 190     $OptionsInfo{ModifyMolName} = 0; $OptionsInfo{ModifyDataFields} = 1;
 191   }
 192 
 193   $OptionsInfo{KeepOldDataFields} = $Options{keepolddatafields};
 194   $OptionsInfo{KeepAllOldDataFields} = ($Options{keepolddatafields} =~ /^all$/i) ? 1 : 0;
 195   $OptionsInfo{KeepUnMappedOldDataFields} = ($Options{keepolddatafields} =~ /^unmappedonly$/i) ? 1 : 0;
 196 
 197   $OptionsInfo{MolNameMode} = $Options{molnamemode};
 198   $OptionsInfo{UseDataFieldForMolName} = ($Options{molnamemode} =~ /^datafield$/i) ? 1 : 0;
 199 
 200   $OptionsInfo{MolName} = $Options{molname};
 201   $OptionsInfo{MolNameDataField} = ""; $OptionsInfo{MolNamePrefix} = "Cmpd";
 202   if ($Options{molname}) {
 203     if ($OptionsInfo{UseDataFieldForMolName}) {
 204       $OptionsInfo{MolNameDataField} = $Options{molname};
 205     }
 206     else {
 207       $OptionsInfo{MolNamePrefix} = $Options{molname};
 208     }
 209   }
 210 
 211   $OptionsInfo{MolNameReplace} = $Options{molnamereplace};
 212   $OptionsInfo{AlwaysReplaceMolName} = ($Options{molnamereplace} =~ /^always$/i) ? 1 : 0;
 213 
 214   if ($Options{datafieldsmap} && $Options{datafieldsmapfile}) {
 215     die "Error: Both \"--datafieldsmap\" and  \"--datafieldsmapfile\" options specified: only one is allowed at a time\n";
 216   }
 217 
 218   $OptionsInfo{DataFieldsMap} = $Options{datafieldsmap} ? $Options{datafieldsmap} : '';
 219   $OptionsInfo{DataFieldsMapFile} = $Options{datafieldsmapfile} ? $Options{datafieldsmapfile} : '';
 220 
 221   my($SpecifiedDataFieldMap);
 222 
 223   %{$OptionsInfo{SpecifiedNewToOldSDFieldMap}} = ();
 224   %{$OptionsInfo{SpecifiedOldToNewSDFieldMap}} = ();
 225 
 226   $SpecifiedDataFieldMap = "";
 227   if ($Options{datafieldsmap}) {
 228     $SpecifiedDataFieldMap = $Options{datafieldsmap};
 229   }
 230   elsif ($Options{datafieldsmapfile}) {
 231     my($Line, @LineWords);
 232     open DATAFIELDSFILE, "$Options{datafieldsmapfile}" or die "Couldn't  open $Options{datafieldsmapfile}: $! \n";
 233     while ($Line = GetTextLine(\*DATAFIELDSFILE)) {
 234       @LineWords = quotewords(";", 0, $Line);
 235       $SpecifiedDataFieldMap .= JoinWords(\@LineWords, ";", 0);
 236     }
 237     close DATAFIELDSFILE;
 238   }
 239 
 240   if ($SpecifiedDataFieldMap) {
 241     my($DataFieldMap, $DataField, $NewSDField, @OldSDFields, @DataFieldMapSplit, @DataFieldsSplit, $FirstField);
 242     @DataFieldMapSplit = split ";", $SpecifiedDataFieldMap;
 243     for $DataFieldMap (@DataFieldMapSplit) {
 244       @DataFieldsSplit = split ",", $DataFieldMap;
 245       if (@DataFieldsSplit == 1) {
 246         die "Error: Invalid number of comma delimited values, ", scalar(@DataFieldsSplit), ", specified,  @DataFieldsSplit, using \"--datafieldsmap or --datafieldsmapfile\" option: it must contain more than one value.\n";
 247       }
 248       $FirstField = 1;
 249       @OldSDFields = ();
 250       for $DataField (@DataFieldsSplit) {
 251         if (!(defined($DataField) && length($DataField))) {
 252           die "Error: One of the comma delimited values, \"", join(",", @DataFieldsSplit), "\", specified using \"--datafieldsmap or --datafieldsmapfile\" option is empty.\n";
 253         }
 254         if ($FirstField) {
 255           $FirstField = 0;
 256           $NewSDField = $DataField;
 257         }
 258         else {
 259           push @OldSDFields, $DataField;
 260         }
 261       }
 262       # Make sure a datafield is only specified once...
 263       if (exists $OptionsInfo{SpecifiedNewToOldSDFieldMap}{$NewSDField}) {
 264         die "Error: New data field, $NewSDField, specified more than once using \"--datafieldsmap or --datafieldsmapfile\" option.\n";
 265       }
 266       @{$OptionsInfo{SpecifiedNewToOldSDFieldMap}{$NewSDField}} = ();
 267       push @{$OptionsInfo{SpecifiedNewToOldSDFieldMap}{$NewSDField}}, @OldSDFields;
 268       for $DataField (@OldSDFields) {
 269         if (exists $OptionsInfo{SpecifiedOldToNewSDFieldMap}{$DataField} ) {
 270           die "Error: SD field, $DataField, specified more than once using \"--datafieldsmap or --datafieldsmapfile\" option.\n";
 271         }
 272         else {
 273           $OptionsInfo{SpecifiedOldToNewSDFieldMap}{$DataField} = $NewSDField;
 274         }
 275       }
 276 
 277     }
 278   }
 279 
 280   $OptionsInfo{DataFieldsCommon} = $Options{datafieldscommon} ? $Options{datafieldscommon} : '';
 281   %{$OptionsInfo{SpecifiedCommonFieldMap}} = ();
 282 
 283   if ($Options{datafieldscommon}) {
 284     my($DataFieldName, $DataFieldValue, $Index, @CommonDataFieldsSplit);
 285     @CommonDataFieldsSplit = split ",", $Options{datafieldscommon};
 286     if (@CommonDataFieldsSplit % 2) {
 287         die "Error: Invalid number of comma delimited values, ", scalar(@CommonDataFieldsSplit), ", specified \"",  join(",", @CommonDataFieldsSplit), "\" using \"--datafieldscommon\" option: it must contain even number of values.\n";
 288     }
 289     for ($Index = 0; $Index < @CommonDataFieldsSplit; $Index += 2) {
 290       $DataFieldName = $CommonDataFieldsSplit[$Index];
 291       $DataFieldValue = $CommonDataFieldsSplit[$Index + 1];
 292       if (exists $OptionsInfo{SpecifiedCommonFieldMap}{$DataFieldName}) {
 293         die "Error: Common data field, $DataFieldName, specified more than once using \"--datafieldscommon\" option.\n";
 294       }
 295       if (exists($OptionsInfo{SpecifiedNewToOldSDFieldMap}{$DataFieldName}) || exists($OptionsInfo{SpecifiedOldToNewSDFieldMap}{$DataFieldName})) {
 296         die "Error: Common data field, $DataFieldName, specified using \"--datafieldscommon\" option cannot be specified in \"--datafieldsmap or --datafieldsmapfile\" option.\n";
 297       }
 298       $OptionsInfo{SpecifiedCommonFieldMap}{$DataFieldName} = $DataFieldValue;
 299     }
 300   }
 301 
 302   $OptionsInfo{DataFieldURL} = $Options{datafieldurl} ? $Options{datafieldurl} : '';
 303   $OptionsInfo{CreateDataFieldURL} = (exists($Options{datafieldurl}) && length($Options{datafieldurl}) ) ? 1 : 0;
 304 
 305   $OptionsInfo{URLDataFieldLabel} = ""; $OptionsInfo{URLCGIScriptName} = "";
 306   $OptionsInfo{URLParamName} = ""; $OptionsInfo{URLCmpdIdFieldName} = "";
 307 
 308   if ($OptionsInfo{CreateDataFieldURL}) {
 309     my(@DataFieldURLSplit, $Value);
 310     @DataFieldURLSplit = split ",", $Options{datafieldurl};
 311     if (@DataFieldURLSplit != 4) {
 312       die "Error: Invalid number of values, ", scalar(@DataFieldURLSplit), ", specified using \"--datafieldURL\" option: it must contain 4 values.\n";
 313     }
 314     for $Value (@DataFieldURLSplit) {
 315       if (!IsNotEmpty($Value)) {
 316         die "Error: One of the values, $Options{datafieldurl}, specified using \"--datafieldURL\" option is empty.\n";
 317       }
 318     }
 319     $OptionsInfo{URLDataFieldLabel} = $DataFieldURLSplit[0];
 320     $OptionsInfo{URLCGIScriptName} = $DataFieldURLSplit[1];
 321     $OptionsInfo{URLParamName}  = $DataFieldURLSplit[2];
 322     $OptionsInfo{URLCmpdIdFieldName} = $DataFieldURLSplit[3];
 323   }
 324 
 325 }
 326 
 327 # Retrieve information about input SD files...
 328 sub RetrieveSDFilesInfo {
 329   my($Index, $SDFile, $FileDir, $FileName, $FileExt, $OutFileRoot,  $OutFile, $DataFieldName);
 330 
 331   %SDFilesInfo = ();
 332   @{$SDFilesInfo{FileOkay}} = ();
 333   @{$SDFilesInfo{OutFile}} = ();
 334 
 335    FILELIST: for $Index (0 .. $#SDFilesList) {
 336     $SDFile = $SDFilesList[$Index];
 337 
 338     $SDFilesInfo{FileOkay}[$Index] = 0;
 339     $SDFilesInfo{OutFile}[$Index] = '';
 340 
 341     if (!(-e $SDFile)) {
 342       warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
 343       next FILELIST;
 344     }
 345     if (!CheckFileType($SDFile, "sd sdf")) {
 346       warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
 347       next FILELIST;
 348     }
 349     $FileDir = ""; $FileName = ""; $FileExt = "";
 350     ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
 351     if ($Options{root} && (@SDFilesList == 1)) {
 352       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
 353       if ($RootFileName && $RootFileExt) {
 354         $FileName = $RootFileName;
 355       }
 356       else {
 357         $FileName = $Options{root};
 358       }
 359       $OutFileRoot = $FileName;
 360     }
 361     else {
 362       $OutFileRoot = $FileName . "ModifiedDataFields";
 363     }
 364 
 365     $OutFile = $OutFileRoot . ".$FileExt";
 366     if (lc($OutFile) eq lc($SDFile)) {
 367       warn "Warning: Ignoring file $SDFile:Output file name, $OutFile, is same as input SD file name, $SDFile\n";
 368       next FILELIST;
 369     }
 370     if (!$Options{overwrite}) {
 371       if (-e $OutFile) {
 372         warn "Warning: Ignoring file $SDFile: The file $OutFile already exists\n";
 373         next FILELIST;
 374       }
 375     }
 376 
 377     $SDFilesInfo{FileOkay}[$Index] = 1;
 378     $SDFilesInfo{OutFile}[$Index] = $OutFile;
 379   }
 380 }
 381 
 382 # Setup script usage  and retrieve command line arguments specified using various options...
 383 sub SetupScriptUsage {
 384 
 385   # Retrieve all the options...
 386   %Options = ();
 387   $Options{detail} = 1;
 388   $Options{keepolddatafields} = "none";
 389   $Options{mode} = "molname";
 390   $Options{molnamemode} = "labelprefix";
 391   $Options{molnamereplace} = "empty";
 392 
 393   if (!GetOptions(\%Options, "detail|d=i", "datafieldscommon=s", "datafieldsmap=s", "datafieldsmapfile=s", "datafieldurl=s", "help|h", "keepolddatafields|k=s", "mode|m=s", "molname=s", "molnamemode=s", "molnamereplace=s", "overwrite|o", "root|r=s", "workingdir|w=s")) {
 394     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 395   }
 396   if ($Options{workingdir}) {
 397     if (! -d $Options{workingdir}) {
 398       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 399     }
 400     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 401   }
 402   if ($Options{keepolddatafields} !~ /^(all|unmappedonly|none)$/i) {
 403     die "Error: The value specified, $Options{keepolddatafields}, for option \"-k --keepolddatafields\" is not valid. Allowed values: all, unmappedonly, or none\n";
 404   }
 405   if ($Options{mode} !~ /^(molname|datafields|both)$/i) {
 406     die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: molname, datafields, or both\n";
 407   }
 408   if ($Options{molnamemode} !~ /^(datafield|labelprefix)$/i) {
 409     die "Error: The value specified, $Options{molnamemode}, for option \"--molnamemode\" is not valid. Allowed values: datafield or labelprefix\n";
 410   }
 411   if ($Options{molnamereplace} !~ /^(always|empty)$/i) {
 412     die "Error: The value specified, $Options{molnamereplace}, for option \"--molnamereplace\" is not valid. Allowed values: always or empty\n";
 413   }
 414   if (!IsPositiveInteger($Options{detail})) {
 415     die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n";
 416   }
 417 }
 418