MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: SDToMolFiles.pl,v $
   4 # $Date: 2015/02/28 20:46:20 $
   5 # $Revision: 1.35 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2015 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
   19 # any warranty; without even the implied warranty of merchantability or fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use strict;
  30 use FindBin; use lib "$FindBin::Bin/../lib";
  31 use Getopt::Long;
  32 use File::Basename;
  33 use Text::ParseWords;
  34 use Benchmark;
  35 use SDFileUtil;
  36 use FileUtil;
  37 
  38 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  39 
  40 # Autoflush STDOUT
  41 $| = 1;
  42 
  43 # Starting message...
  44 $ScriptName = basename $0;
  45 print "\n$ScriptName:Starting...\n\n";
  46 $StartTime = new Benchmark;
  47 
  48 # Get the options and setup script...
  49 SetupScriptUsage();
  50 if ($Options{help} || @ARGV < 1) {
  51   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  52 }
  53 
  54 my(@SDFilesList);
  55 @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");
  56 
  57 # Process options...
  58 print "Processing options...\n";
  59 my(%OptionsInfo);
  60 ProcessOptions();
  61 
  62 # Setup information about input files...
  63 print "Checking input SD file(s)...\n";
  64 my(%SDFilesInfo);
  65 RetrieveSDFilesInfo();
  66 
  67 # Process input files..
  68 my($FileIndex);
  69 if (@SDFilesList > 1) {
  70   print "\nProcessing SD files...\n";
  71 }
  72 for $FileIndex (0 .. $#SDFilesList) {
  73   if ($SDFilesInfo{FileOkay}[$FileIndex]) {
  74     print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  75     GenerateMolFiles($FileIndex);
  76   }
  77 }
  78 print "\n$ScriptName:Done...\n\n";
  79 
  80 $EndTime = new Benchmark;
  81 $TotalTime = timediff ($EndTime, $StartTime);
  82 print "Total time: ", timestr($TotalTime), "\n";
  83 
  84 ###############################################################################
  85 
  86 # Generate MOL files for a SD file...
  87 #
  88 sub GenerateMolFiles {
  89   my($FileIndex) = @_;
  90   my($SDFile, $MOLFile, $MOLFileRoot, $OutFileRoot, $OverwriteFiles, $UseDataField, $DataFieldName, $UseMolName, $CmpdCount, $MolEndDelimiter, $CmpdString, @CmpdLines, %DataFieldValues);
  91 
  92   $SDFile = $SDFilesList[$FileIndex];
  93 
  94   if (!open SDFILE, "$SDFile") {
  95     warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
  96     return;
  97   }
  98 
  99   $CmpdCount = 0;
 100   $MolEndDelimiter = "M  END";
 101 
 102   $OutFileRoot = $SDFilesInfo{OutFileRoot}[$FileIndex];
 103   $OverwriteFiles = $OptionsInfo{OverwriteFiles};
 104 
 105   $UseDataField = ($OptionsInfo{Mode} =~ /^DataField$/i) ? 1 : 0;
 106   $DataFieldName = $OptionsInfo{DataField};
 107 
 108   $UseMolName = ($OptionsInfo{Mode} =~ /^MolName$/i) ? 1 : 0;
 109 
 110   CMPDSTRING: while ($CmpdString = ReadCmpdString(\*SDFILE)) {
 111     $CmpdCount++;
 112 
 113     # Setup MOL file name...
 114     $MOLFileRoot = '';
 115     if ($UseDataField) {
 116       @CmpdLines = split "\n", $CmpdString;
 117       %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 118       if (exists $DataFieldValues{$DataFieldName}) {
 119         $MOLFileRoot = $DataFieldValues{$DataFieldName};
 120       }
 121     }
 122     elsif ($UseMolName) {
 123       @CmpdLines = split "\n", $CmpdString;
 124       $MOLFileRoot = $CmpdLines[0];
 125     }
 126 
 127     # Check for any invalid file name characters in data field or molname values...
 128     if ($MOLFileRoot && $MOLFileRoot =~ /[^a-zA-Z0-9_]/) {
 129       $MOLFileRoot =~ s/[^a-zA-Z0-9_]//g;
 130     }
 131     # Fall back plan for MOL file name...
 132     if (!$MOLFileRoot) {
 133       $MOLFileRoot = "${OutFileRoot}Cmpd${CmpdCount}";
 134     }
 135 
 136     $MOLFile = "${MOLFileRoot}.mol";
 137 
 138     if (!$OverwriteFiles) {
 139       if (-e $MOLFile) {
 140         warn "Warning: Ignoring compound number, $CmpdCount, in $SDFile: New MOL file, $MOLFile, already exists\n";
 141         next CMPDSTRING;
 142       }
 143     }
 144 
 145     if (!($CmpdString =~ /$MolEndDelimiter/)) {
 146       warn "Warning: Ignoring compound number, $CmpdCount, in $SDFile: Invalid compound data\n";
 147       next CMPDSTRING;
 148     }
 149 
 150     # Write out MOL file...
 151 
 152     print "Generating $MOLFile file...\n";
 153     open MOLFILE, ">$MOLFile" or die "Error: Can't open $MOLFile: $! \n";
 154     ($CmpdString) = split "$MolEndDelimiter", $CmpdString;
 155     print MOLFILE "$CmpdString";
 156     print MOLFILE "$MolEndDelimiter\n";
 157     close MOLFILE;
 158 
 159   }
 160 
 161   close SDFILE;
 162 }
 163 
 164 # Retrieve information about SD files...
 165 #
 166 sub RetrieveSDFilesInfo {
 167   my($SDFile, $Index, $FileDir, $FileName, $FileExt, $OutFileRoot);
 168 
 169   %SDFilesInfo = ();
 170   @{$SDFilesInfo{FileOkay}} = ();
 171   @{$SDFilesInfo{OutFileRoot}} = ();
 172 
 173   FILELIST: for $Index (0 .. $#SDFilesList) {
 174     $SDFile = $SDFilesList[$Index];
 175 
 176     $SDFilesInfo{FileOkay}[$Index] = 0;
 177     $SDFilesInfo{OutFileRoot}[$Index] = '';
 178 
 179     $SDFile = $SDFilesList[$Index];
 180     if (!(-e $SDFile)) {
 181       warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
 182       next FILELIST;
 183     }
 184     if (!CheckFileType($SDFile, "sd sdf")) {
 185       warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
 186       next FILELIST;
 187     }
 188 
 189     # Setup output file root...
 190     $FileDir = ""; $FileName = ""; $FileExt = "";
 191     ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
 192 
 193     if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
 194       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
 195       if ($RootFileName && $RootFileExt) {
 196         $FileName = $RootFileName;
 197       }
 198       else {
 199         $FileName = $OptionsInfo{OutFileRoot};
 200       }
 201       $OutFileRoot = $FileName;
 202     }
 203     else {
 204       $OutFileRoot = "$FileName";
 205     }
 206 
 207     $SDFilesInfo{FileOkay}[$Index] = 1;
 208     $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
 209   }
 210 }
 211 
 212 # Process option values...
 213 sub ProcessOptions {
 214   %OptionsInfo = ();
 215 
 216   $OptionsInfo{Mode} = $Options{mode};
 217 
 218   $OptionsInfo{DataField} = '';
 219   if ($Options{mode} =~ /^DataField$/i) {
 220     if (!$Options{datafield}) {
 221       die "Error: You must specify a value for \"-d, --DataField\" option in \"DataField\" \"-m, --mode\". \n";
 222     }
 223     $OptionsInfo{DataField} = $Options{datafield};
 224   }
 225 
 226   $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
 227 
 228   $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;
 229 }
 230 
 231 # Setup script usage  and retrieve command line arguments specified using various options...
 232 sub SetupScriptUsage {
 233 
 234   # Retrieve all the options...
 235   %Options = ();
 236 
 237   $Options{mode} = 'RootPrefix';
 238 
 239   if (!GetOptions(\%Options, "datafield|d=s", "help|h", "mode|m=s", "overwrite|o", "root|r=s", "workingdir|w=s")) {
 240     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 241   }
 242   if ($Options{workingdir}) {
 243     if (! -d $Options{workingdir}) {
 244       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 245     }
 246     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 247   }
 248 
 249   if ($Options{mode} !~ /^(DataField|MolName|RootPrefix)$/i) {
 250     die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: DataField, MolName, RootPrefix\n";
 251   }
 252 }
 253