MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: MolFilesToSD.pl,v $
   4 # $Date: 2015/02/28 20:46:20 $
   5 # $Revision: 1.38 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2015 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use strict;
  30 use FindBin; use lib "$FindBin::Bin/../lib";
  31 use Getopt::Long;
  32 use File::Basename;
  33 use Text::ParseWords;
  34 use Benchmark;
  35 use SDFileUtil;
  36 use FileUtil;
  37 use TextUtil;
  38 
  39 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  40 
  41 # Autoflush STDOUT
  42 $| = 1;
  43 
  44 # Starting message...
  45 $ScriptName = basename $0;
  46 print "\n$ScriptName:Starting...\n\n";
  47 $StartTime = new Benchmark;
  48 
  49 # Get the options and setup script...
  50 SetupScriptUsage();
  51 if ($Options{help} || @ARGV < 1) {
  52   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  53 }
  54 
  55 my(@MOLFilesList);
  56 @MOLFilesList = ExpandFileNames(\@ARGV, "mol");
  57 
  58 # Process options...
  59 print "Processing options...\n";
  60 my(%OptionsInfo);
  61 ProcessOptions();
  62 
  63 print "Generating SD file $OptionsInfo{SDFile}...\n";
  64 GenerateSDFile();
  65 
  66 print "\n$ScriptName:Done...\n\n";
  67 
  68 $EndTime = new Benchmark;
  69 $TotalTime = timediff ($EndTime, $StartTime);
  70 print "Total time: ", timestr($TotalTime), "\n";
  71 
  72 ###############################################################################
  73 
  74 # Generate a SD file using all valid MDL MOL files...
  75 sub GenerateSDFile {
  76   my($MOLFile, $Index, $FileCount, $FileOkayCount, $MolNameLine, $CmpdID, $FileDir, $FileName, $FileExt);
  77 
  78   open SDFILE, ">$OptionsInfo{SDFile}" or die "Error: Can't open $OptionsInfo{SDFile}: $! \n";
  79   $FileCount = 0;
  80   $FileOkayCount = 0;
  81 
  82   FILELIST: for $Index (0 .. $#MOLFilesList) {
  83     $MOLFile = $MOLFilesList[$Index];
  84     $FileCount++;
  85 
  86     print "Processing file $MOLFile...\n";
  87 
  88     if (!(-e $MOLFile)) {
  89       warn "Warning: Ignoring file $MOLFile: It doesn't exist\n";
  90       next FILELIST;
  91     }
  92 
  93     if (!CheckFileType($MOLFile, "mol")) {
  94       warn "Warning: Ignoring file $MOLFile: It's not a MDLMOL file\n";
  95       next FILELIST;
  96     }
  97 
  98     if (!open MOLFILE, "$MOLFile") {
  99       warn "Warning: Ignoring file $MOLFile: Couldn't open it: $! \n";
 100       next FILELIST;
 101     }
 102 
 103     $FileOkayCount++;
 104 
 105     if ($OptionsInfo{ModifyData}) {
 106       $MolNameLine = <MOLFILE>;
 107       if ($OptionsInfo{UseFilePrefix}) {
 108         ($FileDir, $FileName, $FileExt) = ParseFileName($MOLFile);
 109         $CmpdID = $FileName;
 110       }
 111       else {
 112         $CmpdID = $OptionsInfo{CompoundID} . "$FileCount";
 113       }
 114 
 115       if ($OptionsInfo{AddMolNameLine}) {
 116         print SDFILE "$CmpdID\n";
 117       }
 118       else {
 119         $MolNameLine =~ s/(\r\n)|(\r)/\n/g;
 120         print SDFILE $MolNameLine;
 121       }
 122 
 123       while (<MOLFILE>) {
 124         s/(\r\n)|(\r)/\n/g;
 125         print SDFILE;
 126       }
 127 
 128       if ($OptionsInfo{AddDataField}) {
 129         print SDFILE ">  <$OptionsInfo{DataFieldLabel}>\n${CmpdID}\n";
 130       }
 131     }
 132     else {
 133       while (<MOLFILE>) {
 134         s/(\r\n)|(\r)/\n/g;
 135         print SDFILE;
 136       }
 137     }
 138     print SDFILE "\n\$\$\$\$\n";
 139     close MOLFILE;
 140   }
 141   close SDFILE;
 142 
 143   print "\nNumber of files: $FileCount\n";
 144   print "Number of files processed successfully: $FileOkayCount\n";
 145   print "Number of files ignored: " . ($FileCount - $FileOkayCount) . "\n";
 146 }
 147 
 148 # Process option values...
 149 sub ProcessOptions {
 150   %OptionsInfo = ();
 151 
 152   $OptionsInfo{Mode} = $Options{mode};
 153 
 154   $OptionsInfo{CompoundID} = $Options{compoundid};
 155   $OptionsInfo{DataFieldLabel} = $Options{datafieldlabel};
 156 
 157   $OptionsInfo{Overwrite} = defined $Options{overwrite} ? $Options{overwrite} : undef;
 158   $OptionsInfo{OutFileRoot} = defined $Options{root} ? $Options{root} : undef;
 159 
 160   $OptionsInfo{AddMolNameLine} = ($Options{mode} =~ /^(molnameline|both)$/i) ? 1 : 0;
 161   $OptionsInfo{AddDataField} = ($Options{mode} =~ /^(datafield|both)$/i) ? 1 : 0;
 162 
 163   $OptionsInfo{AddMolNameLine} = ($Options{mode} =~ /^(molnameline|both)$/i) ? 1 : 0;
 164   $OptionsInfo{AddDataField} = ($Options{mode} =~ /^(datafield|both)$/i) ? 1 : 0;
 165 
 166   $OptionsInfo{ModifyData} = ($OptionsInfo{AddMolNameLine} || $OptionsInfo{AddDataField}) ? 1 : 0;
 167 
 168   $OptionsInfo{UseFilePrefix} = ($Options{compoundid} =~ /^usefileprefix$/i) ? 1 : 0;
 169 
 170   # Setup SD file name...
 171   my($FileDir, $FileName, $FileExt, $SDFile);
 172   if ($Options{root}) {
 173     $FileDir = ""; $FileName = ""; $FileExt = "";
 174     ($FileDir, $FileName, $FileExt) = ParseFileName($Options{root});
 175     if ($FileName && $FileExt) {
 176       $SDFile = $FileName;
 177     }
 178     else {
 179       $SDFile =  $Options{root};
 180     }
 181     $SDFile .=  ".sdf";
 182   }
 183   else {
 184     $FileDir = ""; $FileName = ""; $FileExt = "";
 185     ($FileDir, $FileName, $FileExt) = ParseFileName($MOLFilesList[0]);
 186     $SDFile = $FileName . "1To" . @MOLFilesList . ".sdf";
 187   }
 188 
 189   if (!$Options{overwrite}) {
 190     if (-e $SDFile) {
 191       die "Error: The file $SDFile already exists.\n";
 192     }
 193   }
 194   $OptionsInfo{SDFile} = $SDFile;
 195 
 196 }
 197 
 198 # Setup script usage  and retrieve command line arguments specified using various options...
 199 sub SetupScriptUsage {
 200 
 201   # Retrieve all the options...
 202   %Options = ();
 203   $Options{compoundid} = "Cmpd";
 204   $Options{datafieldlabel} = "Cmpd_ID";
 205   $Options{mode} = "none";
 206 
 207   if (!GetOptions(\%Options, "compoundid|c=s", "datafieldlabel|d=s", "help|h", "mode|m=s", "overwrite|o", "root|r=s", "workingdir|w=s")) {
 208     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 209   }
 210   if ($Options{workingdir}) {
 211     if (! -d $Options{workingdir}) {
 212       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 213     }
 214     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 215   }
 216   if ($Options{mode} !~ /^(molnameline|datafield|both|none)$/i ) {
 217     die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: molnameline, datafield, both, or none\n";
 218   }
 219 }
 220