MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: FilterSDFiles.pl,v $
   4 # $Date: 2015/02/28 20:46:20 $
   5 # $Revision: 1.32 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2015 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use strict;
  30 use FindBin; use lib "$FindBin::Bin/../lib";
  31 use Getopt::Long;
  32 use File::Basename;
  33 use Benchmark;
  34 use SDFileUtil;
  35 use FileUtil;
  36 
  37 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  38 
  39 # Autoflush STDOUT
  40 $| = 1;
  41 
  42 # Starting message...
  43 $ScriptName = basename $0;
  44 print "\n$ScriptName:Starting...\n\n";
  45 $StartTime = new Benchmark;
  46 
  47 # Get the options and setup script...
  48 SetupScriptUsage();
  49 if ($Options{help} || @ARGV < 1) {
  50   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  51 }
  52 
  53 my(@SDFilesList);
  54 @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");
  55 
  56 # Process options...
  57 print "Processing options...\n";
  58 my(%OptionsInfo);
  59 ProcessOptions();
  60 
  61 print "Checking input SD file(s)...\n";
  62 my(%SDFilesInfo);
  63 RetrieveSDFilesInfo();
  64 
  65 # Generate output files...
  66 my($FileIndex, %FilteredSDFileInfo);
  67 if (@SDFilesList > 1) {
  68   print "\nProcessing SD files...\n";
  69 }
  70 for $FileIndex (0 .. $#SDFilesList) {
  71   if ($SDFilesInfo{FileOkay}[$FileIndex]) {
  72     print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  73     FilterSDFile($FileIndex);
  74   }
  75 }
  76 print "\n$ScriptName:Done...\n\n";
  77 
  78 $EndTime = new Benchmark;
  79 $TotalTime = timediff ($EndTime, $StartTime);
  80 print "Total time: ", timestr($TotalTime), "\n";
  81 
  82 ###############################################################################
  83 
  84 # Filter SD file...
  85 sub FilterSDFile {
  86   my($Index) = @_;
  87   my($SDFile, $NewSDFile, $NewKeepSDFile, $CtabLinesCount, $CmpdString, $PrintCmpdCounterHeader, @CmpdLines);
  88 
  89   $SDFile = $SDFilesList[$Index];
  90   $NewSDFile = $SDFilesInfo{OutFile}[$Index];
  91   $NewKeepSDFile = $SDFilesInfo{OutFileKeep}[$Index];
  92 
  93   open NEWSDFILE, ">$NewSDFile" or die "Error: Couldn't open $NewSDFile: $! \n";
  94   if ($OptionsInfo{Keep}) {
  95     open NEWKEEPSDFILE, ">$NewKeepSDFile" or die "Error: Couldn't open $NewKeepSDFile: $! \n";
  96   }
  97   open SDFILE, "$SDFile" or die "Error: Can't open $SDFile: $! \n";
  98 
  99   print "\nGenerating SD file $NewSDFile...\n";
 100   if ($OptionsInfo{Keep}) {
 101     print "Generating file $NewKeepSDFile...\n";
 102   }
 103 
 104   %FilteredSDFileInfo = ();
 105 
 106   $FilteredSDFileInfo{CmpdCount} = 0; $FilteredSDFileInfo{FilterCmpd} = 0;
 107   $FilteredSDFileInfo{FilteredCmpdCount} = 0; $FilteredSDFileInfo{KeepCmpdCount} = 0;
 108 
 109   $PrintCmpdCounterHeader = 1;
 110 
 111   CMPDSTRING: while ($CmpdString = ReadCmpdString(\*SDFILE)) {
 112     $FilteredSDFileInfo{CmpdCount} += 1;
 113     $FilteredSDFileInfo{FilterCmpd} = 0;
 114     if (($FilteredSDFileInfo{CmpdCount} % 5000) == 0) {
 115       if ($PrintCmpdCounterHeader) {
 116         $PrintCmpdCounterHeader = 0;
 117         print "\nProcessing compounds:";
 118       }
 119       print "$FilteredSDFileInfo{CmpdCount}...";
 120     }
 121     @CmpdLines = split "\n", $CmpdString;
 122     $CtabLinesCount = GetCtabLinesCount(\@CmpdLines);
 123     if ($CtabLinesCount <= 0) {
 124       $FilteredSDFileInfo{FilterCmpd} = 1;
 125       WriteOutCmpdString($CmpdString);
 126       next CMPDSTRING;
 127     }
 128     my ($AtomCount, $BondCount) = ParseCmpdCountsLine($CmpdLines[3]);
 129     if ($OptionsInfo{All} || $OptionsInfo{Mismatch}) {
 130       if ($CtabLinesCount != ($AtomCount + $BondCount)) {
 131         $FilteredSDFileInfo{FilterCmpd} = 1;
 132         WriteOutCmpdString($CmpdString);
 133         next CMPDSTRING;
 134       }
 135     }
 136     if ($CtabLinesCount == ($AtomCount + $BondCount)) {
 137       if ($OptionsInfo{All} || $OptionsInfo{UnknownAtoms}) {
 138         my($UnknownAtomCount, $UnknownAtoms, $UnknownAtomLines) = GetUnknownAtoms(\@CmpdLines);
 139         if ($UnknownAtomCount) {
 140           $FilteredSDFileInfo{FilterCmpd} = 1;
 141           WriteOutCmpdString($CmpdString);
 142           next CMPDSTRING;
 143         }
 144       }
 145       if ($OptionsInfo{All} || $OptionsInfo{CleanSalts} || $OptionsInfo{Salts}) {
 146         my ($FragmentsCount, $Fragments, $WashedCmpdString) = WashCmpd(\@CmpdLines);
 147         if ($FragmentsCount > 1) {
 148           if ($OptionsInfo{all} || $OptionsInfo{CleanSalts}) {
 149             $CmpdString = $WashedCmpdString;
 150           }
 151           else {
 152             $FilteredSDFileInfo{FilterCmpd} = 1;
 153           }
 154           WriteOutCmpdString($CmpdString);
 155           next CMPDSTRING;
 156         }
 157       }
 158     }
 159     WriteOutCmpdString($CmpdString);
 160   }
 161   if (!$PrintCmpdCounterHeader) {
 162     print "\n";
 163   }
 164 
 165   close NEWSDFILE;
 166   if ($OptionsInfo{Keep}) {
 167     close NEWKEEPSDFILE;
 168   }
 169   close SDFILE;
 170 
 171   print "\nTotal Number of compounds: $FilteredSDFileInfo{CmpdCount}\n";
 172   print "Number of compounds left after filtering: $FilteredSDFileInfo{FilteredCmpdCount}\n";
 173   print "Number of compounds ignored: $FilteredSDFileInfo{KeepCmpdCount}\n";
 174 }
 175 
 176 # Write out the compound data...
 177 sub WriteOutCmpdString {
 178   my($CmpdString) = @_;
 179 
 180   if ($FilteredSDFileInfo{FilterCmpd}) {
 181     $FilteredSDFileInfo{KeepCmpdCount} += 1;
 182     if ($OptionsInfo{Keep}) {
 183       print NEWKEEPSDFILE "$CmpdString\n";
 184     }
 185   }
 186   else {
 187     $FilteredSDFileInfo{FilteredCmpdCount} += 1;
 188     print NEWSDFILE "$CmpdString\n";
 189   }
 190 }
 191 
 192 # Retrieve information about input SD files...
 193 sub RetrieveSDFilesInfo {
 194   my($Index, $SDFile, $FileDir, $FileName, $FileExt, $NewSDFile, $NewKeepSDFile);
 195 
 196   %SDFilesInfo = ();
 197   @{$SDFilesInfo{FileOkay}} = ();
 198   @{$SDFilesInfo{OutFile}} = ();
 199   @{$SDFilesInfo{OutFileKeep}} = ();
 200 
 201    FILELIST: for $Index (0 .. $#SDFilesList) {
 202     $SDFile = $SDFilesList[$Index];
 203 
 204     $SDFilesInfo{FileOkay}[$Index] = 0;
 205     $SDFilesInfo{OutFile}[$Index] = '';
 206     $SDFilesInfo{OutFileKeep}[$Index] = '';
 207 
 208     if (!(-e $SDFile)) {
 209       warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
 210       next FILELIST;
 211     }
 212     if (!CheckFileType($SDFile, "sd sdf")) {
 213       warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
 214       next FILELIST;
 215     }
 216 
 217     # Setup new file names...
 218     $FileDir = ""; $FileName = ""; $FileExt = "";
 219     ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
 220     if ($Options{root} && (@SDFilesList == 1)) {
 221       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
 222       if ($RootFileName && $RootFileExt) {
 223         $NewSDFile = $RootFileName;
 224       }
 225       else {
 226         $NewSDFile = $Options{root};
 227       }
 228       $NewKeepSDFile = $NewSDFile;
 229     }
 230     else {
 231       $NewSDFile = $FileName . "Filtered";
 232       $NewKeepSDFile = $FileName;
 233     }
 234     $NewSDFile .= ".$FileExt";
 235     $NewKeepSDFile .= "Ignored" . ".$FileExt";
 236     if (!$Options{overwrite}) {
 237       if (-e $NewSDFile) {
 238         warn "Warning: Ignoring file $SDFile: New SD file, $NewSDFile, already exists\n";
 239         next FILELIST;
 240       }
 241       if ($Options{keep}) {
 242         if (-e $NewKeepSDFile) {
 243           warn "Warning: Ignoring file $SDFile: New SD file, $NewKeepSDFile, already exists\n";
 244           next FILELIST;
 245         }
 246       }
 247     }
 248     if (lc($NewSDFile) eq lc($SDFile)) {
 249       warn "Warning: Ignoring file $SDFile: Same output, $NewSDFile, and input file name\n";
 250       print "Specify a different name using \"-r --root\" option or use default name.\n";
 251       next FILELIST;
 252     }
 253 
 254     $SDFilesInfo{FileOkay}[$Index] = 1;
 255     $SDFilesInfo{OutFile}[$Index] = $NewSDFile;
 256     $SDFilesInfo{OutFileKeep}[$Index] = $NewKeepSDFile;
 257   }
 258 }
 259 
 260 # Process option values...
 261 sub ProcessOptions {
 262   %OptionsInfo = ();
 263 
 264   $OptionsInfo{All} = $Options{all} ? $Options{all} : undef;
 265   $OptionsInfo{CleanSalts} = $Options{cleansalts} ? $Options{cleansalts} : undef;
 266   $OptionsInfo{Empty} = $Options{empty} ? $Options{empty} : undef;
 267   $OptionsInfo{Keep} = $Options{keep} ? $Options{keep} : undef;
 268   $OptionsInfo{Mismatch} = $Options{mismatch} ? $Options{mismatch} : undef;
 269   $OptionsInfo{Overwrite} = $Options{overwrite} ? $Options{overwrite} : undef;
 270   $OptionsInfo{Salts} = $Options{salts} ? $Options{salts} : undef;
 271   $OptionsInfo{UnknownAtoms} = $Options{unknownatoms} ? $Options{unknownatoms} : undef;
 272 
 273 }
 274 
 275 # Setup script usage  and retrieve command line arguments specified using various options...
 276 sub SetupScriptUsage {
 277 
 278   # Retrieve all the options...
 279   %Options = ();
 280   if (!GetOptions(\%Options, "all|a", "cleansalts|c", "empty|e", "help|h", "keep|k", "mismatch|m", "overwrite|o", "root|r=s", "salts|s", "unknownatoms|u", "workingdir|w=s")) {
 281     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 282   }
 283   if ($Options{workingdir}) {
 284     if (! -d $Options{workingdir}) {
 285       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 286     }
 287     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 288   }
 289 }
 290