1 #!/usr/bin/perl -w 2 # 3 # $RCSfile: FilterSDFiles.pl,v $ 4 # $Date: 2015/02/28 20:46:20 $ 5 # $Revision: 1.32 $ 6 # 7 # Author: Manish Sud <msud@san.rr.com> 8 # 9 # Copyright (C) 2015 Manish Sud. All rights reserved. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 use strict; 30 use FindBin; use lib "$FindBin::Bin/../lib"; 31 use Getopt::Long; 32 use File::Basename; 33 use Benchmark; 34 use SDFileUtil; 35 use FileUtil; 36 37 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime); 38 39 # Autoflush STDOUT 40 $| = 1; 41 42 # Starting message... 43 $ScriptName = basename $0; 44 print "\n$ScriptName:Starting...\n\n"; 45 $StartTime = new Benchmark; 46 47 # Get the options and setup script... 48 SetupScriptUsage(); 49 if ($Options{help} || @ARGV < 1) { 50 die GetUsageFromPod("$FindBin::Bin/$ScriptName"); 51 } 52 53 my(@SDFilesList); 54 @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd"); 55 56 # Process options... 57 print "Processing options...\n"; 58 my(%OptionsInfo); 59 ProcessOptions(); 60 61 print "Checking input SD file(s)...\n"; 62 my(%SDFilesInfo); 63 RetrieveSDFilesInfo(); 64 65 # Generate output files... 66 my($FileIndex, %FilteredSDFileInfo); 67 if (@SDFilesList > 1) { 68 print "\nProcessing SD files...\n"; 69 } 70 for $FileIndex (0 .. $#SDFilesList) { 71 if ($SDFilesInfo{FileOkay}[$FileIndex]) { 72 print "\nProcessing file $SDFilesList[$FileIndex]...\n"; 73 FilterSDFile($FileIndex); 74 } 75 } 76 print "\n$ScriptName:Done...\n\n"; 77 78 $EndTime = new Benchmark; 79 $TotalTime = timediff ($EndTime, $StartTime); 80 print "Total time: ", timestr($TotalTime), "\n"; 81 82 ############################################################################### 83 84 # Filter SD file... 85 sub FilterSDFile { 86 my($Index) = @_; 87 my($SDFile, $NewSDFile, $NewKeepSDFile, $CtabLinesCount, $CmpdString, $PrintCmpdCounterHeader, @CmpdLines); 88 89 $SDFile = $SDFilesList[$Index]; 90 $NewSDFile = $SDFilesInfo{OutFile}[$Index]; 91 $NewKeepSDFile = $SDFilesInfo{OutFileKeep}[$Index]; 92 93 open NEWSDFILE, ">$NewSDFile" or die "Error: Couldn't open $NewSDFile: $! \n"; 94 if ($OptionsInfo{Keep}) { 95 open NEWKEEPSDFILE, ">$NewKeepSDFile" or die "Error: Couldn't open $NewKeepSDFile: $! \n"; 96 } 97 open SDFILE, "$SDFile" or die "Error: Can't open $SDFile: $! \n"; 98 99 print "\nGenerating SD file $NewSDFile...\n"; 100 if ($OptionsInfo{Keep}) { 101 print "Generating file $NewKeepSDFile...\n"; 102 } 103 104 %FilteredSDFileInfo = (); 105 106 $FilteredSDFileInfo{CmpdCount} = 0; $FilteredSDFileInfo{FilterCmpd} = 0; 107 $FilteredSDFileInfo{FilteredCmpdCount} = 0; $FilteredSDFileInfo{KeepCmpdCount} = 0; 108 109 $PrintCmpdCounterHeader = 1; 110 111 CMPDSTRING: while ($CmpdString = ReadCmpdString(\*SDFILE)) { 112 $FilteredSDFileInfo{CmpdCount} += 1; 113 $FilteredSDFileInfo{FilterCmpd} = 0; 114 if (($FilteredSDFileInfo{CmpdCount} % 5000) == 0) { 115 if ($PrintCmpdCounterHeader) { 116 $PrintCmpdCounterHeader = 0; 117 print "\nProcessing compounds:"; 118 } 119 print "$FilteredSDFileInfo{CmpdCount}..."; 120 } 121 @CmpdLines = split "\n", $CmpdString; 122 $CtabLinesCount = GetCtabLinesCount(\@CmpdLines); 123 if ($CtabLinesCount <= 0) { 124 $FilteredSDFileInfo{FilterCmpd} = 1; 125 WriteOutCmpdString($CmpdString); 126 next CMPDSTRING; 127 } 128 my ($AtomCount, $BondCount) = ParseCmpdCountsLine($CmpdLines[3]); 129 if ($OptionsInfo{All} || $OptionsInfo{Mismatch}) { 130 if ($CtabLinesCount != ($AtomCount + $BondCount)) { 131 $FilteredSDFileInfo{FilterCmpd} = 1; 132 WriteOutCmpdString($CmpdString); 133 next CMPDSTRING; 134 } 135 } 136 if ($CtabLinesCount == ($AtomCount + $BondCount)) { 137 if ($OptionsInfo{All} || $OptionsInfo{UnknownAtoms}) { 138 my($UnknownAtomCount, $UnknownAtoms, $UnknownAtomLines) = GetUnknownAtoms(\@CmpdLines); 139 if ($UnknownAtomCount) { 140 $FilteredSDFileInfo{FilterCmpd} = 1; 141 WriteOutCmpdString($CmpdString); 142 next CMPDSTRING; 143 } 144 } 145 if ($OptionsInfo{All} || $OptionsInfo{CleanSalts} || $OptionsInfo{Salts}) { 146 my ($FragmentsCount, $Fragments, $WashedCmpdString) = WashCmpd(\@CmpdLines); 147 if ($FragmentsCount > 1) { 148 if ($OptionsInfo{all} || $OptionsInfo{CleanSalts}) { 149 $CmpdString = $WashedCmpdString; 150 } 151 else { 152 $FilteredSDFileInfo{FilterCmpd} = 1; 153 } 154 WriteOutCmpdString($CmpdString); 155 next CMPDSTRING; 156 } 157 } 158 } 159 WriteOutCmpdString($CmpdString); 160 } 161 if (!$PrintCmpdCounterHeader) { 162 print "\n"; 163 } 164 165 close NEWSDFILE; 166 if ($OptionsInfo{Keep}) { 167 close NEWKEEPSDFILE; 168 } 169 close SDFILE; 170 171 print "\nTotal Number of compounds: $FilteredSDFileInfo{CmpdCount}\n"; 172 print "Number of compounds left after filtering: $FilteredSDFileInfo{FilteredCmpdCount}\n"; 173 print "Number of compounds ignored: $FilteredSDFileInfo{KeepCmpdCount}\n"; 174 } 175 176 # Write out the compound data... 177 sub WriteOutCmpdString { 178 my($CmpdString) = @_; 179 180 if ($FilteredSDFileInfo{FilterCmpd}) { 181 $FilteredSDFileInfo{KeepCmpdCount} += 1; 182 if ($OptionsInfo{Keep}) { 183 print NEWKEEPSDFILE "$CmpdString\n"; 184 } 185 } 186 else { 187 $FilteredSDFileInfo{FilteredCmpdCount} += 1; 188 print NEWSDFILE "$CmpdString\n"; 189 } 190 } 191 192 # Retrieve information about input SD files... 193 sub RetrieveSDFilesInfo { 194 my($Index, $SDFile, $FileDir, $FileName, $FileExt, $NewSDFile, $NewKeepSDFile); 195 196 %SDFilesInfo = (); 197 @{$SDFilesInfo{FileOkay}} = (); 198 @{$SDFilesInfo{OutFile}} = (); 199 @{$SDFilesInfo{OutFileKeep}} = (); 200 201 FILELIST: for $Index (0 .. $#SDFilesList) { 202 $SDFile = $SDFilesList[$Index]; 203 204 $SDFilesInfo{FileOkay}[$Index] = 0; 205 $SDFilesInfo{OutFile}[$Index] = ''; 206 $SDFilesInfo{OutFileKeep}[$Index] = ''; 207 208 if (!(-e $SDFile)) { 209 warn "Warning: Ignoring file $SDFile: It doesn't exist\n"; 210 next FILELIST; 211 } 212 if (!CheckFileType($SDFile, "sd sdf")) { 213 warn "Warning: Ignoring file $SDFile: It's not a SD file\n"; 214 next FILELIST; 215 } 216 217 # Setup new file names... 218 $FileDir = ""; $FileName = ""; $FileExt = ""; 219 ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile); 220 if ($Options{root} && (@SDFilesList == 1)) { 221 my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root}); 222 if ($RootFileName && $RootFileExt) { 223 $NewSDFile = $RootFileName; 224 } 225 else { 226 $NewSDFile = $Options{root}; 227 } 228 $NewKeepSDFile = $NewSDFile; 229 } 230 else { 231 $NewSDFile = $FileName . "Filtered"; 232 $NewKeepSDFile = $FileName; 233 } 234 $NewSDFile .= ".$FileExt"; 235 $NewKeepSDFile .= "Ignored" . ".$FileExt"; 236 if (!$Options{overwrite}) { 237 if (-e $NewSDFile) { 238 warn "Warning: Ignoring file $SDFile: New SD file, $NewSDFile, already exists\n"; 239 next FILELIST; 240 } 241 if ($Options{keep}) { 242 if (-e $NewKeepSDFile) { 243 warn "Warning: Ignoring file $SDFile: New SD file, $NewKeepSDFile, already exists\n"; 244 next FILELIST; 245 } 246 } 247 } 248 if (lc($NewSDFile) eq lc($SDFile)) { 249 warn "Warning: Ignoring file $SDFile: Same output, $NewSDFile, and input file name\n"; 250 print "Specify a different name using \"-r --root\" option or use default name.\n"; 251 next FILELIST; 252 } 253 254 $SDFilesInfo{FileOkay}[$Index] = 1; 255 $SDFilesInfo{OutFile}[$Index] = $NewSDFile; 256 $SDFilesInfo{OutFileKeep}[$Index] = $NewKeepSDFile; 257 } 258 } 259 260 # Process option values... 261 sub ProcessOptions { 262 %OptionsInfo = (); 263 264 $OptionsInfo{All} = $Options{all} ? $Options{all} : undef; 265 $OptionsInfo{CleanSalts} = $Options{cleansalts} ? $Options{cleansalts} : undef; 266 $OptionsInfo{Empty} = $Options{empty} ? $Options{empty} : undef; 267 $OptionsInfo{Keep} = $Options{keep} ? $Options{keep} : undef; 268 $OptionsInfo{Mismatch} = $Options{mismatch} ? $Options{mismatch} : undef; 269 $OptionsInfo{Overwrite} = $Options{overwrite} ? $Options{overwrite} : undef; 270 $OptionsInfo{Salts} = $Options{salts} ? $Options{salts} : undef; 271 $OptionsInfo{UnknownAtoms} = $Options{unknownatoms} ? $Options{unknownatoms} : undef; 272 273 } 274 275 # Setup script usage and retrieve command line arguments specified using various options... 276 sub SetupScriptUsage { 277 278 # Retrieve all the options... 279 %Options = (); 280 if (!GetOptions(\%Options, "all|a", "cleansalts|c", "empty|e", "help|h", "keep|k", "mismatch|m", "overwrite|o", "root|r=s", "salts|s", "unknownatoms|u", "workingdir|w=s")) { 281 die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n"; 282 } 283 if ($Options{workingdir}) { 284 if (! -d $Options{workingdir}) { 285 die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n"; 286 } 287 chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n"; 288 } 289 } 290