Mercurial > repos > deepakjadmin > mayatool3_test2
comparison bin/FilterSDFiles.pl @ 0:4816e4a8ae95 draft default tip
Uploaded
| author | deepakjadmin |
|---|---|
| date | Wed, 20 Jan 2016 09:23:18 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:4816e4a8ae95 |
|---|---|
| 1 #!/usr/bin/perl -w | |
| 2 # | |
| 3 # $RCSfile: FilterSDFiles.pl,v $ | |
| 4 # $Date: 2015/02/28 20:46:20 $ | |
| 5 # $Revision: 1.32 $ | |
| 6 # | |
| 7 # Author: Manish Sud <msud@san.rr.com> | |
| 8 # | |
| 9 # Copyright (C) 2015 Manish Sud. All rights reserved. | |
| 10 # | |
| 11 # This file is part of MayaChemTools. | |
| 12 # | |
| 13 # MayaChemTools is free software; you can redistribute it and/or modify it under | |
| 14 # the terms of the GNU Lesser General Public License as published by the Free | |
| 15 # Software Foundation; either version 3 of the License, or (at your option) any | |
| 16 # later version. | |
| 17 # | |
| 18 # MayaChemTools is distributed in the hope that it will be useful, but without | |
| 19 # any warranty; without even the implied warranty of merchantability or fitness | |
| 20 # for a particular purpose. See the GNU Lesser General Public License for more | |
| 21 # details. | |
| 22 # | |
| 23 # You should have received a copy of the GNU Lesser General Public License | |
| 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or | |
| 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, | |
| 26 # Boston, MA, 02111-1307, USA. | |
| 27 # | |
| 28 | |
| 29 use strict; | |
| 30 use FindBin; use lib "$FindBin::Bin/../lib"; | |
| 31 use Getopt::Long; | |
| 32 use File::Basename; | |
| 33 use Benchmark; | |
| 34 use SDFileUtil; | |
| 35 use FileUtil; | |
| 36 | |
# File-scoped state. %Options, %OptionsInfo, %SDFilesInfo, @SDFilesList,
# %FilteredSDFileInfo and $ScriptName are read and written by the
# subroutines defined further down, so they must stay declared at this level.
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename $0;
print "\n$ScriptName:Starting...\n\n";
# Direct method call instead of indirect object syntax ("new Benchmark"),
# which perl can misparse and which newer feature bundles disable.
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  # The usage text is emitted through die, so a help request also ends the
  # script here with a non-zero exit status.
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@SDFilesList);
@SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

print "Checking input SD file(s)...\n";
my(%SDFilesInfo);
RetrieveSDFilesInfo();

# Generate output files...
my(%FilteredSDFileInfo);
if (@SDFilesList > 1) {
  print "\nProcessing SD files...\n";
}
for my $FileIndex (0 .. $#SDFilesList) {
  # Only files that passed the checks in RetrieveSDFilesInfo() are filtered.
  if ($SDFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $SDFilesList[$FileIndex]...\n";
    FilterSDFile($FileIndex);
  }
}
print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
| 81 | |
| 82 ############################################################################### | |
| 83 | |
# Filter SD file...
#
# Reads each compound from the input file $SDFilesList[$Index] and hands it to
# WriteOutCmpdString(), which writes it either to the filtered output file or
# (with the Keep option) to the "Ignored" file, depending on
# $FilteredSDFileInfo{FilterCmpd}.
#
# A compound is flagged for removal when:
#   - GetCtabLinesCount() reports no ctab lines (always checked);
#   - All/Mismatch is set and the ctab line count differs from the atom plus
#     bond count parsed from the counts line;
#   - All/UnknownAtoms is set and GetUnknownAtoms() reports any;
#   - All/Salts is set, CleanSalts is not, and WashCmpd() reports more than
#     one fragment.
# With CleanSalts a multi-fragment compound is kept, but its washed form is
# written instead of the original.
#
# Parameters:
#   $Index - position of the input file in @SDFilesList / %SDFilesInfo.
#
# Dies when any of the files cannot be opened. Prints progress and summary
# counts to STDOUT.
#
sub FilterSDFile {
  my($Index) = @_;
  my($SDFile, $NewSDFile, $NewKeepSDFile, $CtabLinesCount, $CmpdString, $PrintCmpdCounterHeader, @CmpdLines);

  $SDFile = $SDFilesList[$Index];
  $NewSDFile = $SDFilesInfo{OutFile}[$Index];
  $NewKeepSDFile = $SDFilesInfo{OutFileKeep}[$Index];

  # Three-argument open: characters such as a leading ">" or "|", or
  # surrounding whitespace in a file name, are then never taken as part of
  # the open mode. The handles stay barewords because WriteOutCmpdString()
  # prints to NEWSDFILE and NEWKEEPSDFILE by name.
  open NEWSDFILE, '>', $NewSDFile or die "Error: Couldn't open $NewSDFile: $! \n";
  if ($OptionsInfo{Keep}) {
    open NEWKEEPSDFILE, '>', $NewKeepSDFile or die "Error: Couldn't open $NewKeepSDFile: $! \n";
  }
  open SDFILE, '<', $SDFile or die "Error: Can't open $SDFile: $! \n";

  print "\nGenerating SD file $NewSDFile...\n";
  if ($OptionsInfo{Keep}) {
    print "Generating file $NewKeepSDFile...\n";
  }

  %FilteredSDFileInfo = ();

  $FilteredSDFileInfo{CmpdCount} = 0; $FilteredSDFileInfo{FilterCmpd} = 0;
  $FilteredSDFileInfo{FilteredCmpdCount} = 0; $FilteredSDFileInfo{KeepCmpdCount} = 0;

  $PrintCmpdCounterHeader = 1;

  CMPDSTRING: while ($CmpdString = ReadCmpdString(\*SDFILE)) {
    $FilteredSDFileInfo{CmpdCount} += 1;
    $FilteredSDFileInfo{FilterCmpd} = 0;

    # Progress indicator every 5000 compounds; the header is printed once.
    if (($FilteredSDFileInfo{CmpdCount} % 5000) == 0) {
      if ($PrintCmpdCounterHeader) {
        $PrintCmpdCounterHeader = 0;
        print "\nProcessing compounds:";
      }
      print "$FilteredSDFileInfo{CmpdCount}...";
    }

    @CmpdLines = split "\n", $CmpdString;
    $CtabLinesCount = GetCtabLinesCount(\@CmpdLines);

    # No structural data: always filtered, independent of the options.
    if ($CtabLinesCount <= 0) {
      $FilteredSDFileInfo{FilterCmpd} = 1;
      WriteOutCmpdString($CmpdString);
      next CMPDSTRING;
    }

    # The counts line is the fourth line of the compound record.
    my ($AtomCount, $BondCount) = ParseCmpdCountsLine($CmpdLines[3]);

    # Mismatched compounds skip the atom and salt checks below; they are
    # removed only when All or Mismatch is set, otherwise passed through.
    if ($CtabLinesCount != ($AtomCount + $BondCount)) {
      if ($OptionsInfo{All} || $OptionsInfo{Mismatch}) {
        $FilteredSDFileInfo{FilterCmpd} = 1;
      }
      WriteOutCmpdString($CmpdString);
      next CMPDSTRING;
    }

    if ($OptionsInfo{All} || $OptionsInfo{UnknownAtoms}) {
      # Only the count, the first value returned, is needed here.
      my ($UnknownAtomCount) = GetUnknownAtoms(\@CmpdLines);
      if ($UnknownAtomCount) {
        $FilteredSDFileInfo{FilterCmpd} = 1;
        WriteOutCmpdString($CmpdString);
        next CMPDSTRING;
      }
    }

    if ($OptionsInfo{All} || $OptionsInfo{CleanSalts} || $OptionsInfo{Salts}) {
      my ($FragmentsCount, undef, $WashedCmpdString) = WashCmpd(\@CmpdLines);
      if ($FragmentsCount > 1) {
        # NOTE: only CleanSalts switches to washing. All on its own removes
        # multi-fragment compounds, which is what the EXAMPLES section of the
        # POD describes for "-a".
        # TODO confirm whether --all is ever meant to imply washing.
        if ($OptionsInfo{CleanSalts}) {
          $CmpdString = $WashedCmpdString;
        }
        else {
          $FilteredSDFileInfo{FilterCmpd} = 1;
        }
      }
    }

    WriteOutCmpdString($CmpdString);
  }
  if (!$PrintCmpdCounterHeader) {
    print "\n";
  }

  # Buffered write errors surface at close time, so report them instead of
  # silently leaving a truncated output file behind.
  close NEWSDFILE or warn "Warning: Problem closing $NewSDFile: $! \n";
  if ($OptionsInfo{Keep}) {
    close NEWKEEPSDFILE or warn "Warning: Problem closing $NewKeepSDFile: $! \n";
  }
  close SDFILE;

  print "\nTotal Number of compounds: $FilteredSDFileInfo{CmpdCount}\n";
  print "Number of compounds left after filtering: $FilteredSDFileInfo{FilteredCmpdCount}\n";
  print "Number of compounds ignored: $FilteredSDFileInfo{KeepCmpdCount}\n";
}
| 175 | |
# Write out the compound data...
#
# Routes one compound record according to $FilteredSDFileInfo{FilterCmpd}:
# a compound that passed is counted in FilteredCmpdCount and printed to
# NEWSDFILE; a filtered one is counted in KeepCmpdCount and printed to
# NEWKEEPSDFILE only when the Keep option is on. Both handles are opened by
# FilterSDFile().
#
# Parameters:
#   $CmpdString - compound text; a newline is appended when it is printed.
#
sub WriteOutCmpdString {
  my($CmpdString) = @_;

  # Compound passed every active filter.
  if (!$FilteredSDFileInfo{FilterCmpd}) {
    $FilteredSDFileInfo{FilteredCmpdCount} += 1;
    print NEWSDFILE "$CmpdString\n";
    return;
  }

  # Compound was filtered: always counted, written only on request.
  $FilteredSDFileInfo{KeepCmpdCount} += 1;
  print NEWKEEPSDFILE "$CmpdString\n" if $OptionsInfo{Keep};
  return;
}
| 191 | |
# Retrieve information about input SD files...
#
# Rebuilds %SDFilesInfo with three arrays parallel to @SDFilesList:
#   FileOkay    - 1 when the input file is to be processed, 0 when skipped
#   OutFile     - name of the filtered output file ('' when skipped)
#   OutFileKeep - name of the "Ignored" output file ('' when skipped)
#
# A file is skipped, with a warning on STDERR, when it does not exist, fails
# CheckFileType(), would overwrite an existing output file without the
# overwrite option, or would be written onto itself.
#
sub RetrieveSDFilesInfo {
  %SDFilesInfo = (FileOkay => [], OutFile => [], OutFileKeep => []);

  FILELIST: for my $Index (0 .. $#SDFilesList) {
    my $SDFile = $SDFilesList[$Index];

    # Start from "skipped"; the entries are filled in only after all checks.
    $SDFilesInfo{FileOkay}[$Index] = 0;
    $SDFilesInfo{OutFile}[$Index] = '';
    $SDFilesInfo{OutFileKeep}[$Index] = '';

    unless (-e $SDFile) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    unless (CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    # Setup new file names...
    my (undef, $FileName, $FileExt) = ParseFileName($SDFile);

    my ($NewSDFile, $NewKeepSDFile);
    if ($Options{root} && (@SDFilesList == 1)) {
      # The root option is honoured only for a single input file. When the
      # root itself parses into a name plus extension, only the name is used.
      my (undef, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
      $NewSDFile = ($RootFileName && $RootFileExt) ? $RootFileName : $Options{root};
      $NewKeepSDFile = $NewSDFile;
    }
    else {
      $NewSDFile = $FileName . "Filtered";
      $NewKeepSDFile = $FileName;
    }
    $NewSDFile .= ".$FileExt";
    $NewKeepSDFile .= "Ignored" . ".$FileExt";

    # Without the overwrite option, refuse to clobber existing output files.
    # The "Ignored" file only matters when the keep option is on.
    if (!$Options{overwrite}) {
      my @OutputNames = ($NewSDFile);
      push @OutputNames, $NewKeepSDFile if $Options{keep};
      for my $OutputName (@OutputNames) {
        if (-e $OutputName) {
          warn "Warning: Ignoring file $SDFile: New SD file, $OutputName, already exists\n";
          next FILELIST;
        }
      }
    }

    # Case-insensitive comparison of output and input names.
    if (lc($NewSDFile) eq lc($SDFile)) {
      warn "Warning: Ignoring file $SDFile: Same output, $NewSDFile, and input file name\n";
      print "Specify a different name using \"-r --root\" option or use default name.\n";
      next FILELIST;
    }

    $SDFilesInfo{FileOkay}[$Index] = 1;
    $SDFilesInfo{OutFile}[$Index] = $NewSDFile;
    $SDFilesInfo{OutFileKeep}[$Index] = $NewKeepSDFile;
  }
}
| 259 | |
# Process option values...
#
# Rebuilds %OptionsInfo from the raw command line values in %Options. Each
# capitalized %OptionsInfo key receives the value of its lower-case %Options
# counterpart when that value is true, and undef otherwise.
#
sub ProcessOptions {
  # %OptionsInfo key => %Options key
  my @KeyPairs = (
    [All          => 'all'],
    [CleanSalts   => 'cleansalts'],
    [Empty        => 'empty'],
    [Keep         => 'keep'],
    [Mismatch     => 'mismatch'],
    [Overwrite    => 'overwrite'],
    [Salts        => 'salts'],
    [UnknownAtoms => 'unknownatoms'],
  );

  %OptionsInfo = ();
  for my $KeyPair (@KeyPairs) {
    my ($InfoKey, $OptionKey) = @{$KeyPair};
    $OptionsInfo{$InfoKey} = $Options{$OptionKey} || undef;
  }
}
| 274 | |
# Setup script usage and retrieve command line arguments specified using various options...
#
# Parses the command line into %Options with Getopt::Long and, when a working
# directory was given, changes into it. Dies on an invalid option, when the
# working directory value is not a directory, or when chdir fails.
#
sub SetupScriptUsage {
  my @OptionSpecs = (
    "all|a", "cleansalts|c", "empty|e", "help|h", "keep|k", "mismatch|m",
    "overwrite|o", "root|r=s", "salts|s", "unknownatoms|u", "workingdir|w=s",
  );

  # Retrieve all the options...
  %Options = ();
  GetOptions(\%Options, @OptionSpecs)
    or die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";

  # Nothing more to do unless a working directory was requested.
  return unless $Options{workingdir};

  unless (-d $Options{workingdir}) {
    die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
  }
  chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
}
| 290 | |
| 291 __END__ | |
| 292 | |
| 293 =head1 NAME | |
| 294 | |
| 295 FilterSDFiles.pl - Filter compounds from SDFile(s) | |
| 296 | |
| 297 =head1 SYNOPSIS | |
| 298 | |
| 299 FilterSDFiles.pl SDFile(s)... | |
| 300 | |
| 301 FilterSDFiles.pl [B<-a, --all>] [B<-e, --empty>] [B<-c, --cleansalts>] [B<-h, --help>] | |
| 302 [B<-k, --keep>] [B<-m, --mismatch>] [B<-o, --overwrite>] [B<-r, --root> I<rootname>] | |
| 303 [B<-s, --salts>] [B<-u, --unknownatoms>] [B<-w, --workingdir> I<dirname>] SDFile(s)... | |
| 304 | |
| 305 =head1 DESCRIPTION | |
| 306 | |
| 307 Filter specific compounds from I<SDFile(s)>. Available choices are: wash or | |
| 308 remove compounds with salts; take out compounds with no | |
| 309 structural data; remove compounds with mismatched atom/bond blocks data; | |
| 310 remove compounds which contain unknown atoms and so on. Multiple SDFile | |
| 311 names are separated by spaces. The valid file extensions are I<.sdf> and I<.sd>. | |
| 312 All other file names are ignored. All the SD files in a current directory can be | |
| 313 specified either by I<*.sdf> or the current directory name. | |
| 314 | |
| 315 =head1 OPTIONS | |
| 316 | |
| 317 =over 4 | |
| 318 | |
| 319 =item B<-a, --all> | |
| 320 | |
| 321 Use all options to filter compounds. | |
| 322 | |
| 323 =item B<-e, --empty> | |
| 324 | |
| 325 Filter compounds with empty atom/bond blocks. This is B<default behavior>. | |
| 326 | |
| 327 =item B<-c, --cleansalts> | |
| 328 | |
| 329 Wash compounds which contain salts identified as disconnected structural | |
| 330 units. The largest fragment is kept. | |
| 331 | |
| 332 =item B<-h, --help> | |
| 333 | |
| 334 Print this help message. | |
| 335 | |
| 336 =item B<-k, --keep> | |
| 337 | |
| 338 Keep the compounds which were filtered in a separate file. Default: Just | |
| 339 ignore these compounds. Option B<-r --root> is used to generate the new file | |
| 340 name: <Root>Ignored.sdf. Default file name: <SDFileName>Ignored.sdf. | |
| 341 | |
| 342 =item B<-m, --mismatch> | |
| 343 | |
| 344 Remove compounds with mismatched atom/bond blocks and counts line | |
| 345 information specified by header block. | |
| 346 | |
| 347 =item B<-o, --overwrite> | |
| 348 | |
| 349 Overwrite existing files. | |
| 350 | |
| 351 =item B<-r, --root> I<rootname> | |
| 352 | |
| 353 New SD file name is generated using the root: <Root>.sdf. Default file | |
| 354 name: <SDFileName>Filtered.sdf. This option is ignored for multiple input files. | |
| 355 | |
| 356 =item B<-s, --salts> | |
| 357 | |
| 358 Remove compounds which contain salts identified as disconnected structural | |
| 359 units. | |
| 360 | |
| 361 =item B<-u, --unknownatoms> | |
| 362 | |
| 363 Remove compounds with atom blocks containing special atom symbols such | |
| 364 as L, Q, *, LP, X, R#, or any other non-periodic table symbols. | |
| 365 | |
| 366 =item B<-w, --workingdir> I<dirname> | |
| 367 | |
| 368 Location of working directory. Default: current directory. | |
| 369 | |
| 370 =back | |
| 371 | |
| 372 =head1 EXAMPLES | |
| 373 | |
| 374 To remove compounds from SD files which contain salts, unknown atoms, or | |
| 375 mismatched atom/bonds block data or no structural data, type: | |
| 376 | |
| 377 % FilterSDFiles.pl -a -o Sample.sdf | |
| 378 % FilterSDFiles.pl -a -o *.sdf | |
| 379 | |
| 380 And to generate a new NewSampleIgnored.sdf file for filtered compounds, type: | |
| 381 | |
| 382 % FilterSDFiles.pl -a -k -r NewSample -o Sample.sdf | |
| 383 | |
| 384 To wash compounds in order to get rid of all disconnected fragments except for | |
| 385 the largest one, type: | |
| 386 | |
| 387 % FilterSDFiles.pl -c -o Sample.sdf | |
| 388 | |
| 389 =head1 AUTHOR | |
| 390 | |
| 391 Manish Sud <msud@san.rr.com> | |
| 392 | |
| 393 =head1 SEE ALSO | |
| 394 | |
| 395 ExtractFromSDFiles.pl, InfoSDFiles.pl, MergeTextFilesWithSD.pl | |
| 396 | |
| 397 =head1 COPYRIGHT | |
| 398 | |
| 399 Copyright (C) 2015 Manish Sud. All rights reserved. | |
| 400 | |
| 401 This file is part of MayaChemTools. | |
| 402 | |
| 403 MayaChemTools is free software; you can redistribute it and/or modify it under | |
| 404 the terms of the GNU Lesser General Public License as published by the Free | |
| 405 Software Foundation; either version 3 of the License, or (at your option) | |
| 406 any later version. | |
| 407 | |
| 408 =cut |
