0
|
1 #!/usr/bin/perl -w
|
|
2 #
|
|
3 # $RCSfile: FilterSDFiles.pl,v $
|
|
4 # $Date: 2015/02/28 20:46:20 $
|
|
5 # $Revision: 1.32 $
|
|
6 #
|
|
7 # Author: Manish Sud <msud@san.rr.com>
|
|
8 #
|
|
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
|
|
10 #
|
|
11 # This file is part of MayaChemTools.
|
|
12 #
|
|
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
|
|
14 # the terms of the GNU Lesser General Public License as published by the Free
|
|
15 # Software Foundation; either version 3 of the License, or (at your option) any
|
|
16 # later version.
|
|
17 #
|
|
18 # MayaChemTools is distributed in the hope that it will be useful, but without
|
|
19 # any warranty; without even the implied warranty of merchantability or fitness
|
|
20 # for a particular purpose. See the GNU Lesser General Public License for more
|
|
21 # details.
|
|
22 #
|
|
23 # You should have received a copy of the GNU Lesser General Public License
|
|
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
|
|
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
|
|
26 # Boston, MA, 02111-1307, USA.
|
|
27 #
|
|
28
|
|
29 use strict;
|
|
30 use FindBin; use lib "$FindBin::Bin/../lib";
|
|
31 use Getopt::Long;
|
|
32 use File::Basename;
|
|
33 use Benchmark;
|
|
34 use SDFileUtil;
|
|
35 use FileUtil;
|
|
36
|
|
# Script-wide state; the subroutines defined further down read and write
# these file-scoped variables directly...
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Unbuffer STDOUT so that progress messages appear as they are printed...
$| = 1;

# Announce the start of the run and remember when it began...
$ScriptName = basename($0);
print "\n$ScriptName:Starting...\n\n";
$StartTime = Benchmark->new;

# Parse the command line; print usage on request or when no file is named...
SetupScriptUsage();
die GetUsageFromPod("$FindBin::Bin/$ScriptName") if ($Options{help} || !@ARGV);

my @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

# Turn the raw command line options into %OptionsInfo...
print "Processing options...\n";
my %OptionsInfo;
ProcessOptions();

# Validate the input files and work out the output file names...
print "Checking input SD file(s)...\n";
my %SDFilesInfo;
RetrieveSDFilesInfo();

# Filter each input file which passed the checks above...
my($FileIndex, %FilteredSDFileInfo);
print "\nProcessing SD files...\n" if (@SDFilesList > 1);

for $FileIndex (0 .. $#SDFilesList) {
  next unless $SDFilesInfo{FileOkay}[$FileIndex];

  print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  FilterSDFile($FileIndex);
}
print "\n$ScriptName:Done...\n\n";

# Report the elapsed time...
$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
|
|
81
|
|
82 ###############################################################################
|
|
83
|
|
# Filter the compounds of one input SD file and write out the results...
#
# Arguments:
#   $Index - position of the input file in @SDFilesList; the same position is
#            used to look up the output file names in %SDFilesInfo.
#
# Each compound read from the input file is passed to WriteOutCmpdString
# exactly once. Before that call $FilteredSDFileInfo{FilterCmpd} is set to 1
# for compounds which are to be taken out and left at 0 for compounds which
# are to be written to the filtered output file. Counters are accumulated in
# %FilteredSDFileInfo and a summary is printed to STDOUT at the end.
#
# Dies when any of the files can't be opened or when an output file can't be
# closed cleanly.
#
sub FilterSDFile {
  my($Index) = @_;
  my($SDFile, $NewSDFile, $NewKeepSDFile, $CtabLinesCount, $CmpdString, $PrintCmpdCounterHeader, @CmpdLines);

  $SDFile = $SDFilesList[$Index];
  $NewSDFile = $SDFilesInfo{OutFile}[$Index];
  $NewKeepSDFile = $SDFilesInfo{OutFileKeep}[$Index];

  # Use three argument open so that characters such as '>', '<' or '|' in a
  # file name are never interpreted as an open mode. The bareword handles are
  # kept because WriteOutCmpdString prints to NEWSDFILE and NEWKEEPSDFILE by
  # name...
  open NEWSDFILE, '>', $NewSDFile or die "Error: Couldn't open $NewSDFile: $! \n";
  if ($OptionsInfo{Keep}) {
    open NEWKEEPSDFILE, '>', $NewKeepSDFile or die "Error: Couldn't open $NewKeepSDFile: $! \n";
  }
  open SDFILE, '<', $SDFile or die "Error: Can't open $SDFile: $! \n";

  print "\nGenerating SD file $NewSDFile...\n";
  if ($OptionsInfo{Keep}) {
    print "Generating file $NewKeepSDFile...\n";
  }

  # Reset the per file counters...
  %FilteredSDFileInfo = ();

  $FilteredSDFileInfo{CmpdCount} = 0; $FilteredSDFileInfo{FilterCmpd} = 0;
  $FilteredSDFileInfo{FilteredCmpdCount} = 0; $FilteredSDFileInfo{KeepCmpdCount} = 0;

  $PrintCmpdCounterHeader = 1;

  CMPDSTRING: while ($CmpdString = ReadCmpdString(\*SDFILE)) {
    $FilteredSDFileInfo{CmpdCount} += 1;
    $FilteredSDFileInfo{FilterCmpd} = 0;

    # Show progress after every 5000 compounds...
    if (($FilteredSDFileInfo{CmpdCount} % 5000) == 0) {
      if ($PrintCmpdCounterHeader) {
        $PrintCmpdCounterHeader = 0;
        print "\nProcessing compounds:";
      }
      print "$FilteredSDFileInfo{CmpdCount}...";
    }

    @CmpdLines = split "\n", $CmpdString;
    $CtabLinesCount = GetCtabLinesCount(\@CmpdLines);

    # Compounds without any atom/bond block lines are always taken out; no
    # command line option is consulted for this check...
    if ($CtabLinesCount <= 0) {
      $FilteredSDFileInfo{FilterCmpd} = 1;
      WriteOutCmpdString($CmpdString);
      next CMPDSTRING;
    }

    # Fourth line of a compound record is passed on as the counts line...
    my ($AtomCount, $BondCount) = ParseCmpdCountsLine($CmpdLines[3]);

    if ($OptionsInfo{All} || $OptionsInfo{Mismatch}) {
      if ($CtabLinesCount != ($AtomCount + $BondCount)) {
        $FilteredSDFileInfo{FilterCmpd} = 1;
        WriteOutCmpdString($CmpdString);
        next CMPDSTRING;
      }
    }

    # Unknown atoms and salts checks are performed only for compounds with
    # consistent counts; any other compound reaching this point is written
    # out as is...
    if ($CtabLinesCount == ($AtomCount + $BondCount)) {
      if ($OptionsInfo{All} || $OptionsInfo{UnknownAtoms}) {
        # Only the first value returned by GetUnknownAtoms is needed...
        my($UnknownAtomCount) = GetUnknownAtoms(\@CmpdLines);
        if ($UnknownAtomCount) {
          $FilteredSDFileInfo{FilterCmpd} = 1;
          WriteOutCmpdString($CmpdString);
          next CMPDSTRING;
        }
      }
      if ($OptionsInfo{All} || $OptionsInfo{CleanSalts} || $OptionsInfo{Salts}) {
        # First and third values returned by WashCmpd are used: fragments
        # count and the washed compound string...
        my($FragmentsCount, undef, $WashedCmpdString) = WashCmpd(\@CmpdLines);
        if ($FragmentsCount > 1) {
          # Earlier code also tested $OptionsInfo{all} here. ProcessOptions
          # stores that option under the key "All", so the lower case lookup
          # was always false and has been dropped. Consequently, "--all" by
          # itself takes out compounds containing salts, which is what the
          # examples in the documentation describe; washing is performed only
          # when "--cleansalts" is specified...
          if ($OptionsInfo{CleanSalts}) {
            $CmpdString = $WashedCmpdString;
          }
          else {
            $FilteredSDFileInfo{FilterCmpd} = 1;
          }
          WriteOutCmpdString($CmpdString);
          next CMPDSTRING;
        }
      }
    }
    WriteOutCmpdString($CmpdString);
  }
  if (!$PrintCmpdCounterHeader) {
    print "\n";
  }

  # Buffered write errors surface at close time; don't ignore them for the
  # output files...
  close NEWSDFILE or die "Error: Couldn't close $NewSDFile: $! \n";
  if ($OptionsInfo{Keep}) {
    close NEWKEEPSDFILE or die "Error: Couldn't close $NewKeepSDFile: $! \n";
  }
  close SDFILE;

  print "\nTotal Number of compounds: $FilteredSDFileInfo{CmpdCount}\n";
  print "Number of compounds left after filtering: $FilteredSDFileInfo{FilteredCmpdCount}\n";
  print "Number of compounds ignored: $FilteredSDFileInfo{KeepCmpdCount}\n";
}
|
|
175
|
|
# Send one compound to the appropriate output file and update the counters...
#
# Arguments:
#   $CmpdString - text of the compound to write out.
#
# $FilteredSDFileInfo{FilterCmpd} decides the destination: when it is false,
# the compound is printed to NEWSDFILE; otherwise, it is counted as ignored
# and printed to NEWKEEPSDFILE only when $OptionsInfo{Keep} is set. Both file
# handles must have been opened by the caller.
#
sub WriteOutCmpdString {
  my($CmpdString) = @_;

  if (!$FilteredSDFileInfo{FilterCmpd}) {
    # Compound passed all the checks...
    $FilteredSDFileInfo{FilteredCmpdCount}++;
    print NEWSDFILE "$CmpdString\n";
  }
  else {
    # Compound was taken out; save it only on request...
    $FilteredSDFileInfo{KeepCmpdCount}++;
    print NEWKEEPSDFILE "$CmpdString\n" if ($OptionsInfo{Keep});
  }
}
|
|
191
|
|
# Check each input SD file and set up names of the output files...
#
# Fills %SDFilesInfo with three arrays indexed like @SDFilesList:
#   FileOkay    - 1 when the file is to be processed; otherwise, 0
#   OutFile     - name of the file for compounds left after filtering
#   OutFileKeep - name of the file for compounds which were taken out
#
# A file is skipped, with a warning, when it doesn't exist, is not a SD file,
# would overwrite an existing output file without "--overwrite", or when its
# output file name matches its own name.
#
sub RetrieveSDFilesInfo {
  %SDFilesInfo = (FileOkay => [], OutFile => [], OutFileKeep => []);

  FILELIST: for my $FileNum (0 .. $#SDFilesList) {
    my $SDFile = $SDFilesList[$FileNum];

    # Start out by assuming the file can't be used...
    $SDFilesInfo{FileOkay}[$FileNum] = 0;
    $SDFilesInfo{OutFile}[$FileNum] = '';
    $SDFilesInfo{OutFileKeep}[$FileNum] = '';

    unless (-e $SDFile) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    unless (CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    # Setup new file names; the root name is honored only for a single
    # input file...
    my($InDir, $InName, $InExt) = ParseFileName($SDFile);
    my($NewSDFile, $NewKeepSDFile);

    if ($Options{root} && (@SDFilesList == 1)) {
      my($RootDir, $RootName, $RootExt) = ParseFileName($Options{root});
      $NewSDFile = ($RootName && $RootExt) ? $RootName : $Options{root};
      $NewKeepSDFile = $NewSDFile;
    }
    else {
      $NewSDFile = $InName . "Filtered";
      $NewKeepSDFile = $InName;
    }
    $NewSDFile .= ".$InExt";
    $NewKeepSDFile .= "Ignored" . ".$InExt";

    # Don't clobber existing files unless it has been explicitly requested...
    if (!$Options{overwrite}) {
      if (-e $NewSDFile) {
        warn "Warning: Ignoring file $SDFile: New SD file, $NewSDFile, already exists\n";
        next FILELIST;
      }
      if ($Options{keep} && -e $NewKeepSDFile) {
        warn "Warning: Ignoring file $SDFile: New SD file, $NewKeepSDFile, already exists\n";
        next FILELIST;
      }
    }

    # Output must never replace the input file...
    if (lc($NewSDFile) eq lc($SDFile)) {
      warn "Warning: Ignoring file $SDFile: Same output, $NewSDFile, and input file name\n";
      print "Specify a different name using \"-r --root\" option or use default name.\n";
      next FILELIST;
    }

    $SDFilesInfo{FileOkay}[$FileNum] = 1;
    $SDFilesInfo{OutFile}[$FileNum] = $NewSDFile;
    $SDFilesInfo{OutFileKeep}[$FileNum] = $NewKeepSDFile;
  }
}
|
|
259
|
|
# Copy the boolean command line options from %Options into %OptionsInfo...
#
# Each option is stored under a capitalized key, e.g. "cleansalts" becomes
# "CleanSalts". A key is set to the value from the command line when that
# value is true and to undef otherwise, so all the keys listed below always
# exist in %OptionsInfo after this call.
#
sub ProcessOptions {
  my @OptionToInfoKey = (
    [ all          => 'All' ],
    [ cleansalts   => 'CleanSalts' ],
    [ empty        => 'Empty' ],
    [ keep         => 'Keep' ],
    [ mismatch     => 'Mismatch' ],
    [ overwrite    => 'Overwrite' ],
    [ salts        => 'Salts' ],
    [ unknownatoms => 'UnknownAtoms' ],
  );

  %OptionsInfo = ();

  for my $Pair (@OptionToInfoKey) {
    my($OptionName, $InfoKey) = @{$Pair};
    my $Value = $Options{$OptionName};

    $OptionsInfo{$InfoKey} = $Value ? $Value : undef;
  }
}
|
|
274
|
|
# Retrieve the command line options and switch to the working directory...
#
# Parsed options are stored in %Options. The script dies with a hint about
# "-h" when GetOptions reports a failure, and with an error message when the
# value of "-w --workingdir" is not a directory or can't be entered.
#
sub SetupScriptUsage {
  my @OptionSpecs = (
    "all|a", "cleansalts|c", "empty|e", "help|h", "keep|k", "mismatch|m",
    "overwrite|o", "root|r=s", "salts|s", "unknownatoms|u", "workingdir|w=s",
  );

  # Retrieve all the options...
  %Options = ();
  GetOptions(\%Options, @OptionSpecs)
    or die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";

  # Move to the working directory when one has been specified...
  if ($Options{workingdir}) {
    my $WorkingDir = $Options{workingdir};

    (-d $WorkingDir)
      or die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    chdir $WorkingDir or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
}
|
|
290
|
|
291 __END__
|
|
292
|
|
293 =head1 NAME
|
|
294
|
|
295 FilterSDFiles.pl - Filter compounds from SDFile(s)
|
|
296
|
|
297 =head1 SYNOPSIS
|
|
298
|
|
299 FilterSDFiles.pl SDFile(s)...
|
|
300
|
|
301 FilterSDFiles.pl [B<-a, --all>] [B<-e, --empty>] [B<-c, --cleansalts>] [B<-h, --help>]
|
|
302 [B<-k, --keep>] [B<-m, --mismatch>] [B<-o, --overwrite>] [B<-r, --root> I<rootname>]
|
|
303 [B<-s, --salts>] [B<-u, --unknownatoms>] [B<-w, --workingdir> I<dirname>] SDFile(s)...
|
|
304
|
|
305 =head1 DESCRIPTION
|
|
306
|
|
307 Filter specific compounds from I<SDFile(s)>. Available choices are: wash or
|
|
308 remove compounds with salts; take out compounds with no
|
|
309 structural data; remove compounds with mismatched atom/bond blocks data;
|
|
310 remove compounds which contain unknown atoms and so on. Multiple SDFile
|
|
311 names are separated by spaces. The valid file extensions are I<.sdf> and I<.sd>.
|
|
312 All other file names are ignored. All the SD files in the current directory can be
|
|
313 specified either by I<*.sdf> or the current directory name.
|
|
314
|
|
315 =head1 OPTIONS
|
|
316
|
|
317 =over 4
|
|
318
|
|
319 =item B<-a, --all>
|
|
320
|
|
321 Use all options to filter compounds.
|
|
322
|
|
323 =item B<-e, --empty>
|
|
324
|
|
325 Filter compounds with empty atom/bond blocks. This is B<default behavior>.
|
|
326
|
|
327 =item B<-c, --cleansalts>
|
|
328
|
|
329 Wash compounds which contain salts identified as disconnected structural
|
|
330 units. The largest fragment is kept.
|
|
331
|
|
332 =item B<-h, --help>
|
|
333
|
|
334 Print this help message.
|
|
335
|
|
336 =item B<-k, --keep>
|
|
337
|
|
338 Keep the compounds which were filtered in a separate file. Default: Just
|
|
339 ignore these compounds. Option B<-r --root> is used to generate the new file
|
|
340 name: <Root>Ignored.sdf. Default file name: <SDFileName>Ignored.sdf.
|
|
341
|
|
342 =item B<-m, --mismatch>
|
|
343
|
|
344 Remove compounds with mismatched atom/bond blocks and counts line
|
|
345 information specified by header block.
|
|
346
|
|
347 =item B<-o, --overwrite>
|
|
348
|
|
349 Overwrite existing files.
|
|
350
|
|
351 =item B<-r, --root> I<rootname>
|
|
352
|
|
353 New SD file name is generated using the root: <Root>.sdf. Default file
|
|
354 name: <SDFileName>Filtered.sdf. This option is ignored for multiple input files.
|
|
355
|
|
356 =item B<-s, --salts>
|
|
357
|
|
358 Remove compounds which contain salts identified as disconnected structural
|
|
359 units.
|
|
360
|
|
361 =item B<-u, --unknownatoms>
|
|
362
|
|
363 Remove compounds with atom blocks containing special atom symbols such
|
|
364 as L, Q, *, LP, X, R#, or any other non-periodic-table symbols.
|
|
365
|
|
366 =item B<-w, --workingdir> I<dirname>
|
|
367
|
|
368 Location of working directory. Default: current directory.
|
|
369
|
|
370 =back
|
|
371
|
|
372 =head1 EXAMPLES
|
|
373
|
|
374 To remove compounds from SD files which contain salts, unknown atoms, or
|
|
375 mismatched atom/bonds block data or no structural data, type:
|
|
376
|
|
377 % FilterSDFiles.pl -a -o Sample.sdf
|
|
378 % FilterSDFiles.pl -a -o *.sdf
|
|
379
|
|
380 And to generate a new NewSampleIgnored.sdf file for filtered compounds, type:
|
|
381
|
|
382 % FilterSDFiles.pl -a -k -r NewSample -o Sample.sdf
|
|
383
|
|
384 To wash compounds in order to get rid of all disconnected fragments except for
|
|
385 the largest one, type:
|
|
386
|
|
387 % FilterSDFiles.pl -c -o Sample.sdf
|
|
388
|
|
389 =head1 AUTHOR
|
|
390
|
|
391 Manish Sud <msud@san.rr.com>
|
|
392
|
|
393 =head1 SEE ALSO
|
|
394
|
|
395 ExtractFromSDFiles.pl, InfoSDFiles.pl, MergeTextFilesWithSD.pl
|
|
396
|
|
397 =head1 COPYRIGHT
|
|
398
|
|
399 Copyright (C) 2015 Manish Sud. All rights reserved.
|
|
400
|
|
401 This file is part of MayaChemTools.
|
|
402
|
|
403 MayaChemTools is free software; you can redistribute it and/or modify it under
|
|
404 the terms of the GNU Lesser General Public License as published by the Free
|
|
405 Software Foundation; either version 3 of the License, or (at your option)
|
|
406 any later version.
|
|
407
|
|
408 =cut
|