comparison bin/FilterSDFiles.pl @ 0:4816e4a8ae95 draft default tip

Uploaded
author deepakjadmin
date Wed, 20 Jan 2016 09:23:18 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4816e4a8ae95
1 #!/usr/bin/perl -w
2 #
3 # $RCSfile: FilterSDFiles.pl,v $
4 # $Date: 2015/02/28 20:46:20 $
5 # $Revision: 1.32 $
6 #
7 # Author: Manish Sud <msud@san.rr.com>
8 #
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
10 #
11 # This file is part of MayaChemTools.
12 #
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
14 # the terms of the GNU Lesser General Public License as published by the Free
15 # Software Foundation; either version 3 of the License, or (at your option) any
16 # later version.
17 #
18 # MayaChemTools is distributed in the hope that it will be useful, but without
19 # any warranty; without even the implied warranty of merchantability of fitness
20 # for a particular purpose. See the GNU Lesser General Public License for more
21 # details.
22 #
23 # You should have received a copy of the GNU Lesser General Public License
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
26 # Boston, MA, 02111-1307, USA.
27 #
28
29 use strict;
30 use FindBin; use lib "$FindBin::Bin/../lib";
31 use Getopt::Long;
32 use File::Basename;
33 use Benchmark;
34 use SDFileUtil;
35 use FileUtil;
36
# Script-level state. $ScriptName, %Options, @SDFilesList, %OptionsInfo,
# %SDFilesInfo and %FilteredSDFileInfo are file-scoped lexicals that the
# subroutines defined further down read and write directly, so they must stay
# declared here, ahead of those definitions.
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName:Starting...\n\n";
# Direct method call; indirect object syntax ("new Benchmark") is discouraged
# because its parsing depends on what else happens to be in scope.
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  # Usage goes out through die, so asking for help also ends the script here.
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@SDFilesList);
@SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

print "Checking input SD file(s)...\n";
my(%SDFilesInfo);
RetrieveSDFilesInfo();

# Generate output files...
my(%FilteredSDFileInfo);
if (@SDFilesList > 1) {
  print "\nProcessing SD files...\n";
}
for my $FileIndex (0 .. $#SDFilesList) {
  # Files rejected by RetrieveSDFilesInfo() have FileOkay set to 0 and are skipped.
  if ($SDFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $SDFilesList[$FileIndex]...\n";
    FilterSDFile($FileIndex);
  }
}
print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
81
82 ###############################################################################
83
# Filter SD file...
#
# Processes the input file at position $Index in @SDFilesList. Compound strings
# are read one at a time with ReadCmpdString() and each one is handed to
# WriteOutCmpdString() after $FilteredSDFileInfo{FilterCmpd} has been set to
# say whether the compound is rejected (1) or kept (0). Output file names are
# taken from %SDFilesInfo. The counters in %FilteredSDFileInfo are reset on
# entry and a summary is printed at the end.
#
# A compound is rejected when:
#   . its ctab line count is zero or less (always checked);
#   . -a or -m is set and its ctab line count differs from the atom + bond
#     counts parsed from its counts line.
# For compounds whose ctab line count matches the counts line:
#   . -a or -u rejects it when GetUnknownAtoms() reports any unknown atoms;
#   . -a or -s (without -c) rejects it when WashCmpd() reports more than one
#     fragment;
#   . -c keeps a multi-fragment compound but writes the washed compound string
#     returned by WashCmpd() in place of the original.
#
# Dies when a file can't be opened or an output file can't be closed cleanly.
sub FilterSDFile {
  my($Index) = @_;
  my($SDFile, $NewSDFile, $NewKeepSDFile, $CtabLinesCount, $CmpdString, $PrintCmpdCounterHeader, @CmpdLines);

  $SDFile = $SDFilesList[$Index];
  $NewSDFile = $SDFilesInfo{OutFile}[$Index];
  $NewKeepSDFile = $SDFilesInfo{OutFileKeep}[$Index];

  # Open the input first so that an unreadable input file doesn't leave empty
  # or truncated output files behind. Three-argument open keeps characters in
  # the file names from being interpreted as an open mode. The handle names
  # stay as barewords because WriteOutCmpdString() prints to NEWSDFILE and
  # NEWKEEPSDFILE by name.
  open SDFILE, '<', $SDFile or die "Error: Can't open $SDFile: $! \n";
  open NEWSDFILE, '>', $NewSDFile or die "Error: Couldn't open $NewSDFile: $! \n";
  if ($OptionsInfo{Keep}) {
    open NEWKEEPSDFILE, '>', $NewKeepSDFile or die "Error: Couldn't open $NewKeepSDFile: $! \n";
  }

  print "\nGenerating SD file $NewSDFile...\n";
  if ($OptionsInfo{Keep}) {
    print "Generating file $NewKeepSDFile...\n";
  }

  %FilteredSDFileInfo = ();

  $FilteredSDFileInfo{CmpdCount} = 0; $FilteredSDFileInfo{FilterCmpd} = 0;
  $FilteredSDFileInfo{FilteredCmpdCount} = 0; $FilteredSDFileInfo{KeepCmpdCount} = 0;

  $PrintCmpdCounterHeader = 1;

  CMPDSTRING: while ($CmpdString = ReadCmpdString(\*SDFILE)) {
    $FilteredSDFileInfo{CmpdCount} += 1;
    $FilteredSDFileInfo{FilterCmpd} = 0;

    # Progress report every 5000 compounds; the header is printed only once.
    if (($FilteredSDFileInfo{CmpdCount} % 5000) == 0) {
      if ($PrintCmpdCounterHeader) {
        $PrintCmpdCounterHeader = 0;
        print "\nProcessing compounds:";
      }
      print "$FilteredSDFileInfo{CmpdCount}...";
    }

    @CmpdLines = split "\n", $CmpdString;
    $CtabLinesCount = GetCtabLinesCount(\@CmpdLines);

    # No structural data: always rejected, whatever options were given.
    if ($CtabLinesCount <= 0) {
      $FilteredSDFileInfo{FilterCmpd} = 1;
      WriteOutCmpdString($CmpdString);
      next CMPDSTRING;
    }

    # The fourth line of the compound is passed to the counts line parser.
    my($AtomCount, $BondCount) = ParseCmpdCountsLine($CmpdLines[3]);
    my $CountsMatch = ($CtabLinesCount == ($AtomCount + $BondCount));

    if (!$CountsMatch) {
      # Mismatched compounds are rejected only on request; otherwise they
      # pass through untouched and skip the remaining checks.
      if ($OptionsInfo{All} || $OptionsInfo{Mismatch}) {
        $FilteredSDFileInfo{FilterCmpd} = 1;
      }
      WriteOutCmpdString($CmpdString);
      next CMPDSTRING;
    }

    if ($OptionsInfo{All} || $OptionsInfo{UnknownAtoms}) {
      my($UnknownAtomCount) = GetUnknownAtoms(\@CmpdLines);
      if ($UnknownAtomCount) {
        $FilteredSDFileInfo{FilterCmpd} = 1;
        WriteOutCmpdString($CmpdString);
        next CMPDSTRING;
      }
    }

    if ($OptionsInfo{All} || $OptionsInfo{CleanSalts} || $OptionsInfo{Salts}) {
      my($FragmentsCount, undef, $WashedCmpdString) = WashCmpd(\@CmpdLines);
      if ($FragmentsCount > 1) {
        # NOTE(review): this condition used to read
        # "$OptionsInfo{all} || $OptionsInfo{CleanSalts}". The lower-case
        # "all" key is never set by ProcessOptions(), so that term was always
        # false and has been dropped. Runtime behavior is unchanged: -a on its
        # own rejects compounds with salts, as the EXAMPLES section of the POD
        # describes. Confirm whether -a was ever meant to wash instead.
        if ($OptionsInfo{CleanSalts}) {
          $CmpdString = $WashedCmpdString;
        }
        else {
          $FilteredSDFileInfo{FilterCmpd} = 1;
        }
      }
    }

    WriteOutCmpdString($CmpdString);
  }
  if (!$PrintCmpdCounterHeader) {
    print "\n";
  }

  # Buffered write errors only show up at close time, so check the output
  # handles instead of silently producing a short file.
  close NEWSDFILE or die "Error: Couldn't close $NewSDFile: $! \n";
  if ($OptionsInfo{Keep}) {
    close NEWKEEPSDFILE or die "Error: Couldn't close $NewKeepSDFile: $! \n";
  }
  close SDFILE;

  print "\nTotal Number of compounds: $FilteredSDFileInfo{CmpdCount}\n";
  print "Number of compounds left after filtering: $FilteredSDFileInfo{FilteredCmpdCount}\n";
  print "Number of compounds ignored: $FilteredSDFileInfo{KeepCmpdCount}\n";
}
175
# Write out the compound data...
#
# Routes one compound string according to $FilteredSDFileInfo{FilterCmpd},
# which the caller sets beforehand:
#   . flag off: the compound is counted in FilteredCmpdCount and printed to
#     NEWSDFILE;
#   . flag on:  the compound is counted in KeepCmpdCount and printed to
#     NEWKEEPSDFILE only when $OptionsInfo{Keep} is set.
# A newline is appended to the compound string on output. Both handles are
# expected to have been opened by the caller.
sub WriteOutCmpdString {
  my($CmpdString) = @_;

  # Compound survived filtering: it goes to the main output file.
  unless ($FilteredSDFileInfo{FilterCmpd}) {
    $FilteredSDFileInfo{FilteredCmpdCount}++;
    print NEWSDFILE "$CmpdString\n";
    return;
  }

  # Compound was rejected: always counted, written out only on request.
  $FilteredSDFileInfo{KeepCmpdCount}++;
  print NEWKEEPSDFILE "$CmpdString\n" if $OptionsInfo{Keep};
  return;
}
191
# Retrieve information about input SD files...
#
# Rebuilds %SDFilesInfo with three arrays indexed like @SDFilesList:
#   FileOkay    - 1 when the file passed every check below, 0 otherwise;
#   OutFile     - name of the filtered output file ('' when not okay);
#   OutFileKeep - name of the file for ignored compounds ('' when not okay).
#
# A file is skipped, with a warning, when it doesn't exist, fails
# CheckFileType(), would collide with an existing output file while
# --overwrite is off, or would be written over itself.
sub RetrieveSDFilesInfo {
  %SDFilesInfo = (FileOkay => [], OutFile => [], OutFileKeep => []);

  # --root is honored only when exactly one input file was specified.
  my $RootApplies = ($Options{root} && (@SDFilesList == 1));

  FILELIST: for my $FileNum (0 .. $#SDFilesList) {
    my $InFile = $SDFilesList[$FileNum];

    # Start out pessimistic; the entries are filled in at the bottom of the loop.
    $SDFilesInfo{FileOkay}[$FileNum] = 0;
    $SDFilesInfo{OutFile}[$FileNum] = '';
    $SDFilesInfo{OutFileKeep}[$FileNum] = '';

    unless (-e $InFile) {
      warn "Warning: Ignoring file $InFile: It doesn't exist\n";
      next FILELIST;
    }
    unless (CheckFileType($InFile, "sd sdf")) {
      warn "Warning: Ignoring file $InFile: It's not a SD file\n";
      next FILELIST;
    }

    # Setup new file names...
    # The directory part is not used: output names carry no directory.
    my(undef, $BaseName, $Ext) = ParseFileName($InFile);
    my($OutName, $IgnoredName);
    if ($RootApplies) {
      # A root given with an extension contributes only its name part.
      my(undef, $RootName, $RootExt) = ParseFileName($Options{root});
      $OutName = ($RootName && $RootExt) ? $RootName : $Options{root};
      $IgnoredName = $OutName;
    }
    else {
      $OutName = $BaseName . "Filtered";
      $IgnoredName = $BaseName;
    }
    $OutName .= ".$Ext";
    $IgnoredName .= "Ignored" . ".$Ext";

    # Without --overwrite, refuse to clobber existing output. The "ignored"
    # file only matters when --keep was asked for.
    if (!$Options{overwrite}) {
      my @Candidates = ($OutName);
      push @Candidates, $IgnoredName if $Options{keep};
      for my $Candidate (@Candidates) {
        next unless -e $Candidate;
        warn "Warning: Ignoring file $InFile: New SD file, $Candidate, already exists\n";
        next FILELIST;
      }
    }

    # Case-insensitive comparison of the output and input names.
    if (lc($OutName) eq lc($InFile)) {
      warn "Warning: Ignoring file $InFile: Same output, $OutName, and input file name\n";
      print "Specify a different name using \"-r --root\" option or use default name.\n";
      next FILELIST;
    }

    $SDFilesInfo{FileOkay}[$FileNum] = 1;
    $SDFilesInfo{OutFile}[$FileNum] = $OutName;
    $SDFilesInfo{OutFileKeep}[$FileNum] = $IgnoredName;
  }
}
259
# Process option values...
#
# Rebuilds %OptionsInfo from the command line flags held in %Options. Each
# %OptionsInfo key is the mixed-case form of the lower-case %Options key it
# is copied from (All <- all, CleanSalts <- cleansalts, and so on). A flag
# that is absent or false is stored as undef.
sub ProcessOptions {
  %OptionsInfo = ();

  for my $InfoKey (qw(All CleanSalts Empty Keep Mismatch Overwrite Salts UnknownAtoms)) {
    $OptionsInfo{$InfoKey} = $Options{lc $InfoKey} || undef;
  }
}
274
# Setup script usage and retrieve command line arguments specified using various options...
#
# Fills %Options through GetOptions() and dies with a hint about -h when
# option parsing fails. When -w/--workingdir is given, the value must name an
# existing directory; the script changes into it, or dies when that fails.
sub SetupScriptUsage {
  my @OptionSpecs = (
    "all|a", "cleansalts|c", "empty|e", "help|h", "keep|k", "mismatch|m",
    "overwrite|o", "root|r=s", "salts|s", "unknownatoms|u", "workingdir|w=s",
  );

  # Retrieve all the options...
  %Options = ();
  GetOptions(\%Options, @OptionSpecs)
    or die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";

  # Nothing more to do unless a working directory was requested.
  my $WorkingDir = $Options{workingdir};
  return unless $WorkingDir;

  -d $WorkingDir
    or die "Error: The value specified, $WorkingDir, for option \"-w --workingdir\" is not a directory name.\n";
  chdir $WorkingDir or die "Error: Couldn't chdir $WorkingDir: $! \n";
}
290
291 __END__
292
293 =head1 NAME
294
295 FilterSDFiles.pl - Filter compounds from SDFile(s)
296
297 =head1 SYNOPSIS
298
299 FilterSDFiles.pl SDFile(s)...
300
301 FilterSDFiles.pl [B<-a, --all>] [B<-e, --empty>] [B<-c, --cleansalts>] [B<-h, --help>]
302 [B<-k, --keep>] [B<-m, --mismatch>] [B<-o, --overwrite>] [B<-r, --root> I<rootname>]
303 [B<-s, --salts>] [B<-u, --unknownatoms>] [B<-w, --workingdir> I<dirname>] SDFile(s)...
304
305 =head1 DESCRIPTION
306
307 Filter specific compounds from I<SDFile(s)>. Available choices are: wash or
308 remove compounds with salts; take out compounds with no
309 structural data; remove compounds with mismatched atom/bond blocks data;
310 remove compounds which contain unknown atoms and so on. Multiple SDFile
311 names are separated by spaces. The valid file extensions are I<.sdf> and I<.sd>.
312 All other file names are ignored. All the SD files in the current directory can be
313 specified either by I<*.sdf> or the current directory name.
314
315 =head1 OPTIONS
316
317 =over 4
318
319 =item B<-a, --all>
320
321 Use all options to filter compounds.
322
323 =item B<-e, --empty>
324
325 Filter compounds with empty atom/bond blocks. This is B<default behavior>.
326
327 =item B<-c, --cleansalts>
328
329 Wash compounds which contain salts identified as disconnected structural
330 units. The largest fragment is kept.
331
332 =item B<-h, --help>
333
334 Print this help message.
335
336 =item B<-k, --keep>
337
338 Keep the compounds which were filtered in a separate file. Default: Just
339 ignore these compounds. Option B<-r --root> is used to generate the new file
340 name: <Root>Ignored.sdf. Default file name: <SDFileName>Ignored.sdf.
341
342 =item B<-m, --mismatch>
343
344 Remove compounds with mismatched atom/bond blocks and counts line
345 information specified by header block.
346
347 =item B<-o, --overwrite>
348
349 Overwrite existing files.
350
351 =item B<-r, --root> I<rootname>
352
353 New SD file name is generated using the root: <Root>.sdf. Default file
354 name: <SDFileName>Filtered.sdf. This option is ignored for multiple input files.
355
356 =item B<-s, --salts>
357
358 Remove compounds which contain salts identified as disconnected structural
359 units.
360
361 =item B<-u, --unknownatoms>
362
363 Remove compounds with atom blocks containing special atom symbols such
364 as L, Q, *, LP, X, R#, or any other non-periodic-table symbols.
365
366 =item B<-w, --workingdir> I<dirname>
367
368 Location of working directory. Default: current directory.
369
370 =back
371
372 =head1 EXAMPLES
373
374 To remove compounds from SD files which contain salts, unknown atoms, or
375 mismatched atom/bonds block data or no structural data, type:
376
377 % FilterSDFiles.pl -a -o Sample.sdf
378 % FilterSDFiles.pl -a -o *.sdf
379
380 And to generate a new NewSampleIgnored.sdf file for filtered compounds, type:
381
382 % FilterSDFiles.pl -a -k -r NewSample -o Sample.sdf
383
384 To wash compounds in order to get rid of all disconnected fragments except for
385 the largest one, type:
386
387 % FilterSDFiles.pl -c -o Sample.sdf
388
389 =head1 AUTHOR
390
391 Manish Sud <msud@san.rr.com>
392
393 =head1 SEE ALSO
394
395 ExtractFromSDFiles.pl, InfoSDFiles.pl, MergeTextFilesWithSD.pl
396
397 =head1 COPYRIGHT
398
399 Copyright (C) 2015 Manish Sud. All rights reserved.
400
401 This file is part of MayaChemTools.
402
403 MayaChemTools is free software; you can redistribute it and/or modify it under
404 the terms of the GNU Lesser General Public License as published by the Free
405 Software Foundation; either version 3 of the License, or (at your option)
406 any later version.
407
408 =cut