Mercurial > repos > deepakjadmin > mayatool3_test2
comparison bin/FilterSDFiles.pl @ 0:4816e4a8ae95 draft default tip
Uploaded
author | deepakjadmin |
---|---|
date | Wed, 20 Jan 2016 09:23:18 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4816e4a8ae95 |
---|---|
1 #!/usr/bin/perl -w | |
2 # | |
3 # $RCSfile: FilterSDFiles.pl,v $ | |
4 # $Date: 2015/02/28 20:46:20 $ | |
5 # $Revision: 1.32 $ | |
6 # | |
7 # Author: Manish Sud <msud@san.rr.com> | |
8 # | |
9 # Copyright (C) 2015 Manish Sud. All rights reserved. | |
10 # | |
11 # This file is part of MayaChemTools. | |
12 # | |
13 # MayaChemTools is free software; you can redistribute it and/or modify it under | |
14 # the terms of the GNU Lesser General Public License as published by the Free | |
15 # Software Foundation; either version 3 of the License, or (at your option) any | |
16 # later version. | |
17 # | |
18 # MayaChemTools is distributed in the hope that it will be useful, but without | |
19 # any warranty; without even the implied warranty of merchantability of fitness | |
20 # for a particular purpose. See the GNU Lesser General Public License for more | |
21 # details. | |
22 # | |
23 # You should have received a copy of the GNU Lesser General Public License | |
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or | |
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, | |
26 # Boston, MA, 02111-1307, USA. | |
27 # | |
28 | |
29 use strict; | |
30 use FindBin; use lib "$FindBin::Bin/../lib"; | |
31 use Getopt::Long; | |
32 use File::Basename; | |
33 use Benchmark; | |
34 use SDFileUtil; | |
35 use FileUtil; | |
36 | |
# File-scoped state. These lexicals are read and written by the subroutines
# defined later in this file, so they must remain declared at file scope and
# ahead of those definitions.
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName:Starting...\n\n";
# Direct method call rather than the indirect-object form "new Benchmark",
# whose parsing depends on what else happens to be in scope.
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

# Expand the command line arguments into the list of candidate SD files...
my(@SDFilesList);
@SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Validate the input files and work out the output file names...
print "Checking input SD file(s)...\n";
my(%SDFilesInfo);
RetrieveSDFilesInfo();

# Generate output files...
# %FilteredSDFileInfo holds the per-file counters; FilterSDFile() resets it
# for each file and WriteOutCmpdString() updates it.
my(%FilteredSDFileInfo);
if (@SDFilesList > 1) {
  print "\nProcessing SD files...\n";
}
for my $FileIndex (0 .. $#SDFilesList) {
  # Files rejected by RetrieveSDFilesInfo() are silently skipped here; the
  # warning was already issued there.
  if ($SDFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $SDFilesList[$FileIndex]...\n";
    FilterSDFile($FileIndex);
  }
}
print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
81 | |
82 ############################################################################### | |
83 | |
# Filter one input SD file.
#
# Parameters:
#   $Index - position of the file in @SDFilesList; the matching output file
#            names are taken from $SDFilesInfo{OutFile} and
#            $SDFilesInfo{OutFileKeep}.
#
# Reads the input compound by compound and hands each one to
# WriteOutCmpdString(), which routes it to the filtered output or, when the
# keep option is on, to the "ignored" output. A compound is marked as filtered
# out when:
#   - it has no atom/bond block lines (always checked);
#   - the atom/bond block size disagrees with the counts line (all, mismatch);
#   - it contains unknown atoms (all, unknownatoms);
#   - it has more than one fragment and is not being washed (all, salts).
# With cleansalts, a multi-fragment compound is replaced by its washed form
# and kept.
#
# Dies when a file cannot be opened or an output file cannot be closed.
# Prints per-file totals to STDOUT when done.
sub FilterSDFile {
  my($Index) = @_;
  my($SDFile, $NewSDFile, $NewKeepSDFile, $CtabLinesCount, $CmpdString, $PrintCmpdCounterHeader, @CmpdLines);

  $SDFile = $SDFilesList[$Index];
  $NewSDFile = $SDFilesInfo{OutFile}[$Index];
  $NewKeepSDFile = $SDFilesInfo{OutFileKeep}[$Index];

  # Three-argument open keeps the mode separate from the file name, so names
  # containing characters such as '>', '|' or surrounding whitespace are used
  # literally. The bareword handles are kept because WriteOutCmpdString()
  # prints to NEWSDFILE and NEWKEEPSDFILE by name.
  open(NEWSDFILE, '>', $NewSDFile) or die "Error: Couldn't open $NewSDFile: $! \n";
  if ($OptionsInfo{Keep}) {
    open(NEWKEEPSDFILE, '>', $NewKeepSDFile) or die "Error: Couldn't open $NewKeepSDFile: $! \n";
  }
  open(SDFILE, '<', $SDFile) or die "Error: Can't open $SDFile: $! \n";

  print "\nGenerating SD file $NewSDFile...\n";
  if ($OptionsInfo{Keep}) {
    print "Generating file $NewKeepSDFile...\n";
  }

  # Reset the per-file counters shared with WriteOutCmpdString()...
  %FilteredSDFileInfo = ();

  $FilteredSDFileInfo{CmpdCount} = 0; $FilteredSDFileInfo{FilterCmpd} = 0;
  $FilteredSDFileInfo{FilteredCmpdCount} = 0; $FilteredSDFileInfo{KeepCmpdCount} = 0;

  $PrintCmpdCounterHeader = 1;

  CMPDSTRING: while ($CmpdString = ReadCmpdString(\*SDFILE)) {
    $FilteredSDFileInfo{CmpdCount} += 1;
    $FilteredSDFileInfo{FilterCmpd} = 0;

    # Progress report every 5000 compounds; the header is printed only once.
    if (($FilteredSDFileInfo{CmpdCount} % 5000) == 0) {
      if ($PrintCmpdCounterHeader) {
        $PrintCmpdCounterHeader = 0;
        print "\nProcessing compounds:";
      }
      print "$FilteredSDFileInfo{CmpdCount}...";
    }

    @CmpdLines = split "\n", $CmpdString;
    $CtabLinesCount = GetCtabLinesCount(\@CmpdLines);

    # No structural data: always filtered, regardless of options.
    if ($CtabLinesCount <= 0) {
      $FilteredSDFileInfo{FilterCmpd} = 1;
      WriteOutCmpdString($CmpdString);
      next CMPDSTRING;
    }

    # The counts line is the fourth line of the compound record.
    my ($AtomCount, $BondCount) = ParseCmpdCountsLine($CmpdLines[3]);

    if ($OptionsInfo{All} || $OptionsInfo{Mismatch}) {
      if ($CtabLinesCount != ($AtomCount + $BondCount)) {
        $FilteredSDFileInfo{FilterCmpd} = 1;
        WriteOutCmpdString($CmpdString);
        next CMPDSTRING;
      }
    }

    # The remaining checks look inside the atom/bond block, so they are only
    # attempted when that block agrees with the counts line. A mismatched
    # compound that was not filtered above is written out unchanged.
    if ($CtabLinesCount == ($AtomCount + $BondCount)) {
      if ($OptionsInfo{All} || $OptionsInfo{UnknownAtoms}) {
        my($UnknownAtomCount) = GetUnknownAtoms(\@CmpdLines);
        if ($UnknownAtomCount) {
          $FilteredSDFileInfo{FilterCmpd} = 1;
          WriteOutCmpdString($CmpdString);
          next CMPDSTRING;
        }
      }
      if ($OptionsInfo{All} || $OptionsInfo{CleanSalts} || $OptionsInfo{Salts}) {
        my ($FragmentsCount, $Fragments, $WashedCmpdString) = WashCmpd(\@CmpdLines);
        if ($FragmentsCount > 1) {
          # NOTE(review): this condition used to read
          # "$OptionsInfo{all} || $OptionsInfo{CleanSalts}". ProcessOptions()
          # only ever sets the key "All", so the lower-case test was always
          # false and "-a" on its own removes multi-fragment compounds, which
          # is what the EXAMPLES section of the POD documents. The dead test
          # is dropped; runtime behavior is unchanged. Confirm with the
          # maintainer before making "-a" wash compounds instead.
          if ($OptionsInfo{CleanSalts}) {
            $CmpdString = $WashedCmpdString;
          }
          else {
            $FilteredSDFileInfo{FilterCmpd} = 1;
          }
          WriteOutCmpdString($CmpdString);
          next CMPDSTRING;
        }
      }
    }
    WriteOutCmpdString($CmpdString);
  }
  if (!$PrintCmpdCounterHeader) {
    print "\n";
  }

  # Buffered write errors (for example a full disk) only surface at close, so
  # the output handles are checked rather than silently leaving a truncated
  # file behind.
  close(NEWSDFILE) or die "Error: Couldn't close $NewSDFile: $! \n";
  if ($OptionsInfo{Keep}) {
    close(NEWKEEPSDFILE) or die "Error: Couldn't close $NewKeepSDFile: $! \n";
  }
  close SDFILE;

  print "\nTotal Number of compounds: $FilteredSDFileInfo{CmpdCount}\n";
  print "Number of compounds left after filtering: $FilteredSDFileInfo{FilteredCmpdCount}\n";
  print "Number of compounds ignored: $FilteredSDFileInfo{KeepCmpdCount}\n";
}
175 | |
# Route one compound record to the appropriate output and update the counters.
#
# Parameters:
#   $CmpdString - text of the compound record; a newline is appended on output.
#
# Reads $FilteredSDFileInfo{FilterCmpd}, which the caller sets before calling:
#   false - the compound survives filtering; it is written to NEWSDFILE and
#           FilteredCmpdCount is incremented.
#   true  - the compound is filtered out; KeepCmpdCount is incremented and the
#           record is written to NEWKEEPSDFILE only when the keep option is on.
sub WriteOutCmpdString {
  my($CmpdString) = @_;

  my $IsFilteredOut = $FilteredSDFileInfo{FilterCmpd};

  if (!$IsFilteredOut) {
    $FilteredSDFileInfo{FilteredCmpdCount}++;
    print NEWSDFILE $CmpdString . "\n";
  }
  else {
    $FilteredSDFileInfo{KeepCmpdCount}++;
    print NEWKEEPSDFILE $CmpdString . "\n" if $OptionsInfo{Keep};
  }
}
191 | |
# Validate the input SD files and work out their output file names.
#
# Fills the file-scoped %SDFilesInfo with three arrays indexed like
# @SDFilesList:
#   FileOkay    - 1 when the file will be processed, 0 when it is skipped.
#   OutFile     - name of the filtered output file ('' when skipped).
#   OutFileKeep - name of the "ignored compounds" output file ('' when skipped).
#
# A file is skipped, with a warning, when it does not exist, is not recognised
# as an SD file, when an output file already exists and overwrite is off, or
# when an output file name is the same as the input file name.
#
# Output names are built from the input file's name and extension only, without
# its directory part. The root option replaces the base name, but only when
# exactly one input file was given.
sub RetrieveSDFilesInfo {
  my($Index, $SDFile, $FileDir, $FileName, $FileExt, $NewSDFile, $NewKeepSDFile);

  %SDFilesInfo = ();
  @{$SDFilesInfo{FileOkay}} = ();
  @{$SDFilesInfo{OutFile}} = ();
  @{$SDFilesInfo{OutFileKeep}} = ();

  FILELIST: for $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];

    # Start out as "not okay"; only flipped at the very end once every check
    # has passed.
    $SDFilesInfo{FileOkay}[$Index] = 0;
    $SDFilesInfo{OutFile}[$Index] = '';
    $SDFilesInfo{OutFileKeep}[$Index] = '';

    if (!(-e $SDFile)) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    # Setup new file names...
    $FileDir = ""; $FileName = ""; $FileExt = "";
    ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
    if ($Options{root} && (@SDFilesList == 1)) {
      # When the root value itself carries an extension, only its name part is
      # used; the extension always comes from the input file.
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
      if ($RootFileName && $RootFileExt) {
        $NewSDFile = $RootFileName;
      }
      else {
        $NewSDFile = $Options{root};
      }
      $NewKeepSDFile = $NewSDFile;
    }
    else {
      $NewSDFile = $FileName . "Filtered";
      $NewKeepSDFile = $FileName;
    }
    $NewSDFile .= ".$FileExt";
    $NewKeepSDFile .= "Ignored" . ".$FileExt";

    if (!$Options{overwrite}) {
      if (-e $NewSDFile) {
        warn "Warning: Ignoring file $SDFile: New SD file, $NewSDFile, already exists\n";
        next FILELIST;
      }
      if ($Options{keep}) {
        if (-e $NewKeepSDFile) {
          warn "Warning: Ignoring file $SDFile: New SD file, $NewKeepSDFile, already exists\n";
          next FILELIST;
        }
      }
    }

    # Never let an output file be the input file: the outputs are opened for
    # writing, and so truncated, before the input is read.
    if (lc($NewSDFile) eq lc($SDFile)) {
      warn "Warning: Ignoring file $SDFile: Same output, $NewSDFile, and input file name\n";
      print "Specify a different name using \"-r --root\" option or use default name.\n";
      next FILELIST;
    }
    # The same hazard applies to the keep file: with keep, overwrite and a
    # root of "X", an input named "XIgnored.<ext>" would otherwise be
    # truncated before it is read.
    if ($Options{keep} && (lc($NewKeepSDFile) eq lc($SDFile))) {
      warn "Warning: Ignoring file $SDFile: Same output, $NewKeepSDFile, and input file name\n";
      print "Specify a different name using \"-r --root\" option or use default name.\n";
      next FILELIST;
    }

    $SDFilesInfo{FileOkay}[$Index] = 1;
    $SDFilesInfo{OutFile}[$Index] = $NewSDFile;
    $SDFilesInfo{OutFileKeep}[$Index] = $NewKeepSDFile;
  }
}
259 | |
# Copy the command line switches from %Options into %OptionsInfo.
#
# %OptionsInfo is cleared and then given one entry per switch below. Each
# entry holds the value from %Options when that value is true, and undef
# otherwise, so every key always exists after this call.
sub ProcessOptions {
  # %OptionsInfo key => %Options key (the long option name).
  my %OptionNameFor = (
    All          => 'all',
    CleanSalts   => 'cleansalts',
    Empty        => 'empty',
    Keep         => 'keep',
    Mismatch     => 'mismatch',
    Overwrite    => 'overwrite',
    Salts        => 'salts',
    UnknownAtoms => 'unknownatoms',
  );

  %OptionsInfo = ();

  for my $InfoKey (keys %OptionNameFor) {
    my $Value = $Options{ $OptionNameFor{$InfoKey} };
    $OptionsInfo{$InfoKey} = $Value ? $Value : undef;
  }
}
274 | |
# Parse the command line into the file-scoped %Options hash and, when a
# working directory was requested, change into it.
#
# Dies with a pointer to the help option when option parsing fails, and with
# an error when the working directory value is not a directory or cannot be
# entered.
sub SetupScriptUsage {
  # Getopt::Long specifications: "long|short", with "=s" for the options
  # that take a string value.
  my @OptionSpecs = (
    "all|a", "cleansalts|c", "empty|e", "help|h", "keep|k", "mismatch|m",
    "overwrite|o", "root|r=s", "salts|s", "unknownatoms|u", "workingdir|w=s",
  );

  # Retrieve all the options...
  %Options = ();
  GetOptions(\%Options, @OptionSpecs)
    or die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";

  if ($Options{workingdir}) {
    -d $Options{workingdir}
      or die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    chdir($Options{workingdir}) or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
}
290 | |
291 __END__ | |
292 | |
293 =head1 NAME | |
294 | |
295 FilterSDFiles.pl - Filter compounds from SDFile(s) | |
296 | |
297 =head1 SYNOPSIS | |
298 | |
299 FilterSDFiles.pl SDFile(s)... | |
300 | |
301 FilterSDFiles.pl [B<-a, --all>] [B<-e, --empty>] [B<-c, --cleansalts>] [B<-h, --help>] | |
302 [B<-k, --keep>] [B<-m, --mismatch>] [B<-o, --overwrite>] [B<-r, --root> I<rootname>] | |
303 [B<-s, --salts>] [B<-u, --unknownatoms>] [B<-w, --workingdir> I<dirname>] SDFile(s)... | |
304 | |
305 =head1 DESCRIPTION | |
306 | |
307 Filter specific compounds from I<SDFile(s)>. Available choices are: wash or | |
308 remove compounds with salts; take out compounds with no | |
309 structural data; remove compounds with mismatched atom/bond blocks data; | |
310 remove compounds which contain unknown atoms and so on. Multiple SDFile | |
311 names are separated by spaces. The valid file extensions are I<.sdf> and I<.sd>. | |
312 All other file names are ignored. All the SD files in a current directory can be | |
313 specified either by I<*.sdf> or the current directory name. | |
314 | |
315 =head1 OPTIONS | |
316 | |
317 =over 4 | |
318 | |
319 =item B<-a, --all> | |
320 | |
321 Use all options to filter compounds. | |
322 | |
323 =item B<-e, --empty> | |
324 | |
325 Filter compounds with empty atom/bond blocks. This is B<default behavior>. | |
326 | |
327 =item B<-c, --cleansalts> | |
328 | |
329 Wash compounds which contain salts identified as disconnected structural | |
330 units. The largest fragment is kept. | |
331 | |
332 =item B<-h, --help> | |
333 | |
334 Print this help message. | |
335 | |
336 =item B<-k, --keep> | |
337 | |
338 Keep the compounds which were filtered in a separate file. Default: Just | |
339 ignore these compounds. Option B<-r --root> is used to generate the new file | |
340 name: <Root>Ignored.sdf. Default file name: <SDFileName>Ignored.sdf. | |
341 | |
342 =item B<-m, --mismatch> | |
343 | |
344 Remove compounds with mismatched atom/bond blocks and counts line | |
345 information specified by header block. | |
346 | |
347 =item B<-o, --overwrite> | |
348 | |
349 Overwrite existing files. | |
350 | |
351 =item B<-r, --root> I<rootname> | |
352 | |
353 New SD file name is generated using the root: <Root>.sdf. Default file | |
354 name: <SDFileName>Filtered.sdf. This option is ignored for multiple input files. | |
355 | |
356 =item B<-s, --salts> | |
357 | |
358 Remove compounds which contain salts identified as disconnected structural | |
359 units. | |
360 | |
361 =item B<-u, --unknownatoms> | |
362 | |
363 Remove compounds with atom blocks containing special atom symbols such | |
364 as L, Q, *, LP, X, R#, or any other non-periodic-table symbols. | |
365 | |
366 =item B<-w, --workingdir> I<dirname> | |
367 | |
368 Location of working directory. Default: current directory. | |
369 | |
370 =back | |
371 | |
372 =head1 EXAMPLES | |
373 | |
374 To remove compounds from SD files which contain salts, unknown atoms, or | |
375 mismatched atom/bonds block data or no structural data, type: | |
376 | |
377 % FilterSDFiles.pl -a -o Sample.sdf | |
378 % FilterSDFiles.pl -a -o *.sdf | |
379 | |
380 And to generate a new NewSampleIgnored.sdf file for filtered compounds, type: | |
381 | |
382 % FilterSDFiles.pl -a -k -r NewSample -o Sample.sdf | |
383 | |
384 To wash compounds in order to get rid of all disconnected fragments except for | |
385 the largest one, type: | |
386 | |
387 % FilterSDFiles.pl -c -o Sample.sdf | |
388 | |
389 =head1 AUTHOR | |
390 | |
391 Manish Sud <msud@san.rr.com> | |
392 | |
393 =head1 SEE ALSO | |
394 | |
395 ExtractFromSDFiles.pl, InfoSDFiles.pl, MergeTextFilesWithSD.pl | |
396 | |
397 =head1 COPYRIGHT | |
398 | |
399 Copyright (C) 2015 Manish Sud. All rights reserved. | |
400 | |
401 This file is part of MayaChemTools. | |
402 | |
403 MayaChemTools is free software; you can redistribute it and/or modify it under | |
404 the terms of the GNU Lesser General Public License as published by the Free | |
405 Software Foundation; either version 3 of the License, or (at your option) | |
406 any later version. | |
407 | |
408 =cut |