Mercurial > repos > deepakjadmin > mayatool3_test2
comparison bin/InfoSDFiles.pl @ 0:4816e4a8ae95 draft default tip
Uploaded
author | deepakjadmin |
---|---|
date | Wed, 20 Jan 2016 09:23:18 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4816e4a8ae95 |
---|---|
1 #!/usr/bin/perl -w | |
2 # | |
3 # $RCSfile: InfoSDFiles.pl,v $ | |
4 # $Date: 2015/02/28 20:46:20 $ | |
5 # $Revision: 1.35 $ | |
6 # | |
7 # Author: Manish Sud <msud@san.rr.com> | |
8 # | |
9 # Copyright (C) 2015 Manish Sud. All rights reserved. | |
10 # | |
11 # This file is part of MayaChemTools. | |
12 # | |
13 # MayaChemTools is free software; you can redistribute it and/or modify it under | |
14 # the terms of the GNU Lesser General Public License as published by the Free | |
15 # Software Foundation; either version 3 of the License, or (at your option) any | |
16 # later version. | |
17 # | |
18 # MayaChemTools is distributed in the hope that it will be useful, but without | |
19 # any warranty; without even the implied warranty of merchantability of fitness | |
20 # for a particular purpose. See the GNU Lesser General Public License for more | |
21 # details. | |
22 # | |
23 # You should have received a copy of the GNU Lesser General Public License | |
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or | |
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, | |
26 # Boston, MA, 02111-1307, USA. | |
27 # | |
28 | |
29 use strict; | |
30 use FindBin; use lib "$FindBin::Bin/../lib"; | |
31 use Getopt::Long; | |
32 use File::Basename; | |
33 use Benchmark; | |
34 use SDFileUtil; | |
35 use TextUtil; | |
36 use FileUtil; | |
37 | |
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename $0;
print "\n$ScriptName:Starting...\n\n";
# Explicit class method call; indirect object syntax ("new Benchmark") is
# ambiguous to parse and discouraged...
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

# Expand command line arguments into a list of SD file names...
my(@SDFilesList) = ExpandFileNames(\@ARGV, "sdf sd");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Setup information about input files...
print "Checking input SD file(s)...\n";
my(%SDFilesInfo, %SDCmpdsInfo);
RetrieveSDFilesInfo();
InitializeSDCmpdsInfo();

# Process input files; only the files flagged as okay during retrieval of
# file information are listed...
if (@SDFilesList > 1) {
  print "\nProcessing SD files...\n";
}
for my $FileIndex (0 .. $#SDFilesList) {
  if ($SDFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $SDFilesList[$FileIndex]...\n";
    ListSDFileInfo($FileIndex);
  }
}
ListTotalSizeOfFiles();

print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
86 | |
87 ############################################################################### | |
88 | |
# List information for the SD file at position $Index in @SDFilesList: either
# detailed compound information or just the compound count, depending on
# $OptionsInfo{ProcessCmpdInfo}, followed by file size and last modification
# time collected in %SDFilesInfo...
sub ListSDFileInfo {
  my($Index) = @_;

  if ($OptionsInfo{ProcessCmpdInfo}) {
    ListCompoundDetailsInfo($Index);
  }
  else {
    ListCompoundCountInfo($Index);
  }

  # File size and modification information...
  print "\nFile size: ", FormatFileSize($SDFilesInfo{FileSize}[$Index]), " \n";
  print "Last modified: ", $SDFilesInfo{FileLastModified}[$Index], " \n";
}
107 | |
# List number of compounds in the SD file at position $Index in @SDFilesList.
# Compounds are counted by the number of lines starting with the "$$$$" record
# delimiter; the count is also added to $SDCmpdsInfo{TotalCmpdCount}...
sub ListCompoundCountInfo {
  my($Index) = @_;
  my($SDFile, $SDFileHandle, $Line, $CmpdCount);

  $SDFile = $SDFilesList[$Index];

  $CmpdCount = 0;

  # Three-argument open with a lexical handle: the file name is never
  # interpreted as a mode or a pipe, and the handle stays local to this sub...
  open $SDFileHandle, "<", $SDFile or die "Couldn't open $SDFile: $! \n";
  while (defined($Line = <$SDFileHandle>)) {
    if ($Line =~ /^\$\$\$\$/) {
      $CmpdCount++;
    }
  }
  close $SDFileHandle;

  $SDCmpdsInfo{TotalCmpdCount} += $CmpdCount;

  print "\nNumber of compounds: $CmpdCount\n";
}
129 | |
# List detailed compound information for the SD file at position $Index in
# @SDFilesList. Depending on the flags in %OptionsInfo, each compound record is
# checked for an empty ctab block, atom/bond counts not matching the counts
# line, chiral flag, unknown atoms, invalid atom numbers and salts; data field
# statistics are collected through ProcessCmpdInfo. A summary is printed at the
# end and the compound count is added to $SDCmpdsInfo{TotalCmpdCount}...
sub ListCompoundDetailsInfo {
  my($Index) = @_;
  my($SDFile, $SDFileHandle, $CmpdCount, $EmptyCtabBlocksCount, $MismatchCtabBlockCount, $ChiralCtabBlockCount, $UnknownAtomsCtabBlockCount, $InvalidAtomNumbersCtabBlockCount, $SaltsCtabBlockCount, $CtabLinesCount, $PrintCmpdCounterHeader, $ProblematicCmpdData, $CmpdString, @CmpdLines);

  $SDFile = $SDFilesList[$Index];

  ($CmpdCount, $EmptyCtabBlocksCount, $MismatchCtabBlockCount, $ChiralCtabBlockCount, $UnknownAtomsCtabBlockCount, $InvalidAtomNumbersCtabBlockCount, $SaltsCtabBlockCount) = (0) x 7;

  # Data field information is tracked per file; the total compound count
  # is preserved across files...
  InitializeSDCmpdsInfo();

  $PrintCmpdCounterHeader = 1;

  # Three-argument open with a lexical handle: the file name is never
  # interpreted as a mode or a pipe, and the handle stays local to this sub...
  open $SDFileHandle, "<", $SDFile or die "Couldn't open $SDFile: $! \n";
  while ($CmpdString = ReadCmpdString($SDFileHandle)) {
    $CmpdCount++;
    $ProblematicCmpdData = 0;

    # Progress indicator after every 5000 compounds at low detail levels...
    if ($OptionsInfo{Detail} <= 1) {
      if (($CmpdCount % 5000) == 0) {
        if ($PrintCmpdCounterHeader) {
          $PrintCmpdCounterHeader = 0;
          print "Processing compounds:";
        }
        print "$CmpdCount...";
      }
    }

    @CmpdLines = split "\n", $CmpdString;
    $CtabLinesCount = GetCtabLinesCount(\@CmpdLines);

    if ($OptionsInfo{All} || $OptionsInfo{Empty}) {
      if ($CtabLinesCount <= 0) {
        $EmptyCtabBlocksCount++;
        $ProblematicCmpdData = 1;
      }
    }

    if ($CtabLinesCount > 0) {
      # Fourth line of the compound record is the counts line...
      my ($AtomCount, $BondCount, $ChiralFlag) = ParseCmpdCountsLine($CmpdLines[3]);

      if ($OptionsInfo{All} || $OptionsInfo{Mismatch}) {
        if ($CtabLinesCount != ($AtomCount + $BondCount)) {
          $MismatchCtabBlockCount++;
          $ProblematicCmpdData = 1;
          if ($OptionsInfo{Detail} >= 2) {
            print "\nMismatch found: Ctab lines count: $CtabLinesCount; Atoms count: $AtomCount; Bond count: $BondCount\n";
          }
        }
      }

      if ($OptionsInfo{All} || $OptionsInfo{Chiral}) {
        if ($ChiralFlag == 1) {
          $ChiralCtabBlockCount++;
        }
      }

      # Atom and bond level checks are performed only for ctab blocks
      # consistent with the counts line...
      if ($CtabLinesCount == ($AtomCount + $BondCount)) {
        if ($OptionsInfo{All} || $OptionsInfo{UnknownAtoms}) {
          my($UnknownAtomCount, $UnknownAtoms, $UnknownAtomLines) = GetUnknownAtoms(\@CmpdLines);
          if ($UnknownAtomCount) {
            $UnknownAtomsCtabBlockCount++;
            $ProblematicCmpdData = 1;
            if ($OptionsInfo{Detail} >= 2) {
              print "\nUnknown atom(s) found: $UnknownAtomCount\nUnknown atom(s) symbols:$UnknownAtoms\nUnknown atom(s) data lines:\n$UnknownAtomLines\n";
            }
          }
        }
        if ($OptionsInfo{All} || $OptionsInfo{InvalidAtomNumbers}) {
          my($InvalidAtomNumbersCount, $InvalidAtomNumbers, $InvalidAtomNumberLines) = GetInvalidAtomNumbers(\@CmpdLines);
          if ($InvalidAtomNumbersCount) {
            $InvalidAtomNumbersCtabBlockCount++;
            $ProblematicCmpdData = 1;
            if ($OptionsInfo{Detail} >= 2) {
              print "\nInvalid atom number(s) found: $InvalidAtomNumbersCount\nInvalid atom number(s):$InvalidAtomNumbers\nInvalid atom number(s) data lines:\n$InvalidAtomNumberLines\n";
            }
          }
        }
        if ($OptionsInfo{All} || $OptionsInfo{Salts}) {
          my($FragmentsCount, $Fragments) = GetCmpdFragments(\@CmpdLines);
          if ($FragmentsCount > 1) {
            $SaltsCtabBlockCount++;
            $ProblematicCmpdData = 1;
            if ($OptionsInfo{Detail} >= 2) {
              print "\nSalts found: $FragmentsCount\nSalts atom numbers:\n$Fragments\n";
            }
          }
        }
      }
    }

    if ($OptionsInfo{ProcessCmpdData}) {
      ProcessCmpdInfo(\@CmpdLines, $CmpdCount);
    }

    # At the highest detail level, list the complete record for compounds
    # flagged as problematic by any of the checks above...
    if ($OptionsInfo{Detail} >= 3) {
      if ($ProblematicCmpdData) {
        print "\nCompound data:\n$CmpdString\n\n";
      }
    }
  }
  # Terminate the progress indicator line, if one was started...
  if ($OptionsInfo{Detail} <= 1) {
    if (!$PrintCmpdCounterHeader) {
      print "\n";
    }
  }
  close $SDFileHandle;

  $SDCmpdsInfo{TotalCmpdCount} += $CmpdCount;

  print "\nNumber of compounds: $CmpdCount\n";

  if ($OptionsInfo{All} || $OptionsInfo{Empty}) {
    print "Number of empty atom/bond blocks: $EmptyCtabBlocksCount\n";
  }
  if ($OptionsInfo{All} || $OptionsInfo{Mismatch}) {
    print "Number of mismatched atom/bond blocks: $MismatchCtabBlockCount\n";
  }
  if ($OptionsInfo{All} || $OptionsInfo{UnknownAtoms}) {
    print "Number of atom blocks with unknown atom labels: $UnknownAtomsCtabBlockCount\n";
  }
  if ($OptionsInfo{All} || $OptionsInfo{InvalidAtomNumbers}) {
    print "Number of bond blocks and atom property blocks with invalid atom numbers: $InvalidAtomNumbersCtabBlockCount\n";
  }
  if ($OptionsInfo{All} || $OptionsInfo{Salts}) {
    print "Number of atom blocks containing salts: $SaltsCtabBlockCount\n";
  }
  if ($OptionsInfo{All} || $OptionsInfo{Chiral}) {
    print "Number of chiral atom/bond blocks: $ChiralCtabBlockCount\n";
  }
  if ($OptionsInfo{ProcessCmpdData}) {
    PrintCmpdInfoSummary();
  }

}
256 | |
# Reset data field information collected in %SDCmpdsInfo before processing a
# SD file. The running total of compounds across files is kept: it is set to
# zero only when it hasn't been set up yet...
sub InitializeSDCmpdsInfo {
  my($MapName);

  $SDCmpdsInfo{TotalCmpdCount} = 0 unless exists $SDCmpdsInfo{TotalCmpdCount};

  # Empty the label list and all the label maps in place...
  @{$SDCmpdsInfo{FieldLabels}} = ();
  for $MapName (qw(FieldLabelsMap NonEmptyFieldValuesCountMap EmptyFieldValuesCountMap NonNumericalFieldValuesCountMap NumericalFieldValuesCountMap)) {
    %{$SDCmpdsInfo{$MapName}} = ();
  }
}
271 | |
# Process compound data header labels and figure out which ones are present for
# all the compounds; optionally count empty/non-empty and numerical/non-numerical
# data field values.
#
# Parameters:
#   $CmpdLinesRef - Reference to an array containing lines of a compound record.
#   $CmpdCount    - Record number of the compound in the current SD file,
#                   starting from 1.
#
# Results are accumulated in %SDCmpdsInfo: FieldLabels, FieldLabelsMap (label to
# "PresentInAll" or "PresentInSome") and the various count maps...
sub ProcessCmpdInfo {
  my($CmpdLinesRef, $CmpdCount) = @_;
  my($Label, $IsFirstCmpd, @CmpdFieldLabels);

  # Identify the first compound by its record number instead of relying on an
  # empty label list: earlier compounds without any data fields also leave the
  # list empty, and labels showing up later must not be marked as present in
  # all compounds. Fall back to the empty list check without a record number...
  $IsFirstCmpd = defined($CmpdCount) ? ($CmpdCount == 1) : !@{$SDCmpdsInfo{FieldLabels}};

  @CmpdFieldLabels = GetCmpdDataHeaderLabels($CmpdLinesRef);

  if ($IsFirstCmpd) {
    # Get the initial label set and set up a map...
    @{$SDCmpdsInfo{FieldLabels}} = @CmpdFieldLabels;
    for $Label (@{$SDCmpdsInfo{FieldLabels}}) {
      $SDCmpdsInfo{FieldLabelsMap}{$Label} = "PresentInAll";
    }
  }
  else {
    my(%CmpdFieldLabelsMap) = ();
    # Setup a map for the current labels...
    for $Label (@CmpdFieldLabels) {
      $CmpdFieldLabelsMap{$Label} = "PresentInSome";
    }
    # Known labels missing from this compound are present only in some
    # compounds...
    for $Label (@{$SDCmpdsInfo{FieldLabels}}) {
      if (!$CmpdFieldLabelsMap{$Label}) {
        $SDCmpdsInfo{FieldLabelsMap}{$Label} = "PresentInSome";
      }
    }
    # Labels of this compound not seen before are new; earlier compounds
    # didn't have them, so they are present only in some compounds...
    for $Label (@CmpdFieldLabels) {
      if (!$SDCmpdsInfo{FieldLabelsMap}{$Label}) {
        push @{$SDCmpdsInfo{FieldLabels}}, $Label;
        $SDCmpdsInfo{FieldLabelsMap}{$Label} = "PresentInSome";
      }
    }
  }

  if ($OptionsInfo{CountEmptyData} || $OptionsInfo{CheckData}) {
    # Count empty and numerical data field values...
    my(%DataFieldAndValues, $Value);

    %DataFieldAndValues = GetCmpdDataHeaderLabelsAndValues($CmpdLinesRef);
    for $Label (keys %DataFieldAndValues) {
      $Value = $DataFieldAndValues{$Label};
      if ($OptionsInfo{CountEmptyData}) {
        if (IsNotEmpty($Value)) {
          $SDCmpdsInfo{NonEmptyFieldValuesCountMap}{$Label}++;
        }
        else {
          # Use the processed option value like the rest of the script...
          if ($OptionsInfo{Detail} >= 2) {
            print "Compound record $CmpdCount: Empty data field <$Label>\n";
          }
          $SDCmpdsInfo{EmptyFieldValuesCountMap}{$Label}++;
        }
      }
      if ($OptionsInfo{CheckData}) {
        if (IsNumerical($Value)) {
          $SDCmpdsInfo{NumericalFieldValuesCountMap}{$Label}++;
        }
        else {
          $SDCmpdsInfo{NonNumericalFieldValuesCountMap}{$Label}++;
        }
      }
    }
  }
}
356 | |
# Print a summary of data field information collected in %SDCmpdsInfo for the
# current SD file: number of data fields, their labels split into the ones
# present in all and in some compounds, and optionally counts of empty/non-empty
# and numerical/non-numerical values for each data field...
sub PrintCmpdInfoSummary {
  my($FieldLabelsRef, $LabelsMapRef, $AllLabelsInAll, $Group, @SortedLabels, @LabelsInAll, @LabelsInSome, @LabelGroups);

  $FieldLabelsRef = $SDCmpdsInfo{FieldLabels};
  if (!@{$FieldLabelsRef}) {
    print "\nNumber of data fields: 0\n";
    return;
  }

  # Partition alphabetically sorted labels by their presence status...
  $LabelsMapRef = $SDCmpdsInfo{FieldLabelsMap};
  @SortedLabels = sort keys %{$LabelsMapRef};
  @LabelsInAll = grep { $LabelsMapRef->{$_} eq "PresentInAll" } @SortedLabels;
  @LabelsInSome = grep { $LabelsMapRef->{$_} eq "PresentInSome" } @SortedLabels;
  $AllLabelsInAll = (scalar(@LabelsInAll) == scalar(@{$FieldLabelsRef})) ? 1 : 0;

  print "\nNumber of data fields: ", scalar(@{$FieldLabelsRef}), "\n";
  print "All data field labels: ", (map { "<$_> " } @SortedLabels), "\n";
  if (!$AllLabelsInAll) {
    print "Data field labels present in all compounds: ", (map { "<$_> " } @LabelsInAll), "\n";
    print "Data field labels present in some compounds: ", (map { "<$_> " } @LabelsInSome), "\n";
  }

  # Label groups to report on: a single group in order of appearance in the
  # file when every label is present in all compounds; otherwise, sorted
  # groups for labels present in all and in some compounds...
  if ($AllLabelsInAll) {
    @LabelGroups = (["", $FieldLabelsRef]);
  }
  else {
    @LabelGroups = ([" present in all compounds", \@LabelsInAll], [" present in some compounds", \@LabelsInSome]);
  }

  # List empty data field values count...
  if ($OptionsInfo{CountEmptyData}) {
    print "\n";
    for $Group (@LabelGroups) {
      my($Suffix, $LabelsRef) = @{$Group};
      PrintDataInformation("Number of non-empty values for data field(s)$Suffix", $LabelsRef, \%{$SDCmpdsInfo{NonEmptyFieldValuesCountMap}});
      PrintDataInformation("Number of empty values for data field(s)$Suffix", $LabelsRef, \%{$SDCmpdsInfo{EmptyFieldValuesCountMap}});
    }
    print "\n";
  }

  # List numerical data values count...
  if ($OptionsInfo{CheckData}) {
    print "\n";
    for $Group (@LabelGroups) {
      my($Suffix, $LabelsRef) = @{$Group};
      PrintDataInformation("Number of non-numerical values for data field(s)$Suffix", $LabelsRef, \%{$SDCmpdsInfo{NonNumericalFieldValuesCountMap}});
      PrintDataInformation("Number of numerical values for data field(s)$Suffix", $LabelsRef, \%{$SDCmpdsInfo{NumericalFieldValuesCountMap}});
    }
    print "\n";
  }
}

# Print one line of per-label counts.
#
# Parameters:
#   $InfoLabel              - Text printed in front of the counts.
#   $DataLabelRef           - Reference to an array of data field labels.
#   $DataLabelToValueMapRef - Reference to a hash mapping labels to counts;
#                             labels missing from the hash are listed as 0.
#
# Output format: "InfoLabel:  <Label1> - Count1, <Label2> - Count2"...
sub PrintDataInformation {
  my($InfoLabel, $DataLabelRef, $DataLabelToValueMapRef) = @_;
  my(@Entries);

  @Entries = map {
    " <$_> - " . (exists($DataLabelToValueMapRef->{$_}) ? $DataLabelToValueMapRef->{$_} : 0)
  } @{$DataLabelRef};

  print "$InfoLabel: ", join(",", @Entries), "\n";
}
442 | |
# List total number of compounds and total size for all the SD files flagged
# as okay in %SDFilesInfo; nothing is printed unless there are at least two
# such files...
sub ListTotalSizeOfFiles {
  my($FileOkayCount, $TotalSize, $Index, @OkayFileIndices);

  @OkayFileIndices = grep { $SDFilesInfo{FileOkay}[$_] } (0 .. $#SDFilesList);
  $FileOkayCount = scalar @OkayFileIndices;

  $TotalSize = 0;
  for $Index (@OkayFileIndices) {
    $TotalSize += $SDFilesInfo{FileSize}[$Index];
  }

  if ($FileOkayCount > 1) {
    print "\nTotal number of compounds in $FileOkayCount SD files: $SDCmpdsInfo{TotalCmpdCount}\n";
    print "\nTotal size of $FileOkayCount SD files: ", FormatFileSize($TotalSize), "\n";
  }

}
462 | |
# Retrieve information about SD files in @SDFilesList. %SDCmpdsInfo is cleared
# and %SDFilesInfo is set up with three arrays indexed by file position:
# FileOkay (1 for files which exist, have a valid SD file type and can be
# opened; 0 otherwise), FileSize and FileLastModified. Files failing a check
# are skipped with a warning...
sub RetrieveSDFilesInfo {
  my($Index, $SDFile, $SDFileHandle, $ModifiedTimeString, $ModifiedDateString);

  %SDCmpdsInfo = ();

  %SDFilesInfo = ();
  @{$SDFilesInfo{FileOkay}} = ();
  @{$SDFilesInfo{FileSize}} = ();
  @{$SDFilesInfo{FileLastModified}} = ();

  FILELIST: for $Index (0 .. $#SDFilesList) {
    $SDFilesInfo{FileOkay}[$Index] = 0;
    $SDFilesInfo{FileSize}[$Index] = 0;
    $SDFilesInfo{FileLastModified}[$Index] = '';

    $SDFile = $SDFilesList[$Index];
    if (!(-e $SDFile)) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($SDFile, "sdf sd")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }
    # Make sure the file can be read. Three-argument open with a lexical
    # handle: the file name is never interpreted as a mode or a pipe...
    if (!open($SDFileHandle, "<", $SDFile)) {
      warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
      next FILELIST;
    }
    close $SDFileHandle;

    $SDFilesInfo{FileOkay}[$Index] = 1;
    $SDFilesInfo{FileSize}[$Index] = FileSize($SDFile);
    ($ModifiedTimeString, $ModifiedDateString) = FormattedFileModificationTimeAndDate($SDFile);
    $SDFilesInfo{FileLastModified}[$Index] = "$ModifiedTimeString; $ModifiedDateString";
  }
}
500 | |
# Process option values: copy command line options from %Options into
# %OptionsInfo, using 0 for unspecified flags, and derive the flags which
# control processing of compound records...
sub ProcessOptions {
  my($FlagRef, @FlagNames);

  %OptionsInfo = ();

  # Pairs of %OptionsInfo key and corresponding %Options key...
  @FlagNames = (
    ["All", "all"],
    ["Chiral", "chiral"],
    ["Count", "count"],
    ["DataCheck", "datacheck"],
    ["Empty", "empty"],
    ["Fields", "fields"],
    ["InvalidAtomNumbers", "invalidatomnumbers"],
    ["Mismatch", "mismatch"],
    ["Salts", "salts"],
    ["UnknownAtoms", "unknownatoms"],
  );
  for $FlagRef (@FlagNames) {
    my($InfoName, $OptionName) = @{$FlagRef};
    $OptionsInfo{$InfoName} = $Options{$OptionName} || 0;
  }

  $OptionsInfo{Detail} = $Options{detail};

  # Compound records need to be read whenever anything beyond a plain
  # count is requested...
  $OptionsInfo{ProcessCmpdInfo} = (grep { $Options{$_} } qw(all chiral empty fields invalidatomnumbers mismatch salts unknownatoms datacheck)) ? 1 : 0;

  # Data fields need to be processed for these options...
  $OptionsInfo{ProcessCmpdData} = (grep { $Options{$_} } qw(all fields empty datacheck)) ? 1 : 0;

  $OptionsInfo{CountEmptyData} = (grep { $Options{$_} } qw(all empty)) ? 1 : 0;
  $OptionsInfo{CheckData} = (grep { $Options{$_} } qw(all datacheck)) ? 1 : 0;
}
525 | |
# Setup script usage: initialize %Options with default values, retrieve
# command line options, change to the working directory when one is specified,
# and validate the detail level. Dies with an error message for invalid
# options or values...
sub SetupScriptUsage {
  my(@OptionSpecs);

  # Setup default and retrieve all the options...
  %Options = (detail => 1);

  @OptionSpecs = ("all|a", "count|c", "chiral", "datacheck", "detail|d:i", "empty|e", "fields|f", "help|h", "invalidatomnumbers|i", "mismatch|m", "salts|s", "unknownatoms|u", "workingdir|w=s");
  GetOptions(\%Options, @OptionSpecs)
    or die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";

  if ($Options{workingdir}) {
    -d $Options{workingdir}
      or die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  # Detail level must be 1, 2 or 3...
  if (!($Options{detail} > 0 && $Options{detail} <= 3)) {
    die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Possible values: 1 to 3\n";
  }
}
545 | |
546 __END__ | |
547 | |
548 =head1 NAME | |
549 | |
550 InfoSDFiles.pl - List information about SDFile(s) | |
551 | |
552 =head1 SYNOPSIS | |
553 | |
554 InfoSDFiles.pl SDFile(s)... | |
555 | |
556 InfoSDFiles.pl [B<-a --all>] [B<-c --count>] [B<--chiral>] [B<--datacheck>] | |
557 [B<-d --detail> infolevel] [B<-e --empty>] [B<-f, --fields>] [B<-h, --help>] | |
558 [B<-i, --invalidatomnumbers>] [B<-m, --mismatch>] [B<-s, --salts>] [B<-u, --unknownatoms>] | |
559 [B<-w, --workingdir> dirname] SDFile(s)... | |
560 | |
561 =head1 DESCRIPTION | |
562 | |
563 List information about I<SDFile(s)> contents: number of compounds, empty records | |
564 and so on. Multiple SDFile names are separated by spaces. The valid file extensions | |
565 are I<.sdf> and I<.sd>. All other file names are ignored. All the SD files in a current | |
566 directory can be specified either by I<*.sdf> or the current directory name. | |
567 | |
568 =head1 OPTIONS | |
569 | |
570 =over 4 | |
571 | |
572 =item B<-a, --all> | |
573 | |
574 List all the available information. | |
575 | |
576 =item B<-c, --count> | |
577 | |
578 List number of compounds. This is B<default behavior>. | |
579 | |
580 =item B<--chiral> | |
581 | |
582 List number of atom/bond blocks for compounds with chiral flag set in | |
583 the counts line. | |
584 | |
585 =item B<-d, --detail> I<infolevel> | |
586 | |
587 Level of information to print. Default: 1. Possible values: I<1, 2, or 3>. | |
588 | |
589 =item B<--datacheck> | |
590 | |
591 List number of numerical and non-numerical values for each data field. | |
592 | |
593 =item B<-e, --empty> | |
594 | |
595 List number of empty atom/bond blocks and data fields for compounds. | |
596 | |
597 =item B<-f, --fields> | |
598 | |
599 List data field labels present for compounds. | |
600 | |
601 =item B<-h, --help> | |
602 | |
603 Print this help message. | |
604 | |
605 =item B<-i, --invalidatomnumbers> | |
606 | |
607 List number of bond blocks for compounds which contain invalid atom numbers. | |
608 | |
609 =item B<-m, --mismatch> | |
610 | |
611 List number of atom/bond blocks for compounds which don't match with counts | |
612 line information in header block. | |
613 | |
614 =item B<-s, --salts> | |
615 | |
616 List number of atom blocks for compounds which contain salts identified as | |
617 disconnected structural units. | |
618 | |
619 =item B<-u, --unknownatoms> | |
620 | |
621 List number of atom blocks for compounds which contain special atom symbols | |
622 such as L, Q, *, LP, X, R#, or any other non periodic table symbols. | |
623 | |
624 =item B<-w, --workingdir> I<dirname> | |
625 | |
626 Location of working directory. Default: current directory. | |
627 | |
628 =back | |
629 | |
630 =head1 EXAMPLES | |
631 | |
632 To count compounds in SD file(s), type: | |
633 | |
634 % InfoSDFiles.pl Sample1.sdf | |
635 % InfoSDFiles.pl Sample1.sdf Sample2.sdf | |
636 % InfoSDFiles.pl *.sdf | |
637 | |
638 To list all available information for SD file(s), type: | |
639 | |
640 % InfoSDFiles.pl -a *.sdf | |
641 | |
642 To list all data fields present in sample.sdf, type: | |
643 | |
644 % InfoSDFiles.pl -f Sample.sdf | |
645 | |
646 To count number of compounds which contain salts and list associated structural | |
647 data, type: | |
648 | |
649 % InfoSDFiles.pl -s -d 3 Sample.sdf | |
650 | |
651 =head1 AUTHOR | |
652 | |
653 Manish Sud <msud@san.rr.com> | |
654 | |
655 =head1 SEE ALSO | |
656 | |
657 ExtractFromSDFiles.pl, FilterSDFiles.pl, MergeTextFilesWithSD.pl | |
658 | |
659 =head1 COPYRIGHT | |
660 | |
661 Copyright (C) 2015 Manish Sud. All rights reserved. | |
662 | |
663 This file is part of MayaChemTools. | |
664 | |
665 MayaChemTools is free software; you can redistribute it and/or modify it under | |
666 the terms of the GNU Lesser General Public License as published by the Free | |
667 Software Foundation; either version 3 of the License, or (at your option) | |
668 any later version. | |
669 | |
670 =cut |