comparison bin/InfoSDFiles.pl @ 0:4816e4a8ae95 draft default tip

Uploaded
author deepakjadmin
date Wed, 20 Jan 2016 09:23:18 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4816e4a8ae95
1 #!/usr/bin/perl -w
2 #
3 # $RCSfile: InfoSDFiles.pl,v $
4 # $Date: 2015/02/28 20:46:20 $
5 # $Revision: 1.35 $
6 #
7 # Author: Manish Sud <msud@san.rr.com>
8 #
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
10 #
11 # This file is part of MayaChemTools.
12 #
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
14 # the terms of the GNU Lesser General Public License as published by the Free
15 # Software Foundation; either version 3 of the License, or (at your option) any
16 # later version.
17 #
18 # MayaChemTools is distributed in the hope that it will be useful, but without
19 # any warranty; without even the implied warranty of merchantability of fitness
20 # for a particular purpose. See the GNU Lesser General Public License for more
21 # details.
22 #
23 # You should have received a copy of the GNU Lesser General Public License
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
26 # Boston, MA, 02111-1307, USA.
27 #
28
29 use strict;
30 use FindBin; use lib "$FindBin::Bin/../lib";
31 use Getopt::Long;
32 use File::Basename;
33 use Benchmark;
34 use SDFileUtil;
35 use TextUtil;
36 use FileUtil;
37
# Script-wide state. The hashes and the file list declared in this section are
# file-scoped lexicals shared with the subroutines defined further down...
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename $0;
print "\n$ScriptName:Starting...\n\n";
# Direct method call instead of the ambiguous indirect object form "new Benchmark"...
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@SDFilesList);
@SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Setup information about input files...
print "Checking input SD file(s)...\n";
my(%SDFilesInfo, %SDCmpdsInfo);
RetrieveSDFilesInfo();
InitializeSDCmpdsInfo();

# Process input files; those flagged as not okay by RetrieveSDFilesInfo are skipped...
if (@SDFilesList > 1) {
  print "\nProcessing SD files...\n";
}
for my $FileIndex (0 .. $#SDFilesList) {
  if ($SDFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $SDFilesList[$FileIndex]...\n";
    ListSDFileInfo($FileIndex);
  }
}
ListTotalSizeOfFiles();

print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
86
87 ###############################################################################
88
# List appropriate information for the SD file at position $Index in @SDFilesList:
# detailed compound information when $OptionsInfo{ProcessCmpdInfo} is set, otherwise
# just the compound count; followed by the file size and last modification time
# stored in %SDFilesInfo...
sub ListSDFileInfo {
  my($Index) = @_;

  if ($OptionsInfo{ProcessCmpdInfo}) {
    ListCompoundDetailsInfo($Index);
  }
  else {
    ListCompoundCountInfo($Index);
  }

  # File size and modification information...
  print "\nFile size: ", FormatFileSize($SDFilesInfo{FileSize}[$Index]), " \n";
  print "Last modified: ", $SDFilesInfo{FileLastModified}[$Index], " \n";
}
107
# List number of compounds in the SD file at position $Index in @SDFilesList...
#
# A compound is counted for every line starting with the "$$$$" record delimiter.
# The count is printed and added to $SDCmpdsInfo{TotalCmpdCount}. Dies when the
# file can't be opened.
sub ListCompoundCountInfo {
  my($Index) = @_;
  my($SDFile, $SDFileHandle, $CmpdCount, $Line);

  $SDFile = $SDFilesList[$Index];

  $CmpdCount = 0;

  # Three-argument open with a lexical handle: the file name is never interpreted
  # as a mode or pipe specification and no global bareword handle is involved...
  open $SDFileHandle, '<', $SDFile or die "Couldn't open $SDFile: $! \n";
  while ($Line = <$SDFileHandle>) {
    if ($Line =~ /^\$\$\$\$/) {
      $CmpdCount++;
    }
  }
  close $SDFileHandle;

  $SDCmpdsInfo{TotalCmpdCount} += $CmpdCount;

  print "\nNumber of compounds: $CmpdCount\n";
}
129
# List detailed compound information for the SD file at position $Index in
# @SDFilesList...
#
# Each compound record is read with ReadCmpdString and, depending on the selected
# options, checked for: an empty Ctab block, a mismatch between the Ctab lines count
# and the atom + bond counts from the counts line, the chiral flag, unknown atoms,
# invalid atom numbers and salts (more than one fragment). Data fields are
# accumulated through ProcessCmpdInfo when $OptionsInfo{ProcessCmpdData} is set.
# The per-file counts are printed at the end and the compound count is added to
# $SDCmpdsInfo{TotalCmpdCount}. Dies when the file can't be opened.
sub ListCompoundDetailsInfo {
  my($Index) = @_;
  my($SDFile, $SDFileHandle, $CmpdCount, $EmptyCtabBlocksCount, $MismatchCtabBlockCount, $ChiralCtabBlockCount, $UnknownAtomsCtabBlockCount, $InvalidAtomNumbersCtabBlockCount, $SaltsCtabBlockCount, $CtabLinesCount, $PrintCmpdCounterHeader, $ProblematicCmpdData, $CmpdString, @CmpdLines);

  $SDFile = $SDFilesList[$Index];

  ($CmpdCount, $EmptyCtabBlocksCount, $MismatchCtabBlockCount, $ChiralCtabBlockCount, $UnknownAtomsCtabBlockCount, $InvalidAtomNumbersCtabBlockCount, $SaltsCtabBlockCount) = (0) x 7;

  # Reset the data field bookkeeping for this file; the running total compound
  # count is left untouched by InitializeSDCmpdsInfo...
  InitializeSDCmpdsInfo();

  $PrintCmpdCounterHeader = 1;

  # Three-argument open with a lexical handle. The handle is a GLOB reference,
  # just like the \*SDFILE reference previously handed to ReadCmpdString...
  open $SDFileHandle, '<', $SDFile or die "Couldn't open $SDFile: $! \n";
  while ($CmpdString = ReadCmpdString($SDFileHandle)) {
    $CmpdCount++;
    $ProblematicCmpdData = 0;

    # Progress indicator for every 5000 compounds at low detail levels...
    if ($OptionsInfo{Detail} <= 1) {
      if (($CmpdCount % 5000) == 0) {
        if ($PrintCmpdCounterHeader) {
          $PrintCmpdCounterHeader = 0;
          print "Processing compounds:";
        }
        print "$CmpdCount...";
      }
    }

    @CmpdLines = split "\n", $CmpdString;
    $CtabLinesCount = GetCtabLinesCount(\@CmpdLines);

    if ($OptionsInfo{All} || $OptionsInfo{Empty}) {
      if ($CtabLinesCount <= 0) {
        $EmptyCtabBlocksCount++;
        $ProblematicCmpdData = 1;
      }
    }

    if ($CtabLinesCount > 0) {
      # The counts line is the fourth line of the compound record...
      my ($AtomCount, $BondCount, $ChiralFlag) = ParseCmpdCountsLine($CmpdLines[3]);

      if ($OptionsInfo{All} || $OptionsInfo{Mismatch}) {
        if ($CtabLinesCount != ($AtomCount + $BondCount)) {
          $MismatchCtabBlockCount++;
          $ProblematicCmpdData = 1;
          if ($OptionsInfo{Detail} >= 2) {
            print "\nMismatch found: Ctab lines count: $CtabLinesCount; Atoms count: $AtomCount; Bond count: $BondCount\n";
          }
        }
      }

      # A set chiral flag is counted but not treated as problematic data...
      if ($OptionsInfo{All} || $OptionsInfo{Chiral}) {
        if ($ChiralFlag == 1) {
          $ChiralCtabBlockCount++;
        }
      }

      # The remaining checks are performed only for consistent Ctab blocks...
      if ($CtabLinesCount == ($AtomCount + $BondCount)) {
        if ($OptionsInfo{All} || $OptionsInfo{UnknownAtoms}) {
          my($UnknownAtomCount, $UnknownAtoms, $UnknownAtomLines) = GetUnknownAtoms(\@CmpdLines);
          if ($UnknownAtomCount) {
            $UnknownAtomsCtabBlockCount++;
            $ProblematicCmpdData = 1;
            if ($OptionsInfo{Detail} >= 2) {
              print "\nUnknown atom(s) found: $UnknownAtomCount\nUnknown atom(s) symbols:$UnknownAtoms\nUnknown atom(s) data lines:\n$UnknownAtomLines\n";
            }
          }
        }
        if ($OptionsInfo{All} || $OptionsInfo{InvalidAtomNumbers}) {
          my($InvalidAtomNumbersCount, $InvalidAtomNumbers, $InvalidAtomNumberLines) = GetInvalidAtomNumbers(\@CmpdLines);
          if ($InvalidAtomNumbersCount) {
            $InvalidAtomNumbersCtabBlockCount++;
            $ProblematicCmpdData = 1;
            if ($OptionsInfo{Detail} >= 2) {
              print "\nInvalid atom number(s) found: $InvalidAtomNumbersCount\nInvalid atom number(s):$InvalidAtomNumbers\nInvalid atom number(s) data lines:\n$InvalidAtomNumberLines\n";
            }
          }
        }
        if ($OptionsInfo{All} || $OptionsInfo{Salts}) {
          my($FragmentsCount, $Fragments) = GetCmpdFragments(\@CmpdLines);
          if ($FragmentsCount > 1) {
            $SaltsCtabBlockCount++;
            $ProblematicCmpdData = 1;
            if ($OptionsInfo{Detail} >= 2) {
              print "\nSalts found: $FragmentsCount\nSalts atom numbers:\n$Fragments\n";
            }
          }
        }
      }
    }

    if ($OptionsInfo{ProcessCmpdData}) {
      ProcessCmpdInfo(\@CmpdLines, $CmpdCount);
    }

    # At the highest detail level, dump the whole record for problematic compounds...
    if ($OptionsInfo{Detail} >= 3) {
      if ($ProblematicCmpdData) {
        print "\nCompound data:\n$CmpdString\n\n";
      }
    }
  }

  # Terminate the progress indicator line, if one was started...
  if ($OptionsInfo{Detail} <= 1) {
    if (!$PrintCmpdCounterHeader) {
      print "\n";
    }
  }
  close $SDFileHandle;

  $SDCmpdsInfo{TotalCmpdCount} += $CmpdCount;

  print "\nNumber of compounds: $CmpdCount\n";

  if ($OptionsInfo{All} || $OptionsInfo{Empty}) {
    print "Number of empty atom/bond blocks: $EmptyCtabBlocksCount\n";
  }
  if ($OptionsInfo{All} || $OptionsInfo{Mismatch}) {
    print "Number of mismatched atom/bond blocks: $MismatchCtabBlockCount\n";
  }
  if ($OptionsInfo{All} || $OptionsInfo{UnknownAtoms}) {
    print "Number of atom blocks with unknown atom labels: $UnknownAtomsCtabBlockCount\n";
  }
  if ($OptionsInfo{All} || $OptionsInfo{InvalidAtomNumbers}) {
    print "Number of bond blocks and atom property blocks with invalid atom numbers: $InvalidAtomNumbersCtabBlockCount\n";
  }
  if ($OptionsInfo{All} || $OptionsInfo{Salts}) {
    print "Number of atom blocks containing salts: $SaltsCtabBlockCount\n";
  }
  if ($OptionsInfo{All} || $OptionsInfo{Chiral}) {
    print "Number of chiral atom/bond blocks: $ChiralCtabBlockCount\n";
  }
  if ($OptionsInfo{ProcessCmpdData}) {
    PrintCmpdInfoSummary();
  }

}
256
# Reset the per-file compound data bookkeeping kept in %SDCmpdsInfo...
#
# The running total compound count is created on first use only, so it keeps
# accumulating when this is called again; the field label list and all the
# field value count maps are emptied on every call.
sub InitializeSDCmpdsInfo {
  my(@CountMapNames, $MapName);

  $SDCmpdsInfo{TotalCmpdCount} = 0 unless exists $SDCmpdsInfo{TotalCmpdCount};

  @{$SDCmpdsInfo{FieldLabels}} = ();

  @CountMapNames = qw(
    FieldLabelsMap
    NonEmptyFieldValuesCountMap
    EmptyFieldValuesCountMap
    NonNumericalFieldValuesCountMap
    NumericalFieldValuesCountMap
  );
  for $MapName (@CountMapNames) {
    %{$SDCmpdsInfo{$MapName}} = ();
  }
}
271
# Process compound data header labels and figure out which ones are present for
# all the compounds...
#
# Parameters:
#   $CmpdLinesRef - reference to the lines of one compound record.
#   $CmpdCount    - number of the compound record within the current file, as
#                   passed by ListCompoundDetailsInfo; also used in messages.
#
# Updates $SDCmpdsInfo{FieldLabels} and $SDCmpdsInfo{FieldLabelsMap} and, depending
# on $OptionsInfo{CountEmptyData} and $OptionsInfo{CheckData}, the empty/non-empty
# and numerical/non-numerical field value count maps.
sub ProcessCmpdInfo {
  my($CmpdLinesRef, $CmpdCount) = @_;
  my(@CmpdFieldLabels, $IsInitialLabelSet);

  @CmpdFieldLabels = GetCmpdDataHeaderLabels($CmpdLinesRef);

  # Only the first compound record may seed the label set as "PresentInAll". An
  # empty label list alone is not enough to tell: leading compounds without any
  # data fields leave it empty, and labels showing up later are then present in
  # some compounds only. Without a compound count, fall back to the empty list test...
  $IsInitialLabelSet = ((!defined($CmpdCount) || $CmpdCount <= 1) && !@{$SDCmpdsInfo{FieldLabels}}) ? 1 : 0;

  if ($IsInitialLabelSet) {
    # Get the initial label set and set up a map...
    @{$SDCmpdsInfo{FieldLabels}} = @CmpdFieldLabels;
    for my $Label (@CmpdFieldLabels) {
      $SDCmpdsInfo{FieldLabelsMap}{$Label} = "PresentInAll";
    }
  }
  else {
    # Setup a map for the current labels...
    my(%CmpdFieldLabelsMap) = map { $_ => 1 } @CmpdFieldLabels;

    # Labels seen earlier but missing from this compound are present in some only...
    for my $Label (@{$SDCmpdsInfo{FieldLabels}}) {
      if (!$CmpdFieldLabelsMap{$Label}) {
        $SDCmpdsInfo{FieldLabelsMap}{$Label} = "PresentInSome";
      }
    }
    # Labels of this compound not seen earlier are new and present in some only...
    for my $Label (@CmpdFieldLabels) {
      if (!$SDCmpdsInfo{FieldLabelsMap}{$Label}) {
        push @{$SDCmpdsInfo{FieldLabels}}, $Label;
        $SDCmpdsInfo{FieldLabelsMap}{$Label} = "PresentInSome";
      }
    }
  }

  if ($OptionsInfo{CountEmptyData} || $OptionsInfo{CheckData}) {
    # Count empty and numerical data field values...
    my(%DataFieldAndValues) = GetCmpdDataHeaderLabelsAndValues($CmpdLinesRef);

    for my $Label (keys %DataFieldAndValues) {
      my $Value = $DataFieldAndValues{$Label};

      if ($OptionsInfo{CountEmptyData}) {
        if (IsNotEmpty($Value)) {
          $SDCmpdsInfo{NonEmptyFieldValuesCountMap}{$Label}++;
        }
        else {
          # Same detail level as used everywhere else: the processed option value...
          if ($OptionsInfo{Detail} >= 2) {
            print "Compound record $CmpdCount: Empty data field <$Label>\n";
          }
          $SDCmpdsInfo{EmptyFieldValuesCountMap}{$Label}++;
        }
      }
      if ($OptionsInfo{CheckData}) {
        if (IsNumerical($Value)) {
          $SDCmpdsInfo{NumericalFieldValuesCountMap}{$Label}++;
        }
        else {
          $SDCmpdsInfo{NonNumericalFieldValuesCountMap}{$Label}++;
        }
      }
    }
  }
}
356
# Print data field summary for the compounds processed so far: the number of data
# fields, their labels, which labels are present in all or only some compounds and,
# depending on $OptionsInfo{CountEmptyData} and $OptionsInfo{CheckData}, the
# empty/non-empty and numerical/non-numerical value counts...
sub PrintCmpdInfoSummary {
  if (!@{$SDCmpdsInfo{FieldLabels}}) {
    print "\nNumber of data fields: 0\n";
  }
  else {
    my(@SortedLabels, @FieldLabelsPresentInAll, @FieldLabelsPresentInSome, $AllLabelsInAllCmpds, $FieldLabel);

    @SortedLabels = sort keys %{$SDCmpdsInfo{FieldLabelsMap}};
    @FieldLabelsPresentInAll = grep { $SDCmpdsInfo{FieldLabelsMap}{$_} eq "PresentInAll" } @SortedLabels;

    # The split into "all" and "some" is reported only when the two sets differ...
    $AllLabelsInAllCmpds = (@FieldLabelsPresentInAll == @{$SDCmpdsInfo{FieldLabels}}) ? 1 : 0;
    @FieldLabelsPresentInSome = ();
    if (!$AllLabelsInAllCmpds) {
      @FieldLabelsPresentInSome = grep { $SDCmpdsInfo{FieldLabelsMap}{$_} eq "PresentInSome" } @SortedLabels;
    }

    print "\nNumber of data fields: ", scalar(@{$SDCmpdsInfo{FieldLabels}}), "\n";
    print "All data field labels: ";
    for $FieldLabel (@SortedLabels) {
      print "<$FieldLabel> ";
    }
    print "\n";

    if (!$AllLabelsInAllCmpds) {
      print "Data field labels present in all compounds: ";
      for $FieldLabel (@FieldLabelsPresentInAll) {
        print "<$FieldLabel> ";
      }
      print "\n";
      print "Data field labels present in some compounds: ";
      for $FieldLabel (@FieldLabelsPresentInSome) {
        print "<$FieldLabel> ";
      }
      print "\n";
    }

    # List empty data field values count...
    if ($OptionsInfo{CountEmptyData}) {
      my $NonEmptyCountsRef = \%{$SDCmpdsInfo{NonEmptyFieldValuesCountMap}};
      my $EmptyCountsRef = \%{$SDCmpdsInfo{EmptyFieldValuesCountMap}};

      print "\n";
      if ($AllLabelsInAllCmpds) {
        PrintDataInformation("Number of non-empty values for data field(s)", \@{$SDCmpdsInfo{FieldLabels}}, $NonEmptyCountsRef);
        PrintDataInformation("Number of empty values for data field(s)", \@{$SDCmpdsInfo{FieldLabels}}, $EmptyCountsRef);
      }
      else {
        PrintDataInformation("Number of non-empty values for data field(s) present in all compounds", \@FieldLabelsPresentInAll, $NonEmptyCountsRef);
        PrintDataInformation("Number of empty values for data field(s) present in all compounds", \@FieldLabelsPresentInAll, $EmptyCountsRef);
        PrintDataInformation("Number of non-empty values for data field(s) present in some compounds", \@FieldLabelsPresentInSome, $NonEmptyCountsRef);
        PrintDataInformation("Number of empty values for data field(s) present in some compounds", \@FieldLabelsPresentInSome, $EmptyCountsRef);
      }
      print "\n";
    }

    # List numerical data values count...
    if ($OptionsInfo{CheckData}) {
      my $NonNumericalCountsRef = \%{$SDCmpdsInfo{NonNumericalFieldValuesCountMap}};
      my $NumericalCountsRef = \%{$SDCmpdsInfo{NumericalFieldValuesCountMap}};

      print "\n";
      if ($AllLabelsInAllCmpds) {
        PrintDataInformation("Number of non-numerical values for data field(s)", \@{$SDCmpdsInfo{FieldLabels}}, $NonNumericalCountsRef);
        PrintDataInformation("Number of numerical values for data field(s)", \@{$SDCmpdsInfo{FieldLabels}}, $NumericalCountsRef);
      }
      else {
        PrintDataInformation("Number of non-numerical values for data field(s) present in all compounds", \@FieldLabelsPresentInAll, $NonNumericalCountsRef);
        PrintDataInformation("Number of numerical values for data field(s) present in all compounds", \@FieldLabelsPresentInAll, $NumericalCountsRef);
        PrintDataInformation("Number of non-numerical values for data field(s) present in some compounds", \@FieldLabelsPresentInSome, $NonNumericalCountsRef);
        PrintDataInformation("Number of numerical values for data field(s) present in some compounds", \@FieldLabelsPresentInSome, $NumericalCountsRef);
      }
      print "\n";
    }
  }
}
# List data information on a single line...
#
# Parameters:
#   $InfoLabel              - text printed in front of the values.
#   $DataLabelRef           - reference to an array of data field labels to list.
#   $DataLabelToValueMapRef - reference to a hash mapping labels to values; labels
#                             without an entry are listed with a value of 0.
#
# Prints "<InfoLabel>: " followed by comma delimited " <Label> - Value" entries.
sub PrintDataInformation {
  my($InfoLabel, $DataLabelRef, $DataLabelToValueMapRef) = @_;
  my(@Entries, $Line);

  @Entries = map {
    " <$_> - " . (exists($DataLabelToValueMapRef->{$_}) ? $DataLabelToValueMapRef->{$_} : 0)
  } @{$DataLabelRef};

  # Joining the entries leaves no trailing delimiter to strip afterwards...
  $Line = join(",", @Entries);
  print "$InfoLabel: $Line\n";
}
442
# Total number of compounds and total size of all the files...
#
# Only files flagged as okay in %SDFilesInfo are taken into account, and the totals
# are printed only when there is more than one such file.
sub ListTotalSizeOfFiles {
  my(@OkayFileIndices, $FileOkayCount, $CombinedSize);

  @OkayFileIndices = grep { $SDFilesInfo{FileOkay}[$_] } (0 .. $#SDFilesList);
  $FileOkayCount = scalar(@OkayFileIndices);

  $CombinedSize = 0;
  $CombinedSize += $SDFilesInfo{FileSize}[$_] for @OkayFileIndices;

  if ($FileOkayCount > 1) {
    print "\nTotal number of compounds in $FileOkayCount SD files: $SDCmpdsInfo{TotalCmpdCount}\n";
    print "\nTotal size of $FileOkayCount SD files: ", FormatFileSize($CombinedSize), "\n";
  }

}
462
# Retrieve information about SD files...
#
# Resets %SDCmpdsInfo and %SDFilesInfo and, for each file in @SDFilesList, records
# whether it is usable (FileOkay) along with its size and last modification time.
# Files which don't exist, aren't SD files or can't be opened are reported with a
# warning and left flagged as not okay.
sub RetrieveSDFilesInfo {
  my($Index, $SDFile, $ModifiedTimeString, $ModifiedDateString);

  %SDCmpdsInfo = ();

  %SDFilesInfo = ();
  @{$SDFilesInfo{FileOkay}} = ();
  @{$SDFilesInfo{FileSize}} = ();
  @{$SDFilesInfo{FileLastModified}} = ();

  FILELIST: for $Index (0 .. $#SDFilesList) {
    $SDFilesInfo{FileOkay}[$Index] = 0;
    $SDFilesInfo{FileSize}[$Index] = 0;
    $SDFilesInfo{FileLastModified}[$Index] = '';

    $SDFile = $SDFilesList[$Index];
    if (!(-e $SDFile)) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($SDFile, "sdf sd")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    # Readability probe only. Three-argument open with a lexical handle keeps the
    # file name from being interpreted as a mode or pipe specification...
    my $SDFileHandle;
    if (!open($SDFileHandle, '<', $SDFile)) {
      warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
      next FILELIST;
    }
    close $SDFileHandle;

    $SDFilesInfo{FileOkay}[$Index] = 1;
    $SDFilesInfo{FileSize}[$Index] = FileSize($SDFile);
    ($ModifiedTimeString, $ModifiedDateString) = FormattedFileModificationTimeAndDate($SDFile);
    $SDFilesInfo{FileLastModified}[$Index] = "$ModifiedTimeString; $ModifiedDateString";
  }
}
500
# Process option values...
#
# Rebuilds %OptionsInfo from %Options: a 0 default for every switch which wasn't
# specified, the detail level as is, and derived flags indicating whether compound
# records and their data fields need to be processed at all.
sub ProcessOptions {
  my(@SwitchPairs, $Pair, $IsAnyOptionSet);

  %OptionsInfo = ();

  # Switches: keep the command line value when set; otherwise, use 0...
  @SwitchPairs = (
    [All => "all"],
    [Chiral => "chiral"],
    [Count => "count"],
    [DataCheck => "datacheck"],
    [Empty => "empty"],
    [Fields => "fields"],
    [InvalidAtomNumbers => "invalidatomnumbers"],
    [Mismatch => "mismatch"],
    [Salts => "salts"],
    [UnknownAtoms => "unknownatoms"],
  );
  for $Pair (@SwitchPairs) {
    my($InfoKey, $OptionName) = @{$Pair};
    $OptionsInfo{$InfoKey} = $Options{$OptionName} || 0;
  }

  $OptionsInfo{Detail} = $Options{detail};

  # 1 when at least one of the named command line options is set; otherwise, 0...
  $IsAnyOptionSet = sub {
    my(@OptionNames) = @_;
    return (grep { $Options{$_} } @OptionNames) ? 1 : 0;
  };

  $OptionsInfo{ProcessCmpdInfo} = $IsAnyOptionSet->(qw(all chiral empty fields invalidatomnumbers mismatch salts unknownatoms datacheck));

  $OptionsInfo{ProcessCmpdData} = $IsAnyOptionSet->(qw(all fields empty datacheck));

  $OptionsInfo{CountEmptyData} = $IsAnyOptionSet->(qw(all empty));
  $OptionsInfo{CheckData} = $IsAnyOptionSet->(qw(all datacheck));
}
525
# Setup script usage and retrieve command line arguments specified using various options...
#
# Fills %Options with the detail level default and whatever GetOptions retrieves,
# changes to the working directory when one is specified and validates the detail
# level. Dies on invalid options, an invalid working directory or a detail level
# outside 1 to 3.
sub SetupScriptUsage {
  my(@OptionSpecs, $DetailIsValid);

  # Setup default and retrieve all the options...
  %Options = (detail => 1);

  @OptionSpecs = (
    "all|a", "count|c", "chiral", "datacheck", "detail|d:i", "empty|e",
    "fields|f", "help|h", "invalidatomnumbers|i", "mismatch|m", "salts|s",
    "unknownatoms|u", "workingdir|w=s",
  );
  GetOptions(\%Options, @OptionSpecs)
    or die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";

  if ($Options{workingdir}) {
    (-d $Options{workingdir})
      or die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  $DetailIsValid = ($Options{detail} > 0 && $Options{detail} <= 3);
  if (!$DetailIsValid) {
    die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Possible values: 1 to 3\n";
  }
}
545
546 __END__
547
548 =head1 NAME
549
550 InfoSDFiles.pl - List information about SDFile(s)
551
552 =head1 SYNOPSIS
553
554 InfoSDFiles.pl SDFile(s)...
555
556 InfoSDFiles.pl [B<-a --all>] [B<-c --count>] [B<--chiral>] [B<--datacheck>]
557 [B<-d --detail> infolevel] [B<-e --empty>] [B<-f, --fields>] [B<-h, --help>]
558 [B<-i, --invalidatomnumbers>] [B<-m, --mismatch>] [B<-s, --salts>] [B<-u, --unknownatoms>]
559 [B<-w, --workingdir> dirname] SDFile(s)...
560
561 =head1 DESCRIPTION
562
563 List information about I<SDFile(s)> contents: number of compounds, empty records
564 and so on. Multiple SDFile names are separated by spaces. The valid file extensions
565 are I<.sdf> and I<.sd>. All other file names are ignored. All the SD files in the current
566 directory can be specified either by I<*.sdf> or the current directory name.
567
568 =head1 OPTIONS
569
570 =over 4
571
572 =item B<-a, --all>
573
574 List all the available information.
575
576 =item B<-c, --count>
577
578 List number of compounds. This is B<default behavior>.
579
580 =item B<--chiral>
581
582 List number of atom/bond blocks for compounds with chiral flag set in
583 counts line.
584
585 =item B<-d, --detail> I<infolevel>
586
587 Level of information to print. Default: 1. Possible values: I<1, 2, or 3>.
588
589 =item B<--datacheck>
590
591 List number of numerical and non-numerical values for each data field.
592
593 =item B<-e, --empty>
594
595 List number of empty atom/bond blocks and data fields for compounds.
596
597 =item B<-f, --fields>
598
599 List data field labels present for compounds.
600
601 =item B<-h, --help>
602
603 Print this help message.
604
605 =item B<-i, --invalidatomnumbers>
606
607 List number of bond blocks for compounds which contain invalid atom numbers.
608
609 =item B<-m, --mismatch>
610
611 List number of atom/bond blocks for compounds which don't match with counts
612 line information in header block.
613
614 =item B<-s, --salts>
615
616 List number of atom blocks for compounds which contain salts identified as
617 disconnected structural units.
618
619 =item B<-u, --unknownatoms>
620
621 List number of atom blocks for compounds which contain special atom symbols
622 such as L, Q, *, LP, X, R#, or any other non-periodic-table symbols.
623
624 =item B<-w, --workingdir> I<dirname>
625
626 Location of working directory. Default: current directory.
627
628 =back
629
630 =head1 EXAMPLES
631
632 To count compounds in SD file(s), type:
633
634 % InfoSDFiles.pl Sample1.sdf
635 % InfoSDFiles.pl Sample1.sdf Sample2.sdf
636 % InfoSDFiles.pl *.sdf
637
638 To list all available information for SD file(s), type:
639
640 % InfoSDFiles.pl -a *.sdf
641
642 To list all data fields present in Sample.sdf, type:
643
644 % InfoSDFiles.pl -f Sample.sdf
645
646 To count number of compounds which contain salts and list associated structural
647 data, type:
648
649 % InfoSDFiles.pl -s -d 3 Sample.sdf
650
651 =head1 AUTHOR
652
653 Manish Sud <msud@san.rr.com>
654
655 =head1 SEE ALSO
656
657 ExtractFromSDFiles.pl, FilterSDFiles.pl, MergeTextFilesWithSD.pl
658
659 =head1 COPYRIGHT
660
661 Copyright (C) 2015 Manish Sud. All rights reserved.
662
663 This file is part of MayaChemTools.
664
665 MayaChemTools is free software; you can redistribute it and/or modify it under
666 the terms of the GNU Lesser General Public License as published by the Free
667 Software Foundation; either version 3 of the License, or (at your option)
668 any later version.
669
670 =cut