0
|
1 #!/usr/bin/perl -w
|
|
2 #
|
|
3 # $RCSfile: InfoSDFiles.pl,v $
|
|
4 # $Date: 2015/02/28 20:46:20 $
|
|
5 # $Revision: 1.35 $
|
|
6 #
|
|
7 # Author: Manish Sud <msud@san.rr.com>
|
|
8 #
|
|
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
|
|
10 #
|
|
11 # This file is part of MayaChemTools.
|
|
12 #
|
|
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
|
|
14 # the terms of the GNU Lesser General Public License as published by the Free
|
|
15 # Software Foundation; either version 3 of the License, or (at your option) any
|
|
16 # later version.
|
|
17 #
|
|
18 # MayaChemTools is distributed in the hope that it will be useful, but without
|
|
19 # any warranty; without even the implied warranty of merchantability or fitness
|
|
20 # for a particular purpose. See the GNU Lesser General Public License for more
|
|
21 # details.
|
|
22 #
|
|
23 # You should have received a copy of the GNU Lesser General Public License
|
|
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
|
|
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
|
|
26 # Boston, MA, 02111-1307, USA.
|
|
27 #
|
|
28
|
|
29 use strict;
|
|
30 use FindBin; use lib "$FindBin::Bin/../lib";
|
|
31 use Getopt::Long;
|
|
32 use File::Basename;
|
|
33 use Benchmark;
|
|
34 use SDFileUtil;
|
|
35 use TextUtil;
|
|
36 use FileUtil;
|
|
37
|
|
# File-scoped state shared with the subroutines defined further down.
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush the currently selected output handle (STDOUT) so progress
# messages show up immediately.
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName:Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script; show usage when help is requested or no
# file names are left on the command line...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
    die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@SDFilesList);
@SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Setup information about input files...
print "Checking input SD file(s)...\n";
my(%SDFilesInfo, %SDCmpdsInfo);
RetrieveSDFilesInfo();
InitializeSDCmpdsInfo();

# Process input files; files flagged as not okay by RetrieveSDFilesInfo are
# skipped...
if (@SDFilesList > 1) {
    print "\nProcessing SD files...\n";
}
for my $FileIndex (0 .. $#SDFilesList) {
    if ($SDFilesInfo{FileOkay}[$FileIndex]) {
        print "\nProcessing file $SDFilesList[$FileIndex]...\n";
        ListSDFileInfo($FileIndex);
    }
}
ListTotalSizeOfFiles();

print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
|
|
86
|
|
87 ###############################################################################
|
|
88
|
|
# List appropriate information for the SD file at position $Index in
# @SDFilesList: detailed compound checks when $OptionsInfo{ProcessCmpdInfo} is
# set, otherwise just the compound count; followed by the file size and last
# modification time recorded in %SDFilesInfo.
sub ListSDFileInfo {
    my($Index) = @_;

    if ($OptionsInfo{ProcessCmpdInfo}) {
        ListCompoundDetailsInfo($Index);
    }
    else {
        ListCompoundCountInfo($Index);
    }

    # File size and modification information...
    print "\nFile size: ", FormatFileSize($SDFilesInfo{FileSize}[$Index]), " \n";
    print "Last modified: ", $SDFilesInfo{FileLastModified}[$Index], " \n";
}
|
|
107
|
|
# List number of compounds in the SD file at position $Index in @SDFilesList.
#
# A compound is counted for every line starting with the "$$$$" record
# delimiter. The count is printed and also added to the running total kept in
# $SDCmpdsInfo{TotalCmpdCount}. Dies when the file can't be opened.
sub ListCompoundCountInfo {
    my($Index) = @_;
    my($SDFile, $CmpdCount);

    $SDFile = $SDFilesList[$Index];
    $CmpdCount = 0;

    # Three-argument open with a lexical handle: the file name is never
    # interpreted as an open mode, and the handle is private to this sub.
    open my $SDFileHandle, '<', $SDFile or die "Couldn't open $SDFile: $! \n";
    while (my $Line = <$SDFileHandle>) {
        if ($Line =~ /^\$\$\$\$/) {
            $CmpdCount++;
        }
    }
    close $SDFileHandle;

    $SDCmpdsInfo{TotalCmpdCount} += $CmpdCount;

    print "\nNumber of compounds: $CmpdCount\n";
}
|
|
129
|
|
# List detailed compound information for the SD file at position $Index in
# @SDFilesList.
#
# Reads the file one compound record at a time and, depending on the flags in
# %OptionsInfo, counts records with empty atom/bond blocks, atom/bond blocks
# not matching the counts line, the chiral flag set, unknown atoms, invalid
# atom numbers and multiple fragments (salts). Data field statistics are
# collected through ProcessCmpdInfo when $OptionsInfo{ProcessCmpdData} is set.
# At detail level 2 and above, each problem is reported as it is found; at
# level 3, the complete record of a problematic compound is printed as well.
# The per-file summary is printed at the end and the compound count is added
# to $SDCmpdsInfo{TotalCmpdCount}. Dies when the file can't be opened.
sub ListCompoundDetailsInfo {
    my($Index) = @_;
    my($SDFile, $SDFileHandle, $CmpdCount, $EmptyCtabBlocksCount, $MismatchCtabBlockCount, $ChiralCtabBlockCount, $UnknownAtomsCtabBlockCount, $InvalidAtomNumbersCtabBlockCount, $SaltsCtabBlockCount, $CtabLinesCount, $PrintCmpdCounterHeader, $ProblematicCmpdData, $CmpdString, @CmpdLines);

    $SDFile = $SDFilesList[$Index];

    ($CmpdCount, $EmptyCtabBlocksCount, $MismatchCtabBlockCount, $ChiralCtabBlockCount, $UnknownAtomsCtabBlockCount, $InvalidAtomNumbersCtabBlockCount, $SaltsCtabBlockCount) = (0) x 7;

    # Data field statistics are kept per file; the running total is preserved.
    InitializeSDCmpdsInfo();

    $PrintCmpdCounterHeader = 1;

    # Three-argument open with a lexical handle: the file name is never
    # interpreted as an open mode. The lexical handle is a glob reference,
    # just like the \*SDFILE that used to be handed to ReadCmpdString.
    open $SDFileHandle, '<', $SDFile or die "Couldn't open $SDFile: $! \n";
    while ($CmpdString = ReadCmpdString($SDFileHandle)) {
        $CmpdCount++;
        $ProblematicCmpdData = 0;

        # Progress indicator every 5000 compounds at the default detail level...
        if ($OptionsInfo{Detail} <= 1) {
            if (($CmpdCount % 5000) == 0) {
                if ($PrintCmpdCounterHeader) {
                    $PrintCmpdCounterHeader = 0;
                    print "Processing compounds:";
                }
                print "$CmpdCount...";
            }
        }

        @CmpdLines = split "\n", $CmpdString;
        $CtabLinesCount = GetCtabLinesCount(\@CmpdLines);

        if ($OptionsInfo{All} || $OptionsInfo{Empty}) {
            if ($CtabLinesCount <= 0) {
                $EmptyCtabBlocksCount++;
                $ProblematicCmpdData = 1;
            }
        }

        if ($CtabLinesCount > 0) {
            # The counts line is the fourth line of a compound record...
            my ($AtomCount, $BondCount, $ChiralFlag) = ParseCmpdCountsLine($CmpdLines[3]);

            if ($OptionsInfo{All} || $OptionsInfo{Mismatch}) {
                if ($CtabLinesCount != ($AtomCount + $BondCount)) {
                    $MismatchCtabBlockCount++;
                    $ProblematicCmpdData = 1;
                    if ($OptionsInfo{Detail} >= 2) {
                        print "\nMismatch found: Ctab lines count: $CtabLinesCount; Atoms count: $AtomCount; Bond count: $BondCount\n";
                    }
                }
            }

            # A set chiral flag is counted but not treated as a problem...
            if ($OptionsInfo{All} || $OptionsInfo{Chiral}) {
                if ($ChiralFlag == 1) {
                    $ChiralCtabBlockCount++;
                }
            }

            # The remaining checks are performed only on atom/bond blocks
            # which agree with the counts line...
            if ($CtabLinesCount == ($AtomCount + $BondCount)) {
                if ($OptionsInfo{All} || $OptionsInfo{UnknownAtoms}) {
                    my($UnknownAtomCount, $UnknownAtoms, $UnknownAtomLines) = GetUnknownAtoms(\@CmpdLines);
                    if ($UnknownAtomCount) {
                        $UnknownAtomsCtabBlockCount++;
                        $ProblematicCmpdData = 1;
                        if ($OptionsInfo{Detail} >= 2) {
                            print "\nUnknown atom(s) found: $UnknownAtomCount\nUnknown atom(s) symbols:$UnknownAtoms\nUnknown atom(s) data lines:\n$UnknownAtomLines\n";
                        }
                    }
                }
                if ($OptionsInfo{All} || $OptionsInfo{InvalidAtomNumbers}) {
                    my($InvalidAtomNumbersCount, $InvalidAtomNumbers, $InvalidAtomNumberLines) = GetInvalidAtomNumbers(\@CmpdLines);
                    if ($InvalidAtomNumbersCount) {
                        $InvalidAtomNumbersCtabBlockCount++;
                        $ProblematicCmpdData = 1;
                        if ($OptionsInfo{Detail} >= 2) {
                            print "\nInvalid atom number(s) found: $InvalidAtomNumbersCount\nInvalid atom number(s):$InvalidAtomNumbers\nInvalid atom number(s) data lines:\n$InvalidAtomNumberLines\n";
                        }
                    }
                }
                if ($OptionsInfo{All} || $OptionsInfo{Salts}) {
                    # More than one fragment is reported as a salt...
                    my($FragmentsCount, $Fragments) = GetCmpdFragments(\@CmpdLines);
                    if ($FragmentsCount > 1) {
                        $SaltsCtabBlockCount++;
                        $ProblematicCmpdData = 1;
                        if ($OptionsInfo{Detail} >= 2) {
                            print "\nSalts found: $FragmentsCount\nSalts atom numbers:\n$Fragments\n";
                        }
                    }
                }
            }
        }

        if ($OptionsInfo{ProcessCmpdData}) {
            ProcessCmpdInfo(\@CmpdLines, $CmpdCount);
        }

        if ($OptionsInfo{Detail} >= 3) {
            if ($ProblematicCmpdData) {
                print "\nCompound data:\n$CmpdString\n\n";
            }
        }
    }

    # Terminate the progress indicator line, if one was started...
    if ($OptionsInfo{Detail} <= 1) {
        if (!$PrintCmpdCounterHeader) {
            print "\n";
        }
    }
    close $SDFileHandle;

    $SDCmpdsInfo{TotalCmpdCount} += $CmpdCount;

    print "\nNumber of compounds: $CmpdCount\n";

    if ($OptionsInfo{All} || $OptionsInfo{Empty}) {
        print "Number of empty atom/bond blocks: $EmptyCtabBlocksCount\n";
    }
    if ($OptionsInfo{All} || $OptionsInfo{Mismatch}) {
        print "Number of mismatched atom/bond blocks: $MismatchCtabBlockCount\n";
    }
    if ($OptionsInfo{All} || $OptionsInfo{UnknownAtoms}) {
        print "Number of atom blocks with unknown atom labels: $UnknownAtomsCtabBlockCount\n";
    }
    if ($OptionsInfo{All} || $OptionsInfo{InvalidAtomNumbers}) {
        print "Number of bond blocks and atom property blocks with invalid atom numbers: $InvalidAtomNumbersCtabBlockCount\n";
    }
    if ($OptionsInfo{All} || $OptionsInfo{Salts}) {
        print "Number of atom blocks containing salts: $SaltsCtabBlockCount\n";
    }
    if ($OptionsInfo{All} || $OptionsInfo{Chiral}) {
        print "Number of chiral atom/bond blocks: $ChiralCtabBlockCount\n";
    }
    if ($OptionsInfo{ProcessCmpdData}) {
        PrintCmpdInfoSummary();
    }

}
|
|
256
|
|
# Reset the per-file data field bookkeeping kept in %SDCmpdsInfo.
#
# The running compound total, TotalCmpdCount, is created on first use and is
# left alone afterwards; the label list and all the label maps are emptied.
sub InitializeSDCmpdsInfo {
    $SDCmpdsInfo{TotalCmpdCount} = 0 unless exists $SDCmpdsInfo{TotalCmpdCount};

    # Empty the containers in place instead of replacing them...
    @{$SDCmpdsInfo{FieldLabels}} = ();
    for my $MapName (qw(FieldLabelsMap NonEmptyFieldValuesCountMap EmptyFieldValuesCountMap NonNumericalFieldValuesCountMap NumericalFieldValuesCountMap)) {
        %{$SDCmpdsInfo{$MapName}} = ();
    }
}
|
|
271
|
|
# Process compound data header labels and figure out which ones are present for
# all the compounds...
#
# Arguments:
#   $CmpdLinesRef - reference to the lines of one compound record.
#   $CmpdCount    - 1-based position of the compound in the current file.
#
# Updates $SDCmpdsInfo{FieldLabels} and $SDCmpdsInfo{FieldLabelsMap}, where a
# label is marked "PresentInAll" or "PresentInSome". When requested through
# $OptionsInfo{CountEmptyData} and $OptionsInfo{CheckData}, it also updates the
# empty/non-empty and numerical/non-numerical value counts for each label.
sub ProcessCmpdInfo {
    my($CmpdLinesRef, $CmpdCount) = @_;
    my($Label);

    # Any compound after the first one has to be compared against the labels
    # collected so far, even when none have been collected yet; otherwise,
    # labels first seen after compounds without data fields would be
    # incorrectly marked as present in all the compounds.
    if (@{$SDCmpdsInfo{FieldLabels}} || (defined($CmpdCount) && $CmpdCount > 1)) {
        my (@CmpdFieldLabels) = GetCmpdDataHeaderLabels($CmpdLinesRef);
        my(%CmpdFieldLabelsMap) = ();
        # Setup a map for the current labels...
        for $Label (@CmpdFieldLabels) {
            $CmpdFieldLabelsMap{$Label} = "PresentInSome";
        }
        # Known labels missing from this compound are no longer present in all...
        for $Label (@{$SDCmpdsInfo{FieldLabels}}) {
            if (!$CmpdFieldLabelsMap{$Label}) {
                $SDCmpdsInfo{FieldLabelsMap}{$Label} = "PresentInSome";
            }
        }
        # Labels of this compound not seen before are added as present in some...
        for $Label (@CmpdFieldLabels ) {
            if (!$SDCmpdsInfo{FieldLabelsMap}{$Label}) {
                # It's a new label...
                push @{$SDCmpdsInfo{FieldLabels}}, $Label;
                $SDCmpdsInfo{FieldLabelsMap}{$Label} = "PresentInSome";
            }
        }
    }
    else {
        # Get the initial label set and set up a map...
        @{$SDCmpdsInfo{FieldLabels}} = GetCmpdDataHeaderLabels($CmpdLinesRef);
        for $Label (@{$SDCmpdsInfo{FieldLabels}}) {
            $SDCmpdsInfo{FieldLabelsMap}{$Label} = "PresentInAll";
        }
    }

    if ($OptionsInfo{CountEmptyData} || $OptionsInfo{CheckData}) {
        my(%DataFieldAndValues, $Value);

        %DataFieldAndValues = GetCmpdDataHeaderLabelsAndValues($CmpdLinesRef);

        # Sorted, so that the detail messages come out in a stable order...
        for $Label (sort keys %DataFieldAndValues) {
            $Value = $DataFieldAndValues{$Label};

            # Count empty data field values...
            if ($OptionsInfo{CountEmptyData}) {
                if (IsNotEmpty($Value)) {
                    $SDCmpdsInfo{NonEmptyFieldValuesCountMap}{$Label} += 1;
                }
                else {
                    if ($OptionsInfo{Detail} >= 2) {
                        print "Compound record $CmpdCount: Empty data field <$Label>\n";
                    }
                    $SDCmpdsInfo{EmptyFieldValuesCountMap}{$Label} += 1;
                }
            }

            # Count numerical data field values...
            if ($OptionsInfo{CheckData}) {
                if (IsNumerical($Value)) {
                    $SDCmpdsInfo{NumericalFieldValuesCountMap}{$Label} += 1;
                }
                else {
                    $SDCmpdsInfo{NonNumericalFieldValuesCountMap}{$Label} += 1;
                }
            }
        }
    }
}
|
|
356
|
|
# Print compound data field summary for the current SD file.
#
# Lists the number of data fields and their labels, splits the labels into
# those present in all and in some compounds whenever the two groups differ,
# and prints the empty/non-empty and numerical/non-numerical value counts
# when $OptionsInfo{CountEmptyData} and $OptionsInfo{CheckData} are set.
sub PrintCmpdInfoSummary {
    # Nothing collected for this file...
    if (!@{$SDCmpdsInfo{FieldLabels}}) {
        print "\nNumber of data fields: 0\n";
        return;
    }

    my $AllLabelsRef = $SDCmpdsInfo{FieldLabels};
    my $LabelsMapRef = $SDCmpdsInfo{FieldLabelsMap};

    my @SortedLabels = sort keys %{$LabelsMapRef};
    my @FieldLabelsPresentInAll = grep { $LabelsMapRef->{$_} eq "PresentInAll" } @SortedLabels;
    my @FieldLabelsPresentInSome = grep { $LabelsMapRef->{$_} eq "PresentInSome" } @SortedLabels;

    # True when every collected label is present in all the compounds...
    my $EveryLabelInAllCmpds = (scalar(@FieldLabelsPresentInAll) == scalar(@{$AllLabelsRef})) ? 1 : 0;

    print "\nNumber of data fields: ", scalar(@{$AllLabelsRef}), "\n";
    print "All data field labels: ", join("", map { "<$_> " } @SortedLabels), "\n";

    if (!$EveryLabelInAllCmpds) {
        print "Data field labels present in all compounds: ", join("", map { "<$_> " } @FieldLabelsPresentInAll), "\n";
        print "Data field labels present in some compounds: ", join("", map { "<$_> " } @FieldLabelsPresentInSome), "\n";
    }

    # List empty data field values count...
    if ($OptionsInfo{CountEmptyData}) {
        my $NonEmptyMapRef = $SDCmpdsInfo{NonEmptyFieldValuesCountMap};
        my $EmptyMapRef = $SDCmpdsInfo{EmptyFieldValuesCountMap};

        print "\n";
        if ($EveryLabelInAllCmpds) {
            PrintDataInformation("Number of non-empty values for data field(s)", $AllLabelsRef, $NonEmptyMapRef);
            PrintDataInformation("Number of empty values for data field(s)", $AllLabelsRef, $EmptyMapRef);
        }
        else {
            PrintDataInformation("Number of non-empty values for data field(s) present in all compounds", \@FieldLabelsPresentInAll, $NonEmptyMapRef);
            PrintDataInformation("Number of empty values for data field(s) present in all compounds", \@FieldLabelsPresentInAll, $EmptyMapRef);
            PrintDataInformation("Number of non-empty values for data field(s) present in some compounds", \@FieldLabelsPresentInSome, $NonEmptyMapRef);
            PrintDataInformation("Number of empty values for data field(s) present in some compounds", \@FieldLabelsPresentInSome, $EmptyMapRef);
        }
        print "\n";
    }

    # List numerical data values count...
    if ($OptionsInfo{CheckData}) {
        my $NonNumericalMapRef = $SDCmpdsInfo{NonNumericalFieldValuesCountMap};
        my $NumericalMapRef = $SDCmpdsInfo{NumericalFieldValuesCountMap};

        print "\n";
        if ($EveryLabelInAllCmpds) {
            PrintDataInformation("Number of non-numerical values for data field(s)", $AllLabelsRef, $NonNumericalMapRef);
            PrintDataInformation("Number of numerical values for data field(s)", $AllLabelsRef, $NumericalMapRef);
        }
        else {
            PrintDataInformation("Number of non-numerical values for data field(s) present in all compounds", \@FieldLabelsPresentInAll, $NonNumericalMapRef);
            PrintDataInformation("Number of numerical values for data field(s) present in all compounds", \@FieldLabelsPresentInAll, $NumericalMapRef);
            PrintDataInformation("Number of non-numerical values for data field(s) present in some compounds", \@FieldLabelsPresentInSome, $NonNumericalMapRef);
            PrintDataInformation("Number of numerical values for data field(s) present in some compounds", \@FieldLabelsPresentInSome, $NumericalMapRef);
        }
        print "\n";
    }
}
|
|
# List data information as one line: "<InfoLabel>: <Label> - value, ...".
#
# Arguments:
#   $InfoLabel              - text printed in front of the list.
#   $DataLabelRef           - reference to an array of labels, in print order.
#   $DataLabelToValueMapRef - reference to a hash of label => value; labels
#                             missing from the hash are listed with a value of 0.
sub PrintDataInformation {
    my($InfoLabel, $DataLabelRef, $DataLabelToValueMapRef) = @_;

    my @Entries = map {
        " <$_> - " . (exists($DataLabelToValueMapRef->{$_}) ? $DataLabelToValueMapRef->{$_} : 0)
    } @{$DataLabelRef};

    # Joining on commas leaves no trailing separator to clean up...
    print "$InfoLabel: " . join(",", @Entries) . "\n";
}
|
|
442
|
|
# Print the total number of compounds and the combined size of all the SD
# files flagged as okay in %SDFilesInfo. Nothing is printed unless more than
# one file is okay.
sub ListTotalSizeOfFiles {
    my @OkayFileIndices = grep { $SDFilesInfo{FileOkay}[$_] } (0 .. $#SDFilesList);
    my $FileOkayCount = scalar(@OkayFileIndices);

    my $TotalSize = 0;
    for my $FileIndex (@OkayFileIndices) {
        $TotalSize += $SDFilesInfo{FileSize}[$FileIndex];
    }

    # Totals are redundant for a single file...
    return if $FileOkayCount <= 1;

    print "\nTotal number of compounds in $FileOkayCount SD files: $SDCmpdsInfo{TotalCmpdCount}\n";
    print "\nTotal size of $FileOkayCount SD files: ", FormatFileSize($TotalSize), "\n";
}
|
|
462
|
|
# Retrieve information about SD files...
#
# Resets %SDCmpdsInfo and %SDFilesInfo and then, for each name in
# @SDFilesList, records whether the file is usable (FileOkay) along with its
# size (FileSize) and last modification time (FileLastModified). Files which
# don't exist, fail the CheckFileType test for "sdf sd", or can't be opened
# are reported with a warning and left flagged as not okay.
sub RetrieveSDFilesInfo {
    my($Index, $SDFile, $ModifiedTimeString, $ModifiedDateString);

    %SDCmpdsInfo = ();

    %SDFilesInfo = ();
    @{$SDFilesInfo{FileOkay}} = ();
    @{$SDFilesInfo{FileSize}} = ();
    @{$SDFilesInfo{FileLastModified}} = ();

    FILELIST: for $Index (0 .. $#SDFilesList) {
        # Assume the worst until all the checks have passed...
        $SDFilesInfo{FileOkay}[$Index] = 0;
        $SDFilesInfo{FileSize}[$Index] = 0;
        $SDFilesInfo{FileLastModified}[$Index] = '';

        $SDFile = $SDFilesList[$Index];
        if (!(-e $SDFile)) {
            warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
            next FILELIST;
        }
        if (!CheckFileType($SDFile, "sdf sd")) {
            warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
            next FILELIST;
        }

        # Make sure the file is readable. Three-argument open with a lexical
        # handle: the file name is never interpreted as an open mode.
        my $SDFileHandle;
        if (!open($SDFileHandle, '<', $SDFile)) {
            warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
            next FILELIST;
        }
        close $SDFileHandle;

        $SDFilesInfo{FileOkay}[$Index] = 1;
        $SDFilesInfo{FileSize}[$Index] = FileSize($SDFile);
        ($ModifiedTimeString, $ModifiedDateString) = FormattedFileModificationTimeAndDate($SDFile);
        $SDFilesInfo{FileLastModified}[$Index] = "$ModifiedTimeString; $ModifiedDateString";
    }
}
|
|
500
|
|
# Translate the raw command line options in %Options into %OptionsInfo.
#
# Each on/off option is copied under its capitalized name, with 0 standing in
# for options which were not specified; the detail level is copied as is. The
# derived flags record whether compounds need to be examined at all
# (ProcessCmpdInfo), whether their data fields need to be examined
# (ProcessCmpdData), and which data field counts to collect (CountEmptyData,
# CheckData).
sub ProcessOptions {
    %OptionsInfo = ();

    # The command line name of each option is the lower case form of its key...
    for my $InfoKey (qw(All Chiral Count DataCheck Empty Fields InvalidAtomNumbers Mismatch Salts UnknownAtoms)) {
        $OptionsInfo{$InfoKey} = $Options{lc($InfoKey)} || 0;
    }

    $OptionsInfo{Detail} = $Options{detail};

    my @CmpdInfoOptions = qw(all chiral empty fields invalidatomnumbers mismatch salts unknownatoms datacheck);
    $OptionsInfo{ProcessCmpdInfo} = (grep { $Options{$_} } @CmpdInfoOptions) ? 1 : 0;

    my @CmpdDataOptions = qw(all fields empty datacheck);
    $OptionsInfo{ProcessCmpdData} = (grep { $Options{$_} } @CmpdDataOptions) ? 1 : 0;

    $OptionsInfo{CountEmptyData} = ($Options{all} || $Options{empty}) ? 1 : 0;
    $OptionsInfo{CheckData} = ($Options{all} || $Options{datacheck}) ? 1 : 0;
}
|
|
525
|
|
# Set up option defaults, parse the command line into %Options and validate
# the values specified for the working directory and detail level options.
#
# Dies with a message on unknown options, a working directory which is not a
# directory or can't be entered, and a detail level outside 1 to 3. Changes
# the current directory when a working directory is specified.
sub SetupScriptUsage {

    # Defaults first; GetOptions overrides whatever is specified...
    %Options = (detail => 1);

    my @OptionSpecs = ("all|a", "count|c", "chiral", "datacheck", "detail|d:i", "empty|e", "fields|f", "help|h", "invalidatomnumbers|i", "mismatch|m", "salts|s", "unknownatoms|u", "workingdir|w=s");
    GetOptions(\%Options, @OptionSpecs)
        or die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";

    if ($Options{workingdir}) {
        -d $Options{workingdir}
            or die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
        chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
    }

    unless ($Options{detail} > 0 && $Options{detail} <= 3) {
        die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Possible values: 1 to 3\n";
    }
}
|
|
545
|
|
546 __END__
|
|
547
|
|
548 =head1 NAME
|
|
549
|
|
550 InfoSDFiles.pl - List information about SDFile(s)
|
|
551
|
|
552 =head1 SYNOPSIS
|
|
553
|
|
554 InfoSDFiles.pl SDFile(s)...
|
|
555
|
|
556 InfoSDFiles.pl [B<-a, --all>] [B<-c, --count>] [B<--chiral>] [B<--datacheck>]
|
|
557 [B<-d, --detail> infolevel] [B<-e, --empty>] [B<-f, --fields>] [B<-h, --help>]
|
|
558 [B<-i, --invalidatomnumbers>] [B<-m, --mismatch>] [B<-s, --salts>] [B<-u, --unknownatoms>]
|
|
559 [B<-w, --workingdir> dirname] SDFile(s)...
|
|
560
|
|
561 =head1 DESCRIPTION
|
|
562
|
|
563 List information about I<SDFile(s)> contents: number of compounds, empty records
|
|
564 and so on. Multiple SDFile names are separated by spaces. The valid file extensions
|
|
565 are I<.sdf> and I<.sd>. All other file names are ignored. All the SD files in the current
|
|
566 directory can be specified either by I<*.sdf> or the current directory name.
|
|
567
|
|
568 =head1 OPTIONS
|
|
569
|
|
570 =over 4
|
|
571
|
|
572 =item B<-a, --all>
|
|
573
|
|
574 List all the available information.
|
|
575
|
|
576 =item B<-c, --count>
|
|
577
|
|
578 List number of compounds. This is B<default behavior>.
|
|
579
|
|
580 =item B<--chiral>
|
|
581
|
|
582 List number of atom/bond blocks for compounds with chiral flag set in
|
|
583 count line.
|
|
584
|
|
585 =item B<-d, --detail> I<infolevel>
|
|
586
|
|
587 Level of information to print. Default: 1. Possible values: I<1, 2, or 3>.
|
|
588
|
|
589 =item B<--datacheck>
|
|
590
|
|
591 List number of numerical and non-numerical values for each data field.
|
|
592
|
|
593 =item B<-e, --empty>
|
|
594
|
|
595 List number of empty atom/bond blocks and data fields for compounds.
|
|
596
|
|
597 =item B<-f, --fields>
|
|
598
|
|
599 List data field labels present for compounds.
|
|
600
|
|
601 =item B<-h, --help>
|
|
602
|
|
603 Print this help message.
|
|
604
|
|
605 =item B<-i, --invalidatomnumbers>
|
|
606
|
|
607 List number of bond blocks for compounds which contain invalid atom numbers.
|
|
608
|
|
609 =item B<-m, --mismatch>
|
|
610
|
|
611 List number of atom/bond blocks for compounds which don't match with counts
|
|
612 line information in header block.
|
|
613
|
|
614 =item B<-s, --salts>
|
|
615
|
|
616 List number of atom blocks for compounds which contain salts identified as
|
|
617 disconnected structural units.
|
|
618
|
|
619 =item B<-u, --unknownatoms>
|
|
620
|
|
621 List number of atom blocks for compounds which contain special atom symbols
|
|
622 such as L, Q, *, LP, X, R#, or any other non-periodic table symbols.
|
|
623
|
|
624 =item B<-w, --workingdir> I<dirname>
|
|
625
|
|
626 Location of working directory. Default: current directory.
|
|
627
|
|
628 =back
|
|
629
|
|
630 =head1 EXAMPLES
|
|
631
|
|
632 To count compounds in SD file(s), type:
|
|
633
|
|
634 % InfoSDFiles.pl Sample1.sdf
|
|
635 % InfoSDFiles.pl Sample1.sdf Sample2.sdf
|
|
636 % InfoSDFiles.pl *.sdf
|
|
637
|
|
638 To list all available information for SD file(s), type:
|
|
639
|
|
640 % InfoSDFiles.pl -a *.sdf
|
|
641
|
|
642 To list all data fields present in sample.sdf, type:
|
|
643
|
|
644 % InfoSDFiles.pl -f Sample.sdf
|
|
645
|
|
646 To count number of compounds which contain salts and list associated structural
|
|
647 data, type:
|
|
648
|
|
649 % InfoSDFiles.pl -s -d 3 Sample.sdf
|
|
650
|
|
651 =head1 AUTHOR
|
|
652
|
|
653 Manish Sud <msud@san.rr.com>
|
|
654
|
|
655 =head1 SEE ALSO
|
|
656
|
|
657 ExtractFromSDFiles.pl, FilterSDFiles.pl, MergeTextFilesWithSD.pl
|
|
658
|
|
659 =head1 COPYRIGHT
|
|
660
|
|
661 Copyright (C) 2015 Manish Sud. All rights reserved.
|
|
662
|
|
663 This file is part of MayaChemTools.
|
|
664
|
|
665 MayaChemTools is free software; you can redistribute it and/or modify it under
|
|
666 the terms of the GNU Lesser General Public License as published by the Free
|
|
667 Software Foundation; either version 3 of the License, or (at your option)
|
|
668 any later version.
|
|
669
|
|
670 =cut
|