Mercurial > repos > deepakjadmin > mayatool3_test2
comparison bin/InfoSDFiles.pl @ 0:4816e4a8ae95 draft default tip
Uploaded
| author | deepakjadmin |
|---|---|
| date | Wed, 20 Jan 2016 09:23:18 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:4816e4a8ae95 |
|---|---|
| 1 #!/usr/bin/perl -w | |
| 2 # | |
| 3 # $RCSfile: InfoSDFiles.pl,v $ | |
| 4 # $Date: 2015/02/28 20:46:20 $ | |
| 5 # $Revision: 1.35 $ | |
| 6 # | |
| 7 # Author: Manish Sud <msud@san.rr.com> | |
| 8 # | |
| 9 # Copyright (C) 2015 Manish Sud. All rights reserved. | |
| 10 # | |
| 11 # This file is part of MayaChemTools. | |
| 12 # | |
| 13 # MayaChemTools is free software; you can redistribute it and/or modify it under | |
| 14 # the terms of the GNU Lesser General Public License as published by the Free | |
| 15 # Software Foundation; either version 3 of the License, or (at your option) any | |
| 16 # later version. | |
| 17 # | |
| 18 # MayaChemTools is distributed in the hope that it will be useful, but without | |
| 19 # any warranty; without even the implied warranty of merchantability or fitness | |
| 20 # for a particular purpose. See the GNU Lesser General Public License for more | |
| 21 # details. | |
| 22 # | |
| 23 # You should have received a copy of the GNU Lesser General Public License | |
| 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or | |
| 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, | |
| 26 # Boston, MA, 02111-1307, USA. | |
| 27 # | |
| 28 | |
| 29 use strict; | |
| 30 use FindBin; use lib "$FindBin::Bin/../lib"; | |
| 31 use Getopt::Long; | |
| 32 use File::Basename; | |
| 33 use Benchmark; | |
| 34 use SDFileUtil; | |
| 35 use TextUtil; | |
| 36 use FileUtil; | |
| 37 | |
# Script level state shared with the subroutines defined later in this file...
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName:Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script; usage is shown when help is requested or
# no file name is specified on the command line...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Setup information about input files...
print "Checking input SD file(s)...\n";
my(%SDFilesInfo, %SDCmpdsInfo);
RetrieveSDFilesInfo();
InitializeSDCmpdsInfo();

# Process input files; files not marked okay by RetrieveSDFilesInfo are skipped...
if (@SDFilesList > 1) {
  print "\nProcessing SD files...\n";
}
for my $FileIndex (0 .. $#SDFilesList) {
  next unless $SDFilesInfo{FileOkay}[$FileIndex];

  print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  ListSDFileInfo($FileIndex);
}
ListTotalSizeOfFiles();

print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
| 86 | |
| 87 ############################################################################### | |
| 88 | |
# List appropriate information for the SD file at the specified index in the
# file list: compound information followed by file size and modification time...
sub ListSDFileInfo {
  my($Index) = @_;

  # Detailed listing is used when ProcessOptions has set ProcessCmpdInfo;
  # otherwise, compounds are simply counted...
  my $ListCmpdInfoSub = $OptionsInfo{ProcessCmpdInfo} ? \&ListCompoundDetailsInfo : \&ListCompoundCountInfo;
  $ListCmpdInfoSub->($Index);

  # File size and modification information...
  print "\nFile size: ", FormatFileSize($SDFilesInfo{FileSize}[$Index]), " \n";
  print "Last modified: ", $SDFilesInfo{FileLastModified}[$Index], " \n";
}
| 107 | |
# List number of compounds in SD file at the specified index in the file list...
#
# Each line starting with the record delimiter $$$$ counts as one compound. The
# count is also added to the running total kept in %SDCmpdsInfo.
#
sub ListCompoundCountInfo {
  my($Index) = @_;

  my $SDFile = $SDFilesList[$Index];
  my $CmpdCount = 0;

  # Three argument open with a lexical handle: the file name is always treated
  # as a name to read and never as an open mode or a command...
  open my $SDFileHandle, '<', $SDFile or die "Couldn't open $SDFile: $! \n";
  while (my $Line = <$SDFileHandle>) {
    if ($Line =~ /^\$\$\$\$/) {
      $CmpdCount++;
    }
  }
  close $SDFileHandle;

  $SDCmpdsInfo{TotalCmpdCount} += $CmpdCount;

  print "\nNumber of compounds: $CmpdCount\n";
}
| 129 | |
# List detailed compound information for SD file at the specified index in the
# file list...
#
# The file is read one compound record at a time. For each record, the checks
# selected through command line options are performed and counted; data field
# statistics are collected via ProcessCmpdInfo when requested. A summary for
# the file is printed at the end and the compound count is added to the
# running total kept in %SDCmpdsInfo.
#
sub ListCompoundDetailsInfo {
  my($Index) = @_;

  my $SDFile = $SDFilesList[$Index];
  my $Detail = $OptionsInfo{Detail};

  # Figure out which checks are to be performed; all option turns on every check...
  my %Check = map { ($_, $OptionsInfo{All} || $OptionsInfo{$_}) } qw(Empty Mismatch Chiral UnknownAtoms InvalidAtomNumbers Salts);

  my($CmpdCount, $EmptyCtabBlocksCount, $MismatchCtabBlockCount, $ChiralCtabBlockCount, $UnknownAtomsCtabBlockCount, $InvalidAtomNumbersCtabBlockCount, $SaltsCtabBlockCount) = (0) x 7;

  # Data field information is collected separately for each file...
  InitializeSDCmpdsInfo();

  my $ProgressHeaderPrinted = 0;

  # Three argument open with a lexical handle: the file name is always treated
  # as a name to read and never as an open mode or a command...
  open my $SDFileHandle, '<', $SDFile or die "Couldn't open $SDFile: $! \n";
  while (my $CmpdString = ReadCmpdString($SDFileHandle)) {
    $CmpdCount++;
    my $ProblematicCmpdData = 0;

    # Show progress after every 5000 compounds at the lowest detail level...
    if ($Detail <= 1 && ($CmpdCount % 5000) == 0) {
      if (!$ProgressHeaderPrinted) {
        $ProgressHeaderPrinted = 1;
        print "Processing compounds:";
      }
      print "$CmpdCount...";
    }

    my @CmpdLines = split "\n", $CmpdString;
    my $CtabLinesCount = GetCtabLinesCount(\@CmpdLines);

    if ($CtabLinesCount <= 0) {
      if ($Check{Empty}) {
        $EmptyCtabBlocksCount++;
        $ProblematicCmpdData = 1;
      }
    }
    else {
      # Counts line is the fourth line of a compound record...
      my($AtomCount, $BondCount, $ChiralFlag) = ParseCmpdCountsLine($CmpdLines[3]);
      my $CtabMatchesCountsLine = ($CtabLinesCount == ($AtomCount + $BondCount)) ? 1 : 0;

      if ($Check{Mismatch} && !$CtabMatchesCountsLine) {
        $MismatchCtabBlockCount++;
        $ProblematicCmpdData = 1;
        if ($Detail >= 2) {
          print "\nMismatch found: Ctab lines count: $CtabLinesCount; Atoms count: $AtomCount; Bond count: $BondCount\n";
        }
      }
      if ($Check{Chiral} && $ChiralFlag == 1) {
        $ChiralCtabBlockCount++;
      }

      # Atom and bond data is examined only when it agrees with the counts line...
      if ($CtabMatchesCountsLine) {
        if ($Check{UnknownAtoms}) {
          my($UnknownAtomCount, $UnknownAtoms, $UnknownAtomLines) = GetUnknownAtoms(\@CmpdLines);
          if ($UnknownAtomCount) {
            $UnknownAtomsCtabBlockCount++;
            $ProblematicCmpdData = 1;
            if ($Detail >= 2) {
              print "\nUnknown atom(s) found: $UnknownAtomCount\nUnknown atom(s) symbols:$UnknownAtoms\nUnknown atom(s) data lines:\n$UnknownAtomLines\n";
            }
          }
        }
        if ($Check{InvalidAtomNumbers}) {
          my($InvalidAtomNumbersCount, $InvalidAtomNumbers, $InvalidAtomNumberLines) = GetInvalidAtomNumbers(\@CmpdLines);
          if ($InvalidAtomNumbersCount) {
            $InvalidAtomNumbersCtabBlockCount++;
            $ProblematicCmpdData = 1;
            if ($Detail >= 2) {
              print "\nInvalid atom number(s) found: $InvalidAtomNumbersCount\nInvalid atom number(s):$InvalidAtomNumbers\nInvalid atom number(s) data lines:\n$InvalidAtomNumberLines\n";
            }
          }
        }
        if ($Check{Salts}) {
          # More than one fragment is reported as a salt...
          my($FragmentsCount, $Fragments) = GetCmpdFragments(\@CmpdLines);
          if ($FragmentsCount > 1) {
            $SaltsCtabBlockCount++;
            $ProblematicCmpdData = 1;
            if ($Detail >= 2) {
              print "\nSalts found: $FragmentsCount\nSalts atom numbers:\n$Fragments\n";
            }
          }
        }
      }
    }

    if ($OptionsInfo{ProcessCmpdData}) {
      ProcessCmpdInfo(\@CmpdLines, $CmpdCount);
    }
    if ($Detail >= 3 && $ProblematicCmpdData) {
      print "\nCompound data:\n$CmpdString\n\n";
    }
  }
  # Terminate the progress line, if one was started...
  if ($Detail <= 1 && $ProgressHeaderPrinted) {
    print "\n";
  }
  close $SDFileHandle;

  $SDCmpdsInfo{TotalCmpdCount} += $CmpdCount;

  print "\nNumber of compounds: $CmpdCount\n";

  if ($Check{Empty}) {
    print "Number of empty atom/bond blocks: $EmptyCtabBlocksCount\n";
  }
  if ($Check{Mismatch}) {
    print "Number of mismatched atom/bond blocks: $MismatchCtabBlockCount\n";
  }
  if ($Check{UnknownAtoms}) {
    print "Number of atom blocks with unknown atom labels: $UnknownAtomsCtabBlockCount\n";
  }
  if ($Check{InvalidAtomNumbers}) {
    print "Number of bond blocks and atom property blocks with invalid atom numbers: $InvalidAtomNumbersCtabBlockCount\n";
  }
  if ($Check{Salts}) {
    print "Number of atom blocks containing salts: $SaltsCtabBlockCount\n";
  }
  if ($Check{Chiral}) {
    print "Number of chiral atom/bond blocks: $ChiralCtabBlockCount\n";
  }
  if ($OptionsInfo{ProcessCmpdData}) {
    PrintCmpdInfoSummary();
  }
}
| 256 | |
# Reset data field information collected for a SD file; the total compound
# count is set to zero only when it has not been set up yet...
sub InitializeSDCmpdsInfo {
  $SDCmpdsInfo{TotalCmpdCount} = 0 unless exists $SDCmpdsInfo{TotalCmpdCount};

  # Labels in the order they were first seen...
  @{$SDCmpdsInfo{FieldLabels}} = ();

  # Label status and data field value count maps keyed by label...
  for my $MapName (qw(FieldLabelsMap NonEmptyFieldValuesCountMap EmptyFieldValuesCountMap NonNumericalFieldValuesCountMap NumericalFieldValuesCountMap)) {
    %{$SDCmpdsInfo{$MapName}} = ();
  }
}
| 271 | |
# Process compound data header labels and figure out which ones are present for
# all the compounds; optionally, count empty/non-empty and numerical/non-numerical
# data field values...
#
# Arguments: reference to an array containing compound lines, and the number of
# the compound record in the current SD file. The caller in this file passes 1
# for the first record.
#
# Results are accumulated in %SDCmpdsInfo.
#
sub ProcessCmpdInfo {
  my($CmpdLinesRef, $CmpdCount) = @_;

  my @CmpdFieldLabels = GetCmpdDataHeaderLabels($CmpdLinesRef);
  my %CmpdFieldLabelsMap = map { ($_, 1) } @CmpdFieldLabels;

  # A label can be present in all compounds only when the first compound has
  # it. Relying on an empty label list to detect the first compound would mark
  # labels as present in all compounds whenever leading compounds contain no
  # data fields; so the record number is used. The empty list check is kept as
  # a fallback for calls without a record number...
  my $IsFirstCmpd = defined($CmpdCount) ? ($CmpdCount == 1) : !@{$SDCmpdsInfo{FieldLabels}};
  my $NewLabelStatus = $IsFirstCmpd ? "PresentInAll" : "PresentInSome";

  # Known labels missing from this compound are present only in some compounds...
  for my $Label (@{$SDCmpdsInfo{FieldLabels}}) {
    if (!exists $CmpdFieldLabelsMap{$Label}) {
      $SDCmpdsInfo{FieldLabelsMap}{$Label} = "PresentInSome";
    }
  }

  # Add labels seen for the first time; checking the map also keeps a label
  # repeated within a compound from being listed more than once...
  for my $Label (@CmpdFieldLabels) {
    next if exists $SDCmpdsInfo{FieldLabelsMap}{$Label};

    push @{$SDCmpdsInfo{FieldLabels}}, $Label;
    $SDCmpdsInfo{FieldLabelsMap}{$Label} = $NewLabelStatus;
  }

  if ($OptionsInfo{CountEmptyData} || $OptionsInfo{CheckData}) {
    my %DataFieldAndValues = GetCmpdDataHeaderLabelsAndValues($CmpdLinesRef);

    # Sorted labels keep the order of empty data field messages reproducible...
    for my $Label (sort keys %DataFieldAndValues) {
      my $Value = $DataFieldAndValues{$Label};

      if ($OptionsInfo{CountEmptyData}) {
        # Count empty data field values...
        if (IsNotEmpty($Value)) {
          $SDCmpdsInfo{NonEmptyFieldValuesCountMap}{$Label}++;
        }
        else {
          if ($OptionsInfo{Detail} >= 2) {
            print "Compound record $CmpdCount: Empty data field <$Label>\n";
          }
          $SDCmpdsInfo{EmptyFieldValuesCountMap}{$Label}++;
        }
      }
      if ($OptionsInfo{CheckData}) {
        # Count numerical data field values...
        my $CountMapName = IsNumerical($Value) ? "NumericalFieldValuesCountMap" : "NonNumericalFieldValuesCountMap";
        $SDCmpdsInfo{$CountMapName}{$Label}++;
      }
    }
  }
}
| 356 | |
# Print summary of data field information collected in %SDCmpdsInfo: number of
# data fields, their labels and, when requested, data field value counts...
sub PrintCmpdInfoSummary {
  my $FieldLabelsRef = $SDCmpdsInfo{FieldLabels};

  if (!@{$FieldLabelsRef}) {
    print "\nNumber of data fields: 0\n";
    return;
  }

  my $LabelStatusMapRef = $SDCmpdsInfo{FieldLabelsMap};
  my @SortedLabels = sort keys %{$LabelStatusMapRef};
  my @LabelsInAll = grep { $LabelStatusMapRef->{$_} eq "PresentInAll" } @SortedLabels;
  my @LabelsInSome = ();
  my $AllLabelsInAllCmpds = (scalar(@LabelsInAll) == scalar(@{$FieldLabelsRef})) ? 1 : 0;

  print "\nNumber of data fields: ", scalar(@{$FieldLabelsRef}), "\n";
  print "All data field labels: ", (map { "<$_> " } @SortedLabels), "\n";

  # Labels are split into two groups only when some compounds lack some labels...
  if (!$AllLabelsInAllCmpds) {
    @LabelsInSome = grep { $LabelStatusMapRef->{$_} eq "PresentInSome" } @SortedLabels;
    print "Data field labels present in all compounds: ", (map { "<$_> " } @LabelsInAll), "\n";
    print "Data field labels present in some compounds: ", (map { "<$_> " } @LabelsInSome), "\n";
  }

  # Set up value count listings: empty data field values count followed by
  # numerical data values count...
  my @Listings = ();
  if ($OptionsInfo{CountEmptyData}) {
    push @Listings, [
      ["Number of non-empty values for data field(s)", "NonEmptyFieldValuesCountMap"],
      ["Number of empty values for data field(s)", "EmptyFieldValuesCountMap"],
    ];
  }
  if ($OptionsInfo{CheckData}) {
    push @Listings, [
      ["Number of non-numerical values for data field(s)", "NonNumericalFieldValuesCountMap"],
      ["Number of numerical values for data field(s)", "NumericalFieldValuesCountMap"],
    ];
  }

  for my $ListingRef (@Listings) {
    print "\n";
    if ($AllLabelsInAllCmpds) {
      for my $EntryRef (@{$ListingRef}) {
        my($Title, $CountMapName) = @{$EntryRef};
        PrintDataInformation($Title, $FieldLabelsRef, \%{$SDCmpdsInfo{$CountMapName}});
      }
    }
    else {
      for my $EntryRef (@{$ListingRef}) {
        my($Title, $CountMapName) = @{$EntryRef};
        PrintDataInformation($Title . " present in all compounds", \@LabelsInAll, \%{$SDCmpdsInfo{$CountMapName}});
      }
      for my $EntryRef (@{$ListingRef}) {
        my($Title, $CountMapName) = @{$EntryRef};
        PrintDataInformation($Title . " present in some compounds", \@LabelsInSome, \%{$SDCmpdsInfo{$CountMapName}});
      }
    }
    print "\n";
  }
}
# List data information as a single line: the information label followed by a
# comma delimited list of " <Label> - Value" entries...
#
# Arguments: information label, reference to an array of data labels and
# reference to a hash mapping data labels to values. A label without a defined
# value in the hash is listed with a value of 0.
#
sub PrintDataInformation {
  my($InfoLabel, $DataLabelRef, $DataLabelToValueMapRef) = @_;

  my @LabelAndValueEntries = map {
    my $Value = $DataLabelToValueMapRef->{$_};
    " <$_> - " . (defined($Value) ? $Value : 0);
  } @{$DataLabelRef};

  print "$InfoLabel: ", join(",", @LabelAndValueEntries), "\n";
}
| 442 | |
# List total number of compounds and total size for all the files marked okay;
# nothing is listed unless there are at least two such files...
sub ListTotalSizeOfFiles {
  my @OkayFileIndices = grep { $SDFilesInfo{FileOkay}[$_] } (0 .. $#SDFilesList);
  my $OkayFilesCount = scalar @OkayFileIndices;

  my $CombinedSize = 0;
  for my $FileIndex (@OkayFileIndices) {
    $CombinedSize += $SDFilesInfo{FileSize}[$FileIndex];
  }

  if ($OkayFilesCount > 1) {
    print "\nTotal number of compounds in $OkayFilesCount SD files: $SDCmpdsInfo{TotalCmpdCount}\n";
    print "\nTotal size of $OkayFilesCount SD files: ", FormatFileSize($CombinedSize), "\n";
  }
}
| 462 | |
# Retrieve information about SD files...
#
# Resets %SDCmpdsInfo and %SDFilesInfo. Then, for each file in the file list,
# records whether it is okay to process along with its size and last
# modification time. A file which doesn't exist, is not a SD file or can't be
# opened is reported with a warning and left marked as not okay.
#
sub RetrieveSDFilesInfo {
  %SDCmpdsInfo = ();

  %SDFilesInfo = ();
  @{$SDFilesInfo{FileOkay}} = ();
  @{$SDFilesInfo{FileSize}} = ();
  @{$SDFilesInfo{FileLastModified}} = ();

  FILELIST: for my $Index (0 .. $#SDFilesList) {
    my $SDFile = $SDFilesList[$Index];

    $SDFilesInfo{FileOkay}[$Index] = 0;
    $SDFilesInfo{FileSize}[$Index] = 0;
    $SDFilesInfo{FileLastModified}[$Index] = '';

    if (!(-e $SDFile)) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($SDFile, "sdf sd")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    # Make sure the file can be opened for reading. Three argument open with a
    # lexical handle treats the file name strictly as a name to read...
    my $SDFileHandle;
    if (!open($SDFileHandle, '<', $SDFile)) {
      warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
      next FILELIST;
    }
    close $SDFileHandle;

    $SDFilesInfo{FileOkay}[$Index] = 1;
    $SDFilesInfo{FileSize}[$Index] = FileSize($SDFile);

    my($ModifiedTimeString, $ModifiedDateString) = FormattedFileModificationTimeAndDate($SDFile);
    $SDFilesInfo{FileLastModified}[$Index] = "$ModifiedTimeString; $ModifiedDateString";
  }
}
| 500 | |
# Process option values retrieved from the command line into %OptionsInfo...
sub ProcessOptions {
  %OptionsInfo = ();

  # Flag options: the command line option name is the lower case form of the
  # processed option name; unspecified options are set to 0...
  for my $OptionName (qw(All Chiral Count DataCheck Empty Fields InvalidAtomNumbers Mismatch Salts UnknownAtoms)) {
    $OptionsInfo{$OptionName} = $Options{lc $OptionName} || 0;
  }

  $OptionsInfo{Detail} = $Options{detail};

  # Returns 1 when any one of the named command line options is specified...
  my $IsAnySpecified = sub {
    my(@Names) = @_;
    return (grep { $Options{$_} } @Names) ? 1 : 0;
  };

  # Detailed processing of compounds is needed for everything except count...
  $OptionsInfo{ProcessCmpdInfo} = $IsAnySpecified->(qw(all chiral empty fields invalidatomnumbers mismatch salts unknownatoms datacheck));

  # Data field information is needed...
  $OptionsInfo{ProcessCmpdData} = $IsAnySpecified->(qw(all fields empty datacheck));

  $OptionsInfo{CountEmptyData} = $IsAnySpecified->(qw(all empty));
  $OptionsInfo{CheckData} = $IsAnySpecified->(qw(all datacheck));
}
| 525 | |
# Setup script usage and retrieve command line arguments specified using various
# options; change to working directory, when specified, and validate detail
# level...
sub SetupScriptUsage {
  my @OptionSpecs = (
    "all|a", "count|c", "chiral", "datacheck", "detail|d:i", "empty|e", "fields|f",
    "help|h", "invalidatomnumbers|i", "mismatch|m", "salts|s", "unknownatoms|u",
    "workingdir|w=s",
  );

  # Setup default and retrieve all the options...
  %Options = ();
  $Options{detail} = 1;

  unless (GetOptions(\%Options, @OptionSpecs)) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }

  if ($Options{workingdir}) {
    unless (-d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  # Valid detail levels are 1 to 3...
  unless ($Options{detail} > 0 && $Options{detail} <= 3) {
    die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Possible values: 1 to 3\n";
  }
}
| 545 | |
| 546 __END__ | |
| 547 | |
| 548 =head1 NAME | |
| 549 | |
| 550 InfoSDFiles.pl - List information about SDFile(s) | |
| 551 | |
| 552 =head1 SYNOPSIS | |
| 553 | |
| 554 InfoSDFiles.pl SDFile(s)... | |
| 555 | |
| 556 InfoSDFiles.pl [B<-a, --all>] [B<-c, --count>] [B<--chiral>] [B<--datacheck>] | |
| 557 [B<-d, --detail> infolevel] [B<-e, --empty>] [B<-f, --fields>] [B<-h, --help>] | |
| 558 [B<-i, --invalidatomnumbers>] [B<-m, --mismatch>] [B<-s, --salts>] [B<-u, --unknownatoms>] | |
| 559 [B<-w, --workingdir> dirname] SDFile(s)... | |
| 560 | |
| 561 =head1 DESCRIPTION | |
| 562 | |
| 563 List information about I<SDFile(s)> contents: number of compounds, empty records | |
| 564 and so on. Multiple SDFile names are separated by spaces. The valid file extensions | |
| 565 are I<.sdf> and I<.sd>. All other file names are ignored. All the SD files in the current | |
| 566 directory can be specified either by I<*.sdf> or the current directory name. | |
| 567 | |
| 568 =head1 OPTIONS | |
| 569 | |
| 570 =over 4 | |
| 571 | |
| 572 =item B<-a, --all> | |
| 573 | |
| 574 List all the available information. | |
| 575 | |
| 576 =item B<-c, --count> | |
| 577 | |
| 578 List number of compounds. This is B<default behavior>. | |
| 579 | |
| 580 =item B<--chiral> | |
| 581 | |
| 582 List number of atom/bond blocks for compounds with chiral flag set in | |
| 583 count line. | |
| 584 | |
| 585 =item B<-d, --detail> I<infolevel> | |
| 586 | |
| 587 Level of information to print. Default: 1. Possible values: I<1, 2, or 3>. | |
| 588 | |
| 589 =item B<--datacheck> | |
| 590 | |
| 591 List number of numerical and non-numerical values for each data field. | |
| 592 | |
| 593 =item B<-e, --empty> | |
| 594 | |
| 595 List number of empty atom/bond blocks and data fields for compounds. | |
| 596 | |
| 597 =item B<-f, --fields> | |
| 598 | |
| 599 List data field labels present for compounds. | |
| 600 | |
| 601 =item B<-h, --help> | |
| 602 | |
| 603 Print this help message. | |
| 604 | |
| 605 =item B<-i, --invalidatomnumbers> | |
| 606 | |
| 607 List number of bond blocks and atom property blocks for compounds which contain invalid atom numbers. | |
| 608 | |
| 609 =item B<-m, --mismatch> | |
| 610 | |
| 611 List number of atom/bond blocks for compounds which don't match with counts | |
| 612 line information in header block. | |
| 613 | |
| 614 =item B<-s, --salts> | |
| 615 | |
| 616 List number of atom blocks for compounds which contain salts identified as | |
| 617 disconnected structural units. | |
| 618 | |
| 619 =item B<-u, --unknownatoms> | |
| 620 | |
| 621 List number of atom blocks for compounds which contain special atom symbols | |
| 622 such as L, Q, *, LP, X, R#, or any other non-periodic table symbols. | |
| 623 | |
| 624 =item B<-w, --workingdir> I<dirname> | |
| 625 | |
| 626 Location of working directory. Default: current directory. | |
| 627 | |
| 628 =back | |
| 629 | |
| 630 =head1 EXAMPLES | |
| 631 | |
| 632 To count compounds in SD file(s), type: | |
| 633 | |
| 634 % InfoSDFiles.pl Sample1.sdf | |
| 635 % InfoSDFiles.pl Sample1.sdf Sample2.sdf | |
| 636 % InfoSDFiles.pl *.sdf | |
| 637 | |
| 638 To list all available information for SD file(s), type: | |
| 639 | |
| 640 % InfoSDFiles.pl -a *.sdf | |
| 641 | |
| 642 To list all data fields present in Sample.sdf, type: | |
| 643 | |
| 644 % InfoSDFiles.pl -f Sample.sdf | |
| 645 | |
| 646 To count number of compounds which contain salts and list associated structural | |
| 647 data, type: | |
| 648 | |
| 649 % InfoSDFiles.pl -s -d 3 Sample.sdf | |
| 650 | |
| 651 =head1 AUTHOR | |
| 652 | |
| 653 Manish Sud <msud@san.rr.com> | |
| 654 | |
| 655 =head1 SEE ALSO | |
| 656 | |
| 657 ExtractFromSDFiles.pl, FilterSDFiles.pl, MergeTextFilesWithSD.pl | |
| 658 | |
| 659 =head1 COPYRIGHT | |
| 660 | |
| 661 Copyright (C) 2015 Manish Sud. All rights reserved. | |
| 662 | |
| 663 This file is part of MayaChemTools. | |
| 664 | |
| 665 MayaChemTools is free software; you can redistribute it and/or modify it under | |
| 666 the terms of the GNU Lesser General Public License as published by the Free | |
| 667 Software Foundation; either version 3 of the License, or (at your option) | |
| 668 any later version. | |
| 669 | |
| 670 =cut |
