Mercurial > repos > deepakjadmin > r_caret_test
diff mayachemtool/mayachemtools/bin/InfoSDFiles.pl @ 0:68300206e90d draft default tip
Uploaded
author | deepakjadmin |
---|---|
date | Thu, 05 Nov 2015 02:41:30 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mayachemtool/mayachemtools/bin/InfoSDFiles.pl Thu Nov 05 02:41:30 2015 -0500 @@ -0,0 +1,670 @@ +#!/usr/bin/perl -w +# +# $RCSfile: InfoSDFiles.pl,v $ +# $Date: 2015/02/28 20:46:20 $ +# $Revision: 1.35 $ +# +# Author: Manish Sud <msud@san.rr.com> +# +# Copyright (C) 2015 Manish Sud. All rights reserved. +# +# This file is part of MayaChemTools. +# +# MayaChemTools is free software; you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 3 of the License, or (at your option) any +# later version. +# +# MayaChemTools is distributed in the hope that it will be useful, but without +# any warranty; without even the implied warranty of merchantability of fitness +# for a particular purpose. See the GNU Lesser General Public License for more +# details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or +# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, +# Boston, MA, 02111-1307, USA. +# + +use strict; +use FindBin; use lib "$FindBin::Bin/../lib"; +use Getopt::Long; +use File::Basename; +use Benchmark; +use SDFileUtil; +use TextUtil; +use FileUtil; + +my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime); + +# Autoflush STDOUT +$| = 1; + +# Starting message... +$ScriptName = basename $0; +print "\n$ScriptName:Starting...\n\n"; +$StartTime = new Benchmark; + +# Get the options and setup script... +SetupScriptUsage(); +if ($Options{help} || @ARGV < 1) { + die GetUsageFromPod("$FindBin::Bin/$ScriptName"); +} + +my(@SDFilesList); +@SDFilesList = ExpandFileNames(\@ARGV, "sdf sd"); + +# Process options... +print "Processing options...\n"; +my(%OptionsInfo); +ProcessOptions(); + +# Setup information about input files... 
print "Checking input SD file(s)...\n";

# File-scoped state shared by the subroutines below:
#   %SDFilesInfo - per-file arrays (FileOkay, FileSize, FileLastModified),
#                  indexed like @SDFilesList.
#   %SDCmpdsInfo - running compound count and data field bookkeeping.
my(%SDFilesInfo, %SDCmpdsInfo);
RetrieveSDFilesInfo();
InitializeSDCmpdsInfo();

# Process input files..
if (@SDFilesList > 1) {
    print "\nProcessing SD files...\n";
}
for my $FileIndex (0 .. $#SDFilesList) {
    # Files rejected by RetrieveSDFilesInfo() are skipped silently here; a
    # warning was already issued for them.
    next unless $SDFilesInfo{FileOkay}[$FileIndex];

    print "\nProcessing file $SDFilesList[$FileIndex]...\n";
    ListSDFileInfo($FileIndex);
}
ListTotalSizeOfFiles();

print "\n$ScriptName:Done...\n\n";

# Direct method call instead of the indirect-object form "new Benchmark".
$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# List appropriate information for the SD file at position $Index in
# @SDFilesList.
#
# Detailed compound checks are run when any option requiring them was given
# ($OptionsInfo{ProcessCmpdInfo}); otherwise only the compounds are counted.
# In both cases the file size and last modification stamp collected by
# RetrieveSDFilesInfo() are printed afterwards.
sub ListSDFileInfo {
    my($Index) = @_;

    if ($OptionsInfo{ProcessCmpdInfo}) {
        ListCompoundDetailsInfo($Index);
    }
    else {
        ListCompoundCountInfo($Index);
    }

    # File size and modification information...
    print "\nFile size: ", FormatFileSize($SDFilesInfo{FileSize}[$Index]), " \n";
    print "Last modified: ", $SDFilesInfo{FileLastModified}[$Index], " \n";
}

# List number of compounds in SD file...
#
# Counts the lines starting with the "$$$$" record delimiter in the file at
# position $Index in @SDFilesList, adds that count to
# $SDCmpdsInfo{TotalCmpdCount} and prints it. Dies if the file can't be opened.
sub ListCompoundCountInfo {
    my($Index) = @_;

    my $SDFile = $SDFilesList[$Index];
    my $CmpdCount = 0;

    # Three-argument open: the file name is never parsed for a mode prefix or
    # pipe character and surrounding whitespace is kept as part of the name.
    open my $SDFileHandle, '<', $SDFile or die "Couldn't open $SDFile: $! \n";
    while (my $Line = <$SDFileHandle>) {
        $CmpdCount++ if $Line =~ /^\$\$\$\$/;
    }
    close $SDFileHandle;

    $SDCmpdsInfo{TotalCmpdCount} += $CmpdCount;

    print "\nNumber of compounds: $CmpdCount\n";
}

# List detailed compound information...
# Walks through every compound record of the SD file at position $Index in
# @SDFilesList, runs the checks selected in %OptionsInfo and prints one summary
# line per selected check.
#
# Side effects: resets the data field bookkeeping in %SDCmpdsInfo for this
# file, adds the number of compounds read to $SDCmpdsInfo{TotalCmpdCount} and
# writes to STDOUT. Dies if the file can't be opened.
sub ListCompoundDetailsInfo {
    my($Index) = @_;

    my $SDFile = $SDFilesList[$Index];
    my $DetailLevel = $OptionsInfo{Detail};

    # "--all" switches on every individual check.
    my $CheckEmpty = $OptionsInfo{All} || $OptionsInfo{Empty};
    my $CheckMismatch = $OptionsInfo{All} || $OptionsInfo{Mismatch};
    my $CheckChiral = $OptionsInfo{All} || $OptionsInfo{Chiral};
    my $CheckUnknownAtoms = $OptionsInfo{All} || $OptionsInfo{UnknownAtoms};
    my $CheckInvalidAtomNumbers = $OptionsInfo{All} || $OptionsInfo{InvalidAtomNumbers};
    my $CheckSalts = $OptionsInfo{All} || $OptionsInfo{Salts};

    my($CmpdCount, $EmptyCtabBlocksCount, $MismatchCtabBlockCount, $ChiralCtabBlockCount, $UnknownAtomsCtabBlockCount, $InvalidAtomNumbersCtabBlockCount, $SaltsCtabBlockCount) = (0) x 7;

    InitializeSDCmpdsInfo();

    my $PrintCmpdCounterHeader = 1;

    # Three-argument open with a lexical handle: the file name is taken
    # literally. ReadCmpdString() receives a glob reference, as it did before.
    open my $SDFileHandle, '<', $SDFile or die "Couldn't open $SDFile: $! \n";
    while (my $CmpdString = ReadCmpdString($SDFileHandle)) {
        $CmpdCount++;
        my $ProblematicCmpdData = 0;

        # Progress indicator every 5000 compounds; only at the lowest detail
        # level so that it doesn't get interleaved with per-compound messages.
        if ($DetailLevel <= 1 && ($CmpdCount % 5000) == 0) {
            if ($PrintCmpdCounterHeader) {
                $PrintCmpdCounterHeader = 0;
                print "Processing compounds:";
            }
            print "$CmpdCount...";
        }

        my @CmpdLines = split "\n", $CmpdString;
        my $CtabLinesCount = GetCtabLinesCount(\@CmpdLines);

        if ($CheckEmpty && $CtabLinesCount <= 0) {
            $EmptyCtabBlocksCount++;
            $ProblematicCmpdData = 1;
        }

        if ($CtabLinesCount > 0) {
            # The counts line is the fourth line of a compound record.
            my($AtomCount, $BondCount, $ChiralFlag) = ParseCmpdCountsLine($CmpdLines[3]);
            my $CountsMatch = ($CtabLinesCount == ($AtomCount + $BondCount));

            if ($CheckMismatch && !$CountsMatch) {
                $MismatchCtabBlockCount++;
                $ProblematicCmpdData = 1;
                if ($DetailLevel >= 2) {
                    print "\nMismatch found: Ctab lines count: $CtabLinesCount; Atoms count: $AtomCount; Bond count: $BondCount\n";
                }
            }

            # A set chiral flag is counted but not treated as a problem.
            if ($CheckChiral && $ChiralFlag == 1) {
                $ChiralCtabBlockCount++;
            }

            # The remaining checks are run only when the number of atom/bond
            # lines agrees with the counts line.
            if ($CountsMatch) {
                if ($CheckUnknownAtoms) {
                    my($UnknownAtomCount, $UnknownAtoms, $UnknownAtomLines) = GetUnknownAtoms(\@CmpdLines);
                    if ($UnknownAtomCount) {
                        $UnknownAtomsCtabBlockCount++;
                        $ProblematicCmpdData = 1;
                        if ($DetailLevel >= 2) {
                            print "\nUnknown atom(s) found: $UnknownAtomCount\nUnknown atom(s) symbols:$UnknownAtoms\nUnknown atom(s) data lines:\n$UnknownAtomLines\n";
                        }
                    }
                }
                if ($CheckInvalidAtomNumbers) {
                    my($InvalidAtomNumbersCount, $InvalidAtomNumbers, $InvalidAtomNumberLines) = GetInvalidAtomNumbers(\@CmpdLines);
                    if ($InvalidAtomNumbersCount) {
                        $InvalidAtomNumbersCtabBlockCount++;
                        $ProblematicCmpdData = 1;
                        if ($DetailLevel >= 2) {
                            print "\nInvalid atom number(s) found: $InvalidAtomNumbersCount\nInvalid atom number(s):$InvalidAtomNumbers\nInvalid atom number(s) data lines:\n$InvalidAtomNumberLines\n";
                        }
                    }
                }
                if ($CheckSalts) {
                    my($FragmentsCount, $Fragments) = GetCmpdFragments(\@CmpdLines);
                    # More than one fragment is reported as a salt.
                    if ($FragmentsCount > 1) {
                        $SaltsCtabBlockCount++;
                        $ProblematicCmpdData = 1;
                        if ($DetailLevel >= 2) {
                            print "\nSalts found: $FragmentsCount\nSalts atom numbers:\n$Fragments\n";
                        }
                    }
                }
            }
        }

        if ($OptionsInfo{ProcessCmpdData}) {
            ProcessCmpdInfo(\@CmpdLines, $CmpdCount);
        }

        # At the highest detail level, dump the whole record of any compound
        # flagged by one of the checks above.
        if ($DetailLevel >= 3 && $ProblematicCmpdData) {
            print "\nCompound data:\n$CmpdString\n\n";
        }
    }

    # Terminate the progress indicator line, if one was started.
    if ($DetailLevel <= 1 && !$PrintCmpdCounterHeader) {
        print "\n";
    }
    close $SDFileHandle;

    $SDCmpdsInfo{TotalCmpdCount} += $CmpdCount;

    print "\nNumber of compounds: $CmpdCount\n";

    if ($CheckEmpty) {
        print "Number of empty atom/bond blocks: $EmptyCtabBlocksCount\n";
    }
    if ($CheckMismatch) {
        print "Number of mismatched atom/bond blocks: $MismatchCtabBlockCount\n";
    }
    if ($CheckUnknownAtoms) {
        print "Number of atom blocks with unknown atom labels: $UnknownAtomsCtabBlockCount\n";
    }
    if ($CheckInvalidAtomNumbers) {
        print "Number of bond blocks and atom property blocks with invalid atom numbers: $InvalidAtomNumbersCtabBlockCount\n";
    }
    if ($CheckSalts) {
        print "Number of atom blocks containing salts: $SaltsCtabBlockCount\n";
    }
    if ($CheckChiral) {
        print "Number of chiral atom/bond blocks: $ChiralCtabBlockCount\n";
    }
    if ($OptionsInfo{ProcessCmpdData}) {
        PrintCmpdInfoSummary();
    }
}

# Initialize compound data information for a SD file...
#
# The running total of compounds is created once and then left alone, so it
# accumulates over all files; the data field label list and the per-label
# counters are emptied on every call.
sub InitializeSDCmpdsInfo {
    $SDCmpdsInfo{TotalCmpdCount} = 0 unless exists $SDCmpdsInfo{TotalCmpdCount};

    @{$SDCmpdsInfo{FieldLabels}} = ();
    for my $MapName (qw(FieldLabelsMap NonEmptyFieldValuesCountMap EmptyFieldValuesCountMap NonNumericalFieldValuesCountMap NumericalFieldValuesCountMap)) {
        %{$SDCmpdsInfo{$MapName}} = ();
    }
}

# Process compound data header labels and figure out which ones are present for
# all the compounds...
#
# Parameters:
#   $CmpdLinesRef - reference to the array of lines of one compound record.
#   $CmpdCount    - position of the compound in the current file, starting
#                   at 1.
#
# Updates $SDCmpdsInfo{FieldLabels} and $SDCmpdsInfo{FieldLabelsMap}, where a
# label maps to "PresentInAll" or "PresentInSome". Depending on
# $OptionsInfo{CountEmptyData} and $OptionsInfo{CheckData}, it also updates the
# per-label counters of empty/non-empty and numerical/non-numerical values.
sub ProcessCmpdInfo {
    my($CmpdLinesRef, $CmpdCount) = @_;

    my @CmpdFieldLabels = GetCmpdDataHeaderLabels($CmpdLinesRef);

    # An empty label list alone doesn't identify the first compound: earlier
    # compounds might simply have had no data fields. Labels first seen after
    # such compounds are not present in all compounds, so the compound number
    # is consulted as well.
    my $SeenEarlierCmpds = (defined($CmpdCount) && $CmpdCount > 1) || @{$SDCmpdsInfo{FieldLabels}};

    if ($SeenEarlierCmpds) {
        my %CmpdFieldLabelsMap = map { $_ => 1 } @CmpdFieldLabels;

        # Known labels missing from this compound are no longer present in all...
        for my $Label (@{$SDCmpdsInfo{FieldLabels}}) {
            if (!exists $CmpdFieldLabelsMap{$Label}) {
                $SDCmpdsInfo{FieldLabelsMap}{$Label} = "PresentInSome";
            }
        }
        # ...and labels showing up for the first time were missing from the
        # earlier compounds.
        for my $Label (@CmpdFieldLabels) {
            if (!exists $SDCmpdsInfo{FieldLabelsMap}{$Label}) {
                push @{$SDCmpdsInfo{FieldLabels}}, $Label;
                $SDCmpdsInfo{FieldLabelsMap}{$Label} = "PresentInSome";
            }
        }
    }
    else {
        # First compound: its labels are, so far, present in all compounds.
        @{$SDCmpdsInfo{FieldLabels}} = @CmpdFieldLabels;
        for my $Label (@CmpdFieldLabels) {
            $SDCmpdsInfo{FieldLabelsMap}{$Label} = "PresentInAll";
        }
    }

    if ($OptionsInfo{CountEmptyData} || $OptionsInfo{CheckData}) {
        my %DataFieldAndValues = GetCmpdDataHeaderLabelsAndValues($CmpdLinesRef);

        for my $Label (keys %DataFieldAndValues) {
            my $Value = $DataFieldAndValues{$Label};

            # Count empty data field values...
            if ($OptionsInfo{CountEmptyData}) {
                if (IsNotEmpty($Value)) {
                    $SDCmpdsInfo{NonEmptyFieldValuesCountMap}{$Label} += 1;
                }
                else {
                    # Use the processed detail level like the rest of the file.
                    if ($OptionsInfo{Detail} >= 2) {
                        print "Compound record $CmpdCount: Empty data field <$Label>\n";
                    }
                    $SDCmpdsInfo{EmptyFieldValuesCountMap}{$Label} += 1;
                }
            }

            # Count numerical data field values...
            if ($OptionsInfo{CheckData}) {
                if (IsNumerical($Value)) {
                    $SDCmpdsInfo{NumericalFieldValuesCountMap}{$Label} += 1;
                }
                else {
                    $SDCmpdsInfo{NonNumericalFieldValuesCountMap}{$Label} += 1;
                }
            }
        }
    }
}

# Print compound summary...
# Prints the data field summary collected in %SDCmpdsInfo for the current
# file: number of fields, all labels in sorted order and, when not every label
# is present in all compounds, the labels split into "present in all" and
# "present in some". Depending on $OptionsInfo{CountEmptyData} and
# $OptionsInfo{CheckData}, the per-label value counts follow.
sub PrintCmpdInfoSummary {
    if (!@{$SDCmpdsInfo{FieldLabels}}) {
        print "\nNumber of data fields: 0\n";
        return;
    }

    my @SortedLabels = sort keys %{$SDCmpdsInfo{FieldLabelsMap}};
    my @FieldLabelsPresentInAll = grep { $SDCmpdsInfo{FieldLabelsMap}{$_} eq "PresentInAll" } @SortedLabels;
    my @FieldLabelsPresentInSome = grep { $SDCmpdsInfo{FieldLabelsMap}{$_} eq "PresentInSome" } @SortedLabels;

    print "\nNumber of data fields: ", scalar(@{$SDCmpdsInfo{FieldLabels}}), "\n";
    print "All data field labels: ";
    print map { "<$_> " } @SortedLabels;
    print "\n";

    my $AllLabelsShared = (@FieldLabelsPresentInAll == @{$SDCmpdsInfo{FieldLabels}});

    if (!$AllLabelsShared) {
        print "Data field labels present in all compounds: ";
        print map { "<$_> " } @FieldLabelsPresentInAll;
        print "\n";
        print "Data field labels present in some compounds: ";
        print map { "<$_> " } @FieldLabelsPresentInSome;
        print "\n";
    }

    # Each entry: text appended to the report title and the labels to report.
    # When every label is shared, labels are listed in the order they were
    # first seen; otherwise the two sorted groups are reported one after the
    # other.
    my @LabelGroups = $AllLabelsShared
        ? ( [ "", \@{$SDCmpdsInfo{FieldLabels}} ] )
        : ( [ " present in all compounds", \@FieldLabelsPresentInAll ],
            [ " present in some compounds", \@FieldLabelsPresentInSome ] );

    # List empty data field values count...
    if ($OptionsInfo{CountEmptyData}) {
        print "\n";
        for my $Group (@LabelGroups) {
            my($TitleSuffix, $LabelsRef) = @{$Group};
            PrintDataInformation("Number of non-empty values for data field(s)$TitleSuffix", $LabelsRef, \%{$SDCmpdsInfo{NonEmptyFieldValuesCountMap}});
            PrintDataInformation("Number of empty values for data field(s)$TitleSuffix", $LabelsRef, \%{$SDCmpdsInfo{EmptyFieldValuesCountMap}});
        }
        print "\n";
    }

    # List numerical data values count...
    if ($OptionsInfo{CheckData}) {
        print "\n";
        for my $Group (@LabelGroups) {
            my($TitleSuffix, $LabelsRef) = @{$Group};
            PrintDataInformation("Number of non-numerical values for data field(s)$TitleSuffix", $LabelsRef, \%{$SDCmpdsInfo{NonNumericalFieldValuesCountMap}});
            PrintDataInformation("Number of numerical values for data field(s)$TitleSuffix", $LabelsRef, \%{$SDCmpdsInfo{NumericalFieldValuesCountMap}});
        }
        print "\n";
    }
}

# List data information...
#
# Prints one line of the form "InfoLabel:  <Label> - Count, <Label> - Count".
#
# Parameters:
#   $InfoLabel              - text printed in front of the counts.
#   $DataLabelRef           - reference to the array of labels to list, in
#                             output order.
#   $DataLabelToValueMapRef - reference to a hash of label to count; labels
#                             without an entry are listed with a count of 0.
sub PrintDataInformation {
    my($InfoLabel, $DataLabelRef, $DataLabelToValueMapRef) = @_;

    my @Entries = map {
        " <$_> - " . (exists($DataLabelToValueMapRef->{$_}) ? $DataLabelToValueMapRef->{$_} : 0)
    } @{$DataLabelRef};

    print "$InfoLabel: ", join(",", @Entries), "\n";
}

# Total size of all the files...
#
# Nothing is printed unless more than one input file passed the checks in
# RetrieveSDFilesInfo(); then the accumulated number of compounds and the
# combined size of those files are listed.
sub ListTotalSizeOfFiles {
    my $FileOkayCount = 0;
    my $TotalSize = 0;

    for my $Index (0 .. $#SDFilesList) {
        next unless $SDFilesInfo{FileOkay}[$Index];
        $FileOkayCount++;
        $TotalSize += $SDFilesInfo{FileSize}[$Index];
    }

    if ($FileOkayCount > 1) {
        print "\nTotal number of compounds in $FileOkayCount SD files: $SDCmpdsInfo{TotalCmpdCount}\n";
        print "\nTotal size of $FileOkayCount SD files: ", FormatFileSize($TotalSize), "\n";
    }
}

# Retrieve information about SD files...
#
# Empties %SDCmpdsInfo and rebuilds %SDFilesInfo with three arrays indexed like
# @SDFilesList: FileOkay (1 when the file exists, has a SD file extension and
# can be opened for reading; 0 otherwise), FileSize and FileLastModified. Files
# failing a check are reported with a warning and keep the default values.
sub RetrieveSDFilesInfo {
    %SDCmpdsInfo = ();

    %SDFilesInfo = ();
    @{$SDFilesInfo{FileOkay}} = ();
    @{$SDFilesInfo{FileSize}} = ();
    @{$SDFilesInfo{FileLastModified}} = ();

    FILELIST: for my $Index (0 .. $#SDFilesList) {
        $SDFilesInfo{FileOkay}[$Index] = 0;
        $SDFilesInfo{FileSize}[$Index] = 0;
        $SDFilesInfo{FileLastModified}[$Index] = '';

        my $SDFile = $SDFilesList[$Index];
        if (!(-e $SDFile)) {
            warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
            next FILELIST;
        }
        if (!CheckFileType($SDFile, "sdf sd")) {
            warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
            next FILELIST;
        }
        # Opened only to verify that the file is readable. Three-argument open
        # keeps the file name from being parsed for a mode or pipe character.
        my $SDFileHandle;
        if (!open $SDFileHandle, '<', $SDFile) {
            warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
            next FILELIST;
        }
        close $SDFileHandle;

        $SDFilesInfo{FileOkay}[$Index] = 1;
        $SDFilesInfo{FileSize}[$Index] = FileSize($SDFile);
        my($ModifiedTimeString, $ModifiedDateString) = FormattedFileModificationTimeAndDate($SDFile);
        $SDFilesInfo{FileLastModified}[$Index] = "$ModifiedTimeString; $ModifiedDateString";
    }
}

# Process option values...
#
# Copies the command line switches from %Options into %OptionsInfo, using 0 for
# switches which were not specified, and derives the combined flags which
# decide how much work is done for each file.
sub ProcessOptions {
    %OptionsInfo = ();

    # Each key corresponds to the command line option of the same name in
    # lower case.
    for my $Name (qw(All Chiral Count DataCheck Empty Fields InvalidAtomNumbers Mismatch Salts UnknownAtoms)) {
        $OptionsInfo{$Name} = $Options{lc $Name} || 0;
    }

    $OptionsInfo{Detail} = $Options{detail};

    # Anything beyond plain counting requires reading the compound records.
    $OptionsInfo{ProcessCmpdInfo} = ($Options{all} || $Options{chiral} || $Options{empty} || $Options{fields} || $Options{invalidatomnumbers} || $Options{mismatch} || $Options{salts} || $Options{unknownatoms} || $Options{datacheck}) ? 1 : 0;

    # Options which need the data fields of the compounds to be examined.
    $OptionsInfo{ProcessCmpdData} = ($Options{all} || $Options{fields} || $Options{empty} || $Options{datacheck}) ? 1 : 0;

    $OptionsInfo{CountEmptyData} = ($Options{all} || $Options{empty}) ? 1 : 0;
    $OptionsInfo{CheckData} = ($Options{all} || $Options{datacheck}) ? 1 : 0;
}

# Setup script usage and retrieve command line arguments specified using various options...
#
# Fills %Options from @ARGV, changes to the working directory when one is
# specified, and dies with an explanatory message on an unknown option, a
# working directory which isn't a directory, or a detail level outside 1 to 3.
sub SetupScriptUsage {

    # Setup default and retrieve all the options...
    %Options = ();
    $Options{detail} = 1;

    # "detail|d:i" takes an optional integer; Getopt::Long stores 0 when the
    # value is left out, which is rejected by the range check below.
    my @OptionSpecs = ("all|a", "count|c", "chiral", "datacheck", "detail|d:i", "empty|e", "fields|f", "help|h", "invalidatomnumbers|i", "mismatch|m", "salts|s", "unknownatoms|u", "workingdir|w=s");
    if (!GetOptions(\%Options, @OptionSpecs)) {
        die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
    }

    if ($Options{workingdir}) {
        if (! -d $Options{workingdir}) {
            die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
        }
        chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
    }

    if ($Options{detail} <= 0 || $Options{detail} > 3) {
        die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Possible values: 1 to 3\n";
    }
}

__END__

=head1 NAME

InfoSDFiles.pl - List information about SDFile(s)

=head1 SYNOPSIS

InfoSDFiles.pl SDFile(s)...

InfoSDFiles.pl [B<-a, --all>] [B<-c, --count>] [B<--chiral>] [B<--datacheck>]
[B<-d, --detail> infolevel] [B<-e, --empty>] [B<-f, --fields>] [B<-h, --help>]
[B<-i, --invalidatomnumbers>] [B<-m, --mismatch>] [B<-s, --salts>] [B<-u, --unknownatoms>]
[B<-w, --workingdir> dirname] SDFile(s)...

=head1 DESCRIPTION

List information about I<SDFile(s)> contents: number of compounds, empty records
and so on. Multiple SDFile names are separated by spaces. The valid file extensions
are I<.sdf> and I<.sd>. All other file names are ignored. All the SD files in a current
directory can be specified either by I<*.sdf> or the current directory name.

=head1 OPTIONS

=over 4

=item B<-a, --all>

List all the available information.

=item B<-c, --count>

List number of compounds. This is B<default behavior>.

=item B<--chiral>

List number of atom/bond blocks for compounds with chiral flag set in
counts line.

=item B<-d, --detail> I<infolevel>

Level of information to print. Default: 1. Possible values: I<1, 2, or 3>.

=item B<--datacheck>

List number of numerical and non-numerical values for each data field.

=item B<-e, --empty>

List number of empty atom/bond blocks and data fields for compounds.

=item B<-f, --fields>

List data field labels present for compounds.

=item B<-h, --help>

Print this help message.

=item B<-i, --invalidatomnumbers>

List number of bond blocks and atom property blocks for compounds which contain
invalid atom numbers.

=item B<-m, --mismatch>

List number of atom/bond blocks for compounds which don't match with counts
line information in header block.

=item B<-s, --salts>

List number of atom blocks for compounds which contain salts identified as
disconnected structural units.

=item B<-u, --unknownatoms>

List number of atom blocks for compounds which contain special atom symbols
such as L, Q, *, LP, X, R#, or any other non periodic table symbols.

=item B<-w, --workingdir> I<dirname>

Location of working directory. Default: current directory.

=back

=head1 EXAMPLES

To count compounds in SD file(s), type:

    % InfoSDFiles.pl Sample1.sdf
    % InfoSDFiles.pl Sample1.sdf Sample2.sdf
    % InfoSDFiles.pl *.sdf

To list all available information for SD file(s), type:

    % InfoSDFiles.pl -a *.sdf

To list all data fields present in sample.sdf, type:

    % InfoSDFiles.pl -f Sample.sdf

To count number of compounds which contain salts and list associated structural
data, type:

    % InfoSDFiles.pl -s -d 3 Sample.sdf

=head1 AUTHOR

Manish Sud <msud@san.rr.com>

=head1 SEE ALSO

ExtractFromSDFiles.pl, FilterSDFiles.pl, MergeTextFilesWithSD.pl

=head1 COPYRIGHT

Copyright (C) 2015 Manish Sud. All rights reserved.

This file is part of MayaChemTools.

MayaChemTools is free software; you can redistribute it and/or modify it under
the terms of the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option)
any later version.

=cut