Mercurial > repos > deepakjadmin > r_caret_test
diff mayachemtool/mayachemtools/bin/InfoTextFiles.pl @ 0:68300206e90d draft default tip
Uploaded
author | deepakjadmin |
---|---|
date | Thu, 05 Nov 2015 02:41:30 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mayachemtool/mayachemtools/bin/InfoTextFiles.pl Thu Nov 05 02:41:30 2015 -0500 @@ -0,0 +1,596 @@ +#!/usr/bin/perl -w +# +# $RCSfile: InfoTextFiles.pl,v $ +# $Date: 2015/02/28 20:46:20 $ +# $Revision: 1.30 $ +# +# Author: Manish Sud <msud@san.rr.com> +# +# Copyright (C) 2015 Manish Sud. All rights reserved. +# +# This file is part of MayaChemTools. +# +# MayaChemTools is free software; you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 3 of the License, or (at your option) any +# later version. +# +# MayaChemTools is distributed in the hope that it will be useful, but without +# any warranty; without even the implied warranty of merchantability of fitness +# for a particular purpose. See the GNU Lesser General Public License for more +# details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or +# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, +# Boston, MA, 02111-1307, USA. +# + +use strict; +use FindBin; use lib "$FindBin::Bin/../lib"; +use Getopt::Long; +use File::Basename; +use Text::ParseWords; +use Benchmark; +use FileUtil; +use TextUtil; + +my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime); + +# Autoflush STDOUT +$| = 1; + +# Starting message... +$ScriptName = basename($0); +print "\n$ScriptName: Starting...\n\n"; +$StartTime = new Benchmark; + +# Get the options and setup script... +SetupScriptUsage(); +if ($Options{help} || @ARGV < 1) { + die GetUsageFromPod("$FindBin::Bin/$ScriptName"); +} + +my(@TextFilesList); +@TextFilesList = ExpandFileNames(\@ARGV, "csv tsv"); + +# Process options... +print "Processing options...\n"; +my(%OptionsInfo); +ProcessOptions(); + +print "Checking input text file(s)...\n"; +my(%TextFilesInfo); +RetrieveTextFilesInfo(); +ProcessColumnsInfo(); + +# Generate output files... +my($FileIndex); +if (@TextFilesList > 1) { + print "\nProcessing text files...\n"; +} +for $FileIndex (0 .. $#TextFilesList) { + if ($TextFilesInfo{FileOkay}[$FileIndex]) { + print "\nProcessing file $TextFilesList[$FileIndex]...\n"; + ListTextFileInfo($FileIndex); + } +} +ListTotalSizeOfFiles(); + +print "\n$ScriptName:Done...\n\n"; + +$EndTime = new Benchmark; +$TotalTime = timediff ($EndTime, $StartTime); +print "Total time: ", timestr($TotalTime), "\n"; + +############################################################################### + +# List appropriate information... +sub ListTextFileInfo { + my($Index) = @_; + my($TextFile, $Line, $InDelim, $LineCount, $EmptyLinesCount, $EmptyColDataLinesCount, $GreaterThanMaxColLinesCount, $Label, $Value, $ColNum, $EmptyColValueFound, $PrintTextLine, $NonNumericalDataFound, @ColLabels, @LineWords, %EmptyColValuesCountMap, %NonEmptyColValuesCountMap, %SpecifiedNonNumericalColValuesCountMap, %NonNumericalColValuesCountMap, %NumericalColValuesCountMap,); + + $TextFile = $TextFilesList[$Index]; + $InDelim = $TextFilesInfo{InDelim}[$Index]; + @ColLabels = @{$TextFilesInfo{ColLabels}[$Index]}; + + open TEXTFILE, "$TextFile" or die "Error: Can't open $TextFile: $! \n"; + + $LineCount = 0; + $EmptyLinesCount = 0; + $EmptyColDataLinesCount = 0; + $GreaterThanMaxColLinesCount = 0; + + %EmptyColValuesCountMap = (); + %NonEmptyColValuesCountMap = (); + %SpecifiedNonNumericalColValuesCountMap = (); + %NonNumericalColValuesCountMap = (); + %NumericalColValuesCountMap = (); + + if ($OptionsInfo{ParseLines}) { + # Skip over column labels from old file... + if (<TEXTFILE>) { + $LineCount++; + LINE: while ($Line = <TEXTFILE>) { + $LineCount++; + $PrintTextLine = 0; + $Line =~ s/(\r\n)|(\r)|\n//g; + @LineWords = quotewords($InDelim, 0, $Line); + if ($OptionsInfo{CountEmpty}) { + # Count lines with no data... + if (!@LineWords) { + $EmptyLinesCount++; + if ($OptionsInfo{DetailLevel} >= 2) { + print "Line number $LineCount is empty...\n"; + } + next LINE; + } + # Count lines with empty data for some columns... + $EmptyColValueFound = 0; + VALUE: for $Value (@LineWords) { + if (!IsNotEmpty($Value)) { + $EmptyColValueFound = 1; + next VALUE; + } + } + if ($EmptyColValueFound) { + $EmptyColDataLinesCount++; + if ($OptionsInfo{DetailLevel} >= 2) { + print "Line number $LineCount contains empty column value(s)...\n"; + } + $PrintTextLine = ($OptionsInfo{DetailLevel} >= 3) ? 1 : 0; + } + # Count lines with columns greater than the column label line... + if (@LineWords > @ColLabels) { + $GreaterThanMaxColLinesCount++; + if ($OptionsInfo{DetailLevel} >= 2) { + print "Line number $LineCount contains more than ", scalar(@ColLabels), " columns...\n"; + } + $PrintTextLine = ($OptionsInfo{DetailLevel} >= 3) ? 1 : 0; + } + # Count empty values for each coulmn... + for $ColNum (0 .. $#LineWords) { + if ($ColNum < @ColLabels) { + $Label = $ColLabels[$ColNum]; + if (IsNotEmpty($LineWords[$ColNum])) { + if (exists($NonEmptyColValuesCountMap{$Label})) { + $NonEmptyColValuesCountMap{$Label} += 1; + } + else { + $NonEmptyColValuesCountMap{$Label} = 1; + } + } + else { + $PrintTextLine = ($OptionsInfo{DetailLevel} >= 3) ? 1 : 0; + if (exists($EmptyColValuesCountMap{$Label})) { + $EmptyColValuesCountMap{$Label} += 1; + } + else { + $EmptyColValuesCountMap{$Label} = 1; + } + } + } + } + } + if ($OptionsInfo{CheckData}) { + for $ColNum (0 .. $#LineWords) { + if ($ColNum < @ColLabels) { + if (IsNumerical($LineWords[$ColNum])) { + $Label = $ColLabels[$ColNum]; + if (exists($NumericalColValuesCountMap{$Label})) { + $NumericalColValuesCountMap{$Label} += 1; + } + else { + $NumericalColValuesCountMap{$Label} = 1; + } + } + else { + $Label = $ColLabels[$ColNum]; + if (IsNotEmpty($LineWords[$ColNum])) { + if (exists($NonNumericalColValuesCountMap{$Label})) { + $NonNumericalColValuesCountMap{$Label} += 1; + } + else { + $NonNumericalColValuesCountMap{$Label} = 1; + } + } + } + } + } + } + if ($OptionsInfo{CheckNumericalData}) { + $NonNumericalDataFound = 0; + for $ColNum (@{$TextFilesInfo{NumericalDataColNums}[$Index]}) { + if ($ColNum < @LineWords) { + if (!IsNumerical($LineWords[$ColNum])) { + $NonNumericalDataFound = 1; + $Label = $ColLabels[$ColNum]; + if (exists($SpecifiedNonNumericalColValuesCountMap{$Label})) { + $SpecifiedNonNumericalColValuesCountMap{$Label} += 1; + } + else { + $SpecifiedNonNumericalColValuesCountMap{$Label} = 1; + } + } + } + } + if ($NonNumericalDataFound) { + $PrintTextLine = ($OptionsInfo{DetailLevel} >= 3) ? 1 : 0; + if ($OptionsInfo{DetailLevel} >=2 ) { + print "Line number $LineCount contains non-numerical data for some specified column(s)...\n"; + } + } + } + if ($PrintTextLine) { + print "Line $LineCount: $Line\n\n"; + } + } + } + } + else { + while (<TEXTFILE>) { + $LineCount++; + } + } + close TEXTFILE; + + print "\nNumber of lines: $LineCount\n"; + print "Number of columns: $TextFilesInfo{ColCount}[$Index]\n"; + print "Column labels: ", JoinWords(\@ColLabels, ", ", 1), "\n"; + + if ($OptionsInfo{CountEmpty}) { + print "\nNumber of lines with no data: $EmptyLinesCount\n"; + print "Number of lines with some missing column data: $EmptyColDataLinesCount\n"; + print "Number of lines containing greater than ", scalar(@ColLabels), " columns: $GreaterThanMaxColLinesCount\n"; + PrintDataInformation("Number of non-empty values for each column(s)", \@ColLabels, \%NonEmptyColValuesCountMap); + PrintDataInformation("Number of empty values for each column(s)", \@ColLabels, \%EmptyColValuesCountMap); + } + + if ($OptionsInfo{CheckData}) { + print "\n"; + PrintDataInformation("Number of non-numerical data values for each column(s)", \@ColLabels, \%NonNumericalColValuesCountMap); + PrintDataInformation("Number of numerical data values for each column(s)", \@ColLabels, \%NumericalColValuesCountMap); + print "\n"; + } + + if ($OptionsInfo{CheckNumericalData} && @{$TextFilesInfo{NumericalDataColLabels}[$Index]}) { + PrintDataInformation("Number of non-numerical data values for each column(s)", \@{$TextFilesInfo{NumericalDataColLabels}[$Index]}, \%SpecifiedNonNumericalColValuesCountMap); + } + + # File size and modification information... + print "\nFile size: ", FormatFileSize($TextFilesInfo{FileSize}[$Index]), " \n"; + print "Last modified: ", $TextFilesInfo{FileLastModified}[$Index], " \n"; +} + +# Total size of all the fiels... +sub ListTotalSizeOfFiles { + my($FileOkayCount, $TotalSize, $Index); + + $FileOkayCount = 0; + $TotalSize = 0; + + for $Index (0 .. $#TextFilesList) { + if ($TextFilesInfo{FileOkay}[$Index]) { + $FileOkayCount++; + $TotalSize += $TextFilesInfo{FileSize}[$Index]; + } + } + if ($FileOkayCount > 1) { + print "\nTotal size of $FileOkayCount files: ", FormatFileSize($TotalSize), "\n"; + } +} + +# List data information... +sub PrintDataInformation { + my($InfoLabel, $DataLabelRef, $DataLabelToValueMapRef) = @_; + my($Line, $Label); + + $Line = ""; + for $Label (@{$DataLabelRef}) { + $Line .= " \"$Label\" - " . (exists($DataLabelToValueMapRef->{$Label}) ? $DataLabelToValueMapRef->{$Label} : 0) . ","; + } + $Line =~ s/\,$//g; + print "$InfoLabel: $Line\n"; +} + +# Retrieve information about input text files... +sub RetrieveTextFilesInfo { + my($Index, $TextFile, $FileDir, $FileName, $FileExt, $InDelim, $Line, @ColLabels, $ColNum, $ColLabel, $ModifiedTimeString, $ModifiedDateString); + + %TextFilesInfo = (); + @{$TextFilesInfo{FileOkay}} = (); + @{$TextFilesInfo{ColCount}} = (); + @{$TextFilesInfo{ColLabels}} = (); + @{$TextFilesInfo{ColLabelToNumMap}} = (); + @{$TextFilesInfo{InDelim}} = (); + @{$TextFilesInfo{FileSize}} = (); + @{$TextFilesInfo{FileLastModified}} = (); + + FILELIST: for $Index (0 .. $#TextFilesList) { + $TextFile = $TextFilesList[$Index]; + + $TextFilesInfo{FileOkay}[$Index] = 0; + $TextFilesInfo{ColCount}[$Index] = 0; + $TextFilesInfo{InDelim}[$Index] = ""; + $TextFilesInfo{FileSize}[$Index] = 0; + $TextFilesInfo{FileLastModified}[$Index] = ''; + @{$TextFilesInfo{ColLabels}[$Index]} = (); + %{$TextFilesInfo{ColLabelToNumMap}[$Index]} = (); + + if (!(-e $TextFile)) { + warn "Warning: Ignoring file $TextFile: It doesn't exist\n"; + next FILELIST; + } + if (!CheckFileType($TextFile, "csv tsv")) { + warn "Warning: Ignoring file $TextFile: It's not a csv or tsv file\n"; + next FILELIST; + } + ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile); + if ($FileExt =~ /^tsv$/i) { + $InDelim = "\t"; + } + else { + $InDelim = "\,"; + if ($OptionsInfo{InDelim} !~ /^(comma|semicolon)$/i) { + warn "Warning: Ignoring file $TextFile: The value specified, $OptionsInfo{InDelim}, for option \"--indelim\" is not valid for csv files\n"; + next FILELIST; + } + if ($OptionsInfo{InDelim} =~ /^semicolon$/i) { + $InDelim = "\;"; + } + } + + if (!open TEXTFILE, "$TextFile") { + warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n"; + next FILELIST; + } + + $Line = GetTextLine(\*TEXTFILE); + @ColLabels = quotewords($InDelim, 0, $Line); + close TEXTFILE; + + $TextFilesInfo{FileOkay}[$Index] = 1; + $TextFilesInfo{InDelim}[$Index] = $InDelim; + + $TextFilesInfo{ColCount}[$Index] = @ColLabels; + push @{$TextFilesInfo{ColLabels}[$Index]}, @ColLabels; + for $ColNum (0 .. $#ColLabels) { + $ColLabel = $ColLabels[$ColNum]; + $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel} = $ColNum; + } + $TextFilesInfo{FileSize}[$Index] = FileSize($TextFile); + ($ModifiedTimeString, $ModifiedDateString) = FormattedFileModificationTimeAndDate($TextFile); + $TextFilesInfo{FileLastModified}[$Index] = "$ModifiedTimeString; $ModifiedDateString"; + } + +} + +# Make sure specified numerical data columns are okay... +sub ProcessColumnsInfo { + my($Index, $TextFile); + + @{$TextFilesInfo{NumericalDataColNums}} = (); + @{$TextFilesInfo{NumericalDataColLabels}} = (); + + FILELIST: for $Index (0 .. $#TextFilesList) { + $TextFile = $TextFilesList[$Index]; + @{$TextFilesInfo{NumericalDataColNums}[$Index]} = (); + @{$TextFilesInfo{NumericalDataColLabels}[$Index]} = (); + + if ($TextFilesInfo{FileOkay}[$Index]) { + my($SpecifiedColNum, $ColNum, $ColLabel, @SpecifiedColNums, @SpecifiedColLabels); + @SpecifiedColNums = (); + if ($OptionsInfo{Mode} =~ /^colnum$/i) { + for $SpecifiedColNum (@{$OptionsInfo{SpecifiedNumericalDataCols}}) { + if ($SpecifiedColNum <= $TextFilesInfo{ColCount}[$Index]) { + $ColNum = $SpecifiedColNum - 1; + push @SpecifiedColNums, $ColNum; + push @SpecifiedColLabels, $TextFilesInfo{ColLabels}[$Index][$ColNum]; + } + } + } + else { + for $ColLabel (@{$OptionsInfo{SpecifiedNumericalDataCols}}) { + if (exists($TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel})) { + $ColNum = $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel}; + push @SpecifiedColNums, $ColNum; + push @SpecifiedColLabels, $ColLabel; + } + } + } + if (@SpecifiedColNums) { + push @{$TextFilesInfo{NumericalDataColNums}[$Index]}, @SpecifiedColNums; + push @{$TextFilesInfo{NumericalDataColLabels}[$Index]}, @SpecifiedColLabels; + } + } + } +} + +# Process option values... +sub ProcessOptions { + %OptionsInfo = (); + + $OptionsInfo{Mode} = $Options{mode}; + + $OptionsInfo{All} = $Options{all} ? $Options{all} : 0; + $OptionsInfo{Count} = $Options{count} ? $Options{count} : 0; + + $OptionsInfo{DetailLevel} = $Options{detail} ? $Options{detail} : 1; + + $OptionsInfo{Empty} = $Options{empty} ? $Options{empty} : 0; + + $OptionsInfo{InDelim} = $Options{indelim}; + $OptionsInfo{NumericalDataCols} = $Options{numericaldatacols} ? $Options{numericaldatacols} : 0; + + $OptionsInfo{ParseLines} = ($Options{all} || $Options{empty} || $Options{numericaldatacols}) ? 1 : 0; + $OptionsInfo{CountEmpty} = ($Options{all} || $Options{empty}) ? 1 : 0; + $OptionsInfo{CheckData} = ($Options{all} || $Options{datacheck}) ? 1 : 0; + $OptionsInfo{CheckNumericalData} = ($Options{all} || $Options{numericaldatacols}) ? 1 : 0; + + @{$OptionsInfo{SpecifiedNumericalDataCols}} = (); + if ($Options{numericaldatacols}) { + @{$OptionsInfo{SpecifiedNumericalDataCols}} = split ",", $Options{numericaldatacols}; + if ($Options{mode} =~ /^colnum$/i) { + my($ColNum); + for $ColNum (@{$OptionsInfo{SpecifiedNumericalDataCols}}) { + if (!IsPositiveInteger($ColNum)) { + die "Error: Invalid value $ColNum specified using \"--numericaldatacols\" option: Allowed values: > 0\n"; + } + } + } + } + +} + +# Setup script usage and retrieve command line arguments specified using various options... +sub SetupScriptUsage { + + # Retrieve all the options... + %Options = (); + $Options{detail} = 1; + $Options{mode} = "colnum"; + $Options{indelim} = "comma"; + if (!GetOptions(\%Options, "all|a", "count|c", "datacheck", "detail|d=i", "empty|e", "help|h", "indelim=s", "mode|m=s", "numericaldatacols|n=s", "workingdir|w=s")) { + die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n"; + } + if ($Options{workingdir}) { + if (! -d $Options{workingdir}) { + die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n"; + } + chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n"; + } + if ($Options{mode} !~ /^(colnum|collabel)$/i) { + die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: colnum or collabel\n"; + } + if ($Options{indelim} !~ /^(comma|semicolon)$/i) { + die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n"; + } + if (!IsPositiveInteger($Options{detail})) { + die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n"; + } +} + +__END__ + +=head1 NAME + +InfoTextFiles.pl - List information about TextFile(s) + +=head1 SYNOPSIS + +InfoTextFiles.pl TextFile(s)... + +InfoTextFiles.pl [B<-a, --all>] [B<-c, --count>] [B<--datacheck>] [B<-d, --detail> infolevel] [B<-e, --empty>] +[B<-h, --help>] [B<--indelim> comma | semicolon] [B<-m, --mode> colnum | collabel] +[B<-n, --numericaldatacols> colnum,[colnum,...] | collabel,[collabel,...]] +[B<-w, --workingdir> dirname] TextFile(s)... + +=head1 DESCRIPTION + +List information about I<TextFile(s)> contents: number of lines and columns, empty +column values, and so on. The file names are separated by spaces. +The valid file extensions are I<.csv> and I<.tsv> for comma/semicolon and tab delimited +text files respectively. All other file names are ignored. All the text files in a +current directory can be specified by I<*.csv>, I<*.tsv>, or the current directory +name. The B<--indelim> option determines the format of I<TextFile(s)>. Any file +which doesn't correspond to the format indicated by B<--indelim> option is ignored. + +=head1 OPTIONS + +=over 4 + +=item B<-a, --all> + +List all the available information. + +=item B<-c, --count> + +List number of rows and columns. This is B<default behavior>. + +=item B<--datacheck> + +List number of numerical and non-numerical values for each column. + +=item B<-d, --detail> I<infolevel> + +Level of information to print about lines being ignored. Default: I<1>. Possible values: +I<1, 2 or 3>. + +=item B<-e, --empty> + +List number of empty row and column values. + +=item B<-h, --help> + +Print this help message. + +=item B<--indelim> I<comma | semicolon> + +Input delimiter for CSV I<TextFile(s)>. Possible values: I<comma or semicolon>. +Default value: I<comma>. For TSV files, this option is ignored and I<tab> is used as a +delimiter. + +=item B<-m, --mode> I<colnum | collabel> + +Specify how to identify numerical data columns: using column number or column label. +Possible values: I<colnum or collabel>. Default value: I<colnum>. + +=item B<-n, --numericaldatacols> I<colnum,[colnum,...] | collabel,[collabel,...]> + +This value is mode specific. It is a list of column number or labels to check for +presence of numerical data only; otherwise, the value is flagged. Default value: I<all;all;...>. + +For I<colnum> mode, input value format is: I<colnum,...;colnum,...;...>. Example: + + 1,3,5 + "2,4,6" + +For I<collabel> mode, input value format is: I<collabel,...;collabel,...;...>. Example: + + "MW,SumNO,SumNHOH" + + +=item B<-w, --workingdir> I<dirname> + +Location of working directory. Default: current directory. + +=back + +=head1 EXAMPLES + +To count number of lines and columns in Text file(s), type: + + % InfoTextFiles.pl Sample1.csv + % InfoTextFiles.pl Sample1.csv Sample1.tsv + % InfoTextFiles.pl *.csv *.tsv + +To count number of lines, columns and empty values in Sample1.csv file and print +detailed information, type: + + % InfoTextFiles.pl -d 3 -e Sample1.csv + +To track all available information and non-numerical values for Mol_ID and MolWeight +columns in Sample1.csv file and print detailed information, type: + + % InfoTextFiles.pl -d 3 -a -m collabel -n Mol_ID,MolWeight Sample1.csv + +=head1 AUTHOR + +Manish Sud <msud@san.rr.com> + +=head1 SEE ALSO + +JoinTextFiles.pl, MergeTextFilesWithSD.pl, ModifyTextFilesFormat.pl, SplitTextFiles.pl, TextFilesToHTML.pl + +=head1 COPYRIGHT + +Copyright (C) 2015 Manish Sud. All rights reserved. + +This file is part of MayaChemTools. + +MayaChemTools is free software; you can redistribute it and/or modify it under +the terms of the GNU Lesser General Public License as published by the Free +Software Foundation; either version 3 of the License, or (at your option) +any later version. + +=cut