Mercurial > repos > deepakjadmin > mayatool3_test3
view mayachemtools/bin/InfoTextFiles.pl @ 9:ab29fa5c8c1f draft default tip
Uploaded
author | deepakjadmin |
---|---|
date | Thu, 15 Dec 2016 14:18:03 -0500 |
parents | 73ae111cf86f |
children |
line wrap: on
line source
#!/usr/bin/perl -w
#
# $RCSfile: InfoTextFiles.pl,v $
# $Date: 2015/02/28 20:46:20 $
# $Revision: 1.30 $
#
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2015 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use FileUtil;
use TextUtil;

my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@TextFilesList);
@TextFilesList = ExpandFileNames(\@ARGV, "csv tsv");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

print "Checking input text file(s)...\n";
my(%TextFilesInfo);
RetrieveTextFilesInfo();
ProcessColumnsInfo();

# List information for each usable text file...
if (@TextFilesList > 1) {
  print "\nProcessing text files...\n";
}
for my $FileIndex (0 .. $#TextFilesList) {
  if ($TextFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $TextFilesList[$FileIndex]...\n";
    ListTextFileInfo($FileIndex);
  }
}
ListTotalSizeOfFiles();

print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# List appropriate information for the text file at position $Index in
# @TextFilesList.
#
# Always prints the line count, column count, column labels, file size and
# modification time. Data lines are parsed only when $OptionsInfo{ParseLines}
# is set; the empty value, data check and specified numerical column reports
# are printed according to the CountEmpty, CheckData and CheckNumericalData
# flags. Dies if the file can't be opened.
#
sub ListTextFileInfo {
  my($Index) = @_;

  my $TextFile = $TextFilesList[$Index];
  my $InDelim = $TextFilesInfo{InDelim}[$Index];
  my @ColLabels = @{$TextFilesInfo{ColLabels}[$Index]};
  my $DetailLevel = $OptionsInfo{DetailLevel};

  # Three-argument open: the file name is never interpreted as a mode or pipe...
  open my $TextFileHandle, '<', $TextFile or die "Error: Can't open $TextFile: $! \n";

  my $LineCount = 0;
  my $EmptyLinesCount = 0;
  my $EmptyColDataLinesCount = 0;
  my $GreaterThanMaxColLinesCount = 0;

  my(%EmptyColValuesCountMap, %NonEmptyColValuesCountMap, %SpecifiedNonNumericalColValuesCountMap, %NonNumericalColValuesCountMap, %NumericalColValuesCountMap);

  if ($OptionsInfo{ParseLines}) {
    # Skip over the column labels line. Test for definedness, not truth, so
    # that a label line such as "0" is not mistaken for the end of the file...
    if (defined(my $ColLabelsLine = <$TextFileHandle>)) {
      $LineCount++;

      LINE: while (my $Line = <$TextFileHandle>) {
        $LineCount++;
        my $PrintTextLine = 0;

        $Line =~ s/(\r\n)|(\r)|\n//g;
        my @LineWords = quotewords($InDelim, 0, $Line);

        if ($OptionsInfo{CountEmpty}) {
          # Count lines with no data...
          if (!@LineWords) {
            $EmptyLinesCount++;
            if ($DetailLevel >= 2) {
              print "Line number $LineCount is empty...\n";
            }
            next LINE;
          }

          # Count lines with empty data for some columns; one empty value is
          # enough to flag the line...
          my $EmptyColValueFound = 0;
          VALUE: for my $Value (@LineWords) {
            if (!IsNotEmpty($Value)) {
              $EmptyColValueFound = 1;
              last VALUE;
            }
          }
          if ($EmptyColValueFound) {
            $EmptyColDataLinesCount++;
            if ($DetailLevel >= 2) {
              print "Line number $LineCount contains empty column value(s)...\n";
            }
            $PrintTextLine = ($DetailLevel >= 3) ? 1 : 0;
          }

          # Count lines with columns greater than the column label line...
          if (@LineWords > @ColLabels) {
            $GreaterThanMaxColLinesCount++;
            if ($DetailLevel >= 2) {
              print "Line number $LineCount contains more than ", scalar(@ColLabels), " columns...\n";
            }
            $PrintTextLine = ($DetailLevel >= 3) ? 1 : 0;
          }

          # Count empty and non-empty values for each labeled column; values
          # beyond the last column label are ignored...
          for my $ColNum (0 .. $#LineWords) {
            last if $ColNum >= @ColLabels;

            my $Label = $ColLabels[$ColNum];
            if (IsNotEmpty($LineWords[$ColNum])) {
              $NonEmptyColValuesCountMap{$Label}++;
            }
            else {
              $PrintTextLine = ($DetailLevel >= 3) ? 1 : 0;
              $EmptyColValuesCountMap{$Label}++;
            }
          }
        }

        if ($OptionsInfo{CheckData}) {
          # Classify each labeled column value as numerical or non-numerical;
          # empty values are counted as neither...
          for my $ColNum (0 .. $#LineWords) {
            last if $ColNum >= @ColLabels;

            my $Label = $ColLabels[$ColNum];
            if (IsNumerical($LineWords[$ColNum])) {
              $NumericalColValuesCountMap{$Label}++;
            }
            elsif (IsNotEmpty($LineWords[$ColNum])) {
              $NonNumericalColValuesCountMap{$Label}++;
            }
          }
        }

        if ($OptionsInfo{CheckNumericalData}) {
          # Flag non-numerical values in the columns specified by the user...
          my $NonNumericalDataFound = 0;
          for my $ColNum (@{$TextFilesInfo{NumericalDataColNums}[$Index]}) {
            next if $ColNum >= @LineWords;
            next if IsNumerical($LineWords[$ColNum]);

            $NonNumericalDataFound = 1;
            $SpecifiedNonNumericalColValuesCountMap{$ColLabels[$ColNum]}++;
          }
          if ($NonNumericalDataFound) {
            $PrintTextLine = ($DetailLevel >= 3) ? 1 : 0;
            if ($DetailLevel >= 2) {
              print "Line number $LineCount contains non-numerical data for some specified column(s)...\n";
            }
          }
        }

        if ($PrintTextLine) {
          print "Line $LineCount: $Line\n\n";
        }
      }
    }
  }
  else {
    # Only the number of lines is needed; read into a lexical to leave $_ alone...
    while (my $Line = <$TextFileHandle>) {
      $LineCount++;
    }
  }
  close $TextFileHandle;

  print "\nNumber of lines: $LineCount\n";
  print "Number of columns: $TextFilesInfo{ColCount}[$Index]\n";
  print "Column labels: ", JoinWords(\@ColLabels, ", ", 1), "\n";

  if ($OptionsInfo{CountEmpty}) {
    print "\nNumber of lines with no data: $EmptyLinesCount\n";
    print "Number of lines with some missing column data: $EmptyColDataLinesCount\n";
    print "Number of lines containing greater than ", scalar(@ColLabels), " columns: $GreaterThanMaxColLinesCount\n";
    PrintDataInformation("Number of non-empty values for each column(s)", \@ColLabels, \%NonEmptyColValuesCountMap);
    PrintDataInformation("Number of empty values for each column(s)", \@ColLabels, \%EmptyColValuesCountMap);
  }

  if ($OptionsInfo{CheckData}) {
    print "\n";
    PrintDataInformation("Number of non-numerical data values for each column(s)", \@ColLabels, \%NonNumericalColValuesCountMap);
    PrintDataInformation("Number of numerical data values for each column(s)", \@ColLabels, \%NumericalColValuesCountMap);
    print "\n";
  }

  if ($OptionsInfo{CheckNumericalData} && @{$TextFilesInfo{NumericalDataColLabels}[$Index]}) {
    PrintDataInformation("Number of non-numerical data values for each column(s)", $TextFilesInfo{NumericalDataColLabels}[$Index], \%SpecifiedNonNumericalColValuesCountMap);
  }

  # File size and modification information...
  print "\nFile size: ", FormatFileSize($TextFilesInfo{FileSize}[$Index]), " \n";
  print "Last modified: ", $TextFilesInfo{FileLastModified}[$Index], " \n";
}

# Print the total size of all the usable files; nothing is printed unless
# more than one file was okay...
sub ListTotalSizeOfFiles {
  my $FileOkayCount = 0;
  my $TotalSize = 0;

  for my $Index (0 .. $#TextFilesList) {
    next unless $TextFilesInfo{FileOkay}[$Index];

    $FileOkayCount++;
    $TotalSize += $TextFilesInfo{FileSize}[$Index];
  }

  if ($FileOkayCount > 1) {
    print "\nTotal size of $FileOkayCount files: ", FormatFileSize($TotalSize), "\n";
  }
}

# Print a one line summary of the form:
#
#   InfoLabel:  "Label1" - Count1, "Label2" - Count2,...
#
# $DataLabelRef is a reference to the list of labels to report, in order;
# $DataLabelToValueMapRef maps a label to its count. Labels missing from the
# map are reported with a count of 0.
#
sub PrintDataInformation {
  my($InfoLabel, $DataLabelRef, $DataLabelToValueMapRef) = @_;

  my $Line = join ",", map {
    " \"$_\" - " . (exists($DataLabelToValueMapRef->{$_}) ? $DataLabelToValueMapRef->{$_} : 0)
  } @{$DataLabelRef};

  print "$InfoLabel: $Line\n";
}

# Retrieve information about input text files: whether each file is usable,
# its delimiter, column labels, column label to column number map, size and
# modification time. Unusable files are reported using a warning and skipped...
sub RetrieveTextFilesInfo {
  %TextFilesInfo = ();
  @{$TextFilesInfo{FileOkay}} = ();
  @{$TextFilesInfo{ColCount}} = ();
  @{$TextFilesInfo{ColLabels}} = ();
  @{$TextFilesInfo{ColLabelToNumMap}} = ();
  @{$TextFilesInfo{InDelim}} = ();
  @{$TextFilesInfo{FileSize}} = ();
  @{$TextFilesInfo{FileLastModified}} = ();

  FILELIST: for my $Index (0 .. $#TextFilesList) {
    my $TextFile = $TextFilesList[$Index];

    # Start with defaults corresponding to an unusable file...
    $TextFilesInfo{FileOkay}[$Index] = 0;
    $TextFilesInfo{ColCount}[$Index] = 0;
    $TextFilesInfo{InDelim}[$Index] = "";
    $TextFilesInfo{FileSize}[$Index] = 0;
    $TextFilesInfo{FileLastModified}[$Index] = '';
    @{$TextFilesInfo{ColLabels}[$Index]} = ();
    %{$TextFilesInfo{ColLabelToNumMap}[$Index]} = ();

    if (!(-e $TextFile)) {
      warn "Warning: Ignoring file $TextFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($TextFile, "csv tsv")) {
      warn "Warning: Ignoring file $TextFile: It's not a csv or tsv file\n";
      next FILELIST;
    }

    # Tab for tsv files; comma or semicolon, as specified, for csv files...
    my($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
    my $InDelim;
    if ($FileExt =~ /^tsv$/i) {
      $InDelim = "\t";
    }
    else {
      $InDelim = ",";
      if ($OptionsInfo{InDelim} !~ /^(comma|semicolon)$/i) {
        warn "Warning: Ignoring file $TextFile: The value specified, $OptionsInfo{InDelim}, for option \"--indelim\" is not valid for csv files\n";
        next FILELIST;
      }
      if ($OptionsInfo{InDelim} =~ /^semicolon$/i) {
        $InDelim = ";";
      }
    }

    # Three-argument open: the file name is never interpreted as a mode or pipe...
    my $TextFileHandle;
    if (!open $TextFileHandle, '<', $TextFile) {
      warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
      next FILELIST;
    }
    my $Line = GetTextLine($TextFileHandle);
    close $TextFileHandle;

    # An empty file may not yield a column labels line; treat it as having no columns...
    my @ColLabels = defined($Line) ? quotewords($InDelim, 0, $Line) : ();

    $TextFilesInfo{FileOkay}[$Index] = 1;
    $TextFilesInfo{InDelim}[$Index] = $InDelim;

    $TextFilesInfo{ColCount}[$Index] = @ColLabels;
    push @{$TextFilesInfo{ColLabels}[$Index]}, @ColLabels;
    for my $ColNum (0 .. $#ColLabels) {
      $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabels[$ColNum]} = $ColNum;
    }

    $TextFilesInfo{FileSize}[$Index] = FileSize($TextFile);

    my($ModifiedTimeString, $ModifiedDateString) = FormattedFileModificationTimeAndDate($TextFile);
    $TextFilesInfo{FileLastModified}[$Index] = "$ModifiedTimeString; $ModifiedDateString";
  }
}

# Make sure specified numerical data columns are okay: for each usable file,
# collect zero-based column numbers and labels of the specified columns which
# are present in the file. Columns not present in a file are silently skipped...
sub ProcessColumnsInfo {
  @{$TextFilesInfo{NumericalDataColNums}} = ();
  @{$TextFilesInfo{NumericalDataColLabels}} = ();

  FILELIST: for my $Index (0 .. $#TextFilesList) {
    @{$TextFilesInfo{NumericalDataColNums}[$Index]} = ();
    @{$TextFilesInfo{NumericalDataColLabels}[$Index]} = ();

    next FILELIST unless $TextFilesInfo{FileOkay}[$Index];

    my(@SpecifiedColNums, @SpecifiedColLabels);

    if ($OptionsInfo{Mode} =~ /^colnum$/i) {
      # Specified column numbers start from 1...
      for my $SpecifiedColNum (@{$OptionsInfo{SpecifiedNumericalDataCols}}) {
        next if $SpecifiedColNum > $TextFilesInfo{ColCount}[$Index];

        my $ColNum = $SpecifiedColNum - 1;
        push @SpecifiedColNums, $ColNum;
        push @SpecifiedColLabels, $TextFilesInfo{ColLabels}[$Index][$ColNum];
      }
    }
    else {
      for my $ColLabel (@{$OptionsInfo{SpecifiedNumericalDataCols}}) {
        next unless exists($TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel});

        push @SpecifiedColNums, $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel};
        push @SpecifiedColLabels, $ColLabel;
      }
    }

    if (@SpecifiedColNums) {
      push @{$TextFilesInfo{NumericalDataColNums}[$Index]}, @SpecifiedColNums;
      push @{$TextFilesInfo{NumericalDataColLabels}[$Index]}, @SpecifiedColLabels;
    }
  }
}

# Process option values: copy command line options into %OptionsInfo and
# derive flags controlling what gets parsed and reported. Dies on an invalid
# column number specified for "--numericaldatacols" option in colnum mode...
sub ProcessOptions {
  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};

  $OptionsInfo{All} = $Options{all} ? $Options{all} : 0;
  $OptionsInfo{Count} = $Options{count} ? $Options{count} : 0;

  $OptionsInfo{DetailLevel} = $Options{detail} ? $Options{detail} : 1;

  $OptionsInfo{Empty} = $Options{empty} ? $Options{empty} : 0;

  $OptionsInfo{InDelim} = $Options{indelim};
  $OptionsInfo{NumericalDataCols} = $Options{numericaldatacols} ? $Options{numericaldatacols} : 0;

  # Data lines must be parsed for every report beyond the plain line count;
  # this includes "--datacheck", which otherwise reports 0 for all columns...
  $OptionsInfo{ParseLines} = ($Options{all} || $Options{empty} || $Options{datacheck} || $Options{numericaldatacols}) ? 1 : 0;

  $OptionsInfo{CountEmpty} = ($Options{all} || $Options{empty}) ? 1 : 0;
  $OptionsInfo{CheckData} = ($Options{all} || $Options{datacheck}) ? 1 : 0;
  $OptionsInfo{CheckNumericalData} = ($Options{all} || $Options{numericaldatacols}) ? 1 : 0;

  @{$OptionsInfo{SpecifiedNumericalDataCols}} = ();
  if ($Options{numericaldatacols}) {
    @{$OptionsInfo{SpecifiedNumericalDataCols}} = split ",", $Options{numericaldatacols};
    if ($Options{mode} =~ /^colnum$/i) {
      for my $ColNum (@{$OptionsInfo{SpecifiedNumericalDataCols}}) {
        if (!IsPositiveInteger($ColNum)) {
          die "Error: Invalid value $ColNum specified using \"--numericaldatacols\" option: Allowed values: > 0\n";
        }
      }
    }
  }
}

# Setup script usage and retrieve command line arguments specified using various options...
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();
  $Options{detail} = 1;
  $Options{mode} = "colnum";
  $Options{indelim} = "comma";

  if (!GetOptions(\%Options, "all|a", "count|c", "datacheck", "detail|d=i", "empty|e", "help|h", "indelim=s", "mode|m=s", "numericaldatacols|n=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }

  # Change to the working directory before any file names are expanded...
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  if ($Options{mode} !~ /^(colnum|collabel)$/i) {
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: colnum or collabel\n";
  }
  if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
    die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n";
  }
  if (!IsPositiveInteger($Options{detail})) {
    die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n";
  }
}

__END__

=head1 NAME

InfoTextFiles.pl - List information about TextFile(s)

=head1 SYNOPSIS

InfoTextFiles.pl TextFile(s)...

InfoTextFiles.pl [B<-a, --all>] [B<-c, --count>] [B<--datacheck>] [B<-d, --detail> infolevel] [B<-e, --empty>]
[B<-h, --help>] [B<--indelim> comma | semicolon] [B<-m, --mode> colnum | collabel]
[B<-n, --numericaldatacols> colnum,[colnum,...] | collabel,[collabel,...]]
[B<-w, --workingdir> dirname] TextFile(s)...

=head1 DESCRIPTION

List information about I<TextFile(s)> contents: number of lines and columns, empty
column values, and so on. The file names are separated by spaces.
The valid file extensions are I<.csv> and I<.tsv> for comma/semicolon and tab delimited
text files respectively. All other file names are ignored. All the text files in a
current directory can be specified by I<*.csv>, I<*.tsv>, or the current directory
name. The B<--indelim> option determines the format of I<TextFile(s)>. Any file
which doesn't correspond to the format indicated by B<--indelim> option is ignored.

=head1 OPTIONS

=over 4

=item B<-a, --all>

List all the available information.

=item B<-c, --count>

List number of rows and columns. This is B<default behavior>.

=item B<--datacheck>

List number of numerical and non-numerical values for each column.

=item B<-d, --detail> I<infolevel>

Level of information to print about lines being ignored. Default: I<1>. Possible values:
I<1, 2 or 3>.

=item B<-e, --empty>

List number of empty row and column values.

=item B<-h, --help>

Print this help message.

=item B<--indelim> I<comma | semicolon>

Input delimiter for CSV I<TextFile(s)>. Possible values: I<comma or semicolon>.
Default value: I<comma>. For TSV files, this option is ignored and I<tab> is used as a
delimiter.

=item B<-m, --mode> I<colnum | collabel>

Specify how to identify numerical data columns: using column number or column label.
Possible values: I<colnum or collabel>. Default value: I<colnum>.

=item B<-n, --numericaldatacols> I<colnum,[colnum,...] | collabel,[collabel,...]>

This value is mode specific. It is a comma delimited list of column numbers or labels to
check for presence of numerical data only; otherwise, the value is flagged. Default value:
I<none>; no columns are checked unless this option is specified. The same list is used for
all I<TextFile(s)>; any specified column not present in a file is ignored for that file.

For I<colnum> mode, input value format is: I<colnum,colnum,...>. Column numbers start
from 1. Example:

    1,3,5

For I<collabel> mode, input value format is: I<collabel,collabel,...>. Example:

    "MW,SumNO,SumNHOH"

=item B<-w, --workingdir> I<dirname>

Location of working directory. Default: current directory.

=back

=head1 EXAMPLES

To count number of lines and columns in Text file(s), type:

    % InfoTextFiles.pl Sample1.csv
    % InfoTextFiles.pl Sample1.csv Sample1.tsv
    % InfoTextFiles.pl *.csv *.tsv

To count number of lines, columns and empty values in Sample1.csv file and print
detailed information, type:

    % InfoTextFiles.pl -d 3 -e Sample1.csv

To track all available information and non-numerical values for Mol_ID and MolWeight
columns in Sample1.csv file and print detailed information, type:

    % InfoTextFiles.pl -d 3 -a -m collabel -n Mol_ID,MolWeight Sample1.csv

=head1 AUTHOR

Manish Sud <msud@san.rr.com>

=head1 SEE ALSO

JoinTextFiles.pl, MergeTextFilesWithSD.pl, ModifyTextFilesFormat.pl, SplitTextFiles.pl, TextFilesToHTML.pl

=head1 COPYRIGHT

Copyright (C) 2015 Manish Sud. All rights reserved.

This file is part of MayaChemTools.

MayaChemTools is free software; you can redistribute it and/or modify it under
the terms of the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option)
any later version.

=cut