MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: SplitTextFiles.pl,v $
   4 # $Date: 2015/02/28 20:46:21 $
   5 # $Revision: 1.33 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2015 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability or fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use strict;
  30 use FindBin; use lib "$FindBin::Bin/../lib";
  31 use Getopt::Long;
  32 use File::Basename;
  33 use Text::ParseWords;
  34 use Benchmark;
  35 use FileUtil;
  36 use TextUtil;
  37 
  38 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  39 
  40 # Autoflush STDOUT
  41 $| = 1;
  42 
  43 # Starting message...
  44 $ScriptName = basename $0;
  45 print "\n$ScriptName:Starting...\n\n";
  46 $StartTime = new Benchmark;
  47 
  48 # Get the options and setup script...
  49 SetupScriptUsage();
  50 if ($Options{help} || @ARGV < 1) {
  51   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  52 }
  53 
  54 my(@TextFilesList);
  55 @TextFilesList = ExpandFileNames(\@ARGV, "csv tsv");
  56 
  57 # Process options...
  58 print "Processing options...\n";
  59 my(%OptionsInfo);
  60 ProcessOptions();
  61 
  62 print "Checking input text file(s)...\n";
  63 my(%TextFilesInfo);
  64 RetrieveTextFilesInfo();
  65 
  66 # Generate output files...
  67 my($FileIndex);
  68 if (@TextFilesList > 1) {
  69   print "\nProcessing text files...\n";
  70 }
  71 for $FileIndex (0 .. $#TextFilesList) {
  72   if ($TextFilesInfo{FileOkay}[$FileIndex]) {
  73     print "\nProcessing file $TextFilesList[$FileIndex]...\n";
  74     SplitTextFile($FileIndex);
  75   }
  76 }
  77 
  78 print "\n$ScriptName:Done...\n\n";
  79 
  80 $EndTime = new Benchmark;
  81 $TotalTime = timediff ($EndTime, $StartTime);
  82 print "Total time: ", timestr($TotalTime), "\n";
  83 
  84 ###############################################################################
  85 
  86 # Split a Text file...
  87 #
  88 sub SplitTextFile {
  89   my($FileIndex) = @_;
  90   my($TextFile, $LineCount, $MaxLinesPerFile, $MaxNumOfFiles);
  91 
  92   $TextFile = $TextFilesList[$FileIndex];
  93 
  94   if (!open TEXTFILE, "$TextFile") {
  95     warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
  96     return;
  97   }
  98 
  99   $MaxNumOfFiles = $OptionsInfo{NumOfFiles};
 100 
 101   # Count number of lines to figure out maximum number of lines per file...
 102   $LineCount = 0;
 103   while (<TEXTFILE>) {
 104       $LineCount++;
 105   }
 106   close TEXTFILE;
 107 
 108   if ($LineCount < $MaxNumOfFiles) {
 109     warn "Warning: Ignoring file $TextFile: Total number of lines, $LineCount, is smaller than\nnumber of new files, $MaxNumOfFiles\n";
 110     return;
 111   }
 112 
 113   $MaxLinesPerFile = int $LineCount / $MaxNumOfFiles;
 114 
 115   GenerateTextFiles($FileIndex, $MaxNumOfFiles, $MaxLinesPerFile);
 116 }
 117 
 118 # Generate new Text files...
 119 #
 120 sub GenerateTextFiles {
 121   my($FileIndex, $NumOfFiles, $NumOfLinesPerFile) = @_;
 122   my($TextFile, $LineCount, $NewFileIndex, $NewFileName, $MaxLinesCount, $InDelim, $OutDelim, $OutQuote, $ColLabelsLine, $Line, @ColLabels, @Words, @NewTextFilesList);
 123 
 124   # Setup new file names list...
 125   @NewTextFilesList = ();
 126   for $NewFileIndex (1 .. $NumOfFiles) {
 127     $NewFileName = $TextFilesInfo{OutFileRoot}[$FileIndex] . "Part${NewFileIndex}." . $TextFilesInfo{OutFileExt}[$FileIndex];
 128     if (!$OptionsInfo{OverwriteFiles}) {
 129       if (-e $NewFileName) {
 130         warn "Warning: Ignoring file $TextFile: New Text file, $NewFileName, already exists\n";
 131         return;
 132       }
 133     }
 134     push @NewTextFilesList, $NewFileName;
 135   }
 136 
 137   $TextFile = $TextFilesList[$FileIndex];
 138 
 139   if (!open TEXTFILE, "$TextFile") {
 140     warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
 141     return;
 142   }
 143 
 144   $InDelim = $TextFilesInfo{InDelim}[$FileIndex];
 145 
 146   $OutDelim = $OptionsInfo{OutDelim};
 147   $OutQuote = $OptionsInfo{OutQuote};
 148 
 149   $MaxLinesCount = $NumOfLinesPerFile;
 150   $LineCount = 0;
 151   $NewFileIndex = 1;
 152 
 153   open NEWTEXTFILE, ">$NewTextFilesList[$NewFileIndex - 1]" or die "Error: Can't open $NewTextFilesList[$NewFileIndex -1]: $! \n";
 154   print "Generating $NewTextFilesList[$NewFileIndex - 1] file...\n";
 155 
 156   if ($OptionsInfo{Label}) {
 157     if ($OptionsInfo{Fast}) {
 158       $ColLabelsLine = GetTextLine(\*TEXTFILE);
 159     }
 160     else {
 161       $Line = GetTextLine(\*TEXTFILE);
 162       @ColLabels = quotewords($InDelim, 0, $Line);
 163       $ColLabelsLine = JoinWords(\@ColLabels, $OutDelim, $OutQuote);
 164     }
 165     print NEWTEXTFILE "$ColLabelsLine\n";
 166   }
 167 
 168   while ($Line = GetTextLine(\*TEXTFILE)) {
 169     $LineCount++;
 170 
 171     if (!$Options{fast}) {
 172       @Words = quotewords($InDelim, 0, $Line);
 173       $Line = JoinWords(\@Words, $OutDelim, $OutQuote);
 174     }
 175     print NEWTEXTFILE "$Line\n";
 176 
 177     if ($NewFileIndex <= $NumOfFiles) {
 178       if ($LineCount >= $MaxLinesCount) {
 179         if ($NewFileIndex < $NumOfFiles) {
 180           close NEWTEXTFILE;
 181         }
 182         $NewFileIndex++;
 183         $MaxLinesCount = $NumOfLinesPerFile * $NewFileIndex;
 184 
 185         if ($NewFileIndex <= $NumOfFiles) {
 186           open NEWTEXTFILE, ">$NewTextFilesList[$NewFileIndex - 1]" or die "Error: Can't open $NewTextFilesList[$NewFileIndex -1]: $! \n";
 187           print "Generating $NewTextFilesList[$NewFileIndex - 1] file...\n";
 188 
 189           if ($OptionsInfo{Label}) {
 190             print NEWTEXTFILE "$ColLabelsLine\n";
 191           }
 192         }
 193       }
 194     }
 195   }
 196   close NEWTEXTFILE;
 197   close TEXTFILE;
 198 }
 199 
 200 # Retrieve information about Text files...
 201 sub RetrieveTextFilesInfo {
 202   my($Index, $TextFile, $InDelim, $FileDir, $FileName, $FileExt, $OutFileRoot, $OutFileExt);
 203 
 204   %TextFilesInfo = ();
 205   @{$TextFilesInfo{FileOkay}} = ();
 206   @{$TextFilesInfo{InDelim}} = ();
 207   @{$TextFilesInfo{OutFileRoot}} = ();
 208   @{$TextFilesInfo{OutFileExt}} = ();
 209 
 210   FILELIST: for $Index (0 .. $#TextFilesList) {
 211     $TextFilesInfo{FileOkay}[$Index] = 0;
 212     $TextFilesInfo{InDelim}[$Index] = "";
 213     $TextFilesInfo{OutFileRoot}[$Index] = "";
 214     $TextFilesInfo{OutFileExt}[$Index] = "";
 215 
 216     $TextFile = $TextFilesList[$Index];
 217     if (!(-e $TextFile)) {
 218       warn "Warning: Ignoring file $TextFile: It doesn't exist\n";
 219       next FILELIST;
 220     }
 221     if (!CheckFileType($TextFile, "csv tsv")) {
 222       warn "Warning: Ignoring file $TextFile: It's not a Text file\n";
 223       next FILELIST;
 224     }
 225     if (! open TEXTFILE, "$TextFile") {
 226       warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
 227       next FILELIST;
 228     }
 229     close TEXTFILE;
 230 
 231     ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
 232 
 233     # Setup input delimiter...
 234     $InDelim = '';
 235     if (!$OptionsInfo{Fast}) {
 236       if ($FileExt =~ /^tsv$/i) {
 237         $InDelim = "\t";
 238       }
 239       else {
 240         $InDelim = "\,";
 241         if ($OptionsInfo{InDelim} !~ /^(comma|semicolon)$/i) {
 242           warn "Warning: Ignoring file $TextFile: The value specified, $OptionsInfo{InDelim}, for option \"--indelim\" is not valid for csv files\n";
 243           next FILELIST;
 244         }
 245         if ($OptionsInfo{InDelim} =~ /^semicolon$/i) {
 246           $InDelim = "\;";
 247         }
 248       }
 249     }
 250 
 251     # Setup output file root...
 252     $OutFileExt = $OptionsInfo{Fast} ? $FileExt : (($Options{outdelim} =~ /^tab$/i ) ? "tsv" : "csv");
 253 
 254     if ($OptionsInfo{OutFileRoot} && (@TextFilesList == 1)) {
 255       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
 256       if ($RootFileName && $RootFileExt) {
 257         $FileName = $RootFileName;
 258       }
 259       else {
 260         $FileName = $OptionsInfo{OutFileRoot};
 261       }
 262       $OutFileRoot = $FileName;
 263     }
 264     else {
 265       $OutFileRoot = $FileName;
 266     }
 267 
 268     $TextFilesInfo{FileOkay}[$Index] = 1;
 269     $TextFilesInfo{InDelim}[$Index] = $InDelim;
 270     $TextFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
 271     $TextFilesInfo{OutFileExt}[$Index] = $OutFileExt;
 272   }
 273 }
 274 
 275 # Process option values...
 276 sub ProcessOptions {
 277 
 278   %OptionsInfo = ();
 279 
 280   $OptionsInfo{Fast} = defined $Options{fast} ? $Options{fast} : undef;
 281 
 282   $OptionsInfo{InDelim} = $Options{indelim};
 283   $OptionsInfo{Label} = ($Options{label} =~ /^yes$/i) ? 1 : 0;
 284 
 285   $OptionsInfo{NumOfFiles} = $Options{numfiles};
 286 
 287   $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : undef;
 288   $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? $Options{overwrite} : undef;
 289 
 290   $OptionsInfo{OutDelim} = ($Options{outdelim} =~ /^tab$/i ) ? "\t" : (($Options{outdelim} =~ /^semicolon$/i) ? "\;" : "\,");
 291   $OptionsInfo{OutQuote} = ($Options{quote} =~ /^yes$/i) ? 1 : 0;
 292 }
 293 
 294 # Setup script usage  and retrieve command line arguments specified using various options...
 295 sub SetupScriptUsage {
 296 
 297   # Retrieve all the options...
 298   %Options = ();
 299   $Options{label} = "yes";
 300   $Options{numfiles} = 2;
 301   $Options{indelim} = "comma";
 302   $Options{outdelim} = "comma";
 303   $Options{quote} = "yes";
 304   if (!GetOptions(\%Options, "fast|f", "help|h", "indelim=s", "label|l=s", "numfiles|n=i", "outdelim=s", "overwrite|o", "quote|q=s", "root|r=s", "workingdir|w=s")) {
 305     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 306   }
 307   if ($Options{workingdir}) {
 308     if (! -d $Options{workingdir}) {
 309       die "Error: The value specified, $Options{workingdir},  for option \"-w --workingdir\" is not a directory name.\n";
 310     }
 311     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 312   }
 313   if ($Options{numfiles} < 2) {
 314     die "Error: The value specified, $Options{numfiles},  for option \"-n --numfiles\" is not valid. Allowed values: >= 2 \n";
 315   }
 316   if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
 317     die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n";
 318   }
 319   if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
 320     die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
 321   }
 322   if ($Options{quote} !~ /^(yes|no)$/i) {
 323     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
 324   }
 325   if ($Options{label} !~ /^(yes|no)$/i) {
 326     die "Error: The value specified, $Options{label}, for option \"-l --label\" is not valid. Allowed values: yes or no\n";
 327   }
 328 }
 329