#!/usr/bin/perl -w
#
# $RCSfile: SplitTextFiles.pl,v $
# $Date: 2015/02/28 20:46:21 $
# $Revision: 1.33 $
#
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2015 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose.  See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use FileUtil;
use TextUtil;

# File-level state shared by the subroutines below:
#   $ScriptName  - base name of this script, used in messages
#   %Options     - raw command line option values filled in by GetOptions
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName:Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@TextFilesList);
@TextFilesList = ExpandFileNames(\@ARGV, "csv tsv");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

print "Checking input text file(s)...\n";
my(%TextFilesInfo);
RetrieveTextFilesInfo();

# Generate output files...
if (@TextFilesList > 1) {
  print "\nProcessing text files...\n";
}
for my $FileIndex (0 .. $#TextFilesList) {
  # Files that failed the checks in RetrieveTextFilesInfo are skipped...
  if ($TextFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $TextFilesList[$FileIndex]...\n";
    SplitTextFile($FileIndex);
  }
}

print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Split a Text file...
#
# Takes the index of the file in @TextFilesList, counts its lines to work out
# how many lines go into each new file, and hands off to GenerateTextFiles.
# The file is ignored, with a warning, when it can't be opened or when it has
# fewer lines than the number of new files requested.
#
sub SplitTextFile {
  my($FileIndex) = @_;
  my($TextFile, $TextFileHandle, $LineCount, $MaxLinesPerFile, $MaxNumOfFiles);

  $TextFile = $TextFilesList[$FileIndex];

  # Three-argument open: the file name comes from the command line and must
  # never be interpreted as an open mode...
  if (!open $TextFileHandle, '<', $TextFile) {
    warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
    return;
  }

  $MaxNumOfFiles = $OptionsInfo{NumOfFiles};

  # Count number of lines to figure out maximum number of lines per file. The
  # count covers every line in the file, including the column labels line...
  $LineCount = 0;
  while (defined(my $CountedLine = <$TextFileHandle>)) {
    $LineCount++;
  }
  close $TextFileHandle;

  if ($LineCount < $MaxNumOfFiles) {
    warn "Warning: Ignoring file $TextFile: Total number of lines, $LineCount, is smaller than\nnumber of new files, $MaxNumOfFiles\n";
    return;
  }

  $MaxLinesPerFile = int($LineCount / $MaxNumOfFiles);

  GenerateTextFiles($FileIndex, $MaxNumOfFiles, $MaxLinesPerFile);
}

# Generate new Text files...
#
# Arguments: index of the input file in @TextFilesList, number of new files to
# generate, and number of data lines to write into each new file. Any lines
# left over after the last file has received its share are appended to the
# last file. New files are named <OutFileRoot>Part<N>.<OutFileExt>.
#
# Returns without writing anything, after a warning, when a new file already
# exists and overwriting is off, or when the input file can't be opened. Dies
# when a new file can't be opened or closed.
#
sub GenerateTextFiles {
  my($FileIndex, $NumOfFiles, $NumOfLinesPerFile) = @_;
  my($TextFile, $TextFileHandle, $NewTextFileHandle, $LineCount, $NewFileIndex, $NewFileName, $MaxLinesCount, $InDelim, $OutDelim, $OutQuote, $ColLabelsLine, $Line, @ColLabels, @Words, @NewTextFilesList);

  # Needed up front: the warning in the loop below reports the input file name...
  $TextFile = $TextFilesList[$FileIndex];

  # Setup new file names list...
  @NewTextFilesList = ();
  for my $PartNum (1 .. $NumOfFiles) {
    $NewFileName = $TextFilesInfo{OutFileRoot}[$FileIndex] . "Part${PartNum}." . $TextFilesInfo{OutFileExt}[$FileIndex];
    if (!$OptionsInfo{OverwriteFiles}) {
      if (-e $NewFileName) {
        warn "Warning: Ignoring file $TextFile: New Text file, $NewFileName, already exists\n";
        return;
      }
    }
    push @NewTextFilesList, $NewFileName;
  }

  if (!open $TextFileHandle, '<', $TextFile) {
    warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
    return;
  }

  $InDelim = $TextFilesInfo{InDelim}[$FileIndex];

  $OutDelim = $OptionsInfo{OutDelim};
  $OutQuote = $OptionsInfo{OutQuote};

  # Column labels line: copied as is in fast mode; otherwise it's parsed and
  # rewritten using the output delimiter and quoting. It stays undefined when
  # the input file has no label line...
  $ColLabelsLine = undef;
  if ($OptionsInfo{Label}) {
    if ($OptionsInfo{Fast}) {
      $ColLabelsLine = GetTextLine($TextFileHandle);
    }
    else {
      $Line = GetTextLine($TextFileHandle);
      @ColLabels = quotewords($InDelim, 0, $Line);
      $ColLabelsLine = JoinWords(\@ColLabels, $OutDelim, $OutQuote);
    }
  }

  # $MaxLinesCount is the cumulative number of data lines after which the
  # current new file is complete...
  $MaxLinesCount = $NumOfLinesPerFile;
  $LineCount = 0;
  $NewFileIndex = 1;

  $NewTextFileHandle = OpenSplitOutputFile($NewTextFilesList[$NewFileIndex - 1], $ColLabelsLine);

  # NOTE(review): this loop ends on any false value returned by GetTextLine, so
  # whether a blank line or a line containing just "0" ends it early depends on
  # GetTextLine, which is defined elsewhere - confirm.
  while ($Line = GetTextLine($TextFileHandle)) {
    $LineCount++;

    if (!$OptionsInfo{Fast}) {
      @Words = quotewords($InDelim, 0, $Line);
      $Line = JoinWords(\@Words, $OutDelim, $OutQuote);
    }
    print {$NewTextFileHandle} "$Line\n";

    # Move on to the next new file once the current one is full. The last file
    # is never rolled over, so it also collects any leftover lines...
    if ($NewFileIndex < $NumOfFiles && $LineCount >= $MaxLinesCount) {
      close $NewTextFileHandle or die "Error: Couldn't close $NewTextFilesList[$NewFileIndex - 1]: $! \n";

      $NewFileIndex++;
      $MaxLinesCount = $NumOfLinesPerFile * $NewFileIndex;

      $NewTextFileHandle = OpenSplitOutputFile($NewTextFilesList[$NewFileIndex - 1], $ColLabelsLine);
    }
  }
  # Buffered write errors surface at close...
  close $NewTextFileHandle or die "Error: Couldn't close $NewTextFilesList[$NewFileIndex - 1]: $! \n";
  close $TextFileHandle;
}

# Open a new file for writing, report it on STDOUT, and write out the column
# labels line when one is supplied...
#
# Arguments: new file name and column labels line (undef for no labels).
# Returns the file handle. Dies when the file can't be opened.
#
sub OpenSplitOutputFile {
  my($NewTextFile, $ColLabelsLine) = @_;
  my($NewTextFileHandle);

  open $NewTextFileHandle, '>', $NewTextFile or die "Error: Can't open $NewTextFile: $! \n";
  print "Generating $NewTextFile file...\n";

  if (defined $ColLabelsLine) {
    print {$NewTextFileHandle} "$ColLabelsLine\n";
  }
  return $NewTextFileHandle;
}

# Retrieve information about Text files...
#
# Fills %TextFilesInfo with four arrays indexed like @TextFilesList:
#   FileOkay    - 1 when the file passed all the checks; otherwise 0
#   InDelim     - input delimiter; empty in fast mode
#   OutFileRoot - root used to name the new files
#   OutFileExt  - extension used to name the new files
#
sub RetrieveTextFilesInfo {
  my($Index, $TextFile, $TextFileHandle, $InDelim, $FileDir, $FileName, $FileExt, $OutFileRoot, $OutFileExt);

  %TextFilesInfo = ();
  @{$TextFilesInfo{FileOkay}} = ();
  @{$TextFilesInfo{InDelim}} = ();
  @{$TextFilesInfo{OutFileRoot}} = ();
  @{$TextFilesInfo{OutFileExt}} = ();

  FILELIST: for $Index (0 .. $#TextFilesList) {
    # Start out with the file marked as not okay...
    $TextFilesInfo{FileOkay}[$Index] = 0;
    $TextFilesInfo{InDelim}[$Index] = "";
    $TextFilesInfo{OutFileRoot}[$Index] = "";
    $TextFilesInfo{OutFileExt}[$Index] = "";

    $TextFile = $TextFilesList[$Index];
    if (!(-e $TextFile)) {
      warn "Warning: Ignoring file $TextFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($TextFile, "csv tsv")) {
      warn "Warning: Ignoring file $TextFile: It's not a Text file\n";
      next FILELIST;
    }
    # Make sure the file is readable...
    if (!open $TextFileHandle, '<', $TextFile) {
      warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
      next FILELIST;
    }
    close $TextFileHandle;

    ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);

    # Setup input delimiter. It's not needed in fast mode as lines are copied
    # without being parsed...
    $InDelim = '';
    if (!$OptionsInfo{Fast}) {
      if ($FileExt =~ /^tsv$/i) {
        $InDelim = "\t";
      }
      else {
        $InDelim = ",";
        if ($OptionsInfo{InDelim} !~ /^(comma|semicolon)$/i) {
          warn "Warning: Ignoring file $TextFile: The value specified, $OptionsInfo{InDelim}, for option \"--indelim\" is not valid for csv files\n";
          next FILELIST;
        }
        if ($OptionsInfo{InDelim} =~ /^semicolon$/i) {
          $InDelim = ";";
        }
      }
    }

    # Setup output file extension: same as input file in fast mode; otherwise,
    # it follows the output delimiter...
    if ($OptionsInfo{Fast}) {
      $OutFileExt = $FileExt;
    }
    else {
      $OutFileExt = ($OptionsInfo{OutDelim} eq "\t") ? "tsv" : "csv";
    }

    # Setup output file root. The root specified on the command line is used
    # only for a single input file...
    if ($OptionsInfo{OutFileRoot} && (@TextFilesList == 1)) {
      my($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
      if ($RootFileName && $RootFileExt) {
        $FileName = $RootFileName;
      }
      else {
        $FileName = $OptionsInfo{OutFileRoot};
      }
      $OutFileRoot = $FileName;
    }
    else {
      $OutFileRoot = $FileName;
    }

    $TextFilesInfo{FileOkay}[$Index] = 1;
    $TextFilesInfo{InDelim}[$Index] = $InDelim;
    $TextFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
    $TextFilesInfo{OutFileExt}[$Index] = $OutFileExt;
  }
}

# Process option values...
#
# Converts the raw values in %Options into the form stored in %OptionsInfo and
# used by the rest of the script: yes/no values become 1/0 and delimiter names
# become delimiter characters.
#
sub ProcessOptions {

  %OptionsInfo = ();

  $OptionsInfo{Fast} = $Options{fast};

  $OptionsInfo{InDelim} = $Options{indelim};
  $OptionsInfo{Label} = ($Options{label} =~ /^yes$/i) ? 1 : 0;

  $OptionsInfo{NumOfFiles} = $Options{numfiles};

  $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : undef;
  $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? $Options{overwrite} : undef;

  # Anything other than tab or semicolon maps to comma...
  if ($Options{outdelim} =~ /^tab$/i) {
    $OptionsInfo{OutDelim} = "\t";
  }
  elsif ($Options{outdelim} =~ /^semicolon$/i) {
    $OptionsInfo{OutDelim} = ";";
  }
  else {
    $OptionsInfo{OutDelim} = ",";
  }
  $OptionsInfo{OutQuote} = ($Options{quote} =~ /^yes$/i) ? 1 : 0;
}

# Setup script usage and retrieve command line arguments specified using various options...
#
# Sets the default option values, parses the command line into %Options,
# changes to the working directory when one is specified, and dies with an
# error message on any invalid option value.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();
  $Options{label} = "yes";
  $Options{numfiles} = 2;
  $Options{indelim} = "comma";
  $Options{outdelim} = "comma";
  $Options{quote} = "yes";
  if (!GetOptions(\%Options, "fast|f", "help|h", "indelim=s", "label|l=s", "numfiles|n=i", "outdelim=s", "overwrite|o", "quote|q=s", "root|r=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if ($Options{numfiles} < 2) {
    die "Error: The value specified, $Options{numfiles}, for option \"-n --numfiles\" is not valid. Allowed values: >= 2 \n";
  }
  if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
    die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{quote} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
  }
  if ($Options{label} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{label}, for option \"-l --label\" is not valid. Allowed values: yes or no\n";
  }
}
