#!/usr/bin/perl -w
#
# $RCSfile: JoinTextFiles.pl,v $
# $Date: 2015/02/28 20:46:20 $
# $Revision: 1.32 $
#
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2015 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use FileUtil;
use TextUtil;

my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName:Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@TextFilesList);
@TextFilesList = ExpandFileNames(\@ARGV, "csv tsv");

# Joining needs at least two files; this also rejects an empty list, which
# would otherwise reach ProcessOptions with no first file to derive a name from.
if (@TextFilesList < 2) {
  die "Error: Specify more than one Text file.\n";
}

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Setup information about input files...
print "Checking input text files...\n";
my(%TextFilesInfo);
RetrieveTextFilesInfo();

# Join files...
print "\nGenerating new text file $OptionsInfo{NewTextFile}...\n";
JoinTextFiles();

print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Join all valid Text files...
#
# Writes $OptionsInfo{NewTextFile} by appending, in command line order, the
# contents of every file in @TextFilesList whose $TextFilesInfo{FileOkay} entry
# is set.
#
# When $OptionsInfo{Label} is set, the first line of each input file is taken
# to be its column label line: it is written once, from the first file that
# supplies a non-empty one, and skipped for all other files.
#
# When $OptionsInfo{Fast} is set, lines are copied as read; otherwise each line
# is split on the input delimiter of its file and rebuilt using
# $OptionsInfo{OutDelim} and $OptionsInfo{OutQuote}.
#
# Dies when the output file or an input file can't be opened, or when closing
# the output file reports an error.
#
sub JoinTextFiles {
  my($FileIndex, $TextFile, $NewTextFile, $Line, $FirstColLabelsLine, $OutDelim, $OutQuote, $InDelim, @Words, @ColLabels);

  $NewTextFile = $OptionsInfo{NewTextFile};

  $FirstColLabelsLine = '';

  $OutDelim = $OptionsInfo{OutDelim}; $OutQuote = $OptionsInfo{OutQuote};

  # Three-argument open keeps characters such as '>' or '|' in a file name from
  # being interpreted as an open mode...
  open my $NewTextFileHandle, '>', $NewTextFile or die "Error: Couldn't open $NewTextFile: $! \n";

  FILELIST: for $FileIndex (0 .. $#TextFilesList) {
    next FILELIST unless $TextFilesInfo{FileOkay}[$FileIndex];

    $TextFile = $TextFilesList[$FileIndex];
    $InDelim = $TextFilesInfo{InDelim}[$FileIndex];

    print "\nProcessing file $TextFile...\n";

    open my $TextFileHandle, '<', $TextFile or die "Error: Couldn't open $TextFile: $! \n";

    if ($OptionsInfo{Label}) {
      # Always consume the label line so that it's never copied as data...
      $Line = GetTextLine($TextFileHandle);
      $Line = '' unless defined $Line;

      # Compare against the empty string instead of testing truth: a label
      # line consisting of "0" is a valid header and must be written only once...
      if ($FirstColLabelsLine eq '') {
        if ($OptionsInfo{Fast}) {
          $FirstColLabelsLine = $Line;
        }
        else {
          @ColLabels = quotewords($InDelim, 0, $Line);
          $FirstColLabelsLine = JoinWords(\@ColLabels, $OutDelim, $OutQuote);
        }
        print {$NewTextFileHandle} "$FirstColLabelsLine\n";
      }
    }

    # Stop on an undefined or empty line, which is what the original truth test
    # treated as end of data, but keep going for a line containing just "0"...
    while (defined($Line = GetTextLine($TextFileHandle)) && $Line ne '') {
      if (!$OptionsInfo{Fast}) {
        @Words = quotewords($InDelim, 0, $Line);
        $Line = JoinWords(\@Words, $OutDelim, $OutQuote);
      }
      print {$NewTextFileHandle} "$Line\n";
    }
    close $TextFileHandle;
  }

  # Buffered write errors, such as a full disk, are reported at close time...
  close $NewTextFileHandle or die "Error: Couldn't close $NewTextFile: $! \n";
}

# Retrieve information about Text files...
#
# Fills %TextFilesInfo with two arrays parallel to @TextFilesList:
#
#   FileOkay - 1 when the file is to be joined; otherwise 0
#   InDelim  - Input delimiter for the file; empty string for ignored files
#
# A file is ignored, with a warning, when it doesn't exist, fails
# CheckFileType for "csv tsv", can't be opened, or, with labels on and fast
# mode off, when its column labels differ in number or value from those of
# the first valid file.
#
# Files with a tsv extension use tab as input delimiter; all other files use
# comma or semicolon as selected by $OptionsInfo{InDelim}.
#
sub RetrieveTextFilesInfo {
  my($Index, $TextFile, $InDelim, $Line, $FirstColLabelsLine, $ColLabelsLine, $FileDir, $FileName, $FileExt, @FirstColLabels, @ColLabels);

  %TextFilesInfo = ();
  @{$TextFilesInfo{FileOkay}} = ();
  @{$TextFilesInfo{InDelim}} = ();

  $FirstColLabelsLine = ''; $ColLabelsLine = '';
  @FirstColLabels = (); @ColLabels = ();

  FILELIST: for $Index (0 .. $#TextFilesList) {
    # Assume the worst until the file passes all the checks...
    $TextFilesInfo{FileOkay}[$Index] = 0;
    $TextFilesInfo{InDelim}[$Index] = "";

    $TextFile = $TextFilesList[$Index];
    if (!(-e $TextFile)) {
      warn "Warning: Ignoring file $TextFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($TextFile, "csv tsv")) {
      warn "Warning: Ignoring file $TextFile: It's not a Text file\n";
      next FILELIST;
    }

    ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
    if ($FileExt =~ /^tsv$/i) {
      $InDelim = "\t";
    }
    else {
      $InDelim = ",";
      if ($OptionsInfo{InDelim} !~ /^(comma|semicolon)$/i) {
        warn "Warning: Ignoring file $TextFile: The value specified, $OptionsInfo{InDelim}, for option \"--indelim\" is not valid for csv files\n";
        next FILELIST;
      }
      if ($OptionsInfo{InDelim} =~ /^semicolon$/i) {
        $InDelim = ";";
      }
    }

    # Only the first line is needed here to compare column labels...
    my $TextFileHandle;
    if (!open $TextFileHandle, '<', $TextFile) {
      warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
      next FILELIST;
    }
    $Line = GetTextLine($TextFileHandle);
    close $TextFileHandle;

    if ($OptionsInfo{Label} && !$OptionsInfo{Fast}) {
      @ColLabels = quotewords($InDelim, 0, $Line);

      # Compare against the empty string instead of testing truth so that a
      # reference label line of "0" is still recognized as already set...
      if (defined($FirstColLabelsLine) && $FirstColLabelsLine ne '') {
        if (@ColLabels != @FirstColLabels) {
          warn "Warning: Ignoring file $TextFile: The number of columns in this file, ", scalar(@ColLabels), ", is different from the number of columns, ", scalar(@FirstColLabels), ", in the first valid text file. \n";
          next FILELIST;
        }
        $ColLabelsLine = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
        if ($ColLabelsLine ne $FirstColLabelsLine) {
          warn "Warning: Ignoring file $TextFile: The column names in this file are different from those in first valid text file.\nColumnlabels in first valid text file: $FirstColLabelsLine \nColumnlabels in current text file: $ColLabelsLine\n";
          next FILELIST;
        }
      }
      else {
        # First valid file: its labels become the reference for the others...
        @FirstColLabels = @ColLabels;
        $FirstColLabelsLine = JoinWords(\@FirstColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      }
    }

    $TextFilesInfo{FileOkay}[$Index] = 1;
    $TextFilesInfo{InDelim}[$Index] = $InDelim;
  }
}

# Process option values...
#
# Converts raw command line values in %Options into %OptionsInfo:
#
#   Fast, Label, OutQuote  - Flags
#   InDelim                - Name of input delimiter as specified
#   OutDelim               - Actual output delimiter character
#   OutFileRoot, Overwrite - Copies of corresponding options or undef
#   NewTextFile            - Name of the output file
#
# Output file name is built from "-r --root" value, or from the name of the
# first input file followed by "1To<number of files>Joined", with a .tsv
# extension for tab output delimiter and .csv otherwise.
#
# Dies when output file exists and "-o --overwrite" wasn't specified, or when
# a name derived from "-r --root" matches an input file name ignoring case.
#
sub ProcessOptions {
  my($FileDir, $FileName, $FileExt, $NewTextFile, $Index);

  %OptionsInfo = ();

  $OptionsInfo{Fast} = $Options{fast} ? $Options{fast} : 0;

  $OptionsInfo{InDelim} = $Options{indelim};
  $OptionsInfo{Label} = ($Options{label} =~ /^yes$/i) ? 1 : 0;

  $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : undef;
  $OptionsInfo{Overwrite} = $Options{overwrite} ? $Options{overwrite} : undef;

  if ($Options{outdelim} =~ /^tab$/i) {
    $OptionsInfo{OutDelim} = "\t";
  }
  elsif ($Options{outdelim} =~ /^semicolon$/i) {
    $OptionsInfo{OutDelim} = ";";
  }
  else {
    $OptionsInfo{OutDelim} = ",";
  }
  $OptionsInfo{OutQuote} = ($Options{quote} =~ /^yes$/i) ? 1 : 0;

  $FileDir = ""; $FileName = ""; $FileExt = "";
  if ($Options{root}) {
    ($FileDir, $FileName, $FileExt) = ParseFileName($Options{root});
    # Use just the name part when root value came with an extension...
    $NewTextFile = ($FileName && $FileExt) ? $FileName : $Options{root};
  }
  else {
    ($FileDir, $FileName, $FileExt) = ParseFileName($TextFilesList[0]);
    $NewTextFile = $FileName . "1To" . scalar(@TextFilesList) . "Joined";
  }

  $NewTextFile .= ($Options{outdelim} =~ /^tab$/i) ? ".tsv" : ".csv";

  if (!$Options{overwrite} && -e $NewTextFile) {
    die "Error: The file $NewTextFile already exists.\n";
  }

  if ($Options{root}) {
    # Don't let a user supplied name clobber one of the files being read...
    for $Index (0 .. $#TextFilesList) {
      if (lc($NewTextFile) eq lc($TextFilesList[$Index])) {
        die "Error: Output filename, $NewTextFile, is similar to a input file name.\nSpecify a different name using \"-r --root\" option or use default name.\n";
      }
    }
  }

  $OptionsInfo{NewTextFile} = $NewTextFile;
}

# Setup script usage and retrieve command line arguments specified using various options...
#
# Sets default values in %Options, parses command line using GetOptions,
# changes to directory specified by "-w --workingdir", and validates values
# of "--indelim", "--outdelim", "-q --quote" and "-l --label" options.
#
# Dies with an explanatory message on any invalid option or value.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();
  $Options{label} = "yes";
  $Options{indelim} = "comma";
  $Options{outdelim} = "comma";
  $Options{quote} = "yes";
  if (!GetOptions(\%Options, "fast|f", "help|h", "indelim=s", "label|l=s", "outdelim=s", "overwrite|o", "quote|q=s", "root|r=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
    die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{quote} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
  }
  if ($Options{label} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{label}, for option \"-l --label\" is not valid. Allowed values: yes or no\n";
  }
}
