Mercurial > repos > deepakjadmin > mayatool3_test2
comparison bin/MergeTextFiles.pl @ 0:4816e4a8ae95 draft default tip
Uploaded
| author | deepakjadmin |
|---|---|
| date | Wed, 20 Jan 2016 09:23:18 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:4816e4a8ae95 |
|---|---|
| 1 #!/usr/bin/perl -w | |
| 2 # | |
| 3 # $RCSfile: MergeTextFiles.pl,v $ | |
| 4 # $Date: 2015/02/28 20:46:20 $ | |
| 5 # $Revision: 1.40 $ | |
| 6 # | |
| 7 # Author: Manish Sud <msud@san.rr.com> | |
| 8 # | |
| 9 # Copyright (C) 2015 Manish Sud. All rights reserved. | |
| 10 # | |
| 11 # This file is part of MayaChemTools. | |
| 12 # | |
| 13 # MayaChemTools is free software; you can redistribute it and/or modify it under | |
| 14 # the terms of the GNU Lesser General Public License as published by the Free | |
| 15 # Software Foundation; either version 3 of the License, or (at your option) any | |
| 16 # later version. | |
| 17 # | |
| 18 # MayaChemTools is distributed in the hope that it will be useful, but without | |
| 19 # any warranty; without even the implied warranty of merchantability of fitness | |
| 20 # for a particular purpose. See the GNU Lesser General Public License for more | |
| 21 # details. | |
| 22 # | |
| 23 # You should have received a copy of the GNU Lesser General Public License | |
| 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or | |
| 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, | |
| 26 # Boston, MA, 02111-1307, USA. | |
| 27 # | |
| 28 | |
| 29 use strict; | |
| 30 use FindBin; use lib "$FindBin::Bin/../lib"; | |
| 31 use Getopt::Long; | |
| 32 use File::Basename; | |
| 33 use Text::ParseWords; | |
| 34 use Benchmark; | |
| 35 use FileHandle; | |
| 36 use FileUtil; | |
| 37 use TextUtil; | |
| 38 | |
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT so that progress messages show up right away...
$| = 1;

# Starting message...
$ScriptName = basename $0;
print "\n$ScriptName:Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@TextFilesList);
@TextFilesList = ExpandFileNames(\@ARGV, "csv tsv");

# Merging needs the first text file plus at least one more file to merge into
# it; an empty list is rejected along with a single file...
if (@TextFilesList < 2) {
  die "Error: Specify more than one text file.\n";
}

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Setup information about input files...
my(%TextFilesInfo);
print "Checking input text files...\n";
RetrieveTextFilesInfo();
RetrieveColumnsAndKeysInfo();

# Merge files...
print "\nGenerating new text file $OptionsInfo{NewTextFile}...\n";
MergeTextFiles();

print "\n$ScriptName:Done...\n\n";

# Report the elapsed time for the whole run...
$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
| 82 | |
| 83 ############################################################################### | |
| 84 | |
# Merge all input text files into the new text file...
#
# Opens the output file named by $OptionsInfo{NewTextFile}, writes out the
# merged column labels, opens each file in @TextFilesList and skips its column
# label line, and then merges the data rows either by key columns (when
# $OptionsInfo{Keys} is set) or sequentially. The opened input file handles are
# kept in $TextFilesInfo{FileHandle}.
#
# Dies when a file can't be opened or the output file can't be closed.
#
sub MergeTextFiles {
  my($Index);

  # Three-argument open keeps characters in the file name from being taken as
  # an open mode. The bareword handle is retained as it's referenced by name
  # elsewhere in this file...
  open NEWTEXTFILE, ">", $OptionsInfo{NewTextFile} or die "Error: Couldn't open $OptionsInfo{NewTextFile}: $! \n";

  WriteNewTextFileColumnLabels(\*NEWTEXTFILE);

  # Open up all the files and skip column label line...
  @{$TextFilesInfo{FileHandle}} = ();
  for $Index (0 .. $#TextFilesList) {
    $TextFilesInfo{FileHandle}[$Index] = FileHandle->new;

    open $TextFilesInfo{FileHandle}[$Index], "<", $TextFilesList[$Index] or die "Error: Couldn't open $TextFilesList[$Index]: $! \n";
    GetTextLine($TextFilesInfo{FileHandle}[$Index]);
  }

  # Merge files...
  if ($OptionsInfo{Keys}) {
    MergeColumnValuesUsingKeys(\*NEWTEXTFILE);
  }
  else {
    MergeColumnValues(\*NEWTEXTFILE);
  }

  # Close all opened files. Buffered write errors surface at close time, so
  # the close of the output file is checked...
  close NEWTEXTFILE or die "Error: Couldn't close $OptionsInfo{NewTextFile}: $! \n";
  for $Index (0 .. $#TextFilesList) {
    close $TextFilesInfo{FileHandle}[$Index];
  }

}
| 117 | |
# Merge all the column values in the order data rows appear in text files...
#
# Parameters:
#   $NewTextFileRef - Output file handle the merged rows are written to.
#
# For each data row in the first text file, one output row is written. It
# contains the first file's columns before the merge point, the columns to merge
# from the corresponding row of each other text file, and the first file's
# columns after the merge point. The number of rows in the first text file
# determines the number of rows written out.
#
sub MergeColumnValues {
  my($NewTextFileRef) = @_;
  my($Index, $Line, $OtherLine, $InDelim, $Value, $ColNum, @LineWords, @File1LineWords, @ColValues);

  # Test for defined and non-empty instead of plain truth so that a data line
  # containing only "0" doesn't end the merge early...
  while (defined($Line = GetTextLine($TextFilesInfo{FileHandle}[0])) && length($Line)) {
    $InDelim = $TextFilesInfo{InDelim}[0];
    @ColValues = ();

    # Collect column values from first file before the merge point...
    @File1LineWords = quotewords($InDelim, 0, $Line);
    for $ColNum (@{$TextFilesInfo{File1Part1ColNums}}) {
      $Value = ($ColNum < @File1LineWords) ? $File1LineWords[$ColNum] : "";
      push @ColValues, $Value;
    }

    # Collect column values from other text files...
    for $Index (1 .. $#TextFilesList) {
      $InDelim = $TextFilesInfo{InDelim}[$Index];

      # A file with fewer data rows than the first file contributes empty
      # values; without the padding, columns written after it would shift to
      # the left and no longer line up with the column labels...
      @LineWords = ();
      $OtherLine = GetTextLine($TextFilesInfo{FileHandle}[$Index]);
      if (defined($OtherLine) && length($OtherLine)) {
        @LineWords = quotewords($InDelim, 0, $OtherLine);
      }
      for $ColNum (@{$TextFilesInfo{ColToMerge}[$Index]}) {
        $Value = ($ColNum < @LineWords) ? $LineWords[$ColNum] : "";
        push @ColValues, $Value;
      }
    }

    # Collect column values from first file after the merge point...
    for $ColNum (@{$TextFilesInfo{File1Part2ColNums}}) {
      $Value = ($ColNum < @File1LineWords) ? $File1LineWords[$ColNum] : "";
      push @ColValues, $Value;
    }

    # Write it out...
    $Line = JoinWords(\@ColValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileRef "$Line\n";
  }

}
| 158 | |
# Merge column values using keys...
#
# Parameters:
#   $NewTextFileRef - Output file handle the merged rows are written to.
#
# All data rows from text files other than the first one are read and indexed
# by the value in their key column. For each data row in the first text file,
# the rows with the matching key value supply the column values to merge; empty
# values are used for any text file without a matching row.
#
sub MergeColumnValuesUsingKeys {
  my($NewTextFileRef) = @_;
  my($Index, $InDelim, $Line, $Value, $ColNum, $KeyColNum, $KeyColValue, @LineWords, @ColValues, @File1LineWords, @TextFilesKeysToLinesMap);

  @TextFilesKeysToLinesMap = ();

  # Retrieve text lines from all the files except for the first file...
  for $Index (1 .. $#TextFilesList) {
    %{$TextFilesKeysToLinesMap[$Index]} = ();

    $InDelim = $TextFilesInfo{InDelim}[$Index];
    $KeyColNum = $TextFilesInfo{KeysToUse}[$Index];

    # Test for defined and non-empty instead of plain truth so that a data line
    # containing only "0" doesn't end the processing early...
    while (defined($Line = GetTextLine($TextFilesInfo{FileHandle}[$Index])) && length($Line)) {
      @LineWords = quotewords($InDelim, 0, $Line);

      # A key which was ignored during option processing is stored as -1. It
      # must not be used as an array index: that would silently pick the last
      # column as the key...
      if ($KeyColNum < 0 || $KeyColNum >= @LineWords) {
        next;
      }

      $KeyColValue = $LineWords[$KeyColNum];
      if (!(defined($KeyColValue) && length($KeyColValue))) {
        next;
      }

      if (exists($TextFilesKeysToLinesMap[$Index]{$KeyColValue})) {
        # Only the first line for a key value is used...
        warn "Warning: Ignoring line, $Line, in text file $TextFilesList[$Index]: Column key value, $KeyColValue, already exists\n";
      }
      else {
        @{$TextFilesKeysToLinesMap[$Index]{$KeyColValue}} = ();
        push @{$TextFilesKeysToLinesMap[$Index]{$KeyColValue}}, @LineWords;
      }
    }
  }

  while (defined($Line = GetTextLine($TextFilesInfo{FileHandle}[0])) && length($Line)) {
    $InDelim = $TextFilesInfo{InDelim}[0];

    @ColValues = ();
    @File1LineWords = quotewords($InDelim, 0, $Line);

    # The key value stays undefined for an ignored key or a row which is too
    # short to contain the key column; such rows get empty merged values...
    $KeyColNum = $TextFilesInfo{KeysToUse}[0];
    $KeyColValue = ($KeyColNum >= 0 && $KeyColNum < @File1LineWords) ? $File1LineWords[$KeyColNum] : undef;

    # Collect column values from first file before the merge point...
    for $ColNum (@{$TextFilesInfo{File1Part1ColNums}}) {
      $Value = ($ColNum < @File1LineWords) ? $File1LineWords[$ColNum] : "";
      push @ColValues, $Value;
    }

    # Collect column values from other text files...
    for $Index (1 .. $#TextFilesList) {
      @LineWords = ();
      if (defined($KeyColValue) && exists($TextFilesKeysToLinesMap[$Index]{$KeyColValue})) {
        push @LineWords, @{$TextFilesKeysToLinesMap[$Index]{$KeyColValue}};
      }
      for $ColNum (@{$TextFilesInfo{ColToMerge}[$Index]}) {
        $Value = ($ColNum < @LineWords) ? $LineWords[$ColNum] : "";
        push @ColValues, $Value;
      }
    }

    # Collect column values from first file after the merge point...
    for $ColNum (@{$TextFilesInfo{File1Part2ColNums}}) {
      $Value = ($ColNum < @File1LineWords) ? $File1LineWords[$ColNum] : "";
      push @ColValues, $Value;
    }

    # Write it out...
    $Line = JoinWords(\@ColValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileRef "$Line\n";
  }

}
| 229 | |
# Write out column labels for new merged text file...
#
# Parameters:
#   $NewTextFileRef - Output file handle the column label line is written to.
#
# The labels are written in the same order as the merged data values: first
# file columns before the merge point, columns to merge from the other text
# files, and first file columns after the merge point.
#
sub WriteNewTextFileColumnLabels {
  my($NewTextFileRef) = @_;
  my($Index, $Line, $ColNum, @ColLabels);

  @ColLabels = ();

  # Collect column labels from first file before the merge point...
  for $ColNum (@{$TextFilesInfo{File1Part1ColNums}}) {
    push @ColLabels, $TextFilesInfo{ColToMergeNumToLabelMap}[0]{$ColNum};
  }

  # Collect column labels from other text files...
  for $Index (1 .. $#TextFilesList) {
    for $ColNum (@{$TextFilesInfo{ColToMerge}[$Index]}) {
      push @ColLabels, $TextFilesInfo{ColToMergeNumToLabelMap}[$Index]{$ColNum};
    }
  }

  # Collect column labels from first file after the merge point...
  for $ColNum (@{$TextFilesInfo{File1Part2ColNums}}) {
    push @ColLabels, $TextFilesInfo{ColToMergeNumToLabelMap}[0]{$ColNum};
  }

  # Write it out to the file handle passed in by the caller instead of a
  # hard-coded global file handle...
  $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print $NewTextFileRef "$Line\n";
}
| 259 | |
# Set up columns, keys, and start column information for the input text files
# using the specified options. Key information is processed only when the keys
# option has been specified...
sub RetrieveColumnsAndKeysInfo {
  ProcessColumnsInfo();
  ProcessKeysInfo() if $OptionsInfo{Keys};
  ProcessStartColInfo();
}
| 270 | |
# Process specified columns...
#
# For each input text file, this sets up:
#   $TextFilesInfo{ColSpecified}[$Index] - Column numbers or labels as specified.
#   $TextFilesInfo{ColToMerge}[$Index] - Valid zero-based column numbers to
#     merge, sorted in ascending numerical order.
#   $TextFilesInfo{ColToMergeNumToLabelMap}[$Index] - Maps zero-based column
#     number to column label for columns to merge.
#
# Specified columns which don't exist in a text file are ignored with a warning.
#
sub ProcessColumnsInfo {
  my($Index, $SpecifiedColNum, $Values, $ColNum, $ColLabel, @Words);

  @{$TextFilesInfo{ColSpecified}} = ();
  @{$TextFilesInfo{ColToMerge}} = ();
  @{$TextFilesInfo{ColToMergeNumToLabelMap}} = ();

  for $Index (0 .. $#TextFilesList) {

    @{$TextFilesInfo{ColSpecified}[$Index]} = ();

    $Values = "all";
    if ($OptionsInfo{Columns}) {
      $Values = $OptionsInfo{ColValues}[$Index];
    }

    # Match the whole value: an unanchored match would treat any list holding
    # a label which merely contains "all", such as "Overall", as all columns...
    if ($Values =~ /^all$/i) {
      if ($OptionsInfo{Mode} =~ /^colnum$/i) {
        for $ColNum (1 .. $TextFilesInfo{ColCount}[$Index]) {
          push @{$TextFilesInfo{ColSpecified}[$Index]}, $ColNum;
        }
      }
      else {
        push @{$TextFilesInfo{ColSpecified}[$Index]}, @{$TextFilesInfo{ColLabels}[$Index]};
      }
    }
    else {
      @Words = split ",", $Values;
      push @{$TextFilesInfo{ColSpecified}[$Index]}, @Words;
    }

    @{$TextFilesInfo{ColToMerge}[$Index]} = ();
    %{$TextFilesInfo{ColToMergeNumToLabelMap}[$Index]} = ();

    if ($OptionsInfo{Mode} =~ /^collabel$/i) {
      for $ColLabel (@{$TextFilesInfo{ColSpecified}[$Index]}) {
        if (exists($TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel})) {
          $ColNum = $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel};
          push @{$TextFilesInfo{ColToMerge}[$Index]}, $ColNum;
          $TextFilesInfo{ColToMergeNumToLabelMap}[$Index]{$ColNum} = $ColLabel;
        }
        else {
          warn "Warning: Ignoring value, $ColLabel, specified using \"-c --column\" option: column name doesn't exist in $TextFilesList[$Index] \n";
        }
      }
    }
    else {
      for $SpecifiedColNum (@{$TextFilesInfo{ColSpecified}[$Index]}) {
        # Column numbers are specified starting from 1. Make sure the value is
        # an integer before comparing it numerically; surrounding spaces, as in
        # "1, 2", are allowed...
        if ($SpecifiedColNum =~ /^\s*\d+\s*$/ && $SpecifiedColNum > 0 && $SpecifiedColNum <= $TextFilesInfo{ColCount}[$Index]) {
          $ColNum = $SpecifiedColNum - 1;
          push @{$TextFilesInfo{ColToMerge}[$Index]}, $ColNum;
          $TextFilesInfo{ColToMergeNumToLabelMap}[$Index]{$ColNum} = $TextFilesInfo{ColLabels}[$Index][$ColNum];
        }
        else {
          warn "Warning: Ignoring value, $SpecifiedColNum, specified using \"-c --column\" option: column number doesn't exist in $TextFilesList[$Index] \n";
        }
      }
    }

    # Keep columns to merge in ascending column number order...
    my(@ColToMergeSorted) = sort { $a <=> $b } @{$TextFilesInfo{ColToMerge}[$Index]};
    @{$TextFilesInfo{ColToMerge}[$Index]} = ();
    push @{$TextFilesInfo{ColToMerge}[$Index]}, @ColToMergeSorted;
  }
}
| 337 | |
# Set up key column information for each input text file...
#
# $TextFilesInfo{KeysSpecified}[$FileIndex] holds the key as specified and
# $TextFilesInfo{KeysToUse}[$FileIndex] holds the corresponding zero-based
# column number, or -1 for a key which doesn't exist in the text file. Key
# columns are also taken off the lists of columns to merge for all text files
# except for the first one.
#
sub ProcessKeysInfo {
  my($FileIndex, $SpecifiedKey, $KeyColNum, $LabelMode);

  @{$TextFilesInfo{KeysSpecified}} = ();
  @{$TextFilesInfo{KeysToUse}} = ();

  $LabelMode = ($OptionsInfo{Mode} =~ /^collabel$/i) ? 1 : 0;

  for $FileIndex (0 .. $#TextFilesList) {
    $SpecifiedKey = $OptionsInfo{KeyValues}[$FileIndex];

    # Assume an invalid key until it's found in the text file...
    $KeyColNum = -1;

    if ($LabelMode) {
      if (exists($TextFilesInfo{ColLabelToNumMap}[$FileIndex]{$SpecifiedKey})) {
        $KeyColNum = $TextFilesInfo{ColLabelToNumMap}[$FileIndex]{$SpecifiedKey};
      }
      else {
        warn "Warning: Ignoring value, $SpecifiedKey, specified using \"-k --keys\" option: column name doesn't exist in $TextFilesList[$FileIndex] \n";
      }
    }
    elsif ($SpecifiedKey > 0 && $SpecifiedKey <= $TextFilesInfo{ColCount}[$FileIndex]) {
      # Specified column numbers start from 1...
      $KeyColNum = $SpecifiedKey - 1;
    }
    else {
      warn "Warning: Ignoring value, $SpecifiedKey, specified using \"-k --keys\" option: column number doesn't exist in $TextFilesList[$FileIndex] \n";
    }

    $TextFilesInfo{KeysSpecified}[$FileIndex] = $SpecifiedKey;
    $TextFilesInfo{KeysToUse}[$FileIndex] = $KeyColNum;
  }

  # Drop the key column from the columns to merge for every text file but the
  # first one...
  for $FileIndex (1 .. $#TextFilesList) {
    my(@ColsWithoutKey) = grep { $TextFilesInfo{KeysToUse}[$FileIndex] != $_ } @{$TextFilesInfo{ColToMerge}[$FileIndex]};
    @{$TextFilesInfo{ColToMerge}[$FileIndex]} = @ColsWithoutKey;
  }
}
| 386 | |
# Process specified start column value...
#
# Splits the columns to merge for the first text file around the start column:
#   $TextFilesInfo{File1Part1ColNums} - Zero-based column numbers written out
#     before the columns merged from other text files.
#   $TextFilesInfo{File1Part2ColNums} - Zero-based column numbers written out
#     after the columns merged from other text files.
#
# The start column goes into the first part for "after" start column mode and
# into the second part otherwise. Without a specified start column, the last
# column on the list of columns to merge for the first text file is used.
#
# Dies when the start column doesn't exist in the first text file or isn't on
# its list of columns to merge.
#
sub ProcessStartColInfo {
  my($ColNum, $StartColNum, $SpecifiedStartCol, $LastIndex, $StartAfter);

  @{$TextFilesInfo{File1Part1ColNums}} = ();
  @{$TextFilesInfo{File1Part2ColNums}} = ();

  $StartColNum = "last";
  if ($OptionsInfo{StartCol}) {
    $StartColNum = $OptionsInfo{StartCol};
  }

  # Keep the value as specified for error messages: $StartColNum gets
  # converted into a zero-based column number below...
  $SpecifiedStartCol = $StartColNum;

  if ($StartColNum !~ /^last$/i) {
    if ($OptionsInfo{Mode} =~ /^collabel$/i) {
      if (exists($TextFilesInfo{ColLabelToNumMap}[0]{$StartColNum})) {
        $StartColNum = $TextFilesInfo{ColLabelToNumMap}[0]{$StartColNum};
      }
      else {
        die "Error: Invalid value $StartColNum specified using \"-s --startcol\" option: column name doesn't exist in $TextFilesList[0] \n";
      }
    }
    else {
      if ($StartColNum > 0 && $StartColNum <= $TextFilesInfo{ColCount}[0]) {
        $StartColNum -= 1;
      }
      else {
        die "Error: Invalid value $StartColNum specified using \"-s --startcol\" option: column number doesn't exist in $TextFilesList[0] \n";
      }
    }
  }
  else {
    # Default to the last column being merged from the first text file. The
    # list is kept sorted in ascending order by ProcessColumnsInfo. Using the
    # last column of the file itself would fail whenever that column has been
    # left out by the columns option...
    $LastIndex = $#{$TextFilesInfo{ColToMerge}[0]};
    $StartColNum = ($LastIndex >= 0) ? $TextFilesInfo{ColToMerge}[0][$LastIndex] : ($TextFilesInfo{ColCount}[0] - 1);
  }

  # Make sure StartColNum is present on the list of columns to merge for the first text file...
  if (!exists($TextFilesInfo{ColToMergeNumToLabelMap}[0]{$StartColNum})) {
    die "Error: Invalid value $SpecifiedStartCol specified using \"-s --startcol\" option: doesn't exist in the specified lists of columns to merge for $TextFilesList[0] \n";
  }

  # Split columns to merge for the first text file around the start column.
  # Each column lands in exactly one of the two parts...
  $StartAfter = ($OptionsInfo{StartColMode} =~ /^after$/i) ? 1 : 0;

  for $ColNum (@{$TextFilesInfo{ColToMerge}[0]}) {
    if ($ColNum < $StartColNum || ($StartAfter && $ColNum == $StartColNum)) {
      push @{$TextFilesInfo{File1Part1ColNums}}, $ColNum;
    }
    else {
      push @{$TextFilesInfo{File1Part2ColNums}}, $ColNum;
    }
  }

}
| 471 | |
# Retrieve information about input text files...
#
# For each file in @TextFilesList, this sets up:
#   $TextFilesInfo{FileOkay}[$Index] - 1 for a usable file; otherwise, 0.
#   $TextFilesInfo{InDelim}[$Index] - Delimiter used to parse the file.
#   $TextFilesInfo{ColCount}[$Index] - Number of columns in column label line.
#   $TextFilesInfo{ColLabels}[$Index] - Column labels.
#   $TextFilesInfo{ColLabelToNumMap}[$Index] - Maps column label to zero-based
#     column number.
#
# Dies after checking all the files when any one of them is not usable.
#
sub RetrieveTextFilesInfo {
  my($Index, $TextFile, $TextFileHandle, $FileDir, $FileName, $FileExt, $InDelim, $Line, $ColNum, $ColLabel, $FileNotOkayCount, @ColLabels);

  %TextFilesInfo = ();

  @{$TextFilesInfo{FileOkay}} = ();
  @{$TextFilesInfo{ColCount}} = ();
  @{$TextFilesInfo{ColLabels}} = ();
  @{$TextFilesInfo{ColLabelToNumMap}} = ();
  @{$TextFilesInfo{InDelim}} = ();

  $FileNotOkayCount = 0;

  FILELIST: for $Index (0 .. $#TextFilesList) {
    $TextFile = $TextFilesList[$Index];

    $TextFilesInfo{FileOkay}[$Index] = 0;
    $TextFilesInfo{ColCount}[$Index] = 0;
    $TextFilesInfo{InDelim}[$Index] = "";

    @{$TextFilesInfo{ColLabels}[$Index]} = ();
    %{$TextFilesInfo{ColLabelToNumMap}[$Index]} = ();

    if (!(-e $TextFile)) {
      warn "Warning: Ignoring file $TextFile: It doesn't exist\n";
      $FileNotOkayCount++;
      next FILELIST;
    }
    if (!CheckFileType($TextFile, "csv tsv")) {
      warn "Warning: Ignoring file $TextFile: It's not a csv or tsv file\n";
      $FileNotOkayCount++;
      next FILELIST;
    }

    # File extension determines the delimiter: tab for tsv files; comma or
    # semicolon, as specified by the input delimiter option, for csv files...
    ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
    if ($FileExt =~ /^tsv$/i) {
      $InDelim = "\t";
    }
    else {
      $InDelim = "\,";
      if ($OptionsInfo{InDelim} !~ /^(comma|semicolon)$/i) {
        warn "Warning: Ignoring file $TextFile: The value specified, $OptionsInfo{InDelim}, for option \"--indelim\" is not valid for csv files\n";
        $FileNotOkayCount++;
        next FILELIST;
      }
      if ($OptionsInfo{InDelim} =~ /^semicolon$/i) {
        $InDelim = "\;";
      }
    }

    # Use a lexical file handle along with three-argument open so that
    # characters in the file name are not taken as an open mode...
    if (!open $TextFileHandle, "<", $TextFile) {
      warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
      $FileNotOkayCount++;
      next FILELIST;
    }

    # Column labels come from the first text line. A file without any text
    # line ends up with no columns...
    $Line = GetTextLine($TextFileHandle);
    @ColLabels = defined($Line) ? quotewords($InDelim, 0, $Line) : ();
    close $TextFileHandle;

    $TextFilesInfo{FileOkay}[$Index] = 1;
    $TextFilesInfo{InDelim}[$Index] = $InDelim;

    $TextFilesInfo{ColCount}[$Index] = @ColLabels;
    push @{$TextFilesInfo{ColLabels}[$Index]}, @ColLabels;
    for $ColNum (0 .. $#ColLabels) {
      $ColLabel = $ColLabels[$ColNum];
      $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel} = $ColNum;
    }
  }
  # Make sure all specified files are valid for merging to work properly...
  if ($FileNotOkayCount) {
    die "Error: Problems with input text file(s)...\n";
  }
}
| 547 | |
# Validate command line option values and collect them into %OptionsInfo...
#
# Besides copying over option values, this splits per file values for columns
# and keys options, maps delimiter and quote names to the values used during
# output, and sets up the name of the new text file. Dies for invalid columns
# or keys values, for an already existing output file without overwrite option,
# and for an output file name matching an input file name.
#
sub ProcessOptions {
  my($Value, $TextFile, $FileDir, $FileName, $FileExt, $NewTextFile, @ColValues, @KeyValues);

  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};

  # Columns: one value for each input text file delimited by ";"...
  $OptionsInfo{Columns} = $Options{columns};
  @{$OptionsInfo{ColValues}} = ();

  if ($Options{columns}) {
    @ColValues = split ";", $Options{columns};
    if (scalar(@ColValues) != scalar(@TextFilesList)) {
      die "Error: Invalid number of values specified by \"-c --columns\" option: it must be equal to number of input text files.\n";
    }
    for $Value (@ColValues) {
      length($Value) or die "Error: Invalid value specified by \"-c --columns\" option: empty values are not allowed.\n";
    }
    @{$OptionsInfo{ColValues}} = @ColValues;
  }

  # Keys: one value for each input text file delimited by ";"...
  $OptionsInfo{Keys} = $Options{keys};
  @{$OptionsInfo{KeyValues}} = ();

  if ($Options{keys}) {
    @KeyValues = split ";", $Options{keys};
    if (scalar(@KeyValues) != scalar(@TextFilesList)) {
      die "Error: Invalid number of values specified by \"-k --keys\" option: it must be equal to number of input text files.\n";
    }
    for $Value (@KeyValues) {
      length($Value) or die "Error: Invalid value specified by \"-k --keys\" option: empty values are not allowed.\n";
    }
    @{$OptionsInfo{KeyValues}} = @KeyValues;
  }

  $OptionsInfo{InDelim} = $Options{indelim};

  $OptionsInfo{StartCol} = $Options{startcol} || undef;
  $OptionsInfo{StartColMode} = $Options{startcolmode};

  $OptionsInfo{OutFileRoot} = $Options{root} || undef;
  $OptionsInfo{Overwrite} = $Options{overwrite} || undef;

  # Map output delimiter name to the delimiter itself...
  if ($Options{outdelim} =~ /^tab$/i ) {
    $OptionsInfo{OutDelim} = "\t";
  }
  elsif ($Options{outdelim} =~ /^semicolon$/i) {
    $OptionsInfo{OutDelim} = "\;";
  }
  else {
    $OptionsInfo{OutDelim} = "\,";
  }
  $OptionsInfo{OutQuote} = ($Options{quote} =~ /^yes$/i) ? 1 : 0;

  # Set up new text file name using the specified root or the name of the
  # first text file...
  if ($Options{root}) {
    ($FileDir, $FileName, $FileExt) = ParseFileName($Options{root});
    $NewTextFile = ($FileName && $FileExt) ? $FileName : $Options{root};
  }
  else {
    ($FileDir, $FileName, $FileExt) = ParseFileName($TextFilesList[0]);
    $NewTextFile = $FileName . "1To" . scalar(@TextFilesList) . "Merged";
  }
  $NewTextFile .= ($Options{outdelim} =~ /^tab$/i) ? ".tsv" : ".csv";

  if (!$Options{overwrite} && -e $NewTextFile) {
    die "Error: The file $NewTextFile already exists.\n";
  }

  # Don't let a name derived from the specified root clobber an input file...
  if ($Options{root}) {
    for $TextFile (@TextFilesList) {
      if (lc($NewTextFile) eq lc($TextFile)) {
        die "Error: Output filename, $NewTextFile, is similar to a input file name.\nSpecify a different name using \"-r --root\" option or use default name.\n";
      }
    }
  }

  $OptionsInfo{NewTextFile} = $NewTextFile;
}
| 632 | |
# Setup script usage and retrieve command line arguments specified using various options...
#
# Fills %Options with default values, lets GetOptions override them using the
# command line, changes into the working directory when one is specified, and
# validates values of options which only allow a fixed set of values. Dies
# with an error message for any invalid option or option value.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();

  $Options{mode} = "colnum";
  $Options{indelim} = "comma";
  $Options{outdelim} = "comma";
  $Options{quote} = "yes";
  $Options{startcolmode} = "after";

  if (!GetOptions(\%Options, "help|h", "indelim=s", "columns|c=s", "keys|k=s", "mode|m=s", "outdelim=s", "overwrite|o", "quote|q=s", "root|r=s", "startcol|s=s", "startcolmode=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if ($Options{mode} !~ /^(colnum|collabel)$/i) {
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: colnum, or collabel\n";
  }
  if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
    die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{quote} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
  }
  if ($Options{startcolmode} !~ /^(before|after)$/i) {
    # Report the offending start column mode value itself...
    die "Error: The value specified, $Options{startcolmode}, for option \"--startcolmode\" is not valid. Allowed values: before or after\n";
  }
}
| 670 | |
| 671 __END__ | |
| 672 | |
| 673 =head1 NAME | |
| 674 | |
| 675 MergeTextFiles.pl - Merge multiple CSV or TSV text files into a single text file | |
| 676 | |
| 677 =head1 SYNOPSIS | |
| 678 | |
| 679 MergeTextFiles.pl TextFiles... | |
| 680 | |
| 681 MergeTextFiles.pl [B<-h, --help>] [B<--indelim> comma | semicolon] [B<-c, --columns> colnum,...;... | collabel,...;...] | |
| 682 [B<-k, --keys> colnum,...;... | collabel,...;...] [B<-m, --mode> colnum | collabel] | |
| 683 [B<-o, --overwrite>] [B<--outdelim> comma | tab | semicolon] [B<-q, --quote> yes | no] | |
| 684 [B<-r, --root> rootname] [B<-s, --startcol> colnum | collabel] [B<--startcolmode> before | after] | |
| 685 [B<-w, --workingdir> dirname] TextFiles... | |
| 686 | |
| 687 =head1 DESCRIPTION | |
| 688 | |
| 689 Merge multiple CSV or TSV I<TextFiles> into first I<TextFile> to generate a single | |
| 690 text file. Unless B<-k --keys> option is used, data rows from other I<TextFiles> | |
| 691 are added to first I<TextFile> in a sequential order, and the number of rows in first | |
| 692 I<TextFile> is used to determine how many rows of data are added from other | |
| 693 I<TextFiles>. | |
| 694 | |
| 695 Multiple I<TextFiles> names are separated by space. The valid file extensions are I<.csv> and | |
| 696 I<.tsv> for comma/semicolon and tab delimited text files respectively. All other file names | |
| 697 are ignored. All the text files in a current directory can be specified by I<*.csv>, | |
| 698 I<*.tsv>, or the current directory name. The B<--indelim> option determines the | |
| 699 format of I<TextFiles>. Any file which doesn't correspond to the format indicated | |
| 700 by B<--indelim> option is ignored. | |
| 701 | |
| 702 =head1 OPTIONS | |
| 703 | |
| 704 =over 4 | |
| 705 | |
| 706 =item B<-h, --help> | |
| 707 | |
| 708 Print this help message. | |
| 709 | |
| 710 =item B<--indelim> I<comma | semicolon> | |
| 711 | |
| 712 Input delimiter for CSV I<TextFile(s)>. Possible values: I<comma or semicolon>. | |
| 713 Default value: I<comma>. For TSV files, this option is ignored and I<tab> is used as a | |
| 714 delimiter. | |
| 715 | |
| 716 =item B<-c, --columns> I<colnum,...;... | collabel,...;...> | |
| 717 | |
| 718 This value is mode specific. It is a list of columns to merge into first | |
| 719 text file specified by column numbers or labels for each text file | |
| 720 delimited by ";". All specified text files are merged into first text file. | |
| 721 | |
| 722 Default value: I<all;all;...>. By default, all columns from specified text files are | |
| 723 merged into first text file. | |
| 724 | |
| 725 For I<colnum> mode, input value format is: I<colnum,...;colnum,...;...>. Example: | |
| 726 | |
| 727 "1,2;1,3,4;7,8,9" | |
| 728 | |
| 729 For I<collabel> mode, input value format is: I<collabel,...;collabel,...;...>. Example: | |
| 730 | |
| 731 "MW,SumNO;SumNHOH,ClogP,PSA;MolName,Mol_Id,Extreg" | |
| 732 | |
| 733 =item B<-k, --keys> I<colnum,...;... | collabel,...;...> | |
| 734 | |
| 735 This value is mode specific. It specifies column keys to use for merging | |
| 736 all specified text files into first text file. The column keys are specified by | |
| 737 column numbers or labels for each text file delimited by ";". | |
| 738 | |
| 739 By default, data rows from text files are merged into first file in the order they appear. | |
| 740 | |
| 741 For I<colnum> mode, input value format is: I<colkeynum;colkeynum;...>. Example: | |
| 742 | |
| 743 "1;3;7" | |
| 744 | |
| 745 For I<collabel> mode, input value format is: I<colkeylabel;colkeylabel;...>. Example: | |
| 746 | |
| 747 "Mol_Id;Mol_Id;Cmpd_Id" | |
| 748 | |
| 749 =item B<-m, --mode> I<colnum | collabel> | |
| 750 | |
| 751 Specify how to merge text files: using column numbers or column labels. | |
| 752 Possible values: I<colnum or collabel>. Default value: I<colnum>. | |
| 753 | |
| 754 =item B<-o, --overwrite> | |
| 755 | |
| 756 Overwrite existing files. | |
| 757 | |
| 758 =item B<--outdelim> I<comma | tab | semicolon> | |
| 759 | |
| 760 Output text file delimiter. Possible values: I<comma, tab, or semicolon> | |
| 761 Default value: I<comma>. | |
| 762 | |
| 763 =item B<-q, --quote> I<yes | no> | |
| 764 | |
| 765 Put quotes around column values in output text file. Possible values: I<yes or | |
| 766 no>. Default value: I<yes>. | |
| 767 | |
| 768 =item B<-r, --root> I<rootname> | |
| 769 | |
| 770 New text file name is generated using the root: <Root>.<Ext>. Default file | |
| 771 name: <FirstTextFileName>1To<Count>Merged.<Ext>. The csv, and tsv | |
| 772 <Ext> values are used for comma/semicolon, and tab delimited text files | |
| 773 respectively. | |
| 774 | |
| 775 =item B<-s, --startcol> I<colnum | collabel> | |
| 776 | |
| 777 This value is mode specific. It specifies the column in first text file which is | |
| 778 used to start merging other text files. For I<colnum> mode, specify column | |
| 779 number and for I<collabel> mode, specify column label. | |
| 780 | |
| 781 Default value: I<last>. Start merge after the last column. | |
| 782 | |
| 783 =item B<--startcolmode> I<before | after> | |
| 784 | |
| 785 Start the merge before or after the B<-s, --startcol> value. Possible values: I<before or after> | |
| 786 Default value: I<after>. | |
| 787 | |
| 788 =item B<-w, --workingdir> I<dirname> | |
| 789 | |
| 790 Location of working directory. Default: current directory. | |
| 791 | |
| 792 =back | |
| 793 | |
| 794 =head1 EXAMPLES | |
| 795 | |
| 796 To merge Sample2.csv and Sample3.csv into Sample1.csv and generate | |
| 797 NewSample.csv, type: | |
| 798 | |
| 799 % MergeTextFiles.pl -r NewSample -o Sample1.csv Sample2.csv | |
| 800 Sample3.csv | |
| 801 | |
| 802 To merge all Sample*.csv files and generate NewSample.tsv file, type: | |
| 803 | |
| 804 % MergeTextFiles.pl -r NewSample --indelim comma --outdelim tab -o | |
| 805 Sample*.csv | |
| 806 | |
| 807 To merge column numbers "1,2" and "3,4,5" from Sample2.csv and Sample3.csv | |
| 808 into Sample1.csv starting before column number 3 in Sample1.csv and to generate | |
| 809 NewSample.csv without quoting column data, type: | |
| 810 | |
| 811 % MergeTextFiles.pl -s 3 --startcolmode before -r NewSample -q no | |
| 812 -m colnum -c "all;1,2;3,4,5" -o Sample1.csv Sample2.csv | |
| 813 Sample3.csv | |
| 814 | |
| 815 To merge column "Mol_ID,Formula,MolWeight" and "Mol_ID,NAME,ChemBankID" | |
| 816 from Sample2.csv and Sample3.csv into Sample1.csv using "Mol_ID" as a column keys | |
| 817 starting after the last column and to generate NewSample.tsv, type: | |
| 818 | |
| 819 % MergeTextFiles.pl -r NewSample --outdelim tab -k "Mol_ID;Mol_ID; | |
| 820 Mol_ID" -m collabel -c "all;Mol_ID,Formula,MolWeight;Mol_ID,NAME, | |
| 821 ChemBankID" -o Sample1.csv Sample2.csv Sample3.csv | |
| 822 | |
| 823 =head1 AUTHOR | |
| 824 | |
| 825 Manish Sud <msud@san.rr.com> | |
| 826 | |
| 827 =head1 SEE ALSO | |
| 828 | |
| 829 JoinTextFiles.pl, MergeTextFilesWithSD.pl, ModifyTextFilesFormat.pl, SplitTextFiles.pl | |
| 830 | |
| 831 =head1 COPYRIGHT | |
| 832 | |
| 833 Copyright (C) 2015 Manish Sud. All rights reserved. | |
| 834 | |
| 835 This file is part of MayaChemTools. | |
| 836 | |
| 837 MayaChemTools is free software; you can redistribute it and/or modify it under | |
| 838 the terms of the GNU Lesser General Public License as published by the Free | |
| 839 Software Foundation; either version 3 of the License, or (at your option) | |
| 840 any later version. | |
| 841 | |
| 842 =cut |
