comparison mayachemtools/bin/MergeTextFiles.pl @ 0:73ae111cf86f draft

Uploaded
author deepakjadmin
date Wed, 20 Jan 2016 11:55:01 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:73ae111cf86f
1 #!/usr/bin/perl -w
2 #
3 # $RCSfile: MergeTextFiles.pl,v $
4 # $Date: 2015/02/28 20:46:20 $
5 # $Revision: 1.40 $
6 #
7 # Author: Manish Sud <msud@san.rr.com>
8 #
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
10 #
11 # This file is part of MayaChemTools.
12 #
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
14 # the terms of the GNU Lesser General Public License as published by the Free
15 # Software Foundation; either version 3 of the License, or (at your option) any
16 # later version.
17 #
18 # MayaChemTools is distributed in the hope that it will be useful, but without
19 # any warranty; without even the implied warranty of merchantability or fitness
20 # for a particular purpose. See the GNU Lesser General Public License for more
21 # details.
22 #
23 # You should have received a copy of the GNU Lesser General Public License
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
26 # Boston, MA, 02111-1307, USA.
27 #
28
29 use strict;
30 use FindBin; use lib "$FindBin::Bin/../lib";
31 use Getopt::Long;
32 use File::Basename;
33 use Text::ParseWords;
34 use Benchmark;
35 use FileHandle;
36 use FileUtil;
37 use TextUtil;
38
# Script-wide state. These file-scoped lexicals are read and written by the
# subroutines defined further down in this file.
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName:Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@TextFilesList);
@TextFilesList = ExpandFileNames(\@ARGV, "csv tsv");

# A merge needs a first file plus at least one other file. Testing for fewer than
# two names also rejects an empty expanded list, which would otherwise surface
# later as an unrelated start column error.
if (@TextFilesList < 2) {
  die "Error: Specify more than one text file.\n";
}

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Setup information about input files...
my(%TextFilesInfo);
print "Checking input text files...\n";
RetrieveTextFilesInfo();
RetrieveColumnsAndKeysInfo();

# Merge files...
print "\nGenerating new text file $OptionsInfo{NewTextFile}...\n";
MergeTextFiles();

print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
82
83 ###############################################################################
84
# Merge all valid Text files...
#
# Creates $OptionsInfo{NewTextFile}, writes the merged column label line, opens
# every file in @TextFilesList (handles are kept in $TextFilesInfo{FileHandle})
# and consumes one line from each via GetTextLine so that the column label line
# is skipped. The rows are then merged by key when $OptionsInfo{Keys} is set and
# sequentially otherwise. All handles are closed before returning.
#
# Dies when the output file or any input file can't be opened, or when the
# output file can't be closed cleanly.
sub MergeTextFiles {
  my($Index, $InputFileHandle);

  # Three-argument open: the file name is used literally and is never scanned for
  # mode or pipe characters. NEWTEXTFILE is kept as a bareword handle so that any
  # code printing to NEWTEXTFILE by name keeps working.
  open NEWTEXTFILE, ">", $OptionsInfo{NewTextFile} or die "Error: Couldn't open $OptionsInfo{NewTextFile}: $! \n";

  WriteNewTextFileColumnLabels(\*NEWTEXTFILE);

  # Open up all the files and skip column label line...
  @{$TextFilesInfo{FileHandle}} = ();
  for $Index (0 .. $#TextFilesList) {
    $InputFileHandle = FileHandle->new;

    open $InputFileHandle, "<", $TextFilesList[$Index] or die "Error: Couldn't open $TextFilesList[$Index]: $! \n";
    $TextFilesInfo{FileHandle}[$Index] = $InputFileHandle;

    GetTextLine($TextFilesInfo{FileHandle}[$Index]);
  }

  # Merge files...
  if ($OptionsInfo{Keys}) {
    MergeColumnValuesUsingKeys(\*NEWTEXTFILE);
  }
  else {
    MergeColumnValues(\*NEWTEXTFILE);
  }

  # Close all opened files. Buffered write errors only show up at close time, so
  # the result is checked for the output file...
  close NEWTEXTFILE or die "Error: Couldn't close $OptionsInfo{NewTextFile}: $! \n";
  for $Index (0 .. $#TextFilesList) {
    close $TextFilesInfo{FileHandle}[$Index];
  }

}
117
# Merge all the column values...
#
# Sequential merge: for each row read from the first file, one row is read from
# each of the other files and the selected columns are written to
# $NewTextFileRef in this order: first file columns before the merge point,
# columns to merge from the other files, first file columns after the merge point.
#
# The loop is driven by the first file. When one of the other files has no row
# left, blank values are written for each of its columns to merge so that the
# values which follow stay under their own column labels.
sub MergeColumnValues {
  my($NewTextFileRef) = @_;
  my($Index, $Line, $InDelim, $Value, $ColNum, @LineWords, @File1LineWords, @ColValues);

  # NOTE(review): a row whose whole text is "0" is false in this test and would end
  # the merge; confirm what GetTextLine returns at end of file before tightening it.
  while ($Line = GetTextLine($TextFilesInfo{FileHandle}[0])) {
    $InDelim = $TextFilesInfo{InDelim}[0];
    @ColValues = ();

    # Collect column values from first file before the merge point...
    @File1LineWords = quotewords($InDelim, 0, $Line);
    for $ColNum (@{$TextFilesInfo{File1Part1ColNums}}) {
      $Value = ($ColNum < @File1LineWords) ? $File1LineWords[$ColNum] : "";
      push @ColValues, $Value;
    }

    # Collect column values from other text files...
    for $Index (1 .. $#TextFilesList) {
      $InDelim = $TextFilesInfo{InDelim}[$Index];

      # An exhausted file leaves @LineWords empty; the loop below then pads with blanks...
      @LineWords = ();
      if ($Line = GetTextLine($TextFilesInfo{FileHandle}[$Index])) {
        @LineWords = quotewords($InDelim, 0, $Line);
      }
      for $ColNum (@{$TextFilesInfo{ColToMerge}[$Index]}) {
        $Value = ($ColNum < @LineWords) ? $LineWords[$ColNum] : "";
        push @ColValues, $Value;
      }
    }

    # Collect column values from first file after the merge point...
    for $ColNum (@{$TextFilesInfo{File1Part2ColNums}}) {
      $Value = ($ColNum < @File1LineWords) ? $File1LineWords[$ColNum] : "";
      push @ColValues, $Value;
    }

    # Write it out...
    $Line = JoinWords(\@ColValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileRef "$Line\n";
  }

}
158
# Merge column values using keys...
#
# Key based merge. All remaining rows of every file except the first one are read
# into memory and indexed by the value found in that file's key column
# ($TextFilesInfo{KeysToUse}). Each row of the first file is then written to
# $NewTextFileRef together with the columns to merge from the row in each other
# file which has the same key value; blank values are written when no row matches.
#
# A key column number below zero marks a key which couldn't be resolved. It is
# never used as an array index, because a negative index would silently select
# the last column of the row as the key.
sub MergeColumnValuesUsingKeys {
  my($NewTextFileRef) = @_;
  my($Index, $InDelim, $Line, $Value, $ColNum, $KeyColNum, $KeyColValue, @LineWords, @ColValues, @File1LineWords, @TextFilesKeysToLinesMap);

  @TextFilesKeysToLinesMap = ();

  # Retrieve text lines from all the files except for the first file...
  for $Index (1 .. $#TextFilesList) {
    $TextFilesKeysToLinesMap[$Index] = {};

    $InDelim = $TextFilesInfo{InDelim}[$Index];
    $KeyColNum = $TextFilesInfo{KeysToUse}[$Index];

    LINE: while ($Line = GetTextLine($TextFilesInfo{FileHandle}[$Index])) {
      @LineWords = quotewords($InDelim, 0, $Line);

      # Rows without a usable key column are skipped...
      if ($KeyColNum < 0 || $KeyColNum >= @LineWords) {
        next LINE;
      }
      $KeyColValue = $LineWords[$KeyColNum];
      if (!(defined($KeyColValue) && length($KeyColValue))) {
        next LINE;
      }

      # First row wins for a repeated key value...
      if (exists($TextFilesKeysToLinesMap[$Index]{$KeyColValue})) {
        warn "Warning: Ignoring line, $Line, in text file $TextFilesList[$Index]: Column key value, $KeyColValue, already exists\n";
        next LINE;
      }
      $TextFilesKeysToLinesMap[$Index]{$KeyColValue} = [@LineWords];
    }
  }

  while ($Line = GetTextLine($TextFilesInfo{FileHandle}[0])) {
    $InDelim = $TextFilesInfo{InDelim}[0];

    @ColValues = ();
    @File1LineWords = quotewords($InDelim, 0, $Line);

    # An empty key value never matches: only non-empty keys are stored above...
    $KeyColNum = $TextFilesInfo{KeysToUse}[0];
    $KeyColValue = "";
    if ($KeyColNum >= 0 && $KeyColNum < @File1LineWords && defined($File1LineWords[$KeyColNum])) {
      $KeyColValue = $File1LineWords[$KeyColNum];
    }

    # Collect column values from first file before the merge point...
    for $ColNum (@{$TextFilesInfo{File1Part1ColNums}}) {
      $Value = ($ColNum < @File1LineWords) ? $File1LineWords[$ColNum] : "";
      push @ColValues, $Value;
    }

    # Collect column values from other text files...
    for $Index (1 .. $#TextFilesList) {
      @LineWords = ();
      if (exists($TextFilesKeysToLinesMap[$Index]{$KeyColValue})) {
        push @LineWords, @{$TextFilesKeysToLinesMap[$Index]{$KeyColValue}};
      }
      for $ColNum (@{$TextFilesInfo{ColToMerge}[$Index]}) {
        $Value = ($ColNum < @LineWords) ? $LineWords[$ColNum] : "";
        push @ColValues, $Value;
      }
    }

    # Collect column values from first file after the merge point...
    for $ColNum (@{$TextFilesInfo{File1Part2ColNums}}) {
      $Value = ($ColNum < @File1LineWords) ? $File1LineWords[$ColNum] : "";
      push @ColValues, $Value;
    }

    # Write it out...
    $Line = JoinWords(\@ColValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileRef "$Line\n";
  }

}
229
# Write out column labels for new merged text file...
#
# Builds the label line in the same order used for the data rows: labels of the
# first file's columns before the merge point, labels of the columns to merge
# from each of the other files, labels of the first file's columns after the
# merge point. The line is written to the file handle passed in as
# $NewTextFileRef.
sub WriteNewTextFileColumnLabels {
  my($NewTextFileRef) = @_;
  my($Index, $Line, $ColNum, @ColLabels);

  @ColLabels = ();

  # Collect column labels from first file before the merge point...
  for $ColNum (@{$TextFilesInfo{File1Part1ColNums}}) {
    push @ColLabels, $TextFilesInfo{ColToMergeNumToLabelMap}[0]{$ColNum};
  }

  # Collect column labels from other text files...
  for $Index (1 .. $#TextFilesList) {
    for $ColNum (@{$TextFilesInfo{ColToMerge}[$Index]}) {
      push @ColLabels, $TextFilesInfo{ColToMergeNumToLabelMap}[$Index]{$ColNum};
    }
  }

  # Collect column labels from first file after the merge point...
  for $ColNum (@{$TextFilesInfo{File1Part2ColNums}}) {
    push @ColLabels, $TextFilesInfo{ColToMergeNumToLabelMap}[0]{$ColNum};
  }

  # Write it out to the handle supplied by the caller rather than to a handle
  # hard-coded by name...
  $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print {$NewTextFileRef} "$Line\n";
}
259
# Set up column, key and start column information for the specified options.
# The three steps run in a fixed order; key processing is skipped unless
# $OptionsInfo{Keys} is set.
sub RetrieveColumnsAndKeysInfo {
  ProcessColumnsInfo();
  ProcessKeysInfo() if $OptionsInfo{Keys};
  ProcessStartColInfo();
}
270
# Process specified columns...
#
# For every input file this fills in:
#
#   $TextFilesInfo{ColSpecified}[$Index]            - column numbers or labels as specified
#   $TextFilesInfo{ColToMerge}[$Index]              - zero based column numbers, sorted ascending
#   $TextFilesInfo{ColToMergeNumToLabelMap}[$Index] - zero based column number to label map
#
# The per file value is taken from $OptionsInfo{ColValues} when "-c --columns" was
# used; otherwise it is "all". Only a value which is exactly "all" (any case)
# selects every column; a comma delimited list which merely contains those letters,
# such as a column labelled "Overall", is treated as a list. Column numbers or
# labels which don't exist in the file are ignored with a warning.
sub ProcessColumnsInfo {
  my($Index, $SpecifiedColNum, $Values, $ColNum, $ColLabel);

  @{$TextFilesInfo{ColSpecified}} = ();
  @{$TextFilesInfo{ColToMerge}} = ();
  @{$TextFilesInfo{ColToMergeNumToLabelMap}} = ();

  for $Index (0 .. $#TextFilesList) {

    $Values = $OptionsInfo{Columns} ? $OptionsInfo{ColValues}[$Index] : "all";

    if ($Values =~ /^all$/i) {
      if ($OptionsInfo{Mode} =~ /^colnum$/i) {
        # Column numbers are one based on the command line...
        @{$TextFilesInfo{ColSpecified}[$Index]} = (1 .. $TextFilesInfo{ColCount}[$Index]);
      }
      else {
        @{$TextFilesInfo{ColSpecified}[$Index]} = @{$TextFilesInfo{ColLabels}[$Index]};
      }
    }
    else {
      @{$TextFilesInfo{ColSpecified}[$Index]} = split ",", $Values;
    }

    @{$TextFilesInfo{ColToMerge}[$Index]} = ();
    %{$TextFilesInfo{ColToMergeNumToLabelMap}[$Index]} = ();

    if ($OptionsInfo{Mode} =~ /^collabel$/i) {
      for $ColLabel (@{$TextFilesInfo{ColSpecified}[$Index]}) {
        if (exists($TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel})) {
          $ColNum = $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel};
          push @{$TextFilesInfo{ColToMerge}[$Index]}, $ColNum;
          $TextFilesInfo{ColToMergeNumToLabelMap}[$Index]{$ColNum} = $ColLabel;
        }
        else {
          warn "Warning: Ignoring value, $ColLabel, specified using \"-c --column\" option: column name doesn't exist in $TextFilesList[$Index] \n";
        }
      }
    }
    else {
      for $SpecifiedColNum (@{$TextFilesInfo{ColSpecified}[$Index]}) {
        if ($SpecifiedColNum > 0 && $SpecifiedColNum <= $TextFilesInfo{ColCount}[$Index]) {
          # Convert to a zero based column number...
          $ColNum = $SpecifiedColNum - 1;
          push @{$TextFilesInfo{ColToMerge}[$Index]}, $ColNum;
          $TextFilesInfo{ColToMergeNumToLabelMap}[$Index]{$ColNum} = $TextFilesInfo{ColLabels}[$Index][$ColNum];
        }
        else {
          warn "Warning: Ignoring value, $SpecifiedColNum, specified using \"-c --column\" option: column number doesn't exist in $TextFilesList[$Index] \n";
        }
      }
    }

    # Columns are merged in file order irrespective of the order in which they were specified...
    @{$TextFilesInfo{ColToMerge}[$Index]} = sort { $a <=> $b } @{$TextFilesInfo{ColToMerge}[$Index]};
  }
}
337
# Resolve the key column specified for each input file.
#
# $TextFilesInfo{KeysSpecified}[$Index] keeps the value as specified and
# $TextFilesInfo{KeysToUse}[$Index] gets the zero based column number, or -1
# when the specified value doesn't identify a column in that file (a warning is
# issued). For every file except the first one, the key column is then dropped
# from the list of columns to merge.
sub ProcessKeysInfo {
  my($Index, $Key, $ColLabel, $ColNum, $KeyColNum, $LabelMode);

  @{$TextFilesInfo{KeysSpecified}} = ();
  @{$TextFilesInfo{KeysToUse}} = ();

  $LabelMode = ($OptionsInfo{Mode} =~ /^collabel$/i) ? 1 : 0;

  FILE: for $Index (0 .. $#TextFilesList) {
    $Key = $OptionsInfo{KeyValues}[$Index];

    $TextFilesInfo{KeysSpecified}[$Index] = $Key;
    $TextFilesInfo{KeysToUse}[$Index] = -1;

    if ($LabelMode) {
      $ColLabel = $Key;
      if (!exists($TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel})) {
        warn "Warning: Ignoring value, $ColLabel, specified using \"-k --keys\" option: column name doesn't exist in $TextFilesList[$Index] \n";
        next FILE;
      }
      $TextFilesInfo{KeysToUse}[$Index] = $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel};
      next FILE;
    }

    $ColNum = $Key;
    if (!($ColNum > 0 && $ColNum <= $TextFilesInfo{ColCount}[$Index])) {
      warn "Warning: Ignoring value, $ColNum, specified using \"-k --keys\" option: column number doesn't exist in $TextFilesList[$Index] \n";
      next FILE;
    }
    $TextFilesInfo{KeysToUse}[$Index] = $ColNum - 1;
  }

  # Take the key column off the list of columns to merge for all but the first
  # text file...
  for $Index (1 .. $#TextFilesList) {
    $KeyColNum = $TextFilesInfo{KeysToUse}[$Index];
    @{$TextFilesInfo{ColToMerge}[$Index]} = grep { $KeyColNum != $_ } @{$TextFilesInfo{ColToMerge}[$Index]};
  }
}
386
# Process specified start column value...
#
# Splits the first file's columns to merge into two lists around the start column:
#
#   $TextFilesInfo{File1Part1ColNums} - columns written before the merged columns
#   $TextFilesInfo{File1Part2ColNums} - columns written after the merged columns
#
# The start column comes from $OptionsInfo{StartCol} (a one based column number or
# a column label depending on $OptionsInfo{Mode}). Without it the value is "last",
# which resolves to the highest numbered column on the first file's list of columns
# to merge, so that the default keeps working when only a subset of the first
# file's columns was selected. $OptionsInfo{StartColMode} decides whether the
# start column itself ends up in the first or the second list.
#
# Dies when the start column doesn't exist in the first file or isn't on its list
# of columns to merge; messages report the value as it was specified.
sub ProcessStartColInfo {
  my($Index, $ColNum, $SpecifiedStartCol, $StartColNum, $Part1StartColNum, $Part1EndColNum, $Part2StartColNum, $Part2EndColNum, $BeforeStartColNum, $AfterStartColNum, $FirstColNum, $LastColNum, $FirstIndex, $LastIndex, @File1ColsToMerge);

  @{$TextFilesInfo{File1Part1ColNums}} = ();
  @{$TextFilesInfo{File1Part2ColNums}} = ();

  # Zero based and sorted ascending by the time columns have been processed...
  @File1ColsToMerge = @{$TextFilesInfo{ColToMerge}[0] || []};

  $SpecifiedStartCol = "last";
  if ($OptionsInfo{StartCol} && length($OptionsInfo{StartCol})) {
    $SpecifiedStartCol = $OptionsInfo{StartCol};
  }

  if ($SpecifiedStartCol =~ /^last$/i) {
    # With nothing to merge, fall back to the last column in the file; the check
    # against the list of columns to merge below then reports the problem...
    $StartColNum = @File1ColsToMerge ? $File1ColsToMerge[-1] : ($TextFilesInfo{ColCount}[0] - 1);
  }
  elsif ($OptionsInfo{Mode} =~ /^collabel$/i) {
    if (!exists($TextFilesInfo{ColLabelToNumMap}[0]{$SpecifiedStartCol})) {
      die "Error: Invalid value $SpecifiedStartCol specified using \"-s --startcol\" option: column name doesn't exist in $TextFilesList[0] \n";
    }
    $StartColNum = $TextFilesInfo{ColLabelToNumMap}[0]{$SpecifiedStartCol};
  }
  else {
    if (!($SpecifiedStartCol > 0 && $SpecifiedStartCol <= $TextFilesInfo{ColCount}[0])) {
      die "Error: Invalid value $SpecifiedStartCol specified using \"-s --startcol\" option: column number doesn't exist in $TextFilesList[0] \n";
    }
    $StartColNum = $SpecifiedStartCol - 1;
  }

  # Make sure StartColNum is present on the list of columns to merge for the first text file...
  if (!exists($TextFilesInfo{ColToMergeNumToLabelMap}[0]{$StartColNum})) {
    die "Error: Invalid value $SpecifiedStartCol specified using \"-s --startcol\" option: doesn't exist in the specified lists of columns to merge for $TextFilesList[0] \n";
  }

  # Find out the column number before and after StartColNum in first text file...
  $BeforeStartColNum = $StartColNum;
  $AfterStartColNum = $StartColNum;

  $FirstIndex = 0; $LastIndex = $#File1ColsToMerge;

  $FirstColNum = $File1ColsToMerge[$FirstIndex];
  $LastColNum = $File1ColsToMerge[$LastIndex];

  for $Index (0 .. $LastIndex) {
    if ($File1ColsToMerge[$Index] == $StartColNum) {
      # A neighbour outside the list is represented by a column number just past
      # the corresponding end, which makes the range it bounds empty...
      $BeforeStartColNum = (($Index - 1) >= $FirstIndex) ? $File1ColsToMerge[$Index - 1] : ($FirstColNum - 1);
      $AfterStartColNum = (($Index + 1) <= $LastIndex) ? $File1ColsToMerge[$Index + 1] : ($LastColNum + 1);
    }
  }

  if ($OptionsInfo{StartColMode} =~ /^after$/i) {
    $Part1StartColNum = $FirstColNum; $Part1EndColNum = $StartColNum;
    $Part2StartColNum = $AfterStartColNum; $Part2EndColNum = $LastColNum;
  }
  else {
    $Part1StartColNum = $FirstColNum; $Part1EndColNum = $BeforeStartColNum;
    $Part2StartColNum = $StartColNum; $Part2EndColNum = $LastColNum;
  }

  for $ColNum (@File1ColsToMerge) {
    if ($ColNum >= $Part1StartColNum && $ColNum <= $Part1EndColNum) {
      push @{$TextFilesInfo{File1Part1ColNums}}, $ColNum;
    }
  }

  for $ColNum (@File1ColsToMerge) {
    if ($ColNum >= $Part2StartColNum && $ColNum <= $Part2EndColNum) {
      push @{$TextFilesInfo{File1Part2ColNums}}, $ColNum;
    }
  }

}
471
# Retrieve information about input text files...
#
# Resets %TextFilesInfo and, for each file in @TextFilesList, records:
#
#   $TextFilesInfo{FileOkay}[$Index]         - 1 when the file passed all the checks
#   $TextFilesInfo{InDelim}[$Index]          - delimiter used to split its lines
#   $TextFilesInfo{ColCount}[$Index]         - number of column labels on its first line
#   $TextFilesInfo{ColLabels}[$Index]        - column labels in file order
#   $TextFilesInfo{ColLabelToNumMap}[$Index] - column label to zero based column number
#
# A file is rejected with a warning when it doesn't exist, fails CheckFileType for
# "csv tsv", is not a tsv file while "--indelim" isn't comma or semicolon, or can't
# be opened. Dies after all files have been looked at when any file was rejected.
sub RetrieveTextFilesInfo {
  my($Index, $TextFile, $FileDir, $FileName, $FileExt, $InDelim, $Line, $ColNum, $ColLabel, $FileNotOkayCount, $TextFileHandle, @ColLabels);

  %TextFilesInfo = ();

  @{$TextFilesInfo{FileOkay}} = ();
  @{$TextFilesInfo{ColCount}} = ();
  @{$TextFilesInfo{ColLabels}} = ();
  @{$TextFilesInfo{ColLabelToNumMap}} = ();
  @{$TextFilesInfo{InDelim}} = ();

  $FileNotOkayCount = 0;

  FILELIST: for $Index (0 .. $#TextFilesList) {
    $TextFile = $TextFilesList[$Index];

    $TextFilesInfo{FileOkay}[$Index] = 0;
    $TextFilesInfo{ColCount}[$Index] = 0;
    $TextFilesInfo{InDelim}[$Index] = "";

    @{$TextFilesInfo{ColLabels}[$Index]} = ();
    %{$TextFilesInfo{ColLabelToNumMap}[$Index]} = ();

    if (!(-e $TextFile)) {
      warn "Warning: Ignoring file $TextFile: It doesn't exist\n";
      $FileNotOkayCount++;
      next FILELIST;
    }
    if (!CheckFileType($TextFile, "csv tsv")) {
      warn "Warning: Ignoring file $TextFile: It's not a csv or tsv file\n";
      $FileNotOkayCount++;
      next FILELIST;
    }

    # Tab is always used for tsv files; for everything else the delimiter follows "--indelim"...
    ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
    if ($FileExt =~ /^tsv$/i) {
      $InDelim = "\t";
    }
    else {
      $InDelim = ",";
      if ($OptionsInfo{InDelim} !~ /^(comma|semicolon)$/i) {
        warn "Warning: Ignoring file $TextFile: The value specified, $OptionsInfo{InDelim}, for option \"--indelim\" is not valid for csv files\n";
        $FileNotOkayCount++;
        next FILELIST;
      }
      if ($OptionsInfo{InDelim} =~ /^semicolon$/i) {
        $InDelim = ";";
      }
    }

    # Three-argument open on a lexical handle: the name is used literally and the
    # handle can't collide with any other handle in the script...
    $TextFileHandle = undef;
    if (!open($TextFileHandle, "<", $TextFile)) {
      warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
      $FileNotOkayCount++;
      next FILELIST;
    }

    # Only the column label line is needed here...
    $Line = GetTextLine($TextFileHandle);
    @ColLabels = quotewords($InDelim, 0, $Line);
    close $TextFileHandle;

    $TextFilesInfo{FileOkay}[$Index] = 1;
    $TextFilesInfo{InDelim}[$Index] = $InDelim;

    $TextFilesInfo{ColCount}[$Index] = @ColLabels;
    push @{$TextFilesInfo{ColLabels}[$Index]}, @ColLabels;
    for $ColNum (0 .. $#ColLabels) {
      $ColLabel = $ColLabels[$ColNum];
      $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel} = $ColNum;
    }
  }

  # Make sure all specified files are valid for merging to work properly...
  if ($FileNotOkayCount) {
    die "Error: Problems with input text file(s)...\n";
  }
}
547
# Turn the raw command line values in %Options into %OptionsInfo.
#
# Besides copying values over, this splits the per file "-c --columns" and
# "-k --keys" values on ";" (one value per input file, no empty values), maps
# the output delimiter and quote settings to the values used for writing, and
# works out the name of the new text file. Dies on an invalid "-c"/"-k" value,
# when the new file already exists without "-o --overwrite", or when a name
# derived from "-r --root" matches an input file name irrespective of case.
sub ProcessOptions {
  my($TextFile, $FileDir, $FileName, $FileExt, $NewTextFile, @ColValues, @KeyValues);

  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};

  # Columns to merge: one value per input file...
  $OptionsInfo{Columns} = $Options{columns};
  @{$OptionsInfo{ColValues}} = ();

  if ($Options{columns}) {
    @ColValues = split ";", $Options{columns};
    if (scalar(@ColValues) != scalar(@TextFilesList)) {
      die "Error: Invalid number of values specified by \"-c --columns\" option: it must be equal to number of input text files.\n";
    }
    if (grep { !length($_) } @ColValues) {
      die "Error: Invalid value specified by \"-c --columns\" option: empty values are not allowed.\n";
    }
    @{$OptionsInfo{ColValues}} = @ColValues;
  }

  # Key columns: one value per input file...
  $OptionsInfo{Keys} = $Options{keys};
  @{$OptionsInfo{KeyValues}} = ();

  if ($Options{keys}) {
    @KeyValues = split ";", $Options{keys};
    if (scalar(@KeyValues) != scalar(@TextFilesList)) {
      die "Error: Invalid number of values specified by \"-k --keys\" option: it must be equal to number of input text files.\n";
    }
    if (grep { !length($_) } @KeyValues) {
      die "Error: Invalid value specified by \"-k --keys\" option: empty values are not allowed.\n";
    }
    @{$OptionsInfo{KeyValues}} = @KeyValues;
  }

  $OptionsInfo{InDelim} = $Options{indelim};

  $OptionsInfo{StartCol} = $Options{startcol} ? $Options{startcol} : undef;
  $OptionsInfo{StartColMode} = $Options{startcolmode};

  $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : undef;
  $OptionsInfo{Overwrite} = $Options{overwrite} ? $Options{overwrite} : undef;

  # Output delimiter: anything other than tab or semicolon means comma...
  if ($Options{outdelim} =~ /^tab$/i) {
    $OptionsInfo{OutDelim} = "\t";
  }
  elsif ($Options{outdelim} =~ /^semicolon$/i) {
    $OptionsInfo{OutDelim} = ";";
  }
  else {
    $OptionsInfo{OutDelim} = ",";
  }
  $OptionsInfo{OutQuote} = ($Options{quote} =~ /^yes$/i) ? 1 : 0;

  # Name of the new text file without its extension...
  if ($Options{root}) {
    ($FileDir, $FileName, $FileExt) = ParseFileName($Options{root});
    $NewTextFile = ($FileName && $FileExt) ? $FileName : $Options{root};
  }
  else {
    ($FileDir, $FileName, $FileExt) = ParseFileName($TextFilesList[0]);
    $NewTextFile = $FileName . "1To" . scalar(@TextFilesList) . "Merged";
  }

  # The extension follows the output delimiter...
  $NewTextFile .= (($Options{outdelim} =~ /^tab$/i) ? ".tsv" : ".csv");

  if (!$Options{overwrite} && -e $NewTextFile) {
    die "Error: The file $NewTextFile already exists.\n";
  }

  if ($Options{root}) {
    for $TextFile (@TextFilesList) {
      if (lc($NewTextFile) eq lc($TextFile)) {
        die "Error: Output filename, $NewTextFile, is similar to a input file name.\nSpecify a different name using \"-r --root\" option or use default name.\n";
      }
    }
  }

  $OptionsInfo{NewTextFile} = $NewTextFile;
}
632
# Setup script usage and retrieve command line arguments specified using various options...
#
# Resets %Options to the defaults (mode colnum, comma as input and output
# delimiter, quoting on, start column mode after), lets GetOptions overlay the
# values given on the command line, changes into the working directory when
# "-w --workingdir" is given, and validates the options which take a fixed set of
# values. Dies with a message naming the offending option and its value when
# anything is invalid.
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();

  $Options{mode} = "colnum";
  $Options{indelim} = "comma";
  $Options{outdelim} = "comma";
  $Options{quote} = "yes";
  $Options{startcolmode} = "after";

  if (!GetOptions(\%Options, "help|h", "indelim=s", "columns|c=s", "keys|k=s", "mode|m=s", "outdelim=s", "overwrite|o", "quote|q=s", "root|r=s", "startcol|s=s", "startcolmode=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if ($Options{mode} !~ /^(colnum|collabel)$/i) {
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: colnum, or collabel\n";
  }
  if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
    die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{quote} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
  }
  if ($Options{startcolmode} !~ /^(before|after)$/i) {
    # Report the start column mode value itself, not the value of another option...
    die "Error: The value specified, $Options{startcolmode}, for option \"--startcolmode\" is not valid. Allowed values: before or after\n";
  }
}
670
671 __END__
672
673 =head1 NAME
674
675 MergeTextFiles.pl - Merge multiple CSV or TSV text files into a single text file
676
677 =head1 SYNOPSIS
678
679 MergeTextFiles.pl TextFiles...
680
681 MergeTextFiles.pl [B<-h, --help>] [B<--indelim> comma | semicolon] [B<-c, --columns> colnum,...;... | collabel,...;...]
682 [B<-k, --keys> colnum,...;... | collabel,...;...] [B<-m, --mode> colnum | collabel]
683 [B<-o, --overwrite>] [B<--outdelim> comma | tab | semicolon] [B<-q, --quote> yes | no]
684 [B<-r, --root> rootname] [B<-s, --startcol> colnum | collabel] [B<--startcolmode> before | after]
685 [B<-w, --workingdir> dirname] TextFiles...
686
687 =head1 DESCRIPTION
688
689 Merge multiple CSV or TSV I<TextFiles> into first I<TextFile> to generate a single
690 text file. Unless B<-k --keys> option is used, data rows from other I<TextFiles>
691 are added to first I<TextFile> in a sequential order, and the number of rows in first
692 I<TextFile> is used to determine how many rows of data are added from other
693 I<TextFiles>.
694
695 Multiple I<TextFiles> names are separated by space. The valid file extensions are I<.csv> and
696 I<.tsv> for comma/semicolon and tab delimited text files respectively. All other file names
697 are ignored. All the text files in a current directory can be specified by I<*.csv>,
698 I<*.tsv>, or the current directory name. The B<--indelim> option determines the
699 format of I<TextFiles>. Any file which doesn't correspond to the format indicated
700 by B<--indelim> option is ignored.
701
702 =head1 OPTIONS
703
704 =over 4
705
706 =item B<-h, --help>
707
708 Print this help message.
709
710 =item B<--indelim> I<comma | semicolon>
711
712 Input delimiter for CSV I<TextFile(s)>. Possible values: I<comma or semicolon>.
713 Default value: I<comma>. For TSV files, this option is ignored and I<tab> is used as a
714 delimiter.
715
716 =item B<-c, --columns> I<colnum,...;... | collabel,...;...>
717
718 This value is mode specific. It is a list of columns to merge into first
719 text file specified by column numbers or labels for each text file
720 delimited by ";". All specified text files are merged into first text file.
721
722 Default value: I<all;all;...>. By default, all columns from specified text files are
723 merged into first text file.
724
725 For I<colnum> mode, input value format is: I<colnum,...;colnum,...;...>. Example:
726
727 "1,2;1,3,4;7,8,9"
728
729 For I<collabel> mode, input value format is: I<collabel,...;collabel,...;...>. Example:
730
731 "MW,SumNO;SumNHOH,ClogP,PSA;MolName,Mol_Id,Extreg"
732
733 =item B<-k, --keys> I<colnum,...;... | collabel,...;...>
734
735 This value is mode specific. It specifies column keys to use for merging
736 all specified text files into first text file. The column keys are specified by
737 column numbers or labels for each text file delimited by ";".
738
739 By default, data rows from text files are merged into first file in the order they appear.
740
741 For I<colnum> mode, input value format is: I<colkeynum;colkeynum;...>. Example:
742
743 "1;3;7"
744
745 For I<collabel> mode, input value format is: I<colkeylabel;colkeylabel;...>. Example:
746
747 "Mol_Id;Mol_Id;Cmpd_Id"
748
749 =item B<-m, --mode> I<colnum | collabel>
750
751 Specify how to merge text files: using column numbers or column labels.
752 Possible values: I<colnum or collabel>. Default value: I<colnum>.
753
754 =item B<-o, --overwrite>
755
756 Overwrite existing files.
757
758 =item B<--outdelim> I<comma | tab | semicolon>
759
760 Output text file delimiter. Possible values: I<comma, tab, or semicolon>.
761 Default value: I<comma>.
762
763 =item B<-q, --quote> I<yes | no>
764
765 Put quotes around column values in output text file. Possible values: I<yes or
766 no>. Default value: I<yes>.
767
768 =item B<-r, --root> I<rootname>
769
770 New text file name is generated using the root: <Root>.<Ext>. Default file
771 name: <FirstTextFileName>1To<Count>Merged.<Ext>. The csv, and tsv
772 <Ext> values are used for comma/semicolon, and tab delimited text files
773 respectively.
774
775 =item B<-s, --startcol> I<colnum | collabel>
776
777 This value is mode specific. It specifies the column in first text file at which
778 merging of other text files starts. For I<colnum> mode, specify column
779 number and for I<collabel> mode, specify column label.
780
781 Default value: I<last>. Start merge after the last column.
782
783 =item B<--startcolmode> I<before | after>
784
785 Start the merge before or after the B<-s, --startcol> value. Possible values: I<before or after>.
786 Default value: I<after>.
787
788 =item B<-w, --workingdir> I<dirname>
789
790 Location of working directory. Default: current directory.
791
792 =back
793
794 =head1 EXAMPLES
795
796 To merge Sample2.csv and Sample3.csv into Sample1.csv and generate
797 NewSample.csv, type:
798
799 % MergeTextFiles.pl -r NewSample -o Sample1.csv Sample2.csv
800 Sample3.csv
801
802 To merge all Sample*.tsv and generate NewSample.tsv file, type:
803
804 % MergeTextFiles.pl -r NewSample --indelim comma --outdelim tab -o
805 Sample*.csv
806
807 To merge column numbers "1,2" and "3,4,5" from Sample2.csv and Sample3.csv
808 into Sample1.csv starting before column number 3 in Sample1.csv and to generate
809 NewSample.csv without quoting column data, type:
810
811 % MergeTextFiles.pl -s 3 --startcolmode before -r NewSample -q no
812 -m colnum -c "all;1,2;3,4,5" -o Sample1.csv Sample2.csv
813 Sample3.csv
814
815 To merge column "Mol_ID,Formula,MolWeight" and "Mol_ID,NAME,ChemBankID"
816 from Sample2.csv and Sample3.csv into Sample1.csv using "Mol_ID" as a column keys
817 starting after the last column and to generate NewSample.tsv, type:
818
819 % MergeTextFiles.pl -r NewSample --outdelim tab -k "Mol_ID;Mol_ID;
820 Mol_ID" -m collabel -c "all;Mol_ID,Formula,MolWeight;Mol_ID,NAME,
821 ChemBankID" -o Sample1.csv Sample2.csv Sample3.csv
822
823 =head1 AUTHOR
824
825 Manish Sud <msud@san.rr.com>
826
827 =head1 SEE ALSO
828
829 JoinTextFiles.pl, MergeTextFilesWithSD.pl, ModifyTextFilesFormat.pl, SplitTextFiles.pl
830
831 =head1 COPYRIGHT
832
833 Copyright (C) 2015 Manish Sud. All rights reserved.
834
835 This file is part of MayaChemTools.
836
837 MayaChemTools is free software; you can redistribute it and/or modify it under
838 the terms of the GNU Lesser General Public License as published by the Free
839 Software Foundation; either version 3 of the License, or (at your option)
840 any later version.
841
842 =cut