Mercurial > repos > deepakjadmin > mayatool3_test3
comparison mayachemtools/bin/MergeTextFiles.pl @ 0:73ae111cf86f draft
Uploaded
author | deepakjadmin |
---|---|
date | Wed, 20 Jan 2016 11:55:01 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:73ae111cf86f |
---|---|
1 #!/usr/bin/perl -w | |
2 # | |
3 # $RCSfile: MergeTextFiles.pl,v $ | |
4 # $Date: 2015/02/28 20:46:20 $ | |
5 # $Revision: 1.40 $ | |
6 # | |
7 # Author: Manish Sud <msud@san.rr.com> | |
8 # | |
9 # Copyright (C) 2015 Manish Sud. All rights reserved. | |
10 # | |
11 # This file is part of MayaChemTools. | |
12 # | |
13 # MayaChemTools is free software; you can redistribute it and/or modify it under | |
14 # the terms of the GNU Lesser General Public License as published by the Free | |
15 # Software Foundation; either version 3 of the License, or (at your option) any | |
16 # later version. | |
17 # | |
18 # MayaChemTools is distributed in the hope that it will be useful, but without | |
19 # any warranty; without even the implied warranty of merchantability of fitness | |
20 # for a particular purpose. See the GNU Lesser General Public License for more | |
21 # details. | |
22 # | |
23 # You should have received a copy of the GNU Lesser General Public License | |
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or | |
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, | |
26 # Boston, MA, 02111-1307, USA. | |
27 # | |
28 | |
29 use strict; | |
30 use FindBin; use lib "$FindBin::Bin/../lib"; | |
31 use Getopt::Long; | |
32 use File::Basename; | |
33 use Text::ParseWords; | |
34 use Benchmark; | |
35 use FileHandle; | |
36 use FileUtil; | |
37 use TextUtil; | |
38 | |
# File-scoped state: these lexicals are read and written by the subroutines
# defined further down in this file...
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName:Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@TextFilesList);
@TextFilesList = ExpandFileNames(\@ARGV, "csv tsv");

# A merge needs at least two files. The expanded list can be shorter than @ARGV
# (file names with other extensions are ignored), so an empty list has to be
# rejected as well, not just a list with exactly one entry...
if (@TextFilesList < 2) {
  die "Error: Specify more than one text file.\n";
}

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Setup information about input files...
my(%TextFilesInfo);
print "Checking input text files...\n";
RetrieveTextFilesInfo();
RetrieveColumnsAndKeysInfo();

# Merge files...
print "\nGenerating new text file $OptionsInfo{NewTextFile}...\n";
MergeTextFiles();

print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
82 | |
83 ############################################################################### | |
84 | |
# Merge all valid text files...
#
# Opens the new text file named by $OptionsInfo{NewTextFile}, writes the merged
# column label line, opens every input text file and skips its column label
# line, and then merges the data rows: by key when $OptionsInfo{Keys} is set,
# otherwise row by row in file order. All handles are closed at the end.
#
# Dies when the new text file or any input text file can't be opened, or when
# the new text file can't be closed.
#
sub MergeTextFiles {
  my($Index, $TextFile, $FileHandle);

  # The bareword handle is kept on purpose: it is handed to the other
  # subroutines in this file as \*NEWTEXTFILE. Three-argument open makes sure
  # the file name is never interpreted as an open mode...
  open(NEWTEXTFILE, '>', $OptionsInfo{NewTextFile}) or die "Error: Couldn't open $OptionsInfo{NewTextFile}: $! \n";

  WriteNewTextFileColumnLabels(\*NEWTEXTFILE);

  # Open up all the files and skip column label line...
  @{$TextFilesInfo{FileHandle}} = ();
  for $Index (0 .. $#TextFilesList) {
    $TextFile = $TextFilesList[$Index];
    $FileHandle = FileHandle->new;

    open($FileHandle, '<', $TextFile) or die "Error: Couldn't open $TextFile: $! \n";
    $TextFilesInfo{FileHandle}[$Index] = $FileHandle;

    # Column labels were already collected; skip over them...
    GetTextLine($FileHandle);
  }

  # Merge files...
  if ($OptionsInfo{Keys}) {
    MergeColumnValuesUsingKeys(\*NEWTEXTFILE);
  }
  else {
    MergeColumnValues(\*NEWTEXTFILE);
  }

  # Close all opened files...
  for $Index (0 .. $#TextFilesList) {
    close $TextFilesInfo{FileHandle}[$Index];
  }

  # Buffered write errors surface at close time; don't report success after a
  # failed write...
  close(NEWTEXTFILE) or die "Error: Couldn't close $OptionsInfo{NewTextFile}: $! \n";
}
117 | |
# Merge all the column values...
#
# Reads the first text file row by row and, for each row, reads the next row
# from every other text file. The output row is made up of the first file's
# columns before the merge point, the columns to merge from each of the other
# files, and the first file's columns after the merge point. The number of rows
# in the first file determines the number of rows written.
#
# Parameters:
#   $NewTextFileRef - handle of the new text file to write merged rows to.
#
sub MergeColumnValues {
  my($NewTextFileRef) = @_;
  my($Index, $Line, $OtherLine, $ColNum, @LineWords, @File1LineWords, @ColValues);

  # Stop on an undefined or empty line only, so that a data line consisting of
  # the single character 0 is not mistaken for the end of the file.
  # NOTE(review): assumes GetTextLine signals end of file by returning undef or
  # an empty string - confirm against FileUtil.
  while (defined($Line = GetTextLine($TextFilesInfo{FileHandle}[0])) && length($Line)) {
    @ColValues = ();

    # Collect column values from first file before the merge point...
    @File1LineWords = quotewords($TextFilesInfo{InDelim}[0], 0, $Line);
    for $ColNum (@{$TextFilesInfo{File1Part1ColNums}}) {
      push @ColValues, ($ColNum < @File1LineWords) ? $File1LineWords[$ColNum] : "";
    }

    # Collect column values from other text files...
    for $Index (1 .. $#TextFilesList) {
      $OtherLine = GetTextLine($TextFilesInfo{FileHandle}[$Index]);

      # A file which has run out of rows still contributes one empty value per
      # column to merge; otherwise the values which follow would shift left and
      # end up under the wrong column labels...
      @LineWords = ();
      if (defined($OtherLine) && length($OtherLine)) {
        @LineWords = quotewords($TextFilesInfo{InDelim}[$Index], 0, $OtherLine);
      }
      for $ColNum (@{$TextFilesInfo{ColToMerge}[$Index]}) {
        push @ColValues, ($ColNum < @LineWords) ? $LineWords[$ColNum] : "";
      }
    }

    # Collect column values from first file after the merge point...
    for $ColNum (@{$TextFilesInfo{File1Part2ColNums}}) {
      push @ColValues, ($ColNum < @File1LineWords) ? $File1LineWords[$ColNum] : "";
    }

    # Write it out...
    $Line = JoinWords(\@ColValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileRef "$Line\n";
  }

}
158 | |
# Merge column values using keys...
#
# First loads every text file except the first one into a map from key column
# value to the words of the row; rows with a missing or empty key value are
# skipped and rows with an already seen key value are reported and ignored.
# Then reads the first text file row by row, looks up its key column value in
# each map, and writes the first file's columns before the merge point, the
# matching values from the other files (empty values when there is no match),
# and the first file's columns after the merge point.
#
# Parameters:
#   $NewTextFileRef - handle of the new text file to write merged rows to.
#
sub MergeColumnValuesUsingKeys {
  my($NewTextFileRef) = @_;
  my($Index, $InDelim, $Line, $ColNum, $KeyColNum, $KeyColValue, @LineWords, @ColValues, @File1LineWords, @TextFilesKeysToLinesMap);

  @TextFilesKeysToLinesMap = ();

  # Retrieve text lines from all the files except for the first file...
  FILE: for $Index (1 .. $#TextFilesList) {
    $TextFilesKeysToLinesMap[$Index] = {};

    $InDelim = $TextFilesInfo{InDelim}[$Index];
    $KeyColNum = $TextFilesInfo{KeysToUse}[$Index];

    # ProcessKeysInfo leaves -1 behind for a key it had to ignore. Using it as
    # an array index would silently key on the last column, so a file without
    # a usable key contributes no matches at all...
    next FILE if ($KeyColNum < 0);

    # Stop on an undefined or empty line only, so that a data line consisting
    # of the single character 0 doesn't end the file early.
    # NOTE(review): assumes GetTextLine signals end of file by returning undef
    # or an empty string - confirm against FileUtil.
    LINE: while (defined($Line = GetTextLine($TextFilesInfo{FileHandle}[$Index])) && length($Line)) {
      @LineWords = quotewords($InDelim, 0, $Line);
      next LINE if ($KeyColNum >= @LineWords);

      $KeyColValue = $LineWords[$KeyColNum];
      next LINE if (!defined($KeyColValue) || !length($KeyColValue));

      if (exists($TextFilesKeysToLinesMap[$Index]{$KeyColValue})) {
        warn "Warning: Ignoring line, $Line, in text file $TextFilesList[$Index]: Column key value, $KeyColValue, already exists\n";
        next LINE;
      }
      $TextFilesKeysToLinesMap[$Index]{$KeyColValue} = [@LineWords];
    }
  }

  $InDelim = $TextFilesInfo{InDelim}[0];
  $KeyColNum = $TextFilesInfo{KeysToUse}[0];

  while (defined($Line = GetTextLine($TextFilesInfo{FileHandle}[0])) && length($Line)) {
    @ColValues = ();
    @File1LineWords = quotewords($InDelim, 0, $Line);

    # No key value is available when the key was ignored (-1) or the row is
    # too short; such a row is written without any matches...
    $KeyColValue = undef;
    if ($KeyColNum >= 0 && $KeyColNum < @File1LineWords) {
      $KeyColValue = $File1LineWords[$KeyColNum];
    }

    # Collect column values from first file before the merge point...
    for $ColNum (@{$TextFilesInfo{File1Part1ColNums}}) {
      push @ColValues, ($ColNum < @File1LineWords) ? $File1LineWords[$ColNum] : "";
    }

    # Collect column values from other text files...
    for $Index (1 .. $#TextFilesList) {
      @LineWords = ();
      if (defined($KeyColValue) && exists($TextFilesKeysToLinesMap[$Index]{$KeyColValue})) {
        @LineWords = @{$TextFilesKeysToLinesMap[$Index]{$KeyColValue}};
      }
      for $ColNum (@{$TextFilesInfo{ColToMerge}[$Index]}) {
        push @ColValues, ($ColNum < @LineWords) ? $LineWords[$ColNum] : "";
      }
    }

    # Collect column values from first file after the merge point...
    for $ColNum (@{$TextFilesInfo{File1Part2ColNums}}) {
      push @ColValues, ($ColNum < @File1LineWords) ? $File1LineWords[$ColNum] : "";
    }

    # Write it out...
    $Line = JoinWords(\@ColValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileRef "$Line\n";
  }

}
229 | |
# Write out column labels for new merged text file...
#
# The label line follows the same layout as the data rows: labels of the first
# file's columns before the merge point, labels of the columns to merge from
# each of the other files, and labels of the first file's columns after the
# merge point.
#
# Parameters:
#   $NewTextFileRef - handle of the new text file to write the label line to.
#
sub WriteNewTextFileColumnLabels {
  my($NewTextFileRef) = @_;
  my($Index, $Line, $ColNum, @ColLabels);

  @ColLabels = ();

  # Collect column labels from first file before the merge point...
  for $ColNum (@{$TextFilesInfo{File1Part1ColNums}}) {
    push @ColLabels, $TextFilesInfo{ColToMergeNumToLabelMap}[0]{$ColNum};
  }

  # Collect column labels from other text files...
  for $Index (1 .. $#TextFilesList) {
    for $ColNum (@{$TextFilesInfo{ColToMerge}[$Index]}) {
      push @ColLabels, $TextFilesInfo{ColToMergeNumToLabelMap}[$Index]{$ColNum};
    }
  }

  # Collect column labels from first file after the merge point...
  for $ColNum (@{$TextFilesInfo{File1Part2ColNums}}) {
    push @ColLabels, $TextFilesInfo{ColToMergeNumToLabelMap}[0]{$ColNum};
  }

  # Write it out to the handle which was passed in, instead of reaching for a
  # global handle name...
  $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print {$NewTextFileRef} "$Line\n";
}
259 | |
# Retrieve text file columns and keys information for specified options...
#
# Runs the three processing steps in a fixed order: columns first, then keys
# (only when keys were specified), and the start column last.
#
sub RetrieveColumnsAndKeysInfo {
  ProcessColumnsInfo();
  ProcessKeysInfo() if ($OptionsInfo{Keys});
  ProcessStartColInfo();
}
270 | |
# Process specified columns...
#
# For every input text file, works out which columns take part in the merge
# and stores, per file index:
#
#   $TextFilesInfo{ColSpecified}            - values as specified, or expanded from "all"
#   $TextFilesInfo{ColToMerge}              - zero-based column numbers, sorted in ascending order
#   $TextFilesInfo{ColToMergeNumToLabelMap} - zero-based column number to column label
#
# A specified value which doesn't identify an existing column is reported by a
# warning and ignored.
#
sub ProcessColumnsInfo {
  my($Index, $Values, $Specified, $SpecifiedColNum, $ColNum, $ColCount, @ColSpecified);

  @{$TextFilesInfo{ColSpecified}} = ();
  @{$TextFilesInfo{ColToMerge}} = ();
  @{$TextFilesInfo{ColToMergeNumToLabelMap}} = ();

  for $Index (0 .. $#TextFilesList) {
    $ColCount = $TextFilesInfo{ColCount}[$Index];

    $Values = "all";
    if ($OptionsInfo{Columns}) {
      $Values = $OptionsInfo{ColValues}[$Index];
    }

    # "all" has to be the whole value: an unanchored match would also fire for
    # a list of labels which merely contains those letters, such as
    # "Overall,MW", and silently select every column...
    if ($Values =~ /^\s*all\s*$/i) {
      if ($OptionsInfo{Mode} =~ /^colnum$/i) {
        @ColSpecified = (1 .. $ColCount);
      }
      else {
        @ColSpecified = @{$TextFilesInfo{ColLabels}[$Index]};
      }
    }
    else {
      @ColSpecified = split ",", $Values;
    }
    @{$TextFilesInfo{ColSpecified}[$Index]} = @ColSpecified;

    @{$TextFilesInfo{ColToMerge}[$Index]} = ();
    %{$TextFilesInfo{ColToMergeNumToLabelMap}[$Index]} = ();

    if ($OptionsInfo{Mode} =~ /^collabel$/i) {
      for $Specified (@ColSpecified) {
        if (exists($TextFilesInfo{ColLabelToNumMap}[$Index]{$Specified})) {
          $ColNum = $TextFilesInfo{ColLabelToNumMap}[$Index]{$Specified};
          push @{$TextFilesInfo{ColToMerge}[$Index]}, $ColNum;
          $TextFilesInfo{ColToMergeNumToLabelMap}[$Index]{$ColNum} = $Specified;
        }
        else {
          warn "Warning: Ignoring value, $Specified, specified using \"-c --column\" option: column name doesn't exist in $TextFilesList[$Index] \n";
        }
      }
    }
    else {
      for $Specified (@ColSpecified) {
        # Accept whole numbers only, so that a value like "abc" or "2x" is
        # reported as invalid instead of being compared numerically...
        ($SpecifiedColNum) = ($Specified =~ /^\s*(\d+)\s*$/);
        if (defined($SpecifiedColNum) && $SpecifiedColNum > 0 && $SpecifiedColNum <= $ColCount) {
          # Specified column numbers start at 1; stored ones start at 0...
          $ColNum = $SpecifiedColNum - 1;
          push @{$TextFilesInfo{ColToMerge}[$Index]}, $ColNum;
          $TextFilesInfo{ColToMergeNumToLabelMap}[$Index]{$ColNum} = $TextFilesInfo{ColLabels}[$Index][$ColNum];
        }
        else {
          warn "Warning: Ignoring value, $Specified, specified using \"-c --column\" option: column number doesn't exist in $TextFilesList[$Index] \n";
        }
      }
    }

    # Columns are merged in file order irrespective of the order in which
    # they were specified...
    @{$TextFilesInfo{ColToMerge}[$Index]} = sort { $a <=> $b } @{$TextFilesInfo{ColToMerge}[$Index]};
  }
}
337 | |
# Process specified key column values...
#
# Records, per input text file, the key as specified in
# $TextFilesInfo{KeysSpecified} and the zero-based key column number in
# $TextFilesInfo{KeysToUse}. The column number stays at -1 when the specified
# key doesn't identify an existing column; a warning is issued in that case.
# Afterwards the key column is taken off the list of columns to merge for all
# text files except the first one.
#
sub ProcessKeysInfo {
  my($FileNum, $KeySpec, $KeyColNum, @ColToMergeKept);

  @{$TextFilesInfo{KeysSpecified}} = ();
  @{$TextFilesInfo{KeysToUse}} = ();

  FILE: for $FileNum (0 .. $#TextFilesList) {
    $KeySpec = $OptionsInfo{KeyValues}[$FileNum];

    $TextFilesInfo{KeysSpecified}[$FileNum] = $KeySpec;
    $TextFilesInfo{KeysToUse}[$FileNum] = -1;

    if ($OptionsInfo{Mode} !~ /^collabel$/i) {
      # Key is a column number starting at 1; stored column numbers start at 0...
      if ($KeySpec > 0 && $KeySpec <= $TextFilesInfo{ColCount}[$FileNum]) {
        $TextFilesInfo{KeysToUse}[$FileNum] = $KeySpec - 1;
      }
      else {
        warn "Warning: Ignoring value, $KeySpec, specified using \"-k --keys\" option: column number doesn't exist in $TextFilesList[$FileNum] \n";
      }
      next FILE;
    }

    # Key is a column label...
    if (exists($TextFilesInfo{ColLabelToNumMap}[$FileNum]{$KeySpec})) {
      $TextFilesInfo{KeysToUse}[$FileNum] = $TextFilesInfo{ColLabelToNumMap}[$FileNum]{$KeySpec};
    }
    else {
      warn "Warning: Ignoring value, $KeySpec, specified using \"-k --keys\" option: column name doesn't exist in $TextFilesList[$FileNum] \n";
    }
  }

  # Modify columns to merge list to make sure the columns identified by key are taken off the list
  # except for the first text file...
  for $FileNum (1 .. $#TextFilesList) {
    $KeyColNum = $TextFilesInfo{KeysToUse}[$FileNum];
    @ColToMergeKept = grep { $KeyColNum != $_ } @{$TextFilesInfo{ColToMerge}[$FileNum]};
    @{$TextFilesInfo{ColToMerge}[$FileNum]} = @ColToMergeKept;
  }
}
386 | |
# Process specified start column value...
#
# Resolves the start column of the first text file to a zero-based column
# number and splits the first file's columns to merge into two lists:
#
#   $TextFilesInfo{File1Part1ColNums} - columns written before the merged columns
#   $TextFilesInfo{File1Part2ColNums} - columns written after the merged columns
#
# For start column mode "after", the start column itself goes into the first
# list; otherwise it goes into the second list.
#
# Dies when the specified start column doesn't exist in the first text file or
# isn't on its list of columns to merge.
#
sub ProcessStartColInfo {
  my($StartColNum, $SpecifiedStartCol, @File1ColNums);

  @{$TextFilesInfo{File1Part1ColNums}} = ();
  @{$TextFilesInfo{File1Part2ColNums}} = ();

  # Columns to merge for the first text file; the code below relies on
  # ProcessColumnsInfo having sorted them in ascending order...
  @File1ColNums = @{$TextFilesInfo{ColToMerge}[0]};

  # Keep the value as specified around: it's what error messages have to show,
  # not the zero-based column number derived from it...
  $SpecifiedStartCol = "last";
  if ($OptionsInfo{StartCol}) {
    if (length($OptionsInfo{StartCol})) {
      $SpecifiedStartCol = $OptionsInfo{StartCol};
    }
  }

  if ($SpecifiedStartCol =~ /^last$/i) {
    # "last" is the last column being merged from the first text file. It's the
    # last column of the file when all its columns are merged, and it stays
    # valid when only some of the columns were specified...
    $StartColNum = @File1ColNums ? $File1ColNums[-1] : ($TextFilesInfo{ColCount}[0] - 1);
  }
  elsif ($OptionsInfo{Mode} =~ /^collabel$/i) {
    if (!exists($TextFilesInfo{ColLabelToNumMap}[0]{$SpecifiedStartCol})) {
      die "Error: Invalid value $SpecifiedStartCol specified using \"-s --startcol\" option: column name doesn't exist in $TextFilesList[0] \n";
    }
    $StartColNum = $TextFilesInfo{ColLabelToNumMap}[0]{$SpecifiedStartCol};
  }
  else {
    if (!($SpecifiedStartCol > 0 && $SpecifiedStartCol <= $TextFilesInfo{ColCount}[0])) {
      die "Error: Invalid value $SpecifiedStartCol specified using \"-s --startcol\" option: column number doesn't exist in $TextFilesList[0] \n";
    }
    $StartColNum = $SpecifiedStartCol - 1;
  }

  # Make sure StartColNum is present on the list of columns to merge for the first text file...
  if (!exists($TextFilesInfo{ColToMergeNumToLabelMap}[0]{$StartColNum})) {
    die "Error: Invalid value $SpecifiedStartCol specified using \"-s --startcol\" option: doesn't exist in the specified lists of columns to merge for $TextFilesList[0] \n";
  }

  # Split the first file's columns around the start column. Each column ends up
  # in exactly one of the two lists...
  if ($OptionsInfo{StartColMode} =~ /^after$/i) {
    @{$TextFilesInfo{File1Part1ColNums}} = grep { $_ <= $StartColNum } @File1ColNums;
    @{$TextFilesInfo{File1Part2ColNums}} = grep { $_ > $StartColNum } @File1ColNums;
  }
  else {
    @{$TextFilesInfo{File1Part1ColNums}} = grep { $_ < $StartColNum } @File1ColNums;
    @{$TextFilesInfo{File1Part2ColNums}} = grep { $_ >= $StartColNum } @File1ColNums;
  }

}
471 | |
# Retrieve information about input text files...
#
# Resets %TextFilesInfo and, for every input text file, records whether the
# file is okay to use, its input delimiter, its column count, its column
# labels, and a map from column label to zero-based column number. The column
# labels are taken from the first line returned by GetTextLine.
#
# A file which doesn't exist, isn't a csv or tsv file, can't be opened, or is a
# csv file while the value of "--indelim" isn't valid for csv files is reported
# by a warning. Dies after all files have been checked when any of them had a
# problem.
#
sub RetrieveTextFilesInfo {
  my($Index, $TextFile, $FileDir, $FileName, $FileExt, $InDelim, $Line, $ColNum, $FileNotOkayCount, @ColLabels);

  %TextFilesInfo = ();

  @{$TextFilesInfo{FileOkay}} = ();
  @{$TextFilesInfo{ColCount}} = ();
  @{$TextFilesInfo{ColLabels}} = ();
  @{$TextFilesInfo{ColLabelToNumMap}} = ();
  @{$TextFilesInfo{InDelim}} = ();

  $FileNotOkayCount = 0;

  FILELIST: for $Index (0 .. $#TextFilesList) {
    $TextFile = $TextFilesList[$Index];

    $TextFilesInfo{FileOkay}[$Index] = 0;
    $TextFilesInfo{ColCount}[$Index] = 0;
    $TextFilesInfo{InDelim}[$Index] = "";

    @{$TextFilesInfo{ColLabels}[$Index]} = ();
    %{$TextFilesInfo{ColLabelToNumMap}[$Index]} = ();

    if (!(-e $TextFile)) {
      warn "Warning: Ignoring file $TextFile: It doesn't exist\n";
      $FileNotOkayCount++;
      next FILELIST;
    }
    if (!CheckFileType($TextFile, "csv tsv")) {
      warn "Warning: Ignoring file $TextFile: It's not a csv or tsv file\n";
      $FileNotOkayCount++;
      next FILELIST;
    }

    # Tab is always used for tsv files; for csv files, the delimiter comes
    # from the "--indelim" option...
    ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
    if ($FileExt =~ /^tsv$/i) {
      $InDelim = "\t";
    }
    else {
      if ($OptionsInfo{InDelim} !~ /^(comma|semicolon)$/i) {
        warn "Warning: Ignoring file $TextFile: The value specified, $OptionsInfo{InDelim}, for option \"--indelim\" is not valid for csv files\n";
        $FileNotOkayCount++;
        next FILELIST;
      }
      $InDelim = ($OptionsInfo{InDelim} =~ /^semicolon$/i) ? ";" : ",";
    }

    # Lexical handle and three-argument open: the file name is never
    # interpreted as an open mode and no global handle name is taken up...
    my($TextFileHandle);
    if (!open($TextFileHandle, '<', $TextFile)) {
      warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
      $FileNotOkayCount++;
      next FILELIST;
    }

    # A file without any usable line simply has no column labels...
    $Line = GetTextLine($TextFileHandle);
    @ColLabels = defined($Line) ? quotewords($InDelim, 0, $Line) : ();
    close $TextFileHandle;

    $TextFilesInfo{FileOkay}[$Index] = 1;
    $TextFilesInfo{InDelim}[$Index] = $InDelim;

    $TextFilesInfo{ColCount}[$Index] = @ColLabels;
    push @{$TextFilesInfo{ColLabels}[$Index]}, @ColLabels;
    for $ColNum (0 .. $#ColLabels) {
      $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabels[$ColNum]} = $ColNum;
    }
  }

  # Make sure all specified files are valid for merging to work properly...
  if ($FileNotOkayCount) {
    die "Error: Problems with input text file(s)...\n";
  }
}
547 | |
# Process option values...
#
# Rebuilds %OptionsInfo from %Options: merge mode, per file column and key
# values, input and output delimiters, quoting, start column settings, and the
# name of the new text file.
#
# Dies when the number of ";" delimited values specified for columns or keys
# doesn't match the number of input text files, when one of those values is
# empty, when the new text file already exists and overwriting wasn't asked
# for, or when a name generated from the specified root matches an input file
# name irrespective of case.
#
sub ProcessOptions {
  my($Value, $TextFile, $Dir, $Name, $Ext, $OutFile, $TabDelimited, @ColumnSpecs, @KeySpecs);

  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};

  # One list of columns per input text file...
  $OptionsInfo{Columns} = $Options{columns};
  @{$OptionsInfo{ColValues}} = ();

  if ($Options{columns}) {
    @ColumnSpecs = split ";", $Options{columns};
    if (@ColumnSpecs != @TextFilesList) {
      die "Error: Invalid number of values specified by \"-c --columns\" option: it must be equal to number of input text files.\n";
    }
    for $Value (@ColumnSpecs) {
      if (!length($Value)) {
        die "Error: Invalid value specified by \"-c --columns\" option: empty values are not allowed.\n";
      }
    }
    @{$OptionsInfo{ColValues}} = @ColumnSpecs;
  }

  # One key per input text file...
  $OptionsInfo{Keys} = $Options{keys};
  @{$OptionsInfo{KeyValues}} = ();

  if ($Options{keys}) {
    @KeySpecs = split ";", $Options{keys};
    if (@KeySpecs != @TextFilesList) {
      die "Error: Invalid number of values specified by \"-k --keys\" option: it must be equal to number of input text files.\n";
    }
    for $Value (@KeySpecs) {
      if (!length($Value)) {
        die "Error: Invalid value specified by \"-k --keys\" option: empty values are not allowed.\n";
      }
    }
    @{$OptionsInfo{KeyValues}} = @KeySpecs;
  }

  $OptionsInfo{InDelim} = $Options{indelim};

  $OptionsInfo{StartCol} = $Options{startcol} ? $Options{startcol} : undef;
  $OptionsInfo{StartColMode} = $Options{startcolmode};

  $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : undef;
  $OptionsInfo{Overwrite} = $Options{overwrite} ? $Options{overwrite} : undef;

  # Output delimiter; it also determines the extension of the new text file...
  $TabDelimited = ($Options{outdelim} =~ /^tab$/i) ? 1 : 0;
  if ($TabDelimited) {
    $OptionsInfo{OutDelim} = "\t";
  }
  elsif ($Options{outdelim} =~ /^semicolon$/i) {
    $OptionsInfo{OutDelim} = "\;";
  }
  else {
    $OptionsInfo{OutDelim} = "\,";
  }
  $OptionsInfo{OutQuote} = ($Options{quote} =~ /^yes$/i) ? 1 : 0;

  # Name of the new text file: derived from the specified root or, by default,
  # from the name of the first text file and the number of text files...
  if ($Options{root}) {
    ($Dir, $Name, $Ext) = ParseFileName($Options{root});
    $OutFile = ($Name && $Ext) ? $Name : $Options{root};
  }
  else {
    ($Dir, $Name, $Ext) = ParseFileName($TextFilesList[0]);
    $OutFile = $Name . "1To" . scalar(@TextFilesList) . "Merged";
  }
  $OutFile .= $TabDelimited ? ".tsv" : ".csv";

  if (!$Options{overwrite} && -e $OutFile) {
    die "Error: The file $OutFile already exists.\n";
  }

  # A name generated from the root must not match any of the input files...
  if ($Options{root}) {
    for $TextFile (@TextFilesList) {
      if (lc($OutFile) eq lc($TextFile)) {
        die "Error: Output filename, $OutFile, is similar to a input file name.\nSpecify a different name using \"-r --root\" option or use default name.\n";
      }
    }
  }

  $OptionsInfo{NewTextFile} = $OutFile;
}
632 | |
# Setup script usage and retrieve command line arguments specified using various options...
#
# Fills %Options with default values, lets GetOptions overwrite them from the
# command line, changes to the working directory when one was specified, and
# validates the values of the options which only allow a fixed set of values.
#
# Dies when the command line can't be parsed, when the working directory isn't
# a directory or can't be changed to, or when an option value isn't valid.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();

  $Options{mode} = "colnum";
  $Options{indelim} = "comma";
  $Options{outdelim} = "comma";
  $Options{quote} = "yes";
  $Options{startcolmode} = "after";

  if (!GetOptions(\%Options, "help|h", "indelim=s", "columns|c=s", "keys|k=s", "mode|m=s", "outdelim=s", "overwrite|o", "quote|q=s", "root|r=s", "startcol|s=s", "startcolmode=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if ($Options{mode} !~ /^(colnum|collabel)$/i) {
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: colnum, or collabel\n";
  }
  if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
    die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{quote} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
  }
  if ($Options{startcolmode} !~ /^(before|after)$/i) {
    # Report the value of the option being checked, not the value of "--quote"...
    die "Error: The value specified, $Options{startcolmode}, for option \"--startcolmode\" is not valid. Allowed values: before or after\n";
  }
}
670 | |
671 __END__ | |
672 | |
673 =head1 NAME | |
674 | |
675 MergeTextFiles.pl - Merge multiple CSV or TSV text files into a single text file | |
676 | |
677 =head1 SYNOPSIS | |
678 | |
679 MergeTextFiles.pl TextFiles... | |
680 | |
681 MergeTextFiles.pl [B<-h, --help>] [B<--indelim> comma | semicolon] [B<-c, --columns> colnum,...;... | collabel,...;...] | |
682 [B<-k, --keys> colnum,...;... | collabel,...;...] [B<-m, --mode> colnum | collabel] | |
683 [B<-o, --overwrite>] [B<--outdelim> comma | tab | semicolon] [B<-q, --quote> yes | no] | |
684 [B<-r, --root> rootname] [B<-s, --startcol> colnum | collabel] [B<--startcolmode> before | after] | |
685 [B<-w, --workingdir> dirname] TextFiles... | |
686 | |
687 =head1 DESCRIPTION | |
688 | |
689 Merge multiple CSV or TSV I<TextFiles> into first I<TextFile> to generate a single | |
690 text file. Unless B<-k --keys> option is used, data rows from other I<TextFiles> | |
691 are added to first I<TextFile> in a sequential order, and the number of rows in first | |
692 I<TextFile> is used to determine how many rows of data are added from other | |
693 I<TextFiles>. | |
694 | |
695 Multiple I<TextFiles> names are separated by space. The valid file extensions are I<.csv> and | |
696 I<.tsv> for comma/semicolon and tab delimited text files respectively. All other file names | |
697 are ignored. All the text files in a current directory can be specified by I<*.csv>, | |
698 I<*.tsv>, or the current directory name. The B<--indelim> option determines the | |
699 format of I<TextFiles>. Any file which doesn't correspond to the format indicated | |
700 by B<--indelim> option is ignored. | |
701 | |
702 =head1 OPTIONS | |
703 | |
704 =over 4 | |
705 | |
706 =item B<-h, --help> | |
707 | |
708 Print this help message. | |
709 | |
710 =item B<--indelim> I<comma | semicolon> | |
711 | |
712 Input delimiter for CSV I<TextFile(s)>. Possible values: I<comma or semicolon>. | |
713 Default value: I<comma>. For TSV files, this option is ignored and I<tab> is used as a | |
714 delimiter. | |
715 | |
716 =item B<-c, --columns> I<colnum,...;... | collabel,...;...> | |
717 | |
718 This value is mode specific. It is a list of columns to merge into first | |
719 text file specified by column numbers or labels for each text file | |
720 delimited by ";". All specified text files are merged into first text file. | |
721 | |
722 Default value: I<all;all;...>. By default, all columns from specified text files are | |
723 merged into first text file. | |
724 | |
725 For I<colnum> mode, input value format is: I<colnum,...;colnum,...;...>. Example: | |
726 | |
727 "1,2;1,3,4;7,8,9" | |
728 | |
729 For I<collabel> mode, input value format is: I<collabel,...;collabel,...;...>. Example: | |
730 | |
731 "MW,SumNO;SumNHOH,ClogP,PSA;MolName,Mol_Id,Extreg" | |
732 | |
733 =item B<-k, --keys> I<colnum,...;... | collabel,...;...> | |
734 | |
735 This value is mode specific. It specifies column keys to use for merging | |
736 all specified text files into first text file. The column keys are specified by | |
737 column numbers or labels for each text file delimited by ";". | |
738 | |
739 By default, data rows from text files are merged into first file in the order they appear. | |
740 | |
741 For I<colnum> mode, input value format is: I<colkeynum;colkeynum;...>. Example: | |
742 | |
743 "1;3;7" | |
744 | |
745 For I<collabel> mode, input value format is: I<colkeylabel;colkeylabel;...>. Example: | |
746 | |
747 "Mol_Id;Mol_Id;Cmpd_Id" | |
748 | |
749 =item B<-m, --mode> I<colnum | collabel> | |
750 | |
751 Specify how to merge text files: using column numbers or column labels. | |
752 Possible values: I<colnum or collabel>. Default value: I<colnum>. | |
753 | |
754 =item B<-o, --overwrite> | |
755 | |
756 Overwrite existing files. | |
757 | |
758 =item B<--outdelim> I<comma | tab | semicolon> | |
759 | |
760 Output text file delimiter. Possible values: I<comma, tab, or semicolon> | |
761 Default value: I<comma>. | |
762 | |
763 =item B<-q, --quote> I<yes | no> | |
764 | |
765 Put quotes around column values in output text file. Possible values: I<yes or | |
766 no>. Default value: I<yes>. | |
767 | |
768 =item B<-r, --root> I<rootname> | |
769 | |
770 New text file name is generated using the root: <Root>.<Ext>. Default file | |
771 name: <FirstTextFileName>1To<Count>Merged.<Ext>. The csv, and tsv | |
772 <Ext> values are used for comma/semicolon, and tab delimited text files | |
773 respectively. | |
774 | |
775 =item B<-s, --startcol> I<colnum | collabel> | |
776 | |
777 This value is mode specific. It specifies the column in the first text file at which | |
778 merging of the other text files starts. For I<colnum> mode, specify column | |
779 number and for I<collabel> mode, specify column label. | |
780 | |
781 Default value: I<last>. Start merge after the last column. | |
782 | |
783 =item B<--startcolmode> I<before | after> | |
784 | |
785 Start the merge before or after the B<-s, --startcol> value. Possible values: I<before or after> | |
786 Default value: I<after>. | |
787 | |
788 =item B<-w, --workingdir> I<dirname> | |
789 | |
790 Location of working directory. Default: current directory. | |
791 | |
792 =back | |
793 | |
794 =head1 EXAMPLES | |
795 | |
796 To merge Sample2.csv and Sample3.csv into Sample1.csv and generate | |
797 NewSample.csv, type: | |
798 | |
799 % MergeTextFiles.pl -r NewSample -o Sample1.csv Sample2.csv | |
800 Sample3.csv | |
801 | |
802 To merge all Sample*.csv and generate NewSample.tsv file, type: | |
803 | |
804 % MergeTextFiles.pl -r NewSample --indelim comma --outdelim tab -o | |
805 Sample*.csv | |
806 | |
807 To merge column numbers "1,2" and "3,4,5" from Sample2.csv and Sample3.csv | |
808 into Sample1.csv starting before column number 3 in Sample1.csv and to generate | |
809 NewSample.csv without quoting column data, type: | |
810 | |
811 % MergeTextFiles.pl -s 3 --startcolmode before -r NewSample -q no | |
812 -m colnum -c "all;1,2;3,4,5" -o Sample1.csv Sample2.csv | |
813 Sample3.csv | |
814 | |
815 To merge column "Mol_ID,Formula,MolWeight" and "Mol_ID,NAME,ChemBankID" | |
816 from Sample2.csv and Sample3.csv into Sample1.csv using "Mol_ID" as the column key | |
817 starting after the last column and to generate NewSample.tsv, type: | |
818 | |
819 % MergeTextFiles.pl -r NewSample --outdelim tab -k "Mol_ID;Mol_ID; | |
820 Mol_ID" -m collabel -c "all;Mol_ID,Formula,MolWeight;Mol_ID,NAME, | |
821 ChemBankID" -o Sample1.csv Sample2.csv Sample3.csv | |
822 | |
823 =head1 AUTHOR | |
824 | |
825 Manish Sud <msud@san.rr.com> | |
826 | |
827 =head1 SEE ALSO | |
828 | |
829 JoinTextFiles.pl, MergeTextFilesWithSD.pl, ModifyTextFilesFormat.pl, SplitTextFiles.pl | |
830 | |
831 =head1 COPYRIGHT | |
832 | |
833 Copyright (C) 2015 Manish Sud. All rights reserved. | |
834 | |
835 This file is part of MayaChemTools. | |
836 | |
837 MayaChemTools is free software; you can redistribute it and/or modify it under | |
838 the terms of the GNU Lesser General Public License as published by the Free | |
839 Software Foundation; either version 3 of the License, or (at your option) | |
840 any later version. | |
841 | |
842 =cut |