Mercurial > repos > deepakjadmin > mayatool3_test3
comparison mayachemtools/bin/MergeTextFilesWithSD.pl @ 0:73ae111cf86f draft
Uploaded
author | deepakjadmin |
---|---|
date | Wed, 20 Jan 2016 11:55:01 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:73ae111cf86f |
---|---|
1 #!/usr/bin/perl -w | |
2 # | |
3 # $RCSfile: MergeTextFilesWithSD.pl,v $ | |
4 # $Date: 2015/02/28 20:46:20 $ | |
5 # $Revision: 1.39 $ | |
6 # | |
7 # Author: Manish Sud <msud@san.rr.com> | |
8 # | |
9 # Copyright (C) 2015 Manish Sud. All rights reserved. | |
10 # | |
11 # This file is part of MayaChemTools. | |
12 # | |
13 # MayaChemTools is free software; you can redistribute it and/or modify it under | |
14 # the terms of the GNU Lesser General Public License as published by the Free | |
15 # Software Foundation; either version 3 of the License, or (at your option) any | |
16 # later version. | |
17 # | |
18 # MayaChemTools is distributed in the hope that it will be useful, but without | |
19 # any warranty; without even the implied warranty of merchantability or fitness | |
20 # for a particular purpose. See the GNU Lesser General Public License for more | |
21 # details. | |
22 # | |
23 # You should have received a copy of the GNU Lesser General Public License | |
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or | |
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, | |
26 # Boston, MA, 02111-1307, USA. | |
27 # | |
28 | |
29 use strict; | |
30 use FindBin; use lib "$FindBin::Bin/../lib"; | |
31 use Getopt::Long; | |
32 use File::Basename; | |
33 use Text::ParseWords; | |
34 use Benchmark; | |
35 use FileHandle; | |
36 use SDFileUtil; | |
37 use FileUtil; | |
38 use TextUtil; | |
39 | |
# Main script flow: parse options, validate the input files, merge the text
# files into the SD file and report the elapsed time.
#
# The variables declared with my() in this section are file-scoped and are
# read and written directly by the subroutines defined further below.
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename $0;
print "\n$ScriptName:Starting...\n\n";
# Direct method call instead of the ambiguous indirect-object form "new Benchmark"...
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

# The first file name is taken as the SD file; the remaining names are the
# text files to merge, so at least two names are needed...
my($SDFile, @TextFilesList);
@TextFilesList = ExpandFileNames(\@ARGV, "csv tsv");

if (@TextFilesList < 2) {
  die "Error: Specify one or more text files.\n";
}
$SDFile = shift @TextFilesList;

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Setup information about input files...
print "Checking input SD and text files...\n";
my(%TextFilesInfo);
ProcessSDFileInfo();
RetrieveTextFilesInfo();
RetrieveColumnsAndKeysInfo();

# Merge files...
print "\nGenerating new SD file $OptionsInfo{NewSDFile}...\n";
MergeTextFilesWithSD();

print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
85 | |
86 ############################################################################### | |
87 | |
# Merge all valid text files with the SD file...
#
# Opens the new SD file named by $OptionsInfo{NewSDFile} for writing, the input
# SD file $SDFile for reading and one FileHandle for each entry in
# @TextFilesList (stored in @{$TextFilesInfo{FileHandle}}). The actual merging
# is delegated to MergeTextColumnValuesUsingKeys or MergeTextColumnValues
# depending on whether keys were specified. Dies when a file can't be opened
# or when the new SD file can't be closed cleanly.
sub MergeTextFilesWithSD {
  my($Index, $NewSDFileRef, $SDFileRef);

  # Three-argument open keeps characters such as ">", "|" or surrounding spaces
  # in a file name from being interpreted as part of the open mode...
  open $NewSDFileRef, ">", $OptionsInfo{NewSDFile} or die "Error: Couldn't open $OptionsInfo{NewSDFile}: $! \n";

  open $SDFileRef, "<", $SDFile or die "Error: Couldn't open $SDFile: $! \n";

  @{$TextFilesInfo{FileHandle}} = ();
  for $Index (0 .. $#TextFilesList) {
    $TextFilesInfo{FileHandle}[$Index] = FileHandle->new;

    open $TextFilesInfo{FileHandle}[$Index], "<", $TextFilesList[$Index] or die "Error: Couldn't open $TextFilesList[$Index]: $! \n";

    # Read and discard the first line: RetrieveTextFilesInfo has already used
    # it as the column labels line...
    GetTextLine($TextFilesInfo{FileHandle}[$Index]);
  }

  if ($OptionsInfo{Keys}) {
    MergeTextColumnValuesUsingKeys($NewSDFileRef, $SDFileRef);
  }
  else {
    MergeTextColumnValues($NewSDFileRef, $SDFileRef);
  }

  # Close all opened files. Buffered write errors only surface at close time,
  # so the output file close is checked...
  close $NewSDFileRef or die "Error: Couldn't close $OptionsInfo{NewSDFile}: $! \n";
  close $SDFileRef;
  for $Index (0 .. $#TextFilesList) {
    close $TextFilesInfo{FileHandle}[$Index];
  }
}
118 | |
# Merge the specified text columns into SD file...
#
# Parameters:
#   $NewSDFileRef - file handle reference the merged SD data is printed to
#   $SDFileRef    - file handle reference the compounds are read from
#
# For each compound read from the SD file, the next line is read from every
# text file and the values of the columns listed in
# @{$TextFilesInfo{ColToMerge}[$Index]} are written as SD data fields labeled
# by @{$TextFilesInfo{ColToMergeLabels}[$Index]}.
sub MergeTextColumnValues {
  my($NewSDFileRef, $SDFileRef) = @_;
  my($Index, $Value, $CmpdString, $Line, $InDelim, $ColNum, $ColIndex, @ColLabels, @ColValues, @LineWords);

  while ($CmpdString = ReadCmpdString($SDFileRef)) {
    # Take off the record terminator; it's written back after the new data fields...
    $CmpdString =~ s/\$\$\$\$$//g;
    print $NewSDFileRef "$CmpdString";

    # Merge column values from other text files...
    @ColLabels = (); @ColValues = ();
    for $Index (0 .. $#TextFilesList) {
      push @ColLabels, @{$TextFilesInfo{ColToMergeLabels}[$Index]};
      $InDelim = $TextFilesInfo{InDelim}[$Index];

      # What GetTextLine returns at end of file isn't visible here, so both an
      # undefined and an empty line are treated as "no more data". Testing the
      # length instead of the truth value keeps a line containing just "0"
      # from being mistaken for the end of data...
      @LineWords = ();
      $Line = GetTextLine($TextFilesInfo{FileHandle}[$Index]);
      if (defined($Line) && length($Line)) {
        @LineWords = quotewords($InDelim, 0, $Line);
      }

      # Always add one value for each label, using empty values for a text file
      # which has run out of lines; otherwise values from the text files which
      # follow would end up under the wrong labels...
      for $ColNum (@{$TextFilesInfo{ColToMerge}[$Index]}) {
        $Value = ($ColNum < @LineWords) ? $LineWords[$ColNum] : "";
        push @ColValues, $Value;
      }
    }

    for $ColIndex (0 .. $#ColLabels) {
      print $NewSDFileRef "> <$ColLabels[$ColIndex]>\n$ColValues[$ColIndex]\n\n";
    }
    print $NewSDFileRef "\$\$\$\$\n";
  }
}
150 | |
# Merge the specified text columns into SD file using keys...
#
# Parameters:
#   $NewSDFileRef - file handle reference the merged SD data is printed to
#   $SDFileRef    - file handle reference the compounds are read from
#
# All remaining lines of each text file are first collected into a map from
# key column value to line words. Each compound which has the data field named
# by $OptionsInfo{SDKey} is then written out along with the column values of
# the matching line from each text file; empty values are written for text
# files without a matching line. Compounds which don't have the SD key data
# field are not written to the new SD file.
sub MergeTextColumnValuesUsingKeys {
  my($NewSDFileRef, $SDFileRef) = @_;
  my($Index, $CmpdString, $Value, $InDelim, $KeyColNum, $KeyColValue, $Line, $ColIndex, $ColNum, @ColLabels, @ColValues, @LineWords, @CmpdLines, @TextFilesKeysToLinesMap, %DataFieldValues);

  # Retrieve text lines from all the text files...
  @TextFilesKeysToLinesMap = ();

  TEXTFILE: for $Index (0 .. $#TextFilesList) {
    $InDelim = $TextFilesInfo{InDelim}[$Index];
    %{$TextFilesKeysToLinesMap[$Index]} = ();
    $KeyColNum = $TextFilesInfo{KeysToUse}[$Index];

    # ProcessKeysInfo leaves -1 as the key column for a key it warned about
    # and ignored. A negative array index would silently pick the key values
    # from the end of the line, so no lines are mapped for such a file...
    if ($KeyColNum < 0) {
      next TEXTFILE;
    }

    # What GetTextLine returns at end of file isn't visible here, so both an
    # undefined and an empty line end the loop. Testing the length instead of
    # the truth value keeps a line containing just "0" from ending it early...
    while (defined($Line = GetTextLine($TextFilesInfo{FileHandle}[$Index])) && length($Line)) {
      @LineWords = quotewords($InDelim, 0, $Line);

      if ($KeyColNum < @LineWords) {
        $KeyColValue = $LineWords[$KeyColNum];

        if (length($KeyColValue)) {
          # The first line for a key value wins; later ones are reported and dropped...
          if (exists($TextFilesKeysToLinesMap[$Index]{$KeyColValue})) {
            warn "Warning: Ignoring line, $Line, in text file $TextFilesList[$Index]: Column key value, $KeyColValue, already exists\n";
          }
          else {
            @{$TextFilesKeysToLinesMap[$Index]{$KeyColValue}} = ();
            push @{$TextFilesKeysToLinesMap[$Index]{$KeyColValue}}, @LineWords;
          }
        }
      }
    }
  }

  while ($CmpdString = ReadCmpdString($SDFileRef)) {
    @CmpdLines = split "\n", $CmpdString;
    %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);

    if (exists($DataFieldValues{$OptionsInfo{SDKey}})) {
      @ColLabels = (); @ColValues = ();
      # Take off the record terminator; it's written back after the new data fields...
      $CmpdString =~ s/\$\$\$\$$//g;
      print $NewSDFileRef "$CmpdString";

      $KeyColValue = $DataFieldValues{$OptionsInfo{SDKey}};

      # Merge column values from other text files...
      for $Index (0 .. $#TextFilesList) {
        push @ColLabels, @{$TextFilesInfo{ColToMergeLabels}[$Index]};
        @LineWords = ();

        if (exists($TextFilesKeysToLinesMap[$Index]{$KeyColValue})) {
          push @LineWords, @{$TextFilesKeysToLinesMap[$Index]{$KeyColValue}};
        }

        # With no matching line, @LineWords is empty and every value is empty...
        for $ColNum (@{$TextFilesInfo{ColToMerge}[$Index]}) {
          $Value = ($ColNum < @LineWords) ? $LineWords[$ColNum] : "";
          push @ColValues, $Value;
        }
      }

      for $ColIndex (0 .. $#ColLabels) {
        $Value = (($ColIndex < @ColValues) && IsNotEmpty($ColValues[$ColIndex]) ) ? $ColValues[$ColIndex] : "";
        print $NewSDFileRef "> <$ColLabels[$ColIndex]>\n$Value\n\n";
      }
      print $NewSDFileRef "\$\$\$\$\n";
    }
  }
}
217 | |
# Set up the columns to merge for all text files and, when keys have been
# specified, the key columns as well...
sub RetrieveColumnsAndKeysInfo {
  ProcessColumnsInfo();

  # Key columns only matter for key based merging...
  ProcessKeysInfo() if $OptionsInfo{Keys};
}
226 | |
# Process specified columns...
#
# For each text file, sets up these entries in %TextFilesInfo indexed by the
# text file index:
#
#   ColSpecified            - column numbers or labels as specified
#   ColToMerge              - zero based column numbers to merge, sorted
#                             numerically
#   ColToMergeLabels        - labels of the columns to merge, in the same order
#   ColToMergeNumToLabelMap - zero based column number to column label map
#
# Specified columns which don't exist in a text file are reported by a warning
# and ignored.
sub ProcessColumnsInfo {
  my($Index, $Values, $ColIndex, $ColNum, $ColLabel, @Words);

  @{$TextFilesInfo{ColSpecified}} = ();
  @{$TextFilesInfo{ColToMerge}} = ();
  @{$TextFilesInfo{ColToMergeLabels}} = ();
  @{$TextFilesInfo{ColToMergeNumToLabelMap}} = ();

  for $Index (0 .. $#TextFilesList) {

    @{$TextFilesInfo{ColSpecified}[$Index]} = ();

    $Values = "all";
    if ($OptionsInfo{Columns}) {
      $Values = $OptionsInfo{ColValues}[$Index];
    }

    # Only the complete value "all" selects every column. An unanchored match
    # would also trigger on any column specification which merely contains
    # "all", such as a column label "Overall"...
    if ($Values =~ /^\s*all\s*$/i) {
      if ($OptionsInfo{Mode} =~ /^colnum$/i) {
        for $ColNum (1 .. $TextFilesInfo{ColCount}[$Index]) {
          push @{$TextFilesInfo{ColSpecified}[$Index]}, $ColNum;
        }
      } else {
        push @{$TextFilesInfo{ColSpecified}[$Index]}, @{$TextFilesInfo{ColLabels}[$Index]};
      }
    }
    else {
      @Words = split ",", $Values;
      push @{$TextFilesInfo{ColSpecified}[$Index]}, @Words;
    }

    @{$TextFilesInfo{ColToMerge}[$Index]} = ();
    %{$TextFilesInfo{ColToMergeNumToLabelMap}[$Index]} = ();

    if ($OptionsInfo{Mode} =~ /^collabel$/i) {
      # Map the specified labels to zero based column numbers...
      for $ColIndex (0 .. $#{$TextFilesInfo{ColSpecified}[$Index]}) {
        $ColLabel = $TextFilesInfo{ColSpecified}[$Index][$ColIndex];

        if (exists($TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel})) {
          $ColNum = $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel};
          push @{$TextFilesInfo{ColToMerge}[$Index]}, $ColNum;
          $TextFilesInfo{ColToMergeNumToLabelMap}[$Index]{$ColNum} = $ColLabel;
        }
        else {
          warn "Warning: Ignoring value, $ColLabel, specified using \"-c --column\" option: column name doesn't exist in $TextFilesList[$Index] \n";
        }
      }
    }
    else {
      for $ColIndex (0 .. $#{$TextFilesInfo{ColSpecified}[$Index]}) {
        $ColNum = $TextFilesInfo{ColSpecified}[$Index][$ColIndex];

        # Make sure it's a numeric value...
        if (!IsPositiveInteger($ColNum)) {
          warn "Warning: Ignoring value, $ColNum, specified using \"-c --column\" option: Allowed integer values: > 0\n";
        }
        else {
          if ($ColNum > 0 && $ColNum <= $TextFilesInfo{ColCount}[$Index]) {
            # Specified column numbers start from 1; stored ones start from 0...
            $ColNum -= 1;
            push @{$TextFilesInfo{ColToMerge}[$Index]}, $ColNum;
            $TextFilesInfo{ColToMergeNumToLabelMap}[$Index]{$ColNum} = $TextFilesInfo{ColLabels}[$Index][$ColNum];
          }
          else {
            warn "Warning: Ignoring value, $ColNum, specified using \"-c --column\" option: column number doesn't exist in $TextFilesList[$Index] \n";
          }
        }
      }
    }

    # Columns are merged in the order they appear in the text file and not in
    # the order they were specified...
    my (@TextFilesColToMergeSorted) = sort { $a <=> $b } @{$TextFilesInfo{ColToMerge}[$Index]};

    @{$TextFilesInfo{ColToMerge}[$Index]} = ();
    push @{$TextFilesInfo{ColToMerge}[$Index]}, @TextFilesColToMergeSorted;

    # Set up the labels...
    @{$TextFilesInfo{ColToMergeLabels}[$Index]} = ();
    for $ColNum (@TextFilesColToMergeSorted) {
      push @{$TextFilesInfo{ColToMergeLabels}[$Index]}, $TextFilesInfo{ColToMergeNumToLabelMap}[$Index]{$ColNum};
    }
  }
}
309 | |
# Process specified keys...
#
# Records, for each text file, the key as specified in
# $TextFilesInfo{KeysSpecified} and the zero based key column number in
# $TextFilesInfo{KeysToUse}; the latter stays at -1 for a key which is warned
# about and ignored. The key column is then taken off the list of columns to
# merge, along with its label.
sub ProcessKeysInfo {
  my($Index, $Key, $KeyColNum, @ColNumsToKeep, @ColLabelsToKeep);

  @{$TextFilesInfo{KeysSpecified}} = ();
  @{$TextFilesInfo{KeysToUse}} = ();

  for $Index (0 .. $#TextFilesList) {
    $Key = $OptionsInfo{KeyValues}[$Index];

    $TextFilesInfo{KeysSpecified}[$Index] = $Key;
    $TextFilesInfo{KeysToUse}[$Index] = -1;

    if ($OptionsInfo{Mode} =~ /^collabel$/i) {
      # Key is a column label...
      if (exists($TextFilesInfo{ColLabelToNumMap}[$Index]{$Key})) {
        $TextFilesInfo{KeysToUse}[$Index] = $TextFilesInfo{ColLabelToNumMap}[$Index]{$Key};
      }
      else {
        warn "Warning: Ignoring value, $Key, specified using \"-k --keys\" option: column name doesn't exist in $TextFilesList[$Index] \n";
      }
    }
    elsif (!IsPositiveInteger($Key)) {
      warn "Warning: Ignoring value, $Key, specified using \"-k --keys\" option: Allowed integer values: > 0 \n";
    }
    elsif ($Key > 0 && $Key <= $TextFilesInfo{ColCount}[$Index]) {
      # Key is a column number starting from 1; stored column numbers start from 0...
      $TextFilesInfo{KeysToUse}[$Index] = $Key - 1;
    }
    else {
      warn "Warning: Ignoring value, $Key, specified using \"-k --keys\" option: column number doesn't exist in $TextFilesList[$Index] \n";
    }
  }

  # Modify columns to merge list to make sure the columns identified by key are taken off the list
  for $Index (0 .. $#TextFilesList) {
    $KeyColNum = $TextFilesInfo{KeysToUse}[$Index];

    @ColNumsToKeep = grep { $KeyColNum != $_ } @{$TextFilesInfo{ColToMerge}[$Index]};
    @ColLabelsToKeep = map { $TextFilesInfo{ColToMergeNumToLabelMap}[$Index]{$_} } @ColNumsToKeep;

    @{$TextFilesInfo{ColToMerge}[$Index]} = @ColNumsToKeep;
    @{$TextFilesInfo{ColToMergeLabels}[$Index]} = @ColLabelsToKeep;
  }
}
370 | |
# Make sure the first file specified is a SD file and that it exists; die
# otherwise...
sub ProcessSDFileInfo {
  # File type is checked before existence...
  CheckFileType($SDFile, "sd sdf")
    or die "Error: Invalid first file $SDFile: It's not a SD file\n";

  (-e $SDFile)
    or die "Error: SDFile $SDFile doesn't exist\n";
}
380 | |
# Retrieve information about input text files...
#
# Resets %TextFilesInfo and sets up these entries indexed by the text file
# index:
#
#   FileOkay         - 1 for a file which passed all the checks; 0 otherwise
#   InDelim          - delimiter: tab for tsv files; comma or semicolon, as
#                      specified by $OptionsInfo{InDelim}, for csv files
#   ColCount         - number of column labels in the first line
#   ColLabels        - column labels from the first line
#   ColLabelToNumMap - column label to zero based column number map
#
# A warning is issued for each file which fails a check. Dies after all the
# files have been checked when any one of them failed.
sub RetrieveTextFilesInfo {
  my($Index, $TextFile, $TextFileRef, $FileDir, $FileName, $FileExt, $InDelim, $Line, $ColNum, $ColLabel, $FileNotOkayCount, @ColLabels);

  %TextFilesInfo = ();

  @{$TextFilesInfo{FileOkay}} = ();
  @{$TextFilesInfo{ColCount}} = ();
  @{$TextFilesInfo{ColLabels}} = ();
  @{$TextFilesInfo{ColLabelToNumMap}} = ();
  @{$TextFilesInfo{InDelim}} = ();

  $FileNotOkayCount = 0;

  FILELIST: for $Index (0 .. $#TextFilesList) {
    $TextFile = $TextFilesList[$Index];

    $TextFilesInfo{FileOkay}[$Index] = 0;
    $TextFilesInfo{ColCount}[$Index] = 0;
    $TextFilesInfo{InDelim}[$Index] = "";

    @{$TextFilesInfo{ColLabels}[$Index]} = ();
    %{$TextFilesInfo{ColLabelToNumMap}[$Index]} = ();

    if (!(-e $TextFile)) {
      warn "Warning: Ignoring file $TextFile: It doesn't exist\n";
      $FileNotOkayCount++;
      next FILELIST;
    }
    if (!CheckFileType($TextFile, "csv tsv")) {
      warn "Warning: Ignoring file $TextFile: It's not a csv or tsv file\n";
      $FileNotOkayCount++;
      next FILELIST;
    }
    ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
    if ($FileExt =~ /^tsv$/i) {
      $InDelim = "\t";
    }
    else {
      $InDelim = "\,";
      if ($OptionsInfo{InDelim} !~ /^(comma|semicolon)$/i) {
        warn "Warning: Ignoring file $TextFile: The value specified, $OptionsInfo{InDelim}, for option \"--indelim\" is not valid for csv files\n";
        $FileNotOkayCount++;
        next FILELIST;
      }
      if ($OptionsInfo{InDelim} =~ /^semicolon$/i) {
        $InDelim = "\;";
      }
    }

    # Three-argument open keeps characters such as ">", "|" or surrounding
    # spaces in a file name from being interpreted as part of the open mode...
    $TextFileRef = undef;
    if (!open $TextFileRef, "<", $TextFile) {
      warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
      $FileNotOkayCount++;
      next FILELIST;
    }

    # The first line provides the column labels...
    $Line = GetTextLine($TextFileRef);
    close $TextFileRef;

    # What GetTextLine returns for a file without any data isn't visible here;
    # such a file is taken to have no columns instead of handing an undefined
    # or empty line over to quotewords...
    @ColLabels = (defined($Line) && length($Line)) ? quotewords($InDelim, 0, $Line) : ();

    $TextFilesInfo{FileOkay}[$Index] = 1;
    $TextFilesInfo{InDelim}[$Index] = $InDelim;

    $TextFilesInfo{ColCount}[$Index] = @ColLabels;
    push @{$TextFilesInfo{ColLabels}[$Index]}, @ColLabels;
    for $ColNum (0 .. $#ColLabels) {
      $ColLabel = $ColLabels[$ColNum];
      $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel} = $ColNum;
    }
  }
  # Make sure all specified files are valid for merging to work properly...
  if ($FileNotOkayCount) {
    die "Error: Problems with input text file(s)...\n";
  }
}
456 | |
# Process option values...
#
# Copies the command line option values from %Options into %OptionsInfo,
# splits the per text file values of the "-c --columns" and "-k --keys"
# options into lists, and sets up the name of the new SD file in
# $OptionsInfo{NewSDFile}. Dies on an invalid number of per file values, on
# empty per file values, on an already existing output file without the
# overwrite option, and on a root based output name matching the input SD file
# name.
sub ProcessOptions {
  my($FileDir, $FileName, $FileExt, $SDFileName, $NewSDFile, @ColValues, @KeyValues);

  %OptionsInfo = ();

  # Values used as specified...
  $OptionsInfo{Mode} = $Options{mode};
  $OptionsInfo{InDelim} = $Options{indelim};
  $OptionsInfo{OutFileRoot} = $Options{root};
  $OptionsInfo{Overwrite} = $Options{overwrite};
  $OptionsInfo{SDKey} = $Options{sdkey};

  # Columns: one non-empty value for each text file...
  $OptionsInfo{Columns} = $Options{columns};
  @ColValues = ();
  if ($Options{columns}) {
    @ColValues = split ";", $Options{columns};
    unless (@ColValues == @TextFilesList) {
      die "Error: Invalid number of values specified by \"-c --columns\" option: it must be equal to number of input text files.\n";
    }
    if (grep { !length($_) } @ColValues) {
      die "Error: Invalid value specified by \"-c --columns\" option: empty values are not allowed.\n";
    }
  }
  @{$OptionsInfo{ColValues}} = @ColValues;

  # Keys: one non-empty value for each text file...
  $OptionsInfo{Keys} = $Options{keys};
  @KeyValues = ();
  if ($Options{keys}) {
    @KeyValues = split ";", $Options{keys};
    unless (@KeyValues == @TextFilesList) {
      die "Error: Invalid number of values specified by \"-k --keys\" option: it must be equal to number of input text files.\n";
    }
    if (grep { !length($_) } @KeyValues) {
      die "Error: Invalid value specified by \"-k --keys\" option: empty values are not allowed.\n";
    }
  }
  @{$OptionsInfo{KeyValues}} = @KeyValues;

  # Setup new SD file...
  if ($Options{root}) {
    # Use just the name part of a root specified along with an extension...
    ($FileDir, $FileName, $FileExt) = ParseFileName($Options{root});
    $NewSDFile = ($FileName && $FileExt) ? $FileName : $Options{root};
  }
  else {
    # Default: <SDFileName>MergedWith<FirstTextFileName>1To<TextFilesCount>
    ($FileDir, $SDFileName, $FileExt) = ParseFileName($SDFile);
    ($FileDir, $FileName, $FileExt) = ParseFileName($TextFilesList[0]);

    $NewSDFile = $SDFileName . "MergedWith" . $FileName . "1To" . scalar(@TextFilesList);
  }
  $NewSDFile .= ".sdf";

  if (!$Options{overwrite} && -e $NewSDFile) {
    die "Error: The file $NewSDFile already exists.\n";
  }
  if ($Options{root} && lc($NewSDFile) eq lc($SDFile)) {
    die "Error: Output filename, $NewSDFile, is similar to a input file name.\nSpecify a different name using \"-r --root\" option or use default name.\n";
  }
  $OptionsInfo{NewSDFile} = $NewSDFile;
}
538 | |
# Set up the default option values, retrieve the command line options into
# %Options and validate their values. Changes over to the working directory
# when one is specified. Dies on any invalid option or option value...
sub SetupScriptUsage {

  # Defaults for options which always have a value...
  %Options = ("mode" => "colnum", "indelim" => "comma");

  GetOptions(\%Options, "help|h", "indelim=s", "columns|c=s", "keys|k=s", "mode|m=s", "overwrite|o", "root|r=s", "sdkey|s=s", "workingdir|w=s")
    or die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";

  if ($Options{workingdir}) {
    unless (-d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  unless ($Options{mode} =~ /^(colnum|collabel)$/i) {
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: colnum, or collabel\n";
  }
  unless ($Options{indelim} =~ /^(comma|semicolon)$/i) {
    die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n";
  }

  # SD key and column keys options must be specified together...
  die "Error: The option \"-s --sdkey\" can't be specified without the \"-k --keys\" option.\n"
    if ($Options{sdkey} && !$Options{keys});
  die "Error: The option \"-k --keys\" can't be specified without the \"-s --sdkey\" option.\n"
    if (!$Options{sdkey} && $Options{keys});
}
569 | |
570 __END__ | |
571 | |
572 =head1 NAME | |
573 | |
574 MergeTextFilesWithSD.pl - Merge CSV or TSV TextFile(s) into SDFile | |
575 | |
576 =head1 SYNOPSIS | |
577 | |
578 MergeTextFilesWithSD.pl SDFile TextFile(s)... | |
579 | |
580 MergeTextFilesWithSD.pl [B<-h, --help>] [B<--indelim> comma | semicolon] | |
581 [B<-c, --columns> colnum,...;... | collabel,...;...] [B<-k, --keys> colkeynum;... | colkeylabel;...] | |
582 [B<-m, --mode> colnum | collabel] [B<-o, --overwrite>] [B<-r, --root> rootname] | |
583 [B<-s, --sdkey> sdfieldname] [B<-w, --workingdir> dirname] SDFile TextFile(s)... | |
584 | |
585 =head1 DESCRIPTION | |
586 | |
587 Merge multiple CSV or TSV I<TextFile(s)> into I<SDFile>. Unless B<-k --keys> | |
588 option is used, data rows from all I<TextFile(s)> are added to I<SDFile> in a | |
589 sequential order, and the number of compounds in I<SDFile> is used to determine | |
590 how many rows of data are added from I<TextFile(s)>. | |
591 | |
592 Multiple I<TextFile(s)> names are separated by spaces. The valid file extensions are I<.csv> and | |
593 I<.tsv> for comma/semicolon and tab delimited text files respectively. All other file names | |
594 are ignored. All the text files in a current directory can be specified by I<*.csv>, | |
595 I<*.tsv>, or the current directory name. The B<--indelim> option determines the | |
596 format of I<TextFile(s)>. Any file which doesn't correspond to the format indicated | |
597 by B<--indelim> option is ignored. | |
598 | |
599 =head1 OPTIONS | |
600 | |
601 =over 4 | |
602 | |
603 =item B<-h, --help> | |
604 | |
605 Print this help message. | |
606 | |
607 =item B<--indelim> I<comma | semicolon> | |
608 | |
609 Input delimiter for CSV I<TextFile(s)>. Possible values: I<comma or semicolon>. | |
610 Default value: I<comma>. For TSV files, this option is ignored and I<tab> is used as a | |
611 delimiter. | |
612 | |
613 =item B<-c, --columns> I<colnum,...;... | collabel,...;...> | |
614 | |
615 This value is mode specific. It is a list of columns to merge into I<SDFile> | |
616 specified by column numbers or labels for each text file delimited by ";". | |
617 All I<TextFile(s)> are merged into I<SDFile>. | |
618 | |
619 Default value: I<all;all;...>. By default, all columns from TextFile(s) are | |
620 merged into I<SDFile>. | |
621 | |
622 For I<colnum> mode, input value format is: I<colnum,...;colnum,...;...>. Example: | |
623 | |
624 "1,2;1,3,4;7,8,9" | |
625 | |
626 For I<collabel> mode, input value format is: I<collabel,...;collabel,...;...>. Example: | |
627 | |
628 "MW,SumNO;SumNHOH,ClogP,PSA;MolName,Mol_Id,Extreg" | |
629 | |
630 =item B<-k, --keys> I<colkeynum;... | colkeylabel;...> | |
631 | |
632 This value is mode specific. It specifies column keys to use for merging | |
633 I<TextFile(s)> into I<SDFile>. The column keys, delimited by ";", are specified by column | |
634 numbers or labels for I<TextFile(s)>. | |
635 | |
636 By default, data rows from I<TextFile(s)> are merged into I<SDFile> in the order they appear. | |
637 | |
638 For I<colnum> mode, input value format is: I<colkeynum;colkeynum;...>. Example: | |
639 | |
640 "1;3;7" | |
641 | |
642 For I<collabel> mode, input value format is: I<colkeylabel;colkeylabel;...>. Example: | |
643 | |
644 "Mol_Id;Mol_Id;Cmpd_Id" | |
645 | |
646 =item B<-m, --mode> I<colnum | collabel> | |
647 | |
648 Specify how to merge I<TextFile(s)> into I<SDFile>: using column numbers or column labels. | |
649 Possible values: I<colnum or collabel>. Default value: I<colnum>. | |
650 | |
651 =item B<-o, --overwrite> | |
652 | |
653 Overwrite existing files. | |
654 | |
655 =item B<-r, --root> I<rootname> | |
656 | |
657 New SD file name is generated using the root: <Root>.sdf. Default file name: | |
658 <InitialSDFileName>MergedWith<FirstTextFileName>1To<Count>.sdf. | |
659 | |
660 =item B<-s, --sdkey> I<sdfieldname> | |
661 | |
662 I<SDFile> data field name used as a key to merge data from TextFile(s). By default, | |
663 data rows from I<TextFile(s)> are merged into I<SDFile> in the order they appear. | |
664 | |
665 =item B<-w, --workingdir> I<dirname> | |
666 | |
667 Location of working directory. Default: current directory. | |
668 | |
669 =back | |
670 | |
671 =head1 EXAMPLES | |
672 | |
673 To merge Sample1.csv and Sample2.csv into Sample.sdf and generate | |
674 NewSample.sdf, type: | |
675 | |
676 % MergeTextFilesWithSD.pl -r NewSample -o Sample.sdf | |
677 Sample1.csv Sample2.csv | |
678 | |
679 To merge all Sample*.tsv into Sample.sdf and generate NewSample.sdf file, type: | |
680 | |
681 % MergeTextFilesWithSD.pl -r NewSample -o Sample.sdf | |
682 Sample*.tsv | |
683 | |
684 To merge column numbers "1,2" and "3,4,5" from Sample1.csv and Sample2.csv | |
685 into Sample.sdf and to generate NewSample.sdf, type: | |
686 | |
687 % MergeTextFilesWithSD.pl -r NewSample -m colnum -c "1,2;3,4,5" | |
688 -o Sample.sdf Sample1.csv Sample2.csv | |
689 | |
690 To merge column "Mol_ID,Formula,MolWeight" and "Mol_ID,ChemBankID,NAME" | |
691 from Sample1.csv and Sample2.csv into Sample.sdf using "Mol_ID" as SD and column keys | |
692 to generate NewSample.sdf, type: | |
693 | |
694 % MergeTextFilesWithSD.pl -r NewSample -s Mol_ID -k "Mol_ID;Mol_ID" | |
695 -m collabel -c "Mol_ID,Formula,MolWeight;Mol_ID,ChemBankID,NAME" | |
696 -o Sample.sdf Sample1.csv Sample2.csv | |
697 | |
698 =head1 AUTHOR | |
699 | |
700 Manish Sud <msud@san.rr.com> | |
701 | |
702 =head1 SEE ALSO | |
703 | |
704 ExtractFromSDFiles.pl, FilterSDFiles.pl, InfoSDFiles.pl, JoinSDFiles.pl, JoinTextFiles.pl, | |
705 MergeTextFiles.pl, ModifyTextFilesFormat.pl, SplitSDFiles.pl, SplitTextFiles.pl | |
706 | |
707 =head1 COPYRIGHT | |
708 | |
709 Copyright (C) 2015 Manish Sud. All rights reserved. | |
710 | |
711 This file is part of MayaChemTools. | |
712 | |
713 MayaChemTools is free software; you can redistribute it and/or modify it under | |
714 the terms of the GNU Lesser General Public License as published by the Free | |
715 Software Foundation; either version 3 of the License, or (at your option) | |
716 any later version. | |
717 | |
718 =cut |