0
|
1 #!/usr/bin/perl -w
|
|
2 #
|
|
3 # $RCSfile: ExtractFromTextFiles.pl,v $
|
|
4 # $Date: 2015/02/28 20:46:19 $
|
|
5 # $Revision: 1.42 $
|
|
6 #
|
|
7 # Author: Manish Sud <msud@san.rr.com>
|
|
8 #
|
|
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
|
|
10 #
|
|
11 # This file is part of MayaChemTools.
|
|
12 #
|
|
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
|
|
14 # the terms of the GNU Lesser General Public License as published by the Free
|
|
15 # Software Foundation; either version 3 of the License, or (at your option) any
|
|
16 # later version.
|
|
17 #
|
|
18 # MayaChemTools is distributed in the hope that it will be useful, but without
|
|
19 # any warranty; without even the implied warranty of merchantability of fitness
|
|
20 # for a particular purpose. See the GNU Lesser General Public License for more
|
|
21 # details.
|
|
22 #
|
|
23 # You should have received a copy of the GNU Lesser General Public License
|
|
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
|
|
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
|
|
26 # Boston, MA, 02111-1307, USA.
|
|
27 #
|
|
28
|
|
29 use strict;
|
|
30 use FindBin; use lib "$FindBin::Bin/../lib";
|
|
31 use Getopt::Long;
|
|
32 use File::Basename;
|
|
33 use Text::ParseWords;
|
|
34 use FileHandle;
|
|
35 use Benchmark;
|
|
36 use FileUtil;
|
|
37 use TextUtil;
|
|
38
|
|
# Script-wide state; these lexicals are shared with the subroutines defined below.
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Turn off output buffering on STDOUT so that progress messages appear right away.
$| = 1;

$StartTime = Benchmark->new;

# Announce the start of the run...
$ScriptName = basename($0);
print "\n$ScriptName:Starting...\n\n";

# Set up command line options; print the usage text and quit when help is
# requested or no input file is specified...
SetupScriptUsage();
die GetUsageFromPod("$FindBin::Bin/$ScriptName") if ($Options{help} || @ARGV < 1);

# Build the list of input files from the command line arguments...
my(@TextFilesList);
@TextFilesList = ExpandFileNames(\@ARGV, "csv tsv");

# Check option values and cache them in %OptionsInfo...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Collect column and row information for all the text files...
print "Checking input text file(s)...\n";
my(%TextFilesInfo);
RetrieveTextFilesInfo();
RetrieveColumnsAndRowsInfo();

# Generate output files, skipping over any input file flagged as not okay...
my($FileIndex);
print "\nProcessing text files...\n" if (@TextFilesList > 1);
FILE: for $FileIndex (0 .. $#TextFilesList) {
  next FILE unless ($TextFilesInfo{FileOkay}[$FileIndex]);
  print "\nProcessing file $TextFilesList[$FileIndex]...\n";
  ExtractFromTextFile($FileIndex);
}
print "\n$ScriptName:Done...\n\n";

# Report the elapsed time...
$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
|
|
86
|
|
87 ###############################################################################
|
|
88
|
|
# Extract appropriate data from text file...
#
# Hands the file at position $Index in @TextFilesList over to the extraction
# routine matching the mode stored in $OptionsInfo{Mode}: "categories", "rows",
# or, for any other value, columns.
#
sub ExtractFromTextFile {
  my($Index) = @_;
  my($Mode) = $OptionsInfo{Mode};

  return ExtractCategoryData($Index) if ($Mode =~ /^categories$/i);
  return ExtractRowsData($Index) if ($Mode =~ /^rows$/i);

  # Anything else is handled as column extraction...
  return ExtractColumnData($Index);
}
|
|
103
|
|
# Generate category files...
#
# Groups the data rows of one input text file by the value found in its category
# column and writes out:
#
#   o A summary file containing each category name along with its row count
#   o One file per category containing the column labels and all of its rows
#
# Rows whose category column is missing or undefined are collected under an empty
# category name.
#
# Parameters:
#   $Index - Index of the file in @TextFilesList; the same index is used to look
#            up the corresponding entries in %TextFilesInfo.
#
# Dies with an error message when a file can't be opened or an output file can't
# be closed.
#
sub ExtractCategoryData {
  my($Index) = @_;
  my($TextFile, $CategoryCol, $NewTextFile, $InDelim, @ColLabels);

  $TextFile = $TextFilesList[$Index];

  $NewTextFile = $TextFilesInfo{OutFile}[$Index];
  $CategoryCol = $TextFilesInfo{CategoryColNum}[$Index];
  $InDelim = $TextFilesInfo{InDelim}[$Index];
  @ColLabels = @{$TextFilesInfo{ColLabels}[$Index]};

  my($TextFileHandle, $LabelLine, $Line, @LineWords, $CategoryName, $CategoryCount, %CategoriesNameToCountMap, %CategoriesNameToLinesMap);

  # Collect category data...
  open $TextFileHandle, "<", $TextFile or die "Couldn't open $TextFile: $! \n";

  # Skip label line; the labels already stored in %TextFilesInfo are written out instead...
  $LabelLine = <$TextFileHandle>;

  %CategoriesNameToCountMap = ();
  %CategoriesNameToLinesMap = ();

  # The loop ends at the first false value returned by GetTextLine...
  while ($Line = GetTextLine($TextFileHandle)) {
    @LineWords = quotewords($InDelim, 0, $Line);

    # Category column number is zero based: it must be strictly less than the number
    # of values on the line to refer to an existing value...
    $CategoryName = "";
    if ($CategoryCol < @LineWords && defined $LineWords[$CategoryCol]) {
      $CategoryName = $LineWords[$CategoryCol];
    }
    $CategoriesNameToCountMap{$CategoryName} += 1;
    push @{$CategoriesNameToLinesMap{$CategoryName}}, $Line;
  }
  close $TextFileHandle;

  # Write out summary file...
  my($NewTextFileHandle);

  print "Generating file $NewTextFile...\n";
  open $NewTextFileHandle, ">", $NewTextFile or die "Couldn't open $NewTextFile: $! \n";

  # Write out column labels...
  @LineWords = ("Category","Count");
  $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print $NewTextFileHandle "$Line\n";

  # Write out the category names and count...
  for $CategoryName (sort { lc($a) cmp lc($b) } keys %CategoriesNameToCountMap) {
    $CategoryCount = $CategoriesNameToCountMap{$CategoryName};
    @LineWords = ("$CategoryName","$CategoryCount");
    $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileHandle "$Line\n";
  }
  # Buffered write errors show up at close time...
  close $NewTextFileHandle or die "Couldn't close $NewTextFile: $! \n";

  # Write out a file for each category...
  my($ColLabelLine, $CategoryFile);

  $ColLabelLine = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print "\nGenerating text files for each category...\n";

  for $CategoryName (sort { lc($a) cmp lc($b) } keys %CategoriesNameToCountMap) {
    # Setup file name for the category file; all spaces are dropped from it.
    #
    # NOTE(review): category values are taken from the data and used verbatim in
    # the file name; names differing only by spaces end up in the same file.
    $CategoryFile = $TextFilesInfo{CategoryOutFileRoot}[$Index] . "$CategoryName" . ".$TextFilesInfo{OutFileExt}[$Index]";
    $CategoryFile =~ s/ //g;

    print "Generating file $CategoryFile...\n";

    # Open one category file at a time to avoid holding a file handle for each
    # category simultaneously...
    open my $CategoryFileHandle, ">", $CategoryFile or die "Couldn't open $CategoryFile: $! \n";

    print $CategoryFileHandle "$ColLabelLine\n";
    for $Line (@{$CategoriesNameToLinesMap{$CategoryName}}) {
      # Split using input delimiter and join back using output delimiter and quote settings...
      @LineWords = quotewords($InDelim, 0, $Line);
      print $CategoryFileHandle JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}) . "\n";
    }
    close $CategoryFileHandle or die "Couldn't close $CategoryFile: $! \n";
  }
}
|
|
192
|
|
# Extract data for specific columns...
#
# Writes a new text file containing only the columns listed in
# $TextFilesInfo{ColNumsToExtract}[$Index] (zero based column numbers), in the
# listed order. The label line is generated from the column labels stored in
# %TextFilesInfo; a column missing or undefined on a data line is written out as an
# empty value.
#
# Parameters:
#   $Index - Index of the file in @TextFilesList; the same index is used to look
#            up the corresponding entries in %TextFilesInfo.
#
# Dies with an error message when the input or output file can't be opened or the
# output file can't be closed.
#
sub ExtractColumnData {
  my($Index) = @_;
  my($TextFile, @ColNumsToExtract, $NewTextFile, $InDelim);

  $TextFile = $TextFilesList[$Index];
  $NewTextFile = $TextFilesInfo{OutFile}[$Index];
  $InDelim = $TextFilesInfo{InDelim}[$Index];
  @ColNumsToExtract = @{$TextFilesInfo{ColNumsToExtract}[$Index]};

  my($TextFileHandle, $NewTextFileHandle, $LabelLine);

  print "Generating file $NewTextFile...\n";
  open $TextFileHandle, "<", $TextFile or die "Couldn't open $TextFile: $! \n";
  open $NewTextFileHandle, ">", $NewTextFile or die "Couldn't open $NewTextFile: $! \n";

  # Skip label line; the labels already stored in %TextFilesInfo are written out instead...
  $LabelLine = <$TextFileHandle>;

  # Write out column labels...
  my($Line, @LineWords, @ColLabels, $ColLabelLine, @ColValues, $ColValuesLine, $ColNum, $ColValue);

  @ColLabels = ();
  for $ColNum (@ColNumsToExtract) {
    push @ColLabels, $TextFilesInfo{ColLabels}[$Index][$ColNum];
  }
  $ColLabelLine = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print $NewTextFileHandle "$ColLabelLine\n";

  # Write out column values; the loop ends at the first false value returned by GetTextLine...
  while ($Line = GetTextLine($TextFileHandle)) {
    @LineWords = quotewords($InDelim, 0, $Line);
    @ColValues = ();
    for $ColNum (@ColNumsToExtract) {
      $ColValue = "";
      if ($ColNum < @LineWords) {
        $ColValue = (defined $LineWords[$ColNum]) ? $LineWords[$ColNum] : "";
      }
      push @ColValues, $ColValue;
    }
    $ColValuesLine = JoinWords(\@ColValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileHandle "$ColValuesLine\n";
  }

  # Buffered write errors show up at close time...
  close $NewTextFileHandle or die "Couldn't close $NewTextFile: $! \n";
  close $TextFileHandle;
}
|
|
233
|
|
# Extract data for specific rows...
#
# Opens the input and output files, writes out the label line generated from the
# column labels stored in %TextFilesInfo, and then lets the routine matching
# $OptionsInfo{SpecifiedRowsMode} copy the selected data rows. Nothing beyond the
# label line is written for an unrecognized mode.
#
# Parameters:
#   $Index - Index of the file in @TextFilesList; the same index is used to look
#            up the corresponding entries in %TextFilesInfo.
#
# Dies with an error message when the input or output file can't be opened or the
# output file can't be closed.
#
sub ExtractRowsData {
  my($Index) = @_;
  my($TextFile, $NewTextFile, $SpecifiedRowsMode);

  $TextFile = $TextFilesList[$Index];
  $NewTextFile = $TextFilesInfo{OutFile}[$Index];

  $SpecifiedRowsMode = $OptionsInfo{SpecifiedRowsMode};

  my($TextFileHandle, $NewTextFileHandle, $LabelLine, $Line, @ColLabels);

  print "Generating file $NewTextFile...\n";
  open $TextFileHandle, "<", $TextFile or die "Couldn't open $TextFile: $! \n";
  open $NewTextFileHandle, ">", $NewTextFile or die "Couldn't open $NewTextFile: $! \n";

  # Skip label line in the input file and write out column labels...
  $LabelLine = <$TextFileHandle>;
  @ColLabels = @{$TextFilesInfo{ColLabels}[$Index]};
  $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print $NewTextFileHandle "$Line\n";

  # Both file handles are positioned past the label line at this point...
  if ($SpecifiedRowsMode =~ /^rowsbycolvalue$/i) {
    ExtractRowsByColValue($Index, $TextFileHandle, $NewTextFileHandle);
  }
  elsif ($SpecifiedRowsMode =~ /^rowsbycolvaluelist$/i) {
    ExtractRowsByColValueList($Index, $TextFileHandle, $NewTextFileHandle);
  }
  elsif ($SpecifiedRowsMode =~ /^rowsbycolvaluerange$/i) {
    ExtractRowsByColValueRange($Index, $TextFileHandle, $NewTextFileHandle);
  }
  elsif ($SpecifiedRowsMode =~ /^(rowbymincolvalue|rowbymaxcolvalue)$/i) {
    ExtractRowByMinOrMaxColValue($Index, $TextFileHandle, $NewTextFileHandle);
  }
  elsif ($SpecifiedRowsMode =~ /^rownums$/i) {
    ExtractRowsByRowNums($Index, $TextFileHandle, $NewTextFileHandle);
  }
  elsif ($SpecifiedRowsMode =~ /^rownumrange$/i) {
    ExtractRowsByRowNumRange($Index, $TextFileHandle, $NewTextFileHandle);
  }

  # Buffered write errors show up at close time...
  close $NewTextFileHandle or die "Couldn't close $NewTextFile: $! \n";
  close $TextFileHandle;
}
|
|
279
|
|
# Extract rows by column value...
#
# Copies each data line read from $TextFileRef to $NewTextFileRef provided it
# satisfies every (column number, value, criterion) triplet stored in
# $TextFilesInfo{RowValues}[$Index]. Criterion "le" and "ge" compare numerically;
# "eq" compares as strings. A line lacking any of the referenced columns is dropped.
#
sub ExtractRowsByColValue {
  my($Index, $TextFileRef, $NewTextFileRef) = @_;
  my($Line, $ColNum, $ColValue, $Criterion, $Value, $ValueIndex, $InDelim, @LineWords);

  $InDelim = $TextFilesInfo{InDelim}[$Index];

  LINE: while ($Line = GetTextLine($TextFileRef)) {
    @LineWords = quotewords($InDelim, 0, $Line);

    # Walk through the triplets; any failed test moves on to the next line...
    $ValueIndex = 0;
    while ($ValueIndex < @{$TextFilesInfo{RowValues}[$Index]}) {
      ($ColNum, $ColValue, $Criterion) = @{$TextFilesInfo{RowValues}[$Index]}[$ValueIndex .. $ValueIndex + 2];
      $ValueIndex += 3;

      next LINE if ($ColNum > $#LineWords);
      $Value = $LineWords[$ColNum];

      if ($Criterion =~ /^le$/i) {
        next LINE if ($Value > $ColValue);
      }
      elsif ($Criterion =~ /^ge$/i) {
        next LINE if ($Value < $ColValue);
      }
      elsif ($Criterion =~ /^eq$/i) {
        next LINE if ($Value ne $ColValue);
      }
    }

    # All the criteria are satisfied: write it out...
    $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileRef "$Line\n";
  }
}
|
|
# Extract rows by column value list...
#
# Copies each data line read from $TextFileRef to $NewTextFileRef when the value in
# the specified column is one of the listed values. The first entry in
# $TextFilesInfo{RowValues}[$Index] is the zero based column number and the
# remaining entries are the values to match as hash keys.
#
sub ExtractRowsByColValueList {
  my($Index, $TextFileRef, $NewTextFileRef) = @_;
  my($Line, $ColNum, $InDelim, @Values, %ColValueMap, @LineWords);

  $InDelim = $TextFilesInfo{InDelim}[$Index];

  # Setup a col value map for quick lookup...
  ($ColNum, @Values) = @{$TextFilesInfo{RowValues}[$Index]};
  %ColValueMap = map { $_ => $_ } @Values;

  LINE: while ($Line = GetTextLine($TextFileRef)) {
    @LineWords = quotewords($InDelim, 0, $Line);

    next LINE if ($ColNum > $#LineWords);
    next LINE unless (exists $ColValueMap{$LineWords[$ColNum]});

    $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileRef "$Line\n";
  }
}
|
|
345
|
|
# Extract row by minimum or maximum column value...
#
# Scans the data lines read from $TextFileRef and writes to $NewTextFileRef the one
# line holding the largest value in the specified column when
# $OptionsInfo{SpecifiedRowsMode} is "rowbymaxcolvalue", or the smallest value
# otherwise. Values are compared numerically; on ties, the earliest line is kept.
# Lines lacking the column are ignored.
#
sub ExtractRowByMinOrMaxColValue {
  my($Index, $TextFileRef, $NewTextFileRef) = @_;
  my($Line, $ColNum, $ColValue, $FirstValue, $ValueLine, $InDelim, @LineWords);
  my($FindMax, $IsBetter);

  $InDelim = $TextFilesInfo{InDelim}[$Index];
  $ColNum = $TextFilesInfo{RowValues}[$Index][0];

  $FindMax = ($OptionsInfo{SpecifiedRowsMode} =~ /^rowbymaxcolvalue$/i) ? 1 : 0;

  ($ValueLine, $ColValue, $FirstValue) = ('', '', 1);

  LINE: while ($Line = GetTextLine($TextFileRef)) {
    @LineWords = quotewords($InDelim, 0, $Line);
    next LINE if ($ColNum > $#LineWords);

    # First usable line is always the initial candidate...
    if ($FirstValue) {
      $FirstValue = 0;
      $IsBetter = 1;
    }
    elsif ($FindMax) {
      $IsBetter = ($LineWords[$ColNum] > $ColValue);
    }
    else {
      $IsBetter = ($LineWords[$ColNum] < $ColValue);
    }

    if ($IsBetter) {
      $ColValue = $LineWords[$ColNum];
      $ValueLine = $Line;
    }
  }

  # Write out the selected line, if any...
  if ($ValueLine) {
    @LineWords = quotewords($InDelim, 0, $ValueLine);
    $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileRef "$Line\n";
  }
}
|
|
385
|
|
# Extract rows by column value range...
#
# Copies each data line read from $TextFileRef to $NewTextFileRef when the value in
# the specified column lies, numerically, between the minimum and maximum values
# inclusive. $TextFilesInfo{RowValues}[$Index] supplies the zero based column
# number, minimum value and maximum value, in that order.
#
sub ExtractRowsByColValueRange {
  my($Index, $TextFileRef, $NewTextFileRef) = @_;
  my($Line, $ColNum, $ColValue, $MinValue, $MaxValue, $InDelim, @LineWords);

  $InDelim = $TextFilesInfo{InDelim}[$Index];
  ($ColNum, $MinValue, $MaxValue) = @{$TextFilesInfo{RowValues}[$Index]}[0 .. 2];

  LINE: while ($Line = GetTextLine($TextFileRef)) {
    @LineWords = quotewords($InDelim, 0, $Line);
    next LINE if ($ColNum > $#LineWords);

    $ColValue = $LineWords[$ColNum];
    next LINE unless ($ColValue >= $MinValue && $ColValue <= $MaxValue);

    $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileRef "$Line\n";
  }
}
|
|
408
|
|
# Extract rows by row number range...
#
# Copies the data lines read from $TextFileRef whose row numbers fall between the
# minimum and maximum row numbers, inclusive, stored in
# $TextFilesInfo{RowValues}[$Index]. The row counter starts at 1 and is incremented
# before each test, so the first line read here is row number 2. Reading stops as
# soon as the maximum row number is passed.
#
sub ExtractRowsByRowNumRange {
  my($Index, $TextFileRef, $NewTextFileRef) = @_;
  my($Line, $MinRowNum, $MaxRowNum, $RowCount, $InDelim, @LineWords);

  $InDelim = $TextFilesInfo{InDelim}[$Index];
  ($MinRowNum, $MaxRowNum) = @{$TextFilesInfo{RowValues}[$Index]}[0 .. 1];

  $RowCount = 1;
  LINE: while ($Line = GetTextLine($TextFileRef)) {
    $RowCount += 1;

    unless ($RowCount >= $MinRowNum && $RowCount <= $MaxRowNum) {
      # Nothing more to extract beyond the end of the range...
      last LINE if ($RowCount > $MaxRowNum);
      next LINE;
    }

    @LineWords = quotewords($InDelim, 0, $Line);
    $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileRef "$Line\n";
  }
}
|
|
431
|
|
# Extract rows by row numbers...
#
# Copies the data lines read from $TextFileRef whose row numbers are listed in
# $TextFilesInfo{RowValues}[$Index]. The row counter starts at 1 and is incremented
# before each lookup, so the first line read here is row number 2. Reading stops
# once the row counter passes the largest listed row number.
#
sub ExtractRowsByRowNums {
  my($Index, $TextFileRef, $NewTextFileRef) = @_;
  my($Line, $RowNum, $MaxRowNum, $RowCount, $InDelim, @RowNums, %RowNumMap, @LineWords);

  $InDelim = $TextFilesInfo{InDelim}[$Index];

  # Setup a row nums map and track the largest row number...
  @RowNums = @{$TextFilesInfo{RowValues}[$Index]};
  %RowNumMap = map { $_ => $_ } @RowNums;

  $MaxRowNum = $RowNums[0];
  for $RowNum (@RowNums) {
    $MaxRowNum = $RowNum if ($RowNum > $MaxRowNum);
  }

  $RowCount = 1;
  LINE: while ($Line = GetTextLine($TextFileRef)) {
    $RowCount += 1;

    unless (exists $RowNumMap{$RowCount}) {
      # No listed row can follow the largest row number...
      last LINE if ($RowCount > $MaxRowNum);
      next LINE;
    }

    @LineWords = quotewords($InDelim, 0, $Line);
    $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileRef "$Line\n";
  }
}
|
|
462
|
|
# Retrieve text file columns and rows information for specified options...
#
# Processes column information first and row information next for all the
# input files.
#
sub RetrieveColumnsAndRowsInfo {
  ProcessColumnsInfo();

  return ProcessRowsInfo();
}
|
|
468
|
|
# Make sure the specified columns exists in text files...
#
# Fills in, for each input file, $TextFilesInfo{CategoryColNum} (zero based
# category column number; "categories" mode) and $TextFilesInfo{ColNumsToExtract}
# (zero based column numbers; "columns" mode). Columns are specified either by one
# based number or by label, depending on $OptionsInfo{ColMode}. A file for which
# the category column, or all of the specified columns, can't be found is reported
# with a warning and flagged as not okay.
#
sub ProcessColumnsInfo {
  my($Index, $SpecifiedCategoryCol, $TextFile, @ColNumsToExtract);

  @{$TextFilesInfo{CategoryColNum}} = ();
  @{$TextFilesInfo{ColNumsToExtract}} = ();

  $SpecifiedCategoryCol = $OptionsInfo{SpecifiedCategoryCol};

  FILELIST: for $Index (0 .. $#TextFilesList) {
    $TextFile = $TextFilesList[$Index];

    # Defaults: first column as category column and no columns to extract...
    $TextFilesInfo{CategoryColNum}[$Index] = 0;
    @{$TextFilesInfo{ColNumsToExtract}[$Index]} = ();

    next FILELIST unless ($TextFilesInfo{FileOkay}[$Index]);

    if ($OptionsInfo{Mode} =~ /^categories$/i) {
      my($CategoryColNum, $CategoryColValid) = (0, 1);

      # First column is used when no category column is specified...
      if ($SpecifiedCategoryCol) {
        if ($OptionsInfo{ColMode} =~ /^colnum$/i) {
          if ($SpecifiedCategoryCol <= $TextFilesInfo{ColCount}[$Index]) {
            # Convert one based column number into zero based column number...
            $CategoryColNum = $SpecifiedCategoryCol - 1;
          }
          else {
            $CategoryColValid = 0;
          }
        }
        elsif (exists($TextFilesInfo{ColLabelToNumMap}[$Index]{$SpecifiedCategoryCol})) {
          $CategoryColNum = $TextFilesInfo{ColLabelToNumMap}[$Index]{$SpecifiedCategoryCol};
        }
        else {
          $CategoryColValid = 0;
        }
      }

      if (!$CategoryColValid) {
        warn "Warning: Ignoring file $TextFile: Category column specified, $SpecifiedCategoryCol, using \"--categorycol\" option doesn't exist\n";
        $TextFilesInfo{FileOkay}[$Index] = 0;
        next FILELIST;
      }
      $TextFilesInfo{CategoryColNum}[$Index] = $CategoryColNum;
    }
    elsif ($OptionsInfo{Mode} =~ /^columns$/i) {
      my(@SpecifiedColumns) = @{$OptionsInfo{SpecifiedColumns}};

      if (!@SpecifiedColumns) {
        # First column is extracted when no columns are specified...
        @ColNumsToExtract = (0);
      }
      elsif ($OptionsInfo{ColMode} =~ /^colnum$/i) {
        # Keep column numbers present in the file as zero based column numbers...
        @ColNumsToExtract = map { $_ - 1 } grep { $_ >=1 && $_ <= $TextFilesInfo{ColCount}[$Index] } @SpecifiedColumns;
      }
      else {
        # Keep column labels present in the file as zero based column numbers...
        @ColNumsToExtract = map { $TextFilesInfo{ColLabelToNumMap}[$Index]{$_} } grep { exists($TextFilesInfo{ColLabelToNumMap}[$Index]{$_}) } @SpecifiedColumns;
      }

      if (!@ColNumsToExtract) {
        warn "Warning: Ignoring file $TextFile: None of the columns specified, @{$OptionsInfo{SpecifiedColumns}}, using \"--columns\" option exist\n";
        $TextFilesInfo{FileOkay}[$Index] = 0;
        next FILELIST;
      }
      @{$TextFilesInfo{ColNumsToExtract}[$Index]} = @ColNumsToExtract;
    }
  }
}
|
|
554
|
|
# Process specified rows info...
#
# Fills in $TextFilesInfo{RowValues} for each usable input file in "rows" mode,
# using the values in $OptionsInfo{SpecifiedRowValues}:
#
#   o rowsbycolvalue: triplets of (zero based column number, value, criterion);
#     triplets referring to a column not present in the file are left out
#   o rowsbycolvaluelist, rowsbycolvaluerange, rowbymincolvalue, rowbymaxcolvalue:
#     zero based column number followed by the rest of the specified values
#   o rownums, rownumrange: specified values as is
#
# A file ending up without any row values is reported with a warning and flagged
# as not okay.
#
sub ProcessRowsInfo {
  my($Index, $TextFile, $ColID, $ColIDOkay, $Value, $Criterion, $ColNum, @RowValues);

  # Map a column ID, a label or a one based column number depending on column mode,
  # to a pair containing found status and zero based column number...
  my($ResolveColID) = sub {
    my($FileIndex, $ID) = @_;

    if ($OptionsInfo{ColMode} =~ /^collabel$/i) {
      if (exists $TextFilesInfo{ColLabelToNumMap}[$FileIndex]{$ID}) {
        return (1, $TextFilesInfo{ColLabelToNumMap}[$FileIndex]{$ID});
      }
      return (0, undef);
    }
    if ($ID >=1 && $ID <= $TextFilesInfo{ColCount}[$FileIndex]) {
      return (1, $ID - 1);
    }
    return (0, undef);
  };

  @{$TextFilesInfo{RowValues}} = ();

  FILELIST: for $Index (0 .. $#TextFilesList) {
    $TextFile = $TextFilesList[$Index];
    @{$TextFilesInfo{RowValues}[$Index]} = ();

    next FILELIST if ($OptionsInfo{Mode} !~ /^rows$/i);
    next FILELIST if (!$TextFilesInfo{FileOkay}[$Index]);

    @RowValues = ();

    if ($OptionsInfo{RowsMode} =~ /^rowsbycolvalue$/i) {
      my($ValueIndex);

      $ValueIndex = 0;
      while ($ValueIndex < @{$OptionsInfo{SpecifiedRowValues}}) {
        ($ColID, $Value, $Criterion) = @{$OptionsInfo{SpecifiedRowValues}}[$ValueIndex .. $ValueIndex + 2];
        $ValueIndex += 3;

        ($ColIDOkay, $ColNum) = $ResolveColID->($Index, $ColID);
        push @RowValues, ($ColNum, $Value, $Criterion) if ($ColIDOkay);
      }
    }
    elsif ($OptionsInfo{RowsMode} =~ /^(rowsbycolvaluelist|rowsbycolvaluerange|rowbymincolvalue|rowbymaxcolvalue)$/i) {
      # Process column id...
      $ColID = $OptionsInfo{SpecifiedRowValues}[0];
      ($ColIDOkay, $ColNum) = $ResolveColID->($Index, $ColID);

      if ($ColIDOkay) {
        # Column number followed by rest of the specified values...
        push @RowValues, $ColNum, @{$OptionsInfo{SpecifiedRowValues}}[1 .. $#{$OptionsInfo{SpecifiedRowValues}}];
      }
    }
    elsif ($OptionsInfo{RowsMode} =~ /^(rownums|rownumrange)$/i) {
      @RowValues = @{$OptionsInfo{SpecifiedRowValues}};
    }

    if (!@RowValues) {
      warn "Warning: Ignoring file $TextFile: Column specified, $ColID, using \"--rows\" option doesn't exist\n";
      $TextFilesInfo{FileOkay}[$Index] = 0;
      next FILELIST;
    }
    push @{$TextFilesInfo{RowValues}[$Index]}, @RowValues;
  }
}
|
|
639
|
|
# Retrieve information about input text files...
#
# Resets %TextFilesInfo and fills in the following entries for each file in
# @TextFilesList, indexed by position in the list:
#
#   FileOkay            - 1 when the file passed all the checks; otherwise, 0
#   ColCount            - Number of column labels on the first line returned by GetTextLine
#   ColLabels           - Column labels
#   ColLabelToNumMap    - Map of column label to zero based column number
#   InDelim             - Input delimiter: tab for tsv files; comma or semicolon for others
#   OutFile             - Output file name
#   OutFileExt          - Output file extension: tsv for tab output delimiter; otherwise, csv
#   CategoryOutFileRoot - Root of the file names used for individual category files
#
# A file is reported with a warning and left flagged as not okay when it doesn't
# exist, isn't a csv or tsv file, can't be opened, has an invalid input delimiter
# option value for csv files, has an output file name matching the input file
# name, or has an existing output file without overwrite option.
#
sub RetrieveTextFilesInfo {
  my($Index, $TextFile, $FileDir, $FileName, $FileExt, $InDelim, $Line, @ColLabels, $OutFileRoot, $CategoryOutFileRoot, $OutFile, $ColNum, $ColLabel);

  %TextFilesInfo = ();

  @{$TextFilesInfo{FileOkay}} = ();
  @{$TextFilesInfo{ColCount}} = ();
  @{$TextFilesInfo{ColLabels}} = ();
  @{$TextFilesInfo{ColLabelToNumMap}} = ();
  @{$TextFilesInfo{InDelim}} = ();
  @{$TextFilesInfo{OutFile}} = ();
  @{$TextFilesInfo{OutFileExt}} = ();
  @{$TextFilesInfo{CategoryOutFileRoot}} = ();

  FILELIST: for $Index (0 .. $#TextFilesList) {
    $TextFile = $TextFilesList[$Index];

    # Start out with the file flagged as not okay...
    $TextFilesInfo{FileOkay}[$Index] = 0;
    $TextFilesInfo{ColCount}[$Index] = 0;
    $TextFilesInfo{InDelim}[$Index] = "";
    $TextFilesInfo{OutFile}[$Index] = "";
    $TextFilesInfo{OutFileExt}[$Index] = "";
    $TextFilesInfo{CategoryOutFileRoot}[$Index] = "";

    @{$TextFilesInfo{ColLabels}[$Index]} = ();
    %{$TextFilesInfo{ColLabelToNumMap}[$Index]} = ();

    if (!(-e $TextFile)) {
      warn "Warning: Ignoring file $TextFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($TextFile, "csv tsv")) {
      warn "Warning: Ignoring file $TextFile: It's not a csv or tsv file\n";
      next FILELIST;
    }

    # Setup input delimiter based on the file extension...
    ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
    if ($FileExt =~ /^tsv$/i) {
      $InDelim = "\t";
    }
    else {
      $InDelim = "\,";
      if (!($OptionsInfo{InDelim} =~ /^(comma|semicolon)$/i)) {
        warn "Warning: Ignoring file $TextFile: The value specified, $OptionsInfo{InDelim}, for option \"--indelim\" is not valid for csv files\n";
        next FILELIST;
      }
      if ($OptionsInfo{InDelim} =~ /^semicolon$/i) {
        $InDelim = "\;";
      }
    }

    # Retrieve column labels from the first line returned by GetTextLine. Three
    # argument open is used to avoid any special interpretation of the file name...
    my($TextFileHandle);
    if (!open($TextFileHandle, "<", $TextFile)) {
      warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
      next FILELIST;
    }

    $Line = GetTextLine($TextFileHandle);
    @ColLabels = quotewords($InDelim, 0, $Line);
    close $TextFileHandle;

    # Setup output file extension. $OptionsInfo{OutDelim} holds the delimiter
    # character set up by ProcessOptions instead of the option name; the check for
    # option name is retained to handle any caller still storing the name...
    $FileExt = "csv";
    if ($OptionsInfo{OutDelim} eq "\t" || $OptionsInfo{OutDelim} =~ /^tab$/i) {
      $FileExt = "tsv";
    }

    # Setup output file root. Specified root is only used for a single input file...
    if ($OptionsInfo{OutFileRoot} && (@TextFilesList == 1)) {
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
      if ($RootFileName && $RootFileExt) {
        $FileName = $RootFileName;
      }
      else {
        $FileName = $OptionsInfo{OutFileRoot};
      }
      $OutFileRoot = $FileName;
    }
    else {
      $OutFileRoot = $FileName;
      $OutFileRoot .= ($OptionsInfo{Mode} =~ /^categories$/i) ? "CategoriesSummary" : (($OptionsInfo{Mode} =~ /^rows$/i) ? "ExtractedRows" : "ExtractedColumns");
    }
    $CategoryOutFileRoot = "$FileName" . "Category";

    $OutFile = $OutFileRoot . ".$FileExt";
    if (lc($OutFile) eq lc($TextFile)) {
      warn "Warning: Ignoring file $TextFile:Output file name, $OutFile, is same as input text file name, $TextFile\n";
      next FILELIST;
    }

    if (!$OptionsInfo{Overwrite}) {
      if (-e $OutFile) {
        warn "Warning: Ignoring file $TextFile: The file $OutFile already exists\n";
        next FILELIST;
      }
    }

    # All checks passed: save the information...
    $TextFilesInfo{FileOkay}[$Index] = 1;
    $TextFilesInfo{InDelim}[$Index] = $InDelim;
    $TextFilesInfo{CategoryOutFileRoot}[$Index] = $CategoryOutFileRoot;
    $TextFilesInfo{OutFile}[$Index] = "$OutFile";
    $TextFilesInfo{OutFileExt}[$Index] = "$FileExt";

    $TextFilesInfo{ColCount}[$Index] = @ColLabels;
    push @{$TextFilesInfo{ColLabels}[$Index]}, @ColLabels;

    for $ColNum (0 .. $#ColLabels) {
      $ColLabel = $ColLabels[$ColNum];
      $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel} = $ColNum;
    }
  }
}
|
|
752
|
|
753 # Process option values...
|
|
754 sub ProcessOptions {
|
|
755 my(@SpecifiedColumns, @SpecifiedRowValues);
|
|
756
|
|
757 %OptionsInfo = ();
|
|
758
|
|
759 $OptionsInfo{Mode} = $Options{mode};
|
|
760
|
|
761 $OptionsInfo{ColMode} = $Options{colmode};
|
|
762
|
|
763 $OptionsInfo{CategoryCol} = defined $Options{categorycol} ? $Options{categorycol} : undef;
|
|
764 $OptionsInfo{SpecifiedCategoryCol} = "";
|
|
765
|
|
766 if (defined $Options{categorycol}) {
|
|
767 my(@SpecifiedValues) = split ",", $Options{categorycol};
|
|
768 if (@SpecifiedValues != 1) {
|
|
769 die "Error: Invalid number of values, ",scalar(@SpecifiedValues), " using \"--categorycol\" option: Only one value is allowed.\n";
|
|
770 }
|
|
771 $OptionsInfo{SpecifiedCategoryCol} = $SpecifiedValues[0];
|
|
772 if ($Options{colmode} =~ /^colnum$/i) {
|
|
773 if (!IsPositiveInteger($OptionsInfo{SpecifiedCategoryCol})) {
|
|
774 die "Error: Category column value, $OptionsInfo{SpecifiedCategoryCol}, specified using \"--categorycol\" is not valid. Allowed integer values: > 0.\n";
|
|
775 }
|
|
776 }
|
|
777 }
|
|
778
|
|
779 $OptionsInfo{Columns} = defined $Options{columns} ? $Options{columns} : undef;
|
|
780 @{$OptionsInfo{SpecifiedColumns}} = ();
|
|
781 @SpecifiedColumns = ();
|
|
782
|
|
783 if (defined $Options{columns}) {
|
|
784 my(@SpecifiedValues) = split ",", $Options{columns};
|
|
785 if ($Options{colmode} =~ /^colnum$/i) {
|
|
786 my($ColValue);
|
|
787 for $ColValue (@SpecifiedValues) {
|
|
788 if (!IsPositiveInteger($ColValue)) {
|
|
789 die "Error: Column value, $ColValue, specified using \"--columns\" is not valid: Allowed integer values: > 0.\n";
|
|
790 }
|
|
791 }
|
|
792 }
|
|
793 push @SpecifiedColumns, @SpecifiedValues;
|
|
794 }
|
|
795 @{$OptionsInfo{SpecifiedColumns}} = @SpecifiedColumns;
|
|
796
|
|
797 $OptionsInfo{InDelim} = $Options{indelim};
|
|
798
|
|
799 $OptionsInfo{OutDelim} = ($Options{outdelim} =~ /^tab$/i ) ? "\t" : (($Options{outdelim} =~ /^semicolon$/i) ? "\;" : "\,");
|
|
800 $OptionsInfo{OutQuote} = ($Options{quote} =~ /^yes$/i) ? 1 : 0;
|
|
801 $OptionsInfo{Overwrite} = defined $Options{overwrite} ? $Options{overwrite} : undef;
|
|
802
|
|
803 $OptionsInfo{OutFileRoot} = defined $Options{root} ? $Options{root} : undef;
|
|
804
|
|
805 # Process any specified rows values...
|
|
806 @SpecifiedRowValues = ();
|
|
807 @{$OptionsInfo{SpecifiedRowValues}} = ();
|
|
808
|
|
809 $OptionsInfo{RowsMode} = $Options{rowsmode};
|
|
810 $OptionsInfo{Rows} = defined $Options{rows} ? $Options{rows} : undef;
|
|
811
|
|
812 $OptionsInfo{SpecifiedRowsMode} = $Options{rowsmode};
|
|
813
|
|
814 if (defined $Options{rows}) {
|
|
815 (@SpecifiedRowValues) = split ",", $Options{rows};
|
|
816 }
|
|
817 else {
|
|
818 if ($Options{rowsmode} !~ /^rownums$/i) {
|
|
819 die "Error: Specify value for \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\n";
|
|
820 }
|
|
821 push @SpecifiedRowValues, "1";
|
|
822 }
|
|
823 @{$OptionsInfo{SpecifiedRowValues}} = @SpecifiedRowValues;
|
|
824
|
|
825 my($SpecifiedColID, $SpecifiedRowID);
|
|
826 # Make sure specified values are okay...
|
|
827 if ($Options{rowsmode} =~ /^rowsbycolvalue$/i) {
|
|
828 if (@SpecifiedRowValues % 3) {
|
|
829 die "Error: Invalid number of values, ", scalar(@SpecifiedRowValues) , ", specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nIt must contain triplets.\n";
|
|
830 }
|
|
831 # Triplet format: colid,value,criteria. Criterion: le,ge,eq
|
|
832 my($Index, $ColID, $Criterion, $Value);
|
|
833 for ($Index = 0; $Index < @SpecifiedRowValues; $Index = $Index + 3) {
|
|
834 $ColID = $SpecifiedRowValues[$Index];
|
|
835 $Value = $SpecifiedRowValues[$Index + 1];
|
|
836 $Criterion = $SpecifiedRowValues[$Index + 2];
|
|
837 if ($Options{colmode} =~ /^colnum$/i) {
|
|
838 if (!IsPositiveInteger($ColID)) {
|
|
839 die "Error: Invalid column id, $ColID, specified in triplet, \"$ColID,$Criterion,$Value\", using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n";
|
|
840 }
|
|
841 }
|
|
842 if ($Criterion !~ /^(eq|le|ge)$/i) {
|
|
843 die "Error: Invalid criterion value, $Criterion, specified in triplet, \"$ColID,$Criterion,$Value\", using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed values: le, ge, or eq.\n";
|
|
844 }
|
|
845 }
|
|
846 }
|
|
847 elsif ($Options{rowsmode} =~ /^rowsbycolvaluelist$/i) {
|
|
848 ($SpecifiedColID) = $SpecifiedRowValues[0];
|
|
849 if ($Options{colmode} =~ /^colnum$/i) {
|
|
850 if (!IsPositiveInteger($SpecifiedColID)) {
|
|
851 die "Error: Rows value, $SpecifiedColID, specified using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n";
|
|
852 }
|
|
853 }
|
|
854 if (@SpecifiedRowValues == 1) {
|
|
855 die "Error: Invalid number of values, ", scalar(@SpecifiedRowValues) , ", specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nIt must contain more than one value\n";
|
|
856 }
|
|
857 }
|
|
858 elsif ($Options{rowsmode} =~ /^rowsbycolvaluerange$/i) {
|
|
859 if (@SpecifiedRowValues != 3) {
|
|
860 die "Error: Invalid number of values, ", scalar(@SpecifiedRowValues) , ", specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nIt must contain three values\n";
|
|
861 }
|
|
862 ($SpecifiedColID) = $SpecifiedRowValues[0];
|
|
863 if ($Options{colmode} =~ /^colnum$/i) {
|
|
864 if (!IsPositiveInteger($SpecifiedColID)) {
|
|
865 die "Error: Rows value, $SpecifiedColID, specified using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n";
|
|
866 }
|
|
867 }
|
|
868 if ($SpecifiedRowValues[1] >= $SpecifiedRowValues[2]) {
|
|
869 die "Error: Invalid value triplet - ", JoinWords(\@SpecifiedRowValues, ',', 0) , " - specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nAllowed values: second value < third value\n";
|
|
870 }
|
|
871 }
|
|
872 elsif ($Options{rowsmode} =~ /^(rowbymincolvalue|rowbymaxcolvalue)$/i) {
|
|
873 if (@SpecifiedRowValues != 1) {
|
|
874 die "Error: Invalid number of values, ", scalar(@SpecifiedRowValues) , ", specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nOnly one value is allowed.\n";
|
|
875 }
|
|
876 ($SpecifiedColID) = $SpecifiedRowValues[0];
|
|
877 if ($Options{colmode} =~ /^colnum$/i) {
|
|
878 if (!IsPositiveInteger($SpecifiedColID)) {
|
|
879 die "Error: Rows value, $SpecifiedColID, specified using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n";
|
|
880 }
|
|
881 }
|
|
882 }
|
|
883 elsif ($Options{rowsmode} =~ /^rownums$/i) {
|
|
884 for $SpecifiedRowID (@SpecifiedRowValues) {
|
|
885 if (!IsPositiveInteger($SpecifiedRowID)) {
|
|
886 die "Error: Rows value, $SpecifiedRowID, specified using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n";
|
|
887 }
|
|
888 }
|
|
889 }
|
|
890 elsif ($Options{rowsmode} =~ /^rownumrange$/i) {
|
|
891 if (@SpecifiedRowValues != 2) {
|
|
892 die "Error: Invalid number of values, ", scalar(@SpecifiedRowValues) , ", specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nIt must contain only two values.\n";
|
|
893 }
|
|
894 for $SpecifiedRowID (@SpecifiedRowValues) {
|
|
895 if (!IsPositiveInteger($SpecifiedRowID)) {
|
|
896 die "Error: Rows value, $SpecifiedRowID, specified using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n";
|
|
897 }
|
|
898 }
|
|
899 if ($SpecifiedRowValues[0] >= $SpecifiedRowValues[1]) {
|
|
900 die "Error: Invalid value pair - ", JoinWords(\@SpecifiedRowValues, ',', 0) , " - specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nAllowed values: First value < second value\n";
|
|
901 }
|
|
902 }
|
|
903 }
|
|
904
|
|
# Setup script usage and retrieve command line arguments specified using various options...
#
# Resets the file-level %Options hash to the script defaults, overlays the values
# supplied on the command line using GetOptions, changes to the working directory
# when one is specified, and validates every option which accepts a fixed set of
# values. Dies with an error message as soon as an invalid value is found.
#
sub SetupScriptUsage {

  # Setup default and retrieve all the options...
  %Options = ();
  $Options{colmode} = "colnum";
  $Options{indelim} = "comma";
  $Options{mode} = "columns";
  $Options{outdelim} = "comma";
  $Options{quote} = "yes";
  $Options{rowsmode} = "rownums";

  if (!GetOptions(\%Options, "categorycol=s", "columns=s", "colmode|c=s", "help|h", "indelim=s", "mode|m=s", "outdelim=s", "overwrite|o", "quote|q=s", "root|r=s", "rows=s", "rowsmode=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }

  # Switch to the working directory before any further processing...
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    # Low precedence "or" is required here: "||" would bind to the directory
    # name instead of the result of chdir and a failure would go unnoticed.
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  # Validate options which only allow a fixed set of values...
  if ($Options{mode} !~ /^(columns|rows|categories)$/i) {
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: columns, rows or categories \n";
  }
  if ($Options{colmode} !~ /^(colnum|collabel)$/i) {
    die "Error: The value specified, $Options{colmode}, for option \"--colmode\" is not valid. Allowed values: colnum or collabel \n";
  }
  if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
    die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{quote} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
  }
  # The allowed values listed in the message mirror the pattern: "rownums", not "rownum".
  if ($Options{rowsmode} !~ /^(rowsbycolvalue|rowsbycolvaluelist|rowsbycolvaluerange|rowbymincolvalue|rowbymaxcolvalue|rownums|rownumrange)$/i) {
    die "Error: The value specified, $Options{rowsmode}, for option \"--rowsmode\" is not valid. Allowed values: rowsbycolvalue, rowsbycolvaluelist, rowsbycolvaluerange, rowbymincolvalue, rowbymaxcolvalue, rownums, rownumrange\n";
  }
}
|
|
945 __END__
|
|
946
|
|
947
|
|
948 =head1 NAME
|
|
949
|
|
950 ExtractFromTextFiles.pl - Extract specific data from TextFile(s)
|
|
951
|
|
952 =head1 SYNOPSIS
|
|
953
|
|
954 ExtractFromTextFiles.pl TextFile(s)...
|
|
955
|
|
956 ExtractFromTextFiles.pl [B<-c, --colmode> colnum | collabel] [B<--categorycol > number | string]
|
|
957 [B<--columns> "colnum,[colnum]..." | "collabel,[collabel]..."] [B<-h, --help>]
|
|
958 [B<--indelim> I<comma | semicolon>] [B<-m, --mode > I<columns | rows | categories>]
|
|
959 [B<-o, --overwrite>] [B<--outdelim> I<comma | tab | semicolon>] [B<-q, --quote> I<yes | no>]
|
|
960 [B<--rows> "colid,value,criteria..." | "colid,value..." | "colid,mincolvalue,maxcolvalue" | "rownum,rownum,..." | colid | "minrownum,maxrownum"]
|
|
961 [ B<--rowsmode> rowsbycolvalue | rowsbycolvaluelist | rowsbycolvaluerange | rowbymincolvalue | rowbymaxcolvalue | rownums | rownumrange]
|
|
962 [B<-r, --root> I<rootname>] [B<-w, --workingdir> I<dirname>] TextFile(s)...
|
|
963
|
|
964 =head1 DESCRIPTION
|
|
965
|
|
966 Extract column(s)/row(s) data from I<TextFile(s)> identified by column numbers or labels. Or categorize
|
|
967 data using a specified column category. During categorization, a summary text file is
|
|
968 generated containing category name and count; an additional text file, containing data for
|
|
969 each category, is also generated. The file names are separated by space. The
|
|
970 valid file extensions are I<.csv> and I<.tsv> for comma/semicolon and tab delimited
|
|
971 text files respectively. All other file names are ignored. All the text files in a
|
|
972 current directory can be specified by I<*.csv>, I<*.tsv>, or the current directory
|
|
973 name. The B<--indelim> option determines the format of I<TextFile(s)>. Any file
|
|
974 which doesn't correspond to the format indicated by B<--indelim> option is ignored.
|
|
975
|
|
976 =head1 OPTIONS
|
|
977
|
|
978 =over 4
|
|
979
|
|
980 =item B<-c, --colmode> I<colnum | collabel>
|
|
981
|
|
982 Specify how columns are identified in I<TextFile(s)>: using column number or column
|
|
983 label. Possible values: I<colnum or collabel>. Default value: I<colnum>.
|
|
984
|
|
985 =item B<--categorycol > I<number | string>
|
|
986
|
|
987 Column used to categorize data. Default value: First column.
|
|
988
|
|
989 For I<colnum> value of B<-c, --colmode> option, input value is a column number.
|
|
990 Example: I<1>.
|
|
991
|
|
992 For I<collabel> value of B<-c, --colmode> option, input value is a column label.
|
|
993 Example: I<Mol_ID>.
|
|
994
|
|
995 =item B<--columns> I<"colnum,[colnum]..." | "collabel,[collabel]...">
|
|
996
|
|
997 List of comma delimited columns to extract. Default value: First column.
|
|
998
|
|
999 For I<colnum> value of B<-c, --colmode> option, input values format is:
|
|
1000 I<colnum,colnum,...>. Example: I<1,3,5>
|
|
1001
|
|
1002 For I<collabel> value of B<-c, --colmode> option, input values format is:
|
|
1003 I<collabel,collabel,..>. Example: I<Mol_ID,MolWeight>
|
|
1004
|
|
1005 =item B<-h, --help>
|
|
1006
|
|
1007 Print this help message.
|
|
1008
|
|
1009 =item B<--indelim> I<comma | semicolon>
|
|
1010
|
|
1011 Input delimiter for CSV I<TextFile(s)>. Possible values: I<comma or semicolon>.
|
|
1012 Default value: I<comma>. For TSV files, this option is ignored and I<tab> is used as a
|
|
1013 delimiter.
|
|
1014
|
|
1015 =item B<-m, --mode > I<columns | rows | categories>
|
|
1016
|
|
1017 Specify what to extract from I<TextFile(s)>. Possible values: I<columns, rows,
|
|
1018 or categories>. Default value: I<columns>.
|
|
1019
|
|
1020 For I<columns> mode, data for appropriate columns specified by B<--columns> option
|
|
1021 is extracted from I<TextFile(s)> and placed into new text files.
|
|
1022
|
|
1023 For I<rows> mode, appropriate rows specified in conjunction with B<--rowsmode> and
|
|
1024 B<--rows> options are extracted from I<TextFile(s)> and placed into new text files.
|
|
1025
|
|
1026 For I<categories> mode, column specified by B<--categorycol> is
|
|
1027 used to categorize data, and a summary text file is generated
|
|
1028 containing category name and count; an additional text file, containing data for
|
|
1029 each category, is also generated.
|
|
1030
|
|
1031 =item B<-o, --overwrite>
|
|
1032
|
|
1033 Overwrite existing files.
|
|
1034
|
|
1035 =item B<--outdelim> I<comma | tab | semicolon>.
|
|
1036
|
|
1037 Output text file delimiter. Possible values: I<comma, tab, or semicolon>.
|
|
1038 Default value: I<comma>
|
|
1039
|
|
1040 =item B<-q, --quote> I<yes | no>
|
|
1041
|
|
1042 Put quotes around column values in output text file. Possible values: I<yes or
|
|
1043 no>. Default value: I<yes>.
|
|
1044
|
|
1045 =item B<-r, --root> I<rootname>
|
|
1046
|
|
1047 New file name is generated using the root: <Root>.<Ext>. Default for new file
|
|
1048 names: <TextFile>CategoriesSummary.<Ext>, <TextFile>ExtractedColumns.<Ext>, and
|
|
1049 <TextFile>ExtractedRows.<Ext> for I<categories>, I<columns>, and I<rows> mode
|
|
1050 respectively. And <TextFile>Category<CategoryName>.<Ext>
|
|
1051 for each category retrieved from each text file. The output file type determines <Ext>
|
|
1052 value: csv and tsv for CSV, and TSV files respectively.
|
|
1053
|
|
1054 This option is ignored for multiple input files.
|
|
1055
|
|
1056 =item B<--rows> I<"colid,value,criteria..." | "colid,value..." | "colid,mincolvalue,maxcolvalue" | "rownum,rownum,..." | colid | "minrownum,maxrownum">
|
|
1057
|
|
1058 This value is B<--rowsmode> specific. In general, it's a list of comma separated column ids and
|
|
1059 associated mode specific values. Column ids specification, column label or number, is
|
|
1060 controlled by B<-c, --colmode> option.
|
|
1061
|
|
1062 First line containing column labels is always written out. And value comparisons assume
|
|
1063 numerical column data.
|
|
1064
|
|
1065 For I<rowsbycolvalue> mode, input value format contains these triplets:
|
|
1066 I<colid,value, criteria...>. Possible values for criteria: I<le, ge or eq>.
|
|
1067 Examples:
|
|
1068
|
|
1069 MolWt,450,le
|
|
1070 MolWt,450,le,LogP,5,le,SumNumNO,10,le,SumNHOH,5,le
|
|
1071
|
|
1072 For I<rowsbycolvaluelist> mode, input value format is: I<colid,value...>. Examples:
|
|
1073
|
|
1074 Mol_ID,20
|
|
1075 Mol_ID,20,1002,1115
|
|
1076
|
|
1077 For I<rowsbycolvaluerange> mode, input value format is: I<colid,mincolvalue,maxcolvalue>. Examples:
|
|
1078
|
|
1079 MolWt,100,450
|
|
1080
|
|
1081 For I<rowbymincolvalue, rowbymaxcolvalue> modes, input value format is: I<colid>.
|
|
1082
|
|
1083 For I<rownums> mode, input value format is: I<rownum>. Default value: I<2>.
|
|
1084
|
|
1085 For I<rownumrange> mode, input value format is: I<minrownum, maxrownum>. Examples:
|
|
1086
|
|
1087 10,40
|
|
1088
|
|
1089 =item B<--rowsmode> I<rowsbycolvalue | rowsbycolvaluelist | rowsbycolvaluerange | rowbymincolvalue | rowbymaxcolvalue | rownums | rownumrange>
|
|
1090
|
|
1091 Specify how to extract rows from I<TextFile(s)>. Possible values: I<rowsbycolvalue, rowsbycolvaluelist, rowsbycolvaluerange,
|
|
1092 rowbymincolvalue, rowbymaxcolvalue, rownum, rownumrange>. Default value: I<rownum>.
|
|
1093
|
|
1094 Use B<--rows> option to list rows criterion used for extraction of rows from
|
|
1095 I<TextFile(s)>.
|
|
1096
|
|
1097 =item B<-w, --workingdir> I<dirname>
|
|
1098
|
|
1099 Location of working directory. Default: current directory.
|
|
1100
|
|
1101 =back
|
|
1102
|
|
1103 =head1 EXAMPLES
|
|
1104
|
|
1105 To extract first column from a text file and generate a new CSV text file NewSample1.csv,
|
|
1106 type:
|
|
1107
|
|
1108 % ExtractFromTextFiles.pl -r NewSample1 -o Sample1.csv
|
|
1109
|
|
1110 To extract columns Mol_ID, MolWeight, and NAME from Sample1.csv and generate a new
|
|
1111 textfile NewSample1.tsv with no quotes, type:
|
|
1112
|
|
1113 % ExtractFromTextFiles.pl -m columns -c collabel --columns "Mol_ID,
|
|
1114 MolWeight,NAME" --outdelim tab --quote no -r NewSample1
|
|
1115 -o Sample1.csv
|
|
1116
|
|
1117 To extract rows containing values for MolWeight column of less than or equal to 450 from
|
|
1118 Sample1.csv and generate a new textfile NewSample1.csv, type:
|
|
1119
|
|
1120 % ExtractFromTextFiles.pl -m rows --rowsmode rowsbycolvalue
|
|
1121 -c collabel --rows MolWeight,450,le -r NewSample1
|
|
1122 -o Sample1.csv
|
|
1123
|
|
1124 To extract rows containing values for MolWeight column between 450 and 500 from
|
|
1125 Sample1.csv and generate a new textfile NewSample1.csv, type:
|
|
1126
|
|
1127 % ExtractFromTextFiles.pl -m rows --rowsmode rowsbycolvaluerange
|
|
1128 -c collabel --rows MolWeight,450,500 -r NewSample1
|
|
1129 -o Sample1.csv
|
|
1130
|
|
1131 To extract a row containing minimum value for column MolWeight from Sample1.csv and generate
|
|
1132 a new textfile NewSample1.csv, type:
|
|
1133
|
|
1134 % ExtractFromTextFiles.pl -m rows --rowsmode rowbymincolvalue
|
|
1135 -c collabel --rows MolWeight -r NewSample1
|
|
1136 -o Sample1.csv
|
|
1137
|
|
1138 =head1 AUTHOR
|
|
1139
|
|
1140 Manish Sud <msud@san.rr.com>
|
|
1141
|
|
1142 =head1 SEE ALSO
|
|
1143
|
|
1144 JoinTextFiles.pl, MergeTextFilesWithSD.pl, ModifyTextFilesFormat.pl, SplitTextFiles.pl
|
|
1145
|
|
1146 =head1 COPYRIGHT
|
|
1147
|
|
1148 Copyright (C) 2015 Manish Sud. All rights reserved.
|
|
1149
|
|
1150 This file is part of MayaChemTools.
|
|
1151
|
|
1152 MayaChemTools is free software; you can redistribute it and/or modify it under
|
|
1153 the terms of the GNU Lesser General Public License as published by the Free
|
|
1154 Software Foundation; either version 3 of the License, or (at your option)
|
|
1155 any later version.
|
|
1156
|
|
1157 =cut
|