comparison bin/SortTextFiles.pl @ 0:4816e4a8ae95 draft default tip

Uploaded
author deepakjadmin
date Wed, 20 Jan 2016 09:23:18 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4816e4a8ae95
1 #!/usr/bin/perl -w
2 #
3 # $RCSfile: SortTextFiles.pl,v $
4 # $Date: 2015/02/28 20:46:21 $
5 # $Revision: 1.36 $
6 #
7 # Author: Manish Sud <msud@san.rr.com>
8 #
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
10 #
11 # This file is part of MayaChemTools.
12 #
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
14 # the terms of the GNU Lesser General Public License as published by the Free
15 # Software Foundation; either version 3 of the License, or (at your option) any
16 # later version.
17 #
18 # MayaChemTools is distributed in the hope that it will be useful, but without
19 # any warranty; without even the implied warranty of merchantability or fitness
20 # for a particular purpose. See the GNU Lesser General Public License for more
21 # details.
22 #
23 # You should have received a copy of the GNU Lesser General Public License
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
26 # Boston, MA, 02111-1307, USA.
27 #
28
29 use strict;
30 use FindBin; use lib "$FindBin::Bin/../lib";
31 use Getopt::Long;
32 use File::Basename;
33 use Text::ParseWords;
34 use Benchmark;
35 use FileUtil;
36 use TextUtil;
37
# File-scoped state shared with the subroutines defined further down.
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT so progress messages show up as they are printed.
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script; print the usage text for --help or when no
# input file is named on the command line.
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@TextFilesList);
@TextFilesList = ExpandFileNames(\@ARGV, "csv tsv");

print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

print "Checking input text file(s)...\n";
my(%TextFilesInfo);
RetrieveTextFilesInfo();
ProcessColumnsInfo();

# Generate output files, skipping any input file flagged as unusable by the checks above...
if (@TextFilesList > 1) {
  print "\nProcessing text files...\n";
}
for my $FileIndex (0 .. $#TextFilesList) {
  next unless $TextFilesInfo{FileOkay}[$FileIndex];
  print "\nProcessing file $TextFilesList[$FileIndex]...\n";
  SortTextFile($FileIndex);
}
print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
82
83 ###############################################################################
84
# Sort it out...
#
# Sorts the data rows of one input text file on its key column and writes the
# result to that file's output file.
#
# Parameters:
#   $Index - position of the file in @TextFilesList; the same index selects the
#            InDelim, OutFile, KeyColNum and ColLabels entries of %TextFilesInfo.
#
# Output layout: the column labels, then the rows ordered by key value, then every
# row whose key value is missing, empty or (for numeric keys) not a number, in
# input order. Rows sharing a key value stay together in input order, and distinct
# keys that compare equal (e.g. "abc" and "ABC", or "1" and "1.0") are emitted in
# order of first appearance.
#
# Dies when the input or output file can't be opened, or when the output file
# can't be closed cleanly.
sub SortTextFile {
  my($Index) = @_;

  my $TextFile = $TextFilesList[$Index];
  my $InDelim = $TextFilesInfo{InDelim}[$Index];
  my $NewTextFile = $TextFilesInfo{OutFile}[$Index];
  my $KeyCol = $TextFilesInfo{KeyColNum}[$Index];
  my @ColLabels = @{$TextFilesInfo{ColLabels}[$Index]};

  my $NumericKeys = ($OptionsInfo{KeyData} =~ /^numeric$/i) ? 1 : 0;
  my $AlphanumericKeys = ($OptionsInfo{KeyData} =~ /^alphanumeric$/i) ? 1 : 0;
  my $Ascending = ($OptionsInfo{Sort} =~ /^ascending$/i) ? 1 : 0;

  print "Generating new Text file $NewTextFile...\n";

  # Three-argument open: characters such as '>', '|' or surrounding blanks in a
  # file name are treated as part of the name, never as an open mode.
  open my $NewTextFH, '>', $NewTextFile or die "Error: Couldn't open $NewTextFile: $! \n";
  open my $TextFH, '<', $TextFile or die "Error: Can't open $TextFile: $! \n";

  # Skip over column labels from old file...
  GetTextLine($TextFH);

  # Add column labels in new file...
  print {$NewTextFH} JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}), "\n";

  # Go over all rows and store the lines using key value as hash...
  my(%KeyToLinesMap, %KeyFirstSeen, @InvalidDataLines);
  my $LineCount = 1;
  my $KeyCount = 0;

  TEXTLINE: while (1) {
    my $Line = GetTextLine($TextFH);

    # Stop on an empty or undefined result rather than on any false value, so that
    # a data line consisting of just "0" is still processed.
    # NOTE(review): assumes GetTextLine signals end of file with "" or undef - confirm in FileUtil.
    last TEXTLINE unless defined($Line) && length($Line);
    $LineCount++;

    my @LineWords = quotewords($InDelim, 0, $Line);
    my $OutLine = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});

    # A row too short to contain the key column is handled like a row with an empty
    # key value: it is kept and placed at the end instead of being dropped.
    if ($KeyCol >= @LineWords || !IsNotEmpty($LineWords[$KeyCol])) {
      push @InvalidDataLines, $OutLine;
      if ($OptionsInfo{DetailLevel} >= 3) {
        print "Ignoring line $LineCount: Contains empty value for key column $ColLabels[$KeyCol]: $OutLine\n";
      }
      elsif ($OptionsInfo{DetailLevel} >= 2) {
        print "Ignoring line $LineCount: Contains empty value for key column $ColLabels[$KeyCol]...\n";
      }
      next TEXTLINE;
    }

    my $KeyColValue = $LineWords[$KeyCol];
    if ($NumericKeys && !IsFloat($KeyColValue)) {
      push @InvalidDataLines, $OutLine;
      if ($OptionsInfo{DetailLevel} >= 3) {
        print "Line number $LineCount: Contains non-numerical value for key column $ColLabels[$KeyCol]: $OutLine\n";
      }
      elsif ($OptionsInfo{DetailLevel} >= 2) {
        print "Line number $LineCount: Contains non-numerical value for key column $ColLabels[$KeyCol]...\n";
      }
      next TEXTLINE;
    }

    if (exists $KeyToLinesMap{$KeyColValue}) {
      # Append to existing line...
      $KeyToLinesMap{$KeyColValue} .= "\n" . $OutLine;
    }
    else {
      $KeyToLinesMap{$KeyColValue} = $OutLine;
      # Remember arrival order; it settles ties between keys that compare equal.
      $KeyFirstSeen{$KeyColValue} = $KeyCount++;
    }
  }

  # Alphanumeric keys are compared case-insensitively, everything else numerically.
  my $CompareKeys = $AlphanumericKeys
    ? sub { lc($_[0]) cmp lc($_[1]) }
    : sub { $_[0] <=> $_[1] };

  my @SortedKeys = sort {
    ($Ascending ? $CompareKeys->($a, $b) : $CompareKeys->($b, $a))
      || ($KeyFirstSeen{$a} <=> $KeyFirstSeen{$b})
  } keys %KeyToLinesMap;

  for my $SortedKey (@SortedKeys) {
    print {$NewTextFH} "$KeyToLinesMap{$SortedKey}\n";
  }

  # Write out the lines with invalid data...
  if (@InvalidDataLines) {
    print "Placing ", scalar(@InvalidDataLines), " line(s) with invalid column key data at the end...\n";
    for my $InvalidLine (@InvalidDataLines) {
      print {$NewTextFH} "$InvalidLine\n";
    }
  }

  close $TextFH;
  # Buffered write errors only surface at close time, so check it for the output file.
  close $NewTextFH or die "Error: Couldn't close $NewTextFile: $! \n";
}
188
# Retrieve information about input text files...
#
# Resets %TextFilesInfo and, for every name in @TextFilesList, records:
#   FileOkay          - 1 when the file passed every check below, otherwise 0
#   InDelim           - tab for .tsv files; comma or semicolon (per --indelim) otherwise
#   OutFile           - name of the file the sorted rows are written to
#   ColCount          - number of column labels found on the first line
#   ColLabels         - the column labels themselves
#   ColLabelToNumMap  - column label => zero-based column index
#
# A file that fails a check is reported with a warning and keeps FileOkay = 0 with
# the other entries left at their empty defaults.
sub RetrieveTextFilesInfo {
  %TextFilesInfo = ();

  @{$TextFilesInfo{FileOkay}} = ();
  @{$TextFilesInfo{ColCount}} = ();
  @{$TextFilesInfo{ColLabels}} = ();
  @{$TextFilesInfo{ColLabelToNumMap}} = ();
  @{$TextFilesInfo{InDelim}} = ();
  @{$TextFilesInfo{OutFile}} = ();

  FILELIST: for my $Index (0 .. $#TextFilesList) {
    my $TextFile = $TextFilesList[$Index];

    # Start from "not usable" so every early exit below leaves a consistent record.
    $TextFilesInfo{FileOkay}[$Index] = 0;
    $TextFilesInfo{ColCount}[$Index] = 0;
    $TextFilesInfo{InDelim}[$Index] = "";
    $TextFilesInfo{OutFile}[$Index] = "";
    @{$TextFilesInfo{ColLabels}[$Index]} = ();
    %{$TextFilesInfo{ColLabelToNumMap}[$Index]} = ();

    if (!(-e $TextFile)) {
      warn "Warning: Ignoring file $TextFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($TextFile, "csv tsv")) {
      warn "Warning: Ignoring file $TextFile: It's not a csv or tsv file\n";
      next FILELIST;
    }

    # Parsed once; the name part is reused below to build the default output name.
    my($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);

    my $InDelim;
    if ($FileExt =~ /^tsv$/i) {
      $InDelim = "\t";
    }
    else {
      if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
        warn "Warning: Ignoring file $TextFile: The value specified, $Options{indelim}, for option \"--indelim\" is not valid for csv files\n";
        next FILELIST;
      }
      $InDelim = ($Options{indelim} =~ /^semicolon$/i) ? "\;" : "\,";
    }

    # Three-argument open with a lexical handle: special characters in a file name
    # can't be mistaken for an open mode, and the handle can't leak between files.
    my $TextFH;
    if (!open $TextFH, '<', $TextFile) {
      warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
      next FILELIST;
    }

    # Only the first line is needed here: it holds the column labels.
    my $Line = GetTextLine($TextFH);
    my @ColLabels = quotewords($InDelim, 0, $Line);
    close $TextFH;

    # The output extension follows the output delimiter, not the input file.
    my $OutFileExt = ($Options{outdelim} =~ /^tab$/i) ? "tsv" : "csv";

    my $OutFileRoot;
    if ($Options{root} && (@TextFilesList == 1)) {
      # --root is honored only for a single input file; an extension on it is dropped.
      my($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
      $OutFileRoot = ($RootFileName && $RootFileExt) ? $RootFileName : $Options{root};
    }
    else {
      $OutFileRoot = $FileName . "SortedByColumn";
    }

    my $OutFile = $OutFileRoot . ".$OutFileExt";
    if (lc($OutFile) eq lc($TextFile)) {
      warn "Warning: Ignoring file $TextFile:Output file name, $OutFile, is same as input text file name, $TextFile\n";
      next FILELIST;
    }
    if (!$Options{overwrite}) {
      if (-e $OutFile) {
        warn "Warning: Ignoring file $TextFile: The file $OutFile already exists\n";
        next FILELIST;
      }
    }

    $TextFilesInfo{FileOkay}[$Index] = 1;
    $TextFilesInfo{InDelim}[$Index] = $InDelim;
    $TextFilesInfo{OutFile}[$Index] = $OutFile;

    $TextFilesInfo{ColCount}[$Index] = @ColLabels;
    push @{$TextFilesInfo{ColLabels}[$Index]}, @ColLabels;
    for my $ColNum (0 .. $#ColLabels) {
      my $ColLabel = $ColLabels[$ColNum];
      $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel} = $ColNum;
    }
  }

}
289
# Make sure the specified key column is okay...
#
# Resolves the sort key given by $OptionsInfo{SpecifiedKeyCol} into a zero-based
# column index for every usable input file and stores it in
# $TextFilesInfo{KeyColNum}[$Index]. When no key was specified the first column
# (index 0) is used. In colnum mode the key is a 1-based column number that must
# lie within the file's column count; otherwise it is looked up as a column label.
# A file whose key column can't be resolved is reported with a warning and gets
# its FileOkay flag cleared.
sub ProcessColumnsInfo {
  @{$TextFilesInfo{KeyColNum}} = ();

  my $SpecifiedKeyCol = $OptionsInfo{SpecifiedKeyCol};

  # Test for presence rather than truth: a column labelled "0" is a legitimate
  # key and must not silently fall back to the first column.
  my $KeySpecified = (defined($SpecifiedKeyCol) && length($SpecifiedKeyCol)) ? 1 : 0;

  FILELIST: for my $Index (0 .. $#TextFilesList) {
    my $TextFile = $TextFilesList[$Index];
    $TextFilesInfo{KeyColNum}[$Index] = 0;

    next FILELIST unless $TextFilesInfo{FileOkay}[$Index];

    my $KeyColNum = 0;
    my $KeyColValid = 1;

    if ($KeySpecified) {
      if ($OptionsInfo{Mode} =~ /^colnum$/i) {
        # Column numbers are 1-based on the command line and 0-based internally.
        if ($SpecifiedKeyCol >= 1 && $SpecifiedKeyCol <= $TextFilesInfo{ColCount}[$Index]) {
          $KeyColNum = $SpecifiedKeyCol - 1;
        }
        else {
          $KeyColValid = 0;
        }
      }
      else {
        if (exists($TextFilesInfo{ColLabelToNumMap}[$Index]{$SpecifiedKeyCol})) {
          $KeyColNum = $TextFilesInfo{ColLabelToNumMap}[$Index]{$SpecifiedKeyCol};
        }
        else {
          $KeyColValid = 0;
        }
      }
    }

    if ($KeyColValid) {
      $TextFilesInfo{KeyColNum}[$Index] = $KeyColNum;
    }
    else {
      warn "Warning: Ignoring file $TextFile: Column key specified, $SpecifiedKeyCol, using \"-k --key\" option doesn't exist\n";
      $TextFilesInfo{FileOkay}[$Index] = 0;
    }
  }
}
335
# Process option values...
#
# Rebuilds %OptionsInfo from the command line settings held in %Options. The
# output delimiter name is turned into the delimiter character and the quote
# setting into a 1/0 flag; the remaining values are copied as they are. Dies
# when a key is given in colnum mode and it is not a positive integer.
sub ProcessOptions {
  # Comma unless tab or semicolon was asked for.
  my $OutDelim = "\,";
  if ($Options{outdelim} =~ /^tab$/i) {
    $OutDelim = "\t";
  }
  elsif ($Options{outdelim} =~ /^semicolon$/i) {
    $OutDelim = "\;";
  }

  my $OutQuote = 0;
  if ($Options{quote} =~ /^yes$/i) {
    $OutQuote = 1;
  }

  # Overwrite, Root and Key stay undefined when the option wasn't given.
  %OptionsInfo = (
    Mode            => $Options{mode},
    DetailLevel     => $Options{detail},
    Sort            => $Options{sort},
    KeyData         => $Options{keydata},
    InDelim         => $Options{indelim},
    OutDelim        => $OutDelim,
    OutQuote        => $OutQuote,
    Overwrite       => $Options{overwrite},
    Root            => $Options{root},
    Key             => $Options{key},
    SpecifiedKeyCol => "",
  );

  if (defined $Options{key}) {
    $OptionsInfo{SpecifiedKeyCol} = $Options{key};

    # A column number has to be a positive integer; a column label is checked
    # later against the labels of each file.
    if ($Options{mode} =~ /^colnum$/i && !IsPositiveInteger($OptionsInfo{SpecifiedKeyCol})) {
      die "Error: Invalid value $Options{key} specified using \"-k --key\" option: Allowed values: > 0\n";
    }
  }
}
367
# Setup script usage and retrieve command line arguments specified using various options...
#
# Fills %Options with the default settings, lets GetOptions override them from
# the command line, changes into the working directory when one is given, and
# dies with an explanatory message on the first option value that isn't allowed.
sub SetupScriptUsage {
  # Defaults; anything given on the command line replaces these.
  %Options = (
    detail   => 1,
    mode     => "colnum",
    sort     => "ascending",
    keydata  => "numeric",
    indelim  => "comma",
    outdelim => "comma",
    quote    => "yes",
  );

  my @OptionSpecs = (
    "detail|d=i", "help|h", "indelim=s", "key|k=s", "keydata=s", "mode|m=s",
    "outdelim=s", "overwrite|o", "quote|q=s", "root|r=s", "sort|s=s", "workingdir|w=s",
  );
  GetOptions(\%Options, @OptionSpecs)
    or die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";

  # Switch directories first so that relative file names are resolved against it.
  if ($Options{workingdir}) {
    -d $Options{workingdir}
      or die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  # Validate the option values one by one; the first bad value ends the script.
  die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: colnum or collabel\n"
    unless $Options{mode} =~ /^(colnum|collabel)$/i;

  die "Error: The value specified, $Options{keydata}, for option \"--keydata\" is not valid. Allowed values: numeric or alphanumeric\n"
    unless $Options{keydata} =~ /^(numeric|alphanumeric)$/i;

  die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n"
    unless $Options{indelim} =~ /^(comma|semicolon)$/i;

  die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n"
    unless $Options{outdelim} =~ /^(comma|semicolon|tab)$/i;

  die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n"
    unless $Options{quote} =~ /^(yes|no)$/i;

  die "Error: The value specified, $Options{sort}, for option \"-s --sort\" is not valid. Allowed values: ascending or descending\n"
    unless $Options{sort} =~ /^(ascending|descending)$/i;

  die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n"
    unless IsPositiveInteger($Options{detail});
}
410
411 __END__
412
413 =head1 NAME
414
415 SortTextFiles.pl - Sort TextFile(s) using values for a column
416
417 =head1 SYNOPSIS
418
419 SortTextFiles.pl TextFile(s)...
420
421 SortTextFiles.pl [B<-d, --detail> infolevel] [B<-h, --help>] [B<--indelim> comma | semicolon] [B<-k, --key> colnum | collabel]
422 [B<--keydata> numeric | alphanumeric] [B<-m, --mode> colnum | collabel] [B<-o, --overwrite>]
423 [B<--outdelim> comma | tab | semicolon] [B<-q, --quote> yes | no] [B<-r, --root> rootname]
424 [B<-s, --sort> ascending | descending] [B<-w, --workingdir> dirname] TextFile(s)...
425
426 =head1 DESCRIPTION
427
428 Sort I<TextFile(s)> using values for a key column specified by a column number or label.
429 Only one column key can be specified for sorting. In the event of a tie during the sorting
430 process, rows with identical values for the column key are simply transferred to output files
431 in the order of their presence in input files. Additionally, rows with empty or inappropriate
432 values for the column key are simply placed at the end. The file names are separated by spaces.
433 The valid file extensions are I<.csv> and I<.tsv> for comma/semicolon and tab delimited
434 text files respectively. All other file names are ignored. All the text files in a
435 current directory can be specified by I<*.csv>, I<*.tsv>, or the current directory
436 name. The B<--indelim> option determines the format of I<TextFile(s)>. Any file
437 which doesn't correspond to the format indicated by B<--indelim> option is ignored.
438
439 =head1 OPTIONS
440
441 =over 4
442
443 =item B<-d, --detail> I<infolevel>
444
445 Level of information to print about lines being ignored. Default: I<1>. Possible values:
446 I<1, 2 or 3>.
447
448 =item B<-h, --help>
449
450 Print this help message.
451
452 =item B<--indelim> I<comma | semicolon>
453
454 Input delimiter for CSV I<TextFile(s)>. Possible values: I<comma or semicolon>.
455 Default value: I<comma>. For TSV files, this option is ignored and I<tab> is used as a
456 delimiter.
457
458 =item B<-k, --key> I<col number | col name>
459
460 This value is mode specific. It specifies which column to use for sorting I<TextFile(s)>.
461 Possible values: I<col number or col label>. Default value: I<first column>.
462
463 =item B<--keydata> I<numeric | alphanumeric>
464
465 Data type for column key. Possible values: I<numeric or alphanumeric>. Default value:
466 I<numeric>. For I<alphanumeric> data values, comparison is case insensitive.
467
468 =item B<-m, --mode> I<colnum | collabel>
469
470 Specify how to sort text files: using column number or column label.
471 Possible values: I<colnum or collabel>. Default value: I<colnum>.
472
473 =item B<-o, --overwrite>
474
475 Overwrite existing files.
476
477 =item B<--outdelim> I<comma | tab | semicolon>
478
479 Output text file delimiter. Possible values: I<comma, tab, or semicolon>
480 Default value: I<comma>.
481
482 =item B<-q, --quote> I<yes | no>
483
484 Put quotes around column values in output text file. Possible values: I<yes or
485 no>. Default value: I<yes>.
486
487 =item B<-r, --root> I<rootname>
488
489 New text file name is generated using the root: <Root>.<Ext>. Default new file
490 name: <InitialTextFileName>SortedByColumn.<Ext>. The csv, and tsv
491 <Ext> values are used for comma/semicolon, and tab delimited text files
492 respectively. This option is ignored for multiple input files.
493
494 =item B<-s, --sort> I<ascending | descending>
495
496 Sorting order for column values. Possible values: I<ascending or descending>.
497 Default value: I<ascending>.
498
499 =item B<-w, --workingdir> I<dirname>
500
501 Location of working directory. Default: current directory.
502
503 =back
504
505 =head1 EXAMPLES
506
507 To perform numerical sort in ascending order using first column values and generate
508 a new CSV text file NewSample1.csv, type:
509
510 % SortTextFiles.pl -o -r NewSample1 Sample1.csv
511
512 To perform numerical sort in descending order using MolWeight column and generate
513 a new CSV text file NewSample1.csv, type:
514
515 % SortTextFiles.pl -m collabel -k MolWeight --keydata numeric
516 -s descending -r NewSample1 -o Sample1.csv
517
518 To perform numerical sort in ascending order using column number 1 and generate
519 a new TSV text file NewSample1.csv, type:
520
521 % SortTextFiles.pl -m colnum -k 1 --keydata numeric -s ascending
522 -r NewSample1 --outdelim tab -o Sample1.csv
523
524 =head1 AUTHOR
525
526 Manish Sud <msud@san.rr.com>
527
528 =head1 SEE ALSO
529
530 JoinTextFiles.pl, MergeTextFilesWithSD.pl, ModifyTextFilesFormat.pl, SplitTextFiles.pl, TextFilesToHTML.pl
531
532 =head1 COPYRIGHT
533
534 Copyright (C) 2015 Manish Sud. All rights reserved.
535
536 This file is part of MayaChemTools.
537
538 MayaChemTools is free software; you can redistribute it and/or modify it under
539 the terms of the GNU Lesser General Public License as published by the Free
540 Software Foundation; either version 3 of the License, or (at your option)
541 any later version.
542
543 =cut