Mercurial > repos > bgruening > text_processing
comparison sort-header @ 4:56e80527c482 draft
Uploaded
author | bgruening |
---|---|
date | Wed, 07 Jan 2015 11:10:52 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
3:7068d1548234 | 4:56e80527c482 |
---|---|
1 #!/usr/bin/env perl | |
2 ## | |
3 ## Sort-header - wrapper for GNU sort with header-line support | |
4 ## | |
5 ## Copyright(C) A. Gordon | |
6 ## license AGPLv3+ | |
7 ## | |
8 use strict; | |
9 use warnings; | |
10 use Data::Dumper; | |
11 use IO::Handle; | |
12 use Getopt::Long qw(:config bundling no_ignore_case_always); | |
13 | |
## Forward declarations.
## The subs are defined below the main program sequence; declaring them here
## (with the same prototypes as their definitions) lets the top-level code
## call them before their bodies have been compiled.

# Getopt::Long option handlers
sub add_standard_sort_param(@);
sub add_standard_sort_param_value(@);
sub forbidden_sort_param(@);

# Informational screens (each one exits the program)
sub show_help();
sub show_version();
sub show_examples();

# Main processing steps, in the order they are executed
sub parse_commandline_options();
sub reassign_input_output();
sub process_header_lines();
sub run_sort();

# Low-level input helper
sub read_line_non_buffered();
26 | |
27 | |
##
## Runtime options
##
my $PROGRAM = "sort-header";    # program name, used as prefix of all messages
my $VERSION = 0.4;

my $check_only;                 # not referenced by any sub in this file
my $input_file;                 # set from the single non-option argument, if any
my $output_file;                # set by --output / -o
my $field_separator;            # not referenced by any sub in this file
my $header_lines = 1;           # set by --header
my $debug;                      # set by --debugheader
my $sort_exit_code = 1;         # by default, assume some error

# Options collected by the Getopt::Long handlers; passed verbatim to 'sort'.
my @sort_options;
43 | |
##
## Program Start
##
parse_commandline_options();    # fills the runtime options and @sort_options
reassign_input_output();        # points STDIN/STDOUT at the requested files
process_header_lines();         # copies the header lines straight to STDOUT
run_sort();                     # runs 'sort' and records its exit code
exit $sort_exit_code;
##
## Program End
##
55 | |
## Print a short usage example to STDOUT, then terminate with exit code 0.
## Used as the handler of the --examples option.
sub show_examples()
{
    # Nothing in this text needs interpolation, so a single-quoted
    # here-document is used and the dollar signs are written literally.
    my $examples = <<'EOF';
Sorting a file with a header line:

$ cat input.txt
Fruit Color Price
Banana Yellow 4.1
Avocado Green 8.0
Apple Red 3.0
Melon Green 6.1

# By default, 'sort-header' assumes 1 header line
# (no need to use --header in this case).

$ sort-header -k3,3nr input.txt
Fruit Color Price
Avocado Green 8.0
Melon Green 6.1
Banana Yellow 4.1
Apple Red 3.0

EOF
    print $examples;
    exit(0);
}
81 | |
## Print the help screen to STDOUT, then terminate with exit code 0.
## Used as the handler of the --help option.
##
## The list of unsupported GNU sort options printed here mirrors the options
## that parse_commandline_options() routes to forbidden_sort_param(),
## including --batch-size.
sub show_help()
{
    print <<"EOF";
${PROGRAM}: Wrapper for GNU sort, allowing sorting files with header lines.

Usage: ${PROGRAM} [HEADER-OPTIONS] [GNU sort Options] [INPUT-FILE]

HEADER-OPTIONS: the following options are supported by '${PROGRAM}':

--header N = Treat the first N lines as header lines.
These lines will NOT be sorted. They will be passed
directly to the output file. (default: 1)

--version = Print ${PROGRAM}'s version.

--debugheader = Print debug messages (relating to ${PROGRAM}'s operation).

--help = Show this help screen.

--examples = Show usage examples.

GNU sort options:
Most of the standard GNU sort options are supported and passed to GNU sort.
The following options can not be used with '${PROGRAM}':

-m --merge => ${PROGRAM} can only sort one file, not merge multiple files.
-c -C --check => Currently not supported
--files0-from => Currently not supported
-z --zero-terminated => Currently not supported
--batch-size => Currently not supported

INPUT-FILE:
If INPUT-FILE is not specified, ${PROGRAM} will use STDIN (just like GNU sort).

EOF
    exit(0);
}
118 | |
## Print the program name, version and licence notice to STDOUT, then
## terminate with exit code 0. Used as the handler of the --version option.
sub show_version()
{
    my $banner = <<"EOF";
${PROGRAM} ${VERSION}
Copyright (C) 2010 A. Gordon (gordon\@cshl.edu)
License AGPLv3+: Affero GPL version 3 or later (http://www.gnu.org/licenses/agpl.html)

To see the GNU's sort version, run:
sort --version
EOF
    print $banner;
    exit(0);
}
131 | |
## Parse the command line.
##
## Options fall into four groups:
##   * GNU sort options that are passed through: their handlers append the
##     option (and its value, if any) to @sort_options;
##   * GNU sort options that this wrapper rejects: their handler dies;
##   * --output/-o, which is handled by this program (see
##     reassign_input_output) and is NOT passed to sort;
##   * this program's own options (--header, --debugheader, --help,
##     --version, --examples).
##
## On return $header_lines, $output_file, $debug and @sort_options are set,
## and $input_file holds the single non-option argument, if one was given.
## Exits with code 1 if Getopt::Long reports an error; dies on a negative
## header count or on more than one input file.
sub parse_commandline_options()
{
    my $flag      = \&add_standard_sort_param;        # option without a value
    my $valued    = \&add_standard_sort_param_value;  # option with a value
    my $forbidden = \&forbidden_sort_param;           # option that is rejected

    my $rc = GetOptions(
        # Ordering options
        "ignore-leading-blanks|b" => $flag,
        "dictionary-order|d"      => $flag,
        "ignore-case|f"           => $flag,
        "general-numeric-sort|g"  => $flag,
        "ignore-nonprinting|i"    => $flag,
        "month-sort|M"            => $flag,
        "human-numeric-sort|h"    => $flag,
        "numeric-sort|n"          => $flag,
        "random-source=s"         => $valued,
        "random-sort|R"           => $flag,
        "reverse|r"               => $flag,
        "sort=s"                  => $valued,
        "version-sort|V"          => $flag,

        # Other GNU sort options
        "check|c"                 => $forbidden,
        "C"                       => $forbidden,
        "compress-program=s"      => $valued,
        "debug"                   => $flag,

        "files0-from=s"           => $forbidden,

        "key|k=s"                 => $valued,
        "merge|m"                 => $forbidden,
        "batch-size=i"            => $forbidden,

        "parallel=i"              => $valued,

        # Handled here rather than by sort: STDOUT is redirected instead.
        "output|o=s"              => \$output_file,

        "stable|s"                => $flag,
        "buffer-size|S=s"         => $valued,

        "field-separator|t=s"     => $valued,
        "temporary-directory|T=s" => $valued,
        "unique|u"                => $flag,

        "zero-terminated|z"       => $forbidden,

        # Informational screens (each handler exits)
        "help"                    => \&show_help,
        "version"                 => \&show_version,
        "examples"                => \&show_examples,

        # Options of this wrapper
        "header=i"                => \$header_lines,
        "debugheader"             => \$debug,
    );

    exit 1 unless $rc;

    die "$PROGRAM: error: invalid number of header lines ($header_lines)\n"
        if $header_lines < 0;

    # Whatever Getopt::Long left in @ARGV are the input file names.
    die "$PROGRAM: error: Multiple input files specified. This program can sort only a single file.\n"
        if scalar(@ARGV) > 1;
    $input_file = $ARGV[0] if scalar(@ARGV) == 1;

    if ($debug) {
        warn "$PROGRAM: number of header lines = $header_lines\n";
        warn "$PROGRAM: PASS-to-Sort options:\n", Dumper(\@sort_options), "\n";
    }
}
194 | |
## Point STDOUT and STDIN at the files requested on the command line.
##
## Both this program (for the header lines) and the 'sort' child process
## started by run_sort() use STDIN/STDOUT, so redirecting them here is what
## makes --output and the INPUT-FILE argument take effect.
##
## A file name is tested with defined/length rather than for truth, so a
## file literally named "0" is honoured. An input file of "-" means
## standard input (the GNU sort convention) and leaves STDIN untouched.
##
## Dies if a file can not be opened or a standard handle can not be
## reassigned.
sub reassign_input_output()
{
    if (defined $output_file && length $output_file) {
        warn "$PROGRAM: Re-assigning STDOUT to '$output_file'\n" if $debug;
        open my $output_fh, '>', $output_file
            or die "$PROGRAM: Error: failed to create output file '$output_file': $!\n";
        # fdopen() duplicates the descriptor onto STDOUT, so the temporary
        # lexical handle may safely be closed when it goes out of scope.
        STDOUT->fdopen($output_fh, 'w')
            or die "$PROGRAM: Error: failed to reassign STDOUT to '$output_file': $!\n";
    }

    if (defined $input_file && length $input_file && $input_file ne '-') {
        warn "$PROGRAM: Re-assigning STDIN to '$input_file'\n" if $debug;
        open my $input_fh, '<', $input_file
            or die "$PROGRAM: Error: failed to open input file '$input_file': $!\n";
        STDIN->fdopen($input_fh, 'r')
            or die "$PROGRAM: Error: failed to reassign STDIN to '$input_file': $!\n";
    }
}
210 | |
## Copy the first $header_lines lines of STDIN to STDOUT, unchanged.
##
## Lines are fetched with read_line_non_buffered(). If the input ends before
## all header lines were read, the program exits right away (with perl's
## default exit code) and 'sort' is never started.
sub process_header_lines()
{
    warn "$PROGRAM: Reading $header_lines header lines...\n" if $debug;

    my $remaining = $header_lines;
    while ($remaining > 0) {
        my $header = read_line_non_buffered();
        exit if !defined $header;
        print $header;
        $remaining--;
    }
}
220 | |
## Run 'sort' with the collected @sort_options and wait for it to finish.
##
## system() is called in list form, so no shell is involved. The child's
## status is decoded from $?:
##   * -1               : 'sort' could not be executed      -> die
##   * low 7 bits set   : 'sort' was killed by a signal     -> die
##                        (for signal 2 the same signal is first sent to
##                        this process)
##   * otherwise        : the exit code is stored in $sort_exit_code
sub run_sort()
{
    warn "$PROGRAM: Running GNU sort...\n" if $debug;
    system('sort', @sort_options);
    my $status = $?;

    die "$PROGRAM: Error: failed to execute 'sort': $!\n" if $status == -1;

    my $signal = $status & 127;
    if ($signal) {
        ##if sort was interrupted (CTRL-C) - just pass it on and commit suicide
        kill 2, $$ if $signal == 2;
        die "$PROGRAM: Error: 'sort' child-process died with signal $signal\n";
    }

    $sort_exit_code = $status >> 8;
}
237 | |
238 | |
## Getopt::Long handler for pass-through options that take no value.
## Only the first argument (the option) is used; whatever value
## Getopt::Long supplies for the flag is discarded, so that only the option
## itself is forwarded to 'sort'.
sub add_standard_sort_param(@)
{
    my $option_name = $_[0];
    return add_standard_sort_param_value($option_name, undef);
}
244 | |
## Getopt::Long handler for pass-through options that take a value.
##
## Parameters:
##   $obj   - the option as handed over by Getopt::Long; stringifying it
##            gives the option name
##   $value - the option's value, or undef when there is none
##
## Appends "-x" (one-character name) or "--name" (longer name) to
## @sort_options, followed by the value as a separate element.
##
## The value is tested with 'defined', not for truth: values such as "0"
## or the empty string are legitimate (e.g. '-t 0') and must be forwarded.
## Dropping them would make 'sort' take the next argument as the value.
sub add_standard_sort_param_value(@)
{
    my ($obj, $value) = @_;

    my $option = "" . $obj;    # stringify the option object, get the option name.
    my $dashes = length($option) == 1 ? "-" : "--";

    push @sort_options, $dashes . $option;
    push @sort_options, $value if defined $value;
}
259 | |
## Getopt::Long handler for GNU sort options this wrapper does not support.
## Always dies, naming the offending option; any option value is ignored.
sub forbidden_sort_param(@)
{
    my $option = "" . $_[0];    # stringify the option object, get the option name.

    die "$PROGRAM: Error: option '$option' can not be used with this program. If you must use it, run GNU sort directly. see --help for more details.\n";
}
267 | |
## Read one line from STDIN, one byte at a time, using sysread.
##
## sysread bypasses perl's buffered I/O, so no more than the returned line
## is consumed from STDIN. The rest of the input stays unread for the 'sort'
## process that run_sort() starts afterwards on the same STDIN.
##
## Returns the line including its trailing "\n". At end of input, returns
## the partial last line if any bytes were read (even if that line is just
## "0"), or undef if nothing was read. Dies on a read error.
sub read_line_non_buffered()
{
    my $line = '';
    while (1) {
        my $c;
        my $rc = sysread STDIN, $c, 1;
        die "$PROGRAM: STDIN Read error: $!" unless defined $rc;

        if ($rc == 0) {
            # End of input: test the length, not the truth, of what was
            # collected, so an unterminated last line of "0" is not lost.
            return length($line) ? $line : undef;
        }

        $line .= $c;
        return $line if $c eq "\n";
    }
}
281 |