Mercurial > repos > bgruening > text_processing
view easyjoin @ 4:56e80527c482 draft
Uploaded
author | bgruening |
---|---|
date | Wed, 07 Jan 2015 11:10:52 -0500 |
parents | ec66f9d90ef0 |
children |
line wrap: on
line source
#!/usr/bin/env perl
## EASY Join -
##   Join with automatic pre-sorting of both files
## Copyright (C) 2010 A. Gordon (gordon@cshl.edu)
## license: AGPLv3+
##
## Overview:
##   1. Parse the command line (GNU join options + a few sort options).
##   2. Sort each input file on its key column into a temporary file.
##   3. Run GNU join on the two sorted temporary files.
##   4. Delete the temporary files (always - even when sort/join fail).
use strict;
use warnings;
use Data::Dumper;
use Getopt::Long qw(:config bundling no_ignore_case_always);
use File::Temp qw/tempfile/;
use POSIX qw(locale_h);

sub show_help();
sub show_version();
sub show_examples();
sub parse_commandline_options();
sub is_set($);
sub create_temp_file();
sub sort_file($$$);
sub join_files($$);
sub cleanup_files(@);

my $PROGRAM="easyjoin";
my $VERSION="0.6.1";

my $debug=undef;
my $HEADER=undef;
my $IGNORE_CASE=undef;
my $FIELD_SEP=undef;
my $FILE1_KEY_COLUMN=1;
my $FILE2_KEY_COLUMN=1;
my @OUTPUT_SPECIFIERS=();
my $OUTPUT_FORMAT=undef;
my $EMPTY_FILLER=undef;
my $SORT_BUFFER_SIZE=undef;
my $SORT_TEMP_DIR=undef;
my $input_filename1;
my $input_filename2;

## Temporary files created so far; removed by the END block below.
my @TEMP_FILES=();

##
## Program Start
##

## "C" locale is critical for sorting and joining correctly.
## LC_ALL overrides LANG (and LC_COLLATE), so it must be forced as well,
## otherwise the user's locale could still influence sort and join.
$ENV{'LANG'}="C";
$ENV{'LC_ALL'}="C";

parse_commandline_options();

my $tmp_filename1 = create_temp_file();
my $tmp_filename2 = create_temp_file();

sort_file($input_filename1, $tmp_filename1, $FILE1_KEY_COLUMN);
sort_file($input_filename2, $tmp_filename2, $FILE2_KEY_COLUMN);

my $join_exit_code = join_files($tmp_filename1, $tmp_filename2);

## The temporary files are deleted by the END block.
exit($join_exit_code);

##
## Program end
##

## Runs on every normal termination (exit or die), so the temporary files
## are removed even when sort or join fail half-way through.
END {
    local $?;    ## cleanup must not change the program's exit code
    cleanup_files(@TEMP_FILES);
}


## Prints the usage screen to STDOUT and terminates with exit code 0.
sub show_help()
{
print<<EOF;
${PROGRAM}: Wrapper for GNU join+sort, automatically sorts files before joining them.

Usage: $PROGRAM [OPTIONS] [JOIN-OPTIONS] [SORT-OPTIONS] FILE1 FILE2

OPTIONS: Options specific to this program:

   --header      =  Both input files have a header line as the first line.
                    The header line will be joined properly, without being sorted.

   --version     =  Print ${PROGRAM}'s version.

   --debug       =  Print debug messages (relating to ${PROGRAM}'s operation).

   --help        =  Show this help screen.

   --examples    =  Show usage examples.

   --all         =  Short-cut for:
                      -a 1 -a 2 -o auto -e . -t <TAB>
                    This will show all values (paired and unpaired) from both files,
                    Automatically formatting the columns, and using TAB as field separator.
                    You can override the empty filler (-e X) on the command line.

   --allh        =  Short-cut for:
                      -a 1 -a 2 -o auto -e . -t <TAB> --header
                    Same as above, but will also respect the header line from both input files.

JOIN-OPTIONS:
   All of GNU join options are supported.
   Run:
       join --help
   To see all possible joining options.

SORT-OPTIONS:
   The following options are supported for the intermediate sorting step:

   -S SIZE
   --buffer-size SIZE        = GNU sort's --buffer-size option.

   -T DIR
   --temporary-directory DIR = GNU sort's --temporary-directory option.

   Run:
      sort --help
   To learn about these options. They might improve sorting performances for big files.

FILE1 FILE2:
   The two input files to be sorted, joined.
   Unlike GNU join, joining STDIN is not supported. Both files must be real files.


NOTE About "--header" and "-o auto":
   The "--header" feature requires GNU coreutils version 8.6 or later.
   The "-o auto" feature requires GNU coreutils version 8.10 or later.

EOF
    exit(0);
}

## Prints the program version and license to STDOUT and terminates with exit code 0.
sub show_version()
{
print<<EOF;
$PROGRAM $VERSION
Copyright (C) 2010 A. Gordon (gordon\@cshl.edu)
License AGPLv3+: Affero GPL version 3 or later (http://www.gnu.org/licenses/agpl.html)

To see the GNU's join version, run:
    join --version
EOF
    exit(0);
}

## Prints usage examples to STDOUT and terminates with exit code 0.
sub show_examples()
{
print<<EOF;
Example of joining two unsorted files (each file having a header line):

\$ cat input1.txt
Fruit   Color
Apple   red
Banana  yellow
Orange  orange
Melon   green

\$ cat input2.txt
Fruit   Price
Orange  7
Avocado 8
Apple   4
Banana  3

\$ easyjoin -j 1 -a 1 -a 2 --header -e . -o auto input1.txt input2.txt
Fruit   Color   Price
Apple   red     4
Avocado .       8
Banana  yellow  3
Melon   green   .
Orange  orange  7

## A short-cut for all the options above:
\$ easyjoin --allh input1.txt input2.txt
Fruit   Color   Price
Apple   red     4
Avocado .       8
Banana  yellow  3
Melon   green   .
Orange  orange  7

EOF
    exit(0);
}

## Parses \@ARGV into the file-level option variables, and validates that
## exactly two existing, real (non-STDIN) input files were given.
## Dies with an error message on any invalid input.
sub parse_commandline_options()
{
    ##
    ## Parse command line
    ##
    my $rc = GetOptions(
        "a=i" => sub { push @OUTPUT_SPECIFIERS, '-a', $_[1] },
        "e=s" => \$EMPTY_FILLER,
        "ignore-case|i" => \$IGNORE_CASE,
        "j=i" => sub { $FILE1_KEY_COLUMN = $_[1] ; $FILE2_KEY_COLUMN = $_[1] ; },
        "o=s" => \$OUTPUT_FORMAT,
        "t=s" => \$FIELD_SEP,
        "v=i" => sub { push @OUTPUT_SPECIFIERS, '-v', $_[1] },
        "1=i" => \$FILE1_KEY_COLUMN,
        "2=i" => \$FILE2_KEY_COLUMN,
        "debug" => \$debug,
        "header" => \$HEADER,
        "help" => \&show_help,
        "version" => \&show_version,
        "examples" => \&show_examples,
        "buffer-size|S=s" => \$SORT_BUFFER_SIZE,
        "temporary-directory|T=s" => \$SORT_TEMP_DIR,
        "all" => sub {
                push @OUTPUT_SPECIFIERS, "-a", 1, "-a", 2;
                $FIELD_SEP = "\t";
                $OUTPUT_FORMAT = "auto";
                ## an explicit "-e X" given earlier on the command line wins
                $EMPTY_FILLER = "." unless defined $EMPTY_FILLER;
            },
        "allh" => sub {
                push @OUTPUT_SPECIFIERS, "-a", 1, "-a", 2;
                $FIELD_SEP = "\t";
                $OUTPUT_FORMAT = "auto";
                $HEADER=1;
                $EMPTY_FILLER = "." unless defined $EMPTY_FILLER;
            },
    );
    die "$PROGRAM: invalid command-line arguments.\n" unless $rc;

    ## We need two file names to join
    my @INPUT_FILES = @ARGV;

    die "$PROGRAM: missing operand: two file names to join\n"
        if (scalar(@INPUT_FILES)<2);
    die "$PROGRAM: error: too many files specified (can only join two files)\n"
        if (scalar(@INPUT_FILES)>2);
    die "$PROGRAM: error: input file can't be STDIN, please use a real file name.\n"
        if $INPUT_FILES[0] eq "-" || $INPUT_FILES[1] eq "-";
    die "$PROGRAM: error: input file 1 '" . $INPUT_FILES[0] . "' not found!\n"
        unless -e $INPUT_FILES[0];
    die "$PROGRAM: error: input file 2 '" . $INPUT_FILES[1] . "' not found!\n"
        unless -e $INPUT_FILES[1];

    $input_filename1 = $INPUT_FILES[0];
    $input_filename2 = $INPUT_FILES[1];
}

## Returns true if an option value was given and is not the empty string.
## Plain truthiness is not enough: "0" is a valid value for several options
## (e.g. "-o 0" asks join to print only the join field, "-t 0" uses the
## character '0' as the field separator).
sub is_set($)
{
    my ($value) = @_;
    return defined($value) && length($value) > 0;
}

## Creates an empty temporary file, registers it for deletion at program
## end, and returns its name.
## The file is created (atomically, by File::Temp) rather than just having
## a name reserved: this avoids the race condition of tempfile(OPEN=>0).
## sort later overwrites it through its --output option.
sub create_temp_file()
{
    my ($fh, $filename) = tempfile();
    push @TEMP_FILES, $filename;
    close($fh)
        or die "$PROGRAM: Error: failed to close temporary file '$filename': $!\n";
    return $filename;
}

## Sorts $input_filename on $key_column, writing the result to $output_filename.
## Uses "./sort-header" instead of "sort" when the inputs have a header line.
## Dies if the sort process can not be started, is killed by a signal, or
## terminates with a non-zero exit code.
sub sort_file($$$)
{
    my ($input_filename, $output_filename, $key_column) = @_;

    my @SORT_COMMAND;
    push @SORT_COMMAND, $HEADER ? "./sort-header" : "sort" ;
    push @SORT_COMMAND, "-f" if $IGNORE_CASE;
    push @SORT_COMMAND, "-k${key_column},${key_column}" ;
    push @SORT_COMMAND, "--buffer-size", $SORT_BUFFER_SIZE if is_set($SORT_BUFFER_SIZE);
    push @SORT_COMMAND, "--temporary-directory", $SORT_TEMP_DIR if is_set($SORT_TEMP_DIR);
    push @SORT_COMMAND, "--output", $output_filename;
    push @SORT_COMMAND, "--debugheader" if $debug && $HEADER;
    push @SORT_COMMAND, "-t", $FIELD_SEP if is_set($FIELD_SEP);
    push @SORT_COMMAND, $input_filename;

    if ($debug) {
        warn "$PROGRAM: Running sort on '$input_filename' => '$output_filename'\n";
        warn "$PROGRAM: Sort command line:\n";
        print STDERR Dumper(\@SORT_COMMAND), "\n";
    }

    my $sort_exit_code=1;
    ## list-form system(): no shell is involved, file names are passed verbatim
    system(@SORT_COMMAND);
    if ($? == -1) {
        die "$PROGRAM: Error: failed to execute 'sort': $!\n";
    }
    elsif ($? & 127) {
        my $signal = ($? & 127);
        kill 2, $$ if $signal == 2; ##if sort was interrupted (CTRL-C) - just pass it on and commit suicide
        die "$PROGRAM: Error: 'sort' child-process died with signal $signal\n";
    }
    else {
        $sort_exit_code = ($? >> 8);
    }
    die "$PROGRAM: Error: 'sort' process failed, exit code $sort_exit_code\n"
        if $sort_exit_code!=0;
}

## Runs GNU join on the two (already sorted) files, writing to STDOUT.
## Returns join's exit code.
## Dies if the join process can not be started or is killed by a signal.
sub join_files($$)
{
    my ($file1, $file2) = @_;

    my @join_command = qw/join/;
    push @join_command, "--header" if $HEADER;
    push @join_command, "--ignore-case" if $IGNORE_CASE;
    push @join_command, "-t", $FIELD_SEP if is_set($FIELD_SEP);
    push @join_command, "-1", $FILE1_KEY_COLUMN if $FILE1_KEY_COLUMN;
    push @join_command, "-2", $FILE2_KEY_COLUMN if $FILE2_KEY_COLUMN;
    push @join_command, "-e", $EMPTY_FILLER if defined $EMPTY_FILLER;
    push @join_command, "-o", $OUTPUT_FORMAT if is_set($OUTPUT_FORMAT);
    push @join_command, @OUTPUT_SPECIFIERS;
    push @join_command, $file1, $file2;

    if ($debug) {
        warn "$PROGRAM: Running join on '$file1' and '$file2'\n";
        warn "$PROGRAM: join command line:\n";
        print STDERR Dumper(\@join_command), "\n";
    }

    my $join_exit_code=1;
    system(@join_command);
    if ($? == -1) {
        die "$PROGRAM: Error: failed to execute 'join': $!\n";
    }
    elsif ($? & 127) {
        my $signal = ($? & 127);
        kill 2, $$ if $signal == 2; ##if join was interrupted (CTRL-C) - just pass it on and commit suicide
        die "$PROGRAM: Error: 'join' child-process died with signal $signal\n";
    }
    else {
        $join_exit_code = ($? >> 8);
    }
    return $join_exit_code;
}

## Deletes the given temporary files.
## In debug mode the files are kept (for inspection) and a message is
## printed instead. A failure to delete a file is reported as a warning,
## it is not fatal.
sub cleanup_files(@)
{
    my (@files) = @_;

    foreach my $file (@files) {
        if ($debug) {
            warn "$PROGRAM: debug mode, not deleting temporary file '$file'\n";
        }
        else {
            my $count = unlink $file;
            warn "$PROGRAM: Error: failed to delete temporary file '$file': $!\n"
                if ($count != 1);
        }
    }
}