Mercurial > repos > bgruening > text_processing
diff multijoin @ 0:ec66f9d90ef0 draft
initial uploaded
author | bgruening |
---|---|
date | Thu, 05 Sep 2013 04:58:21 -0400 |
parents | |
children | 062ed2bb4f2e |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/multijoin Thu Sep 05 04:58:21 2013 -0400 @@ -0,0 +1,321 @@ +#!/usr/bin/env perl +use strict; +use warnings; +use Getopt::Long qw(:config no_ignore_case); +use Data::Dumper; +use Carp; +use File::Basename; +use Sort::Key::Natural qw(natsort); + +my $version = "0.1.1"; +my $field_sep = "\t"; +my $key_column; +my @values_columns; +my $max_value_column; +my @input_files; +my $input_headers ; +my $output_headers; +my $filler = "0"; +my $filler_string ; +my $ignore_duplicates; +my $debug = 0 ; +my %input_headers; +my $have_file_labels; +my %file_labels; + +sub parse_command_line_parameters(); +sub show_help(); +sub read_input_file($); +sub print_combined_data(); +sub sanitize_filename($); +sub print_output_header(); +sub show_examples(); + +## +## Program Start +## + +parse_command_line_parameters(); + +my %data; +foreach my $file (@input_files) { + read_input_file($file); +} +#print STDERR Dumper(\%input_headers),"\n"; +#print STDERR Dumper(\%data) if $debug; +print_output_header() if $output_headers; +print_combined_data(); + + +## +## Program End +## +sub print_output_header() +{ + my @output = ("key"); + foreach my $file ( @input_files ) { + foreach my $column ( @values_columns ) { + my $column_name = ( exists $input_headers{$file}->{$column} ) ? + $input_headers{$file}->{$column} : + "V$column" ; + + push @output, $file_labels{$file} . "_" . $column_name; + } + } + print join($field_sep,@output),"\n" + or die "Output error: can't write output line: $!\n"; +} + +sub print_combined_data() +{ + my @keys = natsort keys %data ; + + foreach my $key ( @keys ) { + my @outputs; + + foreach my $file (@input_files) { + push @outputs, + (exists $data{$key}->{$file}) ? $data{$key}->{$file} : $filler_string; + } + + print join($field_sep,$key,@outputs),"\n" + or die "Output error: can't write output line: $!\n"; + } +} + +sub sanitize_filename($) +{ + my ($filename) = shift or croak "missing file name"; + my $file_ID = basename($filename); + $file_ID =~ s/\.\w+$//; # remove extension + $file_ID =~ s/^[^\w\.\-]+//; + $file_ID =~ s/[^\w\.\-]+$//; + $file_ID =~ s/[^\w\.\-]+/_/g; # sanitize bad characters + return $file_ID; +} + +sub read_input_file($) +{ + my ($filename) = shift or croak "Missing input file name"; + + my @value_indexes = map { $_-1 } @values_columns; #zero-based indexes for value columns + + open FILE, "<", $filename + or die "Error: can't open file '$filename': $!\n"; + + ## Read file's header + if ($input_headers) { + my $line = <FILE>; + chomp $line; + my @fields = split $field_sep, $line; + + my $num_input_fields = scalar(@fields); + die "Input error: file '$filename' line $. doesn't have enough columns (value column = $max_value_column, line has only $num_input_fields columns)\n" if $num_input_fields < $max_value_column ; + + foreach my $col (@values_columns) { + $input_headers{$filename}->{$col} = $fields[$col-1] ; + } + } + + + ## Read file's data + while ( my $line = <FILE> ) { + chomp $line; + my @fields = split $field_sep, $line; + + my $num_input_fields = scalar(@fields); + die "Input error: file '$filename' line $. doesn't have enough columns (key column = $key_column, line has only $num_input_fields columns)\n" if $num_input_fields < $key_column ; + die "Input error: file '$filename' line $. doesn't have enough columns (value column = $max_value_column, line has only $num_input_fields columns)\n" if $num_input_fields < $max_value_column ; + + + my $key = $fields[$key_column-1]; + my $value = join($field_sep, @fields[@value_indexes]); + + die "Input error: file '$filename' line $. have duplicated key '$key'.\n" + if (exists $data{$key}->{$filename} && !$ignore_duplicates) ; + $data{$key}->{$filename} = $value; + } + close FILE + or die "Error: can't write and close file '$filename': $!\n"; +} + +sub parse_command_line_parameters() +{ + my $values_columns_string; + + my $rc = GetOptions("help" => \&show_help, + "key|k=i" => \$key_column, + "values|v=s" => \$values_columns_string, + "t=s" => \$field_sep, + "in-header" => \$input_headers, + "out-header|h" => \$output_headers, + "H" => sub { $input_headers = 1 ; $output_headers = 1 ; }, + "ignore-dups" => \$ignore_duplicates, + "filler|f=s" => \$filler, + "examples" => \&show_examples, + "labels" => \$have_file_labels, + ); + die "Error: inalid command-line parameters.\n" unless $rc; + + die "Error: missing key column. use --key N. see --help for more details.\n" unless defined $key_column; + die "Error: Invalid key column ($key_column). Must be bigger than zero. see --help for more details.\n" if $key_column <= 0 ; + + die "Error: missing values column. use --values V1,V2,Vn. See --help for more details.\n" unless defined $values_columns_string; + @values_columns = split(/\s*,\s*/, $values_columns_string); + + die "Error: missing values column. use --values N,N,N. see --help for more details.\n" unless scalar(@values_columns)>0; + foreach my $v (@values_columns) { + die "Error: invalid value column ($v), please use only numbers>=1. see --help for more details.\n" + unless $v =~ /^\d+$/ && $v>=1; + + $max_value_column = $v unless defined $max_value_column && $max_value_column>$v; + } + + $filler_string = join($field_sep, map { $filler } @values_columns); + + + if ($have_file_labels) { + ## have file labels - each pair of parameters is a file/label pair. + die "Error: missing input files and labels\n" if scalar(@ARGV)==0; + die "Error: when using --labels, a pair of file names + labels is required (got odd number of argiments)\n" unless scalar(@ARGV)%2==0; + + while (@ARGV) { + my $filename = shift @ARGV; + my $label = shift @ARGV; + $label =~ s/^[^\.\w\-]+//; + $label =~ s/[^\.\w\-]+$//g; + $label =~ s/[^\.\w\-]+/_/g; + + my $file_ID = sanitize_filename($filename); + $file_labels{$filename} = $label; + push @input_files, $filename; + } + } else { + ## no file labels - the rest of the arguments are just file names; + @input_files = @ARGV; + die "Error: missing input files\n" if scalar(@input_files)==0; + die "Error: need more than one input file to join.\n" if scalar(@input_files)==1; + + foreach my $file (@input_files) { + my $file_ID = sanitize_filename($file); + $file_labels{$file} = $file_ID; + } + } + +} + +sub show_help() +{ + print<<EOF; +Multi-File join, version $version +Copyright (C) 2012 - A. Gordon (gordon at cshl dot edu) +License AGPLv3+: Affero GPL version 3 or later (http://www.gnu.org/licenses/agpl.html) + +Usage: + multijoin [OPTIONS] -k N -v V1,V2,Vn,.. FILE1 FILE2 ... FILEn + +Options: + + --help This helpful help screen. + + -k N + --key N Use column N as key column. + + -v V1,V2,Vn + --values V1,V2,Vn + Use columns V1,V2,Vn as value columns - those will be joined + According to the Key column. + Multiple columns can be specified. + + -t SEP Use SEP as field separator character (default: tab). + + -h + --out-header Add a header line to the output file. + + --in-header The input files have a header line. + The first line will not be joined. + if '--out-header' is also used, the output column headers will + be constructed based on the input header column names. + + -H + --headers Same as '--in-header --out-header' combined. + + --ignore-dups Ignore duplicated keys (within a file). + By default, duplicated keys cause an error. + + -f X + --filler X Fill missing values with X. + (Default: '$filler'). + + --labels When printning output headers with '-h', instead of using the file name, + use specific labels. + Each file name must be followed by a name. + + example (without labels): + \$ multijoin -h -k 1 -v 2 A.TXT B.TXT C.TXT + + example (with labels): + \$ multijoin -h --labels -k 1 -v 2 A.TXT Sample1 B.TXT SampleB C.TXT SampleC + + --examples Show detailed examples. + +EOF + exit(0); +} + +sub show_examples() +{ + print<<EOF; + +To join three files, based on the 4th column, and keeping the 7th,8th,9th columns: + +\$ head *.txt +==> AAA.txt <== +chr4 888449 890171 FBtr0308778 0 + 266 1527 1722 +chr4 972167 979017 FBtr0310651 0 - 3944 6428 6850 +chr4 972186 979017 FBtr0089229 0 - 3944 6428 6831 +chr4 972186 979017 FBtr0089231 0 - 3944 6428 6831 +chr4 972186 979017 FBtr0089233 0 - 3944 6428 6831 +chr4 995793 996435 FBtr0111046 0 + 7 166 642 +chr4 995793 997931 FBtr0111044 0 + 28 683 2138 +chr4 995793 997931 FBtr0111045 0 + 28 683 2138 +chr4 1034029 1047719 FBtr0089223 0 - 5293 13394 13690 + +==> BBB.txt <== +chr4 90286 134453 FBtr0309803 0 + 657 29084 44167 +chr4 251355 266499 FBtr0089116 0 + 56 1296 15144 +chr4 252050 266506 FBtr0308086 0 + 56 1296 14456 +chr4 252050 266506 FBtr0308087 0 + 56 1296 14456 +chr4 252053 266528 FBtr0300796 0 + 56 1296 14475 +chr4 252053 266528 FBtr0300800 0 + 56 1296 14475 +chr4 252055 266528 FBtr0300798 0 + 56 1296 14473 +chr4 252055 266528 FBtr0300799 0 + 56 1296 14473 +chr4 252541 266528 FBtr0300797 0 + 56 1296 13987 + +==> CCC.txt <== +chr4 972167 979017 FBtr0310651 0 - 9927 6738 6850 +chr4 972186 979017 FBtr0089229 0 - 9927 6738 6831 +chr4 972186 979017 FBtr0089231 0 - 9927 6738 6831 +chr4 972186 979017 FBtr0089233 0 - 9927 6738 6831 +chr4 995793 996435 FBtr0111046 0 + 5 304 642 +chr4 995793 997931 FBtr0111044 0 + 17 714 2138 +chr4 995793 997931 FBtr0111045 0 + 17 714 2138 +chr4 1034029 1047719 FBtr0089223 0 - 17646 13536 13690 + +\$ multijoin -h --key 4 --values 7,8,9 *.txt | head -n 10 +key AAA__V7 AAA__V8 AAA__V9 BBB__V7 BBB__V8 BBB__V9 CCC__V7 CCC__V8 CCC__V9 +FBtr0089116 0 0 0 56 1296 15144 0 0 0 +FBtr0089223 5293 13394 13690 0 0 0 17646 13536 13690 +FBtr0089229 3944 6428 6831 0 0 0 9927 6738 6831 +FBtr0089231 3944 6428 6831 0 0 0 9927 6738 6831 +FBtr0089233 3944 6428 6831 0 0 0 9927 6738 6831 +FBtr0111044 28 683 2138 0 0 0 17 714 2138 +FBtr0111045 28 683 2138 0 0 0 17 714 2138 +FBtr0111046 7 166 642 0 0 0 5 304 642 +FBtr0300796 0 0 0 56 1296 14475 0 0 0 + + + +EOF + exit(0); +}