# HG changeset patch # User bgruening # Date 1378371501 14400 # Node ID ec66f9d90ef0862d30504c410147402b4a165344 initial uploaded diff -r 000000000000 -r ec66f9d90ef0 awk.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/awk.xml Thu Sep 05 04:58:21 2013 -0400 @@ -0,0 +1,124 @@ + + + + gnu_awk + + + awk --sandbox -v FS=\$'\t' -v OFS=\$'\t' --re-interval -f '$awk_script' '$input' > '$output' + + + + + + + + + + + + + + + + + + + + + + + + + + $url_paste + + + + +**What it does** + +This tool runs the unix **awk** command on the selected data file. + +.. class:: infomark + +**TIP:** This tool uses the **extended regular** expression syntax (not the perl syntax). + + +**Further reading** + +- Awk by Example (http://www.ibm.com/developerworks/linux/library/l-awk1.html) +- Long AWK tutorial (http://www.grymoire.com/Unix/Awk.html) +- Learn AWK in 1 hour (http://www.selectorweb.com/awk.html) +- awk cheat-sheet (http://cbi.med.harvard.edu/people/peshkin/sb302/awk_cheatsheets.pdf) +- Collection of useful awk one-liners (http://student.northpark.edu/pemente/awk/awk1line.txt) + +----- + +**AWK programs** + +Most AWK programs consist of **patterns** (i.e. rules that match lines of text) and **actions** (i.e. commands to execute when a pattern matches a line). + +The basic form of AWK program is:: + + pattern { action 1; action 2; action 3; } + + + + + +**Pattern Examples** + +- **$2 == "chr3"** will match lines whose second column is the string 'chr3' +- **$5-$4>23** will match lines that after subtracting the value of the fourth column from the value of the fifth column, gives value alrger than 23. +- **/AG..AG/** will match lines that contain the regular expression **AG..AG** (meaning the characeters AG followed by any two characeters followed by AG). (This is the way to specify regular expressions on the entire line, similar to GREP.) +- **$7 ~ /A{4}U/** will match lines whose seventh column contains 4 consecutive A's followed by a U. 
(This is the way to specify regular expressions on a specific field.) +- **10000 < $4 && $4 < 20000** will match lines whose fourth column value is larger than 10,000 but smaller than 20,000 +- If no pattern is specified, all lines match (meaning the **action** part will be executed on all lines). + + + +**Action Examples** + +- **{ print }** or **{ print $0 }** will print the entire input line (the line that matched in **pattern**). **$0** is a special marker meaning 'the entire line'. +- **{ print $1, $4, $5 }** will print only the first, fourth and fifth fields of the input line. +- **{ print $4, $5-$4 }** will print the fourth column and the difference between the fifth and fourth column. (If the fourth column was start-position in the input file, and the fifth column was end-position - the output file will contain the start-position, and the length). +- If no action part is specified (not even the curly brackets) - the default action is to print the entire line. + + + + + + + + + +**AWK's Regular Expression Syntax** + +The select tool searches the data for lines containing or not containing a match to the given pattern. A Regular Expression is a pattern descibing a certain amount of text. + +- **( ) { } [ ] . * ? + \ ^ $** are all special characters. **\\** can be used to "escape" a special character, allowing that special character to be searched for. +- **^** matches the beginning of a string(but not an internal line). +- **(** .. **)** groups a particular pattern. +- **{** n or n, or n,m **}** specifies an expected number of repetitions of the preceding pattern. + + - **{n}** The preceding item is matched exactly n times. + - **{n,}** The preceding item ismatched n or more times. + - **{n,m}** The preceding item is matched at least n times but not more than m times. + +- **[** ... **]** creates a character class. Within the brackets, single characters can be placed. A dash (-) may be used to indicate a range such as **a-z**. 
+- **.** Matches any single character except a newline. +- ***** The preceding item will be matched zero or more times. +- **?** The preceding item is optional and matched at most once. +- **+** The preceding item will be matched one or more times. +- **^** has two meaning: + - matches the beginning of a line or string. + - indicates negation in a character class. For example, [^...] matches every character except the ones inside brackets. +- **$** matches the end of a line or string. +- **\|** Separates alternate possibilities. + + +**Note**: AWK uses extended regular expression syntax, not Perl syntax. **\\d**, **\\w**, **\\s** etc. are **not** supported. + + + diff -r 000000000000 -r ec66f9d90ef0 cut.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cut.xml Thu Sep 05 04:58:21 2013 -0400 @@ -0,0 +1,103 @@ + + columns from files + + gnu_coreutils + + + cut ${complement} ${cutwhat} '${list}' '${input}' > '${output}' + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +This tool runs the **cut** unix command, which extract or delete columns from a file. + +----- + +Field List Example: + +**1,3,7** - Cut specific fields/characters. + +**3-** - Cut from the third field/character to the end of the line. + +**2-5** - Cut from the second to the fifth field/character. + +**-8** - Cut from the first to the eight field/characters. 
+ + + + +Input Example:: + + fruit color price weight + apple red 1.4 0.5 + orange orange 1.5 0.3 + banana yellow 0.9 0.3 + + +Output Example ( **Keeping fields 1,3,4** ):: + + fruit price weight + apple 1.4 0.5 + orange 1.5 0.3 + banana 0.9 0.3 + +Output Example ( **Discarding field 2** ):: + + fruit price weight + apple 1.4 0.5 + orange 1.5 0.3 + banana 0.9 0.3 + +Output Example ( **Keeping 3 characters** ):: + + fru + app + ora + ban + + + diff -r 000000000000 -r ec66f9d90ef0 easyjoin --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/easyjoin Thu Sep 05 04:58:21 2013 -0400 @@ -0,0 +1,308 @@ +#!/usr/bin/env perl +## EASY Join - +## Join with automatic pre-sorting of both files +## Copyright (C) 2010 A. Gordon (gordon@cshl.edu) +## license: AGPLv3+ +use strict; +use warnings; +use Data::Dumper; +use Getopt::Long qw(:config bundling no_ignore_case_always); +use File::Temp qw/tempfile/; +use POSIX qw(locale_h); + +sub show_help(); +sub show_version(); +sub show_examples(); +sub parse_commandline_options(); +sub sort_file($$$); +sub join_files($$); +sub cleanup_files(@); + + +my $PROGRAM="easyjoin"; +my $VERSION="0.6.1"; + +my $debug=undef; +my $HEADER=undef; +my $IGNORE_CASE=undef; +my $FIELD_SEP=undef; +my $FILE1_KEY_COLUMN=1; +my $FILE2_KEY_COLUMN=1; +my @OUTPUT_SPECIFIERS=(); +my $OUTPUT_FORMAT=undef; +my $EMPTY_FILLER=undef; +my $SORT_BUFFER_SIZE=undef; +my $SORT_TEMP_DIR=undef; +my $input_filename1; +my $input_filename2; + +## +## Program Start +## +$ENV{'LANG'}="C";## "C" locale is critical for sorting and joining correctly +parse_commandline_options(); +my (undef, $tmp_filename1) = tempfile(OPEN=>0); +my (undef, $tmp_filename2) = tempfile(OPEN=>0); +sort_file($input_filename1, $tmp_filename1, $FILE1_KEY_COLUMN); +sort_file($input_filename2, $tmp_filename2, $FILE2_KEY_COLUMN); +my $join_exit_code = join_files($tmp_filename1, $tmp_filename2); +cleanup_files($tmp_filename1, $tmp_filename2); +exit($join_exit_code); + +## +## Program end +## + + +sub show_help() 
+{ +print< + This will show all values (paired and unpared) from both files, + Automatically formatting the columns, and using TAB as field separator. + You can override the empty filler (-e X) on the command line. + + --allh = Short-cut for: + -a 1 -a 2 -o auto -e . -t --header + Same as above, but will also respect the header line from both input files. + +JOIN-OPTIONS: + All of GNU join options are supported. + Run: + join --help + To see all possible joining options. + +SORT-OPTIONS: + The following options are supported for the intermediate sorting step: + + -S SIZE + --buffer-size SIZE = GNU sort's --buffer-size option. + + -T DIR + --temporary-directory DIR = GNU sort's --temporary-directory option. + + Run: + sort --help + To learn about these options. They might improve sorting performances for big files. + +FILE1 FILE2: + The two input files to be sorted, joined. + Unlike GNU join, joining STDIN is not supported. Both files must be real files. + + +NOTE About "--header" and "--auto-format": + The "--header" feature requires GNU coreutils version 8.6 or later. + The "-o auto" feature requires GNU coreutils version 8.10 or later. + +EOF + exit(0); +} + +sub show_version() +{ +print< sub { push @OUTPUT_SPECIFIERS, '-a', $_[1] }, + "e=s" => \$EMPTY_FILLER, + "ignore-case|i" => \$IGNORE_CASE, + "j=i" => sub { $FILE1_KEY_COLUMN = $_[1] ; $FILE2_KEY_COLUMN = $_[1] ; }, + "o=s" => \$OUTPUT_FORMAT, + "t=s" => \$FIELD_SEP, + "v=i" => sub { push @OUTPUT_SPECIFIERS, '-v', $_[1] }, + "1=i" => \$FILE1_KEY_COLUMN, + "2=i" => \$FILE2_KEY_COLUMN, + "debug" => \$debug, + "header" => \$HEADER, + "help" => \&show_help, + "version" => \&show_version, + "examples" => \&show_examples, + "buffer-size|S=s" => \$SORT_BUFFER_SIZE, + "temporary-directory|T=s" => \$SORT_TEMP_DIR, + "all" => sub { + push @OUTPUT_SPECIFIERS, "-a", 1, "-a", 2; + $FIELD_SEP = "\t"; + $OUTPUT_FORMAT = "auto"; + $EMPTY_FILLER = "." 
unless defined $EMPTY_FILLER; + }, + "allh" => sub { + push @OUTPUT_SPECIFIERS, "-a", 1, "-a", 2; + $FIELD_SEP = "\t"; + $OUTPUT_FORMAT = "auto"; + $HEADER=1; + $EMPTY_FILLER = "." unless defined $EMPTY_FILLER; + }, + ); + die "$PROGRAM: invalid command-line arguments.\n" unless $rc; + + ## We need two file names to join + my @INPUT_FILES = @ARGV; + die "$PROGRAM: missing operand: two file names to join\n" if (scalar(@INPUT_FILES)<2); + die "$PROGRAM: error: too many files specified (can only join two files)\n" if (scalar(@INPUT_FILES)>2); + die "$PROGRAM: error: input file can't be STDIN, please use a real file name.\n" if $INPUT_FILES[0] eq "-" || $INPUT_FILES[1] eq "-"; + die "$PROGRAM: error: input file 1 '" . $INPUT_FILES[0] . "' not found!" unless -e $INPUT_FILES[0]; + die "$PROGRAM: error: input file 2 '" . $INPUT_FILES[1] . "' not found!" unless -e $INPUT_FILES[1]; + + $input_filename1 = $INPUT_FILES[0]; + $input_filename2 = $INPUT_FILES[1]; +} + +sub sort_file($$$) +{ + my ($input_filename, $output_filename, $key_column) = @_; + + my @SORT_COMMAND; + push @SORT_COMMAND, $HEADER ? "sort-header" : "sort" ; + push @SORT_COMMAND, "-f" if $IGNORE_CASE; + push @SORT_COMMAND, "-k${key_column},${key_column}" ; + push @SORT_COMMAND, "--buffer-size", $SORT_BUFFER_SIZE if $SORT_BUFFER_SIZE; + push @SORT_COMMAND, "--temporary-directory", $SORT_TEMP_DIR if $SORT_TEMP_DIR; + push @SORT_COMMAND, "--output", $output_filename; + push @SORT_COMMAND, "--debugheader" if $debug && $HEADER; + push @SORT_COMMAND, "-t", $FIELD_SEP if $FIELD_SEP; + push @SORT_COMMAND, $input_filename; + + if ($debug) { + warn "$PROGRAM: Running sort on '$input_filename' => '$output_filename'\n"; + warn "$PROGRAM: Sort command line:\n"; + print STDERR Dumper(\@SORT_COMMAND), "\n"; + } + + my $sort_exit_code=1; + system(@SORT_COMMAND); + if ($? == -1) { + die "$PROGRAM: Error: failed to execute 'sort': $!\n"; + } + elsif ($? & 127) { + my $signal = ($? 
& 127); + kill 2, $$ if $signal == 2; ##if sort was interrupted (CTRL-C) - just pass it on and commit suicide + die "$PROGRAM: Error: 'sort' child-process died with signal $signal\n"; + } + else { + $sort_exit_code = ($? >> 8); + } + die "$PROGRAM: Error: 'sort' process failed, exit code $sort_exit_code\n" if $sort_exit_code!=0; +} + +sub join_files($$) +{ + my ($file1, $file2) = @_; + + my @join_command = qw/join/; + push @join_command, "--header" if $HEADER; + push @join_command, "--ignore-case" if $IGNORE_CASE; + push @join_command, "-t", $FIELD_SEP if $FIELD_SEP; + push @join_command, "-1", $FILE1_KEY_COLUMN if $FILE1_KEY_COLUMN; + push @join_command, "-2", $FILE2_KEY_COLUMN if $FILE2_KEY_COLUMN; + push @join_command, "-e", $EMPTY_FILLER if defined $EMPTY_FILLER; + push @join_command, "-o", $OUTPUT_FORMAT if $OUTPUT_FORMAT; + push @join_command, @OUTPUT_SPECIFIERS; + push @join_command, $file1, $file2; + + if ($debug) { + warn "$PROGRAM: Running join on '$file1' and '$file2'\n"; + warn "$PROGRAM: join command line:\n"; + print STDERR Dumper(\@join_command), "\n"; + } + + my $join_exit_code=1; + system(@join_command); + if ($? == -1) { + die "$PROGRAM: Error: failed to execute 'join': $!\n"; + } + elsif ($? & 127) { + my $signal = ($? & 127); + kill 2, $$ if $signal == 2; ##if join was interrupted (CTRL-C) - just pass it on and commit suicide + die "$PROGRAM: Error: 'join' child-process died with signal $signal\n"; + } + else { + $join_exit_code = ($? 
>> 8); + } + return $join_exit_code; +} + +sub cleanup_files(@) +{ + my (@files) = @_; + + foreach my $file (@files) { + if ($debug) { + warn "$PROGRAM: debug mode, not deleting temporary file '$file'\n"; + } else { + my $count = unlink $file; + warn "$PROGRAM: Error: failed to delete temporary file '$file': $!\n" if ($count != 1); + } + } +} diff -r 000000000000 -r ec66f9d90ef0 easyjoin.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/easyjoin.xml Thu Sep 05 04:58:21 2013 -0400 @@ -0,0 +1,95 @@ + + + gnu_coreutils + + two files + easyjoin $jointype + -t ' ' + $header + -e '$empty_string_filler' + -o auto + $ignore_case + -1 '$column1' + -2 '$column2' + "$input1" "$input2" + > '$output' + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +This tool joins two tabular files based on a common key column. + +----- + +**Example** + +**First file**:: + + Fruit Color + Apple red + Banana yellow + Orange orange + Melon green + +**Second File**:: + + Fruit Price + Orange 7 + Avocado 8 + Apple 4 + Banana 3 + +**Joining** both files, using **key column 1** and a **header line**, will return:: + + Fruit Color Price + Apple red 4 + Avocado . 8 + Banana yellow 3 + Melon green . + Orange orange 7 + +# Input files need not be sorted. +# The header line (**Fruit Color Price**) was joined and kept as first line. +# Missing values ( Avocado's color, missing from the first file ) are replaced with a period character. + +----- + +*easyjoin* was written by A. 
Gordon + + + diff -r 000000000000 -r ec66f9d90ef0 find_and_replace --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/find_and_replace Thu Sep 05 04:58:21 2013 -0400 @@ -0,0 +1,202 @@ +#!/usr/bin/env perl +use strict; +use warnings; +use Getopt::Std; + +sub parse_command_line(); +sub build_regex_string(); +sub usage(); + +my $input_file ; +my $output_file; +my $find_pattern ; +my $replace_pattern ; +my $find_complete_words ; +my $find_pattern_is_regex ; +my $find_in_specific_column ; +my $find_case_insensitive ; +my $replace_global ; +my $skip_first_line ; + + +## +## Program Start +## +usage() if @ARGV<2; +parse_command_line(); +my $regex_string = build_regex_string() ; + +# Allow first line to pass without filtering? +if ( $skip_first_line ) { + my $line = <$input_file>; + print $output_file $line ; +} + + +## +## Main loop +## + +## I LOVE PERL (and hate it, at the same time...) +## +## So what's going on with the self-compiling perl code? +## +## 1. The program gets the find-pattern and the replace-pattern from the user (as strings). +## 2. If both the find-pattern and replace-pattern are simple strings (not regex), +## it would be possible to pre-compile a regex (with qr//) and use it in a 's///' +## 3. If the find-pattern is a regex but the replace-pattern is a simple text string (with out back-references) +## it is still possible to pre-compile the regex and use it in a 's///' +## However, +## 4. If the replace-pattern contains back-references, pre-compiling is not possible. +## (in perl, you can't precompile a substitute regex). +## See these examples: +## http://www.perlmonks.org/?node_id=84420 +## http://stackoverflow.com/questions/125171/passing-a-regex-substitution-as-a-variable-in-perl +## +## The solution: +## we build the regex string as valid perl code (in 'build_regex()', stored in $regex_string ), +## Then eval() a new perl code that contains the substitution regex as inlined code. +## Gotta love perl! 
+ +my $perl_program ; +if ( $find_in_specific_column ) { + # Find & replace in specific column + + $perl_program = < ) { + chomp ; + my \@columns = split ; + + #not enough columns in this line - skip it + next if ( \@columns < $find_in_specific_column ) ; + + \$columns [ $find_in_specific_column - 1 ] =~ $regex_string ; + + print STDOUT join("\t", \@columns), "\n" ; + } +EOF + +} else { + # Find & replace the entire line + $perl_program = < ) { + $regex_string ; + print STDOUT; + } +EOF +} + + +# The dynamic perl code reads from STDIN and writes to STDOUT, +# so connect these handles (if the user didn't specifiy input / output +# file names, these might be already be STDIN/OUT, so the whole could be a no-op). +*STDIN = $input_file ; +*STDOUT = $output_file ; +eval $perl_program ; + + +## +## Program end +## + + +sub parse_command_line() +{ + my %opts ; + getopts('grsiwc:o:', \%opts) or die "$0: Invalid option specified\n"; + + die "$0: missing Find-Pattern argument\n" if (@ARGV==0); + $find_pattern = $ARGV[0]; + die "$0: missing Replace-Pattern argument\n" if (@ARGV==1); + $replace_pattern = $ARGV[1]; + + $find_complete_words = ( exists $opts{w} ) ; + $find_case_insensitive = ( exists $opts{i} ) ; + $skip_first_line = ( exists $opts{s} ) ; + $find_pattern_is_regex = ( exists $opts{r} ) ; + $replace_global = ( exists $opts{g} ) ; + + # Search in specific column ? + if ( defined $opts{c} ) { + $find_in_specific_column = $opts{c}; + + die "$0: invalid column number ($find_in_specific_column).\n" + unless $find_in_specific_column =~ /^\d+$/ ; + + die "$0: invalid column number ($find_in_specific_column).\n" + if $find_in_specific_column <= 0; + } + else { + $find_in_specific_column = 0 ; + } + + # Output File specified (instead of STDOUT) ? 
+ if ( defined $opts{o} ) { + my $filename = $opts{o}; + open $output_file, ">$filename" or die "$0: Failed to create output file '$filename': $!\n" ; + } else { + $output_file = *STDOUT ; + } + + + # Input file Specified (instead of STDIN) ? + if ( @ARGV>2 ) { + my $filename = $ARGV[2]; + open $input_file, "<$filename" or die "$0: Failed to open input file '$filename': $!\n" ; + } else { + $input_file = *STDIN; + } +} + +sub build_regex_string() +{ + my $find_string ; + my $replace_string ; + + if ( $find_pattern_is_regex ) { + $find_string = $find_pattern ; + $replace_string = $replace_pattern ; + } else { + $find_string = quotemeta $find_pattern ; + $replace_string = quotemeta $replace_pattern; + } + + if ( $find_complete_words ) { + $find_string = "\\b($find_string)\\b"; + } + + my $regex_string = "s/$find_string/$replace_string/"; + + $regex_string .= "i" if ( $find_case_insensitive ); + $regex_string .= "g" if ( $replace_global ) ; + + + return $regex_string; +} + +sub usage() +{ +print < + text + + find_and_replace + #if $searchwhere.choice == "column": + -c $searchwhere.column + #end if + -o $output + $caseinsensitive + $wholewords + $skip_first_line + $is_regex + '$url_paste' + '$file_data' + '$input' + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +This tool finds & replaces text in an input dataset. + +.. class:: infomark + +The **pattern to find** can be a simple text string, or a perl **regular expression** string (depending on *pattern is a regex* check-box). + +.. class:: infomark + +When using regular expressions, the **replace pattern** can contain back-references ( e.g. \\1 ) + +.. class:: infomark + +This tool uses Perl regular expression syntax. + +----- + +**Examples of *regular-expression* Find Patterns** + +- **HELLO** The word 'HELLO' (case sensitive). +- **AG.T** The letters A,G followed by any single character, followed by the letter T. 
+- **A{4,}** Four or more consecutive A's. +- **chr2[012]\\t** The words 'chr20' or 'chr21' or 'chr22' followed by a tab character. +- **hsa-mir-([^ ]+)** The text 'hsa-mir-' followed by one-or-more non-space characters. When using parenthesis, the matched content of the parenthesis can be accessed with **\1** in the **replace** pattern. + + +**Examples of Replace Patterns** + +- **WORLD** The word 'WORLD' will be placed whereever the find pattern was found. +- **FOO-&-BAR** Each time the find pattern is found, it will be surrounded with 'FOO-' at the begining and '-BAR' at the end. **$&** (dollar-ampersand) represents the matched find pattern. +- **$1** The text which matched the first parenthesis in the Find Pattern. + + +----- + +**Example 1** + +**Find Pattern:** HELLO +**Replace Pattern:** WORLD +**Regular Expression:** no +**Replace what:** entire line + +Every time the word HELLO is found, it will be replaced with the word WORLD. + +----- + +**Example 2** + +**Find Pattern:** ^chr +**Replace Pattern:** (empty) +**Regular Expression:** yes +**Replace what:** column 11 + +If column 11 (of every line) begins with ther letters 'chr', they will be removed. Effectively, it'll turn "chr4" into "4" and "chrXHet" into "XHet" + + +----- + +**Perl's Regular Expression Syntax** + +The Find & Replace tool searches the data for lines containing or not containing a match to the given pattern. A Regular Expression is a pattern descibing a certain amount of text. + +- **( ) { } [ ] . * ? + \\ ^ $** are all special characters. **\\** can be used to "escape" a special character, allowing that special character to be searched for. +- **^** matches the beginning of a string(but not an internal line). +- **(** .. **)** groups a particular pattern. +- **{** n or n, or n,m **}** specifies an expected number of repetitions of the preceding pattern. + + - **{n}** The preceding item is matched exactly n times. + - **{n,}** The preceding item ismatched n or more times. 
+ - **{n,m}** The preceding item is matched at least n times but not more than m times. + +- **[** ... **]** creates a character class. Within the brackets, single characters can be placed. A dash (-) may be used to indicate a range such as **a-z**. +- **.** Matches any single character except a newline. +- ***** The preceding item will be matched zero or more times. +- **?** The preceding item is optional and matched at most once. +- **+** The preceding item will be matched one or more times. +- **^** has two meaning: + - matches the beginning of a line or string. + - indicates negation in a character class. For example, [^...] matches every character except the ones inside brackets. +- **$** matches the end of a line or string. +- **\\|** Separates alternate possibilities. +- **\\d** matches a single digit +- **\\w** matches a single letter or digit or an underscore. +- **\\s** matches a single white-space (space or tabs). + + + + + diff -r 000000000000 -r ec66f9d90ef0 grep.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/grep.xml Thu Sep 05 04:58:21 2013 -0400 @@ -0,0 +1,144 @@ + + (grep) + + gnu_coreutils + gnu_grep + UNIX_TOOLS_SCRIPT_PATH + + + #if $color = "COLOR": + GREP_COLOR='1;34' grep --color=always -P "$@" -- "${url_paste}" '${input}' | \$UNIX_TOOLS_SCRIPT_PATH/ansi2html.sh > "${output}" + #else: + grep -P "$@" -- "${url_paste}" '${input}' | grep -v "^--$" > "${output}" + #end if + + ##grep_wrapper.sh '$input' '$output' '$url_paste' $color -A $lines_after -B $lines_before $invert $case_sensitive + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +This tool runs the unix **grep** command on the selected data file. + +.. class:: infomark + +**TIP:** This tool uses the **perl** regular expression syntax (same as running 'grep -P'). This is **NOT** the POSIX or POSIX-extended syntax (unlike the awk/sed tools). 
+ + +**Further reading** + +- Wikipedia's Regular Expression page (http://en.wikipedia.org/wiki/Regular_expression) +- Regular Expressions cheat-sheet (PDF) (http://www.addedbytes.com/cheat-sheets/download/regular-expressions-cheat-sheet-v2.pdf) +- Grep Tutorial (http://www.panix.com/~elflord/unix/grep.html) + +----- + +**Grep Examples** + +- **AGC.AAT** would match lines with AGC followed by any character, followed by AAT (e.g. **AGCQAAT**, **AGCPAAT**, **AGCwAAT**) +- **C{2,5}AGC** would match lines with 2 to 5 consecutive Cs followed by AGC +- **TTT.{4,10}AAA** would match lines with 3 Ts, followed by 4 to 10 characters (any characeters), followed by 3 As. +- **^chr([0-9A-Za-z])+** would match lines that begin with chromsomes, such as lines in a BED format file. +- **(ACGT){1,5}** would match at least 1 "ACGT" and at most 5 "ACGT" consecutively. +- **hsa|mmu** would match lines containing "hsa" or "mmu" (or both). + +----- + +**Regular Expression Syntax** + +The select tool searches the data for lines containing or not containing a match to the given pattern. A Regular Expression is a pattern descibing a certain amount of text. + +- **( ) { } [ ] . * ? + \ ^ $** are all special characters. **\\** can be used to "escape" a special character, allowing that special character to be searched for. +- **^** matches the beginning of a string(but not an internal line). +- **\\d** matches a digit, same as [0-9]. +- **\\D** matches a non-digit. +- **\\s** matches a whitespace character. +- **\\S** matches anything BUT a whitespace. +- **\\t** matches a tab. +- **\\w** matches an alphanumeric character ( A to Z, 0 to 9 and underscore ) +- **\\W** matches anything but an alphanumeric character. +- **(** .. **)** groups a particular pattern. +- **\\Z** matches the end of a string(but not a internal line). +- **{** n or n, or n,m **}** specifies an expected number of repetitions of the preceding pattern. + + - **{n}** The preceding item is matched exactly n times. 
+ - **{n,}** The preceding item ismatched n or more times. + - **{n,m}** The preceding item is matched at least n times but not more than m times. + +- **[** ... **]** creates a character class. Within the brackets, single characters can be placed. A dash (-) may be used to indicate a range such as **a-z**. +- **.** Matches any single character except a newline. +- ***** The preceding item will be matched zero or more times. +- **?** The preceding item is optional and matched at most once. +- **+** The preceding item will be matched one or more times. +- **^** has two meaning: + - matches the beginning of a line or string. + - indicates negation in a character class. For example, [^...] matches every character except the ones inside brackets. +- **$** matches the end of a line or string. +- **\|** Separates alternate possibilities. + + + + diff -r 000000000000 -r ec66f9d90ef0 head.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/head.xml Thu Sep 05 04:58:21 2013 -0400 @@ -0,0 +1,37 @@ + + lines from a dataset (head) + + gnu_coreutils + + + head --lines $complement$count '${infile}' > '${outfile}' + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +This tool runs the **head** unix command, which discards lines from the end of a file. 
+ + + diff -r 000000000000 -r ec66f9d90ef0 multijoin --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/multijoin Thu Sep 05 04:58:21 2013 -0400 @@ -0,0 +1,321 @@ +#!/usr/bin/env perl +use strict; +use warnings; +use Getopt::Long qw(:config no_ignore_case); +use Data::Dumper; +use Carp; +use File::Basename; +use Sort::Key::Natural qw(natsort); + +my $version = "0.1.1"; +my $field_sep = "\t"; +my $key_column; +my @values_columns; +my $max_value_column; +my @input_files; +my $input_headers ; +my $output_headers; +my $filler = "0"; +my $filler_string ; +my $ignore_duplicates; +my $debug = 0 ; +my %input_headers; +my $have_file_labels; +my %file_labels; + +sub parse_command_line_parameters(); +sub show_help(); +sub read_input_file($); +sub print_combined_data(); +sub sanitize_filename($); +sub print_output_header(); +sub show_examples(); + +## +## Program Start +## + +parse_command_line_parameters(); + +my %data; +foreach my $file (@input_files) { + read_input_file($file); +} +#print STDERR Dumper(\%input_headers),"\n"; +#print STDERR Dumper(\%data) if $debug; +print_output_header() if $output_headers; +print_combined_data(); + + +## +## Program End +## +sub print_output_header() +{ + my @output = ("key"); + foreach my $file ( @input_files ) { + foreach my $column ( @values_columns ) { + my $column_name = ( exists $input_headers{$file}->{$column} ) ? + $input_headers{$file}->{$column} : + "V$column" ; + + push @output, $file_labels{$file} . "_" . $column_name; + } + } + print join($field_sep,@output),"\n" + or die "Output error: can't write output line: $!\n"; +} + +sub print_combined_data() +{ + my @keys = natsort keys %data ; + + foreach my $key ( @keys ) { + my @outputs; + + foreach my $file (@input_files) { + push @outputs, + (exists $data{$key}->{$file}) ? 
$data{$key}->{$file} : $filler_string; + } + + print join($field_sep,$key,@outputs),"\n" + or die "Output error: can't write output line: $!\n"; + } +} + +sub sanitize_filename($) +{ + my ($filename) = shift or croak "missing file name"; + my $file_ID = basename($filename); + $file_ID =~ s/\.\w+$//; # remove extension + $file_ID =~ s/^[^\w\.\-]+//; + $file_ID =~ s/[^\w\.\-]+$//; + $file_ID =~ s/[^\w\.\-]+/_/g; # sanitize bad characters + return $file_ID; +} + +sub read_input_file($) +{ + my ($filename) = shift or croak "Missing input file name"; + + my @value_indexes = map { $_-1 } @values_columns; #zero-based indexes for value columns + + open FILE, "<", $filename + or die "Error: can't open file '$filename': $!\n"; + + ## Read file's header + if ($input_headers) { + my $line = ; + chomp $line; + my @fields = split $field_sep, $line; + + my $num_input_fields = scalar(@fields); + die "Input error: file '$filename' line $. doesn't have enough columns (value column = $max_value_column, line has only $num_input_fields columns)\n" if $num_input_fields < $max_value_column ; + + foreach my $col (@values_columns) { + $input_headers{$filename}->{$col} = $fields[$col-1] ; + } + } + + + ## Read file's data + while ( my $line = ) { + chomp $line; + my @fields = split $field_sep, $line; + + my $num_input_fields = scalar(@fields); + die "Input error: file '$filename' line $. doesn't have enough columns (key column = $key_column, line has only $num_input_fields columns)\n" if $num_input_fields < $key_column ; + die "Input error: file '$filename' line $. doesn't have enough columns (value column = $max_value_column, line has only $num_input_fields columns)\n" if $num_input_fields < $max_value_column ; + + + my $key = $fields[$key_column-1]; + my $value = join($field_sep, @fields[@value_indexes]); + + die "Input error: file '$filename' line $. 
have duplicated key '$key'.\n" + if (exists $data{$key}->{$filename} && !$ignore_duplicates) ; + $data{$key}->{$filename} = $value; + } + close FILE + or die "Error: can't write and close file '$filename': $!\n"; +} + +sub parse_command_line_parameters() +{ + my $values_columns_string; + + my $rc = GetOptions("help" => \&show_help, + "key|k=i" => \$key_column, + "values|v=s" => \$values_columns_string, + "t=s" => \$field_sep, + "in-header" => \$input_headers, + "out-header|h" => \$output_headers, + "H" => sub { $input_headers = 1 ; $output_headers = 1 ; }, + "ignore-dups" => \$ignore_duplicates, + "filler|f=s" => \$filler, + "examples" => \&show_examples, + "labels" => \$have_file_labels, + ); + die "Error: inalid command-line parameters.\n" unless $rc; + + die "Error: missing key column. use --key N. see --help for more details.\n" unless defined $key_column; + die "Error: Invalid key column ($key_column). Must be bigger than zero. see --help for more details.\n" if $key_column <= 0 ; + + die "Error: missing values column. use --values V1,V2,Vn. See --help for more details.\n" unless defined $values_columns_string; + @values_columns = split(/\s*,\s*/, $values_columns_string); + + die "Error: missing values column. use --values N,N,N. see --help for more details.\n" unless scalar(@values_columns)>0; + foreach my $v (@values_columns) { + die "Error: invalid value column ($v), please use only numbers>=1. see --help for more details.\n" + unless $v =~ /^\d+$/ && $v>=1; + + $max_value_column = $v unless defined $max_value_column && $max_value_column>$v; + } + + $filler_string = join($field_sep, map { $filler } @values_columns); + + + if ($have_file_labels) { + ## have file labels - each pair of parameters is a file/label pair. 
+ die "Error: missing input files and labels\n" if scalar(@ARGV)==0; + die "Error: when using --labels, a pair of file names + labels is required (got odd number of argiments)\n" unless scalar(@ARGV)%2==0; + + while (@ARGV) { + my $filename = shift @ARGV; + my $label = shift @ARGV; + $label =~ s/^[^\.\w\-]+//; + $label =~ s/[^\.\w\-]+$//g; + $label =~ s/[^\.\w\-]+/_/g; + + my $file_ID = sanitize_filename($filename); + $file_labels{$filename} = $label; + push @input_files, $filename; + } + } else { + ## no file labels - the rest of the arguments are just file names; + @input_files = @ARGV; + die "Error: missing input files\n" if scalar(@input_files)==0; + die "Error: need more than one input file to join.\n" if scalar(@input_files)==1; + + foreach my $file (@input_files) { + my $file_ID = sanitize_filename($file); + $file_labels{$file} = $file_ID; + } + } + +} + +sub show_help() +{ + print< AAA.txt <== +chr4 888449 890171 FBtr0308778 0 + 266 1527 1722 +chr4 972167 979017 FBtr0310651 0 - 3944 6428 6850 +chr4 972186 979017 FBtr0089229 0 - 3944 6428 6831 +chr4 972186 979017 FBtr0089231 0 - 3944 6428 6831 +chr4 972186 979017 FBtr0089233 0 - 3944 6428 6831 +chr4 995793 996435 FBtr0111046 0 + 7 166 642 +chr4 995793 997931 FBtr0111044 0 + 28 683 2138 +chr4 995793 997931 FBtr0111045 0 + 28 683 2138 +chr4 1034029 1047719 FBtr0089223 0 - 5293 13394 13690 + +==> BBB.txt <== +chr4 90286 134453 FBtr0309803 0 + 657 29084 44167 +chr4 251355 266499 FBtr0089116 0 + 56 1296 15144 +chr4 252050 266506 FBtr0308086 0 + 56 1296 14456 +chr4 252050 266506 FBtr0308087 0 + 56 1296 14456 +chr4 252053 266528 FBtr0300796 0 + 56 1296 14475 +chr4 252053 266528 FBtr0300800 0 + 56 1296 14475 +chr4 252055 266528 FBtr0300798 0 + 56 1296 14473 +chr4 252055 266528 FBtr0300799 0 + 56 1296 14473 +chr4 252541 266528 FBtr0300797 0 + 56 1296 13987 + +==> CCC.txt <== +chr4 972167 979017 FBtr0310651 0 - 9927 6738 6850 +chr4 972186 979017 FBtr0089229 0 - 9927 6738 6831 +chr4 972186 979017 FBtr0089231 0 - 9927 
6738 6831 +chr4 972186 979017 FBtr0089233 0 - 9927 6738 6831 +chr4 995793 996435 FBtr0111046 0 + 5 304 642 +chr4 995793 997931 FBtr0111044 0 + 17 714 2138 +chr4 995793 997931 FBtr0111045 0 + 17 714 2138 +chr4 1034029 1047719 FBtr0089223 0 - 17646 13536 13690 + +\$ multijoin -h --key 4 --values 7,8,9 *.txt | head -n 10 +key AAA__V7 AAA__V8 AAA__V9 BBB__V7 BBB__V8 BBB__V9 CCC__V7 CCC__V8 CCC__V9 +FBtr0089116 0 0 0 56 1296 15144 0 0 0 +FBtr0089223 5293 13394 13690 0 0 0 17646 13536 13690 +FBtr0089229 3944 6428 6831 0 0 0 9927 6738 6831 +FBtr0089231 3944 6428 6831 0 0 0 9927 6738 6831 +FBtr0089233 3944 6428 6831 0 0 0 9927 6738 6831 +FBtr0111044 28 683 2138 0 0 0 17 714 2138 +FBtr0111045 28 683 2138 0 0 0 17 714 2138 +FBtr0111046 7 166 642 0 0 0 5 304 642 +FBtr0300796 0 0 0 56 1296 14475 0 0 0 + + + +EOF + exit(0); +} diff -r 000000000000 -r ec66f9d90ef0 multijoin.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/multijoin.xml Thu Sep 05 04:58:21 2013 -0400 @@ -0,0 +1,122 @@ + + (combine multiple files) + multijoin + --key '$key_column' + --values '$value_columns' + --filler '$filler' + $ignore_dups + $output_header + $input_header + #for $file in $files + '$file.filename' + #end for + > '$output' + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +This tool joins multiple tabular files based on a common key column. + +----- + +**Example** + +To join three files, based on the 4th column, and keeping the 7th,8th,9th columns: + +**First file (AAA)**:: + + chr4 888449 890171 FBtr0308778 0 + 266 1527 1722 + chr4 972167 979017 FBtr0310651 0 - 3944 6428 6850 + chr4 972186 979017 FBtr0089229 0 - 3944 6428 6831 + chr4 972186 979017 FBtr0089231 0 - 3944 6428 6831 + chr4 972186 979017 FBtr0089233 0 - 3944 6428 6831 + chr4 995793 996435 FBtr0111046 0 + 7 166 642 + chr4 995793 997931 FBtr0111044 0 + 28 683 2138 + chr4 995793 997931 FBtr0111045 0 + 28 683 2138 + chr4 1034029 1047719 FBtr0089223 0 - 5293 13394 13690 + ... 
+ + +**Second File (BBB)**:: + + chr4 90286 134453 FBtr0309803 0 + 657 29084 44167 + chr4 251355 266499 FBtr0089116 0 + 56 1296 15144 + chr4 252050 266506 FBtr0308086 0 + 56 1296 14456 + chr4 252050 266506 FBtr0308087 0 + 56 1296 14456 + chr4 252053 266528 FBtr0300796 0 + 56 1296 14475 + chr4 252053 266528 FBtr0300800 0 + 56 1296 14475 + chr4 252055 266528 FBtr0300798 0 + 56 1296 14473 + chr4 252055 266528 FBtr0300799 0 + 56 1296 14473 + chr4 252541 266528 FBtr0300797 0 + 56 1296 13987 + ... + +**Third file (CCC)**:: + + chr4 972167 979017 FBtr0310651 0 - 9927 6738 6850 + chr4 972186 979017 FBtr0089229 0 - 9927 6738 6831 + chr4 972186 979017 FBtr0089231 0 - 9927 6738 6831 + chr4 972186 979017 FBtr0089233 0 - 9927 6738 6831 + chr4 995793 996435 FBtr0111046 0 + 5 304 642 + chr4 995793 997931 FBtr0111044 0 + 17 714 2138 + chr4 995793 997931 FBtr0111045 0 + 17 714 2138 + chr4 1034029 1047719 FBtr0089223 0 - 17646 13536 13690 + ... + + +**Joining** the files, using **key column 4**, **value columns 7,8,9** and a **header line**, will return:: + + key AAA__V7 AAA__V8 AAA__V9 BBB__V7 BBB__V8 BBB__V9 CCC__V7 CCC__V8 CCC__V9 + FBtr0089116 0 0 0 56 1296 15144 0 0 0 + FBtr0089223 5293 13394 13690 0 0 0 17646 13536 13690 + FBtr0089229 3944 6428 6831 0 0 0 9927 6738 6831 + FBtr0089231 3944 6428 6831 0 0 0 9927 6738 6831 + FBtr0089233 3944 6428 6831 0 0 0 9927 6738 6831 + FBtr0111044 28 683 2138 0 0 0 17 714 2138 + FBtr0111045 28 683 2138 0 0 0 17 714 2138 + FBtr0111046 7 166 642 0 0 0 5 304 642 + FBtr0300796 0 0 0 56 1296 14475 0 0 0 + ... + + +# Input files need not be sorted. + +----- + +*multijoin* was written by A. 
Gordon (gordon at cshl dot edu) + + + diff -r 000000000000 -r ec66f9d90ef0 readme.rst --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/readme.rst Thu Sep 05 04:58:21 2013 -0400 @@ -0,0 +1,86 @@ +These are Galaxy wrappers for common unix text-processing tools +=============================================================== + +The initial work was done by Assaf Gordon and Greg Hannon's lab ( http://hannonlab.cshl.edu ) +in Cold Spring Harbor Laboratory ( http://www.cshl.edu ). + + +The tools are: + +* awk - The AWK programmning language ( http://www.gnu.org/software/gawk/ ) +* sed - Stream Editor ( http://sed.sf.net ) +* grep - Search files ( http://www.gnu.org/software/grep/ ) +* sort_columns - Sorting every line according to there columns +* GNU Coreutils programs ( http://www.gnu.org/software/coreutils/ ): + * sort - sort files + * join - join two files, based on common key field. + * cut - keep/discard fields from a file + * unsorted_uniq - keep unique/duplicated lines in a file + * sorted_uniq - keep unique/duplicated lines in a file + * head - keep the first X lines in a file. + * tail - keep the last X lines in a file. + +Few improvements over the standard tools: + + * EasyJoin - A Join tool that does not require pre-sorted the files ( https://github.com/agordon/filo/blob/scripts/src/scripts/easyjoin ) + * Multi-Join - Join multiple (>2) files ( https://github.com/agordon/filo/blob/scripts/src/scripts/multijoin ) + * Find_and_Replace - Find/Replace text in a line or specific column. + * Grep with Perl syntax - uses grep with Perl-Compatible regular expressions. + * HTML'd Grep - grep text in a file, and produced high-lighted HTML output, for easier viewing ( uses https://github.com/agordon/filo/blob/scripts/src/scripts/sort-header ) + + +Requirements +------------ + +1. Coreutils vesion 8.19 or later. +2. AWK version 4.0.1 or later. +3. SED version 4.2 *with* a special patch +4. 
Grep with PCRE support + +These will be installed automatically with the Galaxy Tool Shed. + + +------------------- +NOTE About Security +------------------- + +The included tools are secure (barring unintentional bugs): +The main concern might be executing system commands with awk's "system" and sed's "e" commands, +or reading/writing arbitrary files with awk's redirection and sed's "r/w" commands. +These commands are DISABLED using the "--sandbox" parameter to awk and sed. + +User trying to run an awk program similar to: + BEGIN { system("ls") } +Will get an error (in Galaxy) saying: + fatal: 'system' function not allowed in sandbox mode. + +User trying to run a SED program similar to: + 1els +will get an error (in Galaxy) saying: + sed: -e expression #1, char 2: e/r/w commands disabled in sandbox mode + +That being said, if you do find some vulnerability in these tools, please let me know and I'll try fix them. + +------------ +Installation +------------ + +Should be done with the Galaxy `Tool Shed`_. + +.. 
_`Tool Shed`: http://wiki.galaxyproject.org/Tool%20Shed + + +---- +TODO +---- + +- unit-tests +- uniqu will get a new --group funciton with the 8.22 release, its currently commended out +- also shuf will get a major improved performance with large files http://git.savannah.gnu.org/gitweb/?p=coreutils.git;a=commit;h=20d7bce0f7e57d9a98f0ee811e31c757e9fedfff + we can remove the random feature from sort and use shuf instead +- move some advanced settings under a conditional, for example the cut tools offers to cut bytes + + + + + diff -r 000000000000 -r ec66f9d90ef0 scripts/ansi2html.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/ansi2html.sh Thu Sep 05 04:58:21 2013 -0400 @@ -0,0 +1,331 @@ +#!/bin/sh + +# Convert ANSI (terminal) colours and attributes to HTML + +# Author: +# http://www.pixelbeat.org/docs/terminal_colours/ +# Examples: +# ls -l --color=always | ansi2html.sh > ls.html +# git show --color | ansi2html.sh > last_change.html +# Generally one can use the `script` util to capture full terminal output. +# Changes: +# V0.1, 24 Apr 2008, Initial release +# V0.2, 01 Jan 2009, Phil Harnish +# Support `git diff --color` output by +# matching ANSI codes that specify only +# bold or background colour. +# P@draigBrady.com +# Support `ls --color` output by stripping +# redundant leading 0s from ANSI codes. +# Support `grep --color=always` by stripping +# unhandled ANSI codes (specifically ^[[K). +# V0.3, 20 Mar 2009, http://eexpress.blog.ubuntu.org.cn/ +# Remove cat -v usage which mangled non ascii input. +# Cleanup regular expressions used. +# Support other attributes like reverse, ... +# P@draigBrady.com +# Correctly nest tags (even across lines). +# Add a command line option to use a dark background. +# Strip more terminal control codes. +# V0.4, 17 Sep 2009, P@draigBrady.com +# Handle codes with combined attributes and color. +# Handle isolated attributes with css. +# Strip more terminal control codes. 
+# V0.12, 12 Jul 2011 +# http://github.com/pixelb/scripts/commits/master/scripts/ansi2html.sh + +if [ "$1" = "--version" ]; then + echo "0.12" && exit +fi + +if [ "$1" = "--help" ]; then + echo "This utility converts ANSI codes in data passed to stdin" >&2 + echo "It has 2 optional parameters:" >&2 + echo " --bg=dark --palette=linux|solarized|tango|xterm" >&2 + echo "E.g.: ls -l --color=always | ansi2html.sh --bg=dark > ls.html" >&2 + exit +fi + +[ "$1" = "--bg=dark" ] && { dark_bg=yes; shift; } + +if [ "$1" = "--palette=solarized" ]; then + # See http://ethanschoonover.com/solarized + P0=073642; P1=D30102; P2=859900; P3=B58900; + P4=268BD2; P5=D33682; P6=2AA198; P7=EEE8D5; + P8=002B36; P9=CB4B16; P10=586E75; P11=657B83; + P12=839496; P13=6C71C4; P14=93A1A1; P15=FDF6E3; + shift; +elif [ "$1" = "--palette=solarized-xterm" ]; then + # Above mapped onto the xterm 256 color palette + P0=262626; P1=AF0000; P2=5F8700; P3=AF8700; + P4=0087FF; P5=AF005F; P6=00AFAF; P7=E4E4E4; + P8=1C1C1C; P9=D75F00; P10=585858; P11=626262; + P12=808080; P13=5F5FAF; P14=8A8A8A; P15=FFFFD7; + shift; +elif [ "$1" = "--palette=tango" ]; then + # Gnome default + P0=000000; P1=CC0000; P2=4E9A06; P3=C4A000; + P4=3465A4; P5=75507B; P6=06989A; P7=D3D7CF; + P8=555753; P9=EF2929; P10=8AE234; P11=FCE94F; + P12=729FCF; P13=AD7FA8; P14=34E2E2; P15=EEEEEC; + shift; +elif [ "$1" = "--palette=xterm" ]; then + P0=000000; P1=CD0000; P2=00CD00; P3=CDCD00; + P4=0000EE; P5=CD00CD; P6=00CDCD; P7=E5E5E5; + P8=7F7F7F; P9=FF0000; P10=00FF00; P11=FFFF00; + P12=5C5CFF; P13=FF00FF; P14=00FFFF; P15=FFFFFF; + shift; +else # linux console + P0=000000; P1=AA0000; P2=00AA00; P3=AA5500; + P4=0000AA; P5=AA00AA; P6=00AAAA; P7=AAAAAA; + P8=555555; P9=FF5555; P10=55FF55; P11=FFFF55; + P12=5555FF; P13=FF55FF; P14=55FFFF; P15=FFFFFF; + [ "$1" = "--palette=linux" ] && shift +fi + +[ "$1" = "--bg=dark" ] && { dark_bg=yes; shift; } + +echo -n " + + + + + + +
+'
+
+p='\x1b\['        #shortcut to match escape codes
+P="\(^[^°]*\)¡$p" #expression to match prepended codes below
+
+# Handle various xterm control sequences.
+# See /usr/share/doc/xterm-*/ctlseqs.txt
+sed "
+s#\x1b[^\x1b]*\x1b\\\##g  # strip anything between \e and ST
+s#\x1b][0-9]*;[^\a]*\a##g # strip any OSC (xterm title etc.)
+
+#handle carriage returns
+s#^.*\r\{1,\}\([^$]\)#\1#
+s#\r\$## # strip trailing \r
+
+# strip other non SGR escape sequences
+s#[\x07]##g
+s#\x1b[]>=\][0-9;]*##g
+s#\x1bP+.\{5\}##g
+s#${p}[0-9;?]*[^0-9;?m]##g
+
+#remove backspace chars and what they're backspacing over
+:rm_bs
+s#[^\x08]\x08##g; t rm_bs
+" |
+
+# Normalize the input before transformation
+sed "
+# escape HTML
+s#\&#\&amp;#g; s#>#\&gt;#g; s#<#\&lt;#g; s#\"#\&quot;#g
+
+# normalize SGR codes a little
+
+# split 256 colors out and mark so that they're not
+# recognised by the following 'split combined' line
+:e
+s#${p}\([0-9;]\{1,\}\);\([34]8;5;[0-9]\{1,3\}\)m#${p}\1m${p}¬\2m#g; t e
+s#${p}\([34]8;5;[0-9]\{1,3\}\)m#${p}¬\1m#g;
+
+:c
+s#${p}\([0-9]\{1,\}\);\([0-9;]\{1,\}\)m#${p}\1m${p}\2m#g; t c   # split combined
+s#${p}0\([0-7]\)#${p}\1#g                                 #strip leading 0
+s#${p}1m\(\(${p}[4579]m\)*\)#\1${p}1m#g                   #bold last (with clr)
+s#${p}m#${p}0m#g                                          #add leading 0 to norm
+
+# undo any 256 color marking
+s#${p}¬\([34]8;5;[0-9]\{1,3\}\)m#${p}\1m#g;
+
+# map 16 color codes to color + bold
+s#${p}9\([0-7]\)m#${p}3\1m${p}1m#g;
+s#${p}10\([0-7]\)m#${p}4\1m${p}1m#g;
+
+# change 'reset' code to a single char, and prepend a single char to
+# other codes so that we can easily do negative matching, as sed
+# does not support look behind expressions etc.
+s#°#\°#g; s#${p}0m#°#g
+s#¡#\¡#g; s#${p}[0-9;]*m#¡&#g
+" |
+
+# Convert SGR sequences to HTML
+sed "
+:ansi_to_span # replace ANSI codes with CSS classes
+t ansi_to_span # hack so t commands below only apply to preceding s cmd
+
+/^[^¡]*°/ { b span_end } # replace 'reset code' if no preceding code
+
+# common combinations to minimise html (optional)
+s#${P}3\([0-7]\)m¡${p}4\([0-7]\)m#\1<span class=\"f\2 b\3\">#;t span_count
+s#${P}4\([0-7]\)m¡${p}3\([0-7]\)m#\1<span class=\"f\3 b\2\">#;t span_count
+
+s#${P}1m#\1<span class=\"bold\">#;                            t span_count
+s#${P}4m#\1<span class=\"underline\">#;                       t span_count
+s#${P}5m#\1<span class=\"blink\">#;                           t span_count
+s#${P}7m#\1<span class=\"reverse\">#;                         t span_count
+s#${P}9m#\1<span class=\"line-through\">#;                    t span_count
+s#${P}3\([0-9]\)m#\1<span class=\"f\2\">#;                    t span_count
+s#${P}4\([0-9]\)m#\1<span class=\"b\2\">#;                    t span_count
+
+s#${P}38;5;\([0-9]\{1,3\}\)m#\1<span class=\"ef\2\">#;        t span_count
+s#${P}48;5;\([0-9]\{1,3\}\)m#\1<span class=\"eb\2\">#;        t span_count
+
+s#${P}[0-9;]*m#\1#g; t ansi_to_span # strip unhandled codes
+
+b # next line of input
+
+# add a corresponding span end flag
+:span_count
+x; s/^/s/; x
+b ansi_to_span
+
+# replace 'reset code' with correct number of </span> tags
+:span_end
+x
+/^s/ {
+  s/^.//
+  x
+  s#°#</span>°#
+  b span_end
+}
+x
+s#°##
+b ansi_to_span
+" |
+
+# Convert alternative character set
+# Note we convert here, as if we do at start we have to worry about avoiding
+# conversion of SGR codes etc., whereas doing here we only have to
+# avoid conversions of stuff between &...; or <...>
+#
+# Note we could use sed to do this based around:
+#   sed 'y/abcdefghijklmnopqrstuvwxyz{}`~/▒␉␌␍␊°±␤␋┘┐┌└┼⎺⎻─⎼⎽├┤┴┬│≤≥π£◆·/'
+# However that would be very awkward as we need to only conv some input.
+# The basic scheme that we do in the python script below is:
+#  1. enable transliterate once ¡ char seen
+#  2. disable once µ char seen (may be on diff line to ¡)
+#  3. never transliterate between &; or <> chars
+sed "
+# change 'smacs' and 'rmacs' to a single char so that we can easily do
+# negative matching, as sed does not support look behind expressions etc.
+# Note we don't use ° like above as that's part of the alternate charset.
+s#\x1b(0#¡#g;
+s#µ#\µ#g; s#\x1b(B#µ#g
+" |
+(
+python -c "
+# vim:fileencoding=utf8
+
+import sys
+import locale
+encoding=locale.getpreferredencoding()
+
+old='abcdefghijklmnopqrstuvwxyz{}\`~'
+new='▒␉␌␍␊°±␤␋┘┐┌└┼⎺⎻─⎼⎽├┤┴┬│≤≥π£◆·'
+new=unicode(new, 'utf-8')
+table=range(128)
+for o,n in zip(old, new): table[ord(o)]=n
+
+(STANDARD, ALTERNATIVE, HTML_TAG, HTML_ENTITY) = (0, 1, 2, 3)
+
+state = STANDARD
+last_mode = STANDARD
+for c in unicode(sys.stdin.read(), encoding):
+  if state == HTML_TAG:
+    if c == '>':
+      state = last_mode
+  elif state == HTML_ENTITY:
+    if c == ';':
+      state = last_mode
+  else:
+    if c == '<':
+      state = HTML_TAG
+    elif c == '&':
+      state = HTML_ENTITY
+    elif c == u'¡' and state == STANDARD:
+      state = ALTERNATIVE
+      last_mode = ALTERNATIVE
+      continue
+    elif c == u'µ' and state == ALTERNATIVE:
+      state = STANDARD
+      last_mode = STANDARD
+      continue
+    elif state == ALTERNATIVE:
+      c = c.translate(table)
+  sys.stdout.write(c.encode(encoding))
+" 2>/dev/null ||
+sed 's/[¡µ]//g' # just strip alternative flag chars
+)
+
+echo "
+ +" diff -r 000000000000 -r ec66f9d90ef0 sed.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sed.xml Thu Sep 05 04:58:21 2013 -0400 @@ -0,0 +1,110 @@ + + with sed + + gnu_sed + + + sed --sandbox -r $silent -f '$sed_script' '$input' > '$output' + + + + + + + + + + + + + + + + + + + + + $url_paste + + + + + + + + + + + + + + + + + + + + + +**What it does** + +This tool runs the unix **sed** command on the selected data file. + +.. class:: infomark + +**TIP:** This tool uses the **extended regular** expression syntax (same as running 'sed -r'). + + + +**Further reading** + +- Short sed tutorial (http://www.linuxhowtos.org/System/sed_tutorial.htm) +- Long sed tutorial (http://www.grymoire.com/Unix/Sed.html) +- sed faq with good examples (http://sed.sourceforge.net/sedfaq.html) +- sed cheat-sheet (http://www.catonmat.net/download/sed.stream.editor.cheat.sheet.pdf) +- Collection of useful sed one-liners (http://student.northpark.edu/pemente/sed/sed1line.txt) + +----- + +**Sed commands** + +The most useful sed command is **s** (substitute). + +**Examples** + +- **s/hsa//** will remove the first instance of 'hsa' in every line. +- **s/hsa//g** will remove all instances (beacuse of the **g**) of 'hsa' in every line. +- **s/A{4,}/--&--/g** will find sequences of 4 or more consecutive A's, and once found, will surround them with two dashes from each side. The **&** marker is a place holder for 'whatever matched the regular expression'. +- **s/hsa-mir-([^ ]+)/short name: \\1 full name: &/** will find strings such as 'hsa-mir-43a' (the regular expression is 'hsa-mir-' followed by non-space characters) and will replace it will string such as 'short name: 43a full name: hsa-mir-43a'. The **\\1** marker is a place holder for 'whatever matched the first parenthesis' (similar to perl's **$1**) . + + +**sed's Regular Expression Syntax** + +The select tool searches the data for lines containing or not containing a match to the given pattern. 
A Regular Expression is a pattern descibing a certain amount of text. + +- **( ) { } [ ] . * ? + \ ^ $** are all special characters. **\\** can be used to "escape" a special character, allowing that special character to be searched for. +- **^** matches the beginning of a string(but not an internal line). +- **(** .. **)** groups a particular pattern. +- **{** n or n, or n,m **}** specifies an expected number of repetitions of the preceding pattern. + + - **{n}** The preceding item is matched exactly n times. + - **{n,}** The preceding item ismatched n or more times. + - **{n,m}** The preceding item is matched at least n times but not more than m times. + +- **[** ... **]** creates a character class. Within the brackets, single characters can be placed. A dash (-) may be used to indicate a range such as **a-z**. +- **.** Matches any single character except a newline. +- ***** The preceding item will be matched zero or more times. +- **?** The preceding item is optional and matched at most once. +- **+** The preceding item will be matched one or more times. +- **^** has two meaning: + - matches the beginning of a line or string. + - indicates negation in a character class. For example, [^...] matches every character except the ones inside brackets. +- **$** matches the end of a line or string. +- **\|** Separates alternate possibilities. + + +**Note**: SED uses extended regular expression syntax, not Perl syntax. **\\d**, **\\w**, **\\s** etc. are **not** supported. 
+ + + diff -r 000000000000 -r ec66f9d90ef0 sort.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sort.xml Thu Sep 05 04:58:21 2013 -0400 @@ -0,0 +1,137 @@ + + + gnu_coreutils + gnu_sed + + + #if int($header) > 0: + (sed -u '${header}'q && sort $unique $ignore_case --stable -t ' ' + + #for $key in $sortkeys + '-k ${key.column}${key.order}${key.style},${key.column}' + #end for + + ) < '${infile}' > '${outfile}' + #else: + (sort $unique $ignore_case --stable -t ' ' + + #for $key in $sortkeys + '-k ${key.column}${key.order}${key.style},${key.column}' + #end for + + ) < '${infile}' > '${outfile}' + #end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +This tool sorts an input file. + +----- + +**Sorting Styles** + +* **Fast Numeric**: sort by numeric values. Handles integer values (e.g. 43, 134) and decimal-point values (e.g. 3.14). *Does not* handle scientific notation (e.g. -2.32e2). +* **General Numeric**: sort by numeric values. Handles all numeric notations (including scientific notation). Slower than *fast numeric*, so use only when necessary. +* **Natural Sort**: Sort in 'natural' order (natural to humans, not to computers). See example below. +* **Alphabetical sort**: Sort in strict alphabetical order. See example below. +* **Human-readable numbers**: Sort human readble numbers (e.g. 1G > 2M > 3K > 400) +* **Random order**: return lines in random order. + +------ + +**Example - Header line** + +**Input file** (note first line is a header line, should not be sorted):: + + Fruit Color Price + Banana Yellow 4.1 + Avocado Green 8.0 + Apple Red 3.0 + Melon Green 6.1 + +**Sorting** by **numeric order** on column **3**, with **header**, will return:: + + Fruit Color Price + Apple Red 3.0 + Banana Yellow 4.1 + Melon Green 6.1 + Avocado Green 8.0 + + +----- + +**Example - Natural vs. 
Alphabetical sorting** + +Given the following list:: + + chr4 + chr13 + chr1 + chr10 + chr20 + chr2 + +**Alphabetical sort** would produce the following sorted list:: + + chr1 + chr10 + chr13 + chr2 + chr20 + chr4 + +**Natural Sort** would produce the following sorted list:: + + chr1 + chr2 + chr4 + chr10 + chr13 + chr20 + + +.. class:: infomark + +If you're planning to use the file with another tool that expected sorted files (such as *join*), you should use the **Alphabetical sort**, not the **Natural Sort**. Natural sort order is easier for humans, but is unnatural for computer programs. + +----- + +*sort-header* is was written by A. Gordon ( gordon at cshl dot edu ) + + + diff -r 000000000000 -r ec66f9d90ef0 sort_rows.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sort_rows.xml Thu Sep 05 04:58:21 2013 -0400 @@ -0,0 +1,26 @@ + + according to their columns + python -c 'for line in ["\t".join(sorted(line.strip().split("\t"))) for line in open("$input").readlines() ]: print line' > $outfile + + + + + + + + + + + + +.. class:: infomark + +**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert* + +**What it does** + +That tool sorts each row in a TAB separated file, according to their columns. In other words: It is a sorted reordering of all columns. + + + + diff -r 000000000000 -r ec66f9d90ef0 sorted_uniq.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sorted_uniq.xml Thu Sep 05 04:58:21 2013 -0400 @@ -0,0 +1,55 @@ + + from sorted file + + gnu_coreutils + + + uniq + -f + $skipfields + $count + $repeated + $ignorecase + $uniqueonly + $input + + ## feature is not yet released, it will be in the next 8.22 version + ##--group=$group + > $output + + + + + + + + + + + + + + + + + + +This tool takes a sorted file and look for lines that are unique. + +.. class:: warningmark + +Please make sure your file is sorted, or else this tool will give you an erroneous output. + +.. 
class:: infomark + +You can sort your file using either the "Sort" tool in "Filter and Sort", or the "Sort" tool in "Unix Tools". + + + diff -r 000000000000 -r ec66f9d90ef0 tail.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tail.xml Thu Sep 05 04:58:21 2013 -0400 @@ -0,0 +1,25 @@ + + lines from a dataset (tail) + + gnu_coreutils + + + tail --lines $count '$input1' > '$output' + + + + + + + + + + + + +**What it does** + +This tool runs the **tail** unix command, which discards lines from the beginning of a file. + + + diff -r 000000000000 -r ec66f9d90ef0 tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Thu Sep 05 04:58:21 2013 -0400 @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + + $REPOSITORY_INSTALL_DIR/scripts + + diff -r 000000000000 -r ec66f9d90ef0 unsorted_uniq.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/unsorted_uniq.py Thu Sep 05 04:58:21 2013 -0400 @@ -0,0 +1,36 @@ +import sys +import subprocess + +""" + We only need that file because galaxy do not understand the -t $'\t' term. 
+ Otherwise that would be the right XML-only solution: + sort -u + $ignore_case + $is_numeric + -t \$'\t' + #if $adv_opts.adv_opts_selector=="advanced": + -k$adv_opts.column_start,$adv_opts.column_end + #end if + -o $outfile + $input +""" + +if sys.argv[1].strip() != 'false': + ignore_case = sys.argv[1] +else: + ignore_case = '' + +if sys.argv[2].strip() != 'false': + is_numeric = sys.argv[2] +else: + is_numeric = '' + +try: + col_start = sys.argv[3] + col_end = sys.argv[4] + com = "sort -u %s %s -t ' ' -k%s,%s -o %s %s" % (is_numeric, ignore_case, col_start, col_end, sys.argv[5], sys.argv[6]) +except: + # no advanced options selected + com = "sort -u %s %s -t ' ' -o %s %s" % (is_numeric, ignore_case, sys.argv[3], sys.argv[4]) + +subprocess.call(com, shell=True) diff -r 000000000000 -r ec66f9d90ef0 unsorted_uniq.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/unsorted_uniq.xml Thu Sep 05 04:58:21 2013 -0400 @@ -0,0 +1,79 @@ + + occurrences of each record + + gnu_coreutils + + + unique_lines.py + $ignore_case + $is_numeric + #if $adv_opts.adv_opts_selector=="advanced": + $adv_opts.column_start + $adv_opts.column_end + #end if + $outfile + $infile + + + + + + + + + + + + + + + + + + + + + + + + + + + .. class:: infomark + +**Syntax** + +This tool returns all unique lines using the 'sort -u' command. It can be used with unsorted files. +If you need additional options, like grouping or counting your unique results, please use the 'Unique lines from sorted file' tool. + +----- + +.. class:: infomark + +The input file needs to be tab separated. Please convert your file if necessary. + +----- + +**Example** + +- Input file:: + + chr1 10 100 gene1 + chr1 105 200 gene2 + chr1 10 100 gene1 + chr2 10 100 gene4 + chr2 1000 1900 gene5 + chr3 15 1656 gene6 + chr2 10 100 gene4 + +- Unique lines will result in:: + + chr1 10 100 gene1 + chr1 105 200 gene2 + chr2 10 100 gene4 + chr2 1000 1900 gene5 + chr3 15 1656 gene6 + + + +