# HG changeset patch
# User bgruening
# Date 1378371501 14400
# Node ID ec66f9d90ef0862d30504c410147402b4a165344
initial uploaded
diff -r 000000000000 -r ec66f9d90ef0 awk.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/awk.xml Thu Sep 05 04:58:21 2013 -0400
@@ -0,0 +1,124 @@
+
+
+
+ gnu_awk
+
+
+ awk --sandbox -v FS=\$'\t' -v OFS=\$'\t' --re-interval -f '$awk_script' '$input' > '$output'
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ $url_paste
+
+
+
+
+**What it does**
+
+This tool runs the unix **awk** command on the selected data file.
+
+.. class:: infomark
+
+**TIP:** This tool uses the **extended regular** expression syntax (not the perl syntax).
+
+
+**Further reading**
+
+- Awk by Example (http://www.ibm.com/developerworks/linux/library/l-awk1.html)
+- Long AWK tutorial (http://www.grymoire.com/Unix/Awk.html)
+- Learn AWK in 1 hour (http://www.selectorweb.com/awk.html)
+- awk cheat-sheet (http://cbi.med.harvard.edu/people/peshkin/sb302/awk_cheatsheets.pdf)
+- Collection of useful awk one-liners (http://student.northpark.edu/pemente/awk/awk1line.txt)
+
+-----
+
+**AWK programs**
+
+Most AWK programs consist of **patterns** (i.e. rules that match lines of text) and **actions** (i.e. commands to execute when a pattern matches a line).
+
+The basic form of AWK program is::
+
+ pattern { action 1; action 2; action 3; }
+
+
+
+
+
+**Pattern Examples**
+
+- **$2 == "chr3"** will match lines whose second column is the string 'chr3'
+- **$5-$4>23** will match lines that, after subtracting the value of the fourth column from the value of the fifth column, give a value larger than 23.
+- **/AG..AG/** will match lines that contain the regular expression **AG..AG** (meaning the characters AG followed by any two characters followed by AG). (This is the way to specify regular expressions on the entire line, similar to GREP.)
+- **$7 ~ /A{4}U/** will match lines whose seventh column contains 4 consecutive A's followed by a U. (This is the way to specify regular expressions on a specific field.)
+- **10000 < $4 && $4 < 20000** will match lines whose fourth column value is larger than 10,000 but smaller than 20,000
+- If no pattern is specified, all lines match (meaning the **action** part will be executed on all lines).
+
+
+
+**Action Examples**
+
+- **{ print }** or **{ print $0 }** will print the entire input line (the line that matched in **pattern**). **$0** is a special marker meaning 'the entire line'.
+- **{ print $1, $4, $5 }** will print only the first, fourth and fifth fields of the input line.
+- **{ print $4, $5-$4 }** will print the fourth column and the difference between the fifth and fourth column. (If the fourth column was start-position in the input file, and the fifth column was end-position - the output file will contain the start-position, and the length).
+- If no action part is specified (not even the curly brackets) - the default action is to print the entire line.
+
+
+
+
+
+
+
+
+
+**AWK's Regular Expression Syntax**
+
+The select tool searches the data for lines containing or not containing a match to the given pattern. A Regular Expression is a pattern describing a certain amount of text.
+
+- **( ) { } [ ] . * ? + \ ^ $** are all special characters. **\\** can be used to "escape" a special character, allowing that special character to be searched for.
+- **^** matches the beginning of a string(but not an internal line).
+- **(** .. **)** groups a particular pattern.
+- **{** n or n, or n,m **}** specifies an expected number of repetitions of the preceding pattern.
+
+ - **{n}** The preceding item is matched exactly n times.
+ - **{n,}** The preceding item is matched n or more times.
+ - **{n,m}** The preceding item is matched at least n times but not more than m times.
+
+- **[** ... **]** creates a character class. Within the brackets, single characters can be placed. A dash (-) may be used to indicate a range such as **a-z**.
+- **.** Matches any single character except a newline.
+- ***** The preceding item will be matched zero or more times.
+- **?** The preceding item is optional and matched at most once.
+- **+** The preceding item will be matched one or more times.
+- **^** has two meanings:
+ - matches the beginning of a line or string.
+ - indicates negation in a character class. For example, [^...] matches every character except the ones inside brackets.
+- **$** matches the end of a line or string.
+- **\|** Separates alternate possibilities.
+
+
+**Note**: AWK uses extended regular expression syntax, not Perl syntax. **\\d**, **\\w**, **\\s** etc. are **not** supported.
+
+
+
diff -r 000000000000 -r ec66f9d90ef0 cut.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cut.xml Thu Sep 05 04:58:21 2013 -0400
@@ -0,0 +1,103 @@
+
+ columns from files
+
+ gnu_coreutils
+
+
+ cut ${complement} ${cutwhat} '${list}' '${input}' > '${output}'
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool runs the **cut** unix command, which extracts or deletes columns from a file.
+
+-----
+
+Field List Example:
+
+**1,3,7** - Cut specific fields/characters.
+
+**3-** - Cut from the third field/character to the end of the line.
+
+**2-5** - Cut from the second to the fifth field/character.
+
+**-8** - Cut from the first to the eighth field/character.
+
+
+
+
+Input Example::
+
+ fruit color price weight
+ apple red 1.4 0.5
+ orange orange 1.5 0.3
+ banana yellow 0.9 0.3
+
+
+Output Example ( **Keeping fields 1,3,4** )::
+
+ fruit price weight
+ apple 1.4 0.5
+ orange 1.5 0.3
+ banana 0.9 0.3
+
+Output Example ( **Discarding field 2** )::
+
+ fruit price weight
+ apple 1.4 0.5
+ orange 1.5 0.3
+ banana 0.9 0.3
+
+Output Example ( **Keeping 3 characters** )::
+
+ fru
+ app
+ ora
+ ban
+
+
+
diff -r 000000000000 -r ec66f9d90ef0 easyjoin
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/easyjoin Thu Sep 05 04:58:21 2013 -0400
@@ -0,0 +1,308 @@
+#!/usr/bin/env perl
+## EASY Join -
+## Join with automatic pre-sorting of both files
+## Copyright (C) 2010 A. Gordon (gordon@cshl.edu)
+## license: AGPLv3+
+use strict;
+use warnings;
+use Data::Dumper;
+use Getopt::Long qw(:config bundling no_ignore_case_always);
+use File::Temp qw/tempfile/;
+use POSIX qw(locale_h);
+
+sub show_help();
+sub show_version();
+sub show_examples();
+sub parse_commandline_options();
+sub sort_file($$$);
+sub join_files($$);
+sub cleanup_files(@);
+
+
+my $PROGRAM="easyjoin";
+my $VERSION="0.6.1";
+
+my $debug=undef;
+my $HEADER=undef;
+my $IGNORE_CASE=undef;
+my $FIELD_SEP=undef;
+my $FILE1_KEY_COLUMN=1;
+my $FILE2_KEY_COLUMN=1;
+my @OUTPUT_SPECIFIERS=();
+my $OUTPUT_FORMAT=undef;
+my $EMPTY_FILLER=undef;
+my $SORT_BUFFER_SIZE=undef;
+my $SORT_TEMP_DIR=undef;
+my $input_filename1;
+my $input_filename2;
+
+##
+## Program Start
+##
+$ENV{'LANG'}="C";## "C" locale is critical for sorting and joining correctly
+parse_commandline_options();
+my (undef, $tmp_filename1) = tempfile(OPEN=>0);
+my (undef, $tmp_filename2) = tempfile(OPEN=>0);
+sort_file($input_filename1, $tmp_filename1, $FILE1_KEY_COLUMN);
+sort_file($input_filename2, $tmp_filename2, $FILE2_KEY_COLUMN);
+my $join_exit_code = join_files($tmp_filename1, $tmp_filename2);
+cleanup_files($tmp_filename1, $tmp_filename2);
+exit($join_exit_code);
+
+##
+## Program end
+##
+
+
+sub show_help()
+{
+print<
+ This will show all values (paired and unpaired) from both files,
+ Automatically formatting the columns, and using TAB as field separator.
+ You can override the empty filler (-e X) on the command line.
+
+ --allh = Short-cut for:
+ -a 1 -a 2 -o auto -e . -t --header
+ Same as above, but will also respect the header line from both input files.
+
+JOIN-OPTIONS:
+ All of GNU join options are supported.
+ Run:
+ join --help
+ To see all possible joining options.
+
+SORT-OPTIONS:
+ The following options are supported for the intermediate sorting step:
+
+ -S SIZE
+ --buffer-size SIZE = GNU sort's --buffer-size option.
+
+ -T DIR
+ --temporary-directory DIR = GNU sort's --temporary-directory option.
+
+ Run:
+ sort --help
+ To learn about these options. They might improve sorting performance for big files.
+
+FILE1 FILE2:
+ The two input files to be sorted, joined.
+ Unlike GNU join, joining STDIN is not supported. Both files must be real files.
+
+
+NOTE About "--header" and "--auto-format":
+ The "--header" feature requires GNU coreutils version 8.6 or later.
+ The "-o auto" feature requires GNU coreutils version 8.10 or later.
+
+EOF
+ exit(0);
+}
+
+sub show_version()
+{
+print< sub { push @OUTPUT_SPECIFIERS, '-a', $_[1] },
+ "e=s" => \$EMPTY_FILLER,
+ "ignore-case|i" => \$IGNORE_CASE,
+ "j=i" => sub { $FILE1_KEY_COLUMN = $_[1] ; $FILE2_KEY_COLUMN = $_[1] ; },
+ "o=s" => \$OUTPUT_FORMAT,
+ "t=s" => \$FIELD_SEP,
+ "v=i" => sub { push @OUTPUT_SPECIFIERS, '-v', $_[1] },
+ "1=i" => \$FILE1_KEY_COLUMN,
+ "2=i" => \$FILE2_KEY_COLUMN,
+ "debug" => \$debug,
+ "header" => \$HEADER,
+ "help" => \&show_help,
+ "version" => \&show_version,
+ "examples" => \&show_examples,
+ "buffer-size|S=s" => \$SORT_BUFFER_SIZE,
+ "temporary-directory|T=s" => \$SORT_TEMP_DIR,
+ "all" => sub {
+ push @OUTPUT_SPECIFIERS, "-a", 1, "-a", 2;
+ $FIELD_SEP = "\t";
+ $OUTPUT_FORMAT = "auto";
+ $EMPTY_FILLER = "." unless defined $EMPTY_FILLER;
+ },
+ "allh" => sub {
+ push @OUTPUT_SPECIFIERS, "-a", 1, "-a", 2;
+ $FIELD_SEP = "\t";
+ $OUTPUT_FORMAT = "auto";
+ $HEADER=1;
+ $EMPTY_FILLER = "." unless defined $EMPTY_FILLER;
+ },
+ );
+ die "$PROGRAM: invalid command-line arguments.\n" unless $rc;
+
+ ## We need two file names to join
+ my @INPUT_FILES = @ARGV;
+ die "$PROGRAM: missing operand: two file names to join\n" if (scalar(@INPUT_FILES)<2);
+ die "$PROGRAM: error: too many files specified (can only join two files)\n" if (scalar(@INPUT_FILES)>2);
+ die "$PROGRAM: error: input file can't be STDIN, please use a real file name.\n" if $INPUT_FILES[0] eq "-" || $INPUT_FILES[1] eq "-";
+ die "$PROGRAM: error: input file 1 '" . $INPUT_FILES[0] . "' not found!" unless -e $INPUT_FILES[0];
+ die "$PROGRAM: error: input file 2 '" . $INPUT_FILES[1] . "' not found!" unless -e $INPUT_FILES[1];
+
+ $input_filename1 = $INPUT_FILES[0];
+ $input_filename2 = $INPUT_FILES[1];
+}
+
+sub sort_file($$$)
+{
+ my ($input_filename, $output_filename, $key_column) = @_;
+
+ my @SORT_COMMAND;
+ push @SORT_COMMAND, $HEADER ? "sort-header" : "sort" ;
+ push @SORT_COMMAND, "-f" if $IGNORE_CASE;
+ push @SORT_COMMAND, "-k${key_column},${key_column}" ;
+ push @SORT_COMMAND, "--buffer-size", $SORT_BUFFER_SIZE if $SORT_BUFFER_SIZE;
+ push @SORT_COMMAND, "--temporary-directory", $SORT_TEMP_DIR if $SORT_TEMP_DIR;
+ push @SORT_COMMAND, "--output", $output_filename;
+ push @SORT_COMMAND, "--debugheader" if $debug && $HEADER;
+ push @SORT_COMMAND, "-t", $FIELD_SEP if $FIELD_SEP;
+ push @SORT_COMMAND, $input_filename;
+
+ if ($debug) {
+ warn "$PROGRAM: Running sort on '$input_filename' => '$output_filename'\n";
+ warn "$PROGRAM: Sort command line:\n";
+ print STDERR Dumper(\@SORT_COMMAND), "\n";
+ }
+
+ my $sort_exit_code=1;
+ system(@SORT_COMMAND);
+ if ($? == -1) {
+ die "$PROGRAM: Error: failed to execute 'sort': $!\n";
+ }
+ elsif ($? & 127) {
+ my $signal = ($? & 127);
+ kill 2, $$ if $signal == 2; ##if sort was interrupted (CTRL-C) - just pass it on and commit suicide
+ die "$PROGRAM: Error: 'sort' child-process died with signal $signal\n";
+ }
+ else {
+ $sort_exit_code = ($? >> 8);
+ }
+ die "$PROGRAM: Error: 'sort' process failed, exit code $sort_exit_code\n" if $sort_exit_code!=0;
+}
+
+sub join_files($$)
+{
+ my ($file1, $file2) = @_;
+
+ my @join_command = qw/join/;
+ push @join_command, "--header" if $HEADER;
+ push @join_command, "--ignore-case" if $IGNORE_CASE;
+ push @join_command, "-t", $FIELD_SEP if $FIELD_SEP;
+ push @join_command, "-1", $FILE1_KEY_COLUMN if $FILE1_KEY_COLUMN;
+ push @join_command, "-2", $FILE2_KEY_COLUMN if $FILE2_KEY_COLUMN;
+ push @join_command, "-e", $EMPTY_FILLER if defined $EMPTY_FILLER;
+ push @join_command, "-o", $OUTPUT_FORMAT if $OUTPUT_FORMAT;
+ push @join_command, @OUTPUT_SPECIFIERS;
+ push @join_command, $file1, $file2;
+
+ if ($debug) {
+ warn "$PROGRAM: Running join on '$file1' and '$file2'\n";
+ warn "$PROGRAM: join command line:\n";
+ print STDERR Dumper(\@join_command), "\n";
+ }
+
+ my $join_exit_code=1;
+ system(@join_command);
+ if ($? == -1) {
+ die "$PROGRAM: Error: failed to execute 'join': $!\n";
+ }
+ elsif ($? & 127) {
+ my $signal = ($? & 127);
+ kill 2, $$ if $signal == 2; ##if join was interrupted (CTRL-C) - just pass it on and commit suicide
+ die "$PROGRAM: Error: 'join' child-process died with signal $signal\n";
+ }
+ else {
+ $join_exit_code = ($? >> 8);
+ }
+ return $join_exit_code;
+}
+
+sub cleanup_files(@)
+{
+ my (@files) = @_;
+
+ foreach my $file (@files) {
+ if ($debug) {
+ warn "$PROGRAM: debug mode, not deleting temporary file '$file'\n";
+ } else {
+ my $count = unlink $file;
+ warn "$PROGRAM: Error: failed to delete temporary file '$file': $!\n" if ($count != 1);
+ }
+ }
+}
diff -r 000000000000 -r ec66f9d90ef0 easyjoin.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/easyjoin.xml Thu Sep 05 04:58:21 2013 -0400
@@ -0,0 +1,95 @@
+
+
+ gnu_coreutils
+
+ two files
+ easyjoin $jointype
+ -t ' '
+ $header
+ -e '$empty_string_filler'
+ -o auto
+ $ignore_case
+ -1 '$column1'
+ -2 '$column2'
+ "$input1" "$input2"
+ > '$output'
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool joins two tabular files based on a common key column.
+
+-----
+
+**Example**
+
+**First file**::
+
+ Fruit Color
+ Apple red
+ Banana yellow
+ Orange orange
+ Melon green
+
+**Second File**::
+
+ Fruit Price
+ Orange 7
+ Avocado 8
+ Apple 4
+ Banana 3
+
+**Joining** both files, using **key column 1** and a **header line**, will return::
+
+ Fruit Color Price
+ Apple red 4
+ Avocado . 8
+ Banana yellow 3
+ Melon green .
+ Orange orange 7
+
+# Input files need not be sorted.
+# The header line (**Fruit Color Price**) was joined and kept as first line.
+# Missing values ( Avocado's color, missing from the first file ) are replaced with a period character.
+
+-----
+
+*easyjoin* was written by A. Gordon
+
+
+
diff -r 000000000000 -r ec66f9d90ef0 find_and_replace
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/find_and_replace Thu Sep 05 04:58:21 2013 -0400
@@ -0,0 +1,202 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use Getopt::Std;
+
+sub parse_command_line();
+sub build_regex_string();
+sub usage();
+
+my $input_file ;
+my $output_file;
+my $find_pattern ;
+my $replace_pattern ;
+my $find_complete_words ;
+my $find_pattern_is_regex ;
+my $find_in_specific_column ;
+my $find_case_insensitive ;
+my $replace_global ;
+my $skip_first_line ;
+
+
+##
+## Program Start
+##
+usage() if @ARGV<2;
+parse_command_line();
+my $regex_string = build_regex_string() ;
+
+# Allow first line to pass without filtering?
+if ( $skip_first_line ) {
+ my $line = <$input_file>;
+ print $output_file $line ;
+}
+
+
+##
+## Main loop
+##
+
+## I LOVE PERL (and hate it, at the same time...)
+##
+## So what's going on with the self-compiling perl code?
+##
+## 1. The program gets the find-pattern and the replace-pattern from the user (as strings).
+## 2. If both the find-pattern and replace-pattern are simple strings (not regex),
+## it would be possible to pre-compile a regex (with qr//) and use it in a 's///'
+## 3. If the find-pattern is a regex but the replace-pattern is a simple text string (with out back-references)
+## it is still possible to pre-compile the regex and use it in a 's///'
+## However,
+## 4. If the replace-pattern contains back-references, pre-compiling is not possible.
+## (in perl, you can't precompile a substitute regex).
+## See these examples:
+## http://www.perlmonks.org/?node_id=84420
+## http://stackoverflow.com/questions/125171/passing-a-regex-substitution-as-a-variable-in-perl
+##
+## The solution:
+## we build the regex string as valid perl code (in 'build_regex()', stored in $regex_string ),
+## Then eval() a new perl code that contains the substitution regex as inlined code.
+## Gotta love perl!
+
+my $perl_program ;
+if ( $find_in_specific_column ) {
+ # Find & replace in specific column
+
+ $perl_program = < ) {
+ chomp ;
+ my \@columns = split ;
+
+ #not enough columns in this line - skip it
+ next if ( \@columns < $find_in_specific_column ) ;
+
+ \$columns [ $find_in_specific_column - 1 ] =~ $regex_string ;
+
+ print STDOUT join("\t", \@columns), "\n" ;
+ }
+EOF
+
+} else {
+ # Find & replace the entire line
+ $perl_program = < ) {
+ $regex_string ;
+ print STDOUT;
+ }
+EOF
+}
+
+
+# The dynamic perl code reads from STDIN and writes to STDOUT,
+# so connect these handles (if the user didn't specify input / output
+# file names, these might already be STDIN/OUT, so the whole thing could be a no-op).
+*STDIN = $input_file ;
+*STDOUT = $output_file ;
+eval $perl_program ;
+
+
+##
+## Program end
+##
+
+
+sub parse_command_line()
+{
+ my %opts ;
+ getopts('grsiwc:o:', \%opts) or die "$0: Invalid option specified\n";
+
+ die "$0: missing Find-Pattern argument\n" if (@ARGV==0);
+ $find_pattern = $ARGV[0];
+ die "$0: missing Replace-Pattern argument\n" if (@ARGV==1);
+ $replace_pattern = $ARGV[1];
+
+ $find_complete_words = ( exists $opts{w} ) ;
+ $find_case_insensitive = ( exists $opts{i} ) ;
+ $skip_first_line = ( exists $opts{s} ) ;
+ $find_pattern_is_regex = ( exists $opts{r} ) ;
+ $replace_global = ( exists $opts{g} ) ;
+
+ # Search in specific column ?
+ if ( defined $opts{c} ) {
+ $find_in_specific_column = $opts{c};
+
+ die "$0: invalid column number ($find_in_specific_column).\n"
+ unless $find_in_specific_column =~ /^\d+$/ ;
+
+ die "$0: invalid column number ($find_in_specific_column).\n"
+ if $find_in_specific_column <= 0;
+ }
+ else {
+ $find_in_specific_column = 0 ;
+ }
+
+ # Output File specified (instead of STDOUT) ?
+ if ( defined $opts{o} ) {
+ my $filename = $opts{o};
+ open $output_file, ">$filename" or die "$0: Failed to create output file '$filename': $!\n" ;
+ } else {
+ $output_file = *STDOUT ;
+ }
+
+
+ # Input file Specified (instead of STDIN) ?
+ if ( @ARGV>2 ) {
+ my $filename = $ARGV[2];
+ open $input_file, "<$filename" or die "$0: Failed to open input file '$filename': $!\n" ;
+ } else {
+ $input_file = *STDIN;
+ }
+}
+
+sub build_regex_string()
+{
+ my $find_string ;
+ my $replace_string ;
+
+ if ( $find_pattern_is_regex ) {
+ $find_string = $find_pattern ;
+ $replace_string = $replace_pattern ;
+ } else {
+ $find_string = quotemeta $find_pattern ;
+ $replace_string = quotemeta $replace_pattern;
+ }
+
+ if ( $find_complete_words ) {
+ $find_string = "\\b($find_string)\\b";
+ }
+
+ my $regex_string = "s/$find_string/$replace_string/";
+
+ $regex_string .= "i" if ( $find_case_insensitive );
+ $regex_string .= "g" if ( $replace_global ) ;
+
+
+ return $regex_string;
+}
+
+sub usage()
+{
+print <
+ text
+
+ find_and_replace
+ #if $searchwhere.choice == "column":
+ -c $searchwhere.column
+ #end if
+ -o $output
+ $caseinsensitive
+ $wholewords
+ $skip_first_line
+ $is_regex
+ '$url_paste'
+ '$file_data'
+ '$input'
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool finds & replaces text in an input dataset.
+
+.. class:: infomark
+
+The **pattern to find** can be a simple text string, or a perl **regular expression** string (depending on *pattern is a regex* check-box).
+
+.. class:: infomark
+
+When using regular expressions, the **replace pattern** can contain back-references ( e.g. \\1 )
+
+.. class:: infomark
+
+This tool uses Perl regular expression syntax.
+
+-----
+
+**Examples of *regular-expression* Find Patterns**
+
+- **HELLO** The word 'HELLO' (case sensitive).
+- **AG.T** The letters A,G followed by any single character, followed by the letter T.
+- **A{4,}** Four or more consecutive A's.
+- **chr2[012]\\t** The words 'chr20' or 'chr21' or 'chr22' followed by a tab character.
+- **hsa-mir-([^ ]+)** The text 'hsa-mir-' followed by one-or-more non-space characters. When using parenthesis, the matched content of the parenthesis can be accessed with **\1** in the **replace** pattern.
+
+
+**Examples of Replace Patterns**
+
+- **WORLD** The word 'WORLD' will be placed wherever the find pattern was found.
+- **FOO-&-BAR** Each time the find pattern is found, it will be surrounded with 'FOO-' at the beginning and '-BAR' at the end. **$&** (dollar-ampersand) represents the matched find pattern.
+- **$1** The text which matched the first parenthesis in the Find Pattern.
+
+
+-----
+
+**Example 1**
+
+**Find Pattern:** HELLO
+**Replace Pattern:** WORLD
+**Regular Expression:** no
+**Replace what:** entire line
+
+Every time the word HELLO is found, it will be replaced with the word WORLD.
+
+-----
+
+**Example 2**
+
+**Find Pattern:** ^chr
+**Replace Pattern:** (empty)
+**Regular Expression:** yes
+**Replace what:** column 11
+
+If column 11 (of every line) begins with the letters 'chr', they will be removed. Effectively, it'll turn "chr4" into "4" and "chrXHet" into "XHet"
+
+
+-----
+
+**Perl's Regular Expression Syntax**
+
+The Find & Replace tool searches the data for lines containing or not containing a match to the given pattern. A Regular Expression is a pattern describing a certain amount of text.
+
+- **( ) { } [ ] . * ? + \\ ^ $** are all special characters. **\\** can be used to "escape" a special character, allowing that special character to be searched for.
+- **^** matches the beginning of a string(but not an internal line).
+- **(** .. **)** groups a particular pattern.
+- **{** n or n, or n,m **}** specifies an expected number of repetitions of the preceding pattern.
+
+ - **{n}** The preceding item is matched exactly n times.
+ - **{n,}** The preceding item is matched n or more times.
+ - **{n,m}** The preceding item is matched at least n times but not more than m times.
+
+- **[** ... **]** creates a character class. Within the brackets, single characters can be placed. A dash (-) may be used to indicate a range such as **a-z**.
+- **.** Matches any single character except a newline.
+- ***** The preceding item will be matched zero or more times.
+- **?** The preceding item is optional and matched at most once.
+- **+** The preceding item will be matched one or more times.
+- **^** has two meanings:
+ - matches the beginning of a line or string.
+ - indicates negation in a character class. For example, [^...] matches every character except the ones inside brackets.
+- **$** matches the end of a line or string.
+- **\\|** Separates alternate possibilities.
+- **\\d** matches a single digit
+- **\\w** matches a single letter or digit or an underscore.
+- **\\s** matches a single white-space (space or tabs).
+
+
+
+
+
diff -r 000000000000 -r ec66f9d90ef0 grep.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/grep.xml Thu Sep 05 04:58:21 2013 -0400
@@ -0,0 +1,144 @@
+
+ (grep)
+
+ gnu_coreutils
+ gnu_grep
+ UNIX_TOOLS_SCRIPT_PATH
+
+
+ #if $color = "COLOR":
+ GREP_COLOR='1;34' grep --color=always -P "$@" -- "${url_paste}" '${input}' | \$UNIX_TOOLS_SCRIPT_PATH/ansi2html.sh > "${output}"
+ #else:
+ grep -P "$@" -- "${url_paste}" '${input}' | grep -v "^--$" > "${output}"
+ #end if
+
+ ##grep_wrapper.sh '$input' '$output' '$url_paste' $color -A $lines_after -B $lines_before $invert $case_sensitive
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool runs the unix **grep** command on the selected data file.
+
+.. class:: infomark
+
+**TIP:** This tool uses the **perl** regular expression syntax (same as running 'grep -P'). This is **NOT** the POSIX or POSIX-extended syntax (unlike the awk/sed tools).
+
+
+**Further reading**
+
+- Wikipedia's Regular Expression page (http://en.wikipedia.org/wiki/Regular_expression)
+- Regular Expressions cheat-sheet (PDF) (http://www.addedbytes.com/cheat-sheets/download/regular-expressions-cheat-sheet-v2.pdf)
+- Grep Tutorial (http://www.panix.com/~elflord/unix/grep.html)
+
+-----
+
+**Grep Examples**
+
+- **AGC.AAT** would match lines with AGC followed by any character, followed by AAT (e.g. **AGCQAAT**, **AGCPAAT**, **AGCwAAT**)
+- **C{2,5}AGC** would match lines with 2 to 5 consecutive Cs followed by AGC
+- **TTT.{4,10}AAA** would match lines with 3 Ts, followed by 4 to 10 characters (any characters), followed by 3 As.
+- **^chr([0-9A-Za-z])+** would match lines that begin with chromosomes, such as lines in a BED format file.
+- **(ACGT){1,5}** would match at least 1 "ACGT" and at most 5 "ACGT" consecutively.
+- **hsa|mmu** would match lines containing "hsa" or "mmu" (or both).
+
+-----
+
+**Regular Expression Syntax**
+
+The select tool searches the data for lines containing or not containing a match to the given pattern. A Regular Expression is a pattern describing a certain amount of text.
+
+- **( ) { } [ ] . * ? + \ ^ $** are all special characters. **\\** can be used to "escape" a special character, allowing that special character to be searched for.
+- **^** matches the beginning of a string(but not an internal line).
+- **\\d** matches a digit, same as [0-9].
+- **\\D** matches a non-digit.
+- **\\s** matches a whitespace character.
+- **\\S** matches anything BUT a whitespace.
+- **\\t** matches a tab.
+- **\\w** matches an alphanumeric character ( A to Z, 0 to 9 and underscore )
+- **\\W** matches anything but an alphanumeric character.
+- **(** .. **)** groups a particular pattern.
+- **\\Z** matches the end of a string(but not a internal line).
+- **{** n or n, or n,m **}** specifies an expected number of repetitions of the preceding pattern.
+
+ - **{n}** The preceding item is matched exactly n times.
+ - **{n,}** The preceding item is matched n or more times.
+ - **{n,m}** The preceding item is matched at least n times but not more than m times.
+
+- **[** ... **]** creates a character class. Within the brackets, single characters can be placed. A dash (-) may be used to indicate a range such as **a-z**.
+- **.** Matches any single character except a newline.
+- ***** The preceding item will be matched zero or more times.
+- **?** The preceding item is optional and matched at most once.
+- **+** The preceding item will be matched one or more times.
+- **^** has two meanings:
+ - matches the beginning of a line or string.
+ - indicates negation in a character class. For example, [^...] matches every character except the ones inside brackets.
+- **$** matches the end of a line or string.
+- **\|** Separates alternate possibilities.
+
+
+
+
diff -r 000000000000 -r ec66f9d90ef0 head.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/head.xml Thu Sep 05 04:58:21 2013 -0400
@@ -0,0 +1,37 @@
+
+ lines from a dataset (head)
+
+ gnu_coreutils
+
+
+ head --lines $complement$count '${infile}' > '${outfile}'
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool runs the **head** unix command, which discards lines from the end of a file.
+
+
+
diff -r 000000000000 -r ec66f9d90ef0 multijoin
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/multijoin Thu Sep 05 04:58:21 2013 -0400
@@ -0,0 +1,321 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use Getopt::Long qw(:config no_ignore_case);
+use Data::Dumper;
+use Carp;
+use File::Basename;
+use Sort::Key::Natural qw(natsort);
+
+my $version = "0.1.1";
+my $field_sep = "\t";
+my $key_column;
+my @values_columns;
+my $max_value_column;
+my @input_files;
+my $input_headers ;
+my $output_headers;
+my $filler = "0";
+my $filler_string ;
+my $ignore_duplicates;
+my $debug = 0 ;
+my %input_headers;
+my $have_file_labels;
+my %file_labels;
+
+sub parse_command_line_parameters();
+sub show_help();
+sub read_input_file($);
+sub print_combined_data();
+sub sanitize_filename($);
+sub print_output_header();
+sub show_examples();
+
+##
+## Program Start
+##
+
+parse_command_line_parameters();
+
+my %data;
+foreach my $file (@input_files) {
+ read_input_file($file);
+}
+#print STDERR Dumper(\%input_headers),"\n";
+#print STDERR Dumper(\%data) if $debug;
+print_output_header() if $output_headers;
+print_combined_data();
+
+
+##
+## Program End
+##
+sub print_output_header()
+{
+ my @output = ("key");
+ foreach my $file ( @input_files ) {
+ foreach my $column ( @values_columns ) {
+ my $column_name = ( exists $input_headers{$file}->{$column} ) ?
+ $input_headers{$file}->{$column} :
+ "V$column" ;
+
+ push @output, $file_labels{$file} . "_" . $column_name;
+ }
+ }
+ print join($field_sep,@output),"\n"
+ or die "Output error: can't write output line: $!\n";
+}
+
+sub print_combined_data()
+{
+ my @keys = natsort keys %data ;
+
+ foreach my $key ( @keys ) {
+ my @outputs;
+
+ foreach my $file (@input_files) {
+ push @outputs,
+ (exists $data{$key}->{$file}) ? $data{$key}->{$file} : $filler_string;
+ }
+
+ print join($field_sep,$key,@outputs),"\n"
+ or die "Output error: can't write output line: $!\n";
+ }
+}
+
+sub sanitize_filename($)
+{
+ my ($filename) = shift or croak "missing file name";
+ my $file_ID = basename($filename);
+ $file_ID =~ s/\.\w+$//; # remove extension
+ $file_ID =~ s/^[^\w\.\-]+//;
+ $file_ID =~ s/[^\w\.\-]+$//;
+ $file_ID =~ s/[^\w\.\-]+/_/g; # sanitize bad characters
+ return $file_ID;
+}
+
+sub read_input_file($)
+{
+ my ($filename) = shift or croak "Missing input file name";
+
+ my @value_indexes = map { $_-1 } @values_columns; #zero-based indexes for value columns
+
+ open FILE, "<", $filename
+ or die "Error: can't open file '$filename': $!\n";
+
+ ## Read file's header
+ if ($input_headers) {
+ my $line = ;
+ chomp $line;
+ my @fields = split $field_sep, $line;
+
+ my $num_input_fields = scalar(@fields);
+ die "Input error: file '$filename' line $. doesn't have enough columns (value column = $max_value_column, line has only $num_input_fields columns)\n" if $num_input_fields < $max_value_column ;
+
+ foreach my $col (@values_columns) {
+ $input_headers{$filename}->{$col} = $fields[$col-1] ;
+ }
+ }
+
+
+ ## Read file's data
+ while ( my $line = ) {
+ chomp $line;
+ my @fields = split $field_sep, $line;
+
+ my $num_input_fields = scalar(@fields);
+ die "Input error: file '$filename' line $. doesn't have enough columns (key column = $key_column, line has only $num_input_fields columns)\n" if $num_input_fields < $key_column ;
+ die "Input error: file '$filename' line $. doesn't have enough columns (value column = $max_value_column, line has only $num_input_fields columns)\n" if $num_input_fields < $max_value_column ;
+
+
+ my $key = $fields[$key_column-1];
+ my $value = join($field_sep, @fields[@value_indexes]);
+
+ die "Input error: file '$filename' line $. have duplicated key '$key'.\n"
+ if (exists $data{$key}->{$filename} && !$ignore_duplicates) ;
+ $data{$key}->{$filename} = $value;
+ }
+ close FILE
+ or die "Error: can't write and close file '$filename': $!\n";
+}
+
+sub parse_command_line_parameters()
+{
+ my $values_columns_string;
+
+ my $rc = GetOptions("help" => \&show_help,
+ "key|k=i" => \$key_column,
+ "values|v=s" => \$values_columns_string,
+ "t=s" => \$field_sep,
+ "in-header" => \$input_headers,
+ "out-header|h" => \$output_headers,
+ "H" => sub { $input_headers = 1 ; $output_headers = 1 ; },
+ "ignore-dups" => \$ignore_duplicates,
+ "filler|f=s" => \$filler,
+ "examples" => \&show_examples,
+ "labels" => \$have_file_labels,
+ );
+ die "Error: inalid command-line parameters.\n" unless $rc;
+
+ die "Error: missing key column. use --key N. see --help for more details.\n" unless defined $key_column;
+ die "Error: Invalid key column ($key_column). Must be bigger than zero. see --help for more details.\n" if $key_column <= 0 ;
+
+ die "Error: missing values column. use --values V1,V2,Vn. See --help for more details.\n" unless defined $values_columns_string;
+ @values_columns = split(/\s*,\s*/, $values_columns_string);
+
+ die "Error: missing values column. use --values N,N,N. see --help for more details.\n" unless scalar(@values_columns)>0;
+ foreach my $v (@values_columns) {
+ die "Error: invalid value column ($v), please use only numbers>=1. see --help for more details.\n"
+ unless $v =~ /^\d+$/ && $v>=1;
+
+ $max_value_column = $v unless defined $max_value_column && $max_value_column>$v;
+ }
+
+ $filler_string = join($field_sep, map { $filler } @values_columns);
+
+
+ if ($have_file_labels) {
+ ## have file labels - each pair of parameters is a file/label pair.
+ die "Error: missing input files and labels\n" if scalar(@ARGV)==0;
+ die "Error: when using --labels, a pair of file names + labels is required (got odd number of argiments)\n" unless scalar(@ARGV)%2==0;
+
+ while (@ARGV) {
+ my $filename = shift @ARGV;
+ my $label = shift @ARGV;
+ $label =~ s/^[^\.\w\-]+//;
+ $label =~ s/[^\.\w\-]+$//g;
+ $label =~ s/[^\.\w\-]+/_/g;
+
+ my $file_ID = sanitize_filename($filename);
+ $file_labels{$filename} = $label;
+ push @input_files, $filename;
+ }
+ } else {
+ ## no file labels - the rest of the arguments are just file names;
+ @input_files = @ARGV;
+ die "Error: missing input files\n" if scalar(@input_files)==0;
+ die "Error: need more than one input file to join.\n" if scalar(@input_files)==1;
+
+ foreach my $file (@input_files) {
+ my $file_ID = sanitize_filename($file);
+ $file_labels{$file} = $file_ID;
+ }
+ }
+
+}
+
+sub show_help()
+{
+ print< AAA.txt <==
+chr4 888449 890171 FBtr0308778 0 + 266 1527 1722
+chr4 972167 979017 FBtr0310651 0 - 3944 6428 6850
+chr4 972186 979017 FBtr0089229 0 - 3944 6428 6831
+chr4 972186 979017 FBtr0089231 0 - 3944 6428 6831
+chr4 972186 979017 FBtr0089233 0 - 3944 6428 6831
+chr4 995793 996435 FBtr0111046 0 + 7 166 642
+chr4 995793 997931 FBtr0111044 0 + 28 683 2138
+chr4 995793 997931 FBtr0111045 0 + 28 683 2138
+chr4 1034029 1047719 FBtr0089223 0 - 5293 13394 13690
+
+==> BBB.txt <==
+chr4 90286 134453 FBtr0309803 0 + 657 29084 44167
+chr4 251355 266499 FBtr0089116 0 + 56 1296 15144
+chr4 252050 266506 FBtr0308086 0 + 56 1296 14456
+chr4 252050 266506 FBtr0308087 0 + 56 1296 14456
+chr4 252053 266528 FBtr0300796 0 + 56 1296 14475
+chr4 252053 266528 FBtr0300800 0 + 56 1296 14475
+chr4 252055 266528 FBtr0300798 0 + 56 1296 14473
+chr4 252055 266528 FBtr0300799 0 + 56 1296 14473
+chr4 252541 266528 FBtr0300797 0 + 56 1296 13987
+
+==> CCC.txt <==
+chr4 972167 979017 FBtr0310651 0 - 9927 6738 6850
+chr4 972186 979017 FBtr0089229 0 - 9927 6738 6831
+chr4 972186 979017 FBtr0089231 0 - 9927 6738 6831
+chr4 972186 979017 FBtr0089233 0 - 9927 6738 6831
+chr4 995793 996435 FBtr0111046 0 + 5 304 642
+chr4 995793 997931 FBtr0111044 0 + 17 714 2138
+chr4 995793 997931 FBtr0111045 0 + 17 714 2138
+chr4 1034029 1047719 FBtr0089223 0 - 17646 13536 13690
+
+\$ multijoin -h --key 4 --values 7,8,9 *.txt | head -n 10
+key AAA__V7 AAA__V8 AAA__V9 BBB__V7 BBB__V8 BBB__V9 CCC__V7 CCC__V8 CCC__V9
+FBtr0089116 0 0 0 56 1296 15144 0 0 0
+FBtr0089223 5293 13394 13690 0 0 0 17646 13536 13690
+FBtr0089229 3944 6428 6831 0 0 0 9927 6738 6831
+FBtr0089231 3944 6428 6831 0 0 0 9927 6738 6831
+FBtr0089233 3944 6428 6831 0 0 0 9927 6738 6831
+FBtr0111044 28 683 2138 0 0 0 17 714 2138
+FBtr0111045 28 683 2138 0 0 0 17 714 2138
+FBtr0111046 7 166 642 0 0 0 5 304 642
+FBtr0300796 0 0 0 56 1296 14475 0 0 0
+
+
+
+EOF
+ exit(0);
+}
diff -r 000000000000 -r ec66f9d90ef0 multijoin.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/multijoin.xml Thu Sep 05 04:58:21 2013 -0400
@@ -0,0 +1,122 @@
+
+ (combine multiple files)
+ multijoin
+ --key '$key_column'
+ --values '$value_columns'
+ --filler '$filler'
+ $ignore_dups
+ $output_header
+ $input_header
+ #for $file in $files
+ '$file.filename'
+ #end for
+ > '$output'
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool joins multiple tabular files based on a common key column.
+
+-----
+
+**Example**
+
+To join three files, based on the 4th column, and keeping the 7th,8th,9th columns:
+
+**First file (AAA)**::
+
+ chr4 888449 890171 FBtr0308778 0 + 266 1527 1722
+ chr4 972167 979017 FBtr0310651 0 - 3944 6428 6850
+ chr4 972186 979017 FBtr0089229 0 - 3944 6428 6831
+ chr4 972186 979017 FBtr0089231 0 - 3944 6428 6831
+ chr4 972186 979017 FBtr0089233 0 - 3944 6428 6831
+ chr4 995793 996435 FBtr0111046 0 + 7 166 642
+ chr4 995793 997931 FBtr0111044 0 + 28 683 2138
+ chr4 995793 997931 FBtr0111045 0 + 28 683 2138
+ chr4 1034029 1047719 FBtr0089223 0 - 5293 13394 13690
+ ...
+
+
+**Second File (BBB)**::
+
+ chr4 90286 134453 FBtr0309803 0 + 657 29084 44167
+ chr4 251355 266499 FBtr0089116 0 + 56 1296 15144
+ chr4 252050 266506 FBtr0308086 0 + 56 1296 14456
+ chr4 252050 266506 FBtr0308087 0 + 56 1296 14456
+ chr4 252053 266528 FBtr0300796 0 + 56 1296 14475
+ chr4 252053 266528 FBtr0300800 0 + 56 1296 14475
+ chr4 252055 266528 FBtr0300798 0 + 56 1296 14473
+ chr4 252055 266528 FBtr0300799 0 + 56 1296 14473
+ chr4 252541 266528 FBtr0300797 0 + 56 1296 13987
+ ...
+
+**Third file (CCC)**::
+
+ chr4 972167 979017 FBtr0310651 0 - 9927 6738 6850
+ chr4 972186 979017 FBtr0089229 0 - 9927 6738 6831
+ chr4 972186 979017 FBtr0089231 0 - 9927 6738 6831
+ chr4 972186 979017 FBtr0089233 0 - 9927 6738 6831
+ chr4 995793 996435 FBtr0111046 0 + 5 304 642
+ chr4 995793 997931 FBtr0111044 0 + 17 714 2138
+ chr4 995793 997931 FBtr0111045 0 + 17 714 2138
+ chr4 1034029 1047719 FBtr0089223 0 - 17646 13536 13690
+ ...
+
+
+**Joining** the files, using **key column 4**, **value columns 7,8,9** and a **header line**, will return::
+
+ key AAA__V7 AAA__V8 AAA__V9 BBB__V7 BBB__V8 BBB__V9 CCC__V7 CCC__V8 CCC__V9
+ FBtr0089116 0 0 0 56 1296 15144 0 0 0
+ FBtr0089223 5293 13394 13690 0 0 0 17646 13536 13690
+ FBtr0089229 3944 6428 6831 0 0 0 9927 6738 6831
+ FBtr0089231 3944 6428 6831 0 0 0 9927 6738 6831
+ FBtr0089233 3944 6428 6831 0 0 0 9927 6738 6831
+ FBtr0111044 28 683 2138 0 0 0 17 714 2138
+ FBtr0111045 28 683 2138 0 0 0 17 714 2138
+ FBtr0111046 7 166 642 0 0 0 5 304 642
+ FBtr0300796 0 0 0 56 1296 14475 0 0 0
+ ...
+
+
+# Input files need not be sorted.
+
+-----
+
+*multijoin* was written by A. Gordon (gordon at cshl dot edu)
+
+
+
diff -r 000000000000 -r ec66f9d90ef0 readme.rst
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/readme.rst Thu Sep 05 04:58:21 2013 -0400
@@ -0,0 +1,86 @@
+These are Galaxy wrappers for common unix text-processing tools
+===============================================================
+
+The initial work was done by Assaf Gordon and Greg Hannon's lab ( http://hannonlab.cshl.edu )
+in Cold Spring Harbor Laboratory ( http://www.cshl.edu ).
+
+
+The tools are:
+
+* awk - The AWK programming language ( http://www.gnu.org/software/gawk/ )
+* sed - Stream Editor ( http://sed.sf.net )
+* grep - Search files ( http://www.gnu.org/software/grep/ )
+* sort_columns - Sorting every line according to its columns
+* GNU Coreutils programs ( http://www.gnu.org/software/coreutils/ ):
+ * sort - sort files
+ * join - join two files, based on common key field.
+ * cut - keep/discard fields from a file
+ * unsorted_uniq - keep unique/duplicated lines in a file
+ * sorted_uniq - keep unique/duplicated lines in a file
+ * head - keep the first X lines in a file.
+ * tail - keep the last X lines in a file.
+
+Few improvements over the standard tools:
+
+ * EasyJoin - A Join tool that does not require pre-sorting the files ( https://github.com/agordon/filo/blob/scripts/src/scripts/easyjoin )
+ * Multi-Join - Join multiple (>2) files ( https://github.com/agordon/filo/blob/scripts/src/scripts/multijoin )
+ * Find_and_Replace - Find/Replace text in a line or specific column.
+ * Grep with Perl syntax - uses grep with Perl-Compatible regular expressions.
+ * HTML'd Grep - grep text in a file, and produce highlighted HTML output for easier viewing ( uses https://github.com/agordon/filo/blob/scripts/src/scripts/sort-header )
+
+
+Requirements
+------------
+
+1. Coreutils version 8.19 or later.
+2. AWK version 4.0.1 or later.
+3. SED version 4.2 *with* a special patch
+4. Grep with PCRE support
+
+These will be installed automatically with the Galaxy Tool Shed.
+
+
+-------------------
+NOTE About Security
+-------------------
+
+The included tools are secure (barring unintentional bugs):
+The main concern might be executing system commands with awk's "system" and sed's "e" commands,
+or reading/writing arbitrary files with awk's redirection and sed's "r/w" commands.
+These commands are DISABLED using the "--sandbox" parameter to awk and sed.
+
+User trying to run an awk program similar to:
+ BEGIN { system("ls") }
+Will get an error (in Galaxy) saying:
+ fatal: 'system' function not allowed in sandbox mode.
+
+User trying to run a SED program similar to:
+ 1els
+will get an error (in Galaxy) saying:
+ sed: -e expression #1, char 2: e/r/w commands disabled in sandbox mode
+
+That being said, if you do find some vulnerability in these tools, please let me know and I'll try to fix them.
+
+------------
+Installation
+------------
+
+Should be done with the Galaxy `Tool Shed`_.
+
+.. _`Tool Shed`: http://wiki.galaxyproject.org/Tool%20Shed
+
+
+----
+TODO
+----
+
+- unit-tests
+- uniq will get a new --group function with the 8.22 release; it is currently commented out
+- also shuf will get majorly improved performance with large files http://git.savannah.gnu.org/gitweb/?p=coreutils.git;a=commit;h=20d7bce0f7e57d9a98f0ee811e31c757e9fedfff
+ we can remove the random feature from sort and use shuf instead
+- move some advanced settings under a conditional, for example the cut tools offers to cut bytes
+
+
+
+
+
diff -r 000000000000 -r ec66f9d90ef0 scripts/ansi2html.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/ansi2html.sh Thu Sep 05 04:58:21 2013 -0400
@@ -0,0 +1,331 @@
+#!/bin/sh
+
+# Convert ANSI (terminal) colours and attributes to HTML
+
+# Author:
+# http://www.pixelbeat.org/docs/terminal_colours/
+# Examples:
+# ls -l --color=always | ansi2html.sh > ls.html
+# git show --color | ansi2html.sh > last_change.html
+# Generally one can use the `script` util to capture full terminal output.
+# Changes:
+# V0.1, 24 Apr 2008, Initial release
+# V0.2, 01 Jan 2009, Phil Harnish
+# Support `git diff --color` output by
+# matching ANSI codes that specify only
+# bold or background colour.
+# P@draigBrady.com
+# Support `ls --color` output by stripping
+# redundant leading 0s from ANSI codes.
+# Support `grep --color=always` by stripping
+# unhandled ANSI codes (specifically ^[[K).
+# V0.3, 20 Mar 2009, http://eexpress.blog.ubuntu.org.cn/
+# Remove cat -v usage which mangled non ascii input.
+# Cleanup regular expressions used.
+# Support other attributes like reverse, ...
+# P@draigBrady.com
+# Correctly nest tags (even across lines).
+# Add a command line option to use a dark background.
+# Strip more terminal control codes.
+# V0.4, 17 Sep 2009, P@draigBrady.com
+# Handle codes with combined attributes and color.
+# Handle isolated attributes with css.
+# Strip more terminal control codes.
+# V0.12, 12 Jul 2011
+# http://github.com/pixelb/scripts/commits/master/scripts/ansi2html.sh
+
+if [ "$1" = "--version" ]; then
+ echo "0.12" && exit
+fi
+
+if [ "$1" = "--help" ]; then
+ echo "This utility converts ANSI codes in data passed to stdin" >&2
+ echo "It has 2 optional parameters:" >&2
+ echo " --bg=dark --palette=linux|solarized|tango|xterm" >&2
+ echo "E.g.: ls -l --color=always | ansi2html.sh --bg=dark > ls.html" >&2
+ exit
+fi
+
+[ "$1" = "--bg=dark" ] && { dark_bg=yes; shift; }
+
+if [ "$1" = "--palette=solarized" ]; then
+ # See http://ethanschoonover.com/solarized
+ P0=073642; P1=D30102; P2=859900; P3=B58900;
+ P4=268BD2; P5=D33682; P6=2AA198; P7=EEE8D5;
+ P8=002B36; P9=CB4B16; P10=586E75; P11=657B83;
+ P12=839496; P13=6C71C4; P14=93A1A1; P15=FDF6E3;
+ shift;
+elif [ "$1" = "--palette=solarized-xterm" ]; then
+ # Above mapped onto the xterm 256 color palette
+ P0=262626; P1=AF0000; P2=5F8700; P3=AF8700;
+ P4=0087FF; P5=AF005F; P6=00AFAF; P7=E4E4E4;
+ P8=1C1C1C; P9=D75F00; P10=585858; P11=626262;
+ P12=808080; P13=5F5FAF; P14=8A8A8A; P15=FFFFD7;
+ shift;
+elif [ "$1" = "--palette=tango" ]; then
+ # Gnome default
+ P0=000000; P1=CC0000; P2=4E9A06; P3=C4A000;
+ P4=3465A4; P5=75507B; P6=06989A; P7=D3D7CF;
+ P8=555753; P9=EF2929; P10=8AE234; P11=FCE94F;
+ P12=729FCF; P13=AD7FA8; P14=34E2E2; P15=EEEEEC;
+ shift;
+elif [ "$1" = "--palette=xterm" ]; then
+ P0=000000; P1=CD0000; P2=00CD00; P3=CDCD00;
+ P4=0000EE; P5=CD00CD; P6=00CDCD; P7=E5E5E5;
+ P8=7F7F7F; P9=FF0000; P10=00FF00; P11=FFFF00;
+ P12=5C5CFF; P13=FF00FF; P14=00FFFF; P15=FFFFFF;
+ shift;
+else # linux console
+ P0=000000; P1=AA0000; P2=00AA00; P3=AA5500;
+ P4=0000AA; P5=AA00AA; P6=00AAAA; P7=AAAAAA;
+ P8=555555; P9=FF5555; P10=55FF55; P11=FFFF55;
+ P12=5555FF; P13=FF55FF; P14=55FFFF; P15=FFFFFF;
+ [ "$1" = "--palette=linux" ] && shift
+fi
+
+[ "$1" = "--bg=dark" ] && { dark_bg=yes; shift; }
+
+echo -n "
+
+
+
+
+
+
+
+'
+
+p='\x1b\[' #shortcut to match escape codes
+P="\(^[^°]*\)¡$p" #expression to match prepended codes below
+
+# Handle various xterm control sequences.
+# See /usr/share/doc/xterm-*/ctlseqs.txt
+sed "
+s#\x1b[^\x1b]*\x1b\\\##g # strip anything between \e and ST
+s#\x1b][0-9]*;[^\a]*\a##g # strip any OSC (xterm title etc.)
+
+#handle carriage returns
+s#^.*\r\{1,\}\([^$]\)#\1#
+s#\r\$## # strip trailing \r
+
+# strip other non SGR escape sequences
+s#[\x07]##g
+s#\x1b[]>=\][0-9;]*##g
+s#\x1bP+.\{5\}##g
+s#${p}[0-9;?]*[^0-9;?m]##g
+
+#remove backspace chars and what they're backspacing over
+:rm_bs
+s#[^\x08]\x08##g; t rm_bs
+" |
+
+# Normalize the input before transformation
+sed "
+# escape HTML
+s#\\&#g; s#>#\>#g; s#<#\<#g; s#\"#\"#g
+
+# normalize SGR codes a little
+
+# split 256 colors out and mark so that they're not
+# recognised by the following 'split combined' line
+:e
+s#${p}\([0-9;]\{1,\}\);\([34]8;5;[0-9]\{1,3\}\)m#${p}\1m${p}¬\2m#g; t e
+s#${p}\([34]8;5;[0-9]\{1,3\}\)m#${p}¬\1m#g;
+
+:c
+s#${p}\([0-9]\{1,\}\);\([0-9;]\{1,\}\)m#${p}\1m${p}\2m#g; t c # split combined
+s#${p}0\([0-7]\)#${p}\1#g #strip leading 0
+s#${p}1m\(\(${p}[4579]m\)*\)#\1${p}1m#g #bold last (with clr)
+s#${p}m#${p}0m#g #add leading 0 to norm
+
+# undo any 256 color marking
+s#${p}¬\([34]8;5;[0-9]\{1,3\}\)m#${p}\1m#g;
+
+# map 16 color codes to color + bold
+s#${p}9\([0-7]\)m#${p}3\1m${p}1m#g;
+s#${p}10\([0-7]\)m#${p}4\1m${p}1m#g;
+
+# change 'reset' code to a single char, and prepend a single char to
+# other codes so that we can easily do negative matching, as sed
+# does not support look behind expressions etc.
+s#°#\°#g; s#${p}0m#°#g
+s#¡#\¡#g; s#${p}[0-9;]*m#¡g
+" |
+
+# Convert SGR sequences to HTML
+sed "
+:ansi_to_span # replace ANSI codes with CSS classes
+t ansi_to_span # hack so t commands below only apply to preceding s cmd
+
+/^[^¡]*°/ { b span_end } # replace 'reset code' if no preceding code
+
+# common combinations to minimise html (optional)
+s#${P}3\([0-7]\)m¡${p}4\([0-7]\)m#\1#;t span_count
+s#${P}4\([0-7]\)m¡${p}3\([0-7]\)m#\1#;t span_count
+
+s#${P}1m#\1#; t span_count
+s#${P}4m#\1#; t span_count
+s#${P}5m#\1#; t span_count
+s#${P}7m#\1#; t span_count
+s#${P}9m#\1#; t span_count
+s#${P}3\([0-9]\)m#\1#; t span_count
+s#${P}4\([0-9]\)m#\1#; t span_count
+
+s#${P}38;5;\([0-9]\{1,3\}\)m#\1#; t span_count
+s#${P}48;5;\([0-9]\{1,3\}\)m#\1#; t span_count
+
+s#${P}[0-9;]*m#\1#g; t ansi_to_span # strip unhandled codes
+
+b # next line of input
+
+# add a corresponding span end flag
+:span_count
+x; s/^/s/; x
+b ansi_to_span
+
+# replace 'reset code' with correct number of tags
+:span_end
+x
+/^s/ {
+ s/^.//
+ x
+ s#°#°#
+ b span_end
+}
+x
+s#°##
+b ansi_to_span
+" |
+
+# Convert alternative character set
+# Note we convert here, as if we do at start we have to worry about avoiding
+# conversion of SGR codes etc., whereas doing here we only have to
+# avoid conversions of stuff between &...; or <...>
+#
+# Note we could use sed to do this based around:
+# sed 'y/abcdefghijklmnopqrstuvwxyz{}`~/▒␉␌␍␊°±␋┘┐┌└┼⎺⎻─⎼⎽├┤┴┬│≤≥π£◆·/'
+# However that would be very awkward as we need to only conv some input.
+# The basic scheme that we do in the python script below is:
+# 1. enable transliterate once ¡ char seen
+# 2. disable once µ char seen (may be on diff line to ¡)
+# 3. never transliterate between &; or <> chars
+sed "
+# change 'smacs' and 'rmacs' to a single char so that we can easily do
+# negative matching, as sed does not support look behind expressions etc.
+# Note we don't use ° like above as that's part of the alternate charset.
+s#\x1b(0#¡#g;
+s#µ#\µ#g; s#\x1b(B#µ#g
+" |
+(
+python -c "
+# vim:fileencoding=utf8
+
+import sys
+import locale
+encoding=locale.getpreferredencoding()
+
+old='abcdefghijklmnopqrstuvwxyz{}\`~'
+new='▒␉␌␍␊°±␋┘┐┌└┼⎺⎻─⎼⎽├┤┴┬│≤≥π£◆·'
+new=unicode(new, 'utf-8')
+table=range(128)
+for o,n in zip(old, new): table[ord(o)]=n
+
+(STANDARD, ALTERNATIVE, HTML_TAG, HTML_ENTITY) = (0, 1, 2, 3)
+
+state = STANDARD
+last_mode = STANDARD
+for c in unicode(sys.stdin.read(), encoding):
+ if state == HTML_TAG:
+ if c == '>':
+ state = last_mode
+ elif state == HTML_ENTITY:
+ if c == ';':
+ state = last_mode
+ else:
+ if c == '<':
+ state = HTML_TAG
+ elif c == '&':
+ state = HTML_ENTITY
+ elif c == u'¡' and state == STANDARD:
+ state = ALTERNATIVE
+ last_mode = ALTERNATIVE
+ continue
+ elif c == u'µ' and state == ALTERNATIVE:
+ state = STANDARD
+ last_mode = STANDARD
+ continue
+ elif state == ALTERNATIVE:
+ c = c.translate(table)
+ sys.stdout.write(c.encode(encoding))
+" 2>/dev/null ||
+sed 's/[¡µ]//g' # just strip alternative flag chars
+)
+
+echo "
+
+"
diff -r 000000000000 -r ec66f9d90ef0 sed.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sed.xml Thu Sep 05 04:58:21 2013 -0400
@@ -0,0 +1,110 @@
+
+ with sed
+
+ gnu_sed
+
+
+ sed --sandbox -r $silent -f '$sed_script' '$input' > '$output'
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ $url_paste
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool runs the unix **sed** command on the selected data file.
+
+.. class:: infomark
+
+**TIP:** This tool uses the **extended regular** expression syntax (same as running 'sed -r').
+
+
+
+**Further reading**
+
+- Short sed tutorial (http://www.linuxhowtos.org/System/sed_tutorial.htm)
+- Long sed tutorial (http://www.grymoire.com/Unix/Sed.html)
+- sed faq with good examples (http://sed.sourceforge.net/sedfaq.html)
+- sed cheat-sheet (http://www.catonmat.net/download/sed.stream.editor.cheat.sheet.pdf)
+- Collection of useful sed one-liners (http://student.northpark.edu/pemente/sed/sed1line.txt)
+
+-----
+
+**Sed commands**
+
+The most useful sed command is **s** (substitute).
+
+**Examples**
+
+- **s/hsa//** will remove the first instance of 'hsa' in every line.
+- **s/hsa//g** will remove all instances (because of the **g**) of 'hsa' in every line.
+- **s/A{4,}/--&--/g** will find sequences of 4 or more consecutive A's, and once found, will surround them with two dashes from each side. The **&** marker is a place holder for 'whatever matched the regular expression'.
+- **s/hsa-mir-([^ ]+)/short name: \\1 full name: &/** will find strings such as 'hsa-mir-43a' (the regular expression is 'hsa-mir-' followed by non-space characters) and will replace it with a string such as 'short name: 43a full name: hsa-mir-43a'. The **\\1** marker is a placeholder for 'whatever matched the first parenthesis' (similar to perl's **$1**).
+
+
+**sed's Regular Expression Syntax**
+
+The select tool searches the data for lines containing or not containing a match to the given pattern. A Regular Expression is a pattern describing a certain amount of text.
+
+- **( ) { } [ ] . * ? + \ ^ $** are all special characters. **\\** can be used to "escape" a special character, allowing that special character to be searched for.
+- **^** matches the beginning of a string (but not an internal line).
+- **(** .. **)** groups a particular pattern.
+- **{** n or n, or n,m **}** specifies an expected number of repetitions of the preceding pattern.
+
+ - **{n}** The preceding item is matched exactly n times.
+ - **{n,}** The preceding item is matched n or more times.
+ - **{n,m}** The preceding item is matched at least n times but not more than m times.
+
+- **[** ... **]** creates a character class. Within the brackets, single characters can be placed. A dash (-) may be used to indicate a range such as **a-z**.
+- **.** Matches any single character except a newline.
+- ***** The preceding item will be matched zero or more times.
+- **?** The preceding item is optional and matched at most once.
+- **+** The preceding item will be matched one or more times.
+- **^** has two meanings:
+ - matches the beginning of a line or string.
+ - indicates negation in a character class. For example, [^...] matches every character except the ones inside brackets.
+- **$** matches the end of a line or string.
+- **\|** Separates alternate possibilities.
+
+
+**Note**: SED uses extended regular expression syntax, not Perl syntax. **\\d**, **\\w**, **\\s** etc. are **not** supported.
+
+
+
diff -r 000000000000 -r ec66f9d90ef0 sort.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sort.xml Thu Sep 05 04:58:21 2013 -0400
@@ -0,0 +1,137 @@
+
+
+ gnu_coreutils
+ gnu_sed
+
+
+ #if int($header) > 0:
+ (sed -u '${header}'q && sort $unique $ignore_case --stable -t ' '
+
+ #for $key in $sortkeys
+ '-k ${key.column}${key.order}${key.style},${key.column}'
+ #end for
+
+ ) < '${infile}' > '${outfile}'
+ #else:
+ (sort $unique $ignore_case --stable -t ' '
+
+ #for $key in $sortkeys
+ '-k ${key.column}${key.order}${key.style},${key.column}'
+ #end for
+
+ ) < '${infile}' > '${outfile}'
+ #end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool sorts an input file.
+
+-----
+
+**Sorting Styles**
+
+* **Fast Numeric**: sort by numeric values. Handles integer values (e.g. 43, 134) and decimal-point values (e.g. 3.14). *Does not* handle scientific notation (e.g. -2.32e2).
+* **General Numeric**: sort by numeric values. Handles all numeric notations (including scientific notation). Slower than *fast numeric*, so use only when necessary.
+* **Natural Sort**: Sort in 'natural' order (natural to humans, not to computers). See example below.
+* **Alphabetical sort**: Sort in strict alphabetical order. See example below.
+* **Human-readable numbers**: Sort human readable numbers (e.g. 1G > 2M > 3K > 400)
+* **Random order**: return lines in random order.
+
+------
+
+**Example - Header line**
+
+**Input file** (note first line is a header line, should not be sorted)::
+
+ Fruit Color Price
+ Banana Yellow 4.1
+ Avocado Green 8.0
+ Apple Red 3.0
+ Melon Green 6.1
+
+**Sorting** by **numeric order** on column **3**, with **header**, will return::
+
+ Fruit Color Price
+ Apple Red 3.0
+ Banana Yellow 4.1
+ Melon Green 6.1
+ Avocado Green 8.0
+
+
+-----
+
+**Example - Natural vs. Alphabetical sorting**
+
+Given the following list::
+
+ chr4
+ chr13
+ chr1
+ chr10
+ chr20
+ chr2
+
+**Alphabetical sort** would produce the following sorted list::
+
+ chr1
+ chr10
+ chr13
+ chr2
+ chr20
+ chr4
+
+**Natural Sort** would produce the following sorted list::
+
+ chr1
+ chr2
+ chr4
+ chr10
+ chr13
+ chr20
+
+
+.. class:: infomark
+
+If you're planning to use the file with another tool that expects sorted files (such as *join*), you should use the **Alphabetical sort**, not the **Natural Sort**. Natural sort order is easier for humans, but is unnatural for computer programs.
+
+-----
+
+*sort-header* was written by A. Gordon ( gordon at cshl dot edu )
+
+
+
diff -r 000000000000 -r ec66f9d90ef0 sort_rows.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sort_rows.xml Thu Sep 05 04:58:21 2013 -0400
@@ -0,0 +1,26 @@
+
+ according to their columns
+ python -c 'for line in ["\t".join(sorted(line.strip().split("\t"))) for line in open("$input").readlines() ]: print line' > $outfile
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert*
+
+**What it does**
+
+This tool sorts each row in a TAB separated file, according to its columns. In other words: it is a sorted reordering of all columns.
+
+
+
+
diff -r 000000000000 -r ec66f9d90ef0 sorted_uniq.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/sorted_uniq.xml Thu Sep 05 04:58:21 2013 -0400
@@ -0,0 +1,55 @@
+
+ from sorted file
+
+ gnu_coreutils
+
+
+ uniq
+ -f
+ $skipfields
+ $count
+ $repeated
+ $ignorecase
+ $uniqueonly
+ $input
+
+ ## feature is not yet released, it will be in the next 8.22 version
+ ##--group=$group
+ > $output
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+This tool takes a sorted file and looks for lines that are unique.
+
+.. class:: warningmark
+
+Please make sure your file is sorted, or else this tool will give you an erroneous output.
+
+.. class:: infomark
+
+You can sort your file using either the "Sort" tool in "Filter and Sort", or the "Sort" tool in "Unix Tools".
+
+
+
diff -r 000000000000 -r ec66f9d90ef0 tail.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tail.xml Thu Sep 05 04:58:21 2013 -0400
@@ -0,0 +1,25 @@
+
+ lines from a dataset (tail)
+
+ gnu_coreutils
+
+
+ tail --lines $count '$input1' > '$output'
+
+
+
+
+
+
+
+
+
+
+
+
+**What it does**
+
+This tool runs the **tail** unix command, which discards lines from the beginning of a file.
+
+
+
diff -r 000000000000 -r ec66f9d90ef0 tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml Thu Sep 05 04:58:21 2013 -0400
@@ -0,0 +1,18 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ $REPOSITORY_INSTALL_DIR/scripts
+
+
diff -r 000000000000 -r ec66f9d90ef0 unsorted_uniq.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/unsorted_uniq.py Thu Sep 05 04:58:21 2013 -0400
@@ -0,0 +1,36 @@
+import sys
+import subprocess
+
+"""
+ We only need that file because galaxy do not understand the -t $'\t' term.
+ Otherwise that would be the right XML-only solution:
+ sort -u
+ $ignore_case
+ $is_numeric
+ -t \$'\t'
+ #if $adv_opts.adv_opts_selector=="advanced":
+ -k$adv_opts.column_start,$adv_opts.column_end
+ #end if
+ -o $outfile
+ $input
+"""
+
+if sys.argv[1].strip() != 'false':
+ ignore_case = sys.argv[1]
+else:
+ ignore_case = ''
+
+if sys.argv[2].strip() != 'false':
+ is_numeric = sys.argv[2]
+else:
+ is_numeric = ''
+
+try:
+ col_start = sys.argv[3]
+ col_end = sys.argv[4]
+ com = "sort -u %s %s -t ' ' -k%s,%s -o %s %s" % (is_numeric, ignore_case, col_start, col_end, sys.argv[5], sys.argv[6])
+except:
+ # no advanced options selected
+ com = "sort -u %s %s -t ' ' -o %s %s" % (is_numeric, ignore_case, sys.argv[3], sys.argv[4])
+
+subprocess.call(com, shell=True)
diff -r 000000000000 -r ec66f9d90ef0 unsorted_uniq.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/unsorted_uniq.xml Thu Sep 05 04:58:21 2013 -0400
@@ -0,0 +1,79 @@
+
+ occurrences of each record
+
+ gnu_coreutils
+
+
+ unique_lines.py
+ $ignore_case
+ $is_numeric
+ #if $adv_opts.adv_opts_selector=="advanced":
+ $adv_opts.column_start
+ $adv_opts.column_end
+ #end if
+ $outfile
+ $infile
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ .. class:: infomark
+
+**Syntax**
+
+This tool returns all unique lines using the 'sort -u' command. It can be used with unsorted files.
+If you need additional options, like grouping or counting your unique results, please use the 'Unique lines from sorted file' tool.
+
+-----
+
+.. class:: infomark
+
+The input file needs to be tab separated. Please convert your file if necessary.
+
+-----
+
+**Example**
+
+- Input file::
+
+ chr1 10 100 gene1
+ chr1 105 200 gene2
+ chr1 10 100 gene1
+ chr2 10 100 gene4
+ chr2 1000 1900 gene5
+ chr3 15 1656 gene6
+ chr2 10 100 gene4
+
+- Unique lines will result in::
+
+ chr1 10 100 gene1
+ chr1 105 200 gene2
+ chr2 10 100 gene4
+ chr2 1000 1900 gene5
+ chr3 15 1656 gene6
+
+
+
+