changeset 50:0b9aab6aaebf draft

Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
author rnateam
date Tue, 26 Jan 2016 04:38:27 -0500
parents 303f6402a035
children c226eef33bab
files convert_bc_to_binary_RY.py convert_bc_to_binary_RY.xml coords2clnt.py coords2clnt.xml extract_aln_ends.py extract_aln_ends.xml extract_bcs.py extract_bcs.xml merge_pcr_duplicates.py merge_pcr_duplicates.xml remove_tail.py remove_tail.xml rm_spurious_events.pl rm_spurious_events.py rm_spurious_events.xml tool_dependencies.xml
diffstat 16 files changed, 295 insertions(+), 206 deletions(-) [+]
line wrap: on
line diff
--- a/convert_bc_to_binary_RY.py	Sat Dec 19 06:16:22 2015 -0500
+++ b/convert_bc_to_binary_RY.py	Tue Jan 26 04:38:27 2016 -0500
@@ -1,5 +1,13 @@
 #!/usr/bin/env python
 
+import argparse
+import logging
+from string import maketrans
+from sys import stdout
+from Bio import SeqIO
+from Bio.Seq import Seq
+from Bio.Alphabet import IUPAC
+
 tool_description = """
 Convert standard nucleotides to IUPAC nucleotide codes used for binary barcodes.
 
@@ -19,19 +27,6 @@
 Status: Testing
 """
 
-import argparse
-import logging
-from string import maketrans
-from sys import stdout
-from Bio import SeqIO
-from Bio.Seq import Seq
-from Bio.Alphabet import IUPAC
-
-# # avoid ugly python IOError when stdout output is piped into another program
-# # and then truncated (such as piping to head)
-# from signal import signal, SIGPIPE, SIG_DFL
-# signal(SIGPIPE, SIG_DFL)
-
 # parse command line arguments
 parser = argparse.ArgumentParser(description=tool_description,
                                  epilog=epilog,
--- a/convert_bc_to_binary_RY.xml	Sat Dec 19 06:16:22 2015 -0500
+++ b/convert_bc_to_binary_RY.xml	Tue Jan 26 04:38:27 2016 -0500
@@ -5,7 +5,7 @@
   </macros>
   <expand macro="requirements" />
   <expand macro="stdio" />
-  <version_command>python convert_bc_to_binary_RY.py --version</version_command>
+  <version_command>python $__tool_directory__/convert_bc_to_binary_RY.py --version</version_command>
   <command interpreter="python"><![CDATA[
 convert_bc_to_binary_RY.py
 #if $positional_1 and $positional_1 is not None:
--- a/coords2clnt.py	Sat Dec 19 06:16:22 2015 -0500
+++ b/coords2clnt.py	Tue Jan 26 04:38:27 2016 -0500
@@ -1,5 +1,15 @@
 #!/usr/bin/env python
 
+import argparse
+import logging
+from sys import stdout
+from pybedtools import BedTool
+from pybedtools.featurefuncs import five_prime
+# avoid ugly python IOError when stdout output is piped into another program
+# and then truncated (such as piping to head)
+from signal import signal, SIGPIPE, SIG_DFL
+signal(SIGPIPE, SIG_DFL)
+
 tool_description = """
 Given coordinates of the aligned reads, calculate positions of the crosslinked
 nucleotides. Crosslinked nts are assumed to be one nt upstream of the 5'-end of
@@ -25,16 +35,6 @@
 Status: Testing
 """
 
-import argparse
-import logging
-from sys import stdout
-from pybedtools import BedTool
-from pybedtools.featurefuncs import five_prime
-# avoid ugly python IOError when stdout output is piped into another program
-# and then truncated (such as piping to head)
-from signal import signal, SIGPIPE, SIG_DFL
-signal(SIGPIPE, SIG_DFL)
-
 # parse command line arguments
 parser = argparse.ArgumentParser(description=tool_description,
                                  epilog=epilog,
--- a/coords2clnt.xml	Sat Dec 19 06:16:22 2015 -0500
+++ b/coords2clnt.xml	Tue Jan 26 04:38:27 2016 -0500
@@ -8,7 +8,7 @@
   <stdio>
     <exit_code level="fatal" range="1:"/>
   </stdio>
-  <version_command>python coords2clnt.py --version</version_command>
+  <version_command>python $__tool_directory__/coords2clnt.py --version</version_command>
   <command interpreter="python"><![CDATA[coords2clnt.py
 #if $positional_1 and $positional_1 is not None:
 $positional_1
--- a/extract_aln_ends.py	Sat Dec 19 06:16:22 2015 -0500
+++ b/extract_aln_ends.py	Tue Jan 26 04:38:27 2016 -0500
@@ -1,5 +1,17 @@
 #!/usr/bin/env python
 
+import argparse
+import logging
+from sys import stdout
+from shutil import rmtree
+from tempfile import mkdtemp
+from pybedtools import BedTool
+import pysam
+# avoid ugly python IOError when stdout output is piped into another program
+# and then truncated (such as piping to head)
+from signal import signal, SIGPIPE, SIG_DFL
+signal(SIGPIPE, SIG_DFL)
+
 tool_description = """
 Extract alignment ends from sam file.
 
@@ -32,25 +44,12 @@
 Status: Development
 """
 
-import argparse
-import logging
-from sys import stdout
-from shutil import rmtree
-from tempfile import mkdtemp
-from pybedtools import BedTool
-import pysam
-
 
 class DefaultsRawDescriptionHelpFormatter(argparse.ArgumentDefaultsHelpFormatter,
                                           argparse.RawDescriptionHelpFormatter):
     # To join the behaviour of RawDescriptionHelpFormatter with that of ArgumentDefaultsHelpFormatter
     pass
 
-# avoid ugly python IOError when stdout output is piped into another program
-# and then truncated (such as piping to head)
-from signal import signal, SIGPIPE, SIG_DFL
-signal(SIGPIPE, SIG_DFL)
-
 # parse command line arguments
 parser = argparse.ArgumentParser(description=tool_description,
                                  epilog=epilog,
--- a/extract_aln_ends.xml	Sat Dec 19 06:16:22 2015 -0500
+++ b/extract_aln_ends.xml	Tue Jan 26 04:38:27 2016 -0500
@@ -5,7 +5,7 @@
   </macros>
   <expand macro="requirements" />
   <expand macro="stdio" />
-  <version_command>python extract_aln_ends.py --version</version_command>
+  <version_command>python $__tool_directory__/extract_aln_ends.py --version</version_command>
   <command interpreter="python"><![CDATA[
 extract_aln_ends.py
 #if $positional_1 and $positional_1 is not None:
--- a/extract_bcs.py	Sat Dec 19 06:16:22 2015 -0500
+++ b/extract_bcs.py	Tue Jan 26 04:38:27 2016 -0500
@@ -1,5 +1,15 @@
 #!/usr/bin/env python
 
+import argparse
+import logging
+import re
+from sys import stdout
+from Bio.SeqIO.QualityIO import FastqGeneralIterator
+# avoid ugly python IOError when stdout output is piped into another program
+# and then truncated (such as piping to head)
+from signal import signal, SIGPIPE, SIG_DFL
+signal(SIGPIPE, SIG_DFL)
+
 tool_description = """
 Exract barcodes from a FASTQ file according to a user-specified pattern.
 
@@ -19,17 +29,6 @@
 Status: Testing
 """
 
-import argparse
-import logging
-import re
-from sys import stdout
-from Bio.SeqIO.QualityIO import FastqGeneralIterator
-
-# avoid ugly python IOError when stdout output is piped into another program
-# and then truncated (such as piping to head)
-from signal import signal, SIGPIPE, SIG_DFL
-signal(SIGPIPE, SIG_DFL)
-
 # parse command line arguments
 parser = argparse.ArgumentParser(description=tool_description,
                                  epilog=epilog,
--- a/extract_bcs.xml	Sat Dec 19 06:16:22 2015 -0500
+++ b/extract_bcs.xml	Tue Jan 26 04:38:27 2016 -0500
@@ -5,7 +5,7 @@
   </macros>
   <expand macro="requirements" />
   <expand macro="stdio" />
-  <version_command>python extract_bcs.py --version</version_command>
+  <version_command>python $__tool_directory__/extract_bcs.py --version</version_command>
   <command interpreter="python"><![CDATA[
 extract_bcs.py
 #if $positional_1 and $positional_1 is not None:
--- a/merge_pcr_duplicates.py	Sat Dec 19 06:16:22 2015 -0500
+++ b/merge_pcr_duplicates.py	Tue Jan 26 04:38:27 2016 -0500
@@ -1,5 +1,18 @@
 #!/usr/bin/env python
 
+import argparse
+import logging
+from sys import stdout
+import pandas as pd
+from subprocess import check_call
+from shutil import rmtree
+from tempfile import mkdtemp
+from os.path import isfile
+# avoid ugly python IOError when stdout output is piped into another program
+# and then truncated (such as piping to head)
+from signal import signal, SIGPIPE, SIG_DFL
+signal(SIGPIPE, SIG_DFL)
+
 tool_description = """
 Merge PCR duplicates according to random barcode library.
 
@@ -28,24 +41,6 @@
 Status: Testing
 """
 
-import argparse
-import logging
-from sys import stdout
-from Bio import SeqIO
-import pandas as pd
-
-# avoid ugly python IOError when stdout output is piped into another program
-# and then truncated (such as piping to head)
-from signal import signal, SIGPIPE, SIG_DFL
-signal(SIGPIPE, SIG_DFL)
-
-
-def fasta_tuple_generator(fasta_iterator):
-    "Yields id, sequence tuples given an iterator over Biopython SeqIO objects."
-    for record in input_seq_iterator:
-        yield (record.id, str(record.seq))
-
-
 # parse command line arguments
 parser = argparse.ArgumentParser(description=tool_description,
                                  epilog=epilog,
@@ -56,16 +51,11 @@
     help="Path to bed6 file containing alignments.")
 parser.add_argument(
     "bclib",
-    help="Path to fasta barcode library.")
+    help="Path to fastq barcode library.")
 # optional arguments
 parser.add_argument(
     "-o", "--outfile",
     help="Write results to this file.")
-parser.add_argument(
-    "--fasta-library",
-    dest="fasta_library",
-    action="store_true",
-    help="Read random barcode library as fasta format.")
 # misc arguments
 parser.add_argument(
     "-v", "--verbose",
@@ -78,7 +68,7 @@
 parser.add_argument(
     '--version',
     action='version',
-    version='0.1.0')
+    version='0.2.0')
 
 args = parser.parse_args()
 
@@ -96,50 +86,65 @@
     logging.info("  outfile: '{}'".format(args.outfile))
 logging.info("")
 
-# load barcode library into dictionary
-input_handle = open(args.bclib, "rU")
-if args.fasta_library:
-    input_seq_iterator = SeqIO.parse(input_handle, "fasta")
-else:
-    input_seq_iterator = SeqIO.parse(input_handle, "fastq")
-bcs = pd.DataFrame.from_records(
-    data=fasta_tuple_generator(input_seq_iterator),
-    columns=["read_id", "bc"])
+# see if alignments are empty and the tool can quit
+n_alns = sum(1 for line in open(args.alignments))
+if n_alns == 0:
+    logging.warning("WARNING: Working on empty set of alignments, writing empty output.")
+    eventalnout = (open(args.outfile, "w") if args.outfile is not None else stdout)
+    eventalnout.close()
+    exit(0)
+
+# check input filenames
+if not isfile(args.bclib):
+    raise Exception("ERROR: barcode library '{}' not found.")
+if not isfile(args.alignments):
+    raise Exception("ERROR: alignments '{}' not found.")
+
+try:
+    tmpdir = mkdtemp()
+    logging.debug("tmpdir: " + tmpdir)
 
-# load alignments
-alns = pd.read_csv(
-    args.alignments,
-    sep="\t",
-    names=["chrom", "start", "stop", "read_id", "score", "strand"])
+    # prepare barcode library
+    syscall1 = "cat " + args.bclib + " | awk 'BEGIN{OFS=\"\\t\"}NR%4==1{gsub(/^@/,\"\"); id=$1}NR%4==2{bc=$1}NR%4==3{print id,bc}' | sort -k1,1 > " + tmpdir + "/bclib.csv"
+    check_call(syscall1, shell=True)
+
+    # prepare alinments
+    syscall2 = "cat " + args.alignments + " | awk -F \"\\t\" 'BEGIN{OFS=\"\\t\"}{split($4, a, \" \"); $4 = a[1]; print}'| sort -k4,4 > " + tmpdir + "/alns.csv"
+    check_call(syscall2, shell=True)
 
-# keep id parts up to first whitespace
-alns["read_id"] = alns["read_id"].str.split(' ').str.get(0)
+    # join barcode library and alignments
+    syscall3 = "join -1 1 -2 4 " + tmpdir + "/bclib.csv " + tmpdir + "/alns.csv " + " | awk 'BEGIN{OFS=\"\\t\"}{print $3,$4,$5,$2,$6,$7}' > " + tmpdir + "/bcalib.csv"
+    check_call(syscall3, shell=True)
 
-# combine barcode library and alignments
-bcalib = pd.merge(
-    bcs, alns,
-    on="read_id",
-    how="inner",
-    sort=False)
+    # get alignments combined with barcodes
+    bcalib = pd.read_csv(
+        tmpdir + "/bcalib.csv",
+        sep="\t",
+        names=["chrom", "start", "stop", "bc", "score", "strand"])
+finally:
+    logging.debug("removed tmpdir: " + tmpdir)
+    rmtree(tmpdir)
+
+# fail if alignments given but combined library is empty
 if bcalib.empty:
     raise Exception("ERROR: no common entries for alignments and barcode library found. Please check your input files.")
-n_alns = len(alns.index)
+
+# warn if not all alignments could be assigned a barcode
 n_bcalib = len(bcalib.index)
 if n_bcalib < n_alns:
     logging.warning(
-        "{} of {} alignments could not be associated with a random barcode.".format(
-            n_alns - n_bcalib, n_alns))
+        "{} of {} alignments could not be associated with a random barcode.".format(n_alns - n_bcalib, n_alns))
 
 # remove entries with barcodes that has uncalled base N
 bcalib_cleaned = bcalib.drop(bcalib[bcalib.bc.str.contains("N")].index)
 n_bcalib_cleaned = len(bcalib_cleaned)
-if n_bcalib_cleaned < n_bcalib:
-    msg = "{} of {} alignments had random barcodes containing uncalled bases and were dropped.".format(
-        n_bcalib - n_bcalib_cleaned, n_bcalib)
-    if n_bcalib_cleaned < (0.8 * n_bcalib):
-        logging.warning(msg)
-    else:
-        logging.info(msg)
+# if n_bcalib_cleaned < n_bcalib:
+#     msg = "{} of {} alignments had random barcodes containing uncalled bases and were dropped.".format(
+#         n_bcalib - n_bcalib_cleaned, n_bcalib)
+#     if n_bcalib_cleaned < (0.8 * n_bcalib):
+#         logging.warning(msg)
+#     else:
+#         logging.info(msg)
 
 # count and merge pcr duplicates
 # grouping sorts by keys, so the ouput will be properly sorted
--- a/merge_pcr_duplicates.xml	Sat Dec 19 06:16:22 2015 -0500
+++ b/merge_pcr_duplicates.xml	Tue Jan 26 04:38:27 2016 -0500
@@ -1,11 +1,15 @@
-<tool id="merge_pcr_duplicates.py" name="Merge PCR duplicates" version="0.1.0">
+<tool id="merge_pcr_duplicates.py" name="Merge PCR duplicates" version="0.2.0">
   <description>according to random barcode library.</description>
   <macros>
     <import>macros.xml</import>
   </macros>
   <expand macro="requirements" />
   <expand macro="stdio" />
-  <version_command>python merge_pcr_duplicates.py --version</version_command>
+  <requirements>
+      <requirement type="package" version="4.1.0">gnu_awk</requirement>
+      <requirement type="package" version="8.22">gnu_coreutils</requirement>
+  </requirements>
+  <version_command>python $__tool_directory__/merge_pcr_duplicates.py --version</version_command>
   <command interpreter="python"><![CDATA[merge_pcr_duplicates.py
 #if $positional_1 and $positional_1 is not None:
 $positional_1
@@ -18,7 +22,7 @@
 > $default]]></command>
   <inputs>
     <param area="false" label="bed6 file containing alignments." name="positional_1" type="data" format="bed"/>
-    <param area="false" label="fasta barcode library." name="positional_2" type="data" format="fastq"/>
+    <param area="false" label="fastaq barcode library." name="positional_2" type="data" format="fastq"/>
   </inputs>
   <outputs>
     <data format="bed" hidden="false" name="default"/>
--- a/remove_tail.py	Sat Dec 19 06:16:22 2015 -0500
+++ b/remove_tail.py	Tue Jan 26 04:38:27 2016 -0500
@@ -1,5 +1,14 @@
 #!/usr/bin/env python
 
+import argparse
+import logging
+from sys import stdout
+from Bio.SeqIO.QualityIO import FastqGeneralIterator
+# avoid ugly python IOError when stdout output is piped into another program
+# and then truncated (such as piping to head)
+from signal import signal, SIGPIPE, SIG_DFL
+signal(SIGPIPE, SIG_DFL)
+
 tool_description = """
 Remove a certain number of nucleotides from the 3'-tails of sequences in fastq
 format.
@@ -18,16 +27,6 @@
 Status: Testing
 """
 
-import argparse
-import logging
-from sys import stdout
-from Bio.SeqIO.QualityIO import FastqGeneralIterator
-
-# avoid ugly python IOError when stdout output is piped into another program
-# and then truncated (such as piping to head)
-from signal import signal, SIGPIPE, SIG_DFL
-signal(SIGPIPE, SIG_DFL)
-
 # parse command line arguments
 parser = argparse.ArgumentParser(description=tool_description,
                                  epilog=epilog,
--- a/remove_tail.xml	Sat Dec 19 06:16:22 2015 -0500
+++ b/remove_tail.xml	Tue Jan 26 04:38:27 2016 -0500
@@ -5,7 +5,7 @@
   </macros>
   <expand macro="requirements" />
   <expand macro="stdio" />
-  <version_command>python remove_tail.py --version</version_command>
+  <version_command>python $__tool_directory__/remove_tail.py --version</version_command>
   <command interpreter="python"><![CDATA[remove_tail.py
 #if $positional_1 and $positional_1 is not None:
 $positional_1
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rm_spurious_events.pl	Tue Jan 26 04:38:27 2016 -0500
@@ -0,0 +1,87 @@
+#!/usr/local/perl/bin/perl
+use feature ':5.10';
+use strict 'vars';
+use warnings;
+use Getopt::Long;
+use Pod::Usage;
+use List::Util qw/ min max /;
+use POSIX qw/ceil floor/;
+use File::Temp qw(tempdir);
+use File::Basename;
+
+=head1 NAME
+
+filterSquashedReads.pl --frac_max FLOAT
+
+=head1 SYNOPSIS
+
+this script reads sites after pcr duplicate removal.
+for all crosslinking sites sharing start and stop coordinates, the maximum number
+of squashed reads is determined.
+alignments having less than frac_max * max reads are discarded
+
+assumes bed entries to be sorted chr,strand,start,stop,score with score descending
+
+Options:
+
+    --frac_max  filter out alignments supported by less reads than this fraction of the maximum number of reads per position
+    -debug      enable debug output
+    -help       brief help message
+    -man        full documentation
+
+=head1 DESCRIPTION
+
+=cut
+
+###############################################################################
+# parse command line options
+###############################################################################
+my $frac_max = 0.1;
+my $help;
+my $man;
+my $debug;
+my $result = GetOptions (   "frac_max=f" => \$frac_max,
+                            "help"	=> \$help,
+							"man"	=> \$man,
+							"debug" => \$debug);
+pod2usage(-exitstatus => 1, -verbose => 1) if $help;
+pod2usage(-exitstatus => 0, -verbose => 2) if $man;
+($result) or pod2usage(2);
+
+###############################################################################
+# main
+###############################################################################
+my $current_chr = '';
+my $current_start = -1;
+my $current_stop = -1;
+my $current_strand = '';
+my $current_max = -1;
+my $current_threshold = -1;
+
+while (<>) {
+    my ($chr, $start, $stop, $id, $count, $strand) = split("\t");
+    # my ($count, undef, $start, $chr, $strand, $stop) = split("\t");
+
+    if ($current_start != $start or
+        $current_stop != $stop or
+        $current_chr ne $chr or
+        $current_strand ne $strand) {
+        # if this is the first occourence of these coordinates
+        # this must be the new maximum
+        # save new state
+        $current_start = $start;
+        $current_stop = $stop;
+        $current_chr = $chr;
+        $current_strand = $strand;
+        # print record and record maximum
+        $current_max = $count;
+        $current_threshold = $count*$frac_max;
+        print $_;
+        $debug and say "new threshold ${current_threshold} @ $chr $start $stop $strand $count";
+    } elsif ($count >= $current_threshold) {
+        # if it is not the first occourence, evaluate threshold and print if valid
+        print $_;
+    } else {
+        $debug and say "below threshold @ $chr $start $stop $strand $count";
+    }
+}
--- a/rm_spurious_events.py	Sat Dec 19 06:16:22 2015 -0500
+++ b/rm_spurious_events.py	Tue Jan 26 04:38:27 2016 -0500
@@ -1,5 +1,10 @@
 #!/usr/bin/env python
 
+import argparse
+import logging
+from subprocess import check_call
+import os
+
 tool_description = """
 Remove spurious events originating from errors in random sequence tags.
 
@@ -7,8 +12,6 @@
 of events the maximum number of PCR duplicates is determined. All events that
 are supported by less than 10 percent of this maximum count are removed.
 
-By default output is written to stdout.
-
 Input:
 * bed6 file containing crosslinking events with score field set to number of PCR
   duplicates
@@ -19,7 +22,7 @@
 
 Example usage:
 - remove spurious events from spurious.bed and write results to file cleaned.bed
-rm_spurious_events.py spurious.bed --out cleaned.bed
+rm_spurious_events.py spurious.bed --oufile cleaned.bed
 """
 
 epilog = """
@@ -30,89 +33,74 @@
 Status: Testing
 """
 
-import argparse
-import logging
-from sys import stdout
-import pandas as pd
-
 
 class DefaultsRawDescriptionHelpFormatter(argparse.ArgumentDefaultsHelpFormatter,
                                           argparse.RawDescriptionHelpFormatter):
     # To join the behaviour of RawDescriptionHelpFormatter with that of ArgumentDefaultsHelpFormatter
     pass
 
-# avoid ugly python IOError when stdout output is piped into another program
-# and then truncated (such as piping to head)
-from signal import signal, SIGPIPE, SIG_DFL
-signal(SIGPIPE, SIG_DFL)
-
-# parse command line arguments
-parser = argparse.ArgumentParser(description=tool_description,
-                                 epilog=epilog,
-                                 formatter_class=DefaultsRawDescriptionHelpFormatter)
-# positional arguments
-parser.add_argument(
-    "events",
-    help="Path to bed6 file containing alignments.")
-# optional arguments
-parser.add_argument(
-    "-o", "--outfile",
-    help="Write results to this file.")
-parser.add_argument(
-    "-t", "--threshold",
-    type=float,
-    default=0.1,
-    help="Threshold for spurious event removal."
-)
-# misc arguments
-parser.add_argument(
-    "-v", "--verbose",
-    help="Be verbose.",
-    action="store_true")
-parser.add_argument(
-    "-d", "--debug",
-    help="Print lots of debugging information",
-    action="store_true")
-parser.add_argument(
-    '--version',
-    action='version',
-    version='0.1.0')
 
-args = parser.parse_args()
-
-if args.debug:
-    logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
-elif args.verbose:
-    logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
-else:
-    logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
-logging.info("Parsed arguments:")
-logging.info("  alignments: '{}'".format(args.events))
-logging.info("  threshold: '{}'".format(args.threshold))
-if args.outfile:
-    logging.info("  outfile: enabled writing to file")
-    logging.info("  outfile: '{}'".format(args.outfile))
-logging.info("")
+def main():
+    # parse command line arguments
+    parser = argparse.ArgumentParser(description=tool_description,
+                                     epilog=epilog,
+                                     formatter_class=DefaultsRawDescriptionHelpFormatter)
+    # positional arguments
+    parser.add_argument(
+        "events",
+        help="Path to bed6 file containing alignments.")
+    # optional arguments
+    parser.add_argument(
+        "-o", "--outfile",
+        required=True,
+        help="Write results to this file.")
+    parser.add_argument(
+        "-t", "--threshold",
+        type=float,
+        default=0.1,
+        help="Threshold for spurious event removal."
+    )
+    # misc arguments
+    parser.add_argument(
+        "-v", "--verbose",
+        help="Be verbose.",
+        action="store_true")
+    parser.add_argument(
+        "-d", "--debug",
+        help="Print lots of debugging information",
+        action="store_true")
+    parser.add_argument(
+        '--version',
+        action='version',
+        version='0.1.0')
 
-# check threshold parameter value
-if args.threshold < 0 or args.threshold > 1:
-    raise ValueError("Threshold must be in [0,1].")
-
-# load alignments
-alns = pd.read_csv(
-    args.events,
-    sep="\t",
-    names=["chrom", "start", "stop", "read_id", "score", "strand"])
+    args = parser.parse_args()
 
-# remove all alignments that not enough PCR duplicates with respect to
-# the group maximum
-grouped = alns.groupby(['chrom', 'start', 'stop', 'strand'], group_keys=False)
-alns_cleaned = grouped.apply(lambda g: g[g["score"] >= args.threshold * g["score"].max()])
+    if args.debug:
+        logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
+    elif args.verbose:
+        logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
+    else:
+        logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
+    logging.info("Parsed arguments:")
+    logging.info("  alignments: '{}'".format(args.events))
+    logging.info("  threshold: '{}'".format(args.threshold))
+    if args.outfile:
+        logging.info("  outfile: enabled writing to file")
+        logging.info("  outfile: '{}'".format(args.outfile))
+    logging.info("")
 
-# write coordinates of crosslinking event alignments
-alns_cleaned_out = (open(args.outfile, "w") if args.outfile is not None else stdout)
-alns_cleaned.to_csv(
-    alns_cleaned_out,
-    columns=['chrom', 'start', 'stop', 'read_id', 'score', 'strand'],
-    sep="\t", index=False, header=False)
-alns_cleaned_out.close()
+    # check threshold parameter value
+    if args.threshold < 0 or args.threshold > 1:
+        raise ValueError("Threshold must be in [0,1].")
+
+    if not os.path.isfile(args.events):
+        raise Exception("ERROR: file '{}' not found.")
+
+    # prepare barcode library
+    syscall = "cat " + args.events + " | sort -k1,1V -k6,6 -k2,2n -k3,3 -k5,5nr | perl " + os.path.dirname(os.path.realpath(__file__)) + "/rm_spurious_events.pl --frac_max " + str(args.threshold) + "| sort -k1,1V -k2,2n -k3,3n -k6,6 -k4,4 -k5,5nr > " + args.outfile
+    check_call(syscall, shell=True)
+
+
+if __name__ == "__main__":
+    main()
--- a/rm_spurious_events.xml	Sat Dec 19 06:16:22 2015 -0500
+++ b/rm_spurious_events.xml	Tue Jan 26 04:38:27 2016 -0500
@@ -5,7 +5,11 @@
   </macros>
   <expand macro="requirements" />
   <expand macro="stdio" />
-  <version_command>python rm_spurious_events.py --version</version_command>
+  <requirements>
+    <requirement type="package" version="5.18">perl</requirement>
+    <requirement type="package" version="8.22">gnu_coreutils</requirement>
+  </requirements>
+  <version_command>python $__tool_directory__/rm_spurious_events.py --version</version_command>
   <command interpreter="python"><![CDATA[rm_spurious_events.py
 #if $positional_1 and $positional_1 is not None:
 $positional_1
@@ -14,7 +18,7 @@
 #if $threshold and $threshold is not None:
 --threshold $threshold
 #end if
-> $default]]></command>
+--outfile $default]]></command>
   <inputs>
     <param area="false" label="bed6 file containing alignments." name="positional_1" type="data" format="bed"/>
     <param help="(--threshold)" label="Threshold for spurious event removal." name="threshold" optional="true" type="float" value="0.1"/>
--- a/tool_dependencies.xml	Sat Dec 19 06:16:22 2015 -0500
+++ b/tool_dependencies.xml	Tue Jan 26 04:38:27 2016 -0500
@@ -7,18 +7,27 @@
         <repository name="package_biopython_1_65" owner="biopython"/>
     </package> -->
     <package name="pandas" version="0.16">
-        <repository changeset_revision="8766fb73e88f" name="package_python_2_7_pandas_0_16" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu" />
+        <repository changeset_revision="67abd81bd3ac" name="package_python_2_7_pandas_0_16" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu" />
     </package>
     <!-- <package name="pybedtools" version="0.7.4">
         <repository name="package_python_2_7_pybedtools_0_7_4" owner="iuc"/>
     </package> -->
     <package name="pybedtools" version="0.6.9">
-        <repository changeset_revision="7d07b5c08b8f" name="package_python_2_7_pybedtools_0_6_9" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu" />
+        <repository changeset_revision="c4641c3a869f" name="package_python_2_7_pybedtools_0_6_9" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu" />
     </package>
     <package name="pysam" version="0.8.3">
-        <repository changeset_revision="f88a5f4ac6c1" name="package_python_2_7_pysam_0_8_3" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu" />
+        <repository changeset_revision="7ac80143c68d" name="package_python_2_7_pysam_0_8_3" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu" />
     </package>
     <package name="flexbar" version="2.5">
         <repository changeset_revision="45d1b8373762" name="package_flexbar_2_5" owner="rnateam" toolshed="https://testtoolshed.g2.bx.psu.edu" />
     </package>
+    <package name="gnu_awk" version="4.1.0">
+        <repository changeset_revision="0f0bdef2f686" name="package_gnu_awk_4_1_0" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu" />
+    </package>
+    <package name="gnu_coreutils" version="8.22">
+        <repository changeset_revision="8b60cf3e0c07" name="package_gnu_coreutils_8_22" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu" />
+    </package>
+    <package name="perl" version="5.18">
+        <repository changeset_revision="6f144bd786a8" name="package_perl_5_18" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu" />
+    </package>
 </tool_dependency>