Mercurial > repos > rnateam > bctools

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/convert_bc_to_binary_RY.py	Thu Oct 22 10:26:45 2015 -0400
@@ -0,0 +1,101 @@
+#!/usr/bin/env python
+
+tool_description = """
+Convert standard nucleotides to IUPAC nucleotide codes used for binary barcodes.
+
+A and G are converted to nucleotide code R. T, U and C are converted to Y. By
+default output is written to stdout.
+
+Example usage:
+- write converted sequences from file in.fa to file file out.fa:
+convert_bc_to_binary_RY.py in.fa --outfile out.fa
+"""
+
+epilog = """
+Author: Daniel Maticzka
+Copyright: 2015
+License: Apache
+Email: maticzkd@informatik.uni-freiburg.de
+Status: Testing
+"""
+
+import argparse
+import logging
+from string import maketrans
+from sys import stdout
+from Bio import SeqIO
+from Bio.Seq import Seq
+from Bio.Alphabet import IUPAC
+
+# # avoid ugly python IOError when stdout output is piped into another program
+# # and then truncated (such as piping to head)
+# from signal import signal, SIGPIPE, SIG_DFL
+# signal(SIGPIPE, SIG_DFL)
+
+# parse command line arguments
+parser = argparse.ArgumentParser(description=tool_description,
+                                 epilog=epilog,
+                                 formatter_class=argparse.RawDescriptionHelpFormatter)
+# positional arguments
+parser.add_argument(
+    "infile",
+    help="Path to fasta input file.")
+# optional arguments
+parser.add_argument(
+    "-o", "--outfile",
+    help="Write results to this file.")
+parser.add_argument(
+    "-v", "--verbose",
+    help="Be verbose.",
+    action="store_true")
+parser.add_argument(
+    "-d", "--debug",
+    help="Print lots of debugging information",
+    action="store_true")
+parser.add_argument(
+    '--version',
+    action='version',
+    version='0.1.0')
+
+
+def translate_nt_to_RY(seq):
+    """Translates nucleotides to RY (A,G -> R; C,U,T -> Y).
+
+    >>> translate_nt_to_RY("ACGUTACGUT")
+    RYRYYRYRYY
+    """
+    trans_table = maketrans("AGCUT", "RRYYY")
+    trans_seq = seq.translate(trans_table)
+    logging.debug(seq + " -> " + trans_seq)
+    return trans_seq
+
+
+def translate_nt_to_RY_iterator(robj):
+    """Translate SeqRecords sequences to RY alphabet."""
+    for record in robj:
+        record.seq = Seq(translate_nt_to_RY(str(record.seq)),
+                         IUPAC.unambiguous_dna)
+        yield record
+
+# handle arguments
+args = parser.parse_args()
+if args.debug:
+    logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
+elif args.verbose:
+    logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
+else:
+    logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
+logging.info("Parsed arguments:")
+if args.outfile:
+    logging.info("  outfile: enabled writing to file")
+    logging.info("  outfile: '{}'".format(args.outfile))
+logging.info("  outfile: '{}'".format(args.outfile))
+logging.info("")
+
+# get input iterator
+input_handle = open(args.infile, "rU")
+input_seq_iterator = SeqIO.parse(input_handle, "fasta")
+convert_seq_iterator = translate_nt_to_RY_iterator(input_seq_iterator)
+output_handle = (open(args.outfile, "w") if args.outfile is not None else stdout)
+SeqIO.write(convert_seq_iterator, output_handle, "fasta")
+output_handle.close()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/convert_bc_to_binary_RY.xml	Thu Oct 22 10:26:45 2015 -0400
@@ -0,0 +1,39 @@
+<tool id="convert_bc_to_binary_RY.py" name="convert_bc_to_binary_RY.py" version="0.1.0">
+  <description>Convert to binary barcodes.</description>
+  <macros>
+    <import>macros.xml</import>
+  </macros>
+  <expand macro="requirements" />
+  <expand macro="stdio" />
+  <version_command>python convert_bc_to_binary_RY.py --version</version_command>
+  <command interpreter="python"><![CDATA[
+convert_bc_to_binary_RY.py
+#if $positional_1 and $positional_1 is not None:
+$positional_1
+#end if
+> $default]]></command>
+  <inputs>
+    <param label="Fasta file to convert." name="positional_1" type="data" format="fasta"/>
+  </inputs>
+  <outputs>
+    <data hidden="false" name="default" format="fasta"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="positional_1" value="result.fa"/>
+      <output name="default" file="converted_bcs.fa"/>
+    </test>
+  </tests>
+  <help><![CDATA[
+Convert standard nucleotides to IUPAC nucleotide codes used for binary barcodes.
+
+A and G are converted to nucleotide code R. T, U and C are converted to Y.
+
+Author: Daniel Maticzka
+Copyright: 2015
+License: Apache
+Email: maticzkd@informatik.uni-freiburg.de
+Status: Testing
+]]></help>
+  <expand macro="citations" />
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/coords2clnt.py	Thu Oct 22 10:26:45 2015 -0400
@@ -0,0 +1,92 @@
+#!/usr/bin/env python
+
+tool_description = """
+Given coordinates of the aligned reads, calculate positions of the crosslinked
+nucleotides. Crosslinked nts are assumed to be one nt upstream of the 5'-end of
+the read.
+
+By default output is written to stdout.
+
+Input:
+* bed6 file containing coordinates of aligned reads
+* bed6 file containing coordinates of crosslinking events
+
+Example usage:
+- convert read coordinates from file in.bed to coordinates of the crosslinking
+  events, written to out.bed:
+coords2clnt.py in.bed --outfile out.bed
+"""
+
+epilog = """
+Author: Daniel Maticzka
+Copyright: 2015
+License: Apache
+Email: maticzkd@informatik.uni-freiburg.de
+Status: Testing
+"""
+
+import argparse
+import logging
+from sys import stdout
+from pybedtools import BedTool
+from pybedtools.featurefuncs import five_prime
+# avoid ugly python IOError when stdout output is piped into another program
+# and then truncated (such as piping to head)
+from signal import signal, SIGPIPE, SIG_DFL
+signal(SIGPIPE, SIG_DFL)
+
+# parse command line arguments
+parser = argparse.ArgumentParser(description=tool_description,
+                                 epilog=epilog,
+                                 formatter_class=argparse.RawDescriptionHelpFormatter)
+# positional arguments
+parser.add_argument(
+    "infile",
+    help="Path to bed input file.")
+# optional arguments
+parser.add_argument(
+    "-o", "--outfile",
+    help="Write results to this file.")
+parser.add_argument(
+    "-v", "--verbose",
+    help="Be verbose.",
+    action="store_true")
+parser.add_argument(
+    "-d", "--debug",
+    help="Print lots of debugging information",
+    action="store_true")
+parser.add_argument(
+    '--version',
+    action='version',
+    version='0.1.0')
+
+
+# handle arguments
+args = parser.parse_args()
+if args.debug:
+    logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
+elif args.verbose:
+    logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
+else:
+    logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
+logging.info("Parsed arguments:")
+if args.outfile:
+    logging.info("  outfile: enabled writing to file")
+    logging.info("  outfile: '{}'".format(args.outfile))
+logging.info("  outfile: '{}'".format(args.outfile))
+logging.info("")
+
+# data processing
+alns = BedTool(args.infile)
+clnts = alns.each(five_prime, upstream=1, downstream=0)
+
+# write to file or to stdout
+if args.outfile:
+    clnts.saveas(args.outfile)
+else:
+    tmptool = clnts.saveas()
+    logging.debug("results written to temporary file :" + tmptool.fn)
+    tmp = open(tmptool.fn)
+    for line in tmp:
+        stdout.write(line)
+    tmp.close()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/coords2clnt.xml	Thu Oct 22 10:26:45 2015 -0400
@@ -0,0 +1,45 @@
+<tool id="coords2clnt.py" name="coords2clnt.py" version="1.0">
+  <description>Extract crosslinked nucleotide.</description>
+  <macros>
+    <import>macros.xml</import>
+  </macros>
+  <expand macro="requirements" />
+  <expand macro="stdio" />
+  <stdio>
+    <exit_code level="fatal" range="1:"/>
+  </stdio>
+  <version_command>python coords2clnt.py --version</version_command>
+  <command interpreter="python"><![CDATA[coords2clnt.py
+#if $positional_1 and $positional_1 is not None:
+$positional_1
+#end if
+
+> $default]]></command>
+  <inputs>
+    <param area="false" label="Alignments in bed format." name="positional_1" type="data" format="bed"/>
+  </inputs>
+  <outputs>
+    <data hidden="false" name="default" format="bed"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="positional_1" value="merged_pcr_dupes.bed"/>
+      <output name="default" file="merged_pcr_dupes_clnts.bed"/>
+    </test>
+  </tests>
+  <help><![CDATA[
+Given coordinates of the aligned reads, calculate positions of the crosslinked nucleotides.
+Crosslinked nts are assumed to be one nt upstream of the 5'-end of the read.
+
+Input:
+* bed6 file containing coordinates of aligned reads
+* bed6 file containing coordinates of crosslinking events
+
+Author: Daniel Maticzka
+Copyright: 2015
+License: Apache
+Email: maticzkd@informatik.uni-freiburg.de
+Status: Testing
+]]></help>
+  <expand macro="citations" />
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/extract_aln_ends.py	Thu Oct 22 10:26:45 2015 -0400
@@ -0,0 +1,140 @@
+#!/usr/bin/env python
+
+tool_description = """
+Extract alignment ends from sam file.
+
+The resulting bed file contains the outer coordinates of the alignments. The
+bed name field is set to the read id and the score field is set to the edit
+distance of the alignment. The crosslinked nucleotide is one nt upstream of the
+5'-end of the bed entries.
+
+This script only reports results for alignments that are properly aligned in FR
+("forward-reverse") direction.
+
+By default output is written to stdout.
+
+Input:
+* sam file containing alignments (paired-end sequencing)
+
+Output:
+* bed6 file containing outer coordinates (sorted by read id)
+
+Example usage:
+- Extract coordinates from file input.bam and write to file output.bed
+extract_aln_ends.py input.bam --out output.bed
+"""
+
+epilog = """
+Author: Daniel Maticzka
+Copyright: 2015
+License: Apache
+Email: maticzkd@informatik.uni-freiburg.de
+Status: Development
+"""
+
+import argparse
+import logging
+from sys import stdout
+from shutil import rmtree
+from tempfile import mkdtemp
+from pybedtools import BedTool
+import pysam
+
+
+class DefaultsRawDescriptionHelpFormatter(argparse.ArgumentDefaultsHelpFormatter,
+                                          argparse.RawDescriptionHelpFormatter):
+    # To join the behaviour of RawDescriptionHelpFormatter with that of ArgumentDefaultsHelpFormatter
+    pass
+
+# avoid ugly python IOError when stdout output is piped into another program
+# and then truncated (such as piping to head)
+from signal import signal, SIGPIPE, SIG_DFL
+signal(SIGPIPE, SIG_DFL)
+
+# parse command line arguments
+parser = argparse.ArgumentParser(description=tool_description,
+                                 epilog=epilog,
+                                 formatter_class=DefaultsRawDescriptionHelpFormatter)
+# positional arguments
+parser.add_argument(
+    "sam",
+    help="Path to sam file containing alignments.")
+# optional arguments
+parser.add_argument(
+    "-o", "--outfile",
+    help="Write results to this file.")
+# misc arguments
+parser.add_argument(
+    "-v", "--verbose",
+    help="Be verbose.",
+    action="store_true")
+parser.add_argument(
+    "-d", "--debug",
+    help="Print lots of debugging information",
+    action="store_true")
+parser.add_argument(
+    '--version',
+    action='version',
+    version='0.1.0')
+
+args = parser.parse_args()
+
+if args.debug:
+    logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
+elif args.verbose:
+    logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
+else:
+    logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
+logging.info("Parsed arguments:")
+logging.info("  sam: '{}'".format(args.sam))
+if args.outfile:
+    logging.info("  outfile: enabled writing to file")
+    logging.info("  outfile: '{}'".format(args.outfile))
+logging.info("")
+
+# convert to bam for use with pybedtools
+try:
+    # setup temporary directory
+    tmpdir = mkdtemp()
+    logging.debug("tmpdir: " + tmpdir)
+
+    fn_sorted = tmpdir + "/sorted.bam"
+    fn_fixedmates = tmpdir + "/fixedmates.bam"
+
+    # sort by id
+    logging.debug("calling samtools sort")
+    pysam.sort(args.sam, "-n", "-o{}".format(fn_sorted), "-T sortprefix")
+
+    # fix mate information
+    # also removes secondary and unmapped reads
+    logging.debug("calling samtools fixmates")
+    pysam.fixmate("-r", fn_sorted, fn_fixedmates)
+
+    # bedtools bam2bed
+    alns = BedTool(fn_fixedmates)
+    alns_bedpe = alns.bam_to_bed(bedpe=True, mate1=True, ed=True)
+
+    # determine alignment ends and write to file
+    with (open(args.outfile, "w") if args.outfile is not None else stdout) as out:
+        for i in alns_bedpe:
+            chrom = i.fields[0]
+            fmstart = i.fields[1]
+            fmend = i.fields[2]
+            smstart = i.fields[4]
+            smend = i.fields[5]
+            readid = i.fields[6]
+            score = i.fields[7]
+            fmstrand = i.fields[8]
+            if fmstrand == "+":
+                start = fmstart
+                end = smend
+            elif fmstrand == "-":
+                start = smstart
+                end = fmend
+            else:
+                logging.warning("Skipping {}, strand information is missing: '{}'".format(readid, i))
+                continue
+            out.write("\t".join([chrom, start, end, readid, score, fmstrand]) + "\n")
+finally:
+    logging.debug("removed tmpdir: " + tmpdir)
+    rmtree(tmpdir)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/extract_aln_ends.xml	Thu Oct 22 10:26:45 2015 -0400
@@ -0,0 +1,52 @@
+<tool id="extract_aln_ends.py" name="extract_aln_ends.py" version="0.1.0">
+  <description>Extract alignment ends from sam file.</description>
+  <macros>
+    <import>macros.xml</import>
+  </macros>
+  <expand macro="requirements" />
+  <expand macro="stdio" />
+  <version_command>python extract_aln_ends.py --version</version_command>
+  <command interpreter="python"><![CDATA[
+extract_aln_ends.py
+#if $positional_1 and $positional_1 is not None:
+$positional_1
+#end if
+
+> $default]]></command>
+  <inputs>
+    <param area="false" label="Sam input." name="positional_1" type="data" format="sam"/>
+  </inputs>
+  <outputs>
+    <data format="bed" hidden="false" name="default"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="positional_1" value="twomates.sam"/>
+      <output name="default" file="tworeads_aln_ends.bed"/>
+    </test>
+  </tests>
+  <help><![CDATA[
+Extract alignment ends from sam file.
+
+The resulting bed file contains the outer coordinates of the alignments. The
+bed name field is set to the read id and the score field is set to the edit
+distance of the alignment. The crosslinked nucleotide is one nt upstream of the
+5'-end of the bed entries.
+
+This tool only reports results for alignments that are properly aligned in FR
+("forward-reverse") direction.
+
+Input:
+* sam file containing alignments (paired-end sequencing)
+
+Output:
+* bed6 file containing outer coordinates (sorted by read id)
+
+Author: Daniel Maticzka
+Copyright: 2015
+License: Apache
+Email: maticzkd@informatik.uni-freiburg.de
+Status: Development
+]]></help>
+  <expand macro="citations" />
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/extract_bcs.py	Thu Oct 22 10:26:45 2015 -0400
@@ -0,0 +1,166 @@
+#!/usr/bin/env python
+
+tool_description = """
+Exract barcodes from a FASTQ file according to a user-specified pattern.
+
+By default output is written to stdout.
+
+Example usage:
+- move nucleotides at positions 1-3 and 6-7 to FASTQ header and write to file
+  output.fastq:
+fastq_extract_barcodes.py barcoded_input.fastq XXXNNXX --out output.fastq
+"""
+
+epilog = """
+Author: Daniel Maticzka
+Copyright: 2015
+License: Apache
+Email: maticzkd@informatik.uni-freiburg.de
+Status: Testing
+"""
+
+import argparse
+import logging
+import re
+from sys import stdout
+from Bio.SeqIO.QualityIO import FastqGeneralIterator
+
+# avoid ugly python IOError when stdout output is piped into another program
+# and then truncated (such as piping to head)
+from signal import signal, SIGPIPE, SIG_DFL
+signal(SIGPIPE, SIG_DFL)
+
+# parse command line arguments
+parser = argparse.ArgumentParser(description=tool_description,
+                                 epilog=epilog,
+                                 formatter_class=argparse.RawDescriptionHelpFormatter)
+# positional arguments
+parser.add_argument(
+    "infile",
+    help="Path to fastq file.")
+parser.add_argument(
+    "pattern",
+    help="Pattern of barcode nucleotides starting at 5'-end. X positions will be moved to the header, N positions will be kept.")
+# optional arguments
+parser.add_argument(
+    "-o", "--outfile",
+    help="Write results to this file.")
+parser.add_argument(
+    "-b", "--bcs",
+    dest="out_bc_fasta",
+    help="If set, barcodes are written to this file in FASTA format.")
+parser.add_argument(
+    "-v", "--verbose",
+    help="Be verbose.",
+    action="store_true")
+parser.add_argument(
+    "-d", "--debug",
+    help="Print lots of debugging information",
+    action="store_true")
+parser.add_argument(
+    '--version',
+    action='version',
+    version='0.1.0')
+
+args = parser.parse_args()
+if args.debug:
+    logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
+elif args.verbose:
+    logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
+else:
+    logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
+logging.info("Parsed arguments:")
+logging.info("  infile: '{}'".format(args.infile))
+logging.info("  pattern: '{}'".format(args.pattern))
+if args.outfile:
+    logging.info("  outfile: enabled writing to file")
+    logging.info("  outfile: '{}'".format(args.outfile))
+if args.out_bc_fasta:
+    logging.info("  bcs: enabled writing barcodes to fasta file")
+    logging.info("  bcs: {}".format(args.out_bc_fasta))
+logging.info("")
+
+# check if supplied pattern is valid
+valid_pattern = re.compile("^[XN]+$")
+pattern_match = valid_pattern.match(args.pattern)
+if pattern_match is None:
+    raise ValueError("Error: supplied pattern '{}' is not valid.".format(args.pattern))
+
+# check if at least one barcode position is included in the pattern
+has_bcpos_pattern = re.compile("X")
+pattern_match = has_bcpos_pattern.search(args.pattern)
+if pattern_match is None:
+    raise ValueError("Error: supplied pattern '{}' does not contain a barcode position 'X'.".format(args.pattern))
+
+logging.info("Barcode pattern analysis:")
+# get X positions of pattern string
+barcode_nt_pattern = re.compile("X+")
+barcode_positions = []
+for m in re.finditer(barcode_nt_pattern, args.pattern):
+    logging.info('  found barcode positions in pattern: %02d-%02d: %s' % (m.start(), m.end(), m.group(0)))
+    barcode_positions.append((m.start(), m.end()))
+logging.info("  barcode positions: {}".format(barcode_positions))
+# get last position of a barcode nt in the pattern
+# reads must be long enough for all
+min_readlen = barcode_positions[-1][-1]
+logging.info("  last position of a barcode nt in pattern: {}".format(min_readlen))
+logging.info("")
+
+# get coordinates of nucleotides to keep
+# the tail after the last barcode nt is handled separately
+seq_positions = []
+last_seq_start = 0
+for bcstart, bcstop in barcode_positions:
+    seq_positions.append((last_seq_start, bcstart))
+    last_seq_start = bcstop
+logging.info("  sequence positions: {}".format(seq_positions))
+logging.info("  start of sequence tail: {}".format(last_seq_start))
+
+samout = (open(args.outfile, "w") if args.outfile is not None else stdout)
+if args.out_bc_fasta is not None:
+    faout = open(args.out_bc_fasta, "w")
+for header, seq, qual in FastqGeneralIterator(open(args.infile)):
+
+    # skip reads that are too short to extract the full requested barcode
+    if len(seq) < min_readlen:
+        logging.warning("skipping read '{}', is too short to extract the full requested barcode".format(header))
+        logging.debug("seq: {}".format(seq))
+        logging.debug("len(seq): {}".format(len(seq)))
+        continue
+
+    # extract barcode nucleotides
+    barcode_list = []
+    for bcstart, bcstop in barcode_positions:
+        barcode_list.append(seq[bcstart:bcstop])
+    barcode = "".join(barcode_list)
+    logging.debug("extracted barcode: {}".format(barcode))
+
+    # create new sequence and quality string without barcode nucleotides
+    new_seq_list = []
+    new_qual_list = []
+    for seqstart, seqstop in seq_positions:
+        new_seq_list.append(seq[seqstart:seqstop])
+        new_qual_list.append(qual[seqstart:seqstop])
+    new_seq_list.append(seq[last_seq_start:])
+    new_qual_list.append(qual[last_seq_start:])
+    new_seq = "".join(new_seq_list)
+    new_qual = "".join(new_qual_list)
+    # check if at least one nucleotide is left. having none would break fastq
+    if len(new_seq) == 0:
+        logging.warning("skipping read '{}', no sequence remains after barcode extraction".format(header))
+        logging.debug("seq: {}".format(seq))
+        logging.debug("len(seq): {}".format(len(seq)))
+        continue
+
+    # write barcode nucleotides into header
+    annotated_header = header + " " + barcode
+    samout.write("@%s\n%s\n+\n%s\n" % (annotated_header, new_seq, new_qual))
+
+    # write barcode to fasta if requested
+    if args.out_bc_fasta is not None:
+        faout.write(">{}\n{}\n".format(header, barcode))
+
+# close files
+samout.close()
+if args.out_bc_fasta is not None:
+    faout.close()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/extract_bcs.xml	Thu Oct 22 10:26:45 2015 -0400
@@ -0,0 +1,44 @@
+<tool id="extract_bcs.py" name="extract_bcs.py" version="1.0">
+  <description>Extract barcodes using pattern.</description>
+  <macros>
+    <import>macros.xml</import>
+  </macros>
+  <expand macro="requirements" />
+  <expand macro="stdio" />
+  <version_command>python extract_bcs.py --version</version_command>
+  <command interpreter="python"><![CDATA[
+extract_bcs.py
+#if $positional_1 and $positional_1 is not None:
+$positional_1
+#end if
+
+#if $positional_2 and $positional_2 is not None:
+$positional_2
+#end if
+
+> $default]]></command>
+  <inputs>
+    <param area="false" label="Barcoded sequences." name="positional_1" type="data" format="fastq"/>
+    <param area="false" label="Pattern of barcode nucleotides starting at 5'-end. X positions will be moved to the header, N positions will be kept." name="positional_2" type="text"/>
+  </inputs>
+  <outputs>
+    <data hidden="false" name="default" format="fastq" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="positional_1" value="reads.fastq"/>
+      <param name="positional_2" value="XXXNNXXX"/>
+      <output name="default" file="result.fastq"/>
+    </test>
+  </tests>
+  <help><![CDATA[
+Exract barcodes from a FASTQ file according to a user-specified pattern.
+
+Author: Daniel Maticzka
+Copyright: 2015
+License: Apache
+Email: maticzkd@informatik.uni-freiburg.de
+Status: Testing
+]]></help>
+  <expand macro="citations" />
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Thu Oct 22 10:26:45 2015 -0400
@@ -0,0 +1,17 @@
+<macros>
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="0.1.0">bctools</requirement>
+        </requirements>
+    </xml>
+    <xml name="stdio">
+        <stdio>
+          <exit_code level="fatal" range="1:"/>
+        </stdio>
+    </xml>
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.1016/j.molcel.2013.07.001</citation>
+        </citations>
+    </xml>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/merge_pcr_duplicates.py	Thu Oct 22 10:26:45 2015 -0400
@@ -0,0 +1,144 @@
+#!/usr/bin/env python
+
+tool_description = """
+Merge PCR duplicates according to random barcode library.
+
+Barcodes containing uncalled base 'N' are removed. By default output is written
+to stdout.
+
+Input:
+* bed6 file containing alignments with fastq read-id in name field
+* fasta library with fastq read-id as sequence ids
+
+Output:
+* bed6 file with random barcode in name field and number of PCR duplicates as
+  score, sorted by fields chrom, start, stop, strand, name
+
+Example usage:
+- read PCR duplicates from file duplicates.bed and write merged results to file
+  merged.bed:
+merge_pcr_duplicates.py duplicates.bed bclibrary.fa --out merged.bed
+"""
+
+epilog = """
+Author: Daniel Maticzka
+Copyright: 2015
+License: Apache
+Email: maticzkd@informatik.uni-freiburg.de
+Status: Testing
+"""
+
+import argparse
+import logging
+from sys import stdout
+from Bio import SeqIO
+import pandas as pd
+
+# avoid ugly python IOError when stdout output is piped into another program
+# and then truncated (such as piping to head)
+from signal import signal, SIGPIPE, SIG_DFL
+signal(SIGPIPE, SIG_DFL)
+
+
+def fasta_tuple_generator(fasta_iterator):
+    "Yields id, sequence tuples given an iterator over Biopython SeqIO objects."
+    for record in input_seq_iterator:
+        yield (record.id, str(record.seq))
+
+
+# parse command line arguments
+parser = argparse.ArgumentParser(description=tool_description,
+                                 epilog=epilog,
+                                 formatter_class=argparse.RawDescriptionHelpFormatter)
+# positional arguments
+parser.add_argument(
+    "alignments",
+    help="Path to bed6 file containing alignments.")
+parser.add_argument(
+    "bclib",
+    help="Path to fasta barcode library.")
+# optional arguments
+parser.add_argument(
+    "-o", "--outfile",
+    help="Write results to this file.")
+# misc arguments
+parser.add_argument(
+    "-v", "--verbose",
+    help="Be verbose.",
+    action="store_true")
+parser.add_argument(
+    "-d", "--debug",
+    help="Print lots of debugging information",
+    action="store_true")
+parser.add_argument(
+    '--version',
+    action='version',
+    version='0.1.0')
+
+args = parser.parse_args()
+
+if args.debug:
+    logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
+elif args.verbose:
+    logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
+else:
+    logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
+logging.info("Parsed arguments:")
+logging.info("  alignments: '{}'".format(args.alignments))
+logging.info("  bclib: '{}'".format(args.bclib))
+if args.outfile:
+    logging.info("  outfile: enabled writing to file")
+    logging.info("  outfile: '{}'".format(args.outfile))
+logging.info("")
+
+# load barcode library into dictionary
+input_handle = open(args.bclib, "rU")
+input_seq_iterator = SeqIO.parse(input_handle, "fasta")
+bcs = pd.DataFrame.from_records(
+    data=fasta_tuple_generator(input_seq_iterator),
+    columns=["read_id", "bc"])
+
+# load alignments
+alns = pd.read_csv(
+    args.alignments,
+    sep="\t",
+    names=["chrom", "start", "stop", "read_id", "score", "strand"])
+
+# combine barcode library and alignments
+bcalib = pd.merge(
+    bcs, alns,
+    on="read_id",
+    how="inner",
+    sort=False)
+if bcalib.empty:
+    raise Exception("ERROR: no common entries for alignments and barcode library found. Please check your input files.")
+n_alns = len(alns.index)
+n_bcalib = len(bcalib.index)
+if n_bcalib < n_alns:
+    logging.warning(
+        "{} of {} alignments could not be associated with a random barcode.".format(
+            n_alns - n_bcalib, n_alns))
+
+# remove entries with barcodes that has uncalled base N
+bcalib_cleaned = bcalib.drop(bcalib[bcalib.bc.str.contains("N")].index)
+n_bcalib_cleaned = len(bcalib_cleaned)
+if n_bcalib_cleaned < n_bcalib:
+    msg = "{} of {} alignments had random barcodes containing uncalled bases and were dropped.".format(
+        n_bcalib - n_bcalib_cleaned, n_bcalib)
+    if n_bcalib_cleaned < (0.8 * n_bcalib):
+        logging.warning(msg)
+    else:
+        logging.info(msg)
+
+# count and merge pcr duplicates
+# grouping sorts by keys, so the ouput will be properly sorted
+merged = bcalib_cleaned.groupby(['chrom', 'start', 'stop', 'strand', 'bc']).size().reset_index()
+merged.rename(columns={0: 'ndupes'}, copy=False, inplace=True)
+
+# write coordinates of crosslinking event alignments
+eventalnout = (open(args.outfile, "w") if args.outfile is not None else stdout)
+merged.to_csv(
+    eventalnout,
+    columns=['chrom', 'start', 'stop', 'bc', 'ndupes', 'strand'],
+    sep="\t", index=False, header=False)
+eventalnout.close()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/merge_pcr_duplicates.xml	Thu Oct 22 10:26:45 2015 -0400
@@ -0,0 +1,52 @@
+<tool id="merge_pcr_duplicates.py" name="merge_pcr_duplicates.py" version="1.0">
+  <description>Merge PCR duplicates according to random barcode library.</description>
+  <macros>
+    <import>macros.xml</import>
+  </macros>
+  <expand macro="requirements" />
+  <expand macro="stdio" />
+  <version_command>python merge_pcr_duplicates.py --version</version_command>
+  <command interpreter="python"><![CDATA[merge_pcr_duplicates.py
+#if $positional_1 and $positional_1 is not None:
+$positional_1
+#end if
+
+#if $positional_2 and $positional_2 is not None:
+$positional_2
+#end if
+
+> $default]]></command>
+  <inputs>
+    <param area="false" label="bed6 file containing alignments." name="positional_1" type="data" format="bed"/>
+    <param area="false" label="fasta barcode library." name="positional_2" type="data" format="fasta"/>
+  </inputs>
+  <outputs>
+    <data format="bed" hidden="false" name="default"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="positional_1" value="pcr_dupes_sorted_2.bed"/>
+      <param name="positional_2" value="pcr_dupes_randomdict.fa"/>
+      <output name="default" file="merged_pcr_dupes.bed"/>
+    </test>
+  </tests>
+  <help><![CDATA[
+Merge PCR duplicates according to random barcode library.
+
+Barcodes containing uncalled base 'N' are removed.
+
+Input:
+* bed6 file containing alignments with fastq read-id in name field
+* fasta library with fastq read-id as sequence ids
+
+Output:
+* bed6 file with random barcode in name field and number of PCR duplicates as score, sorted by fields chrom, start, stop, strand, name
+
+Author: Daniel Maticzka
+Copyright: 2015
+License: Apache
+Email: maticzkd@informatik.uni-freiburg.de
+Status: Testing
+]]></help>
+  <expand macro="citations" />
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/remove_tail.py	Thu Oct 22 10:26:45 2015 -0400
@@ -0,0 +1,95 @@
+#!/usr/bin/env python
+
+tool_description = """
+Remove a certain number of nucleotides from the 3'-tails of sequences in fastq
+format.
+
+Example usage:
+- remove the last 7 nucleotides from file input.fastq, write result to file
+  output.fastq:
+remove_tail.py input.fastq 7 --out output.fastq
+"""
+
+epilog = """
+Author: Daniel Maticzka
+Copyright: 2015
+License: Apache
+Email: maticzkd@informatik.uni-freiburg.de
+Status: Testing
+"""
+
+import argparse
+import logging
+from sys import stdout
+from Bio.SeqIO.QualityIO import FastqGeneralIterator
+
+# avoid ugly python IOError when stdout output is piped into another program
+# and then truncated (such as piping to head)
+from signal import signal, SIGPIPE, SIG_DFL
+signal(SIGPIPE, SIG_DFL)
+
+# parse command line arguments
+parser = argparse.ArgumentParser(description=tool_description,
+                                 epilog=epilog,
+                                 formatter_class=argparse.RawDescriptionHelpFormatter)
+# positional arguments
+parser.add_argument(
+    "infile",
+    help="Path to fastq file.")
+parser.add_argument(
+    "length",
+    type=int,
+    help="Remove this many nts.")
+# optional arguments
+parser.add_argument(
+    "-o", "--outfile",
+    help="Write results to this file.")
+parser.add_argument(
+    "-v", "--verbose",
+    help="Be verbose.",
+    action="store_true")
+parser.add_argument(
+    "-d", "--debug",
+    help="Print lots of debugging information",
+    action="store_true")
+parser.add_argument(
+    '--version',
+    action='version',
+    version='0.1.0')
+
+args = parser.parse_args()
+if args.debug:
+    logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
+elif args.verbose:
+    logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
+else:
+    logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
+logging.info("Parsed arguments:")
+logging.info("  infile: '{}'".format(args.infile))
+logging.info("  length: '{}'".format(args.length))
+if args.outfile:
+    logging.info("  outfile: enabled writing to file")
+    logging.info("  outfile: '{}'".format(args.outfile))
+logging.info("")
+
+# check length parameter
+if args.length < 0:
+    raise ValueError("Length must be a positive integer, is '{}'.".format(args.length))
+
+# remove tail
+with (open(args.outfile, "w") if args.outfile is not None else stdout) as samout:
+    for header, seq, qual in FastqGeneralIterator(open(args.infile)):
+
+        # if removing tail would lead to an empty sequence,
+        # set sequence to a single N to keep fastq synchronized
+        if len(seq) <= args.length:
+            seq = "N"
+            qual = "B"
+            logging.debug("read '{}' was too short to remove full tail".format(header))
+            logging.debug("seq: {}".format(seq))
+            logging.debug("len(seq): {}".format(len(seq)))
+        else:
+            seq = seq[0:-args.length]
+            qual = qual[0:-args.length]
+
+        samout.write("@%s\n%s\n+\n%s\n" % (header, seq, qual))
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/remove_tail.xml	Thu Oct 22 10:26:45 2015 -0400
@@ -0,0 +1,43 @@
+<tool id="remove_tail.py" name="remove_tail.py" version="1.0">
+  <description>Remove nts from 3'-end.</description>
+  <macros>
+    <import>macros.xml</import>
+  </macros>
+  <expand macro="requirements" />
+  <expand macro="stdio" />
+  <version_command>python remove_tail.py --version</version_command>
+  <command interpreter="python"><![CDATA[remove_tail.py
+#if $positional_1 and $positional_1 is not None:
+$positional_1
+#end if
+
+#if $positional_2 and $positional_2 is not None:
+$positional_2
+#end if
+
+> $default]]></command>
+  <inputs>
+    <param area="false" label="Fastq file." name="positional_1" type="data" format="fastq"/>
+    <param label="Remove this many nts." name="positional_2" type="integer" value="0"/>
+  </inputs>
+  <outputs>
+    <data format="fastq" hidden="false" name="default"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="positional_1" value="readswithtail.fastq"/>
+      <param name="positional_2" value="7"/>
+      <output name="default" file="readswithtailremoved.fastq"/>
+    </test>
+  </tests>
+  <help><![CDATA[
+Remove a certain number of nucleotides from the 3'-tails of sequences in fastq format.
+
+Author: Daniel Maticzka
+Copyright: 2015
+License: Apache
+Email: maticzkd@informatik.uni-freiburg.de
+Status: Testing
+]]></help>
+  <expand macro="citations" />
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rm_spurious_events.py	Thu Oct 22 10:26:45 2015 -0400
@@ -0,0 +1,118 @@
+#!/usr/bin/env python
+
+tool_description = """
+Remove spurious events originating from errors in random sequence tags.
+
+This script compares all events sharing the same coordinates. Among each group
+of events the maximum number of PCR duplicates is determined. All events that
+are supported by less than 10 percent of this maximum count are removed.
+
+By default output is written to stdout.
+
+Input:
+* bed6 file containing crosslinking events with score field set to number of PCR
+  duplicates
+
+Output:
+* bed6 file with spurious crosslinking events removed, sorted by fields chrom,
+  start, stop, strand
+
+Example usage:
+- remove spurious events from spurious.bed and write results to file cleaned.bed
+rm_spurious_events.py spurious.bed --out cleaned.bed
+"""
+
+epilog = """
+Author: Daniel Maticzka
+Copyright: 2015
+License: Apache
+Email: maticzkd@informatik.uni-freiburg.de
+Status: Testing
+"""
+
+import argparse
+import logging
+from sys import stdout
+import pandas as pd
+
+
+class DefaultsRawDescriptionHelpFormatter(argparse.ArgumentDefaultsHelpFormatter,
+                                          argparse.RawDescriptionHelpFormatter):
+    # To join the behaviour of RawDescriptionHelpFormatter with that of ArgumentDefaultsHelpFormatter
+    pass
+
+# avoid ugly python IOError when stdout output is piped into another program
+# and then truncated (such as piping to head)
+from signal import signal, SIGPIPE, SIG_DFL
+signal(SIGPIPE, SIG_DFL)
+
+# parse command line arguments
+parser = argparse.ArgumentParser(description=tool_description,
+                                 epilog=epilog,
+                                 formatter_class=DefaultsRawDescriptionHelpFormatter)
+# positional arguments
+parser.add_argument(
+    "events",
+    help="Path to bed6 file containing alignments.")
+# optional arguments
+parser.add_argument(
+    "-o", "--outfile",
+    help="Write results to this file.")
+parser.add_argument(
+    "-t", "--threshold",
+    type=float,
+    default=0.1,
+    help="Threshold for spurious event removal."
+)
+# misc arguments
+parser.add_argument(
+    "-v", "--verbose",
+    help="Be verbose.",
+    action="store_true")
+parser.add_argument(
+    "-d", "--debug",
+    help="Print lots of debugging information",
+    action="store_true")
+parser.add_argument(
+    '--version',
+    action='version',
+    version='0.1.0')
+
+args = parser.parse_args()
+
+if args.debug:
+    logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
+elif args.verbose:
+    logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
+else:
+    logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
+logging.info("Parsed arguments:")
+logging.info("  alignments: '{}'".format(args.events))
+logging.info("  threshold: '{}'".format(args.threshold))
+if args.outfile:
+    logging.info("  outfile: enabled writing to file")
+    logging.info("  outfile: '{}'".format(args.outfile))
+logging.info("")
+
+# check threshold parameter value
+if args.threshold < 0 or args.threshold > 1:
+    raise ValueError("Threshold must be in [0,1].")
+
+# load alignments
+alns = pd.read_csv(
+    args.events,
+    sep="\t",
+    names=["chrom", "start", "stop", "read_id", "score", "strand"])
+
+# remove all alignments that not enough PCR duplicates with respect to
+# the group maximum
+grouped = alns.groupby(['chrom', 'start', 'stop', 'strand'], group_keys=False)
+alns_cleaned = grouped.apply(lambda g: g[g["score"] >= args.threshold * g["score"].max()])
+
+# write coordinates of crosslinking event alignments
+alns_cleaned_out = (open(args.outfile, "w") if args.outfile is not None else stdout)
+alns_cleaned.to_csv(
+    alns_cleaned_out,
+    columns=['chrom', 'start', 'stop', 'read_id', 'score', 'strand'],
+    sep="\t", index=False, header=False)
+alns_cleaned_out.close()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rm_spurious_events.xml	Thu Oct 22 10:26:45 2015 -0400
@@ -0,0 +1,54 @@
+<tool id="rm_spurious_events.py" name="rm_spurious_events.py" version="1.0">
+  <description>Remove spurious events.</description>
+  <macros>
+    <import>macros.xml</import>
+  </macros>
+  <expand macro="requirements" />
+  <expand macro="stdio" />
+  <version_command>python rm_spurious_events.py --version</version_command>
+  <command interpreter="python"><![CDATA[rm_spurious_events.py
+#if $positional_1 and $positional_1 is not None:
+$positional_1
+#end if
+
+#if $threshold and $threshold is not None:
+--threshold $threshold
+#end if
+> $default]]></command>
+  <inputs>
+    <param area="false" label="bed6 file containing alignments." name="positional_1" type="data" format="bed"/>
+    <param help="(--threshold)" label="Threshold for spurious event removal." name="threshold" optional="true" type="float" value="0.1"/>
+  </inputs>
+  <outputs>
+    <data format="bed" hidden="false" name="default"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="positional_1" value="merged_pcr_dupes_spurious.bed"/>
+      <param name="threshold" value="0.5"/>
+      <output name="default" file="merged_pcr_dupes_spurious_filtered_thresh05.bed"/>
+    </test>
+  </tests>
+  <help><![CDATA[
+Remove spurious events originating from errors in random sequence tags.
+
+This script compares all events sharing the same coordinates. Among each group
+of events the maximum number of PCR duplicates is determined. All events that
+are supported by less than 10 percent of this maximum count are removed.
+
+Input:
+* bed6 file containing crosslinking events with score field set to number of PCR
+  duplicates
+
+Output:
+* bed6 file with spurious crosslinking events removed, sorted by fields chrom,
+  start, stop, strand
+
+Author: Daniel Maticzka
+Copyright: 2015
+License: Apache
+Email: maticzkd@informatik.uni-freiburg.de
+Status: Testing
+]]></help>
+  <expand macro="citations" />
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Thu Oct 22 10:26:45 2015 -0400
@@ -0,0 +1,12 @@
+<?xml version="1.0"?>
+<tool_dependency>
+    <package name="biopython" version="1.65">
+        <repository changeset_revision="b3a791f6e3ba" name="package_biopython_1_65" owner="biopython" toolshed="https://testtoolshed.g2.bx.psu.edu" />
+    </package>
+    <package name="pandas" version="0.14">
+        <repository changeset_revision="045c4645abdf" name="package_pandas_0_14" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu" />
+    </package>
+    <package name="pybedtools" version="0.6.6">
+        <repository changeset_revision="372c85bed2ca" name="package_pybedtools_0_6_6" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu" />
+    </package>
+</tool_dependency>