# HG changeset patch
# User rnateam
# Date 1445524005 14400
# Node ID de4ea3aa1090ed0e9df5bbc30891f9f1ea21307e
# Parent ae0f58d3318faf4f553156ed21d954fbb3ecef92
Uploaded
diff -r ae0f58d3318f -r de4ea3aa1090 convert_bc_to_binary_RY.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/convert_bc_to_binary_RY.py Thu Oct 22 10:26:45 2015 -0400
@@ -0,0 +1,101 @@
+#!/usr/bin/env python
+
+tool_description = """
+Convert standard nucleotides to IUPAC nucleotide codes used for binary barcodes.
+
+A and G are converted to nucleotide code R. T, U and C are converted to Y. By
+default output is written to stdout.
+
+Example usage:
+- write converted sequences from file in.fa to file out.fa:
+convert_bc_to_binary_RY.py in.fa --outfile out.fa
+"""
+
+epilog = """
+Author: Daniel Maticzka
+Copyright: 2015
+License: Apache
+Email: maticzkd@informatik.uni-freiburg.de
+Status: Testing
+"""
+
+import argparse
+import logging
+from string import maketrans
+from sys import stdout
+from Bio import SeqIO
+from Bio.Seq import Seq
+from Bio.Alphabet import IUPAC
+
+# # avoid ugly python IOError when stdout output is piped into another program
+# # and then truncated (such as piping to head)
+# from signal import signal, SIGPIPE, SIG_DFL
+# signal(SIGPIPE, SIG_DFL)
+
+# parse command line arguments
+parser = argparse.ArgumentParser(description=tool_description,
+ epilog=epilog,
+ formatter_class=argparse.RawDescriptionHelpFormatter)
+# positional arguments
+parser.add_argument(
+ "infile",
+ help="Path to fasta input file.")
+# optional arguments
+parser.add_argument(
+ "-o", "--outfile",
+ help="Write results to this file.")
+parser.add_argument(
+ "-v", "--verbose",
+ help="Be verbose.",
+ action="store_true")
+parser.add_argument(
+ "-d", "--debug",
+ help="Print lots of debugging information",
+ action="store_true")
+parser.add_argument(
+ '--version',
+ action='version',
+ version='0.1.0')
+
+
+def translate_nt_to_RY(seq):
+ """Translates nucleotides to RY (A,G -> R; C,U,T -> Y).
+
+ >>> translate_nt_to_RY("ACGUTACGUT")
+    'RYRYYRYRYY'
+ """
+ trans_table = maketrans("AGCUT", "RRYYY")
+ trans_seq = seq.translate(trans_table)
+ logging.debug(seq + " -> " + trans_seq)
+ return trans_seq
+
+
+def translate_nt_to_RY_iterator(robj):
+ """Translate SeqRecords sequences to RY alphabet."""
+ for record in robj:
+ record.seq = Seq(translate_nt_to_RY(str(record.seq)),
+ IUPAC.unambiguous_dna)
+ yield record
+
+# handle arguments
+args = parser.parse_args()
+if args.debug:
+ logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
+elif args.verbose:
+ logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
+else:
+ logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
+logging.info("Parsed arguments:")
+logging.info("  infile: '{}'".format(args.infile))
+if args.outfile:
+    logging.info("  outfile: enabled writing to file")
+    logging.info("  outfile: '{}'".format(args.outfile))
+logging.info("")
+
+# get input iterator
+input_handle = open(args.infile, "rU")
+input_seq_iterator = SeqIO.parse(input_handle, "fasta")
+convert_seq_iterator = translate_nt_to_RY_iterator(input_seq_iterator)
+output_handle = (open(args.outfile, "w") if args.outfile is not None else stdout)
+SeqIO.write(convert_seq_iterator, output_handle, "fasta")
+output_handle.close()
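The conversion above is a plain string translation. A rough standalone sketch of the same mapping, written for Python 3 (where maketrans is a str method rather than a function in the string module), could look like this; the RY_TABLE name is illustrative and not part of the script:

# purines (A, G) -> R, pyrimidines (C, U, T) -> Y; other characters pass through
RY_TABLE = str.maketrans("AGCUT", "RRYYY")

def translate_nt_to_RY(seq):
    return seq.translate(RY_TABLE)

print(translate_nt_to_RY("ACGUTACGUT"))  # -> RYRYYRYRYY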
diff -r ae0f58d3318f -r de4ea3aa1090 convert_bc_to_binary_RY.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/convert_bc_to_binary_RY.xml Thu Oct 22 10:26:45 2015 -0400
@@ -0,0 +1,39 @@
+
+ Convert to binary barcodes.
+
+ macros.xml
+
+
+
+ python convert_bc_to_binary_RY.py --version
+ $default]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r ae0f58d3318f -r de4ea3aa1090 coords2clnt.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/coords2clnt.py Thu Oct 22 10:26:45 2015 -0400
@@ -0,0 +1,92 @@
+#!/usr/bin/env python
+
+tool_description = """
+Given coordinates of the aligned reads, calculate positions of the crosslinked
+nucleotides. Crosslinked nts are assumed to be one nt upstream of the 5'-end of
+the read.
+
+By default output is written to stdout.
+
+Input:
+* bed6 file containing coordinates of aligned reads
+
+Output:
+* bed6 file containing coordinates of crosslinking events
+
+Example usage:
+- convert read coordinates from file in.bed to coordinates of the crosslinking
+ events, written to out.bed:
+coords2clnt.py in.bed --outfile out.bed
+"""
+
+epilog = """
+Author: Daniel Maticzka
+Copyright: 2015
+License: Apache
+Email: maticzkd@informatik.uni-freiburg.de
+Status: Testing
+"""
+
+import argparse
+import logging
+from sys import stdout
+from pybedtools import BedTool
+from pybedtools.featurefuncs import five_prime
+# avoid ugly python IOError when stdout output is piped into another program
+# and then truncated (such as piping to head)
+from signal import signal, SIGPIPE, SIG_DFL
+signal(SIGPIPE, SIG_DFL)
+
+# parse command line arguments
+parser = argparse.ArgumentParser(description=tool_description,
+ epilog=epilog,
+ formatter_class=argparse.RawDescriptionHelpFormatter)
+# positional arguments
+parser.add_argument(
+ "infile",
+ help="Path to bed input file.")
+# optional arguments
+parser.add_argument(
+ "-o", "--outfile",
+ help="Write results to this file.")
+parser.add_argument(
+ "-v", "--verbose",
+ help="Be verbose.",
+ action="store_true")
+parser.add_argument(
+ "-d", "--debug",
+ help="Print lots of debugging information",
+ action="store_true")
+parser.add_argument(
+ '--version',
+ action='version',
+ version='0.1.0')
+
+
+# handle arguments
+args = parser.parse_args()
+if args.debug:
+ logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
+elif args.verbose:
+ logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
+else:
+ logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
+logging.info("Parsed arguments:")
+logging.info("  infile: '{}'".format(args.infile))
+if args.outfile:
+    logging.info("  outfile: enabled writing to file")
+    logging.info("  outfile: '{}'".format(args.outfile))
+logging.info("")
+
+# data processing
+alns = BedTool(args.infile)
+clnts = alns.each(five_prime, upstream=1, downstream=0)
+
+# write to file or to stdout
+if args.outfile:
+ clnts.saveas(args.outfile)
+else:
+ tmptool = clnts.saveas()
+ logging.debug("results written to temporary file :" + tmptool.fn)
+ tmp = open(tmptool.fn)
+ for line in tmp:
+ stdout.write(line)
+ tmp.close()
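coords2clnt.py delegates the coordinate shift to pybedtools' five_prime feature function. A minimal pure-Python sketch of the interval arithmetic it relies on, assuming half-open BED coordinates and the upstream=1, downstream=0 setting used above (the helper name clnt_from_read is made up for illustration):

def clnt_from_read(chrom, start, end, name, score, strand):
    # the crosslinked nt is the single position immediately upstream of the
    # read's 5'-end: before start on '+', after end on '-'
    if strand == "+":
        return (chrom, start - 1, start, name, score, strand)
    elif strand == "-":
        return (chrom, end, end + 1, name, score, strand)
    raise ValueError("strand must be '+' or '-'")

print(clnt_from_read("chr1", 100, 150, "read1", 0, "-"))
# -> ('chr1', 150, 151, 'read1', 0, '-')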
diff -r ae0f58d3318f -r de4ea3aa1090 coords2clnt.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/coords2clnt.xml Thu Oct 22 10:26:45 2015 -0400
@@ -0,0 +1,45 @@
+
+ Extract crosslinked nucleotide.
+
+ macros.xml
+
+
+
+
+
+
+ python coords2clnt.py --version
+ $default]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r ae0f58d3318f -r de4ea3aa1090 extract_aln_ends.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/extract_aln_ends.py Thu Oct 22 10:26:45 2015 -0400
@@ -0,0 +1,140 @@
+#!/usr/bin/env python
+
+tool_description = """
+Extract alignment ends from sam file.
+
+The resulting bed file contains the outer coordinates of the alignments. The
+bed name field is set to the read id and the score field is set to the edit
+distance of the alignment. The crosslinked nucleotide is one nt upstream of the
+5'-end of the bed entries.
+
+This script only reports results for alignments that are properly aligned in FR
+("forward-reverse") direction.
+
+By default output is written to stdout.
+
+Input:
+* sam file containing alignments (paired-end sequencing)
+
+Output:
+* bed6 file containing outer coordinates (sorted by read id)
+
+Example usage:
+- Extract coordinates from file input.bam and write to file output.bed
+extract_aln_ends.py input.bam --out output.bed
+"""
+
+epilog = """
+Author: Daniel Maticzka
+Copyright: 2015
+License: Apache
+Email: maticzkd@informatik.uni-freiburg.de
+Status: Development
+"""
+
+import argparse
+import logging
+from sys import stdout
+from shutil import rmtree
+from tempfile import mkdtemp
+from pybedtools import BedTool
+import pysam
+
+
+class DefaultsRawDescriptionHelpFormatter(argparse.ArgumentDefaultsHelpFormatter,
+ argparse.RawDescriptionHelpFormatter):
+ # To join the behaviour of RawDescriptionHelpFormatter with that of ArgumentDefaultsHelpFormatter
+ pass
+
+# avoid ugly python IOError when stdout output is piped into another program
+# and then truncated (such as piping to head)
+from signal import signal, SIGPIPE, SIG_DFL
+signal(SIGPIPE, SIG_DFL)
+
+# parse command line arguments
+parser = argparse.ArgumentParser(description=tool_description,
+ epilog=epilog,
+ formatter_class=DefaultsRawDescriptionHelpFormatter)
+# positional arguments
+parser.add_argument(
+ "sam",
+ help="Path to sam file containing alignments.")
+# optional arguments
+parser.add_argument(
+ "-o", "--outfile",
+ help="Write results to this file.")
+# misc arguments
+parser.add_argument(
+ "-v", "--verbose",
+ help="Be verbose.",
+ action="store_true")
+parser.add_argument(
+ "-d", "--debug",
+ help="Print lots of debugging information",
+ action="store_true")
+parser.add_argument(
+ '--version',
+ action='version',
+ version='0.1.0')
+
+args = parser.parse_args()
+
+if args.debug:
+ logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
+elif args.verbose:
+ logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
+else:
+ logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
+logging.info("Parsed arguments:")
+logging.info(" sam: '{}'".format(args.sam))
+if args.outfile:
+ logging.info(" outfile: enabled writing to file")
+ logging.info(" outfile: '{}'".format(args.outfile))
+logging.info("")
+
+# convert to bam for use with pybedtools
+try:
+ # setup temporary directory
+ tmpdir = mkdtemp()
+ logging.debug("tmpdir: " + tmpdir)
+
+ fn_sorted = tmpdir + "/sorted.bam"
+ fn_fixedmates = tmpdir + "/fixedmates.bam"
+
+ # sort by id
+ logging.debug("calling samtools sort")
+ pysam.sort(args.sam, "-n", "-o{}".format(fn_sorted), "-T sortprefix")
+
+ # fix mate information
+ # also removes secondary and unmapped reads
+ logging.debug("calling samtools fixmates")
+ pysam.fixmate("-r", fn_sorted, fn_fixedmates)
+
+ # bedtools bam2bed
+ alns = BedTool(fn_fixedmates)
+ alns_bedpe = alns.bam_to_bed(bedpe=True, mate1=True, ed=True)
+
+ # determine alignment ends and write to file
+ with (open(args.outfile, "w") if args.outfile is not None else stdout) as out:
+ for i in alns_bedpe:
+ chrom = i.fields[0]
+ fmstart = i.fields[1]
+ fmend = i.fields[2]
+ smstart = i.fields[4]
+ smend = i.fields[5]
+ readid = i.fields[6]
+ score = i.fields[7]
+ fmstrand = i.fields[8]
+ if fmstrand == "+":
+ start = fmstart
+ end = smend
+ elif fmstrand == "-":
+ start = smstart
+ end = fmend
+ else:
+ logging.warning("Skipping {}, strand information is missing: '{}'".format(readid, i))
+ continue
+ out.write("\t".join([chrom, start, end, readid, score, fmstrand]) + "\n")
+finally:
+ logging.debug("removed tmpdir: " + tmpdir)
+ rmtree(tmpdir)
diff -r ae0f58d3318f -r de4ea3aa1090 extract_aln_ends.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/extract_aln_ends.xml Thu Oct 22 10:26:45 2015 -0400
@@ -0,0 +1,52 @@
+
diff -r ae0f58d3318f -r de4ea3aa1090 extract_bcs.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/extract_bcs.py Thu Oct 22 10:26:45 2015 -0400
@@ -0,0 +1,166 @@
+#!/usr/bin/env python
+
+tool_description = """
+Extract barcodes from a FASTQ file according to a user-specified pattern.
+
+By default output is written to stdout.
+
+Example usage:
+- move nucleotides at positions 1-3 and 6-7 to FASTQ header and write to file
+ output.fastq:
+fastq_extract_barcodes.py barcoded_input.fastq XXXNNXX --out output.fastq
+"""
+
+epilog = """
+Author: Daniel Maticzka
+Copyright: 2015
+License: Apache
+Email: maticzkd@informatik.uni-freiburg.de
+Status: Testing
+"""
+
+import argparse
+import logging
+import re
+from sys import stdout
+from Bio.SeqIO.QualityIO import FastqGeneralIterator
+
+# avoid ugly python IOError when stdout output is piped into another program
+# and then truncated (such as piping to head)
+from signal import signal, SIGPIPE, SIG_DFL
+signal(SIGPIPE, SIG_DFL)
+
+# parse command line arguments
+parser = argparse.ArgumentParser(description=tool_description,
+ epilog=epilog,
+ formatter_class=argparse.RawDescriptionHelpFormatter)
+# positional arguments
+parser.add_argument(
+ "infile",
+ help="Path to fastq file.")
+parser.add_argument(
+ "pattern",
+ help="Pattern of barcode nucleotides starting at 5'-end. X positions will be moved to the header, N positions will be kept.")
+# optional arguments
+parser.add_argument(
+ "-o", "--outfile",
+ help="Write results to this file.")
+parser.add_argument(
+ "-b", "--bcs",
+ dest="out_bc_fasta",
+ help="If set, barcodes are written to this file in FASTA format.")
+parser.add_argument(
+ "-v", "--verbose",
+ help="Be verbose.",
+ action="store_true")
+parser.add_argument(
+ "-d", "--debug",
+ help="Print lots of debugging information",
+ action="store_true")
+parser.add_argument(
+ '--version',
+ action='version',
+ version='0.1.0')
+
+args = parser.parse_args()
+if args.debug:
+ logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
+elif args.verbose:
+ logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
+else:
+ logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
+logging.info("Parsed arguments:")
+logging.info(" infile: '{}'".format(args.infile))
+logging.info(" pattern: '{}'".format(args.pattern))
+if args.outfile:
+ logging.info(" outfile: enabled writing to file")
+ logging.info(" outfile: '{}'".format(args.outfile))
+if args.out_bc_fasta:
+ logging.info(" bcs: enabled writing barcodes to fasta file")
+ logging.info(" bcs: {}".format(args.out_bc_fasta))
+logging.info("")
+
+# check if supplied pattern is valid
+valid_pattern = re.compile("^[XN]+$")
+pattern_match = valid_pattern.match(args.pattern)
+if pattern_match is None:
+ raise ValueError("Error: supplied pattern '{}' is not valid.".format(args.pattern))
+
+# check if at least one barcode position is included in the pattern
+has_bcpos_pattern = re.compile("X")
+pattern_match = has_bcpos_pattern.search(args.pattern)
+if pattern_match is None:
+ raise ValueError("Error: supplied pattern '{}' does not contain a barcode position 'X'.".format(args.pattern))
+
+logging.info("Barcode pattern analysis:")
+# get X positions of pattern string
+barcode_nt_pattern = re.compile("X+")
+barcode_positions = []
+for m in re.finditer(barcode_nt_pattern, args.pattern):
+ logging.info(' found barcode positions in pattern: %02d-%02d: %s' % (m.start(), m.end(), m.group(0)))
+ barcode_positions.append((m.start(), m.end()))
+logging.info(" barcode positions: {}".format(barcode_positions))
+# get last position of a barcode nt in the pattern
+# reads must be long enough for all
+min_readlen = barcode_positions[-1][-1]
+logging.info(" last position of a barcode nt in pattern: {}".format(min_readlen))
+logging.info("")
+
+# get coordinates of nucleotides to keep
+# the tail after the last barcode nt is handled separately
+seq_positions = []
+last_seq_start = 0
+for bcstart, bcstop in barcode_positions:
+ seq_positions.append((last_seq_start, bcstart))
+ last_seq_start = bcstop
+logging.info(" sequence positions: {}".format(seq_positions))
+logging.info(" start of sequence tail: {}".format(last_seq_start))
+
+samout = (open(args.outfile, "w") if args.outfile is not None else stdout)
+if args.out_bc_fasta is not None:
+ faout = open(args.out_bc_fasta, "w")
+for header, seq, qual in FastqGeneralIterator(open(args.infile)):
+
+ # skip reads that are too short to extract the full requested barcode
+ if len(seq) < min_readlen:
+ logging.warning("skipping read '{}', is too short to extract the full requested barcode".format(header))
+ logging.debug("seq: {}".format(seq))
+ logging.debug("len(seq): {}".format(len(seq)))
+ continue
+
+ # extract barcode nucleotides
+ barcode_list = []
+ for bcstart, bcstop in barcode_positions:
+ barcode_list.append(seq[bcstart:bcstop])
+ barcode = "".join(barcode_list)
+ logging.debug("extracted barcode: {}".format(barcode))
+
+ # create new sequence and quality string without barcode nucleotides
+ new_seq_list = []
+ new_qual_list = []
+ for seqstart, seqstop in seq_positions:
+ new_seq_list.append(seq[seqstart:seqstop])
+ new_qual_list.append(qual[seqstart:seqstop])
+ new_seq_list.append(seq[last_seq_start:])
+ new_qual_list.append(qual[last_seq_start:])
+ new_seq = "".join(new_seq_list)
+ new_qual = "".join(new_qual_list)
+ # check if at least one nucleotide is left. having none would break fastq
+ if len(new_seq) == 0:
+ logging.warning("skipping read '{}', no sequence remains after barcode extraction".format(header))
+ logging.debug("seq: {}".format(seq))
+ logging.debug("len(seq): {}".format(len(seq)))
+ continue
+
+ # write barcode nucleotides into header
+ annotated_header = header + " " + barcode
+ samout.write("@%s\n%s\n+\n%s\n" % (annotated_header, new_seq, new_qual))
+
+ # write barcode to fasta if requested
+ if args.out_bc_fasta is not None:
+ faout.write(">{}\n{}\n".format(header, barcode))
+
+# close files
+samout.close()
+if args.out_bc_fasta is not None:
+ faout.close()
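The pattern handling above reduces to two lists of slice coordinates derived from the runs of X in the pattern. A self-contained sketch of that step (the function name split_read is made up for illustration):

import re

def split_read(seq, pattern):
    # runs of 'X' mark barcode positions; everything else is kept in the read
    bc_positions = [(m.start(), m.end()) for m in re.finditer("X+", pattern)]
    barcode = "".join(seq[s:e] for s, e in bc_positions)
    kept = []
    last = 0
    for s, e in bc_positions:
        kept.append(seq[last:s])
        last = e
    kept.append(seq[last:])  # tail after the last barcode nt
    return barcode, "".join(kept)

print(split_read("ACGTTGCAAAA", "XXXNNXX"))  # -> ('ACGGC', 'TTAAAA')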
diff -r ae0f58d3318f -r de4ea3aa1090 extract_bcs.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/extract_bcs.xml Thu Oct 22 10:26:45 2015 -0400
@@ -0,0 +1,44 @@
+
diff -r ae0f58d3318f -r de4ea3aa1090 macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml Thu Oct 22 10:26:45 2015 -0400
@@ -0,0 +1,17 @@
+
+
+
+ bctools
+
+
+
+
+
+
+
+
+
+ 10.1016/j.molcel.2013.07.001
+
+
+
diff -r ae0f58d3318f -r de4ea3aa1090 merge_pcr_duplicates.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/merge_pcr_duplicates.py Thu Oct 22 10:26:45 2015 -0400
@@ -0,0 +1,144 @@
+#!/usr/bin/env python
+
+tool_description = """
+Merge PCR duplicates according to random barcode library.
+
+Barcodes containing uncalled base 'N' are removed. By default output is written
+to stdout.
+
+Input:
+* bed6 file containing alignments with fastq read-id in name field
+* fasta library with fastq read-id as sequence ids
+
+Output:
+* bed6 file with random barcode in name field and number of PCR duplicates as
+ score, sorted by fields chrom, start, stop, strand, name
+
+Example usage:
+- read PCR duplicates from file duplicates.bed and write merged results to file
+ merged.bed:
+merge_pcr_duplicates.py duplicates.bed bclibrary.fa --out merged.bed
+"""
+
+epilog = """
+Author: Daniel Maticzka
+Copyright: 2015
+License: Apache
+Email: maticzkd@informatik.uni-freiburg.de
+Status: Testing
+"""
+
+import argparse
+import logging
+from sys import stdout
+from Bio import SeqIO
+import pandas as pd
+
+# avoid ugly python IOError when stdout output is piped into another program
+# and then truncated (such as piping to head)
+from signal import signal, SIGPIPE, SIG_DFL
+signal(SIGPIPE, SIG_DFL)
+
+
+def fasta_tuple_generator(fasta_iterator):
+    """Yield (id, sequence) tuples from an iterator over Biopython SeqRecord objects."""
+    for record in fasta_iterator:
+        yield (record.id, str(record.seq))
+
+
+# parse command line arguments
+parser = argparse.ArgumentParser(description=tool_description,
+ epilog=epilog,
+ formatter_class=argparse.RawDescriptionHelpFormatter)
+# positional arguments
+parser.add_argument(
+ "alignments",
+ help="Path to bed6 file containing alignments.")
+parser.add_argument(
+ "bclib",
+ help="Path to fasta barcode library.")
+# optional arguments
+parser.add_argument(
+ "-o", "--outfile",
+ help="Write results to this file.")
+# misc arguments
+parser.add_argument(
+ "-v", "--verbose",
+ help="Be verbose.",
+ action="store_true")
+parser.add_argument(
+ "-d", "--debug",
+ help="Print lots of debugging information",
+ action="store_true")
+parser.add_argument(
+ '--version',
+ action='version',
+ version='0.1.0')
+
+args = parser.parse_args()
+
+if args.debug:
+ logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
+elif args.verbose:
+ logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
+else:
+ logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
+logging.info("Parsed arguments:")
+logging.info(" alignments: '{}'".format(args.alignments))
+logging.info(" bclib: '{}'".format(args.bclib))
+if args.outfile:
+ logging.info(" outfile: enabled writing to file")
+ logging.info(" outfile: '{}'".format(args.outfile))
+logging.info("")
+
+# load barcode library into dictionary
+input_handle = open(args.bclib, "rU")
+input_seq_iterator = SeqIO.parse(input_handle, "fasta")
+bcs = pd.DataFrame.from_records(
+ data=fasta_tuple_generator(input_seq_iterator),
+ columns=["read_id", "bc"])
+
+# load alignments
+alns = pd.read_csv(
+ args.alignments,
+ sep="\t",
+ names=["chrom", "start", "stop", "read_id", "score", "strand"])
+
+# combine barcode library and alignments
+bcalib = pd.merge(
+ bcs, alns,
+ on="read_id",
+ how="inner",
+ sort=False)
+if bcalib.empty:
+ raise Exception("ERROR: no common entries for alignments and barcode library found. Please check your input files.")
+n_alns = len(alns.index)
+n_bcalib = len(bcalib.index)
+if n_bcalib < n_alns:
+ logging.warning(
+ "{} of {} alignments could not be associated with a random barcode.".format(
+ n_alns - n_bcalib, n_alns))
+
+# remove entries whose barcodes contain the uncalled base N
+bcalib_cleaned = bcalib.drop(bcalib[bcalib.bc.str.contains("N")].index)
+n_bcalib_cleaned = len(bcalib_cleaned)
+if n_bcalib_cleaned < n_bcalib:
+ msg = "{} of {} alignments had random barcodes containing uncalled bases and were dropped.".format(
+ n_bcalib - n_bcalib_cleaned, n_bcalib)
+ if n_bcalib_cleaned < (0.8 * n_bcalib):
+ logging.warning(msg)
+ else:
+ logging.info(msg)
+
+# count and merge PCR duplicates
+# grouping sorts by keys, so the output will be properly sorted
+merged = bcalib_cleaned.groupby(['chrom', 'start', 'stop', 'strand', 'bc']).size().reset_index()
+merged.rename(columns={0: 'ndupes'}, copy=False, inplace=True)
+
+# write coordinates of crosslinking event alignments
+eventalnout = (open(args.outfile, "w") if args.outfile is not None else stdout)
+merged.to_csv(
+ eventalnout,
+ columns=['chrom', 'start', 'stop', 'bc', 'ndupes', 'strand'],
+ sep="\t", index=False, header=False)
+eventalnout.close()
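The actual merge is a single pandas groupby over coordinates plus barcode, with the group size becoming the duplicate count. A toy in-memory version of that step, using made-up values:

import pandas as pd

bcalib = pd.DataFrame({
    "chrom":  ["chr1", "chr1", "chr1", "chr2"],
    "start":  [100, 100, 100, 50],
    "stop":   [120, 120, 120, 70],
    "strand": ["+", "+", "+", "-"],
    "bc":     ["ACGT", "ACGT", "TTTT", "ACGT"],
})

merged = bcalib.groupby(["chrom", "start", "stop", "strand", "bc"]).size().reset_index()
merged.rename(columns={0: "ndupes"}, copy=False, inplace=True)
print(merged)
# the two chr1/ACGT rows collapse into one entry with ndupes == 2,
# the remaining combinations get ndupes == 1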
diff -r ae0f58d3318f -r de4ea3aa1090 merge_pcr_duplicates.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/merge_pcr_duplicates.xml Thu Oct 22 10:26:45 2015 -0400
@@ -0,0 +1,52 @@
+
+ Merge PCR duplicates according to random barcode library.
+
+ macros.xml
+
+
+
+ python merge_pcr_duplicates.py --version
+ $default]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r ae0f58d3318f -r de4ea3aa1090 remove_tail.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/remove_tail.py Thu Oct 22 10:26:45 2015 -0400
@@ -0,0 +1,95 @@
+#!/usr/bin/env python
+
+tool_description = """
+Remove a certain number of nucleotides from the 3'-tails of sequences in fastq
+format.
+
+Example usage:
+- remove the last 7 nucleotides from file input.fastq, write result to file
+ output.fastq:
+remove_tail.py input.fastq 7 --out output.fastq
+"""
+
+epilog = """
+Author: Daniel Maticzka
+Copyright: 2015
+License: Apache
+Email: maticzkd@informatik.uni-freiburg.de
+Status: Testing
+"""
+
+import argparse
+import logging
+from sys import stdout
+from Bio.SeqIO.QualityIO import FastqGeneralIterator
+
+# avoid ugly python IOError when stdout output is piped into another program
+# and then truncated (such as piping to head)
+from signal import signal, SIGPIPE, SIG_DFL
+signal(SIGPIPE, SIG_DFL)
+
+# parse command line arguments
+parser = argparse.ArgumentParser(description=tool_description,
+ epilog=epilog,
+ formatter_class=argparse.RawDescriptionHelpFormatter)
+# positional arguments
+parser.add_argument(
+ "infile",
+ help="Path to fastq file.")
+parser.add_argument(
+ "length",
+ type=int,
+ help="Remove this many nts.")
+# optional arguments
+parser.add_argument(
+ "-o", "--outfile",
+ help="Write results to this file.")
+parser.add_argument(
+ "-v", "--verbose",
+ help="Be verbose.",
+ action="store_true")
+parser.add_argument(
+ "-d", "--debug",
+ help="Print lots of debugging information",
+ action="store_true")
+parser.add_argument(
+ '--version',
+ action='version',
+ version='0.1.0')
+
+args = parser.parse_args()
+if args.debug:
+ logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
+elif args.verbose:
+ logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
+else:
+ logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
+logging.info("Parsed arguments:")
+logging.info(" infile: '{}'".format(args.infile))
+logging.info(" length: '{}'".format(args.length))
+if args.outfile:
+ logging.info(" outfile: enabled writing to file")
+ logging.info(" outfile: '{}'".format(args.outfile))
+logging.info("")
+
+# check length parameter
+if args.length < 0:
+ raise ValueError("Length must be a positive integer, is '{}'.".format(args.length))
+
+# remove tail
+with (open(args.outfile, "w") if args.outfile is not None else stdout) as samout:
+ for header, seq, qual in FastqGeneralIterator(open(args.infile)):
+
+ # if removing tail would lead to an empty sequence,
+ # set sequence to a single N to keep fastq synchronized
+ if len(seq) <= args.length:
+ seq = "N"
+ qual = "B"
+ logging.debug("read '{}' was too short to remove full tail".format(header))
+ logging.debug("seq: {}".format(seq))
+ logging.debug("len(seq): {}".format(len(seq)))
+ else:
+ seq = seq[0:-args.length]
+ qual = qual[0:-args.length]
+
+ samout.write("@%s\n%s\n+\n%s\n" % (header, seq, qual))
diff -r ae0f58d3318f -r de4ea3aa1090 remove_tail.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/remove_tail.xml Thu Oct 22 10:26:45 2015 -0400
@@ -0,0 +1,43 @@
+
+ Remove nts from 3'-end.
+
+ macros.xml
+
+
+
+ python remove_tail.py --version
+ $default]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r ae0f58d3318f -r de4ea3aa1090 rm_spurious_events.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rm_spurious_events.py Thu Oct 22 10:26:45 2015 -0400
@@ -0,0 +1,118 @@
+#!/usr/bin/env python
+
+tool_description = """
+Remove spurious events originating from errors in random sequence tags.
+
+This script compares all events sharing the same coordinates. Among each group
+of events the maximum number of PCR duplicates is determined. All events that
+are supported by less than a threshold fraction of this maximum count are
+removed (default threshold: 10 percent).
+
+By default output is written to stdout.
+
+Input:
+* bed6 file containing crosslinking events with score field set to number of PCR
+ duplicates
+
+Output:
+* bed6 file with spurious crosslinking events removed, sorted by fields chrom,
+ start, stop, strand
+
+Example usage:
+- remove spurious events from spurious.bed and write results to file cleaned.bed
+rm_spurious_events.py spurious.bed --out cleaned.bed
+"""
+
+epilog = """
+Author: Daniel Maticzka
+Copyright: 2015
+License: Apache
+Email: maticzkd@informatik.uni-freiburg.de
+Status: Testing
+"""
+
+import argparse
+import logging
+from sys import stdout
+import pandas as pd
+
+
+class DefaultsRawDescriptionHelpFormatter(argparse.ArgumentDefaultsHelpFormatter,
+ argparse.RawDescriptionHelpFormatter):
+ # To join the behaviour of RawDescriptionHelpFormatter with that of ArgumentDefaultsHelpFormatter
+ pass
+
+# avoid ugly python IOError when stdout output is piped into another program
+# and then truncated (such as piping to head)
+from signal import signal, SIGPIPE, SIG_DFL
+signal(SIGPIPE, SIG_DFL)
+
+# parse command line arguments
+parser = argparse.ArgumentParser(description=tool_description,
+ epilog=epilog,
+ formatter_class=DefaultsRawDescriptionHelpFormatter)
+# positional arguments
+parser.add_argument(
+ "events",
+ help="Path to bed6 file containing alignments.")
+# optional arguments
+parser.add_argument(
+ "-o", "--outfile",
+ help="Write results to this file.")
+parser.add_argument(
+ "-t", "--threshold",
+ type=float,
+ default=0.1,
+ help="Threshold for spurious event removal."
+)
+# misc arguments
+parser.add_argument(
+ "-v", "--verbose",
+ help="Be verbose.",
+ action="store_true")
+parser.add_argument(
+ "-d", "--debug",
+ help="Print lots of debugging information",
+ action="store_true")
+parser.add_argument(
+ '--version',
+ action='version',
+ version='0.1.0')
+
+args = parser.parse_args()
+
+if args.debug:
+ logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
+elif args.verbose:
+ logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
+else:
+ logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
+logging.info("Parsed arguments:")
+logging.info(" alignments: '{}'".format(args.events))
+logging.info(" threshold: '{}'".format(args.threshold))
+if args.outfile:
+ logging.info(" outfile: enabled writing to file")
+ logging.info(" outfile: '{}'".format(args.outfile))
+logging.info("")
+
+# check threshold parameter value
+if args.threshold < 0 or args.threshold > 1:
+ raise ValueError("Threshold must be in [0,1].")
+
+# load alignments
+alns = pd.read_csv(
+ args.events,
+ sep="\t",
+ names=["chrom", "start", "stop", "read_id", "score", "strand"])
+
+# remove all events that do not have enough PCR duplicates with respect to
+# the group maximum
+grouped = alns.groupby(['chrom', 'start', 'stop', 'strand'], group_keys=False)
+alns_cleaned = grouped.apply(lambda g: g[g["score"] >= args.threshold * g["score"].max()])
+
+# write coordinates of crosslinking event alignments
+alns_cleaned_out = (open(args.outfile, "w") if args.outfile is not None else stdout)
+alns_cleaned.to_csv(
+ alns_cleaned_out,
+ columns=['chrom', 'start', 'stop', 'read_id', 'score', 'strand'],
+ sep="\t", index=False, header=False)
+alns_cleaned_out.close()
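The filtering itself is one groupby/apply over the event coordinates, keeping only events whose duplicate count reaches the threshold fraction of the group maximum. A toy example with made-up scores:

import pandas as pd

events = pd.DataFrame({
    "chrom":   ["chr1", "chr1", "chr1", "chr2"],
    "start":   [100, 100, 100, 50],
    "stop":    [120, 120, 120, 70],
    "read_id": ["AAAA", "AAAC", "TTTT", "GGGG"],
    "score":   [50, 3, 40, 7],
    "strand":  ["+", "+", "+", "-"],
})

threshold = 0.1
grouped = events.groupby(["chrom", "start", "stop", "strand"], group_keys=False)
cleaned = grouped.apply(lambda g: g[g["score"] >= threshold * g["score"].max()])
print(cleaned)
# the chr1 event with score 3 (< 0.1 * 50) is dropped; all other events are kept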
diff -r ae0f58d3318f -r de4ea3aa1090 rm_spurious_events.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rm_spurious_events.xml Thu Oct 22 10:26:45 2015 -0400
@@ -0,0 +1,54 @@
+
+ Remove spurious events.
+
+ macros.xml
+
+
+
+ python rm_spurious_events.py --version
+ $default]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r ae0f58d3318f -r de4ea3aa1090 tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml Thu Oct 22 10:26:45 2015 -0400
@@ -0,0 +1,12 @@
+
+
+
+
+
+
+
+
+
+
+
+