# HG changeset patch # User rnateam # Date 1445524005 14400 # Node ID de4ea3aa1090ed0e9df5bbc30891f9f1ea21307e # Parent ae0f58d3318faf4f553156ed21d954fbb3ecef92 Uploaded diff -r ae0f58d3318f -r de4ea3aa1090 convert_bc_to_binary_RY.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/convert_bc_to_binary_RY.py Thu Oct 22 10:26:45 2015 -0400 @@ -0,0 +1,101 @@ +#!/usr/bin/env python + +tool_description = """ +Convert standard nucleotides to IUPAC nucleotide codes used for binary barcodes. + +A and G are converted to nucleotide code R. T, U and C are converted to Y. By +default output is written to stdout. + +Example usage: +- write converted sequences from file in.fa to file file out.fa: +convert_bc_to_binary_RY.py in.fa --outfile out.fa +""" + +epilog = """ +Author: Daniel Maticzka +Copyright: 2015 +License: Apache +Email: maticzkd@informatik.uni-freiburg.de +Status: Testing +""" + +import argparse +import logging +from string import maketrans +from sys import stdout +from Bio import SeqIO +from Bio.Seq import Seq +from Bio.Alphabet import IUPAC + +# # avoid ugly python IOError when stdout output is piped into another program +# # and then truncated (such as piping to head) +# from signal import signal, SIGPIPE, SIG_DFL +# signal(SIGPIPE, SIG_DFL) + +# parse command line arguments +parser = argparse.ArgumentParser(description=tool_description, + epilog=epilog, + formatter_class=argparse.RawDescriptionHelpFormatter) +# positional arguments +parser.add_argument( + "infile", + help="Path to fasta input file.") +# optional arguments +parser.add_argument( + "-o", "--outfile", + help="Write results to this file.") +parser.add_argument( + "-v", "--verbose", + help="Be verbose.", + action="store_true") +parser.add_argument( + "-d", "--debug", + help="Print lots of debugging information", + action="store_true") +parser.add_argument( + '--version', + action='version', + version='0.1.0') + + +def translate_nt_to_RY(seq): + """Translates nucleotides to RY (A,G -> R; C,U,T -> Y). + + >>> translate_nt_to_RY("ACGUTACGUT") + RYRYYRYRYY + """ + trans_table = maketrans("AGCUT", "RRYYY") + trans_seq = seq.translate(trans_table) + logging.debug(seq + " -> " + trans_seq) + return trans_seq + + +def translate_nt_to_RY_iterator(robj): + """Translate SeqRecords sequences to RY alphabet.""" + for record in robj: + record.seq = Seq(translate_nt_to_RY(str(record.seq)), + IUPAC.unambiguous_dna) + yield record + +# handle arguments +args = parser.parse_args() +if args.debug: + logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s") +elif args.verbose: + logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s") +else: + logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s") +logging.info("Parsed arguments:") +if args.outfile: + logging.info(" outfile: enabled writing to file") + logging.info(" outfile: '{}'".format(args.outfile)) +logging.info(" outfile: '{}'".format(args.outfile)) +logging.info("") + +# get input iterator +input_handle = open(args.infile, "rU") +input_seq_iterator = SeqIO.parse(input_handle, "fasta") +convert_seq_iterator = translate_nt_to_RY_iterator(input_seq_iterator) +output_handle = (open(args.outfile, "w") if args.outfile is not None else stdout) +SeqIO.write(convert_seq_iterator, output_handle, "fasta") +output_handle.close() diff -r ae0f58d3318f -r de4ea3aa1090 convert_bc_to_binary_RY.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/convert_bc_to_binary_RY.xml Thu Oct 22 10:26:45 2015 -0400 @@ -0,0 +1,39 @@ + + Convert to binary barcodes. + + macros.xml + + + + python convert_bc_to_binary_RY.py --version + $default]]> + + + + + + + + + + + + + + diff -r ae0f58d3318f -r de4ea3aa1090 coords2clnt.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/coords2clnt.py Thu Oct 22 10:26:45 2015 -0400 @@ -0,0 +1,92 @@ +#!/usr/bin/env python + +tool_description = """ +Given coordinates of the aligned reads, calculate positions of the crosslinked +nucleotides. Crosslinked nts are assumed to be one nt upstream of the 5'-end of +the read. + +By default output is written to stdout. + +Input: +* bed6 file containing coordinates of aligned reads +* bed6 file containing coordinates of crosslinking events + +Example usage: +- convert read coordinates from file in.bed to coordinates of the crosslinking + events, written to out.bed: +coords2clnt.py in.bed --outfile out.bed +""" + +epilog = """ +Author: Daniel Maticzka +Copyright: 2015 +License: Apache +Email: maticzkd@informatik.uni-freiburg.de +Status: Testing +""" + +import argparse +import logging +from sys import stdout +from pybedtools import BedTool +from pybedtools.featurefuncs import five_prime +# avoid ugly python IOError when stdout output is piped into another program +# and then truncated (such as piping to head) +from signal import signal, SIGPIPE, SIG_DFL +signal(SIGPIPE, SIG_DFL) + +# parse command line arguments +parser = argparse.ArgumentParser(description=tool_description, + epilog=epilog, + formatter_class=argparse.RawDescriptionHelpFormatter) +# positional arguments +parser.add_argument( + "infile", + help="Path to bed input file.") +# optional arguments +parser.add_argument( + "-o", "--outfile", + help="Write results to this file.") +parser.add_argument( + "-v", "--verbose", + help="Be verbose.", + action="store_true") +parser.add_argument( + "-d", "--debug", + help="Print lots of debugging information", + action="store_true") +parser.add_argument( + '--version', + action='version', + version='0.1.0') + + +# handle arguments +args = parser.parse_args() +if args.debug: + logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s") +elif args.verbose: + logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s") +else: + logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s") +logging.info("Parsed arguments:") +if args.outfile: + logging.info(" outfile: enabled writing to file") + logging.info(" outfile: '{}'".format(args.outfile)) +logging.info(" outfile: '{}'".format(args.outfile)) +logging.info("") + +# data processing +alns = BedTool(args.infile) +clnts = alns.each(five_prime, upstream=1, downstream=0) + +# write to file or to stdout +if args.outfile: + clnts.saveas(args.outfile) +else: + tmptool = clnts.saveas() + logging.debug("results written to temporary file :" + tmptool.fn) + tmp = open(tmptool.fn) + for line in tmp: + stdout.write(line) + tmp.close() diff -r ae0f58d3318f -r de4ea3aa1090 coords2clnt.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/coords2clnt.xml Thu Oct 22 10:26:45 2015 -0400 @@ -0,0 +1,45 @@ + + Extract crosslinked nucleotide. + + macros.xml + + + + + + + python coords2clnt.py --version + $default]]> + + + + + + + + + + + + + + diff -r ae0f58d3318f -r de4ea3aa1090 extract_aln_ends.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/extract_aln_ends.py Thu Oct 22 10:26:45 2015 -0400 @@ -0,0 +1,140 @@ +#!/usr/bin/env python + +tool_description = """ +Extract alignment ends from sam file. + +The resulting bed file contains the outer coordinates of the alignments. The +bed name field is set to the read id and the score field is set to the edit +distance of the alignment. The crosslinked nucleotide is one nt upstream of the +5'-end of the bed entries. + +This script only reports results for alignments that are properly aligned in FR +("forward-reverse") direction. + +By default output is written to stdout. + +Input: +* sam file containing alignments (paired-end sequencing) + +Output: +* bed6 file containing outer coordinates (sorted by read id) + +Example usage: +- Extract coordinates from file input.bam and write to file output.bed +extract_aln_ends.py input.bam --out output.bed +""" + +epilog = """ +Author: Daniel Maticzka +Copyright: 2015 +License: Apache +Email: maticzkd@informatik.uni-freiburg.de +Status: Development +""" + +import argparse +import logging +from sys import stdout +from shutil import rmtree +from tempfile import mkdtemp +from pybedtools import BedTool +import pysam + + +class DefaultsRawDescriptionHelpFormatter(argparse.ArgumentDefaultsHelpFormatter, + argparse.RawDescriptionHelpFormatter): + # To join the behaviour of RawDescriptionHelpFormatter with that of ArgumentDefaultsHelpFormatter + pass + +# avoid ugly python IOError when stdout output is piped into another program +# and then truncated (such as piping to head) +from signal import signal, SIGPIPE, SIG_DFL +signal(SIGPIPE, SIG_DFL) + +# parse command line arguments +parser = argparse.ArgumentParser(description=tool_description, + epilog=epilog, + formatter_class=DefaultsRawDescriptionHelpFormatter) +# positional arguments +parser.add_argument( + "sam", + help="Path to sam file containing alignments.") +# optional arguments +parser.add_argument( + "-o", "--outfile", + help="Write results to this file.") +# misc arguments +parser.add_argument( + "-v", "--verbose", + help="Be verbose.", + action="store_true") +parser.add_argument( + "-d", "--debug", + help="Print lots of debugging information", + action="store_true") +parser.add_argument( + '--version', + action='version', + version='0.1.0') + +args = parser.parse_args() + +if args.debug: + logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s") +elif args.verbose: + logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s") +else: + logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s") +logging.info("Parsed arguments:") +logging.info(" sam: '{}'".format(args.sam)) +if args.outfile: + logging.info(" outfile: enabled writing to file") + logging.info(" outfile: '{}'".format(args.outfile)) +logging.info("") + +# convert to bam for use with pybedtools +try: + # setup temporary directory + tmpdir = mkdtemp() + logging.debug("tmpdir: " + tmpdir) + + fn_sorted = tmpdir + "/sorted.bam" + fn_fixedmates = tmpdir + "/fixedmates.bam" + + # sort by id + logging.debug("calling samtools sort") + pysam.sort(args.sam, "-n", "-o{}".format(fn_sorted), "-T sortprefix") + + # fix mate information + # also removes secondary and unmapped reads + logging.debug("calling samtools fixmates") + pysam.fixmate("-r", fn_sorted, fn_fixedmates) + + # bedtools bam2bed + alns = BedTool(fn_fixedmates) + alns_bedpe = alns.bam_to_bed(bedpe=True, mate1=True, ed=True) + + # determine alignment ends and write to file + with (open(args.outfile, "w") if args.outfile is not None else stdout) as out: + for i in alns_bedpe: + chrom = i.fields[0] + fmstart = i.fields[1] + fmend = i.fields[2] + smstart = i.fields[4] + smend = i.fields[5] + readid = i.fields[6] + score = i.fields[7] + fmstrand = i.fields[8] + if fmstrand == "+": + start = fmstart + end = smend + elif fmstrand == "-": + start = smstart + end = fmend + else: + logging.warning("Skipping {}, strand information is missing: '{}'".format(readid, i)) + continue + out.write("\t".join([chrom, start, end, readid, score, fmstrand]) + "\n") +finally: + logging.debug("removed tmpdir: " + tmpdir) + rmtree(tmpdir) diff -r ae0f58d3318f -r de4ea3aa1090 extract_aln_ends.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/extract_aln_ends.xml Thu Oct 22 10:26:45 2015 -0400 @@ -0,0 +1,52 @@ + + Extract alignment ends from sam file. + + macros.xml + + + + python extract_aln_ends.py --version + $default]]> + + + + + + + + + + + + + + diff -r ae0f58d3318f -r de4ea3aa1090 extract_bcs.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/extract_bcs.py Thu Oct 22 10:26:45 2015 -0400 @@ -0,0 +1,166 @@ +#!/usr/bin/env python + +tool_description = """ +Exract barcodes from a FASTQ file according to a user-specified pattern. + +By default output is written to stdout. + +Example usage: +- move nucleotides at positions 1-3 and 6-7 to FASTQ header and write to file + output.fastq: +fastq_extract_barcodes.py barcoded_input.fastq XXXNNXX --out output.fastq +""" + +epilog = """ +Author: Daniel Maticzka +Copyright: 2015 +License: Apache +Email: maticzkd@informatik.uni-freiburg.de +Status: Testing +""" + +import argparse +import logging +import re +from sys import stdout +from Bio.SeqIO.QualityIO import FastqGeneralIterator + +# avoid ugly python IOError when stdout output is piped into another program +# and then truncated (such as piping to head) +from signal import signal, SIGPIPE, SIG_DFL +signal(SIGPIPE, SIG_DFL) + +# parse command line arguments +parser = argparse.ArgumentParser(description=tool_description, + epilog=epilog, + formatter_class=argparse.RawDescriptionHelpFormatter) +# positional arguments +parser.add_argument( + "infile", + help="Path to fastq file.") +parser.add_argument( + "pattern", + help="Pattern of barcode nucleotides starting at 5'-end. X positions will be moved to the header, N positions will be kept.") +# optional arguments +parser.add_argument( + "-o", "--outfile", + help="Write results to this file.") +parser.add_argument( + "-b", "--bcs", + dest="out_bc_fasta", + help="If set, barcodes are written to this file in FASTA format.") +parser.add_argument( + "-v", "--verbose", + help="Be verbose.", + action="store_true") +parser.add_argument( + "-d", "--debug", + help="Print lots of debugging information", + action="store_true") +parser.add_argument( + '--version', + action='version', + version='0.1.0') + +args = parser.parse_args() +if args.debug: + logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s") +elif args.verbose: + logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s") +else: + logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s") +logging.info("Parsed arguments:") +logging.info(" infile: '{}'".format(args.infile)) +logging.info(" pattern: '{}'".format(args.pattern)) +if args.outfile: + logging.info(" outfile: enabled writing to file") + logging.info(" outfile: '{}'".format(args.outfile)) +if args.out_bc_fasta: + logging.info(" bcs: enabled writing barcodes to fasta file") + logging.info(" bcs: {}".format(args.out_bc_fasta)) +logging.info("") + +# check if supplied pattern is valid +valid_pattern = re.compile("^[XN]+$") +pattern_match = valid_pattern.match(args.pattern) +if pattern_match is None: + raise ValueError("Error: supplied pattern '{}' is not valid.".format(args.pattern)) + +# check if at least one barcode position is included in the pattern +has_bcpos_pattern = re.compile("X") +pattern_match = has_bcpos_pattern.search(args.pattern) +if pattern_match is None: + raise ValueError("Error: supplied pattern '{}' does not contain a barcode position 'X'.".format(args.pattern)) + +logging.info("Barcode pattern analysis:") +# get X positions of pattern string +barcode_nt_pattern = re.compile("X+") +barcode_positions = [] +for m in re.finditer(barcode_nt_pattern, args.pattern): + logging.info(' found barcode positions in pattern: %02d-%02d: %s' % (m.start(), m.end(), m.group(0))) + barcode_positions.append((m.start(), m.end())) +logging.info(" barcode positions: {}".format(barcode_positions)) +# get last position of a barcode nt in the pattern +# reads must be long enough for all +min_readlen = barcode_positions[-1][-1] +logging.info(" last position of a barcode nt in pattern: {}".format(min_readlen)) +logging.info("") + +# get coordinates of nucleotides to keep +# the tail after the last barcode nt is handled separately +seq_positions = [] +last_seq_start = 0 +for bcstart, bcstop in barcode_positions: + seq_positions.append((last_seq_start, bcstart)) + last_seq_start = bcstop +logging.info(" sequence positions: {}".format(seq_positions)) +logging.info(" start of sequence tail: {}".format(last_seq_start)) + +samout = (open(args.outfile, "w") if args.outfile is not None else stdout) +if args.out_bc_fasta is not None: + faout = open(args.out_bc_fasta, "w") +for header, seq, qual in FastqGeneralIterator(open(args.infile)): + + # skip reads that are too short to extract the full requested barcode + if len(seq) < min_readlen: + logging.warning("skipping read '{}', is too short to extract the full requested barcode".format(header)) + logging.debug("seq: {}".format(seq)) + logging.debug("len(seq): {}".format(len(seq))) + continue + + # extract barcode nucleotides + barcode_list = [] + for bcstart, bcstop in barcode_positions: + barcode_list.append(seq[bcstart:bcstop]) + barcode = "".join(barcode_list) + logging.debug("extracted barcode: {}".format(barcode)) + + # create new sequence and quality string without barcode nucleotides + new_seq_list = [] + new_qual_list = [] + for seqstart, seqstop in seq_positions: + new_seq_list.append(seq[seqstart:seqstop]) + new_qual_list.append(qual[seqstart:seqstop]) + new_seq_list.append(seq[last_seq_start:]) + new_qual_list.append(qual[last_seq_start:]) + new_seq = "".join(new_seq_list) + new_qual = "".join(new_qual_list) + # check if at least one nucleotide is left. having none would break fastq + if len(new_seq) == 0: + logging.warning("skipping read '{}', no sequence remains after barcode extraction".format(header)) + logging.debug("seq: {}".format(seq)) + logging.debug("len(seq): {}".format(len(seq))) + continue + + # write barcode nucleotides into header + annotated_header = header + " " + barcode + samout.write("@%s\n%s\n+\n%s\n" % (annotated_header, new_seq, new_qual)) + + # write barcode to fasta if requested + if args.out_bc_fasta is not None: + faout.write(">{}\n{}\n".format(header, barcode)) + +# close files +samout.close() +if args.out_bc_fasta is not None: + faout.close() diff -r ae0f58d3318f -r de4ea3aa1090 extract_bcs.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/extract_bcs.xml Thu Oct 22 10:26:45 2015 -0400 @@ -0,0 +1,44 @@ + + Extract barcodes using pattern. + + macros.xml + + + + python extract_bcs.py --version + $default]]> + + + + + + + + + + + + + + + + diff -r ae0f58d3318f -r de4ea3aa1090 macros.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Thu Oct 22 10:26:45 2015 -0400 @@ -0,0 +1,17 @@ + + + + bctools + + + + + + + + + + 10.1016/j.molcel.2013.07.001 + + + diff -r ae0f58d3318f -r de4ea3aa1090 merge_pcr_duplicates.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/merge_pcr_duplicates.py Thu Oct 22 10:26:45 2015 -0400 @@ -0,0 +1,144 @@ +#!/usr/bin/env python + +tool_description = """ +Merge PCR duplicates according to random barcode library. + +Barcodes containing uncalled base 'N' are removed. By default output is written +to stdout. + +Input: +* bed6 file containing alignments with fastq read-id in name field +* fasta library with fastq read-id as sequence ids + +Output: +* bed6 file with random barcode in name field and number of PCR duplicates as + score, sorted by fields chrom, start, stop, strand, name + +Example usage: +- read PCR duplicates from file duplicates.bed and write merged results to file + merged.bed: +merge_pcr_duplicates.py duplicates.bed bclibrary.fa --out merged.bed +""" + +epilog = """ +Author: Daniel Maticzka +Copyright: 2015 +License: Apache +Email: maticzkd@informatik.uni-freiburg.de +Status: Testing +""" + +import argparse +import logging +from sys import stdout +from Bio import SeqIO +import pandas as pd + +# avoid ugly python IOError when stdout output is piped into another program +# and then truncated (such as piping to head) +from signal import signal, SIGPIPE, SIG_DFL +signal(SIGPIPE, SIG_DFL) + + +def fasta_tuple_generator(fasta_iterator): + "Yields id, sequence tuples given an iterator over Biopython SeqIO objects." + for record in input_seq_iterator: + yield (record.id, str(record.seq)) + + +# parse command line arguments +parser = argparse.ArgumentParser(description=tool_description, + epilog=epilog, + formatter_class=argparse.RawDescriptionHelpFormatter) +# positional arguments +parser.add_argument( + "alignments", + help="Path to bed6 file containing alignments.") +parser.add_argument( + "bclib", + help="Path to fasta barcode library.") +# optional arguments +parser.add_argument( + "-o", "--outfile", + help="Write results to this file.") +# misc arguments +parser.add_argument( + "-v", "--verbose", + help="Be verbose.", + action="store_true") +parser.add_argument( + "-d", "--debug", + help="Print lots of debugging information", + action="store_true") +parser.add_argument( + '--version', + action='version', + version='0.1.0') + +args = parser.parse_args() + +if args.debug: + logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s") +elif args.verbose: + logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s") +else: + logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s") +logging.info("Parsed arguments:") +logging.info(" alignments: '{}'".format(args.alignments)) +logging.info(" bclib: '{}'".format(args.bclib)) +if args.outfile: + logging.info(" outfile: enabled writing to file") + logging.info(" outfile: '{}'".format(args.outfile)) +logging.info("") + +# load barcode library into dictionary +input_handle = open(args.bclib, "rU") +input_seq_iterator = SeqIO.parse(input_handle, "fasta") +bcs = pd.DataFrame.from_records( + data=fasta_tuple_generator(input_seq_iterator), + columns=["read_id", "bc"]) + +# load alignments +alns = pd.read_csv( + args.alignments, + sep="\t", + names=["chrom", "start", "stop", "read_id", "score", "strand"]) + +# combine barcode library and alignments +bcalib = pd.merge( + bcs, alns, + on="read_id", + how="inner", + sort=False) +if bcalib.empty: + raise Exception("ERROR: no common entries for alignments and barcode library found. Please check your input files.") +n_alns = len(alns.index) +n_bcalib = len(bcalib.index) +if n_bcalib < n_alns: + logging.warning( + "{} of {} alignments could not be associated with a random barcode.".format( + n_alns - n_bcalib, n_alns)) + +# remove entries with barcodes that has uncalled base N +bcalib_cleaned = bcalib.drop(bcalib[bcalib.bc.str.contains("N")].index) +n_bcalib_cleaned = len(bcalib_cleaned) +if n_bcalib_cleaned < n_bcalib: + msg = "{} of {} alignments had random barcodes containing uncalled bases and were dropped.".format( + n_bcalib - n_bcalib_cleaned, n_bcalib) + if n_bcalib_cleaned < (0.8 * n_bcalib): + logging.warning(msg) + else: + logging.info(msg) + +# count and merge pcr duplicates +# grouping sorts by keys, so the ouput will be properly sorted +merged = bcalib_cleaned.groupby(['chrom', 'start', 'stop', 'strand', 'bc']).size().reset_index() +merged.rename(columns={0: 'ndupes'}, copy=False, inplace=True) + +# write coordinates of crosslinking event alignments +eventalnout = (open(args.outfile, "w") if args.outfile is not None else stdout) +merged.to_csv( + eventalnout, + columns=['chrom', 'start', 'stop', 'bc', 'ndupes', 'strand'], + sep="\t", index=False, header=False) +eventalnout.close() diff -r ae0f58d3318f -r de4ea3aa1090 merge_pcr_duplicates.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/merge_pcr_duplicates.xml Thu Oct 22 10:26:45 2015 -0400 @@ -0,0 +1,52 @@ + + Merge PCR duplicates according to random barcode library. + + macros.xml + + + + python merge_pcr_duplicates.py --version + $default]]> + + + + + + + + + + + + + + + + diff -r ae0f58d3318f -r de4ea3aa1090 remove_tail.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/remove_tail.py Thu Oct 22 10:26:45 2015 -0400 @@ -0,0 +1,95 @@ +#!/usr/bin/env python + +tool_description = """ +Remove a certain number of nucleotides from the 3'-tails of sequences in fastq +format. + +Example usage: +- remove the last 7 nucleotides from file input.fastq, write result to file + output.fastq: +remove_tail.py input.fastq 7 --out output.fastq +""" + +epilog = """ +Author: Daniel Maticzka +Copyright: 2015 +License: Apache +Email: maticzkd@informatik.uni-freiburg.de +Status: Testing +""" + +import argparse +import logging +from sys import stdout +from Bio.SeqIO.QualityIO import FastqGeneralIterator + +# avoid ugly python IOError when stdout output is piped into another program +# and then truncated (such as piping to head) +from signal import signal, SIGPIPE, SIG_DFL +signal(SIGPIPE, SIG_DFL) + +# parse command line arguments +parser = argparse.ArgumentParser(description=tool_description, + epilog=epilog, + formatter_class=argparse.RawDescriptionHelpFormatter) +# positional arguments +parser.add_argument( + "infile", + help="Path to fastq file.") +parser.add_argument( + "length", + type=int, + help="Remove this many nts.") +# optional arguments +parser.add_argument( + "-o", "--outfile", + help="Write results to this file.") +parser.add_argument( + "-v", "--verbose", + help="Be verbose.", + action="store_true") +parser.add_argument( + "-d", "--debug", + help="Print lots of debugging information", + action="store_true") +parser.add_argument( + '--version', + action='version', + version='0.1.0') + +args = parser.parse_args() +if args.debug: + logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s") +elif args.verbose: + logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s") +else: + logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s") +logging.info("Parsed arguments:") +logging.info(" infile: '{}'".format(args.infile)) +logging.info(" length: '{}'".format(args.length)) +if args.outfile: + logging.info(" outfile: enabled writing to file") + logging.info(" outfile: '{}'".format(args.outfile)) +logging.info("") + +# check length parameter +if args.length < 0: + raise ValueError("Length must be a positive integer, is '{}'.".format(args.length)) + +# remove tail +with (open(args.outfile, "w") if args.outfile is not None else stdout) as samout: + for header, seq, qual in FastqGeneralIterator(open(args.infile)): + + # if removing tail would lead to an empty sequence, + # set sequence to a single N to keep fastq synchronized + if len(seq) <= args.length: + seq = "N" + qual = "B" + logging.debug("read '{}' was too short to remove full tail".format(header)) + logging.debug("seq: {}".format(seq)) + logging.debug("len(seq): {}".format(len(seq))) + else: + seq = seq[0:-args.length] + qual = qual[0:-args.length] + + samout.write("@%s\n%s\n+\n%s\n" % (header, seq, qual)) diff -r ae0f58d3318f -r de4ea3aa1090 remove_tail.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/remove_tail.xml Thu Oct 22 10:26:45 2015 -0400 @@ -0,0 +1,43 @@ + + Remove nts from 3'-end. + + macros.xml + + + + python remove_tail.py --version + $default]]> + + + + + + + + + + + + + + + + diff -r ae0f58d3318f -r de4ea3aa1090 rm_spurious_events.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rm_spurious_events.py Thu Oct 22 10:26:45 2015 -0400 @@ -0,0 +1,118 @@ +#!/usr/bin/env python + +tool_description = """ +Remove spurious events originating from errors in random sequence tags. + +This script compares all events sharing the same coordinates. Among each group +of events the maximum number of PCR duplicates is determined. All events that +are supported by less than 10 percent of this maximum count are removed. + +By default output is written to stdout. + +Input: +* bed6 file containing crosslinking events with score field set to number of PCR + duplicates + +Output: +* bed6 file with spurious crosslinking events removed, sorted by fields chrom, + start, stop, strand + +Example usage: +- remove spurious events from spurious.bed and write results to file cleaned.bed +rm_spurious_events.py spurious.bed --out cleaned.bed +""" + +epilog = """ +Author: Daniel Maticzka +Copyright: 2015 +License: Apache +Email: maticzkd@informatik.uni-freiburg.de +Status: Testing +""" + +import argparse +import logging +from sys import stdout +import pandas as pd + + +class DefaultsRawDescriptionHelpFormatter(argparse.ArgumentDefaultsHelpFormatter, + argparse.RawDescriptionHelpFormatter): + # To join the behaviour of RawDescriptionHelpFormatter with that of ArgumentDefaultsHelpFormatter + pass + +# avoid ugly python IOError when stdout output is piped into another program +# and then truncated (such as piping to head) +from signal import signal, SIGPIPE, SIG_DFL +signal(SIGPIPE, SIG_DFL) + +# parse command line arguments +parser = argparse.ArgumentParser(description=tool_description, + epilog=epilog, + formatter_class=DefaultsRawDescriptionHelpFormatter) +# positional arguments +parser.add_argument( + "events", + help="Path to bed6 file containing alignments.") +# optional arguments +parser.add_argument( + "-o", "--outfile", + help="Write results to this file.") +parser.add_argument( + "-t", "--threshold", + type=float, + default=0.1, + help="Threshold for spurious event removal." +) +# misc arguments +parser.add_argument( + "-v", "--verbose", + help="Be verbose.", + action="store_true") +parser.add_argument( + "-d", "--debug", + help="Print lots of debugging information", + action="store_true") +parser.add_argument( + '--version', + action='version', + version='0.1.0') + +args = parser.parse_args() + +if args.debug: + logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s") +elif args.verbose: + logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s") +else: + logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s") +logging.info("Parsed arguments:") +logging.info(" alignments: '{}'".format(args.events)) +logging.info(" threshold: '{}'".format(args.threshold)) +if args.outfile: + logging.info(" outfile: enabled writing to file") + logging.info(" outfile: '{}'".format(args.outfile)) +logging.info("") + +# check threshold parameter value +if args.threshold < 0 or args.threshold > 1: + raise ValueError("Threshold must be in [0,1].") + +# load alignments +alns = pd.read_csv( + args.events, + sep="\t", + names=["chrom", "start", "stop", "read_id", "score", "strand"]) + +# remove all alignments that not enough PCR duplicates with respect to +# the group maximum +grouped = alns.groupby(['chrom', 'start', 'stop', 'strand'], group_keys=False) +alns_cleaned = grouped.apply(lambda g: g[g["score"] >= args.threshold * g["score"].max()]) + +# write coordinates of crosslinking event alignments +alns_cleaned_out = (open(args.outfile, "w") if args.outfile is not None else stdout) +alns_cleaned.to_csv( + alns_cleaned_out, + columns=['chrom', 'start', 'stop', 'read_id', 'score', 'strand'], + sep="\t", index=False, header=False) +alns_cleaned_out.close() diff -r ae0f58d3318f -r de4ea3aa1090 rm_spurious_events.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rm_spurious_events.xml Thu Oct 22 10:26:45 2015 -0400 @@ -0,0 +1,54 @@ + + Remove spurious events. + + macros.xml + + + + python rm_spurious_events.py --version + $default]]> + + + + + + + + + + + + + + + + diff -r ae0f58d3318f -r de4ea3aa1090 tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Thu Oct 22 10:26:45 2015 -0400 @@ -0,0 +1,12 @@ + + + + + + + + + + + +