diff merge_pcr_duplicates.py @ 2:de4ea3aa1090 draft

Uploaded
author rnateam
date Thu, 22 Oct 2015 10:26:45 -0400
parents
children 17ef0e0dae68
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/merge_pcr_duplicates.py	Thu Oct 22 10:26:45 2015 -0400
@@ -0,0 +1,144 @@
+#!/usr/bin/env python
+
+tool_description = """
+Merge PCR duplicates according to random barcode library.
+
+Barcodes containing uncalled base 'N' are removed. By default output is written
+to stdout.
+
+Input:
+* bed6 file containing alignments with fastq read-id in name field
+* fasta library with fastq read-id as sequence ids
+
+Output:
+* bed6 file with random barcode in name field and number of PCR duplicates as
+  score, sorted by fields chrom, start, stop, strand, name
+
+Example usage:
+- read PCR duplicates from file duplicates.bed and write merged results to file
+  merged.bed:
+merge_pcr_duplicates.py duplicates.bed bclibrary.fa --out merged.bed
+"""
+
+epilog = """
+Author: Daniel Maticzka
+Copyright: 2015
+License: Apache
+Email: maticzkd@informatik.uni-freiburg.de
+Status: Testing
+"""
+
+import argparse
+import logging
+from sys import stdout
+from Bio import SeqIO
+import pandas as pd
+
+# avoid ugly python IOError when stdout output is piped into another program
+# and then truncated (such as piping to head)
+from signal import signal, SIGPIPE, SIG_DFL
+signal(SIGPIPE, SIG_DFL)
+
+
+def fasta_tuple_generator(fasta_iterator):
+    "Yields id, sequence tuples given an iterator over Biopython SeqIO objects."
+    for record in input_seq_iterator:
+        yield (record.id, str(record.seq))
+
+
+# parse command line arguments
+parser = argparse.ArgumentParser(description=tool_description,
+                                 epilog=epilog,
+                                 formatter_class=argparse.RawDescriptionHelpFormatter)
+# positional arguments
+parser.add_argument(
+    "alignments",
+    help="Path to bed6 file containing alignments.")
+parser.add_argument(
+    "bclib",
+    help="Path to fasta barcode library.")
+# optional arguments
+parser.add_argument(
+    "-o", "--outfile",
+    help="Write results to this file.")
+# misc arguments
+parser.add_argument(
+    "-v", "--verbose",
+    help="Be verbose.",
+    action="store_true")
+parser.add_argument(
+    "-d", "--debug",
+    help="Print lots of debugging information",
+    action="store_true")
+parser.add_argument(
+    '--version',
+    action='version',
+    version='0.1.0')
+
+args = parser.parse_args()
+
+if args.debug:
+    logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
+elif args.verbose:
+    logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
+else:
+    logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
+logging.info("Parsed arguments:")
+logging.info("  alignments: '{}'".format(args.alignments))
+logging.info("  bclib: '{}'".format(args.bclib))
+if args.outfile:
+    logging.info("  outfile: enabled writing to file")
+    logging.info("  outfile: '{}'".format(args.outfile))
+logging.info("")
+
+# load barcode library into dictionary
+input_handle = open(args.bclib, "rU")
+input_seq_iterator = SeqIO.parse(input_handle, "fasta")
+bcs = pd.DataFrame.from_records(
+    data=fasta_tuple_generator(input_seq_iterator),
+    columns=["read_id", "bc"])
+
+# load alignments
+alns = pd.read_csv(
+    args.alignments,
+    sep="\t",
+    names=["chrom", "start", "stop", "read_id", "score", "strand"])
+
+# combine barcode library and alignments
+bcalib = pd.merge(
+    bcs, alns,
+    on="read_id",
+    how="inner",
+    sort=False)
+if bcalib.empty:
+    raise Exception("ERROR: no common entries for alignments and barcode library found. Please check your input files.")
+n_alns = len(alns.index)
+n_bcalib = len(bcalib.index)
+if n_bcalib < n_alns:
+    logging.warning(
+        "{} of {} alignments could not be associated with a random barcode.".format(
+            n_alns - n_bcalib, n_alns))
+
+# remove entries with barcodes that has uncalled base N
+bcalib_cleaned = bcalib.drop(bcalib[bcalib.bc.str.contains("N")].index)
+n_bcalib_cleaned = len(bcalib_cleaned)
+if n_bcalib_cleaned < n_bcalib:
+    msg = "{} of {} alignments had random barcodes containing uncalled bases and were dropped.".format(
+        n_bcalib - n_bcalib_cleaned, n_bcalib)
+    if n_bcalib_cleaned < (0.8 * n_bcalib):
+        logging.warning(msg)
+    else:
+        logging.info(msg)
+
+# count and merge pcr duplicates
+# grouping sorts by keys, so the ouput will be properly sorted
+merged = bcalib_cleaned.groupby(['chrom', 'start', 'stop', 'strand', 'bc']).size().reset_index()
+merged.rename(columns={0: 'ndupes'}, copy=False, inplace=True)
+
+# write coordinates of crosslinking event alignments
+eventalnout = (open(args.outfile, "w") if args.outfile is not None else stdout)
+merged.to_csv(
+    eventalnout,
+    columns=['chrom', 'start', 'stop', 'bc', 'ndupes', 'strand'],
+    sep="\t", index=False, header=False)
+eventalnout.close()