diff rm_spurious_events.py @ 2:de4ea3aa1090 draft

Uploaded
author rnateam
date Thu, 22 Oct 2015 10:26:45 -0400
parents
children 0b9aab6aaebf
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rm_spurious_events.py	Thu Oct 22 10:26:45 2015 -0400
@@ -0,0 +1,118 @@
+#!/usr/bin/env python
+
+tool_description = """
+Remove spurious events originating from errors in random sequence tags.
+
+This script compares all events sharing the same coordinates. Among each group
+of events the maximum number of PCR duplicates is determined. All events that
+are supported by less than 10 percent of this maximum count are removed.
+
+By default output is written to stdout.
+
+Input:
+* bed6 file containing crosslinking events with score field set to number of PCR
+  duplicates
+
+Output:
+* bed6 file with spurious crosslinking events removed, sorted by fields chrom,
+  start, stop, strand
+
+Example usage:
+- remove spurious events from spurious.bed and write results to file cleaned.bed
+rm_spurious_events.py spurious.bed --out cleaned.bed
+"""
+
+epilog = """
+Author: Daniel Maticzka
+Copyright: 2015
+License: Apache
+Email: maticzkd@informatik.uni-freiburg.de
+Status: Testing
+"""
+
+import argparse
+import logging
+from sys import stdout
+import pandas as pd
+
+
+class DefaultsRawDescriptionHelpFormatter(argparse.ArgumentDefaultsHelpFormatter,
+                                          argparse.RawDescriptionHelpFormatter):
+    # To join the behaviour of RawDescriptionHelpFormatter with that of ArgumentDefaultsHelpFormatter
+    pass
+
+# avoid ugly python IOError when stdout output is piped into another program
+# and then truncated (such as piping to head)
+from signal import signal, SIGPIPE, SIG_DFL
+signal(SIGPIPE, SIG_DFL)
+
+# parse command line arguments
+parser = argparse.ArgumentParser(description=tool_description,
+                                 epilog=epilog,
+                                 formatter_class=DefaultsRawDescriptionHelpFormatter)
+# positional arguments
+parser.add_argument(
+    "events",
+    help="Path to bed6 file containing alignments.")
+# optional arguments
+parser.add_argument(
+    "-o", "--outfile",
+    help="Write results to this file.")
+parser.add_argument(
+    "-t", "--threshold",
+    type=float,
+    default=0.1,
+    help="Threshold for spurious event removal."
+)
+# misc arguments
+parser.add_argument(
+    "-v", "--verbose",
+    help="Be verbose.",
+    action="store_true")
+parser.add_argument(
+    "-d", "--debug",
+    help="Print lots of debugging information",
+    action="store_true")
+parser.add_argument(
+    '--version',
+    action='version',
+    version='0.1.0')
+
+args = parser.parse_args()
+
+if args.debug:
+    logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
+elif args.verbose:
+    logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
+else:
+    logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
+logging.info("Parsed arguments:")
+logging.info("  alignments: '{}'".format(args.events))
+logging.info("  threshold: '{}'".format(args.threshold))
+if args.outfile:
+    logging.info("  outfile: enabled writing to file")
+    logging.info("  outfile: '{}'".format(args.outfile))
+logging.info("")
+
+# check threshold parameter value
+if args.threshold < 0 or args.threshold > 1:
+    raise ValueError("Threshold must be in [0,1].")
+
+# load alignments
+alns = pd.read_csv(
+    args.events,
+    sep="\t",
+    names=["chrom", "start", "stop", "read_id", "score", "strand"])
+
+# remove all alignments that not enough PCR duplicates with respect to
+# the group maximum
+grouped = alns.groupby(['chrom', 'start', 'stop', 'strand'], group_keys=False)
+alns_cleaned = grouped.apply(lambda g: g[g["score"] >= args.threshold * g["score"].max()])
+
+# write coordinates of crosslinking event alignments
+alns_cleaned_out = (open(args.outfile, "w") if args.outfile is not None else stdout)
+alns_cleaned.to_csv(
+    alns_cleaned_out,
+    columns=['chrom', 'start', 'stop', 'read_id', 'score', 'strand'],
+    sep="\t", index=False, header=False)
+alns_cleaned_out.close()