diff extract_aln_ends.py @ 2:de4ea3aa1090 draft

Uploaded
author rnateam
date Thu, 22 Oct 2015 10:26:45 -0400
parents
children 570a7de9f151
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/extract_aln_ends.py	Thu Oct 22 10:26:45 2015 -0400
@@ -0,0 +1,140 @@
+#!/usr/bin/env python
+
+tool_description = """
+Extract alignment ends from sam file.
+
+The resulting bed file contains the outer coordinates of the alignments. The
+bed name field is set to the read id and the score field is set to the edit
+distance of the alignment. The crosslinked nucleotide is one nt upstream of the
+5'-end of the bed entries.
+
+This script only reports results for alignments that are properly aligned in FR
+("forward-reverse") direction.
+
+By default output is written to stdout.
+
+Input:
+* sam file containing alignments (paired-end sequencing)
+
+Output:
+* bed6 file containing outer coordinates (sorted by read id)
+
+Example usage:
+- Extract coordinates from file input.bam and write to file output.bed
+extract_aln_ends.py input.bam --out output.bed
+"""
+
+epilog = """
+Author: Daniel Maticzka
+Copyright: 2015
+License: Apache
+Email: maticzkd@informatik.uni-freiburg.de
+Status: Development
+"""
+
+import argparse
+import logging
+from sys import stdout
+from shutil import rmtree
+from tempfile import mkdtemp
+from pybedtools import BedTool
+import pysam
+
+
+class DefaultsRawDescriptionHelpFormatter(argparse.ArgumentDefaultsHelpFormatter,
+                                          argparse.RawDescriptionHelpFormatter):
+    # To join the behaviour of RawDescriptionHelpFormatter with that of ArgumentDefaultsHelpFormatter
+    pass
+
+# avoid ugly python IOError when stdout output is piped into another program
+# and then truncated (such as piping to head)
+from signal import signal, SIGPIPE, SIG_DFL
+signal(SIGPIPE, SIG_DFL)
+
+# parse command line arguments
+parser = argparse.ArgumentParser(description=tool_description,
+                                 epilog=epilog,
+                                 formatter_class=DefaultsRawDescriptionHelpFormatter)
+# positional arguments
+parser.add_argument(
+    "sam",
+    help="Path to sam file containing alignments.")
+# optional arguments
+parser.add_argument(
+    "-o", "--outfile",
+    help="Write results to this file.")
+# misc arguments
+parser.add_argument(
+    "-v", "--verbose",
+    help="Be verbose.",
+    action="store_true")
+parser.add_argument(
+    "-d", "--debug",
+    help="Print lots of debugging information",
+    action="store_true")
+parser.add_argument(
+    '--version',
+    action='version',
+    version='0.1.0')
+
+args = parser.parse_args()
+
+if args.debug:
+    logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
+elif args.verbose:
+    logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
+else:
+    logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
+logging.info("Parsed arguments:")
+logging.info("  sam: '{}'".format(args.sam))
+if args.outfile:
+    logging.info("  outfile: enabled writing to file")
+    logging.info("  outfile: '{}'".format(args.outfile))
+logging.info("")
+
+# convert to bam for use with pybedtools
+try:
+    # setup temporary directory
+    tmpdir = mkdtemp()
+    logging.debug("tmpdir: " + tmpdir)
+
+    fn_sorted = tmpdir + "/sorted.bam"
+    fn_fixedmates = tmpdir + "/fixedmates.bam"
+
+    # sort by id
+    logging.debug("calling samtools sort")
+    pysam.sort(args.sam, "-n", "-o{}".format(fn_sorted), "-T sortprefix")
+
+    # fix mate information
+    # also removes secondary and unmapped reads
+    logging.debug("calling samtools fixmates")
+    pysam.fixmate("-r", fn_sorted, fn_fixedmates)
+
+    # bedtools bam2bed
+    alns = BedTool(fn_fixedmates)
+    alns_bedpe = alns.bam_to_bed(bedpe=True, mate1=True, ed=True)
+
+    # determine alignment ends and write to file
+    with (open(args.outfile, "w") if args.outfile is not None else stdout) as out:
+        for i in alns_bedpe:
+            chrom = i.fields[0]
+            fmstart = i.fields[1]
+            fmend = i.fields[2]
+            smstart = i.fields[4]
+            smend = i.fields[5]
+            readid = i.fields[6]
+            score = i.fields[7]
+            fmstrand = i.fields[8]
+            if fmstrand == "+":
+                start = fmstart
+                end = smend
+            elif fmstrand == "-":
+                start = smstart
+                end = fmend
+            else:
+                logging.warning("Skipping {}, strand information is missing: '{}'".format(readid, i))
+                continue
+            out.write("\t".join([chrom, start, end, readid, score, fmstrand]) + "\n")
+finally:
+    logging.debug("removed tmpdir: " + tmpdir)
+    rmtree(tmpdir)