view extract_aln_ends.py @ 10:17603d9eee69 draft

Uploaded
author rnateam
date Tue, 17 Nov 2015 11:35:45 -0500
parents de4ea3aa1090
children 570a7de9f151
line wrap: on
line source

#!/usr/bin/env python

tool_description = """
Extract alignment ends from sam file.

The resulting bed file contains the outer coordinates of the alignments. The
bed name field is set to the read id and the score field is set to the edit
distance of the alignment. The crosslinked nucleotide is one nt upstream of the
5'-end of the bed entries.

This script only reports results for alignments that are properly aligned in FR
("forward-reverse") direction.

By default output is written to stdout.

Input:
* sam file containing alignments (paired-end sequencing)

Output:
* bed6 file containing outer coordinates (sorted by read id)

Example usage:
- Extract coordinates from file input.bam and write to file output.bed
extract_aln_ends.py input.bam --out output.bed
"""

epilog = """
Author: Daniel Maticzka
Copyright: 2015
License: Apache
Email: maticzkd@informatik.uni-freiburg.de
Status: Development
"""

import argparse
import logging
from sys import stdout
from shutil import rmtree
from tempfile import mkdtemp
from pybedtools import BedTool
import pysam


class DefaultsRawDescriptionHelpFormatter(argparse.ArgumentDefaultsHelpFormatter,
                                          argparse.RawDescriptionHelpFormatter):
    # To join the behaviour of RawDescriptionHelpFormatter with that of ArgumentDefaultsHelpFormatter
    pass

# avoid ugly python IOError when stdout output is piped into another program
# and then truncated (such as piping to head)
from signal import signal, SIGPIPE, SIG_DFL
signal(SIGPIPE, SIG_DFL)

# parse command line arguments
parser = argparse.ArgumentParser(description=tool_description,
                                 epilog=epilog,
                                 formatter_class=DefaultsRawDescriptionHelpFormatter)
# positional arguments
parser.add_argument(
    "sam",
    help="Path to sam file containing alignments.")
# optional arguments
parser.add_argument(
    "-o", "--outfile",
    help="Write results to this file.")
# misc arguments
parser.add_argument(
    "-v", "--verbose",
    help="Be verbose.",
    action="store_true")
parser.add_argument(
    "-d", "--debug",
    help="Print lots of debugging information",
    action="store_true")
parser.add_argument(
    '--version',
    action='version',
    version='0.1.0')

args = parser.parse_args()

if args.debug:
    logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
elif args.verbose:
    logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
else:
    logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
logging.info("Parsed arguments:")
logging.info("  sam: '{}'".format(args.sam))
if args.outfile:
    logging.info("  outfile: enabled writing to file")
    logging.info("  outfile: '{}'".format(args.outfile))
logging.info("")

# convert to bam for use with pybedtools
try:
    # setup temporary directory
    tmpdir = mkdtemp()
    logging.debug("tmpdir: " + tmpdir)

    fn_sorted = tmpdir + "/sorted.bam"
    fn_fixedmates = tmpdir + "/fixedmates.bam"

    # sort by id
    logging.debug("calling samtools sort")
    pysam.sort(args.sam, "-n", "-o{}".format(fn_sorted), "-T sortprefix")

    # fix mate information
    # also removes secondary and unmapped reads
    logging.debug("calling samtools fixmates")
    pysam.fixmate("-r", fn_sorted, fn_fixedmates)

    # bedtools bam2bed
    alns = BedTool(fn_fixedmates)
    alns_bedpe = alns.bam_to_bed(bedpe=True, mate1=True, ed=True)

    # determine alignment ends and write to file
    with (open(args.outfile, "w") if args.outfile is not None else stdout) as out:
        for i in alns_bedpe:
            chrom = i.fields[0]
            fmstart = i.fields[1]
            fmend = i.fields[2]
            smstart = i.fields[4]
            smend = i.fields[5]
            readid = i.fields[6]
            score = i.fields[7]
            fmstrand = i.fields[8]
            if fmstrand == "+":
                start = fmstart
                end = smend
            elif fmstrand == "-":
                start = smstart
                end = fmend
            else:
                logging.warning("Skipping {}, strand information is missing: '{}'".format(readid, i))
                continue
            out.write("\t".join([chrom, start, end, readid, score, fmstrand]) + "\n")
finally:
    logging.debug("removed tmpdir: " + tmpdir)
    rmtree(tmpdir)