Mercurial > repos > rnateam > bctools
view extract_aln_ends.py @ 19:885602df4554 draft
another try with bedtools dependency
author | rnateam |
---|---|
date | Mon, 30 Nov 2015 11:43:33 -0500 |
parents | 570a7de9f151 |
children | 0b9aab6aaebf |
line wrap: on
line source
#!/usr/bin/env python tool_description = """ Extract alignment ends from sam file. The resulting bed file contains the outer coordinates of the alignments. The bed name field is set to the read id and the score field is set to the edit distance of the alignment. The crosslinked nucleotide is one nt upstream of the 5'-end of the bed entries. This script only reports results for alignments that are properly aligned in FR ("forward-reverse") direction. By default output is written to stdout. Input: * alignments in SAM or BAM format (paired-end sequencing) Output: * bed6 file containing outer coordinates (sorted by read id) Example usage: - Extract coordinates from file input.bam and write to file output.bed extract_aln_ends.py input.bam --out output.bed """ epilog = """ Author: Daniel Maticzka Copyright: 2015 License: Apache Email: maticzkd@informatik.uni-freiburg.de Status: Development """ import argparse import logging from sys import stdout from shutil import rmtree from tempfile import mkdtemp from pybedtools import BedTool import pysam class DefaultsRawDescriptionHelpFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter): # To join the behaviour of RawDescriptionHelpFormatter with that of ArgumentDefaultsHelpFormatter pass # avoid ugly python IOError when stdout output is piped into another program # and then truncated (such as piping to head) from signal import signal, SIGPIPE, SIG_DFL signal(SIGPIPE, SIG_DFL) # parse command line arguments parser = argparse.ArgumentParser(description=tool_description, epilog=epilog, formatter_class=DefaultsRawDescriptionHelpFormatter) # positional arguments parser.add_argument( "infile", help="Path to alignments in SAM or BAM format.") # optional arguments parser.add_argument( "-o", "--outfile", help="Write results to this file.") # misc arguments parser.add_argument( "-v", "--verbose", help="Be verbose.", action="store_true") parser.add_argument( "-d", "--debug", help="Print lots of debugging information", action="store_true") parser.add_argument( '--version', action='version', version='0.2.0') args = parser.parse_args() if args.debug: logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s") elif args.verbose: logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s") else: logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s") logging.info("Parsed arguments:") logging.info(" infile: '{}'".format(args.infile)) if args.outfile: logging.info(" outfile: enabled writing to file") logging.info(" outfile: '{}'".format(args.outfile)) logging.info("") # convert to bam for use with pybedtools try: # setup temporary directory tmpdir = mkdtemp() logging.debug("tmpdir: " + tmpdir) fn_sorted = tmpdir + "/sorted.bam" fn_fixedmates = tmpdir + "/fixedmates.bam" # sort by id logging.debug("calling samtools sort") pysam.sort(args.infile, "-n", "-o{}".format(fn_sorted), "-T sortprefix") # fix mate information # also removes secondary and unmapped reads logging.debug("calling samtools fixmates") pysam.fixmate("-r", fn_sorted, fn_fixedmates) # bedtools bam2bed alns = BedTool(fn_fixedmates) alns_bedpe = alns.bam_to_bed(bedpe=True, mate1=True, ed=True) # determine alignment ends and write to file with (open(args.outfile, "w") if args.outfile is not None else stdout) as out: for i in alns_bedpe: chrom = i.fields[0] fmstart = i.fields[1] fmend = i.fields[2] smstart = i.fields[4] smend = i.fields[5] readid = i.fields[6] score = i.fields[7] fmstrand = i.fields[8] if fmstrand == "+": start = fmstart end = smend elif fmstrand == "-": start = smstart end = fmend else: logging.warning("Skipping {}, strand information is missing: '{}'".format(readid, i)) continue out.write("\t".join([chrom, start, end, readid, score, fmstrand]) + "\n") finally: logging.debug("removed tmpdir: " + tmpdir) rmtree(tmpdir)