Mercurial > repos > rnateam > bctools
comparison extract_aln_ends.py @ 2:de4ea3aa1090 draft
Uploaded
| author | rnateam |
|---|---|
| date | Thu, 22 Oct 2015 10:26:45 -0400 |
| parents | |
| children | 570a7de9f151 |
comparison
equal
deleted
inserted
replaced
| 1:ae0f58d3318f | 2:de4ea3aa1090 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 tool_description = """ | |
| 4 Extract alignment ends from sam file. | |
| 5 | |
| 6 The resulting bed file contains the outer coordinates of the alignments. The | |
| 7 bed name field is set to the read id and the score field is set to the edit | |
| 8 distance of the alignment. The crosslinked nucleotide is one nt upstream of the | |
| 9 5'-end of the bed entries. | |
| 10 | |
| 11 This script only reports results for alignments that are properly aligned in FR | |
| 12 ("forward-reverse") direction. | |
| 13 | |
| 14 By default output is written to stdout. | |
| 15 | |
| 16 Input: | |
| 17 * sam file containing alignments (paired-end sequencing) | |
| 18 | |
| 19 Output: | |
| 20 * bed6 file containing outer coordinates (sorted by read id) | |
| 21 | |
| 22 Example usage: | |
| 23 - Extract coordinates from file input.bam and write to file output.bed | |
| 24 extract_aln_ends.py input.bam --out output.bed | |
| 25 """ | |
| 26 | |
| 27 epilog = """ | |
| 28 Author: Daniel Maticzka | |
| 29 Copyright: 2015 | |
| 30 License: Apache | |
| 31 Email: maticzkd@informatik.uni-freiburg.de | |
| 32 Status: Development | |
| 33 """ | |
| 34 | |
| 35 import argparse | |
| 36 import logging | |
| 37 from sys import stdout | |
| 38 from shutil import rmtree | |
| 39 from tempfile import mkdtemp | |
| 40 from pybedtools import BedTool | |
| 41 import pysam | |
| 42 | |
| 43 | |
| 44 class DefaultsRawDescriptionHelpFormatter(argparse.ArgumentDefaultsHelpFormatter, | |
| 45 argparse.RawDescriptionHelpFormatter): | |
| 46 # To join the behaviour of RawDescriptionHelpFormatter with that of ArgumentDefaultsHelpFormatter | |
| 47 pass | |
| 48 | |
| 49 # avoid ugly python IOError when stdout output is piped into another program | |
| 50 # and then truncated (such as piping to head) | |
| 51 from signal import signal, SIGPIPE, SIG_DFL | |
| 52 signal(SIGPIPE, SIG_DFL) | |
| 53 | |
| 54 # parse command line arguments | |
| 55 parser = argparse.ArgumentParser(description=tool_description, | |
| 56 epilog=epilog, | |
| 57 formatter_class=DefaultsRawDescriptionHelpFormatter) | |
| 58 # positional arguments | |
| 59 parser.add_argument( | |
| 60 "sam", | |
| 61 help="Path to sam file containing alignments.") | |
| 62 # optional arguments | |
| 63 parser.add_argument( | |
| 64 "-o", "--outfile", | |
| 65 help="Write results to this file.") | |
| 66 # misc arguments | |
| 67 parser.add_argument( | |
| 68 "-v", "--verbose", | |
| 69 help="Be verbose.", | |
| 70 action="store_true") | |
| 71 parser.add_argument( | |
| 72 "-d", "--debug", | |
| 73 help="Print lots of debugging information", | |
| 74 action="store_true") | |
| 75 parser.add_argument( | |
| 76 '--version', | |
| 77 action='version', | |
| 78 version='0.1.0') | |
| 79 | |
| 80 args = parser.parse_args() | |
| 81 | |
| 82 if args.debug: | |
| 83 logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s") | |
| 84 elif args.verbose: | |
| 85 logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s") | |
| 86 else: | |
| 87 logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s") | |
| 88 logging.info("Parsed arguments:") | |
| 89 logging.info(" sam: '{}'".format(args.sam)) | |
| 90 if args.outfile: | |
| 91 logging.info(" outfile: enabled writing to file") | |
| 92 logging.info(" outfile: '{}'".format(args.outfile)) | |
| 93 logging.info("") | |
| 94 | |
| 95 # convert to bam for use with pybedtools | |
| 96 try: | |
| 97 # setup temporary directory | |
| 98 tmpdir = mkdtemp() | |
| 99 logging.debug("tmpdir: " + tmpdir) | |
| 100 | |
| 101 fn_sorted = tmpdir + "/sorted.bam" | |
| 102 fn_fixedmates = tmpdir + "/fixedmates.bam" | |
| 103 | |
| 104 # sort by id | |
| 105 logging.debug("calling samtools sort") | |
| 106 pysam.sort(args.sam, "-n", "-o{}".format(fn_sorted), "-T sortprefix") | |
| 107 | |
| 108 # fix mate information | |
| 109 # also removes secondary and unmapped reads | |
| 110 logging.debug("calling samtools fixmates") | |
| 111 pysam.fixmate("-r", fn_sorted, fn_fixedmates) | |
| 112 | |
| 113 # bedtools bam2bed | |
| 114 alns = BedTool(fn_fixedmates) | |
| 115 alns_bedpe = alns.bam_to_bed(bedpe=True, mate1=True, ed=True) | |
| 116 | |
| 117 # determine alignment ends and write to file | |
| 118 with (open(args.outfile, "w") if args.outfile is not None else stdout) as out: | |
| 119 for i in alns_bedpe: | |
| 120 chrom = i.fields[0] | |
| 121 fmstart = i.fields[1] | |
| 122 fmend = i.fields[2] | |
| 123 smstart = i.fields[4] | |
| 124 smend = i.fields[5] | |
| 125 readid = i.fields[6] | |
| 126 score = i.fields[7] | |
| 127 fmstrand = i.fields[8] | |
| 128 if fmstrand == "+": | |
| 129 start = fmstart | |
| 130 end = smend | |
| 131 elif fmstrand == "-": | |
| 132 start = smstart | |
| 133 end = fmend | |
| 134 else: | |
| 135 logging.warning("Skipping {}, strand information is missing: '{}'".format(readid, i)) | |
| 136 continue | |
| 137 out.write("\t".join([chrom, start, end, readid, score, fmstrand]) + "\n") | |
| 138 finally: | |
| 139 logging.debug("removed tmpdir: " + tmpdir) | |
| 140 rmtree(tmpdir) |
