2
|
1 #!/usr/bin/env python
|
|
2
|
|
3 tool_description = """
|
|
4 Extract alignment ends from sam file.
|
|
5
|
|
6 The resulting bed file contains the outer coordinates of the alignments. The
|
|
7 bed name field is set to the read id and the score field is set to the edit
|
|
8 distance of the alignment. The crosslinked nucleotide is one nt upstream of the
|
|
9 5'-end of the bed entries.
|
|
10
|
|
11 This script only reports results for alignments that are properly aligned in FR
|
|
12 ("forward-reverse") direction.
|
|
13
|
|
14 By default output is written to stdout.
|
|
15
|
|
16 Input:
|
14
|
17 * alignments in SAM or BAM format (paired-end sequencing)
|
2
|
18
|
|
19 Output:
|
|
20 * bed6 file containing outer coordinates (sorted by read id)
|
|
21
|
|
22 Example usage:
|
|
23 - Extract coordinates from file input.bam and write to file output.bed
|
|
24 extract_aln_ends.py input.bam --out output.bed
|
|
25 """
|
|
26
|
|
27 epilog = """
|
|
28 Author: Daniel Maticzka
|
|
29 Copyright: 2015
|
|
30 License: Apache
|
|
31 Email: maticzkd@informatik.uni-freiburg.de
|
|
32 Status: Development
|
|
33 """
|
|
34
|
|
35 import argparse
|
|
36 import logging
|
|
37 from sys import stdout
|
|
38 from shutil import rmtree
|
|
39 from tempfile import mkdtemp
|
|
40 from pybedtools import BedTool
|
|
41 import pysam
|
|
42
|
|
43
|
|
44 class DefaultsRawDescriptionHelpFormatter(argparse.ArgumentDefaultsHelpFormatter,
|
|
45 argparse.RawDescriptionHelpFormatter):
|
|
46 # To join the behaviour of RawDescriptionHelpFormatter with that of ArgumentDefaultsHelpFormatter
|
|
47 pass
|
|
48
|
|
49 # avoid ugly python IOError when stdout output is piped into another program
|
|
50 # and then truncated (such as piping to head)
|
|
51 from signal import signal, SIGPIPE, SIG_DFL
|
|
52 signal(SIGPIPE, SIG_DFL)
|
|
53
|
|
54 # parse command line arguments
|
|
55 parser = argparse.ArgumentParser(description=tool_description,
|
|
56 epilog=epilog,
|
|
57 formatter_class=DefaultsRawDescriptionHelpFormatter)
|
|
58 # positional arguments
|
|
59 parser.add_argument(
|
14
|
60 "infile",
|
|
61 help="Path to alignments in SAM or BAM format.")
|
2
|
62 # optional arguments
|
|
63 parser.add_argument(
|
|
64 "-o", "--outfile",
|
|
65 help="Write results to this file.")
|
|
66 # misc arguments
|
|
67 parser.add_argument(
|
|
68 "-v", "--verbose",
|
|
69 help="Be verbose.",
|
|
70 action="store_true")
|
|
71 parser.add_argument(
|
|
72 "-d", "--debug",
|
|
73 help="Print lots of debugging information",
|
|
74 action="store_true")
|
|
75 parser.add_argument(
|
|
76 '--version',
|
|
77 action='version',
|
14
|
78 version='0.2.0')
|
2
|
79
|
|
80 args = parser.parse_args()
|
|
81
|
|
82 if args.debug:
|
|
83 logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
|
|
84 elif args.verbose:
|
|
85 logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
|
|
86 else:
|
|
87 logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
|
|
88 logging.info("Parsed arguments:")
|
14
|
89 logging.info(" infile: '{}'".format(args.infile))
|
2
|
90 if args.outfile:
|
|
91 logging.info(" outfile: enabled writing to file")
|
|
92 logging.info(" outfile: '{}'".format(args.outfile))
|
|
93 logging.info("")
|
|
94
|
|
95 # convert to bam for use with pybedtools
|
|
96 try:
|
|
97 # setup temporary directory
|
|
98 tmpdir = mkdtemp()
|
|
99 logging.debug("tmpdir: " + tmpdir)
|
|
100
|
|
101 fn_sorted = tmpdir + "/sorted.bam"
|
|
102 fn_fixedmates = tmpdir + "/fixedmates.bam"
|
|
103
|
|
104 # sort by id
|
|
105 logging.debug("calling samtools sort")
|
14
|
106 pysam.sort(args.infile, "-n", "-o{}".format(fn_sorted), "-T sortprefix")
|
2
|
107
|
|
108 # fix mate information
|
|
109 # also removes secondary and unmapped reads
|
|
110 logging.debug("calling samtools fixmates")
|
|
111 pysam.fixmate("-r", fn_sorted, fn_fixedmates)
|
|
112
|
|
113 # bedtools bam2bed
|
|
114 alns = BedTool(fn_fixedmates)
|
|
115 alns_bedpe = alns.bam_to_bed(bedpe=True, mate1=True, ed=True)
|
|
116
|
|
117 # determine alignment ends and write to file
|
|
118 with (open(args.outfile, "w") if args.outfile is not None else stdout) as out:
|
|
119 for i in alns_bedpe:
|
|
120 chrom = i.fields[0]
|
|
121 fmstart = i.fields[1]
|
|
122 fmend = i.fields[2]
|
|
123 smstart = i.fields[4]
|
|
124 smend = i.fields[5]
|
|
125 readid = i.fields[6]
|
|
126 score = i.fields[7]
|
|
127 fmstrand = i.fields[8]
|
|
128 if fmstrand == "+":
|
|
129 start = fmstart
|
|
130 end = smend
|
|
131 elif fmstrand == "-":
|
|
132 start = smstart
|
|
133 end = fmend
|
|
134 else:
|
|
135 logging.warning("Skipping {}, strand information is missing: '{}'".format(readid, i))
|
|
136 continue
|
|
137 out.write("\t".join([chrom, start, end, readid, score, fmstrand]) + "\n")
|
|
138 finally:
|
|
139 logging.debug("removed tmpdir: " + tmpdir)
|
|
140 rmtree(tmpdir)
|