2
|
1 #!/usr/bin/env python
|
|
2
|
50
|
3 import argparse
|
|
4 import logging
|
|
5 from sys import stdout
|
|
6 from shutil import rmtree
|
|
7 from tempfile import mkdtemp
|
|
8 from pybedtools import BedTool
|
|
9 import pysam
|
|
10 # avoid ugly python IOError when stdout output is piped into another program
|
|
11 # and then truncated (such as piping to head)
|
|
12 from signal import signal, SIGPIPE, SIG_DFL
|
|
13 signal(SIGPIPE, SIG_DFL)
|
|
14
|
2
|
15 tool_description = """
|
|
16 Extract alignment ends from sam file.
|
|
17
|
|
18 The resulting bed file contains the outer coordinates of the alignments. The
|
|
19 bed name field is set to the read id and the score field is set to the edit
|
|
20 distance of the alignment. The crosslinked nucleotide is one nt upstream of the
|
|
21 5'-end of the bed entries.
|
|
22
|
|
23 This script only reports results for alignments that are properly aligned in FR
|
|
24 ("forward-reverse") direction.
|
|
25
|
|
26 By default output is written to stdout.
|
|
27
|
|
28 Input:
|
14
|
29 * alignments in SAM or BAM format (paired-end sequencing)
|
2
|
30
|
|
31 Output:
|
|
32 * bed6 file containing outer coordinates (sorted by read id)
|
|
33
|
|
34 Example usage:
|
|
35 - Extract coordinates from file input.bam and write to file output.bed
|
|
36 extract_aln_ends.py input.bam --out output.bed
|
|
37 """
|
|
38
|
|
39 epilog = """
|
|
40 Author: Daniel Maticzka
|
|
41 Copyright: 2015
|
|
42 License: Apache
|
|
43 Email: maticzkd@informatik.uni-freiburg.de
|
|
44 Status: Development
|
|
45 """
|
|
46
|
|
47
|
|
48 class DefaultsRawDescriptionHelpFormatter(argparse.ArgumentDefaultsHelpFormatter,
|
|
49 argparse.RawDescriptionHelpFormatter):
|
|
50 # To join the behaviour of RawDescriptionHelpFormatter with that of ArgumentDefaultsHelpFormatter
|
|
51 pass
|
|
52
|
|
53 # parse command line arguments
|
|
54 parser = argparse.ArgumentParser(description=tool_description,
|
|
55 epilog=epilog,
|
|
56 formatter_class=DefaultsRawDescriptionHelpFormatter)
|
|
57 # positional arguments
|
|
58 parser.add_argument(
|
14
|
59 "infile",
|
|
60 help="Path to alignments in SAM or BAM format.")
|
2
|
61 # optional arguments
|
|
62 parser.add_argument(
|
|
63 "-o", "--outfile",
|
|
64 help="Write results to this file.")
|
|
65 # misc arguments
|
|
66 parser.add_argument(
|
|
67 "-v", "--verbose",
|
|
68 help="Be verbose.",
|
|
69 action="store_true")
|
|
70 parser.add_argument(
|
|
71 "-d", "--debug",
|
|
72 help="Print lots of debugging information",
|
|
73 action="store_true")
|
|
74 parser.add_argument(
|
|
75 '--version',
|
|
76 action='version',
|
14
|
77 version='0.2.0')
|
2
|
78
|
|
79 args = parser.parse_args()
|
|
80
|
|
81 if args.debug:
|
|
82 logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
|
|
83 elif args.verbose:
|
|
84 logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
|
|
85 else:
|
|
86 logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
|
|
87 logging.info("Parsed arguments:")
|
14
|
88 logging.info(" infile: '{}'".format(args.infile))
|
2
|
89 if args.outfile:
|
|
90 logging.info(" outfile: enabled writing to file")
|
|
91 logging.info(" outfile: '{}'".format(args.outfile))
|
|
92 logging.info("")
|
|
93
|
|
94 # convert to bam for use with pybedtools
|
|
95 try:
|
|
96 # setup temporary directory
|
|
97 tmpdir = mkdtemp()
|
|
98 logging.debug("tmpdir: " + tmpdir)
|
|
99
|
|
100 fn_sorted = tmpdir + "/sorted.bam"
|
|
101 fn_fixedmates = tmpdir + "/fixedmates.bam"
|
|
102
|
|
103 # sort by id
|
|
104 logging.debug("calling samtools sort")
|
14
|
105 pysam.sort(args.infile, "-n", "-o{}".format(fn_sorted), "-T sortprefix")
|
2
|
106
|
|
107 # fix mate information
|
|
108 # also removes secondary and unmapped reads
|
|
109 logging.debug("calling samtools fixmates")
|
|
110 pysam.fixmate("-r", fn_sorted, fn_fixedmates)
|
|
111
|
|
112 # bedtools bam2bed
|
|
113 alns = BedTool(fn_fixedmates)
|
|
114 alns_bedpe = alns.bam_to_bed(bedpe=True, mate1=True, ed=True)
|
|
115
|
|
116 # determine alignment ends and write to file
|
|
117 with (open(args.outfile, "w") if args.outfile is not None else stdout) as out:
|
|
118 for i in alns_bedpe:
|
|
119 chrom = i.fields[0]
|
|
120 fmstart = i.fields[1]
|
|
121 fmend = i.fields[2]
|
|
122 smstart = i.fields[4]
|
|
123 smend = i.fields[5]
|
|
124 readid = i.fields[6]
|
|
125 score = i.fields[7]
|
|
126 fmstrand = i.fields[8]
|
|
127 if fmstrand == "+":
|
|
128 start = fmstart
|
|
129 end = smend
|
|
130 elif fmstrand == "-":
|
|
131 start = smstart
|
|
132 end = fmend
|
|
133 else:
|
|
134 logging.warning("Skipping {}, strand information is missing: '{}'".format(readid, i))
|
|
135 continue
|
|
136 out.write("\t".join([chrom, start, end, readid, score, fmstrand]) + "\n")
|
|
137 finally:
|
|
138 logging.debug("removed tmpdir: " + tmpdir)
|
|
139 rmtree(tmpdir)
|