comparison extract_aln_ends.py @ 2:de4ea3aa1090 draft

Uploaded
author rnateam
date Thu, 22 Oct 2015 10:26:45 -0400
parents
children 570a7de9f151
comparison
equal deleted inserted replaced
1:ae0f58d3318f 2:de4ea3aa1090
1 #!/usr/bin/env python
2
3 tool_description = """
4 Extract alignment ends from sam file.
5
6 The resulting bed file contains the outer coordinates of the alignments. The
7 bed name field is set to the read id and the score field is set to the edit
8 distance of the alignment. The crosslinked nucleotide is one nt upstream of the
9 5'-end of the bed entries.
10
11 This script only reports results for alignments that are properly aligned in FR
12 ("forward-reverse") direction.
13
14 By default output is written to stdout.
15
16 Input:
17 * sam file containing alignments (paired-end sequencing)
18
19 Output:
20 * bed6 file containing outer coordinates (sorted by read id)
21
22 Example usage:
23 - Extract coordinates from file input.bam and write to file output.bed
24 extract_aln_ends.py input.bam --out output.bed
25 """
26
27 epilog = """
28 Author: Daniel Maticzka
29 Copyright: 2015
30 License: Apache
31 Email: maticzkd@informatik.uni-freiburg.de
32 Status: Development
33 """
34
35 import argparse
36 import logging
37 from sys import stdout
38 from shutil import rmtree
39 from tempfile import mkdtemp
40 from pybedtools import BedTool
41 import pysam
42
43
44 class DefaultsRawDescriptionHelpFormatter(argparse.ArgumentDefaultsHelpFormatter,
45 argparse.RawDescriptionHelpFormatter):
46 # To join the behaviour of RawDescriptionHelpFormatter with that of ArgumentDefaultsHelpFormatter
47 pass
48
49 # avoid ugly python IOError when stdout output is piped into another program
50 # and then truncated (such as piping to head)
51 from signal import signal, SIGPIPE, SIG_DFL
52 signal(SIGPIPE, SIG_DFL)
53
54 # parse command line arguments
55 parser = argparse.ArgumentParser(description=tool_description,
56 epilog=epilog,
57 formatter_class=DefaultsRawDescriptionHelpFormatter)
58 # positional arguments
59 parser.add_argument(
60 "sam",
61 help="Path to sam file containing alignments.")
62 # optional arguments
63 parser.add_argument(
64 "-o", "--outfile",
65 help="Write results to this file.")
66 # misc arguments
67 parser.add_argument(
68 "-v", "--verbose",
69 help="Be verbose.",
70 action="store_true")
71 parser.add_argument(
72 "-d", "--debug",
73 help="Print lots of debugging information",
74 action="store_true")
75 parser.add_argument(
76 '--version',
77 action='version',
78 version='0.1.0')
79
80 args = parser.parse_args()
81
82 if args.debug:
83 logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
84 elif args.verbose:
85 logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
86 else:
87 logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
88 logging.info("Parsed arguments:")
89 logging.info(" sam: '{}'".format(args.sam))
90 if args.outfile:
91 logging.info(" outfile: enabled writing to file")
92 logging.info(" outfile: '{}'".format(args.outfile))
93 logging.info("")
94
95 # convert to bam for use with pybedtools
96 try:
97 # setup temporary directory
98 tmpdir = mkdtemp()
99 logging.debug("tmpdir: " + tmpdir)
100
101 fn_sorted = tmpdir + "/sorted.bam"
102 fn_fixedmates = tmpdir + "/fixedmates.bam"
103
104 # sort by id
105 logging.debug("calling samtools sort")
106 pysam.sort(args.sam, "-n", "-o{}".format(fn_sorted), "-T sortprefix")
107
108 # fix mate information
109 # also removes secondary and unmapped reads
110 logging.debug("calling samtools fixmates")
111 pysam.fixmate("-r", fn_sorted, fn_fixedmates)
112
113 # bedtools bam2bed
114 alns = BedTool(fn_fixedmates)
115 alns_bedpe = alns.bam_to_bed(bedpe=True, mate1=True, ed=True)
116
117 # determine alignment ends and write to file
118 with (open(args.outfile, "w") if args.outfile is not None else stdout) as out:
119 for i in alns_bedpe:
120 chrom = i.fields[0]
121 fmstart = i.fields[1]
122 fmend = i.fields[2]
123 smstart = i.fields[4]
124 smend = i.fields[5]
125 readid = i.fields[6]
126 score = i.fields[7]
127 fmstrand = i.fields[8]
128 if fmstrand == "+":
129 start = fmstart
130 end = smend
131 elif fmstrand == "-":
132 start = smstart
133 end = fmend
134 else:
135 logging.warning("Skipping {}, strand information is missing: '{}'".format(readid, i))
136 continue
137 out.write("\t".join([chrom, start, end, readid, score, fmstrand]) + "\n")
138 finally:
139 logging.debug("removed tmpdir: " + tmpdir)
140 rmtree(tmpdir)