annotate extract_aln_ends.py @ 19:885602df4554 draft

another try with bedtools dependency
author rnateam
date Mon, 30 Nov 2015 11:43:33 -0500
parents 570a7de9f151
children 0b9aab6aaebf
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
1 #!/usr/bin/env python
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
3 tool_description = """
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
4 Extract alignment ends from sam file.
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
5
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
6 The resulting bed file contains the outer coordinates of the alignments. The
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
7 bed name field is set to the read id and the score field is set to the edit
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
8 distance of the alignment. The crosslinked nucleotide is one nt upstream of the
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
9 5'-end of the bed entries.
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
10
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
11 This script only reports results for alignments that are properly aligned in FR
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
12 ("forward-reverse") direction.
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
13
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
14 By default output is written to stdout.
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
15
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
16 Input:
14
570a7de9f151 read from bam; fix header issue
rnateam
parents: 2
diff changeset
17 * alignments in SAM or BAM format (paired-end sequencing)
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
18
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
19 Output:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
20 * bed6 file containing outer coordinates (sorted by read id)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
21
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
22 Example usage:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
23 - Extract coordinates from file input.bam and write to file output.bed
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
24 extract_aln_ends.py input.bam --out output.bed
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
25 """
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
26
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
27 epilog = """
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
28 Author: Daniel Maticzka
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
29 Copyright: 2015
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
30 License: Apache
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
31 Email: maticzkd@informatik.uni-freiburg.de
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
32 Status: Development
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
33 """
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
34
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
35 import argparse
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
36 import logging
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
37 from sys import stdout
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
38 from shutil import rmtree
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
39 from tempfile import mkdtemp
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
40 from pybedtools import BedTool
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
41 import pysam
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
42
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
43
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
44 class DefaultsRawDescriptionHelpFormatter(argparse.ArgumentDefaultsHelpFormatter,
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
45 argparse.RawDescriptionHelpFormatter):
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
46 # To join the behaviour of RawDescriptionHelpFormatter with that of ArgumentDefaultsHelpFormatter
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
47 pass
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
48
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
49 # avoid ugly python IOError when stdout output is piped into another program
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
50 # and then truncated (such as piping to head)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
51 from signal import signal, SIGPIPE, SIG_DFL
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
52 signal(SIGPIPE, SIG_DFL)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
53
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
54 # parse command line arguments
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
55 parser = argparse.ArgumentParser(description=tool_description,
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
56 epilog=epilog,
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
57 formatter_class=DefaultsRawDescriptionHelpFormatter)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
58 # positional arguments
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
59 parser.add_argument(
14
570a7de9f151 read from bam; fix header issue
rnateam
parents: 2
diff changeset
60 "infile",
570a7de9f151 read from bam; fix header issue
rnateam
parents: 2
diff changeset
61 help="Path to alignments in SAM or BAM format.")
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
62 # optional arguments
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
63 parser.add_argument(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
64 "-o", "--outfile",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
65 help="Write results to this file.")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
66 # misc arguments
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
67 parser.add_argument(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
68 "-v", "--verbose",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
69 help="Be verbose.",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
70 action="store_true")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
71 parser.add_argument(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
72 "-d", "--debug",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
73 help="Print lots of debugging information",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
74 action="store_true")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
75 parser.add_argument(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
76 '--version',
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
77 action='version',
14
570a7de9f151 read from bam; fix header issue
rnateam
parents: 2
diff changeset
78 version='0.2.0')
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
79
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
80 args = parser.parse_args()
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
81
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
82 if args.debug:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
83 logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
84 elif args.verbose:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
85 logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
86 else:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
87 logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
88 logging.info("Parsed arguments:")
14
570a7de9f151 read from bam; fix header issue
rnateam
parents: 2
diff changeset
89 logging.info(" infile: '{}'".format(args.infile))
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
90 if args.outfile:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
91 logging.info(" outfile: enabled writing to file")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
92 logging.info(" outfile: '{}'".format(args.outfile))
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
93 logging.info("")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
94
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
95 # convert to bam for use with pybedtools
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
96 try:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
97 # setup temporary directory
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
98 tmpdir = mkdtemp()
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
99 logging.debug("tmpdir: " + tmpdir)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
100
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
101 fn_sorted = tmpdir + "/sorted.bam"
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
102 fn_fixedmates = tmpdir + "/fixedmates.bam"
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
103
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
104 # sort by id
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
105 logging.debug("calling samtools sort")
14
570a7de9f151 read from bam; fix header issue
rnateam
parents: 2
diff changeset
106 pysam.sort(args.infile, "-n", "-o{}".format(fn_sorted), "-T sortprefix")
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
107
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
108 # fix mate information
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
109 # also removes secondary and unmapped reads
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
110 logging.debug("calling samtools fixmates")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
111 pysam.fixmate("-r", fn_sorted, fn_fixedmates)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
112
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
113 # bedtools bam2bed
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
114 alns = BedTool(fn_fixedmates)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
115 alns_bedpe = alns.bam_to_bed(bedpe=True, mate1=True, ed=True)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
116
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
117 # determine alignment ends and write to file
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
118 with (open(args.outfile, "w") if args.outfile is not None else stdout) as out:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
119 for i in alns_bedpe:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
120 chrom = i.fields[0]
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
121 fmstart = i.fields[1]
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
122 fmend = i.fields[2]
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
123 smstart = i.fields[4]
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
124 smend = i.fields[5]
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
125 readid = i.fields[6]
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
126 score = i.fields[7]
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
127 fmstrand = i.fields[8]
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
128 if fmstrand == "+":
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
129 start = fmstart
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
130 end = smend
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
131 elif fmstrand == "-":
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
132 start = smstart
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
133 end = fmend
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
134 else:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
135 logging.warning("Skipping {}, strand information is missing: '{}'".format(readid, i))
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
136 continue
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
137 out.write("\t".join([chrom, start, end, readid, score, fmstrand]) + "\n")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
138 finally:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
139 logging.debug("removed tmpdir: " + tmpdir)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
140 rmtree(tmpdir)