annotate extract_aln_ends.py @ 58:bbbae1ee87e0 draft default tip

fix for flexbar with small data issue
author rnateam
date Tue, 16 Feb 2016 10:08:58 -0500
parents 0b9aab6aaebf
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
1 #!/usr/bin/env python
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
2
50
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
3 import argparse
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
4 import logging
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
5 from sys import stdout
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
6 from shutil import rmtree
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
7 from tempfile import mkdtemp
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
8 from pybedtools import BedTool
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
9 import pysam
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
10 # avoid ugly python IOError when stdout output is piped into another program
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
11 # and then truncated (such as piping to head)
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
12 from signal import signal, SIGPIPE, SIG_DFL
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
13 signal(SIGPIPE, SIG_DFL)
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
14
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
15 tool_description = """
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
16 Extract alignment ends from sam file.
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
17
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
18 The resulting bed file contains the outer coordinates of the alignments. The
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
19 bed name field is set to the read id and the score field is set to the edit
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
20 distance of the alignment. The crosslinked nucleotide is one nt upstream of the
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
21 5'-end of the bed entries.
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
22
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
23 This script only reports results for alignments that are properly aligned in FR
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
24 ("forward-reverse") direction.
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
25
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
26 By default output is written to stdout.
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
27
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
28 Input:
14
570a7de9f151 read from bam; fix header issue
rnateam
parents: 2
diff changeset
29 * alignments in SAM or BAM format (paired-end sequencing)
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
30
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
31 Output:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
32 * bed6 file containing outer coordinates (sorted by read id)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
33
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
34 Example usage:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
35 - Extract coordinates from file input.bam and write to file output.bed
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
36 extract_aln_ends.py input.bam --out output.bed
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
37 """
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
38
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
39 epilog = """
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
40 Author: Daniel Maticzka
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
41 Copyright: 2015
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
42 License: Apache
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
43 Email: maticzkd@informatik.uni-freiburg.de
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
44 Status: Development
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
45 """
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
46
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
47
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
48 class DefaultsRawDescriptionHelpFormatter(argparse.ArgumentDefaultsHelpFormatter,
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
49 argparse.RawDescriptionHelpFormatter):
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
50 # To join the behaviour of RawDescriptionHelpFormatter with that of ArgumentDefaultsHelpFormatter
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
51 pass
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
52
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
53 # parse command line arguments
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
54 parser = argparse.ArgumentParser(description=tool_description,
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
55 epilog=epilog,
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
56 formatter_class=DefaultsRawDescriptionHelpFormatter)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
57 # positional arguments
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
58 parser.add_argument(
14
570a7de9f151 read from bam; fix header issue
rnateam
parents: 2
diff changeset
59 "infile",
570a7de9f151 read from bam; fix header issue
rnateam
parents: 2
diff changeset
60 help="Path to alignments in SAM or BAM format.")
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
61 # optional arguments
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
62 parser.add_argument(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
63 "-o", "--outfile",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
64 help="Write results to this file.")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
65 # misc arguments
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
66 parser.add_argument(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
67 "-v", "--verbose",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
68 help="Be verbose.",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
69 action="store_true")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
70 parser.add_argument(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
71 "-d", "--debug",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
72 help="Print lots of debugging information",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
73 action="store_true")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
74 parser.add_argument(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
75 '--version',
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
76 action='version',
14
570a7de9f151 read from bam; fix header issue
rnateam
parents: 2
diff changeset
77 version='0.2.0')
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
78
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
79 args = parser.parse_args()
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
80
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
81 if args.debug:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
82 logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
83 elif args.verbose:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
84 logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
85 else:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
86 logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
87 logging.info("Parsed arguments:")
14
570a7de9f151 read from bam; fix header issue
rnateam
parents: 2
diff changeset
88 logging.info(" infile: '{}'".format(args.infile))
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
89 if args.outfile:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
90 logging.info(" outfile: enabled writing to file")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
91 logging.info(" outfile: '{}'".format(args.outfile))
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
92 logging.info("")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
93
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
94 # convert to bam for use with pybedtools
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
95 try:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
96 # setup temporary directory
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
97 tmpdir = mkdtemp()
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
98 logging.debug("tmpdir: " + tmpdir)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
99
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
100 fn_sorted = tmpdir + "/sorted.bam"
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
101 fn_fixedmates = tmpdir + "/fixedmates.bam"
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
102
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
103 # sort by id
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
104 logging.debug("calling samtools sort")
14
570a7de9f151 read from bam; fix header issue
rnateam
parents: 2
diff changeset
105 pysam.sort(args.infile, "-n", "-o{}".format(fn_sorted), "-T sortprefix")
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
106
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
107 # fix mate information
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
108 # also removes secondary and unmapped reads
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
109 logging.debug("calling samtools fixmates")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
110 pysam.fixmate("-r", fn_sorted, fn_fixedmates)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
111
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
112 # bedtools bam2bed
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
113 alns = BedTool(fn_fixedmates)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
114 alns_bedpe = alns.bam_to_bed(bedpe=True, mate1=True, ed=True)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
115
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
116 # determine alignment ends and write to file
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
117 with (open(args.outfile, "w") if args.outfile is not None else stdout) as out:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
118 for i in alns_bedpe:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
119 chrom = i.fields[0]
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
120 fmstart = i.fields[1]
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
121 fmend = i.fields[2]
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
122 smstart = i.fields[4]
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
123 smend = i.fields[5]
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
124 readid = i.fields[6]
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
125 score = i.fields[7]
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
126 fmstrand = i.fields[8]
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
127 if fmstrand == "+":
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
128 start = fmstart
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
129 end = smend
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
130 elif fmstrand == "-":
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
131 start = smstart
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
132 end = fmend
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
133 else:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
134 logging.warning("Skipping {}, strand information is missing: '{}'".format(readid, i))
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
135 continue
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
136 out.write("\t".join([chrom, start, end, readid, score, fmstrand]) + "\n")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
137 finally:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
138 logging.debug("removed tmpdir: " + tmpdir)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
139 rmtree(tmpdir)