Mercurial > repos > rnateam > bctools
comparison extract_aln_ends.py @ 2:de4ea3aa1090 draft
Uploaded
author | rnateam |
---|---|
date | Thu, 22 Oct 2015 10:26:45 -0400 |
parents | |
children | 570a7de9f151 |
comparison
equal
deleted
inserted
replaced
1:ae0f58d3318f | 2:de4ea3aa1090 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 tool_description = """ | |
4 Extract alignment ends from sam file. | |
5 | |
6 The resulting bed file contains the outer coordinates of the alignments. The | |
7 bed name field is set to the read id and the score field is set to the edit | |
8 distance of the alignment. The crosslinked nucleotide is one nt upstream of the | |
9 5'-end of the bed entries. | |
10 | |
11 This script only reports results for alignments that are properly aligned in FR | |
12 ("forward-reverse") direction. | |
13 | |
14 By default output is written to stdout. | |
15 | |
16 Input: | |
17 * sam file containing alignments (paired-end sequencing) | |
18 | |
19 Output: | |
20 * bed6 file containing outer coordinates (sorted by read id) | |
21 | |
22 Example usage: | |
23 - Extract coordinates from file input.bam and write to file output.bed | |
24 extract_aln_ends.py input.bam --out output.bed | |
25 """ | |
26 | |
27 epilog = """ | |
28 Author: Daniel Maticzka | |
29 Copyright: 2015 | |
30 License: Apache | |
31 Email: maticzkd@informatik.uni-freiburg.de | |
32 Status: Development | |
33 """ | |
34 | |
35 import argparse | |
36 import logging | |
37 from sys import stdout | |
38 from shutil import rmtree | |
39 from tempfile import mkdtemp | |
40 from pybedtools import BedTool | |
41 import pysam | |
42 | |
43 | |
44 class DefaultsRawDescriptionHelpFormatter(argparse.ArgumentDefaultsHelpFormatter, | |
45 argparse.RawDescriptionHelpFormatter): | |
46 # To join the behaviour of RawDescriptionHelpFormatter with that of ArgumentDefaultsHelpFormatter | |
47 pass | |
48 | |
49 # avoid ugly python IOError when stdout output is piped into another program | |
50 # and then truncated (such as piping to head) | |
51 from signal import signal, SIGPIPE, SIG_DFL | |
52 signal(SIGPIPE, SIG_DFL) | |
53 | |
54 # parse command line arguments | |
55 parser = argparse.ArgumentParser(description=tool_description, | |
56 epilog=epilog, | |
57 formatter_class=DefaultsRawDescriptionHelpFormatter) | |
58 # positional arguments | |
59 parser.add_argument( | |
60 "sam", | |
61 help="Path to sam file containing alignments.") | |
62 # optional arguments | |
63 parser.add_argument( | |
64 "-o", "--outfile", | |
65 help="Write results to this file.") | |
66 # misc arguments | |
67 parser.add_argument( | |
68 "-v", "--verbose", | |
69 help="Be verbose.", | |
70 action="store_true") | |
71 parser.add_argument( | |
72 "-d", "--debug", | |
73 help="Print lots of debugging information", | |
74 action="store_true") | |
75 parser.add_argument( | |
76 '--version', | |
77 action='version', | |
78 version='0.1.0') | |
79 | |
80 args = parser.parse_args() | |
81 | |
82 if args.debug: | |
83 logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s") | |
84 elif args.verbose: | |
85 logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s") | |
86 else: | |
87 logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s") | |
88 logging.info("Parsed arguments:") | |
89 logging.info(" sam: '{}'".format(args.sam)) | |
90 if args.outfile: | |
91 logging.info(" outfile: enabled writing to file") | |
92 logging.info(" outfile: '{}'".format(args.outfile)) | |
93 logging.info("") | |
94 | |
95 # convert to bam for use with pybedtools | |
96 try: | |
97 # setup temporary directory | |
98 tmpdir = mkdtemp() | |
99 logging.debug("tmpdir: " + tmpdir) | |
100 | |
101 fn_sorted = tmpdir + "/sorted.bam" | |
102 fn_fixedmates = tmpdir + "/fixedmates.bam" | |
103 | |
104 # sort by id | |
105 logging.debug("calling samtools sort") | |
106 pysam.sort(args.sam, "-n", "-o{}".format(fn_sorted), "-T sortprefix") | |
107 | |
108 # fix mate information | |
109 # also removes secondary and unmapped reads | |
110 logging.debug("calling samtools fixmates") | |
111 pysam.fixmate("-r", fn_sorted, fn_fixedmates) | |
112 | |
113 # bedtools bam2bed | |
114 alns = BedTool(fn_fixedmates) | |
115 alns_bedpe = alns.bam_to_bed(bedpe=True, mate1=True, ed=True) | |
116 | |
117 # determine alignment ends and write to file | |
118 with (open(args.outfile, "w") if args.outfile is not None else stdout) as out: | |
119 for i in alns_bedpe: | |
120 chrom = i.fields[0] | |
121 fmstart = i.fields[1] | |
122 fmend = i.fields[2] | |
123 smstart = i.fields[4] | |
124 smend = i.fields[5] | |
125 readid = i.fields[6] | |
126 score = i.fields[7] | |
127 fmstrand = i.fields[8] | |
128 if fmstrand == "+": | |
129 start = fmstart | |
130 end = smend | |
131 elif fmstrand == "-": | |
132 start = smstart | |
133 end = fmend | |
134 else: | |
135 logging.warning("Skipping {}, strand information is missing: '{}'".format(readid, i)) | |
136 continue | |
137 out.write("\t".join([chrom, start, end, readid, score, fmstrand]) + "\n") | |
138 finally: | |
139 logging.debug("removed tmpdir: " + tmpdir) | |
140 rmtree(tmpdir) |