Mercurial > repos > artbio > lumpy_sv
comparison extractSplitReads_BwaMem.py @ 0:a1225091082c draft
"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/lumpy-sv commit 6a109f553d1691e243372258ad564244586875c3"
author | artbio |
---|---|
date | Mon, 14 Oct 2019 07:11:39 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:a1225091082c |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import re | |
4 import sys | |
5 from optparse import OptionParser | |
6 | |
7 | |
8 def extractSplitsFromBwaMem(inFile, numSplits, includeDups, minNonOverlap): | |
9 if inFile == "stdin": | |
10 data = sys.stdin | |
11 else: | |
12 data = open(inFile, 'r') | |
13 for line in data: | |
14 split = 0 | |
15 if line[0] == '@': | |
16 print(line.strip()) | |
17 continue | |
18 samList = line.strip().split('\t') | |
19 sam = SAM(samList) | |
20 if includeDups == 0 and (1024 & sam.flag) == 1024: | |
21 continue | |
22 for el in sam.tags: | |
23 if "SA:" in el: | |
24 if(len(el.split(";"))) <= numSplits: | |
25 split = 1 | |
26 mate = el.split(",") | |
27 mateCigar = mate[3] | |
28 mateFlag = int(0) | |
29 if mate[2] == "-": | |
30 mateFlag = int(16) | |
31 if split: | |
32 read1 = sam.flag & 64 | |
33 if read1 == 64: | |
34 tag = "_1" | |
35 else: | |
36 tag = "_2" | |
37 samList[0] = sam.query + tag | |
38 readCigar = sam.cigar | |
39 readCigarOps = extractCigarOps(readCigar, sam.flag) | |
40 readQueryPos = calcQueryPosFromCigar(readCigarOps) | |
41 mateCigarOps = extractCigarOps(mateCigar, mateFlag) | |
42 mateQueryPos = calcQueryPosFromCigar(mateCigarOps) | |
43 overlap = calcQueryOverlap(readQueryPos.qsPos, readQueryPos.qePos, | |
44 mateQueryPos.qsPos, mateQueryPos.qePos) | |
45 nonOverlap1 = 1 + readQueryPos.qePos - readQueryPos.qsPos - overlap | |
46 nonOverlap2 = 1 + mateQueryPos.qePos - mateQueryPos.qsPos - overlap | |
47 mno = min(nonOverlap1, nonOverlap2) | |
48 if mno >= minNonOverlap: | |
49 print("\t".join(samList)) | |
50 | |
51 # ----------------------------------------------------------------------- | |
52 # functions | |
53 # ----------------------------------------------------------------------- | |
54 | |
55 | |
56 class SAM (object): | |
57 """ | |
58 __very__ basic class for SAM input. | |
59 """ | |
60 def __init__(self, samList=[]): | |
61 if len(samList) > 0: | |
62 self.query = samList[0] | |
63 self.flag = int(samList[1]) | |
64 self.ref = samList[2] | |
65 self.pos = int(samList[3]) | |
66 self.mapq = int(samList[4]) | |
67 self.cigar = samList[5] | |
68 self.matRef = samList[6] | |
69 self.matePos = int(samList[7]) | |
70 self.iSize = int(samList[8]) | |
71 self.seq = samList[9] | |
72 self.qual = samList[10] | |
73 self.tags = samList[11:] | |
74 # tags is a list of each tag:vtype:value sets | |
75 self.valid = 1 | |
76 else: | |
77 self.valid = 0 | |
78 self.query = 'null' | |
79 | |
80 def extractTagValue(self, tagID): | |
81 for tag in self.tags: | |
82 tagParts = tag.split(':', 2) | |
83 if (tagParts[0] == tagID): | |
84 if (tagParts[1] == 'i'): | |
85 return int(tagParts[2]) | |
86 elif (tagParts[1] == 'H'): | |
87 return int(tagParts[2], 16) | |
88 return tagParts[2] | |
89 return None | |
90 | |
91 | |
92 cigarPattern = '([0-9]+[MIDNSHP])' | |
93 cigarSearch = re.compile(cigarPattern) | |
94 atomicCigarPattern = '([0-9]+)([MIDNSHP])' | |
95 atomicCigarSearch = re.compile(atomicCigarPattern) | |
96 | |
97 | |
98 def extractCigarOps(cigar, flag): | |
99 if (cigar == "*"): | |
100 cigarOps = [] | |
101 elif (flag & 0x0010): | |
102 cigarOpStrings = cigarSearch.findall(cigar) | |
103 cigarOps = [] | |
104 for opString in cigarOpStrings: | |
105 cigarOpList = atomicCigarSearch.findall(opString) | |
106 # print cigarOpList | |
107 # "struct" for the op and it's length | |
108 cigar = cigarOp(cigarOpList[0][0], cigarOpList[0][1]) | |
109 # add to the list of cigarOps | |
110 cigarOps.append(cigar) | |
111 cigarOps = cigarOps | |
112 cigarOps.reverse() | |
113 # do in reverse order because negative strand | |
114 else: | |
115 cigarOpStrings = cigarSearch.findall(cigar) | |
116 cigarOps = [] | |
117 for opString in cigarOpStrings: | |
118 cigarOpList = atomicCigarSearch.findall(opString) | |
119 # "struct" for the op and it's length | |
120 cigar = cigarOp(cigarOpList[0][0], cigarOpList[0][1]) | |
121 # add to the list of cigarOps | |
122 cigarOps.append(cigar) | |
123 # cigarOps = cigarOps | |
124 return(cigarOps) | |
125 | |
126 | |
127 def calcQueryPosFromCigar(cigarOps): | |
128 qsPos = 0 | |
129 qePos = 0 | |
130 qLen = 0 | |
131 # if first op is a H, need to shift start position | |
132 # the opPosition counter sees if the for loop is looking | |
133 # at the first index of the cigar object | |
134 opPosition = 0 | |
135 for cigar in cigarOps: | |
136 if opPosition == 0 and (cigar.op == 'H' or cigar.op == 'S'): | |
137 qsPos += cigar.length | |
138 qePos += cigar.length | |
139 qLen += cigar.length | |
140 elif opPosition > 0 and (cigar.op == 'H' or cigar.op == 'S'): | |
141 qLen += cigar.length | |
142 elif cigar.op == 'M' or cigar.op == 'I': | |
143 qePos += cigar.length | |
144 qLen += cigar.length | |
145 opPosition += 1 | |
146 d = queryPos(qsPos, qePos, qLen) | |
147 return d | |
148 | |
149 | |
150 class cigarOp (object): | |
151 """ | |
152 sturct to store a discrete CIGAR operations | |
153 """ | |
154 def __init__(self, opLength, op): | |
155 self.length = int(opLength) | |
156 self.op = op | |
157 | |
158 | |
159 class queryPos (object): | |
160 """ | |
161 struct to store the start and end positions of query CIGAR operations | |
162 """ | |
163 def __init__(self, qsPos, qePos, qLen): | |
164 self.qsPos = int(qsPos) | |
165 self.qePos = int(qePos) | |
166 self.qLen = int(qLen) | |
167 | |
168 | |
169 def calcQueryOverlap(s1, e1, s2, e2): | |
170 o = 1 + min(e1, e2) - max(s1, s2) | |
171 return max(0, o) | |
172 | |
173 ############################################### | |
174 | |
175 | |
176 class Usage(Exception): | |
177 def __init__(self, msg): | |
178 self.msg = msg | |
179 | |
180 | |
181 def main(): | |
182 usage = """%prog -i <file> | |
183 | |
184 extractSplitReads_BwaMem v0.1.0 | |
185 Author: Ira Hall | |
186 Description: Get split-read alignments from bwa-mem in lumpy compatible | |
187 format. Ignores reads marked as duplicates. | |
188 Works on read or position sorted SAM input. Tested on bwa mem v0.7.5a-r405. | |
189 """ | |
190 parser = OptionParser(usage) | |
191 | |
192 parser.add_option("-i", "--inFile", dest="inFile", | |
193 help="A SAM file or standard input (-i stdin).", | |
194 metavar="FILE") | |
195 parser.add_option("-n", "--numSplits", dest="numSplits", default=2, | |
196 type="int", | |
197 help='''The maximum number of split-read mappings to | |
198 allow per read. Reads with more are excluded. | |
199 Default=2''', metavar="INT") | |
200 parser.add_option("-d", "--includeDups", dest="includeDups", | |
201 action="store_true", default=0, | |
202 help='''Include alignments marked as duplicates. | |
203 Default=False''') | |
204 parser.add_option("-m", "--minNonOverlap", dest="minNonOverlap", | |
205 default=20, type="int", help='''minimum non-overlap between | |
206 split alignments on the query (default=20)''', | |
207 metavar="INT") | |
208 (opts, args) = parser.parse_args() | |
209 if opts.inFile is None: | |
210 parser.print_help() | |
211 print | |
212 else: | |
213 try: | |
214 extractSplitsFromBwaMem(opts.inFile, opts.numSplits, | |
215 opts.includeDups, opts.minNonOverlap) | |
216 except IOError as err: | |
217 sys.stderr.write("IOError " + str(err) + "\n") | |
218 return | |
219 | |
220 | |
221 if __name__ == "__main__": | |
222 sys.exit(main()) |