Mercurial > repos > artbio > lumpy_sv
diff extractSplitReads_BwaMem.py @ 0:a1225091082c draft
"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/lumpy-sv commit 6a109f553d1691e243372258ad564244586875c3"
author | artbio |
---|---|
date | Mon, 14 Oct 2019 07:11:39 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/extractSplitReads_BwaMem.py Mon Oct 14 07:11:39 2019 -0400 @@ -0,0 +1,222 @@ +#!/usr/bin/env python + +import re +import sys +from optparse import OptionParser + + +def extractSplitsFromBwaMem(inFile, numSplits, includeDups, minNonOverlap): + if inFile == "stdin": + data = sys.stdin + else: + data = open(inFile, 'r') + for line in data: + split = 0 + if line[0] == '@': + print(line.strip()) + continue + samList = line.strip().split('\t') + sam = SAM(samList) + if includeDups == 0 and (1024 & sam.flag) == 1024: + continue + for el in sam.tags: + if "SA:" in el: + if(len(el.split(";"))) <= numSplits: + split = 1 + mate = el.split(",") + mateCigar = mate[3] + mateFlag = int(0) + if mate[2] == "-": + mateFlag = int(16) + if split: + read1 = sam.flag & 64 + if read1 == 64: + tag = "_1" + else: + tag = "_2" + samList[0] = sam.query + tag + readCigar = sam.cigar + readCigarOps = extractCigarOps(readCigar, sam.flag) + readQueryPos = calcQueryPosFromCigar(readCigarOps) + mateCigarOps = extractCigarOps(mateCigar, mateFlag) + mateQueryPos = calcQueryPosFromCigar(mateCigarOps) + overlap = calcQueryOverlap(readQueryPos.qsPos, readQueryPos.qePos, + mateQueryPos.qsPos, mateQueryPos.qePos) + nonOverlap1 = 1 + readQueryPos.qePos - readQueryPos.qsPos - overlap + nonOverlap2 = 1 + mateQueryPos.qePos - mateQueryPos.qsPos - overlap + mno = min(nonOverlap1, nonOverlap2) + if mno >= minNonOverlap: + print("\t".join(samList)) + +# ----------------------------------------------------------------------- +# functions +# ----------------------------------------------------------------------- + + +class SAM (object): + """ + __very__ basic class for SAM input. + """ + def __init__(self, samList=[]): + if len(samList) > 0: + self.query = samList[0] + self.flag = int(samList[1]) + self.ref = samList[2] + self.pos = int(samList[3]) + self.mapq = int(samList[4]) + self.cigar = samList[5] + self.matRef = samList[6] + self.matePos = int(samList[7]) + self.iSize = int(samList[8]) + self.seq = samList[9] + self.qual = samList[10] + self.tags = samList[11:] + # tags is a list of each tag:vtype:value sets + self.valid = 1 + else: + self.valid = 0 + self.query = 'null' + + def extractTagValue(self, tagID): + for tag in self.tags: + tagParts = tag.split(':', 2) + if (tagParts[0] == tagID): + if (tagParts[1] == 'i'): + return int(tagParts[2]) + elif (tagParts[1] == 'H'): + return int(tagParts[2], 16) + return tagParts[2] + return None + + +cigarPattern = '([0-9]+[MIDNSHP])' +cigarSearch = re.compile(cigarPattern) +atomicCigarPattern = '([0-9]+)([MIDNSHP])' +atomicCigarSearch = re.compile(atomicCigarPattern) + + +def extractCigarOps(cigar, flag): + if (cigar == "*"): + cigarOps = [] + elif (flag & 0x0010): + cigarOpStrings = cigarSearch.findall(cigar) + cigarOps = [] + for opString in cigarOpStrings: + cigarOpList = atomicCigarSearch.findall(opString) +# print cigarOpList + # "struct" for the op and it's length + cigar = cigarOp(cigarOpList[0][0], cigarOpList[0][1]) + # add to the list of cigarOps + cigarOps.append(cigar) + cigarOps = cigarOps + cigarOps.reverse() + # do in reverse order because negative strand + else: + cigarOpStrings = cigarSearch.findall(cigar) + cigarOps = [] + for opString in cigarOpStrings: + cigarOpList = atomicCigarSearch.findall(opString) + # "struct" for the op and it's length + cigar = cigarOp(cigarOpList[0][0], cigarOpList[0][1]) + # add to the list of cigarOps + cigarOps.append(cigar) +# cigarOps = cigarOps + return(cigarOps) + + +def calcQueryPosFromCigar(cigarOps): + qsPos = 0 + qePos = 0 + qLen = 0 + # if first op is a H, need to shift start position + # the opPosition counter sees if the for loop is looking + # at the first index of the cigar object + opPosition = 0 + for cigar in cigarOps: + if opPosition == 0 and (cigar.op == 'H' or cigar.op == 'S'): + qsPos += cigar.length + qePos += cigar.length + qLen += cigar.length + elif opPosition > 0 and (cigar.op == 'H' or cigar.op == 'S'): + qLen += cigar.length + elif cigar.op == 'M' or cigar.op == 'I': + qePos += cigar.length + qLen += cigar.length + opPosition += 1 + d = queryPos(qsPos, qePos, qLen) + return d + + +class cigarOp (object): + """ + sturct to store a discrete CIGAR operations + """ + def __init__(self, opLength, op): + self.length = int(opLength) + self.op = op + + +class queryPos (object): + """ + struct to store the start and end positions of query CIGAR operations + """ + def __init__(self, qsPos, qePos, qLen): + self.qsPos = int(qsPos) + self.qePos = int(qePos) + self.qLen = int(qLen) + + +def calcQueryOverlap(s1, e1, s2, e2): + o = 1 + min(e1, e2) - max(s1, s2) + return max(0, o) + +############################################### + + +class Usage(Exception): + def __init__(self, msg): + self.msg = msg + + +def main(): + usage = """%prog -i <file> + +extractSplitReads_BwaMem v0.1.0 +Author: Ira Hall +Description: Get split-read alignments from bwa-mem in lumpy compatible +format. Ignores reads marked as duplicates. +Works on read or position sorted SAM input. Tested on bwa mem v0.7.5a-r405. + """ + parser = OptionParser(usage) + + parser.add_option("-i", "--inFile", dest="inFile", + help="A SAM file or standard input (-i stdin).", + metavar="FILE") + parser.add_option("-n", "--numSplits", dest="numSplits", default=2, + type="int", + help='''The maximum number of split-read mappings to + allow per read. Reads with more are excluded. + Default=2''', metavar="INT") + parser.add_option("-d", "--includeDups", dest="includeDups", + action="store_true", default=0, + help='''Include alignments marked as duplicates. + Default=False''') + parser.add_option("-m", "--minNonOverlap", dest="minNonOverlap", + default=20, type="int", help='''minimum non-overlap between + split alignments on the query (default=20)''', + metavar="INT") + (opts, args) = parser.parse_args() + if opts.inFile is None: + parser.print_help() + print + else: + try: + extractSplitsFromBwaMem(opts.inFile, opts.numSplits, + opts.includeDups, opts.minNonOverlap) + except IOError as err: + sys.stderr.write("IOError " + str(err) + "\n") + return + + +if __name__ == "__main__": + sys.exit(main())