Mercurial > repos > boris > filter_on_md
changeset 13:eb7156b4115e
Uploaded new py
author | boris |
---|---|
date | Fri, 20 Apr 2012 15:26:56 -0400 |
parents | c2dd4ddbfd3f |
children | 7c06a1b047cb |
files | MDtag_filter.py |
diffstat | 1 files changed, 102 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MDtag_filter.py Fri Apr 20 15:26:56 2012 -0400 @@ -0,0 +1,102 @@ +#!/usr/bin/env python +# Boris Rebolledo Jaramillo +# berebolledo@gmail.com + +"""Filter mapped reads on MD tag string. + +This tool reads the MD tag of mapped reads (see SAM format specification). +The user defines the 5' and 3' windows n and m (in bp), respectively. +The mapped read is discarded if it contains any number of mismatches within n +bases of the read 5' end and within m bases of the read 3' end. +The resulting SAM file is enriched for mapped reads that show internal variation +(if any) over reads whose variation is found within the read ends. +The user might also want to keep the discarded reads in an additional file. + +usage: %prog in.sam n m out.sam save_discarded?[yes/no] [discarded.sam] + +""" + +import sys,re + +if len(sys.argv) < 6: + sys.exit('ERROR! Usage: %prog <in.sam> <n> <m> <out.sam> ' + '<save_discarded?[yes/no]> [<discarded.sam>]') +if sys.argv[5] == 'yes' and len(sys.argv) < 7: + sys.exit('ERROR! Usage: %prog <in.sam> <n> <m> <out.sam> ' + '<save_discarded?[yes/no]> [<discarded.sam>]') + +sam_file=list(open(sys.argv[1])) +sam_mdfiltered=open(sys.argv[4],'w+') +if sys.argv[5] == 'yes': + sam_discarded=open(sys.argv[6],'w+') + + +# The MD tag is generated out of the alignment operations, regardless the strand +# the read was mapped to. It describes the alignment from the leftmost aligned +# base, which might be either the 5' or 3' end of the original read. +# Luckily, the read orientation in the alignment can be extracted from the +# alignment flag. +# The code used here to identify the mapped reads orientation was taken from +# the 'explain_sam_flags.py' script of the Picard-Tools. +# It defines the meaning of the sam flags in plain English, so they can be used +# to identify reads mapped to the reverse strand: flag = 16 +# http://picard-tools.sourcearchive.com/documentation/1.25-1/ + +lstFlags = [ + ("read paired", 0x1), + ("read mapped in proper pair", 0x2), + ("read unmapped", 0x4), + ("mate unmapped", 0x8), + ("read reverse strand", 0x10), + ("mate reverse strand", 0x20), + ("first in pair", 0x40), + ("second in pair", 0x80), + ("not primary alignment", 0x100), + ("read fails platform/vendor quality checks", 0x200), + ("read is PCR or optical duplicate", 0x400) + ] + +for line in sam_file: + if line.split("\t")[0].startswith('@'): + if sys.argv[5] == 'yes': + sam_discarded.write(line) + sam_mdfiltered.write(line) + else: + sam_mdfiltered.write(line) + else: + if len(line.split("\t")) > 11: # Any SAM optional fields? + for field in line.split("\t")[11:]: + if re.match('MD:Z:[0-9]+(([A-Z]|\^[A-Z]+)[0-9]+)*', field): # is MD tag field present? + flag = int(line.split("\t")[1]) + for strFlagName, iMask in lstFlags: + if flag & iMask: + if iMask == 16: + strand = 'reverse' + break + else: + strand = 'forward' + else: # flag = 0 is not part of the definitions, + # but it corresponds to forward strand mapping + strand = 'forward' + # The position of the optional fields in the SAM format is variable. Finds the location of the MD tag: + mdtag_idx = [i for i, item in enumerate(line.split("\t")[11:]) if re.match('MD:Z:[0-9]+(([A-Z]|\^[A-Z]+)[0-9]+)*', item)][0] + mdtag=line.split("\t")[mdtag_idx+11].strip().split(":")[2] + md_list = re.split(r'(\D)',mdtag) + if strand == 'forward': + if int(md_list[0]) >= int(sys.argv[2]) and int(md_list[-1]) >= int(sys.argv[3]): + sam_mdfiltered.write(line) + else: + if sys.argv[5] == 'yes': + sam_discarded.write(line) + elif strand == 'reverse': + if int(md_list[0]) >= int(sys.argv[3]) and int(md_list[-1]) >= int(sys.argv[2]): + sam_mdfiltered.write(line) + else: + if sys.argv[5] == 'yes': + sam_discarded.write(line) + break + +sam_mdfiltered.close() + +if sys.argv[5] == 'yes': + sam_discarded.close() \ No newline at end of file