Mercurial > repos > boris > filter_on_md
changeset 8:fc93a30e02e6
Uploaded py script
author | boris |
---|---|
date | Wed, 18 Apr 2012 11:36:55 -0400 |
parents | cd3e67ed18be |
children | 20b654fe58e2 |
files | MDtag_filter.py |
diffstat | 1 files changed, 86 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MDtag_filter.py Wed Apr 18 11:36:55 2012 -0400 @@ -0,0 +1,86 @@ +#!/usr/bin/env python +# Boris Rebolledo Jaramillo +# berebolledo@gmail.com +# file: MDtag_filter.py +# usage: python MDtag_filter.py in.sam n m out.sam + +""" +Mismatches at the start and end of a mapped read are most likely sequencing errors. +This tool aims to overcome the variation noise due to sequencing errors. +It reads the MD tag of a mapped read (See SAM format specification), and discards +mapped reads that contain variation at either end of the mapped read. +The user defines n and m. The mapped read is discarded if it contains mismatches +within n bases of the read start and m bases of the read end. + +""" + +import sys,re + +if len(sys.argv) != 5: + # stop the program and print an error message + sys.exit("ERROR! Usage: python mdtagfilter.py <in_sam> <n> <m> <out_sam>") + +#Read input SAM, and create output SAM +sam_file=list(open(sys.argv[1])) +sam_mdfiltered=open(sys.argv[4],'w+') + + +# The MD tag is generated out of the alignment regardless its strand orientation. +# It describes the alignment from the leftmost aligned base, which might be either the start +# or end of the mapped read, depending on the strand it was mapped to. +# Part of the following code was taken from the 'explain_sam_flags.py' script +# http://picard-tools.sourcearchive.com/documentation/1.25-1/explain__sam__flags_8py-source.html +# It defines the meaning of the sam flags in plain English +# so they can be used to identify reads mapped to the reverse strand: flag = 16 + +lstFlags = [ + ("read paired", 0x1), + ("read mapped in proper pair", 0x2), + ("read unmapped", 0x4), + ("mate unmapped", 0x8), + ("read reverse strand", 0x10), + ("mate reverse strand", 0x20), + ("first in pair", 0x40), + ("second in pair", 0x80), + ("not primary alignment", 0x100), + ("read fails platform/vendor quality checks", 0x200), + ("read is PCR or optical duplicate", 0x400) + ] + +for line in sam_file: + if line.split("\t")[0].startswith('@'): + sam_mdfiltered.write(line) + else: + if len(line.split("\t")) > 11: # There are 11 mandatory fields in the SAM format. The MD tag is an optional field. Thus, this will take mapped reads with optional tags only. + for element in line.split("\t")[11:]: + if re.match('MD:Z:[0-9]+(([A-Z]|\^[A-Z]+)[0-9]+)*', element): # It asks if MD tag is present among the optional fields + # Define strandness based on the flag value + # The iteration provides all the matched flags definitions according + # to the flag value + # The iteration stops when flag 16 is contained in flag + # which means that the read was mapped to the reverse strand + flag = int(line.split("\t")[1]) + for strFlagName, iMask in lstFlags: + if flag & iMask: + if iMask == 16: + strand = 'reverse' + break + else: + strand = 'forward' + else: # this is for flag = 0 which is not part of the definitions + # but it corresponds to the forward strand based on + # a BioStar discussion in + # http://biostar.stackexchange.com/questions/7397/in-sam-format-clarify-the-meaning-of-the-0-flag + strand = 'forward' + # The position of the optional fields in the SAM format is variable. Finds the location of the MD tag: + mdtag_idx = [i for i, item in enumerate(line.split("\t")[11:]) if re.match('MD:Z:[0-9]+(([A-Z]|\^[A-Z]+)[0-9]+)*', item)][0] + mdtag=line.split("\t")[mdtag_idx+11].strip().split(":")[2] + md_list = re.split(r'(\D)',mdtag) + if strand == 'forward': + if int(md_list[0]) >= int(sys.argv[2]) and int(md_list[-1]) >= int(sys.argv[3]): + sam_mdfiltered.write(line) + elif strand == 'reverse': + if int(md_list[0]) >= int(sys.argv[3]) and int(md_list[-1]) >= int(sys.argv[2]): + sam_mdfiltered.write(line) + break +sam_mdfiltered.close() \ No newline at end of file