Mercurial > repos > boris > filter_on_md
changeset 9:20b654fe58e2
Uploaded
author | boris |
---|---|
date | Fri, 20 Apr 2012 15:23:12 -0400 |
parents | fc93a30e02e6 |
children | 5108a91b93fe |
files | MDtag_filter.py |
diffstat | 1 files changed, 73 insertions(+), 57 deletions(-) [+] |
line wrap: on
line diff
--- a/MDtag_filter.py Wed Apr 18 11:36:55 2012 -0400 +++ b/MDtag_filter.py Fri Apr 20 15:23:12 2012 -0400 @@ -1,37 +1,46 @@ #!/usr/bin/env python # Boris Rebolledo Jaramillo # berebolledo@gmail.com -# file: MDtag_filter.py -# usage: python MDtag_filter.py in.sam n m out.sam + +"""Filter mapped reads on MD tag string. -""" -Mismatches at the start and end of a mapped read are most likely sequencing errors. -This tool aims to overcome the variation noise due to sequencing errors. -It reads the MD tag of a mapped read (See SAM format specification), and discards -mapped reads that contain variation at either end of the mapped read. -The user defines n and m. The mapped read is discarded if it contains mismatches -within n bases of the read start and m bases of the read end. +This tool reads the MD tag of mapped reads (see SAM format specification). +The user defines the 5' and 3' windows n and m (in bp), respectively. +The mapped read is discarded if it contains any number of mismatches within n +bases of the read 5' end and within m bases of the read 3' end. +The resulting SAM file is enriched for mapped reads that show internal variation +(if any) over reads whose variation is found within the read ends. +The user might also want to keep the discarded reads in an additional file. + +usage: %prog in.sam n m out.sam save_discarded?[yes/no] [discarded.sam] """ import sys,re -if len(sys.argv) != 5: - # stop the program and print an error message - sys.exit("ERROR! Usage: python mdtagfilter.py <in_sam> <n> <m> <out_sam>") - -#Read input SAM, and create output SAM +if len(sys.argv) < 6: + sys.exit('ERROR! Usage: %prog <in.sam> <n> <m> <out.sam> ' + '<save_discarded?[yes/no]> [<discarded.sam>]') +if sys.argv[5] == 'yes' and len(sys.argv) < 7: + sys.exit('ERROR! Usage: %prog <in.sam> <n> <m> <out.sam> ' + '<save_discarded?[yes/no]> [<discarded.sam>]') + sam_file=list(open(sys.argv[1])) sam_mdfiltered=open(sys.argv[4],'w+') +if sys.argv[5] == 'yes': + sam_discarded=open(sys.argv[6],'w+') -# The MD tag is generated out of the alignment regardless its strand orientation. -# It describes the alignment from the leftmost aligned base, which might be either the start -# or end of the mapped read, depending on the strand it was mapped to. -# Part of the following code was taken from the 'explain_sam_flags.py' script -# http://picard-tools.sourcearchive.com/documentation/1.25-1/explain__sam__flags_8py-source.html -# It defines the meaning of the sam flags in plain English -# so they can be used to identify reads mapped to the reverse strand: flag = 16 +# The MD tag is generated out of the alignment operations, regardless the strand +# the read was mapped to. It describes the alignment from the leftmost aligned +# base, which might be either the 5' or 3' end of the original read. +# Luckily, the read orientation in the alignment can be extracted from the +# alignment flag. +# The code used here to identify the mapped reads orientation was taken from +# the 'explain_sam_flags.py' script of the Picard-Tools. +# It defines the meaning of the sam flags in plain English, so they can be used +# to identify reads mapped to the reverse strand: flag = 16 +# http://picard-tools.sourcearchive.com/documentation/1.25-1/ lstFlags = [ ("read paired", 0x1), @@ -48,39 +57,46 @@ ] for line in sam_file: - if line.split("\t")[0].startswith('@'): - sam_mdfiltered.write(line) - else: - if len(line.split("\t")) > 11: # There are 11 mandatory fields in the SAM format. The MD tag is an optional field. Thus, this will take mapped reads with optional tags only. - for element in line.split("\t")[11:]: - if re.match('MD:Z:[0-9]+(([A-Z]|\^[A-Z]+)[0-9]+)*', element): # It asks if MD tag is present among the optional fields - # Define strandness based on the flag value - # The iteration provides all the matched flags definitions according - # to the flag value - # The iteration stops when flag 16 is contained in flag - # which means that the read was mapped to the reverse strand - flag = int(line.split("\t")[1]) - for strFlagName, iMask in lstFlags: - if flag & iMask: - if iMask == 16: - strand = 'reverse' - break - else: - strand = 'forward' - else: # this is for flag = 0 which is not part of the definitions - # but it corresponds to the forward strand based on - # a BioStar discussion in - # http://biostar.stackexchange.com/questions/7397/in-sam-format-clarify-the-meaning-of-the-0-flag - strand = 'forward' - # The position of the optional fields in the SAM format is variable. Finds the location of the MD tag: - mdtag_idx = [i for i, item in enumerate(line.split("\t")[11:]) if re.match('MD:Z:[0-9]+(([A-Z]|\^[A-Z]+)[0-9]+)*', item)][0] - mdtag=line.split("\t")[mdtag_idx+11].strip().split(":")[2] - md_list = re.split(r'(\D)',mdtag) - if strand == 'forward': - if int(md_list[0]) >= int(sys.argv[2]) and int(md_list[-1]) >= int(sys.argv[3]): - sam_mdfiltered.write(line) - elif strand == 'reverse': - if int(md_list[0]) >= int(sys.argv[3]) and int(md_list[-1]) >= int(sys.argv[2]): - sam_mdfiltered.write(line) - break -sam_mdfiltered.close() \ No newline at end of file + if line.split("\t")[0].startswith('@'): + if sys.argv[5] == 'yes': + sam_discarded.write(line) + sam_mdfiltered.write(line) + else: + sam_mdfiltered.write(line) + else: + if len(line.split("\t")) > 11: # Any SAM optional fields? + for field in line.split("\t")[11:]: + if re.match('MD:Z:[0-9]+(([A-Z]|\^[A-Z]+)[0-9]+)*', field): # is MD tag field present? + flag = int(line.split("\t")[1]) + for strFlagName, iMask in lstFlags: + if flag & iMask: + if iMask == 16: + strand = 'reverse' + break + else: + strand = 'forward' + else: # flag = 0 is not part of the definitions, + # but it corresponds to forward strand mapping + strand = 'forward' + # The position of the optional fields in the SAM format is variable. Finds the location of the MD tag: + mdtag_idx = [i for i, item in enumerate(line.split("\t")[11:]) if re.match('MD:Z:[0-9]+(([A-Z]|\^[A-Z]+)[0-9]+)*', item)][0] + mdtag=line.split("\t")[mdtag_idx+11].strip().split(":")[2] + md_list = re.split(r'(\D)',mdtag) + if strand == 'forward': + if int(md_list[0]) >= int(sys.argv[2]) and int(md_list[-1]) >= int(sys.argv[3]): + sam_mdfiltered.write(line) + else: + if sys.argv[5] == 'yes': + sam_discarded.write(line) + elif strand == 'reverse': + if int(md_list[0]) >= int(sys.argv[3]) and int(md_list[-1]) >= int(sys.argv[2]): + sam_mdfiltered.write(line) + else: + if sys.argv[5] == 'yes': + sam_discarded.write(line) + break + +sam_mdfiltered.close() + +if sys.argv[5] == 'yes': + sam_discarded.close() \ No newline at end of file