Mercurial > repos > boris > filter_on_md
changeset 7:cd3e67ed18be
Deleted selected files
author | boris |
---|---|
date | Wed, 18 Apr 2012 11:36:25 -0400 |
parents | 4bb72b1de36f |
children | fc93a30e02e6 |
files | test-data/MDtag_filter.py |
diffstat | 1 files changed, 0 insertions(+), 86 deletions(-) [+] |
line wrap: on
line diff
--- a/test-data/MDtag_filter.py Wed Apr 18 11:35:57 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,86 +0,0 @@ -#!/usr/bin/env python -# Boris Rebolledo Jaramillo -# berebolledo@gmail.com -# file: MDtag_filter.py -# usage: python MDtag_filter.py in.sam n m out.sam - -""" -Mismatches at the start and end of a mapped read are most likely sequencing errors. -This tool aims to overcome the variation noise due to sequencing errors. -It reads the MD tag of a mapped read (See SAM format specification), and discards -mapped reads that contain variation at either end of the mapped read. -The user defines n and m. The mapped read is discarded if it contains mismatches -within n bases of the read start and m bases of the read end. - -""" - -import sys,re - -if len(sys.argv) != 5: - # stop the program and print an error message - sys.exit("ERROR! Usage: python mdtagfilter.py <in_sam> <n> <m> <out_sam>") - -#Read input SAM, and create output SAM -sam_file=list(open(sys.argv[1])) -sam_mdfiltered=open(sys.argv[4],'w+') - - -# The MD tag is generated out of the alignment regardless its strand orientation. -# It describes the alignment from the leftmost aligned base, which might be either the start -# or end of the mapped read, depending on the strand it was mapped to. -# Part of the following code was taken from the 'explain_sam_flags.py' script -# http://picard-tools.sourcearchive.com/documentation/1.25-1/explain__sam__flags_8py-source.html -# It defines the meaning of the sam flags in plain English -# so they can be used to identify reads mapped to the reverse strand: flag = 16 - -lstFlags = [ - ("read paired", 0x1), - ("read mapped in proper pair", 0x2), - ("read unmapped", 0x4), - ("mate unmapped", 0x8), - ("read reverse strand", 0x10), - ("mate reverse strand", 0x20), - ("first in pair", 0x40), - ("second in pair", 0x80), - ("not primary alignment", 0x100), - ("read fails platform/vendor quality checks", 0x200), - ("read is PCR or optical duplicate", 0x400) - ] - -for line in sam_file: - if line.split("\t")[0].startswith('@'): - sam_mdfiltered.write(line) - else: - if len(line.split("\t")) > 11: # There are 11 mandatory fields in the SAM format. The MD tag is an optional field. Thus, this will take mapped reads with optional tags only. - for element in line.split("\t")[11:]: - if re.match('MD:Z:[0-9]+(([A-Z]|\^[A-Z]+)[0-9]+)*', element): # It asks if MD tag is present among the optional fields - # Define strandness based on the flag value - # The iteration provides all the matched flags definitions according - # to the flag value - # The iteration stops when flag 16 is contained in flag - # which means that the read was mapped to the reverse strand - flag = int(line.split("\t")[1]) - for strFlagName, iMask in lstFlags: - if flag & iMask: - if iMask == 16: - strand = 'reverse' - break - else: - strand = 'forward' - else: # this is for flag = 0 which is not part of the definitions - # but it corresponds to the forward strand based on - # a BioStar discussion in - # http://biostar.stackexchange.com/questions/7397/in-sam-format-clarify-the-meaning-of-the-0-flag - strand = 'forward' - # The position of the optional fields in the SAM format is variable. Finds the location of the MD tag: - mdtag_idx = [i for i, item in enumerate(line.split("\t")[11:]) if re.match('MD:Z:[0-9]+(([A-Z]|\^[A-Z]+)[0-9]+)*', item)][0] - mdtag=line.split("\t")[mdtag_idx+11].strip().split(":")[2] - md_list = re.split(r'(\D)',mdtag) - if strand == 'forward': - if int(md_list[0]) >= int(sys.argv[2]) and int(md_list[-1]) >= int(sys.argv[3]): - sam_mdfiltered.write(line) - elif strand == 'reverse': - if int(md_list[0]) >= int(sys.argv[3]) and int(md_list[-1]) >= int(sys.argv[2]): - sam_mdfiltered.write(line) - break -sam_mdfiltered.close() \ No newline at end of file