Mercurial > repos > boris > filter_on_md

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/MDtag_filter.py	Wed Apr 18 11:35:24 2012 -0400
@@ -0,0 +1,86 @@
+#!/usr/bin/env python
+# Boris Rebolledo Jaramillo
+# berebolledo@gmail.com
+# file: MDtag_filter.py
+# usage: python MDtag_filter.py in.sam n m out.sam
+
+"""
+Mismatches at the start and end of a mapped read are most likely sequencing errors.
+This tool aims to overcome the variation noise due to sequencing errors.
+It reads the MD tag of a mapped read (See SAM format specification), and discards
+mapped reads that contain variation at either end of the mapped read.
+The user defines n and m. The mapped read is discarded if it contains mismatches
+within n bases of the read start and m bases of the read end.
+
+"""
+
+import sys,re
+
+if len(sys.argv) != 5:
+  # stop the program and print an error message
+  sys.exit("ERROR! Usage: python mdtagfilter.py <in_sam> <n> <m> <out_sam>")
+
+#Read input SAM, and create output SAM
+sam_file=list(open(sys.argv[1]))
+sam_mdfiltered=open(sys.argv[4],'w+')
+
+
+# The MD tag is generated out of the alignment regardless its strand orientation.
+# It describes the alignment from the leftmost aligned base, which might be either the start
+# or end of the mapped read, depending on the strand it was mapped to.
+# Part of the following code was taken from the 'explain_sam_flags.py' script
+# http://picard-tools.sourcearchive.com/documentation/1.25-1/explain__sam__flags_8py-source.html
+# It defines the meaning of the sam flags in plain English
+# so they can be used to identify reads mapped to the reverse strand: flag = 16
+
+lstFlags = [
+    ("read paired", 0x1),
+    ("read mapped in proper pair", 0x2),
+    ("read unmapped", 0x4),
+    ("mate unmapped", 0x8),
+    ("read reverse strand", 0x10),
+    ("mate reverse strand", 0x20),
+    ("first in pair", 0x40),
+    ("second in pair", 0x80),
+    ("not primary alignment", 0x100),
+    ("read fails platform/vendor quality checks", 0x200),
+    ("read is PCR or optical duplicate", 0x400)
+    ]
+
+for line in sam_file:
+	if line.split("\t")[0].startswith('@'):
+		sam_mdfiltered.write(line)
+	else:
+		if len(line.split("\t")) > 11: # There are 11 mandatory fields in the SAM format. The MD tag is an optional field. Thus, this will take mapped reads with optional tags only.
+			for element in line.split("\t")[11:]:
+				if re.match('MD:Z:[0-9]+(([A-Z]|\^[A-Z]+)[0-9]+)*', element): # It asks if MD tag is present among the optional fields
+					# Define strandness based on the flag value
+					# The iteration provides all the matched flags definitions according
+					# to the flag value
+					# The iteration stops when flag 16 is contained in flag
+					# which means that the read was mapped to the reverse strand
+					flag = int(line.split("\t")[1])
+					for strFlagName, iMask in lstFlags:
+						if flag & iMask:
+							if iMask == 16:
+								strand = 'reverse'
+								break
+							else:
+								strand = 'forward'
+						else: 	# this is for flag = 0 which is not part of the definitions
+							# but it corresponds to the forward strand based on
+							# a BioStar discussion in
+							# http://biostar.stackexchange.com/questions/7397/in-sam-format-clarify-the-meaning-of-the-0-flag
+							strand = 'forward'
+					# The position of the optional fields in the SAM format is variable. Finds the location of the MD tag:
+					mdtag_idx = [i for i, item in enumerate(line.split("\t")[11:]) if re.match('MD:Z:[0-9]+(([A-Z]|\^[A-Z]+)[0-9]+)*', item)][0]
+					mdtag=line.split("\t")[mdtag_idx+11].strip().split(":")[2]
+					md_list = re.split(r'(\D)',mdtag)
+					if strand == 'forward':
+						if int(md_list[0]) >= int(sys.argv[2]) and int(md_list[-1]) >= int(sys.argv[3]):
+							sam_mdfiltered.write(line)
+					elif strand == 'reverse':
+						if int(md_list[0]) >= int(sys.argv[3]) and int(md_list[-1]) >= int(sys.argv[2]):
+							sam_mdfiltered.write(line)
+					break
+sam_mdfiltered.close()
\ No newline at end of file