Mercurial > repos > boris > filter_on_md

--- a/MDtag_filter.py	Wed Apr 18 11:36:55 2012 -0400
+++ b/MDtag_filter.py	Fri Apr 20 15:23:12 2012 -0400
@@ -1,37 +1,46 @@
 #!/usr/bin/env python
 # Boris Rebolledo Jaramillo
 # berebolledo@gmail.com
-# file: MDtag_filter.py
-# usage: python MDtag_filter.py in.sam n m out.sam
+
+"""Filter mapped reads on MD tag string.

-"""
-Mismatches at the start and end of a mapped read are most likely sequencing errors.
-This tool aims to overcome the variation noise due to sequencing errors.
-It reads the MD tag of a mapped read (See SAM format specification), and discards
-mapped reads that contain variation at either end of the mapped read.
-The user defines n and m. The mapped read is discarded if it contains mismatches
-within n bases of the read start and m bases of the read end.
+This tool reads the MD tag of mapped reads (see SAM format specification).
+The user defines the 5' and 3' windows n and m (in bp), respectively.
+The mapped read is discarded if it contains any number of mismatches within n
+bases of the read 5' end and within m bases of the read 3' end.
+The resulting SAM file is enriched for mapped reads that show internal variation
+(if any) over reads whose variation is found within the read ends.
+The user might also want to keep the discarded reads in an additional file.
+
+usage: %prog in.sam n m out.sam save_discarded?[yes/no] [discarded.sam]

 """

 import sys,re

-if len(sys.argv) != 5:
-  # stop the program and print an error message
-  sys.exit("ERROR! Usage: python mdtagfilter.py <in_sam> <n> <m> <out_sam>")
-
-#Read input SAM, and create output SAM
+if len(sys.argv) < 6:
+    sys.exit('ERROR! Usage: %prog <in.sam> <n> <m> <out.sam> '
+            '<save_discarded?[yes/no]> [<discarded.sam>]')
+if sys.argv[5] == 'yes' and len(sys.argv) < 7:
+    sys.exit('ERROR! Usage: %prog <in.sam> <n> <m> <out.sam> '
+            '<save_discarded?[yes/no]> [<discarded.sam>]')
+
 sam_file=list(open(sys.argv[1]))
 sam_mdfiltered=open(sys.argv[4],'w+')
+if sys.argv[5] == 'yes':
+    sam_discarded=open(sys.argv[6],'w+')


-# The MD tag is generated out of the alignment regardless its strand orientation.
-# It describes the alignment from the leftmost aligned base, which might be either the start
-# or end of the mapped read, depending on the strand it was mapped to.
-# Part of the following code was taken from the 'explain_sam_flags.py' script
-# http://picard-tools.sourcearchive.com/documentation/1.25-1/explain__sam__flags_8py-source.html
-# It defines the meaning of the sam flags in plain English
-# so they can be used to identify reads mapped to the reverse strand: flag = 16
+# The MD tag is generated out of the alignment operations, regardless the strand
+# the read was mapped to. It describes the alignment from the leftmost aligned
+# base, which might be either the 5' or 3' end of the original read.
+# Luckily, the read orientation in the alignment can be extracted from the
+# alignment flag.
+# The code used here to identify the mapped reads orientation was taken from
+# the 'explain_sam_flags.py' script of the Picard-Tools.
+# It defines the meaning of the sam flags in plain English, so they can be used
+# to identify reads mapped to the reverse strand: flag = 16
+# http://picard-tools.sourcearchive.com/documentation/1.25-1/

 lstFlags = [
     ("read paired", 0x1),
@@ -48,39 +57,46 @@
     ]

 for line in sam_file:
-	if line.split("\t")[0].startswith('@'):
-		sam_mdfiltered.write(line)
-	else:
-		if len(line.split("\t")) > 11: # There are 11 mandatory fields in the SAM format. The MD tag is an optional field. Thus, this will take mapped reads with optional tags only.
-			for element in line.split("\t")[11:]:
-				if re.match('MD:Z:[0-9]+(([A-Z]|\^[A-Z]+)[0-9]+)*', element): # It asks if MD tag is present among the optional fields
-					# Define strandness based on the flag value
-					# The iteration provides all the matched flags definitions according
-					# to the flag value
-					# The iteration stops when flag 16 is contained in flag
-					# which means that the read was mapped to the reverse strand
-					flag = int(line.split("\t")[1])
-					for strFlagName, iMask in lstFlags:
-						if flag & iMask:
-							if iMask == 16:
-								strand = 'reverse'
-								break
-							else:
-								strand = 'forward'
-						else: 	# this is for flag = 0 which is not part of the definitions
-							# but it corresponds to the forward strand based on
-							# a BioStar discussion in
-							# http://biostar.stackexchange.com/questions/7397/in-sam-format-clarify-the-meaning-of-the-0-flag
-							strand = 'forward'
-					# The position of the optional fields in the SAM format is variable. Finds the location of the MD tag:
-					mdtag_idx = [i for i, item in enumerate(line.split("\t")[11:]) if re.match('MD:Z:[0-9]+(([A-Z]|\^[A-Z]+)[0-9]+)*', item)][0]
-					mdtag=line.split("\t")[mdtag_idx+11].strip().split(":")[2]
-					md_list = re.split(r'(\D)',mdtag)
-					if strand == 'forward':
-						if int(md_list[0]) >= int(sys.argv[2]) and int(md_list[-1]) >= int(sys.argv[3]):
-							sam_mdfiltered.write(line)
-					elif strand == 'reverse':
-						if int(md_list[0]) >= int(sys.argv[3]) and int(md_list[-1]) >= int(sys.argv[2]):
-							sam_mdfiltered.write(line)
-					break
-sam_mdfiltered.close()
\ No newline at end of file
+    if line.split("\t")[0].startswith('@'):
+        if sys.argv[5] == 'yes':
+            sam_discarded.write(line)
+            sam_mdfiltered.write(line)
+        else:
+            sam_mdfiltered.write(line)
+    else:
+        if len(line.split("\t")) > 11:        # Any SAM optional fields?
+            for field in line.split("\t")[11:]:
+                if re.match('MD:Z:[0-9]+(([A-Z]|\^[A-Z]+)[0-9]+)*', field):        # is MD tag field present?
+                    flag = int(line.split("\t")[1])
+                    for strFlagName, iMask in lstFlags:
+                        if flag & iMask:
+                            if iMask == 16:
+                                strand = 'reverse'
+                                break
+                            else:
+                                strand = 'forward'
+                        else:        # flag = 0 is not part of the definitions,
+                                     # but it corresponds to forward strand mapping
+                            strand = 'forward'
+                    # The position of the optional fields in the SAM format is variable. Finds the location of the MD tag:
+                    mdtag_idx = [i for i, item in enumerate(line.split("\t")[11:]) if re.match('MD:Z:[0-9]+(([A-Z]|\^[A-Z]+)[0-9]+)*', item)][0]
+                    mdtag=line.split("\t")[mdtag_idx+11].strip().split(":")[2]
+                    md_list = re.split(r'(\D)',mdtag)
+                    if strand == 'forward':
+                        if int(md_list[0]) >= int(sys.argv[2]) and int(md_list[-1]) >= int(sys.argv[3]):
+                            sam_mdfiltered.write(line)
+                        else:
+                            if sys.argv[5] == 'yes':
+                                sam_discarded.write(line)
+                    elif strand == 'reverse':
+                        if int(md_list[0]) >= int(sys.argv[3]) and int(md_list[-1]) >= int(sys.argv[2]):
+                            sam_mdfiltered.write(line)
+                        else:
+                            if sys.argv[5] == 'yes':
+                                sam_discarded.write(line)
+                    break
+
+sam_mdfiltered.close()
+
+if sys.argv[5] == 'yes':
+    sam_discarded.close()
\ No newline at end of file