changeset 7:cd3e67ed18be

Deleted selected files
author boris
date Wed, 18 Apr 2012 11:36:25 -0400
parents 4bb72b1de36f
children fc93a30e02e6
files test-data/MDtag_filter.py
diffstat 1 files changed, 0 insertions(+), 86 deletions(-) [+]
line wrap: on
line diff
--- a/test-data/MDtag_filter.py	Wed Apr 18 11:35:57 2012 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,86 +0,0 @@
-#!/usr/bin/env python
-# Boris Rebolledo Jaramillo
-# berebolledo@gmail.com
-# file: MDtag_filter.py
-# usage: python MDtag_filter.py in.sam n m out.sam
-
-"""
-Mismatches at the start and end of a mapped read are most likely sequencing errors.
-This tool aims to overcome the variation noise due to sequencing errors.
-It reads the MD tag of a mapped read (See SAM format specification), and discards
-mapped reads that contain variation at either end of the mapped read.
-The user defines n and m. The mapped read is discarded if it contains mismatches
-within n bases of the read start and m bases of the read end. 
-
-"""
-
-import sys,re
-
-if len(sys.argv) != 5: 
-  # stop the program and print an error message
-  sys.exit("ERROR! Usage: python mdtagfilter.py <in_sam> <n> <m> <out_sam>")
-
-#Read input SAM, and create output SAM
-sam_file=list(open(sys.argv[1]))
-sam_mdfiltered=open(sys.argv[4],'w+')
-
-
-# The MD tag is generated out of the alignment regardless its strand orientation.
-# It describes the alignment from the leftmost aligned base, which might be either the start
-# or end of the mapped read, depending on the strand it was mapped to.
-# Part of the following code was taken from the 'explain_sam_flags.py' script
-# http://picard-tools.sourcearchive.com/documentation/1.25-1/explain__sam__flags_8py-source.html
-# It defines the meaning of the sam flags in plain English
-# so they can be used to identify reads mapped to the reverse strand: flag = 16
-
-lstFlags = [
-    ("read paired", 0x1),
-    ("read mapped in proper pair", 0x2),
-    ("read unmapped", 0x4),
-    ("mate unmapped", 0x8),
-    ("read reverse strand", 0x10),
-    ("mate reverse strand", 0x20),
-    ("first in pair", 0x40),
-    ("second in pair", 0x80),
-    ("not primary alignment", 0x100),
-    ("read fails platform/vendor quality checks", 0x200),
-    ("read is PCR or optical duplicate", 0x400)
-    ]
-
-for line in sam_file:
-	if line.split("\t")[0].startswith('@'):
-		sam_mdfiltered.write(line)
-	else:
-		if len(line.split("\t")) > 11: # There are 11 mandatory fields in the SAM format. The MD tag is an optional field. Thus, this will take mapped reads with optional tags only. 
-			for element in line.split("\t")[11:]:
-				if re.match('MD:Z:[0-9]+(([A-Z]|\^[A-Z]+)[0-9]+)*', element): # It asks if MD tag is present among the optional fields
-					# Define strandness based on the flag value
-					# The iteration provides all the matched flags definitions according
-					# to the flag value
-					# The iteration stops when flag 16 is contained in flag
-					# which means that the read was mapped to the reverse strand
-					flag = int(line.split("\t")[1]) 
-					for strFlagName, iMask in lstFlags:
-						if flag & iMask:
-							if iMask == 16: 
-								strand = 'reverse'
-								break
-							else:
-								strand = 'forward'
-						else: 	# this is for flag = 0 which is not part of the definitions
-							# but it corresponds to the forward strand based on
-							# a BioStar discussion in
-							# http://biostar.stackexchange.com/questions/7397/in-sam-format-clarify-the-meaning-of-the-0-flag
-							strand = 'forward'  
-					# The position of the optional fields in the SAM format is variable. Finds the location of the MD tag:
-					mdtag_idx = [i for i, item in enumerate(line.split("\t")[11:]) if re.match('MD:Z:[0-9]+(([A-Z]|\^[A-Z]+)[0-9]+)*', item)][0]
-					mdtag=line.split("\t")[mdtag_idx+11].strip().split(":")[2]
-					md_list = re.split(r'(\D)',mdtag)
-					if strand == 'forward':
-						if int(md_list[0]) >= int(sys.argv[2]) and int(md_list[-1]) >= int(sys.argv[3]):
-							sam_mdfiltered.write(line)
-					elif strand == 'reverse':
-						if int(md_list[0]) >= int(sys.argv[3]) and int(md_list[-1]) >= int(sys.argv[2]):
-							sam_mdfiltered.write(line)
-					break
-sam_mdfiltered.close()
\ No newline at end of file