# HG changeset patch # User boris # Date 1334762449 14400 # Node ID 1f3c9fa5a997457970bac8b545adf8260584c3b7 # Parent fd91c3d62b4d5ee8b512834e1ddc903f534c5f53 Deleted selected files diff -r fd91c3d62b4d -r 1f3c9fa5a997 MDtag_filter --- a/MDtag_filter Wed Apr 18 11:19:27 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,182 +0,0 @@ -#!/usr/bin/env python -# Boris Rebolledo Jaramillo -# berebolledo@gmail.com -# file: MDtag_filter.py -# usage: python MDtag_filter.py in.sam n m out.sam - -""" -Mismatches at the start and end of a mapped read are most likely sequencing errors. -This tool aims to overcome the variation noise due to sequencing errors. -It reads the MD tag of a mapped read (See SAM format specification), and discards -mapped reads that contain variation at either end of the mapped read. -The user defines n and m. The mapped read is discarded if it contains mismatches -within n bases of the read start and m bases of the read end. - -""" - -import sys,re - -if len(sys.argv) != 5: - # stop the program and print an error message - sys.exit("ERROR! Usage: python mdtagfilter.py ") - -#Read input SAM, and create output SAM -sam_file=list(open(sys.argv[1])) -sam_mdfiltered=open(sys.argv[4],'w+') - - -# The MD tag is generated out of the alignment regardless its strand orientation. -# It describes the alignment from the leftmost aligned base, which might be either the start -# or end of the mapped read, depending on the strand it was mapped to. -# Part of the following code was taken from the 'explain_sam_flags.py' script -# http://picard-tools.sourcearchive.com/documentation/1.25-1/explain__sam__flags_8py-source.html -# It defines the meaning of the sam flags in plain English -# so they can be used to identify reads mapped to the reverse strand: flag = 16 - -lstFlags = [ - ("read paired", 0x1), - ("read mapped in proper pair", 0x2), - ("read unmapped", 0x4), - ("mate unmapped", 0x8), - ("read reverse strand", 0x10), - ("mate reverse strand", 0x20), - ("first in pair", 0x40), - ("second in pair", 0x80), - ("not primary alignment", 0x100), - ("read fails platform/vendor quality checks", 0x200), - ("read is PCR or optical duplicate", 0x400) - ] - -for line in sam_file: - if line.split("\t")[0].startswith('@'): - sam_mdfiltered.write(line) - else: - if len(line.split("\t")) > 11: # There are 11 mandatory fields in the SAM format. The MD tag is an optional field. Thus, this will take mapped reads with optional tags only. - for element in line.split("\t")[11:]: - if re.match('MD:Z:[0-9]+(([A-Z]|\^[A-Z]+)[0-9]+)*', element): # It asks if MD tag is present among the optional fields - # Define strandness based on the flag value - # The iteration provides all the matched flags definitions according - # to the flag value - # The iteration stops when flag 16 is contained in flag - # which means that the read was mapped to the reverse strand - flag = int(line.split("\t")[1]) - for strFlagName, iMask in lstFlags: - if flag & iMask: - if iMask == 16: - strand = 'reverse' - break - else: - strand = 'forward' - else: # this is for flag = 0 which is not part of the definitions - # but it corresponds to the forward strand based on - # a BioStar discussion in - # http://biostar.stackexchange.com/questions/7397/in-sam-format-clarify-the-meaning-of-the-0-flag - strand = 'forward' - # The position of the optional fields in the SAM format is variable. Finds the location of the MD tag: - mdtag_idx = [i for i, item in enumerate(line.split("\t")[11:]) if re.match('MD:Z:[0-9]+(([A-Z]|\^[A-Z]+)[0-9]+)*', item)][0] - mdtag=line.split("\t")[mdtag_idx+11].strip().split(":")[2] - md_list = re.split(r'(\D)',mdtag) - if strand == 'forward': - if int(md_list[0]) >= int(sys.argv[2]) and int(md_list[-1]) >= int(sys.argv[3]): - sam_mdfiltered.write(line) - elif strand == 'reverse': - if int(md_list[0]) >= int(sys.argv[3]) and int(md_list[-1]) >= int(sys.argv[2]): - sam_mdfiltered.write(line) - break -sam_mdfiltered.close() - on MD tag string - MDtag_filter.py $in_sam $n $m $out_sam - - - - - - - - - - - - - - - - - - -Mismatches at the start and end of a mapped read are most likely sequencing errors. -This tool aims to control the variation noise due to sequencing errors. - ------ - -.. class:: infomark - -**What it does** - -This tool reads the MD tag string of mapped reads. It discards mapped reads that contain variation at either end. -The user defines n and m. The mapped read is discarded if it contains any number of mismatches within **n** bases of the read start and within **m** bases of the read end. - ------ - -.. class:: warningmark - -**Note** - -Mapped reads without an MD tag will be removed from the output SAM file. - ------ - -.. class:: infomark - -**About formats** - -**SAM format** -- SAM stands for Sequence Alignment/Map format. It is a TAB-delimited text format consisting of a header section, which is optional, and an alignment section. Each alignment line has 11 mandatory fields for essential alignment information such as mapping position, and variable number of optional fields for flexible or aligner specific information. - -Each alignment line has 11 **mandatory** fields:: - - 1. QNAME - Query template NAME - 2. FLAG - bitwise FLAG - 3. RNAME - Reference sequence NAME - 4. POS - 1-based leftmost mapping POSition - 5. MAPQ - MAPping Quality - 6. CIGAR - CIGAR string - 7. RNEXT - Ref. name of the mate/next segment - 8. PNEXT - Position of the mate/next segment observed - 9. TLEN - Template LENgth - 10. SEQ - segment SEQuence - 11. QUAL - ASCII of Phred-scaled base QUALity+33 - - -All **optional** fields follow the TAG\:TYPE\:VALUE format, where TAG is a two-character string that matches [A-Za-z][A-Za-z0-9]. TYPE is a single case sensitive letter which defines the format of VALUE:: - - MD TAG - - MD:Z:[0-9]+(([A-Z]|\^[A-Z]+)[0-9]+)* with Z = Printable string, including space. - - String for mismatching positions. The MD field aims to achieve SNP/indel calling without looking at the reference. - For example, a string ‘10A5^AC6’ means from the leftmost reference base in the alignment, there are 10 matches followed by an A on the reference which is different from the aligned read base; the next 5 reference bases are matches followed by a 2bp deletion from the reference; the deleted sequence is AC; the last 6 bases are matches. The MD field ought to match the CIGAR string. - ------ - -**Example** - -- For the following dataset:: - - SRR057527.13746413 16 1 1164232 35 1I35M * 0 0 CGAAAGTGAGGTCCTGGCTCCAATCCAATCCCCGGG 333333033333333333333333333333333333 X0:i:1 X1:i:0 OC:Z:36M RG:Z:rnaseq XG:i:0 NM:i:2 XM:i:2 XO:i:0 OP:i:1164231 OQ:Z:CCCCCCDCCCCBCCCCCCCCCCCCCCCCCCCCCCCC XT:A:U - SRR057527.8574994 16 1 565901 23 36M * 0 0 GAGCCTAATCTACTCCACCTCAATCACACTACTCCC 333333333333333303333333333333333333 X0:i:1 X1:i:1 XA:Z:MT,-5351,36M,2; MD:Z:1C34 RG:Z:rnaseq XG:i:0 NM:i:1 XM:i:1 XO:i:0 OQ:Z:CCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCCCCCC XT:A:U - SRR057528.178504 0 1 566573 23 36M * 0 0 ACTGGGCCAGCCAGGCAACCTTCTAGGTAACGACCA 233333323222222232333222222222222222 X0:i:1 X1:i:1 XA:Z:MT,+6023,36M,1; MD:Z:36 RG:Z:rnaseq XG:i:0 NM:i:0 XM:i:0 XO:i:0 OQ:Z::?CCCCBAB@AA@@A@B@??BA@A;AA@======:@ XT:A:U - SRR057527.20391474 0 1 565512 23 36M * 0 0 GGCAGTTGAGGGGGATTAAACCAAACCCAACTACGC %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% X0:i:1 X1:i:1 XA:Z:MT,+4962,36M,2; MD:Z:11T24 RG:Z:rnaseq XG:i:0 NM:i:1 XM:i:1 XO:i:0 OQ:Z:%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% XT:A:U - SRR057513.2261668 16 1 16267 15 36M * 0 0 CACTTCTGGATGCTAGGGTTACACTGGGAGTCACAG 333333333333333333333333333333333333 X0:i:1 X1:i:6 MD:Z:30A5 RG:Z:rnaseq XG:i:0 NM:i:1 XM:i:1 XO:i:0 OQ:Z:IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII XT:A:U - - - -- running this tool with **n = 5** and **m =10**, will return:: - - SRR057528.178504 0 1 566573 23 36M * 0 0 ACTGGGCCAGCCAGGCAACCTTCTAGGTAACGACCA 233333323222222232333222222222222222 X0:i:1 X1:i:1 XA:Z:MT,+6023,36M,1; MD:Z:36 RG:Z:rnaseq XG:i:0 NM:i:0 XM:i:0 XO:i:0 OQ:Z::?CCCCBAB@AA@@A@B@??BA@A;AA@======:@ XT:A:U - SRR057527.20391474 0 1 565512 23 36M * 0 0 GGCAGTTGAGGGGGATTAAACCAAACCCAACTACGC %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% X0:i:1 X1:i:1 XA:Z:MT,+4962,36M,2; MD:Z:11T24 RG:Z:rnaseq XG:i:0 NM:i:1 XM:i:1 XO:i:0 OQ:Z:%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% XT:A:U - SRR057513.2261668 16 1 16267 15 36M * 0 0 CACTTCTGGATGCTAGGGTTACACTGGGAGTCACAG 333333333333333333333333333333333333 X0:i:1 X1:i:6 MD:Z:30A5 RG:Z:rnaseq XG:i:0 NM:i:1 XM:i:1 XO:i:0 OQ:Z:IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII XT:A:U - - - - - \ No newline at end of file