changeset 12:c2dd4ddbfd3f

Uploaded new xml
author boris
date Fri, 20 Apr 2012 15:26:39 -0400
parents 8a56e45f925f
children eb7156b4115e
files test-data/MDtag_filter.xml
diffstat 1 files changed, 123 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/MDtag_filter.xml	Fri Apr 20 15:26:39 2012 -0400
@@ -0,0 +1,123 @@
+<tool id="MDtag_filter" name="Filter mapped reads">
+  <description>on MD tag string</description>
+  <command interpreter="python">MDtag_filter.py $in_sam $n $m $out_sam ${create.choice} $discarded_sam</command>
+  <inputs>
+    <param format="sam" name="in_sam" type="data" label="Input SAM file"/>
+    <param name="n" type="integer" value='0' label="5' end window (n)" help="Any number of mismatches within this window will cause the read to be discarded"/>
+    <param name="m" type="integer" value='0' label="3' end window (m)" help="Any number of mismatches within this window will cause the read to be discarded"/>
+    <conditional name="create">
+      <param name="choice" type="select" label="Create additional SAM file for discarded reads?">
+	<option value="no" selected="true">No</option>
+	<option value="yes">Yes</option>
+      </param>
+      <when value="no">
+      <!-- do nothing here -->
+      </when>
+      <when value="yes">
+      <!-- do nothing here -->
+      </when>
+    </conditional>
+  </inputs>
+  <outputs>
+    <data format="sam" name="out_sam" label="MDtag_filter_(selected)_from_${in_sam.name}" metadata_source="in_sam"/>
+    <data format="sam" name="discarded_sam" label="MDtag_filter_(discarded)_from_${in_sam.name}" metadata_source="in_sam">
+      <filter> (create['choice']=='yes') </filter>
+    </data>  
+  </outputs>
+  <tests>
+    <test>
+      <param name="in_sam" value="test_for_md_filter.sam"/>
+      <param name="n" value="5"/>
+      <param name="m" value="5"/>
+      <output name="out_sam" file="test_md_filtered.sam"/>
+    </test>
+    <test>
+      <param name="in_sam" value="test_for_md_filter.sam"/>
+      <param name="n" value="5"/>
+      <param name="m" value="5"/>
+      <param name="choice" value="yes"/>
+      <output name="out_sam" file="test_md_selected.sam"/>
+      <output name="discarded_sam" file="test_md_discarded.sam"/>
+    </test>
+  </tests>
+
+  <help>
+    
+Mismatches at either end of a mapped read are most likely sequencing errors.
+This tool aims to control the variation noise due to potential sequencing errors. 
+
+-----
+
+.. class:: infomark
+
+**What it does**
+
+This tool reads the MD tag of mapped reads (see SAM format specification). The user defines the 5' and 3' windows **n** and **m** (in bp), respectively.
+The mapped read is discarded if it contains any number of mismatches within **n** bases of the read 5' end and within **m** bases of the read 3' end.
+The resulting SAM file is enriched for mapped reads that show internal variation (if any) over reads whose variation is found within the read ends.
+The user might also want to keep the discarded reads in an additional file.
+
+-----
+
+.. class:: warningmark
+
+**Note**
+
+Mapped reads without an MD tag will be removed from the output SAM file(s).
+
+-----
+
+.. class:: infomark
+
+**About formats**
+
+**SAM format** -- SAM stands for Sequence Alignment/Map format. It is a TAB-delimited text format consisting of a header section, which is optional, and an alignment section. Each alignment line has 11 mandatory fields for essential alignment information such as mapping position, and variable number of optional fields for flexible or aligner specific information.
+
+Each alignment line has 11 **mandatory** fields::
+
+    1.  QNAME - Query template NAME
+    2.  FLAG  - bitwise FLAG
+    3.  RNAME - Reference sequence NAME
+    4.  POS   - 1-based leftmost mapping POSition 
+    5.  MAPQ  - MAPping Quality
+    6.  CIGAR - CIGAR string
+    7.  RNEXT - Ref. name of the mate/next segment
+    8.  PNEXT - Position of the mate/next segment observed
+    9.  TLEN  - Template LENgth
+    10. SEQ   - segment SEQuence
+    11. QUAL  - ASCII of Phred-scaled base QUALity+33
+    
+
+All **optional** fields follow the TAG\:TYPE\:VALUE format, where TAG is a two-character string that matches [A-Za-z][A-Za-z0-9]. TYPE is a single case sensitive letter which defines the format of VALUE::
+
+    MD TAG
+    
+    MD:Z:[0-9]+(([A-Z]|\^[A-Z]+)[0-9]+)* with Z = Printable string, including space.
+    
+    String for mismatching positions. The MD field aims to achieve SNP/indel calling without looking at the reference.
+    For example, a string ‘10A5^AC6’ means from the leftmost reference base in the alignment, there are 10 matches followed by an A on the reference which is different from the aligned read base; the next 5 reference bases are matches followed by a 2bp deletion from the reference; the deleted sequence is AC; the last 6 bases are matches. The MD field ought to match the CIGAR string.
+
+-----
+
+**Example**
+
+- For the following dataset::
+
+   SRR057527.13746413	16	1	1164232	35	1I35M	*	0	0	CGAAAGTGAGGTCCTGGCTCCAATCCAATCCCCGGG	333333033333333333333333333333333333	X0:i:1	X1:i:0	OC:Z:36M	RG:Z:rnaseq	XG:i:0	NM:i:2	XM:i:2	XO:i:0	OP:i:1164231	OQ:Z:CCCCCCDCCCCBCCCCCCCCCCCCCCCCCCCCCCCC	XT:A:U
+   SRR057527.8574994	16	1	565901	23	36M	*	0	0	GAGCCTAATCTACTCCACCTCAATCACACTACTCCC	333333333333333303333333333333333333	X0:i:1	X1:i:1	XA:Z:MT,-5351,36M,2;	MD:Z:1C34	RG:Z:rnaseq	XG:i:0	NM:i:1	XM:i:1	XO:i:0	OQ:Z:CCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCCCCCC	XT:A:U
+   SRR057528.178504	0	1	566573	23	36M	*	0	0	ACTGGGCCAGCCAGGCAACCTTCTAGGTAACGACCA	233333323222222232333222222222222222	X0:i:1	X1:i:1	XA:Z:MT,+6023,36M,1;	MD:Z:36	RG:Z:rnaseq	XG:i:0	NM:i:0	XM:i:0	XO:i:0	OQ:Z::?CCCCBAB@AA@@A@B@??BA@A;AA@======:@	XT:A:U
+   SRR057527.20391474	0	1	565512	23	36M	*	0	0	GGCAGTTGAGGGGGATTAAACCAAACCCAACTACGC	%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%	X0:i:1	X1:i:1	XA:Z:MT,+4962,36M,2;	MD:Z:11T24	RG:Z:rnaseq	XG:i:0	NM:i:1	XM:i:1	XO:i:0	OQ:Z:%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%	XT:A:U
+   SRR057513.2261668	16	1	16267	15	36M	*	0	0	CACTTCTGGATGCTAGGGTTACACTGGGAGTCACAG	333333333333333333333333333333333333	X0:i:1	X1:i:6	MD:Z:30A5	RG:Z:rnaseq	XG:i:0	NM:i:1	XM:i:1	XO:i:0	OQ:Z:IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII	XT:A:U
+
+
+
+- running this tool with **n = 5** and **m =10**,  will return::
+
+   SRR057528.178504	0	1	566573	23	36M	*	0	0	ACTGGGCCAGCCAGGCAACCTTCTAGGTAACGACCA	233333323222222232333222222222222222	X0:i:1	X1:i:1	XA:Z:MT,+6023,36M,1;	MD:Z:36	RG:Z:rnaseq	XG:i:0	NM:i:0	XM:i:0	XO:i:0	OQ:Z::?CCCCBAB@AA@@A@B@??BA@A;AA@======:@	XT:A:U
+   SRR057527.20391474	0	1	565512	23	36M	*	0	0	GGCAGTTGAGGGGGATTAAACCAAACCCAACTACGC	%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%	X0:i:1	X1:i:1	XA:Z:MT,+4962,36M,2;	MD:Z:11T24	RG:Z:rnaseq	XG:i:0	NM:i:1	XM:i:1	XO:i:0	OQ:Z:%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%	XT:A:U
+   SRR057513.2261668	16	1	16267	15	36M	*	0	0	CACTTCTGGATGCTAGGGTTACACTGGGAGTCACAG	333333333333333333333333333333333333	X0:i:1	X1:i:6	MD:Z:30A5	RG:Z:rnaseq	XG:i:0	NM:i:1	XM:i:1	XO:i:0	OQ:Z:IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII	XT:A:U
+   
+
+  </help>
+
+</tool>
\ No newline at end of file