annotate picard_MarkDuplicates.xml @ 117:5fe20cda6a51 draft

Uploaded
author devteam
date Tue, 25 Feb 2014 18:54:45 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
117
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
1 <tool name="Mark Duplicate reads" id="rgPicardMarkDups" version="1.106.0">
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
2 <description>locates duplicate molecules</description>
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
3 <command interpreter="python">
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
4 picard_wrapper.py -i "${input_file}" -n "${out_prefix}" --tmpdir "${__new_file_path__}" -o "${out_file}"
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
5 --remdups "${remDups}" --assumesorted "${assumeSorted}" --readregex "${readRegex}" --optdupdist "${optDupeDist}"
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
6 -j "\$JAVA_JAR_PATH/MarkDuplicates.jar" -d "${html_file.files_path}" -t "${html_file}" -e "${input_file.ext}"
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
7 </command>
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
8 <requirements><requirement type="package" version="1.106.0">picard</requirement></requirements>
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
9 <inputs>
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
10 <param format="bam,sam" name="input_file" type="data" label="SAM/BAM dataset to mark duplicates in"
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
11 help="If empty, upload or import a SAM/BAM dataset."/>
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
12 <param name="out_prefix" value="Dupes Marked" type="text"
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
13 label="Title for the output file" help="Use this to remind you what the job was for" size="80" />
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
14 <param name="remDups" value="false" type="boolean" label="Remove duplicates from output file"
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
15 truevalue="true" falsevalue="false" checked="yes"
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
16 help="If true do not write duplicates to the output file instead of writing them with appropriate flags set." />
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
17 <param name="assumeSorted" value="true" type="boolean" label="Assume reads are already ordered"
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
18 truevalue="true" falsevalue="false" checked="yes"
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
19 help="If true assume input data are already sorted (most Galaxy SAM/BAM should be)." />
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
20 <param name="readRegex" value="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*" type="text" size="80"
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
21 label="Regular expression that can be used to parse read names in the incoming SAM file"
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
22 help="Names are parsed to extract: tile/region, x coordinate and y coordinate, to estimate optical duplication rate" >
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
23 <sanitizer>
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
24 <valid initial="string.printable">
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
25 <remove value="&apos;"/>
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
26 </valid>
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
27 <mapping initial="none">
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
28 <add source="&apos;" target="__sq__"/>
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
29 </mapping>
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
30 </sanitizer>
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
31 </param>
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
32 <param name="optDupeDist" value="100" type="integer"
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
33 label="The maximum offset between two duplicate clusters in order to consider them optical duplicates." size="5"
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
34 help="e.g. 5-10 pixels. Later Illumina software versions multiply pixel values by 10, in which case 50-100." >
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
35 <validator type="in_range" message="Minimum optical dupe distance must be positive" min="0" />
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
36 </param>
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
37
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
38 </inputs>
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
39 <outputs>
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
40 <data format="bam" name="out_file" label="MarkDups_${out_prefix}.bam"/>
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
41 <data format="html" name="html_file" label="MarkDups_${out_prefix}.html"/>
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
42 </outputs>
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
43 <tests>
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
44 <test>
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
45 <param name="input_file" value="picard_input_tiny_coord.bam" ftype="bam" />
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
46 <param name="out_prefix" value="Dupes Marked" />
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
47 <param name="remDups" value="false" />
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
48 <param name="assumeSorted" value="true" />
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
49 <param name="readRegex" value="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*" />
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
50 <param name="optDupeDist" value="100" />
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
51 <output name="out_file" file="picard_output_markdups_sortedpairsam.bam" ftype="bam" compare="diff" />
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
52 <output name="html_file" file="picard_output_markdups_sortedpairsam.html" ftype="html" lines_diff="75" />
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
53 </test>
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
54 <test>
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
55 <param name="input_file" value="picard_input_tiny_coord.sam" ftype="sam" />
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
56 <param name="out_prefix" value="Dupes Marked" />
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
57 <param name="remDups" value="true" />
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
58 <param name="assumeSorted" value="true" />
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
59 <param name="readRegex" value="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*" />
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
60 <param name="optDupeDist" value="100" />
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
61 <output name="out_file" file="picard_output_markdups_remdupes.bam" ftype="bam" compare="diff" />
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
62 <output name="html_file" file="picard_output_markdups_sortedpairsam.html" ftype="html" lines_diff="75" />
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
63 </test>
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
64 </tests>
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
65
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
66 <help>
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
67
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
68 .. class:: infomark
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
69
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
70 **Purpose**
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
71
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
72 Marks all duplicate reads in a provided SAM or BAM file and either removes them or flags them.
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
73
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
74 **Picard documentation**
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
75
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
76 This is a Galaxy wrapper for MarkDuplicates, a part of the external package Picard-tools_.
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
77
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
78 .. _Picard-tools: http://www.google.com/search?q=picard+samtools
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
79
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
80 -----
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
81
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
82 .. class:: infomark
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
83
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
84 **Inputs, outputs, and parameters**
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
85
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
86 Picard documentation says (reformatted for Galaxy):
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
87
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
88 .. csv-table:: Mark Duplicates docs
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
89 :header-rows: 1
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
90
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
91 Option,Description
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
92 "INPUT=File","The input SAM or BAM file to analyze. Must be coordinate sorted. Required."
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
93 "OUTPUT=File","The output file to write marked records to. Required."
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
94 "METRICS_FILE=File","File to write duplication metrics to. Required."
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
95 "REMOVE_DUPLICATES=Boolean","If true do not write duplicates to the output file instead of writing them with appropriate flags set. Default value: false."
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
96 "ASSUME_SORTED=Boolean","If true, assume that the input file is coordinate sorted, even if the header says otherwise. Default value: false."
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
97 "MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP=Integer","This option is obsolete. ReadEnds will always be spilled to disk. Default value: 50000."
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
98 "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=Integer","Maximum number of file handles to keep open when spilling read ends to disk."
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
99 "READ_NAME_REGEX=String","Regular expression that can be used to parse read names in the incoming SAM file. Read names are parsed to extract three variables: tile/region, x coordinate and y coordinate. "
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
100 "OPTICAL_DUPLICATE_PIXEL_DISTANCE=Integer","The maximum offset between two duplicate clusters in order to consider them optical duplicates. This should usually be set to some fairly small number (e.g. 5-10 pixels) unless using later versions of the Illumina pipeline that multiply pixel values by 10, in which case 50-100 is more normal. Default value: 100."
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
101
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
102 .. class:: warningmark
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
103
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
104 **Warning on SAM/BAM quality**
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
105
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
106 Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT**
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
107 flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
108 to be the only way to deal with SAM/BAM that cannot be parsed.
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
109 .. class:: infomark
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
110
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
111 **Note on the Regular Expression**
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
112
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
113 (from the Picard docs)
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
114 This tool requires a valid regular expression to parse out the read names in the incoming SAM or BAM file. These values are used to estimate the rate of optical duplication in order to give a more accurate estimated library size. The regular expression should contain three capture groups for the three variables, in order. Default value: [a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
115
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
116 Examines aligned records in the supplied SAM or BAM file to locate duplicate molecules. All records are then written to the output file with the duplicate records flagged unless the remove duplicates option is selected. In some cases you may want to do this, but please only do this if you really understand what you are doing.
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
117
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
118 </help>
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
119 </tool>
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
120
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
121
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
122
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
123
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
124
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
125
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
126
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
127
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
128
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
129
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
130
5fe20cda6a51 Uploaded
devteam
parents:
diff changeset
131