annotate picard_CollectAlignmentSummaryMetrics.xml @ 151:fe8432b20247 draft

merged
author Rayan Chikhi <chikhi@psu.edu>
date Mon, 21 Jul 2014 16:32:08 -0400
parents 767bcb0a06df
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
99
e18ad84f747a Uploaded
devteam
parents:
diff changeset
1 <tool name="SAM/BAM Alignment Summary Metrics" id="PicardASMetrics" version="1.106.0">
e18ad84f747a Uploaded
devteam
parents:
diff changeset
2 <description>writes a file containing summary alignment metrics</description>
e18ad84f747a Uploaded
devteam
parents:
diff changeset
3 <requirements><requirement type="package" version="1.106.0">picard</requirement></requirements>
e18ad84f747a Uploaded
devteam
parents:
diff changeset
4 <command interpreter="python">
e18ad84f747a Uploaded
devteam
parents:
diff changeset
5 picard_wrapper.py -i "$input_file" -d "$html_file.files_path" -t "$html_file"
e18ad84f747a Uploaded
devteam
parents:
diff changeset
6 --assumesorted "$sorted" -b "$bisulphite" --adaptors "$adaptors" --maxinsert "$maxinsert" -n "$out_prefix" --datatype "$input_file.ext"
e18ad84f747a Uploaded
devteam
parents:
diff changeset
7 -j "\$JAVA_JAR_PATH/CollectAlignmentSummaryMetrics.jar" --tmpdir "${__new_file_path__}"
e18ad84f747a Uploaded
devteam
parents:
diff changeset
8 #if $genomeSource.refGenomeSource == "history":
e18ad84f747a Uploaded
devteam
parents:
diff changeset
9 --ref-file "$genomeSource.ownFile"
e18ad84f747a Uploaded
devteam
parents:
diff changeset
10 #else
e18ad84f747a Uploaded
devteam
parents:
diff changeset
11 --ref "${genomeSource.index.fields.path}"
e18ad84f747a Uploaded
devteam
parents:
diff changeset
12 #end if
e18ad84f747a Uploaded
devteam
parents:
diff changeset
13 </command>
e18ad84f747a Uploaded
devteam
parents:
diff changeset
14 <inputs>
e18ad84f747a Uploaded
devteam
parents:
diff changeset
15 <param format="sam,bam" name="input_file" type="data" label="SAM/BAM dataset to generate statistics for"
e18ad84f747a Uploaded
devteam
parents:
diff changeset
16 help="If empty, upload or import a SAM/BAM dataset."/>
e18ad84f747a Uploaded
devteam
parents:
diff changeset
17 <param name="out_prefix" value="Picard Alignment Summary Metrics" type="text"
e18ad84f747a Uploaded
devteam
parents:
diff changeset
18 label="Title for the output file" help="Use this to remind you what the job was for." size="80" />
e18ad84f747a Uploaded
devteam
parents:
diff changeset
19
e18ad84f747a Uploaded
devteam
parents:
diff changeset
20 <conditional name="genomeSource">
e18ad84f747a Uploaded
devteam
parents:
diff changeset
21
e18ad84f747a Uploaded
devteam
parents:
diff changeset
22 <param name="refGenomeSource" type="select" label="Select Reference Genome">
e18ad84f747a Uploaded
devteam
parents:
diff changeset
23 <option value="default" selected="true">Use the assigned data genome/build</option>
e18ad84f747a Uploaded
devteam
parents:
diff changeset
24 <option value="indexed">Select a different built-in genome</option>
e18ad84f747a Uploaded
devteam
parents:
diff changeset
25 <option value="history">Use a genome (fasta format) from my history</option>
e18ad84f747a Uploaded
devteam
parents:
diff changeset
26 </param>
e18ad84f747a Uploaded
devteam
parents:
diff changeset
27 <when value="default">
e18ad84f747a Uploaded
devteam
parents:
diff changeset
28 <param name="index" type="select" label="Check the assigned reference genome" help="Galaxy thinks that the reads in your dataset were aligned against this reference. If this is not correct, use the 'Select a different built-in genome' option of the 'Select Reference Genome' dropdown to select the appropriate reference.">
e18ad84f747a Uploaded
devteam
parents:
diff changeset
29 <options from_data_table="all_fasta">
e18ad84f747a Uploaded
devteam
parents:
diff changeset
30 <filter type="data_meta" ref="input_file" key="dbkey" column="dbkey" multiple="True" separator="," />
e18ad84f747a Uploaded
devteam
parents:
diff changeset
31 <validator type="no_options" message="No reference build available for selected input" />
e18ad84f747a Uploaded
devteam
parents:
diff changeset
32 </options>
e18ad84f747a Uploaded
devteam
parents:
diff changeset
33 </param>
e18ad84f747a Uploaded
devteam
parents:
diff changeset
34 </when>
e18ad84f747a Uploaded
devteam
parents:
diff changeset
35 <when value="indexed">
e18ad84f747a Uploaded
devteam
parents:
diff changeset
36 <param name="index" type="select" label="Select a built-in reference genome" help="This list contains genomes cached at this Galaxy instance. If your genome of interest is not present here, request it by using the 'Help' link at the top of the Galaxy interface, or use the 'Use a genome (fasta format) from my history' option of the 'Select Reference Genome' dropdown.">
e18ad84f747a Uploaded
devteam
parents:
diff changeset
37 <options from_data_table="all_fasta">
e18ad84f747a Uploaded
devteam
parents:
diff changeset
38 </options>
e18ad84f747a Uploaded
devteam
parents:
diff changeset
39 </param>
e18ad84f747a Uploaded
devteam
parents:
diff changeset
40 </when>
e18ad84f747a Uploaded
devteam
parents:
diff changeset
41 <when value="history">
e18ad84f747a Uploaded
devteam
parents:
diff changeset
42 <param name="ownFile" type="data" format="fasta" metadata_name="dbkey" label="Select a reference genome from history" help="This option works best for relatively small genomes. If you are working with large human-sized genomes, send request to Galaxy team for adding your reference to this Galaxy instance by using 'Help' link at the top of Galaxy interface."/>
e18ad84f747a Uploaded
devteam
parents:
diff changeset
43 </when>
e18ad84f747a Uploaded
devteam
parents:
diff changeset
44 </conditional>
e18ad84f747a Uploaded
devteam
parents:
diff changeset
45 <param name="sorted" type="boolean" label="Assume the input file is already sorted" checked="true" truevalue="true" falsevalue="false"/>
e18ad84f747a Uploaded
devteam
parents:
diff changeset
46 <param name="bisulphite" type="boolean" label="Input file contains Bisulphite sequenced reads" checked="false" falsevalue="false" truevalue="true" />
e18ad84f747a Uploaded
devteam
parents:
diff changeset
47 <param name="adaptors" value="" type="text" area="true" label="Adapter sequences" help="One per line if multiple" size="5x120" />
e18ad84f747a Uploaded
devteam
parents:
diff changeset
48 <param name="maxinsert" value="100000" type="integer" label="Larger paired end reads and inter-chromosomal pairs considered chimeric " size="20" />
e18ad84f747a Uploaded
devteam
parents:
diff changeset
49 </inputs>
e18ad84f747a Uploaded
devteam
parents:
diff changeset
50 <outputs>
e18ad84f747a Uploaded
devteam
parents:
diff changeset
51 <data format="html" name="html_file" label="${out_prefix}.html" />
e18ad84f747a Uploaded
devteam
parents:
diff changeset
52 </outputs>
e18ad84f747a Uploaded
devteam
parents:
diff changeset
53 <tests>
e18ad84f747a Uploaded
devteam
parents:
diff changeset
54 <test>
e18ad84f747a Uploaded
devteam
parents:
diff changeset
55 <!-- this test works OK -->
e18ad84f747a Uploaded
devteam
parents:
diff changeset
56 <param name="out_prefix" value="AsMetrics" />
e18ad84f747a Uploaded
devteam
parents:
diff changeset
57 <param name="bisulphite" value="false" />
e18ad84f747a Uploaded
devteam
parents:
diff changeset
58 <param name="sorted" value="true" />
e18ad84f747a Uploaded
devteam
parents:
diff changeset
59 <param name="adaptors" value="" />
e18ad84f747a Uploaded
devteam
parents:
diff changeset
60 <param name="maxinsert" value="100000" />
e18ad84f747a Uploaded
devteam
parents:
diff changeset
61 <param name="refGenomeSource" value="history" />
e18ad84f747a Uploaded
devteam
parents:
diff changeset
62 <param name="ownFile" value="picard_input_hg18.trimmed.fasta" />
146
767bcb0a06df fixed downsamplesam
Rayan Chikhi <chikhi@psu.edu>
parents: 99
diff changeset
63 <param name="input_file" value="picard_input_tiny.sam" dbkey="hg18" ftype="sam" />
99
e18ad84f747a Uploaded
devteam
parents:
diff changeset
64 <output name="html_file" file="picard_output_alignment_summary_metrics.html" ftype="html" lines_diff="55"/>
e18ad84f747a Uploaded
devteam
parents:
diff changeset
65 </test>
e18ad84f747a Uploaded
devteam
parents:
diff changeset
66 <test>
e18ad84f747a Uploaded
devteam
parents:
diff changeset
67 <param name="out_prefix" value="AsMetricsIndexed" />
e18ad84f747a Uploaded
devteam
parents:
diff changeset
68 <param name="bisulphite" value="false" />
e18ad84f747a Uploaded
devteam
parents:
diff changeset
69 <param name="sorted" value="true" />
e18ad84f747a Uploaded
devteam
parents:
diff changeset
70 <param name="adaptors" value="" />
e18ad84f747a Uploaded
devteam
parents:
diff changeset
71 <param name="maxinsert" value="100000" />
e18ad84f747a Uploaded
devteam
parents:
diff changeset
72 <param name="refGenomeSource" value="indexed" />
e18ad84f747a Uploaded
devteam
parents:
diff changeset
73 <param name="index" value="hg19" />
146
767bcb0a06df fixed downsamplesam
Rayan Chikhi <chikhi@psu.edu>
parents: 99
diff changeset
74 <param name="input_file" value="picard_input_sorted_pair.sam" dbkey="hg19" ftype="sam" />
99
e18ad84f747a Uploaded
devteam
parents:
diff changeset
75 <output name="html_file" file="picard_output_AsMetrics_indexed_hg18_sorted_pair.html" ftype="html" lines_diff="50"/>
e18ad84f747a Uploaded
devteam
parents:
diff changeset
76 </test>
e18ad84f747a Uploaded
devteam
parents:
diff changeset
77 </tests>
e18ad84f747a Uploaded
devteam
parents:
diff changeset
78 <help>
e18ad84f747a Uploaded
devteam
parents:
diff changeset
79
e18ad84f747a Uploaded
devteam
parents:
diff changeset
80 .. class:: infomark
e18ad84f747a Uploaded
devteam
parents:
diff changeset
81
e18ad84f747a Uploaded
devteam
parents:
diff changeset
82 **Summary**
e18ad84f747a Uploaded
devteam
parents:
diff changeset
83
e18ad84f747a Uploaded
devteam
parents:
diff changeset
84 This Galaxy tool uses Picard to report high-level measures of alignment based on a provided sam or bam file.
e18ad84f747a Uploaded
devteam
parents:
diff changeset
85
e18ad84f747a Uploaded
devteam
parents:
diff changeset
86 **Picard documentation**
e18ad84f747a Uploaded
devteam
parents:
diff changeset
87
e18ad84f747a Uploaded
devteam
parents:
diff changeset
88 This is a Galaxy wrapper for CollectAlignmentSummaryMetrics, a part of the external package Picard-tools_.
e18ad84f747a Uploaded
devteam
parents:
diff changeset
89
e18ad84f747a Uploaded
devteam
parents:
diff changeset
90 .. _Picard-tools: http://www.google.com/search?q=picard+samtools
e18ad84f747a Uploaded
devteam
parents:
diff changeset
91
e18ad84f747a Uploaded
devteam
parents:
diff changeset
92 -----
e18ad84f747a Uploaded
devteam
parents:
diff changeset
93
e18ad84f747a Uploaded
devteam
parents:
diff changeset
94 .. class:: infomark
e18ad84f747a Uploaded
devteam
parents:
diff changeset
95
e18ad84f747a Uploaded
devteam
parents:
diff changeset
96 **Syntax**
e18ad84f747a Uploaded
devteam
parents:
diff changeset
97
e18ad84f747a Uploaded
devteam
parents:
diff changeset
98 - **Input** - SAM/BAM format aligned short read data in your current history
e18ad84f747a Uploaded
devteam
parents:
diff changeset
99 - **Title** - the title to use for all output files from this job - use it for high level metadata
e18ad84f747a Uploaded
devteam
parents:
diff changeset
100 - **Reference Genome** - Galaxy (and Picard) needs to know which genomic reference was used to generate alignments within the input SAM/BAM dataset. Here you have three choices:
e18ad84f747a Uploaded
devteam
parents:
diff changeset
101
e18ad84f747a Uploaded
devteam
parents:
diff changeset
102 - *Assigned data genome/build* - a genome specified for this dataset. If your SAM/BAM dataset has an assigned reference genome it will be displayed below this dropdown. If it does not -> use one of the following two options.
e18ad84f747a Uploaded
devteam
parents:
diff changeset
103 - *Select a different built-in genome* - this option will list all reference genomes presently cached at this instance of Galaxy.
e18ad84f747a Uploaded
devteam
parents:
diff changeset
104 - *Select a reference genome from history* - alternatively you can upload your own version of the reference genome into your history and use it with this option. This is however not advisable with large human-sized genomes. If your genome is large, contact the Galaxy team using the "Help" link at the top of the interface and provide exact details on where we can download the sequences you would like to use as the reference. We will then install them as a part of locally cached genomic references.
e18ad84f747a Uploaded
devteam
parents:
diff changeset
105
e18ad84f747a Uploaded
devteam
parents:
diff changeset
106 - **Assume Sorted** - saves sorting time - but only if true!
e18ad84f747a Uploaded
devteam
parents:
diff changeset
107 - **Bisulphite data** - see Picard documentation at http://picard.sourceforge.net/command-line-overview.shtml#CollectAlignmentSummaryMetrics
e18ad84f747a Uploaded
devteam
parents:
diff changeset
108 - **Maximum acceptable insertion length** - see Picard documentation at http://picard.sourceforge.net/command-line-overview.shtml#CollectAlignmentSummaryMetrics
e18ad84f747a Uploaded
devteam
parents:
diff changeset
109
e18ad84f747a Uploaded
devteam
parents:
diff changeset
110 -----
e18ad84f747a Uploaded
devteam
parents:
diff changeset
111
e18ad84f747a Uploaded
devteam
parents:
diff changeset
112 .. class:: infomark
e18ad84f747a Uploaded
devteam
parents:
diff changeset
113
e18ad84f747a Uploaded
devteam
parents:
diff changeset
114 **Inputs, outputs, and parameters**
e18ad84f747a Uploaded
devteam
parents:
diff changeset
115
e18ad84f747a Uploaded
devteam
parents:
diff changeset
116 The Picard documentation (reformatted for Galaxy) says:
e18ad84f747a Uploaded
devteam
parents:
diff changeset
117
e18ad84f747a Uploaded
devteam
parents:
diff changeset
118 .. csv-table::
e18ad84f747a Uploaded
devteam
parents:
diff changeset
119 :header-rows: 1
e18ad84f747a Uploaded
devteam
parents:
diff changeset
120
e18ad84f747a Uploaded
devteam
parents:
diff changeset
121 Option,Description
e18ad84f747a Uploaded
devteam
parents:
diff changeset
122 "INPUT=File","SAM or BAM file Required."
e18ad84f747a Uploaded
devteam
parents:
diff changeset
123 "OUTPUT=File","File to write alignment summary metrics to Required."
e18ad84f747a Uploaded
devteam
parents:
diff changeset
124 "REFERENCE_SEQUENCE=File","Reference sequence file Required."
e18ad84f747a Uploaded
devteam
parents:
diff changeset
125 "ASSUME_SORTED=Boolean","If true (default), unsorted SAM/BAM files will be considered coordinate sorted "
e18ad84f747a Uploaded
devteam
parents:
diff changeset
126 "MAX_INSERT_SIZE=Integer","Paired end reads above this insert size will be considered chimeric along with inter-chromosomal pairs. Default value: 100000."
e18ad84f747a Uploaded
devteam
parents:
diff changeset
127 "ADAPTER_SEQUENCE=String","This option may be specified 0 or more times. "
e18ad84f747a Uploaded
devteam
parents:
diff changeset
128 "IS_BISULFITE_SEQUENCED=Boolean","Whether the SAM or BAM file consists of bisulfite sequenced reads. Default value: false. "
e18ad84f747a Uploaded
devteam
parents:
diff changeset
129 "CREATE_MD5_FILE=Boolean","Whether to create an MD5 digest for any BAM files created."
e18ad84f747a Uploaded
devteam
parents:
diff changeset
130
e18ad84f747a Uploaded
devteam
parents:
diff changeset
131 The output produced by the tool has the following columns::
e18ad84f747a Uploaded
devteam
parents:
diff changeset
132
e18ad84f747a Uploaded
devteam
parents:
diff changeset
133 1. CATEGORY: One of either UNPAIRED (for a fragment run), FIRST_OF_PAIR when metrics are for only the first read in a paired run, SECOND_OF_PAIR when the metrics are for only the second read in a paired run or PAIR when the metrics are aggregated for both first and second reads in a pair.
e18ad84f747a Uploaded
devteam
parents:
diff changeset
134 2. TOTAL_READS: The total number of reads including all PF and non-PF reads. When CATEGORY equals PAIR this value will be 2x the number of clusters.
e18ad84f747a Uploaded
devteam
parents:
diff changeset
135 3. PF_READS: The number of PF reads where PF is defined as passing Illumina's filter.
e18ad84f747a Uploaded
devteam
parents:
diff changeset
136 4. PCT_PF_READS: The percentage of reads that are PF (PF_READS / TOTAL_READS)
e18ad84f747a Uploaded
devteam
parents:
diff changeset
137 5. PF_NOISE_READS: The number of PF reads that are marked as noise reads. A noise read is one which is composed entirely of A bases and/or N bases. These reads are marked as they are usually artifactual and are of no use in downstream analysis.
e18ad84f747a Uploaded
devteam
parents:
diff changeset
138 6. PF_READS_ALIGNED: The number of PF reads that were aligned to the reference sequence. This includes reads that aligned with low quality (i.e. their alignments are ambiguous).
e18ad84f747a Uploaded
devteam
parents:
diff changeset
139 7. PCT_PF_READS_ALIGNED: The percentage of PF reads that aligned to the reference sequence. PF_READS_ALIGNED / PF_READS
e18ad84f747a Uploaded
devteam
parents:
diff changeset
140 8. PF_HQ_ALIGNED_READS: The number of PF reads that were aligned to the reference sequence with a mapping quality of Q20 or higher signifying that the aligner estimates a 1/100 (or smaller) chance that the alignment is wrong.
e18ad84f747a Uploaded
devteam
parents:
diff changeset
141 9. PF_HQ_ALIGNED_BASES: The number of bases aligned to the reference sequence in reads that were mapped at high quality. Will usually approximate PF_HQ_ALIGNED_READS * READ_LENGTH but may differ when either mixed read lengths are present or many reads are aligned with gaps.
e18ad84f747a Uploaded
devteam
parents:
diff changeset
142 10. PF_HQ_ALIGNED_Q20_BASES: The subset of PF_HQ_ALIGNED_BASES where the base call quality was Q20 or higher.
e18ad84f747a Uploaded
devteam
parents:
diff changeset
143 11. PF_HQ_MEDIAN_MISMATCHES: The median number of mismatches versus the reference sequence in reads that were aligned to the reference at high quality (i.e. PF_HQ_ALIGNED READS).
e18ad84f747a Uploaded
devteam
parents:
diff changeset
144 12. PF_HQ_ERROR_RATE: The percentage of bases that mismatch the reference in PF HQ aligned reads.
e18ad84f747a Uploaded
devteam
parents:
diff changeset
145 13. MEAN_READ_LENGTH: The mean read length of the set of reads examined. When looking at the data for a single lane with equal length reads this number is just the read length. When looking at data for merged lanes with differing read lengths this is the mean read length of all reads.
e18ad84f747a Uploaded
devteam
parents:
diff changeset
146 14. READS_ALIGNED_IN_PAIRS: The number of aligned reads whose mate pair was also aligned to the reference.
e18ad84f747a Uploaded
devteam
parents:
diff changeset
147 15. PCT_READS_ALIGNED_IN_PAIRS: The percentage of reads whose mate pair was also aligned to the reference. READS_ALIGNED_IN_PAIRS / PF_READS_ALIGNED
e18ad84f747a Uploaded
devteam
parents:
diff changeset
148 16. BAD_CYCLES: The number of instrument cycles in which 80% or more of base calls were no-calls.
e18ad84f747a Uploaded
devteam
parents:
diff changeset
149 17. STRAND_BALANCE: The number of PF reads aligned to the positive strand of the genome divided by the number of PF reads aligned to the genome.
e18ad84f747a Uploaded
devteam
parents:
diff changeset
150 18. PCT_CHIMERAS: The percentage of reads that map outside of a maximum insert size (usually 100kb) or that have the two ends mapping to different chromosomes.
e18ad84f747a Uploaded
devteam
parents:
diff changeset
151 19. PCT_ADAPTER: The percentage of PF reads that are unaligned and match to a known adapter sequence right from the start of the read.
e18ad84f747a Uploaded
devteam
parents:
diff changeset
152
e18ad84f747a Uploaded
devteam
parents:
diff changeset
153 .. class:: warningmark
e18ad84f747a Uploaded
devteam
parents:
diff changeset
154
e18ad84f747a Uploaded
devteam
parents:
diff changeset
155 **Warning on SAM/BAM quality**
e18ad84f747a Uploaded
devteam
parents:
diff changeset
156
e18ad84f747a Uploaded
devteam
parents:
diff changeset
157 Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT**
e18ad84f747a Uploaded
devteam
parents:
diff changeset
158 flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears
e18ad84f747a Uploaded
devteam
parents:
diff changeset
159 to be the only way to deal with SAM/BAM that cannot be parsed.
e18ad84f747a Uploaded
devteam
parents:
diff changeset
160
e18ad84f747a Uploaded
devteam
parents:
diff changeset
161
e18ad84f747a Uploaded
devteam
parents:
diff changeset
162 </help>
e18ad84f747a Uploaded
devteam
parents:
diff changeset
163 </tool>
e18ad84f747a Uploaded
devteam
parents:
diff changeset
164
e18ad84f747a Uploaded
devteam
parents:
diff changeset
165