comparison picard_CollectAlignmentSummaryMetrics.xml @ 99:e18ad84f747a draft

Uploaded
author devteam
date Mon, 24 Feb 2014 17:27:59 -0500 (2014-02-24)
parents
children 767bcb0a06df
comparison
equal deleted inserted replaced
98:a1657d0eca21 99:e18ad84f747a
1 <tool name="SAM/BAM Alignment Summary Metrics" id="PicardASMetrics" version="1.106.0">
2 <description>writes a file containing summary alignment metrics</description>
3 <requirements><requirement type="package" version="1.106.0">picard</requirement></requirements>
4 <command interpreter="python">
5 picard_wrapper.py -i "$input_file" -d "$html_file.files_path" -t "$html_file"
6 --assumesorted "$sorted" -b "$bisulphite" --adaptors "$adaptors" --maxinsert "$maxinsert" -n "$out_prefix" --datatype "$input_file.ext"
7 -j "\$JAVA_JAR_PATH/CollectAlignmentSummaryMetrics.jar" --tmpdir "${__new_file_path__}"
8 #if $genomeSource.refGenomeSource == "history":
9 --ref-file "$genomeSource.ownFile"
10 #else
11 --ref "${genomeSource.index.fields.path}"
12 #end if
13 </command> <!-- Command template above: calls picard_wrapper.py and hands it the CollectAlignmentSummaryMetrics.jar path via -j. When refGenomeSource is "history" the reference is the history dataset ownFile; for the other two choices it is the path field of the selected all_fasta entry (index). -->
14 <inputs>
15 <param format="sam,bam" name="input_file" type="data" label="SAM/BAM dataset to generate statistics for"
16 help="If empty, upload or import a SAM/BAM dataset."/>
17 <param name="out_prefix" value="Picard Alignment Summary Metrics" type="text"
18 label="Title for the output file" help="Use this to remind you what the job was for." size="80" />
19 <!-- Reference genome selection: the "default" and "indexed" branches both expose an all_fasta entry as "index"; the "history" branch exposes a FASTA dataset as "ownFile". The command template reads whichever one the chosen branch provides. -->
20 <conditional name="genomeSource">
21
22 <param name="refGenomeSource" type="select" label="Select Reference Genome">
23 <option value="default" selected="true">Use the assigned data genome/build</option>
24 <option value="indexed">Select a different built-in genome</option>
25 <option value="history">Use a genome (fasta format) from my history</option>
26 </param>
27 <when value="default">
28 <param name="index" type="select" label="Check the assigned reference genome" help="Galaxy thinks that the reads in your dataset were aligned against this reference. If this is not correct, use the 'Select a different built-in genome' option of the 'Select Reference Genome' dropdown to select the appropriate reference.">
29 <options from_data_table="all_fasta">
30 <filter type="data_meta" ref="input_file" key="dbkey" column="dbkey" multiple="True" separator="," />
31 <validator type="no_options" message="No reference build available for selected input" />
32 </options>
33 </param>
34 </when>
35 <when value="indexed">
36 <param name="index" type="select" label="Select a built-in reference genome" help="This list contains genomes cached at this Galaxy instance. If your genome of interest is not present here, request it by using the 'Help' link at the top of the Galaxy interface or use the 'Use a genome (fasta format) from my history' option of the 'Select Reference Genome' dropdown.">
37 <options from_data_table="all_fasta">
38 </options>
39 </param>
40 </when>
41 <when value="history">
42 <param name="ownFile" type="data" format="fasta" metadata_name="dbkey" label="Select a reference genome from history" help="This option works best for relatively small genomes. If you are working with large human-sized genomes, send a request to the Galaxy team to add your reference to this Galaxy instance by using the 'Help' link at the top of the Galaxy interface."/>
43 </when>
44 </conditional>
45 <param name="sorted" type="boolean" label="Assume the input file is already sorted" checked="true" truevalue="true" falsevalue="false"/>
46 <param name="bisulphite" type="boolean" label="Input file contains Bisulphite sequenced reads" checked="false" falsevalue="false" truevalue="true" />
47 <param name="adaptors" value="" type="text" area="true" label="Adapter sequences" help="One per line if multiple" size="5x120" />
48 <param name="maxinsert" value="100000" type="integer" label="Maximum insert size (larger paired end reads and inter-chromosomal pairs are considered chimeric)" size="20" />
49 </inputs>
50 <outputs>
51 <data name="html_file" format="html" label="${out_prefix}.html"/> <!-- single HTML report; its label is the out_prefix title followed by .html -->
52 </outputs>
53 <tests>
54 <test>
55 <!-- History-reference case (noted by the original author as passing): refGenomeSource is "history" and the reference FASTA is supplied through ownFile. -->
56 <param name="out_prefix" value="AsMetrics" />
57 <param name="bisulphite" value="false" />
58 <param name="sorted" value="true" />
59 <param name="adaptors" value="" />
60 <param name="maxinsert" value="100000" />
61 <param name="refGenomeSource" value="history" />
62 <param name="ownFile" value="picard_input_hg18.trimmed.fasta" />
63 <param name="input_file" value="picard_input_tiny.sam" dbkey="hg18" />
64 <output name="html_file" file="picard_output_alignment_summary_metrics.html" ftype="html" lines_diff="55"/>
65 </test>
66 <test> <!-- Built-in reference case: refGenomeSource is "indexed" and the reference is chosen through index. NOTE(review): the expected output file name mentions hg18 while the dbkey and index here are hg19; confirm this is intended. -->
67 <param name="out_prefix" value="AsMetricsIndexed" />
68 <param name="bisulphite" value="false" />
69 <param name="sorted" value="true" />
70 <param name="adaptors" value="" />
71 <param name="maxinsert" value="100000" />
72 <param name="refGenomeSource" value="indexed" />
73 <param name="index" value="hg19" />
74 <param name="input_file" value="picard_input_sorted_pair.sam" dbkey="hg19" />
75 <output name="html_file" file="picard_output_AsMetrics_indexed_hg18_sorted_pair.html" ftype="html" lines_diff="50"/>
76 </test>
77 </tests>
78 <help>
79
80 .. class:: infomark
81
82 **Summary**
83
84 This Galaxy tool uses Picard to report high-level measures of alignment based on a provided sam or bam file.
85
86 **Picard documentation**
87
88 This is a Galaxy wrapper for CollectAlignmentSummaryMetrics, a part of the external package Picard-tools_.
89
90 .. _Picard-tools: http://www.google.com/search?q=picard+samtools
91
92 -----
93
94 .. class:: infomark
95
96 **Syntax**
97
98 - **Input** - SAM/BAM format aligned short read data in your current history
99 - **Title** - the title to use for all output files from this job - use it for high level metadata
100 - **Reference Genome** - Galaxy (and Picard) needs to know which genomic reference was used to generate alignments within the input SAM/BAM dataset. Here you have three choices:
101
102 - *Assigned data genome/build* - a genome specified for this dataset. If your SAM/BAM dataset has an assigned reference genome, it will be displayed below this dropdown. If it does not, use one of the following two options.
103 - *Select a different built-in genome* - this option will list all reference genomes presently cached at this instance of Galaxy.
104 - *Select a reference genome from history* - alternatively, you can upload your own version of the reference genome into your history and use it with this option. This is, however, not advisable with large human-sized genomes. If your genome is large, contact the Galaxy team using the "Help" link at the top of the interface and provide exact details on where we can download the sequences you would like to use as the reference. We will then install them as a part of the locally cached genomic references.
105
106 - **Assume Sorted** - saves sorting time - but only if true!
107 - **Bisulphite data** - see Picard documentation http://picard.sourceforge.net/command-line-overview.shtml#CollectAlignmentSummaryMetrics
108 - **Maximum acceptable insertion length** - see Picard documentation at http://picard.sourceforge.net/command-line-overview.shtml#CollectAlignmentSummaryMetrics
109
110 -----
111
112 .. class:: infomark
113
114 **Inputs, outputs, and parameters**
115
116 The Picard documentation (reformatted for Galaxy) says:
117
118 .. csv-table::
119 :header-rows: 1
120
121 Option,Description
122 "INPUT=File","SAM or BAM file Required."
123 "OUTPUT=File","File to write insert size metrics to Required."
124 "REFERENCE_SEQUENCE=File","Reference sequence file Required."
125 "ASSUME_SORTED=Boolean","If true (default), unsorted SAM/BAM files will be considered coordinate sorted "
126 "MAX_INSERT_SIZE=Integer","Paired end reads above this insert size will be considered chimeric along with inter-chromosomal pairs. Default value: 100000."
127 "ADAPTER_SEQUENCE=String","This option may be specified 0 or more times. "
128 "IS_BISULFITE_SEQUENCED=Boolean","Whether the SAM or BAM file consists of bisulfite sequenced reads. Default value: false. "
129 "CREATE_MD5_FILE=Boolean","Whether to create an MD5 digest for any BAM files created."
130
131 The output produced by the tool has the following columns::
132
133 1. CATEGORY: One of either UNPAIRED (for a fragment run), FIRST_OF_PAIR when metrics are for only the first read in a paired run, SECOND_OF_PAIR when the metrics are for only the second read in a paired run or PAIR when the metrics are aggregated for both first and second reads in a pair.
134 2. TOTAL_READS: The total number of reads including all PF and non-PF reads. When CATEGORY equals PAIR this value will be 2x the number of clusters.
135 3. PF_READS: The number of PF reads where PF is defined as passing Illumina's filter.
136 4. PCT_PF_READS: The percentage of reads that are PF (PF_READS / TOTAL_READS)
137 5. PF_NOISE_READS: The number of PF reads that are marked as noise reads. A noise read is one which is composed entirely of A bases and/or N bases. These reads are marked as they are usually artifactual and are of no use in downstream analysis.
138 6. PF_READS_ALIGNED: The number of PF reads that were aligned to the reference sequence. This includes reads that aligned with low quality (i.e. their alignments are ambiguous).
139 7. PCT_PF_READS_ALIGNED: The percentage of PF reads that aligned to the reference sequence. PF_READS_ALIGNED / PF_READS
140 8. PF_HQ_ALIGNED_READS: The number of PF reads that were aligned to the reference sequence with a mapping quality of Q20 or higher signifying that the aligner estimates a 1/100 (or smaller) chance that the alignment is wrong.
141 9. PF_HQ_ALIGNED_BASES: The number of bases aligned to the reference sequence in reads that were mapped at high quality. Will usually approximate PF_HQ_ALIGNED_READS * READ_LENGTH but may differ when either mixed read lengths are present or many reads are aligned with gaps.
142 10. PF_HQ_ALIGNED_Q20_BASES: The subset of PF_HQ_ALIGNED_BASES where the base call quality was Q20 or higher.
143 11. PF_HQ_MEDIAN_MISMATCHES: The median number of mismatches versus the reference sequence in reads that were aligned to the reference at high quality (i.e. PF_HQ_ALIGNED_READS).
144 12. PF_HQ_ERROR_RATE: The percentage of bases that mismatch the reference in PF HQ aligned reads.
145 13. MEAN_READ_LENGTH: The mean read length of the set of reads examined. When looking at the data for a single lane with equal length reads this number is just the read length. When looking at data for merged lanes with differing read lengths this is the mean read length of all reads.
146 14. READS_ALIGNED_IN_PAIRS: The number of aligned reads whose mate pair was also aligned to the reference.
147 15. PCT_READS_ALIGNED_IN_PAIRS: The percentage of reads whose mate pair was also aligned to the reference. READS_ALIGNED_IN_PAIRS / PF_READS_ALIGNED
148 16. BAD_CYCLES: The number of instrument cycles in which 80% or more of base calls were no-calls.
149 17. STRAND_BALANCE: The number of PF reads aligned to the positive strand of the genome divided by the number of PF reads aligned to the genome.
150 18. PCT_CHIMERAS: The percentage of reads that map outside of a maximum insert size (usually 100kb) or that have the two ends mapping to different chromosomes.
151 19. PCT_ADAPTER: The percentage of PF reads that are unaligned and match to a known adapter sequence right from the start of the read.
152
153 .. class:: warningmark
154
155 **Warning on SAM/BAM quality**
156
157 Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT**
158 flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears
159 to be the only way to deal with SAM/BAM that cannot be parsed.
160
161
162 </help>
163 </tool>
164
165