comparison computeGCBias.xml @ 110:fa6ef7619bbd draft default tip

Uploaded
author bgruening
date Mon, 26 Jan 2015 13:10:16 -0500
parents
children
comparison
equal deleted inserted replaced
109:89dd3b812906 110:fa6ef7619bbd
1 <tool id="deeptools_computeGCBias" name="computeGCBias" version="@WRAPPER_VERSION@.0">
2 <description>to see whether your samples should be normalized for GC bias</description>
3 <expand macro="requirements" /> <!-- macro body is not defined in this file; presumably provided by the import below -->
4 <expand macro="stdio" /> <!-- macro body is not defined in this file; presumably provided by the import below -->
5 <macros> <!-- tokens and imports used for macro expansion in this wrapper -->
6 <token name="@BINARY@">computeGCBias</token> <!-- NOTE(review): the command section spells the binary name out literally; this token is presumably consumed by the imported macros, confirm -->
7 <import>deepTools_macros.xml</import> <!-- presumably supplies the macros and @...@ tokens that are expanded here but not defined in this file -->
8 </macros>
9 <command>
10 <![CDATA[
11 ln -s '$bamInput' local_bamInput.bam &&
12 ln -s '$bamInput.metadata.bam_index' local_bamInput.bam.bai &&
13 ## The BAM and its index are linked into the working directory under matching names (presumably so the index is found next to the BAM); the && chaining stops the job if either link fails.
14 computeGCBias
15 @THREADS@
16
17 --bamfile 'local_bamInput.bam'
18 --GCbiasFrequenciesFile '$outFileName'
19 --fragmentLength $fragmentLength
20
21 @reference_genome_source@
22 ## Effective genome size: a user-supplied number when "specific" is selected, otherwise the value of the select itself.
23 #if $effectiveGenomeSize.effectiveGenomeSize_opt == "specific":
24 --effectiveGenomeSize $effectiveGenomeSize.effectiveGenomeSize
25 #else:
26 --effectiveGenomeSize $effectiveGenomeSize.effectiveGenomeSize_opt
27 #end if
28 ## The region restriction is only passed when the field is not empty.
29 #if str($region).strip() != '':
30 --region '$region'
31 #end if
32 ## Advanced options are only emitted when the user switched them on.
33 #if $advancedOpt.showAdvancedOpt == "yes":
34
35 --sampleSize '$advancedOpt.sampleSize'
36 --regionSize '$advancedOpt.regionSize'
37 ## The optional BED datasets are passed only when one was selected; paths are quoted like the other arguments.
38 #if $advancedOpt.filterOut:
39 --filterOut '$advancedOpt.filterOut'
40 #end if
41
42 #if $advancedOpt.extraSampling:
43 --extraSampling '$advancedOpt.extraSampling'
44 #end if
45 #end if
46 ## The plot is requested only when an image format other than "none" was chosen.
47 #if str($image_format) != 'none':
48 --biasPlot '$outImageName'
49 --plotFileFormat $image_format
50 #end if
51 ]]>
52 </command>
53 <inputs>
54 <param name="bamInput" format="bam" type="data" label="BAM file"
55 help="The BAM file must be sorted."/>
56 <!-- The four expands below pull in shared parameter definitions; they are not defined in this file and presumably come from deepTools_macros.xml. -->
57 <expand macro="reference_genome_source" />
58 <expand macro="effectiveGenomeSize" />
59 <expand macro="fragmentLength" />
60 <expand macro="region_limit_operation" />
61 <!-- Advanced options: read by the command template as $advancedOpt.* and only used when showAdvancedOpt is "yes". -->
62 <conditional name="advancedOpt">
63 <param name="showAdvancedOpt" type="select" label="Show advanced options" >
64 <option value="no" selected="true">no</option>
65 <option value="yes">yes</option>
66 </param>
67 <when value="no" />
68 <when value="yes">
69 <param name="sampleSize" type="integer" value="50000000" min="1"
70 label="Number of sampling points to be considered" help="(--sampleSize)" />
71 <param name="regionSize" type="integer" value="300" min="1"
72 label="Region size"
73 help="To plot the reads per GC over a region, the size of the region is required (see below for more details of the method). By default, the bin size is set to 300 bp, which is close to the standard fragment size for many sequencing applications. However, if the depth of sequencing is low, a larger bin size will be required, otherwise many bins will not overlap with any read. (--regionSize)"/>
74 <param name="filterOut" type="data" format="bed" optional="true"
75 label="BED file containing genomic regions to be excluded from the estimation of the correction"
76 help="Such regions usually contain repetitive regions and peaks that if included will bias the correction. It is recommended to filter out known repetitive regions if multi-reads (reads that map to more than one genomic position) were excluded. In the case of ChIP-seq data, it is recommended to first use a peak caller to identify and filter out the identified peaks. (--filterOut)" />
77 <param name="extraSampling" type="data" format="bed" optional="true"
78 label="BED file containing genomic regions for which extra sampling is required because they are underrepresented in the genome"
79 help="(--extraSampling)" />
80 </when>
81 </conditional>
82 <param name="image_format" type="select"
83 label="GC bias plot"
84 help="If given, a diagnostic image summarizing the GC bias found on the sample will be created. (--plotFileFormat)">
85 <option value="none">No image</option>
86 <option value="png" selected="true">Image in png format</option>
87 <option value="pdf">Image in pdf format</option>
88 <option value="svg">Image in svg format</option>
89 <option value="eps">Image in eps format</option>
90 <option value="emf">Image in emf format</option>
91 </param>
92 </inputs>
93 <outputs>
94 <data name="outFileName" format="tabular" /> <!-- tabular dataset handed to the command as the GC bias frequencies file -->
95 <data name="outImageName" format="png" label="${tool.name} GC-bias Plot"> <!-- bias plot; declared as png unless change_format below overrides it -->
96 <filter>
97 ((
98 image_format != 'none'
99 ))
100 </filter>
101 <change_format> <!-- switches the datatype to the selected image format; png needs no entry because it is the declared default -->
102 <when input="image_format" value="pdf" format="pdf" />
103 <when input="image_format" value="svg" format="svg" />
104 <when input="image_format" value="eps" format="eps" />
105 <when input="image_format" value="emf" format="emf" />
106 </change_format>
107 </data>
108 </outputs>
109 <tests>
110 <test> <!-- single test: phiX BAM, png plot, advanced options switched on so that regionSize can be set to 1 -->
111 <param name="bamInput" value="phiX.bam" ftype="bam" />
112 <param name="image_format" value="png" />
113 <param name="showAdvancedOpt" value="yes" />
114 <param name="regionSize" value="1" />
115 <param name="fragmentLength" value="100" />
116 <param name="ref_source" value="history" /> <!-- presumably a parameter of the reference_genome_source macro; not defined in this file -->
117 <param name="input1" value="phiX.2bit" /> <!-- presumably the history reference dataset of that macro; confirm against deepTools_macros.xml -->
118 <output name="outFileName" file="computeGCBias_result1.tabular" ftype="tabular" />
119 <output name="outImageName" file="computeGCBias_result1.png" ftype="png" />
120 </test>
121 </tests>
122 <help>
123 <![CDATA[
124 **What it does**
125
126 This tool computes the GC bias using the method proposed by Benjamini and Speed (2012) Nucleic Acids Res. (see below for more explanations).
127 The output is used to plot the bias and can also be used later on to correct the bias with the tool correctGCBias.
128 There are two plots produced by the tool: a boxplot showing the absolute read numbers per genomic-GC bin and an x-y plot
129 depicting the ratio of observed/expected reads per genomic GC content bin.
130
131 -----
132
133 **Summary of the method used**
134
135 In order to estimate how many reads with what kind of GC content one should have sequenced, we first need to determine how many regions the specific
136 reference genome contains for each amount of GC content, i.e. how many regions in the genome have 50% GC (or 10% GC or 90% GC or...).
137 We then sample a large number of equally sized genome bins and count how many times we see a bin with 50% GC (or 10% GC or 90% or...). These EXPECTED values are independent of any
138 sequencing as they only depend on the respective reference genome (i.e. they will most likely vary between mouse and fruit fly due to their genomes' different GC contents).
139 The OBSERVED values are based on the reads from the sequenced sample. Instead of noting how many genomic regions there are per GC content, we now count the reads per GC content.
140 In an ideal sample without GC bias, the ratio of OBSERVED/EXPECTED values should be close to 1 regardless of the GC content. Due to PCR (over)amplifications, the majority of ChIP samples
141 show a significant bias towards reads with high GC content (>50%).
142
143 .. image:: $PATH_TO_IMAGES/QC_GCplots_input.png
144
145
146 You can find more details on the computeGCBias wiki page: https://github.com/fidelram/deepTools/wiki/QC#wiki-computeGCbias
147
148
149 **Output files**:
150
151 - Diagnostic plot
152
153 - box plot of absolute read numbers per genomic GC bin
154 - x-y plot of observed/expected read ratios per genomic GC content bin
155
156 - Data matrix
157
158 - to be used for GC correction with correctGCBias
159
160
161 -----
162
163 @REFERENCES@
164 ]]>
165 </help>
166 <expand macro="citations" />
167 </tool>