6
|
1 <tool id="deeptools_computeGCBias" name="computeGCBias" version="1.0.1">
|
5
|
2 <description>to see whether your samples should be normalized for GC bias</description>
|
|
3
|
0
|
<requirements>
    <!-- deepTools package pinned to an explicit version string. -->
    <requirement type="package" version="1.5.1_df852fa1ef13251a17274ee18fbf919fbc515079">deepTools</requirement>
    <!-- Unversioned entry for the same package, kept alongside the pinned one. -->
    <requirement type="package">deepTools</requirement>
</requirements>
|
|
<stdio>
    <!-- Exit code 0 is only annotated at "warning" level; it does not fail the job. -->
    <exit_code range="0" level="warning" description="Warning" />
    <!-- Without a rule for non-zero codes a failed run would still be shown as
         successful, because this stdio block replaces Galaxy's default checks. -->
    <exit_code range="1:" level="fatal" description="The command exited with a non-zero status" />
</stdio>
|
|
<command>
    ## Symlink the BAM dataset and its index into a scratch directory as
    ## NAME.bam and NAME.bam.bai (presumably because computeGCBias expects the
    ## index next to the BAM file; confirm).
    #import tempfile
    #set $temp_dir = os.path.abspath(tempfile.mkdtemp())

    ## The NamedTemporaryFile is used only to obtain a unique name inside the
    ## scratch directory; the handle is closed again before the links are made.
    #set $temp_bam_handle = tempfile.NamedTemporaryFile( dir=$temp_dir )
    #set $temp_bam_path = $temp_bam_handle.name + '.bam'
    #silent $temp_bam_handle.close()
    #silent os.system("ln -s %s %s" % (str($bamInput), $temp_bam_path))
    #silent os.system("ln -s %s %s.bai" % (str($bamInput.metadata.bam_index), $temp_bam_path))

    computeGCBias

    ## Use the number of cores Galaxy allotted to this job; fall back to the
    ## previously hard-coded value of 4 when GALAXY_SLOTS is not set.
    --numberOfProcessors "\${GALAXY_SLOTS:-4}"

    --bamfile '$temp_bam_path'
    --species '$species'
    --GCbiasFrequenciesFile $outFileName
    --fragmentLength $fragmentLength

    ## The reference genome comes either from a 2bit dataset in the history or
    ## from the path column of the locally cached data table entry.
    #if $source.ref_source=="history":
        --genome $source.input1
    #else:
        --genome "${source.input1_2bit.fields.path}"
    #end if

    #if $advancedOpt.showAdvancedOpt == "yes":
        ## Only pass a region when the text box was actually filled in.
        #if str($advancedOpt.region.value) != '':
            --region '$advancedOpt.region'
        #end if

        --binSize '$advancedOpt.binSize'
        --sampleSize '$advancedOpt.sampleSize'
        --regionSize '$advancedOpt.regionSize'

        ## Both BED inputs are optional; emit the flag only when a dataset is set.
        #if $advancedOpt.filterOut:
            --filterOut $advancedOpt.filterOut
        #end if

        #if $advancedOpt.extraSampling:
            --extraSampling $advancedOpt.extraSampling
        #end if

    #end if

    #if $saveBiasPlot:
        --biasPlot $biasPlot
    #end if

    ## #if $output.showOutputSettings == "yes"
    ##     #if $output.saveBiasPlot:
    ##         --biasPlot biasPlot.png ;
    ##         mv biasPlot.png $biasPlot
    ##     #end if
    ## #end if

    ## Remove the scratch directory. Options are placed before the operand so
    ## the call also works with rm implementations that stop option parsing at
    ## the first operand.
    ## NOTE(review): because the cleanup is chained with ';', the final exit
    ## status of the whole command is that of rm, not of computeGCBias.
    ; rm -rf $temp_dir

</command>
|
1
|
<inputs>

    <!-- BAM dataset whose GC bias is to be estimated. -->
    <param name="bamInput" format="bam" type="data" label="Input BAM file"
        help="The BAM file must be sorted."/>
    <!--<param name="species" type="text" value="" label="Species name abbreviation" />-->

    <!-- Genome build abbreviation passed to the species option of the command. -->
    <param name="species" type="select" label="Species name abbreviation">
        <option value="hg19">hg19</option>
        <option value="ce10">ce10</option>
        <option value="dm3">dm3</option>
        <option value="mm9">mm9</option>
    </param>

    <!-- Reference genome: an entry of the deepTools_seqs data table or a 2bit dataset from the history. -->
    <conditional name="source">
        <param name="ref_source" type="select" label="Reference genome">
            <option value="cached">locally cached</option>
            <option value="history">in your history</option>
        </param>
        <when value="cached">
            <param name="input1_2bit" type="select" label="Using reference genome" help="If your genome of interest is not listed, contact the Galaxy team">
                <options from_data_table="deepTools_seqs" />
            </param>
        </when>
        <when value="history">
            <param name="input1" type="data" format="twobit" label="Select a reference dataset in 2bit format" />
        </when>
    </conditional>

    <param name="fragmentLength" type="integer" value="300" min="1"
        label="Fragment length used for the sequencing"
        help="If paired-end reads are used, the fragment length is computed from the BAM file."/>

    <!-- Advanced options are only forwarded to the command when "yes" is selected. -->
    <conditional name="advancedOpt">
        <param name="showAdvancedOpt" type="select" label="Show advanced options" >
            <option value="no" selected="true">no</option>
            <option value="yes">yes</option>
        </param>
        <when value="no" />
        <when value="yes">
            <!-- The example region strings are quoted with &quot; so the attribute value stays well-formed. -->
            <param name="region" type="text" value=""
                label="Region of the genome to limit the operation to"
                help="This is useful when testing parameters to reduce the computing time. The format is chr:start:end, for example &quot;chr10&quot; or &quot;chr10:456700:891000&quot;" />

            <param name="binSize" type="integer" value="50" min="1"
                label="Bin size in bp"
                help="Size of the bins in bp for the output of the bigwig/bedgraph file."/>

            <param name="sampleSize" type="integer" value="50000000" min="1"
                label="Number of sampling points to be considered" />

            <param name="regionSize" type="integer" value="300" min="1"
                label="Region size"
                help="To plot the reads per GC over a region, the size of the region is required (see below for more details of the method). By default, the bin size is set to 300 bp, which is close to the standard fragment size for many sequencing applications. However, if the depth of sequencing is low, a larger bin size will be required, otherwise many bins will not overlap with any read."/>

            <param name="filterOut" type="data" format="bed" optional="true"
                label="BED file containing genomic regions to be excluded from the estimation of the correction"
                help="Such regions usually contain repetitive regions and peaks that if included will bias the correction. It is recommended to filter out known repetitive regions if multi-reads (reads that map to more than one genomic position) were excluded. In the case of ChIP-seq data, it is recommended to first use a peak caller to identify and filter out the identified peaks." />
            <param name="extraSampling" type="data" format="bed" optional="true"
                label="BED file containing genomic regions for which extra sampling is required because they are underrepresented in the genome"
                help="" />
        </when>
    </conditional>

    <!-- Checked by default; controls whether the bias plot output is produced. -->
    <param name="saveBiasPlot" type="boolean" truevalue="--biasPlot" falsevalue="" checked="True" label="Save a diagnostic image summarizing the GC bias found on the sample"/>
    <!--
    <conditional name="output" >
        <param name="showOutputSettings" type="select" label="Show additional output options" >
            <option value="no" selected="true">no</option>
            <option value="yes">yes</option>
        </param>
        <when value="no" />
        <when value="yes">
            <param name="saveBiasPlot" type="boolean" label="Save a diagnostic image summarizing the GC bias found on the sample"/>
        </when>
    </conditional>
    -->
</inputs>
|
|
<outputs>
    <!-- Tabular file written through the GCbiasFrequenciesFile option of the command. -->
    <data name="outFileName" format="tabular" />
    <!-- Diagnostic PNG; the filter keeps it only when the saveBiasPlot checkbox is ticked. -->
    <data name="biasPlot" format="png" label="${tool.name} on ${on_string}: bias plot">
        <filter>saveBiasPlot is True</filter>
        <!--<filter>(output['showOutputSettings'] == 'yes' and output['saveBiasPlot'] == True)</filter>-->
    </data>
</outputs>
|
|
<help>

**What it does**

This tool computes the GC bias using the method proposed by Benjamini and Speed (2012). Nucleic Acids Res. (see below for more explanations).
The output is used to plot the bias and can also be used later on to correct the bias with the tool correctGCbias.
There are two plots produced by the tool: a boxplot showing the absolute read numbers per genomic-GC bin and an x-y plot
depicting the ratio of observed/expected reads per genomic GC content bin.

-----

**Summary of the method used**

In order to estimate how many reads with what kind of GC content one should have sequenced, we first need to determine how many regions the specific
reference genome contains for each amount of GC content, i.e. how many regions in the genome have 50% GC (or 10% GC or 90% GC or...).
We then sample a large number of equally sized genome bins and count how many times we see a bin with 50% GC (or 10% GC or 90% or...). These EXPECTED values are independent of any
sequencing as they only depend on the respective reference genome (i.e. they will most likely vary between mouse and fruit fly due to their genomes' different GC contents).
The OBSERVED values are based on the reads from the sequenced sample. Instead of noting how many genomic regions there are per GC content, we now count the reads per GC content.
In an ideal sample without GC bias, the ratio of OBSERVED/EXPECTED values should be close to 1 regardless of the GC content. Due to PCR (over)amplifications, the majority of ChIP samples
usually show a significant bias towards reads with high GC content (>50%).

-----

.. class:: infomark

If you would like to give us feedback or you run into any trouble, please send an email to deeptools@googlegroups.com.

This tool is developed by the `Bioinformatics and Deep-Sequencing Unit`_ at the `Max Planck Institute for Immunobiology and Epigenetics`_.

.. _Bioinformatics and Deep-Sequencing Unit: http://www3.ie-freiburg.mpg.de/facilities/research-facilities/bioinformatics-and-deep-sequencing-unit/
.. _Max Planck Institute for Immunobiology and Epigenetics: http://www3.ie-freiburg.mpg.de

</help>
|
|
186 </tool>
|