comparison diffbind.xml @ 23:393393c58c35 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/diffbind commit cc4c1c4131518b9cbf986a1f252767ff73ca938e
author iuc
date Sat, 07 Apr 2018 15:45:03 -0400
parents 51f0f4df83c2
children 15bbd86c6c7b
comparison
equal deleted inserted replaced
22:51f0f4df83c2 23:393393c58c35
1 <tool id="diffbind" name="DiffBind" version="2.6.5.0"> 1 <tool id="diffbind" name="DiffBind" version="2.6.6.1">
2 <description> differential binding analysis of ChIP-Seq peak data</description> 2 <description> differential binding analysis of ChIP-Seq peak data</description>
3 <requirements> 3 <requirements>
4 <requirement type="package" version="2.6.5">bioconductor-diffbind</requirement> 4 <requirement type="package" version="2.6.6">bioconductor-diffbind</requirement>
5 <requirement type="package" version="1.20.0">r-getopt</requirement> 5 <requirement type="package" version="1.20.0">r-getopt</requirement>
6 <!--added rmysql requirement to remove: "Warning: namespace ‘RMySQL’ is not available"--> 6 <requirement type="package" version="0.2.15">r-rjson</requirement>
7 <requirement type="package" version="0.10.11">r-rmysql</requirement>
8 </requirements> 7 </requirements>
9 <stdio> 8 <stdio>
10 <regex match="Execution halted" 9 <regex match="Execution halted"
11 source="both" 10 source="both"
12 level="fatal" 11 level="fatal"
19 source="both" 18 source="both"
20 level="fatal" 19 level="fatal"
21 description="An undefined error occurred, please check your input carefully and contact your administrator." /> 20 description="An undefined error occurred, please check your input carefully and contact your administrator." />
22 </stdio> 21 </stdio>
23 <version_command><![CDATA[ 22 <version_command><![CDATA[
24 echo $(R --version | grep version | grep -v GNU)", DiffBind version" $(R --vanilla --slave -e "library(DiffBind); cat(sessionInfo()\$otherPkgs\$DiffBind\$Version)" 2> /dev/null | grep -v -i "WARNING: ")," getopt version" $(R --vanilla --slave -e "library(getopt); cat(sessionInfo()\$otherPkgs\$getopt\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", rmysql version" $(R --vanilla --slave -e "library(rmysql); cat(sessionInfo()\$otherPkgs\$rmysql\$Version)" 2> /dev/null | grep -v -i "WARNING: ") 23 echo $(R --version | grep version | grep -v GNU)", DiffBind version" $(R --vanilla --slave -e "library(DiffBind); cat(sessionInfo()\$otherPkgs\$DiffBind\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", rjson version" $(R --vanilla --slave -e "library(rjson); cat(sessionInfo()\$otherPkgs\$rjson\$Version)" 2> /dev/null | grep -v -i "WARNING: ")
25 ]]></version_command> 24 ]]></version_command>
26 <command><![CDATA[ 25 <command><![CDATA[
27 ## seems that diffbind also needs file extensions to work properly 26 #import re
28 #set $counter = 1 27 #import json
29 #for $sample in $samples: 28
30 ln -s $sample.bamreads #echo str($counter) + "_bamreads.bam"# && 29 ## Adapted from DESeq2 wrapper
31 ln -s ${sample.bamreads.metadata.bam_index} #echo str($counter) + "_bamreads.bai"# && 30 #set $temp_factor_names = list()
32 #if str( $sample.bamcontrol ) != 'None': 31 #set $temp_factor = list()
33 ln -s $sample.bamcontrol #echo str($counter) + "_bamcontrol.bam"# && 32
34 ln -s ${sample.bamcontrol.metadata.bam_index} #echo str($counter) + "_bamcontrol.bai"# && 33 #for $g in $rep_group:
35 #end if 34
36 #set $counter = $counter + 1 35 #set $peak_files = list()
36 #set $bam_files = list()
37 #set $bam_controls = list()
38
39 #for $file in $g.peaks:
40 #set $file_name = re.sub('[^\w\-\s]', '_', str($file.element_identifier))
41 ln -s '${file}' ${g.groupName}-${file_name}-peaks.bed &&
42 $peak_files.append(str($g.groupName) + '-' + $file_name + '-peaks.bed')
43 #end for
44
45 #for $bam in $g.bamreads:
46 #set $bam_name = re.sub('[^\w\-\s]', '_', str($bam.element_identifier))
47 ln -s '${bam}' ${bam_name}-bamreads.bam &&
48 ln -s ${bam.metadata.bam_index} ${bam_name}-bamreads.bai &&
49 $bam_files.append($bam_name + '-bamreads.bam')
50 #end for
51
52 $temp_factor.append( {str($g.groupName): $peak_files} )
53 $temp_factor.append( {str($g.groupName): $bam_files} )
54
55 #if str( $g.bamcontrol ) != 'None':
56 #for $ctrl in $g.bamcontrol:
57 #set $ctrl_name = re.sub('[^\w\-\s]', '_', str($ctrl.element_identifier))
58 ln -s '${ctrl}' ${g.groupName}-${ctrl_name}-bamcontrol.bam &&
59 ln -s ${ctrl.metadata.bam_index} ${g.groupName}-${ctrl_name}-bamcontrol.bai &&
60 $bam_controls.append(str($g.groupName) + '-' + $ctrl_name + '-bamcontrol.bam')
37 #end for 61 #end for
38 62 $temp_factor.append( {str($g.groupName): $bam_controls} )
39 Rscript '$__tool_directory__/diffbind.R' 63 #end if
40 -i $infile 64
41 -o '$outfile' 65 #end for
42 -p '$plots' 66
43 -f $format 67 $temp_factor.reverse()
44 -t $th 68 $temp_factor_names.append([str($factorName), $temp_factor])
45 69
46 #if $binding_affinity_matrix: 70
47 -b 71 Rscript '$__tool_directory__/diffbind.R'
48 #end if 72
73 -i '#echo json.dumps(temp_factor_names)#'
74 -o '$outfile'
75 -t $th
76 -f $out.format
77 -p '$plots'
78
79 #if $scorecol:
80 -n "$scorecol"
81 #end if
82 #if $lowerbetter:
83 -l "$lowerbetter"
84 #end if
85 #if $summits:
86 -s "$summits"
87 #end if
88
89 #if $out.binding_matrix:
90 -b
91 #end if
92
93 #if $out.rdata:
94 -r
95 #end if
96
97 #if $out.analysis_info:
98 -a
99 #end if
100
101 #if $out.rscript:
102 && cp '$__tool_directory__/diffbind.R' '$rscript'
103 #end if
49 ]]> 104 ]]>
50 </command> 105 </command>
51 <configfiles>
52 <configfile name="infile"><![CDATA[
53 #set $counter = 1
54 #for $sample in $samples:
55 #if str( $sample.bamcontrol ) != 'None' and $counter == 1:
56 SampleID,Tissue,Factor,Condition,Replicate,bamReads,bamControl,Peaks
57 #elif $counter == 1:
58 SampleID,Tissue,Factor,Condition,Replicate,bamReads,Peaks
59 #end if
60 #if str( $sample.bamcontrol ) != 'None':
61 $sample.sample_id,$sample.tissue,$sample.factor,$sample.condition,$sample.replicate,#echo str($counter) + '_bamreads.bam'#,#echo str($counter) + '_bamcontrol.bam'#,$sample.peaks
62 #else:
63 $sample.sample_id,$sample.tissue,$sample.factor,$sample.condition,$sample.replicate,#echo str($counter) + '_bamreads.bam'#,$sample.peaks
64 #end if
65 #set $counter = $counter + 1
66 #end for]]></configfile>
67 </configfiles>
68 <inputs> 106 <inputs>
69 <repeat name="samples" title="Samples" min="2"> 107 <param name="factorName" type="text" label="Name" help="Name of experiment factor of interest (e.g. Condition). One factor must be entered and there must be two or more groups. NOTE: Please only use letters, numbers or underscores.">
70 <param name="sample_id" type="text" value="Sample ID" label="Specify a sample id" help="e.g. BT474.1-" /> 108 <sanitizer>
71 <param name="tissue" type="text" value="Tissue" label="Specify the tissue" help="e.g. BT474" /> 109 <valid initial="string.letters,string.digits"><add value="_" /></valid>
72 <param name="factor" type="text" value="Factor Name" label="Specify a factor name" help="e.g. ER" /> 110 </sanitizer>
73 <param name="condition" type="text" value="Condition" label="Specify the condition" help="e.g. Resistent" /> 111 </param>
74 <param name="replicate" type="integer" value="1" label="Specify the replicate number" help="e.g. 1" /> 112 <repeat name="rep_group" title="Group" min="2" default="2">
75 <param name="bamreads" type="data" format="bam" label="Read BAM file" help="Specify the Read BAM file, used for Peak calling."/> 113 <param name="groupName" type="text" label="Name"
76 <param name="bamcontrol" type="data" format="bam" optional="True" label="Control BAM file" help="If specifying a control BAM file for this sample, then all samples are required to specify one."/> 114 help="Name of group that the peak files belong to (e.g. Resistant or Responsive). NOTE: Please only use letters, numbers or underscores (case sensitive).">
77 <param name="peaks" type="data" format="bed" label="Peak file" help="Result of your Peak calling experiment."/> 115 <sanitizer>
116 <valid initial="string.letters,string.digits"><add value="_" /></valid>
117 </sanitizer>
118 </param>
119 <param name="peaks" type="data" format="bed" multiple="true" label="Peak files" help="Result of your Peak calling experiment"/>
120 <param name="bamreads" type="data" format="bam" multiple="true" label="Read BAM file" help="Specify the Read BAM file used for Peak calling."/>
121 <param name="bamcontrol" type="data" format="bam" multiple="true" optional="True" label="Control BAM file" help="If specifying a control BAM file, all samples are required to specify one."/>
78 </repeat> 122 </repeat>
79 <param name="th" type="float" value="1" min="0" max="1" 123
80 label="FDR Threshold" 124 <param name="scorecol" type="integer" min="0" value="8" label="Score Column" help="Column in peak files that contains peak scores. Default: 8 (narrowPeak)"/>
81 help="Significance threshold; all sites with FDR less than or equal to this value will be included in the report. A value of 1 will include all binding sites in the report. Default: 1"/> 125 <param name="lowerbetter" type="boolean" truevalue="True" falsevalue="" checked="False" label="Lower score is better?" help="DiffBind by default assumes that a higher score indicates a better peak, for example narrowPeaks -log10pvalue. If this is not the case, for example if the score is a p-value or FDR, set this option to Yes. Default: No" />
82 <param name="pdf" type="boolean" truevalue="" falsevalue="" checked="true" 126 <param name="summits" type="integer" min="0" optional="True" label="Summits" help="Extend peaks Nbp up- and downstream of the summit. For punctate peaks it is advisable to extend (e.g. 250bp), see the DiffBind User Guide"/>
83 label="Visualising the analysis results" 127 <param name="th" type="float" value="0.05" min="0" max="1" label="FDR Threshold" help="Significance threshold; all sites with FDR less than or equal to this value will be included in the output. A value of 1 will output all binding sites. Default: 0.05"/>
84 help="output an additional PDF file" /> 128
85 <param name="format" type="select" label="Output Format"> 129 <!-- Output Options -->
86 <option value="bed">BED</option> 130 <section name="out" expanded="false" title="Output Options">
87 <option value="gff">GFF</option> 131 <param name="format" type="select" label="Output Format">
88 <option value="wig">WIG</option> 132 <option value="bed">BED</option>
89 </param> 133 <option value="gff">GFF</option>
90 <param name="binding_affinity_matrix" type="boolean" truevalue="True" falsevalue="" checked="False" label="Output binding affinity matrix?" help="Output a table of the binding scores" /> 134 <option value="wig">WIG</option>
135 </param>
136 <param name="pdf" type="boolean" truevalue="True" falsevalue="" checked="False" label="Visualising the analysis results" help="output an additional PDF file" />
137 <param name="binding_matrix" type="boolean" truevalue="True" falsevalue="" checked="False" label="Output binding affinity matrix?" help="Output a table of the binding scores" />
138 <param name="rdata" type="boolean" truevalue="True" falsevalue="" checked="False" label="Output RData file?" help="Output all the data used by R to construct the plots and tables, can be loaded into R. Default: No"/>
139 <param name="rscript" type="boolean" truevalue="True" falsevalue="False" checked="False" label="Output Rscript?" help="If this option is set to Yes, the Rscript used will be provided as a text file in the output. Default: No"/>
140 <param name="analysis_info" type="boolean" truevalue="True" falsevalue="False" checked="False" label="Output analysis info?" help="If this option is set to Yes, information from the dba.count and dba.analyze commands will be output in a text file. Default: No"/>
141 </section>
91 </inputs> 142 </inputs>
143
92 <outputs> 144 <outputs>
93 <data name="outfile" format="bed" label="Differential binding sites on ${on_string}"> 145 <data name="outfile" format="bed" label="${tool.name} on ${on_string}: Differentially bound sites">
94 <change_format> 146 <change_format>
95 <when input="format" value="wig" format="wig" /> 147 <when input="format" value="wig" format="wig" />
96 <when input="format" value="gff" format="gff" /> 148 <when input="format" value="gff" format="gff" />
97 </change_format> 149 </change_format>
98 </data> 150 </data>
99 <data name="plots" format="pdf" label="Differential binding sites on ${on_string}"> 151 <data name="plots" format="pdf" label="${tool.name} on ${on_string}: Plots">
100 <filter>pdf == True</filter> 152 <filter>out['pdf']</filter>
101 </data> 153 </data>
102 <data name="binding_matrix" format="tabular" from_work_dir="bmatrix.tab" label="Differential binding sites on ${on_string}"> 154 <data name="binding_matrix" format="tabular" from_work_dir="bmatrix.tab" label="${tool.name} on ${on_string}: Binding matrix">
103 <filter>binding_affinity_matrix == True</filter> 155 <filter>out['binding_matrix']</filter>
156 </data>
157 <data name="rdata" format="rdata" from_work_dir="DiffBind_analysis.RData" label="${tool.name} on ${on_string}: RData file">
158 <filter>out['rdata']</filter>
159 </data>
160 <data name="rscript" format="txt" label="${tool.name} on ${on_string}: Rscript">
161 <filter>out['rscript']</filter>
162 </data>
163 <data name="analysis_info" format="txt" from_work_dir="DiffBind_analysis_info.txt" label="${tool.name} on ${on_string}: Analysis info">
164 <filter>out['analysis_info']</filter>
104 </data> 165 </data>
105 </outputs> 166 </outputs>
167
106 <tests> 168 <tests>
107 <test> 169 <test expect_num_outputs="6">
108 <repeat name="samples"> 170 <param name="factorName" value="Condition"/>
109 <param name="sample_id" value="BT4741" /> 171 <repeat name="rep_group">
110 <param name="tissue" value="BT474" /> 172 <param name="groupName" value="Resistant"/>
111 <param name="factor" value="ER" /> 173 <param name="peaks" value="BT474_ER_1.bed.gz,BT474_ER_2.bed.gz"/>
112 <param name="condition" value="Resistant" /> 174 <param name="bamreads" ftype="bam" value="BT474_ER_1.bam,BT474_ER_2.bam" />
113 <param name="replicate" value="1" />
114 <param name="bamreads" ftype="bam" value="BT474_ER_1.bam" />
115 <param name="peaks" ftype="bed" value="BT474_ER_1.bed.gz" />
116 </repeat> 175 </repeat>
117 <repeat name="samples"> 176 <repeat name="rep_group">
118 <param name="sample_id" value="BT4742" /> 177 <param name="groupName" value="Responsive"/>
119 <param name="tissue" value="BT474" /> 178 <param name="peaks" value="MCF7_ER_1.bed.gz,MCF7_ER_2.bed.gz"/>
120 <param name="factor" value="ER" /> 179 <param name="bamreads" ftype="bam" value="MCF7_ER_1.bam,MCF7_ER_2.bam" />
121 <param name="condition" value="Resistant" />
122 <param name="replicate" value="2" />
123 <param name="bamreads" ftype="bam" value="BT474_ER_2.bam" />
124 <param name="peaks" ftype="bed" value="BT474_ER_2.bed.gz" />
125 </repeat> 180 </repeat>
126 <repeat name="samples"> 181 <param name="scorecol" value="5" />
127 <param name="sample_id" value="MCF71" />
128 <param name="tissue" value="MCF7" />
129 <param name="factor" value="ER" />
130 <param name="condition" value="Responsive" />
131 <param name="replicate" value="1" />
132 <param name="bamreads" ftype="bam" value="MCF7_ER_1.bam" />
133 <param name="peaks" ftype="bed" value="MCF7_ER_1.bed.gz" />
134 </repeat>
135 <repeat name="samples">
136 <param name="sample_id" value="MCF72" />
137 <param name="tissue" value="MCF7" />
138 <param name="factor" value="ER" />
139 <param name="condition" value="Responsive" />
140 <param name="replicate" value="2" />
141 <param name="bamreads" ftype="bam" value="MCF7_ER_2.bam" />
142 <param name="peaks" ftype="bed" value="MCF7_ER_2.bed.gz" />
143 </repeat>
144 <param name="pdf" value="True" /> 182 <param name="pdf" value="True" />
145 <param name="binding_affinity_matrix" value="True" /> 183 <param name="binding_matrix" value="True" />
184 <param name="rdata" value="True" />
185 <param name="rscript" value="True"/>
186 <param name="analysis_info" value="True"/>
146 <output name="outfile" value="out_diffbind.bed" /> 187 <output name="outfile" value="out_diffbind.bed" />
188 <output name="plots" value="out_plots.pdf" compare="sim_size" />
147 <output name="binding_matrix" value="out_binding.matrix" /> 189 <output name="binding_matrix" value="out_binding.matrix" />
190 <output name="rdata" value="DiffBind_analysis.RData" compare="sim_size"/>
191 <output name="rscript" value="out_rscript.txt"/>
192 <output name="analysis_info" value="out_analysis_info.txt" compare="sim_size" >
193 <assert_contents>
194 <has_text text="SessionInfo"/>
195 </assert_contents>
196 </output>
148 </test> 197 </test>
149 </tests> 198 </tests>
150 <help><![CDATA[ 199 <help><![CDATA[
151 200
152 .. class:: infomark 201 .. class:: infomark
164 between two sample groups. It includes functions to support the processing of peak sets, 213 between two sample groups. It includes functions to support the processing of peak sets,
165 including overlapping and merging peak sets, counting sequencing reads overlapping intervals 214 including overlapping and merging peak sets, counting sequencing reads overlapping intervals
166 in peak sets, and identifying statistically significantly differentially bound sites based on 215 in peak sets, and identifying statistically significantly differentially bound sites based on
167 evidence of binding affinity (measured by differences in read densities). To this end it uses 216 evidence of binding affinity (measured by differences in read densities). To this end it uses
168 statistical routines developed in an RNA-Seq context (primarily the Bioconductor packages 217 statistical routines developed in an RNA-Seq context (primarily the Bioconductor packages
169 edgeR and DESeq2 ). Additionally, the package builds on Rgraphics routines to provide a 218 edgeR and DESeq2). Additionally, the package builds on Rgraphics routines to provide a
170 set of standardized plots to aid in binding analysis. 219 set of standardized plots to aid in binding analysis.
171 220
172 The `DiffBind User Guide`_ includes a brief overview of the processing flow, followed by four sections of 221 The `DiffBind User Guide`_ includes a brief overview of the processing flow, followed by four sections of
173 examples: the first focusing on the core task of obtaining differentially bound sites based on 222 examples: the first focusing on the core task of obtaining differentially bound sites based on
174 affinity data, the second working through the main plotting routines, the third discussing the 223 affinity data, the second working through the main plotting routines, the third discussing the
175 use of a blocking factor, and the fourth revisiting occupancy data (peak calls) in more detail, 224 use of a blocking factor, and the fourth revisiting occupancy data (peak calls) in more detail,
176 as well as comparing the results of an occupancy-based analysis with an affinity-based one. 225 as well as comparing the results of an occupancy-based analysis with an affinity-based one.
177 Finally, certain technical aspects of how these analyses are accomplished are detailed. 226 Finally, certain technical aspects of how these analyses are accomplished are detailed.
178 227
179 Note DiffBind requires a minimum of four samples (two groups with two replicates each). 228 Note this DiffBind tool requires a minimum of four samples (two groups with two replicates each).
180 229
181 .. _DiffBind: https://bioconductor.org/packages/release/bioc/html/DiffBind.html 230 -----
182 .. _`Bioconductor package`: https://bioconductor.org/packages/release/bioc/html/DiffBind.html
183 .. _`DiffBind User Guide`: https://bioconductor.org/packages/release/bioc/vignettes/DiffBind/inst/doc/DiffBind.pdf
184 231
185 **Inputs** 232 **Inputs**
186 233
187 DiffBind works primarily with peaksets, which are sets of genomic intervals representing 234 DiffBind works primarily with peaksets, which are sets of genomic intervals representing
188 candidate protein binding sites. Each interval consists of a chromosome, a start and end 235 candidate protein binding sites. Each interval consists of a chromosome, a start and end
192 be associated with each peakset (one for the ChIP data, and optionally another representing 239 be associated with each peakset (one for the ChIP data, and optionally another representing
193 a control sample) 240 a control sample)
194 241
195 **Sample Information** 242 **Sample Information**
196 243
197 You have to specify your sample information in the tool form above. 244 You have to specify your sample information in the tool form above, where Factor is the set of groups you want to compare (e.g. Resistant and Responsive).
198 245
199 Example: 246 Example:
200 247
201 ============= ========== ========== ============= ============= 248 ============= =============
202 **SampleID** **Tissue** **Factor** **Condition** **Replicate** 249 **SampleID** **Group**
203 ------------- ---------- ---------- ------------- ------------- 250 ------------- -------------
204 BT4741 BT474 ER Resistant 1 251 BT4741 Resistant
205 BT4742 BT474 ER Resistant 2 252 BT4742 Resistant
206 MCF71 MCF7 ER Responsive 1 253 MCF71 Responsive
207 MCF72 MCF7 ER Responsive 2 254 MCF72 Responsive
208 MCF73 MCF7 ER Responsive 3 255 ============= =============
209 T47D1 T47D ER Responsive 1
210 T47D2 T47D ER Responsive 2
211 MCF7r1 MCF7 ER Resistant 1
212 MCF7r2 MCF7 ER Resistant 2
213 ZR751 ZR75 ER Responsive 1
214 ZR752 ZR75 ER Responsive 2
215 ============= ========== ========== ============= =============
216
217 Or provide a sample sheet tabular file such as below.
218
219 Example:
220
221 ======== ====== ====== ========== ========== ========= ==================== ========= ===================== ================= ==========
222 SampleID Tissue Factor Condition Treatment Replicate bamReads ControlID bamControl Peaks PeakCaller
223 ======== ====== ====== ========== ========== ========= ==================== ========= ===================== ================= ==========
224 BT4741 BT474 ER Resistant Full-Media 1 Chr18_BT474_ER_1.bam BT474c Chr18_BT474_input.bam BT474_ER_1.bed.gz bed
225 BT4742 BT474 ER Resistant Full-Media 2 Chr18_BT474_ER_2.bam BT474c Chr18_BT474_input.bam BT474_ER_2.bed.gz bed
226 MCF71 MCF7 ER Responsive Full-Media 1 Chr18_MCF7_ER_1.bam MCF7c Chr18_MCF7_input.bam MCF7_ER_1.bed.gz bed
227 MCF72 MCF7 ER Responsive Full-Media 2 Chr18_MCF7_ER_2.bam MCF7c Chr18_MCF7_input.bam MCF7_ER_2.bed.gz bed
228 MCF73 MCF7 ER Responsive Full-Media 3 Chr18_MCF7_ER_3.bam MCF7c Chr18_MCF7_input.bam MCF7_ER_3.bed.gz bed
229 T47D1 T47D ER Responsive Full-Media 1 Chr18_T47D_ER_1.bam T47Dc Chr18_T47D_input.bam T47D_ER_1.bed.gz bed
230 T47D2 T47D ER Responsive Full-Media 2 Chr18_T47D_ER_2.bam T47Dc Chr18_T47D_input.bam T47D_ER_2.bed.gz bed
231 MCF7r1 MCF7 ER Resistant Full-Media 1 Chr18_TAMR_ER_1.bam TAMRc Chr18_TAMR_input.bam TAMR_ER_1.bed.gz bed
232 MCF7r2 MCF7 ER Resistant Full-Media 2 Chr18_TAMR_ER_2.bam TAMRc Chr18_TAMR_input.bam TAMR_ER_2.bed.gz bed
233 ZR751 ZR75 ER Responsive Full-Media 1 Chr18_ZR75_ER_1.bam ZR75c Chr18_ZR75_input.bam ZR75_ER_1.bed.gz bed
234 ZR752 ZR75 ER Responsive Full-Media 2 Chr18_ZR75_ER_2.bam ZR75c Chr18_ZR75_input.bam ZR75_ER_2.bed.gz bed
235 ======== ====== ====== ========== ========== ========= ==================== ========= ===================== ================= ==========
236 256
237 257
238 **Peak files** 258 **Peak files**
239 259
240 Result of your Peak calling experiment in bed format, one file for each sample is required. 260 Result of your Peak calling experiment in bed format, one file for each sample is required. The peak caller, format and score column can be specified in the tool form above. The default settings expect narrowPeak bed format, which has the score in the 8th column (-log10pvalue), and can be output from MACS2.
241 261
242 Example: 262 Example (MACS.xls file in bed format):
243 263
244 ======= ======= ======= =============== ======= 264 ======= ======= ======= =============== ==============
245 1 2 3 4 **5** 265 1 2 3 4 **5 (Score)**
246 ======= ======= ======= =============== ======= 266 ======= ======= ======= =============== ==============
247 chr18 215562 216063 MACS_peak_16037 56.11 267 chr18 215562 216063 MACS_peak_16037 56.11
248 chr18 311530 312105 MACS_peak_16038 222.49 268 chr18 311530 312105 MACS_peak_16038 222.49
249 chr18 356656 357315 MACS_peak_16039 92.06 269 chr18 356656 357315 MACS_peak_16039 92.06
250 chr18 371110 372092 MACS_peak_16040 123.86 270 chr18 371110 372092 MACS_peak_16040 123.86
251 chr18 395116 396464 MACS_peak_16041 1545.39 271 chr18 395116 396464 MACS_peak_16041 1545.39
252 chr18 399014 400382 MACS_peak_16042 1835.19 272 chr18 399014 400382 MACS_peak_16042 1835.19
253 chr18 499134 500200 MACS_peak_16043 748.32 273 chr18 499134 500200 MACS_peak_16043 748.32
254 chr18 503518 504552 MACS_peak_16044 818.30 274 chr18 503518 504552 MACS_peak_16044 818.30
255 chr18 531672 532274 MACS_peak_16045 159.30 275 chr18 531672 532274 MACS_peak_16045 159.30
256 chr18 568326 569282 MACS_peak_16046 601.11 276 chr18 568326 569282 MACS_peak_16046 601.11
257 ======= ======= ======= =============== ======= 277 ======= ======= ======= =============== ==============
258 278
259 * BAM file which contains the mapped sequencing reads can be associated with each peakset 279 * BAM file which contains the mapped sequencing reads associated with each peakset, one file for each sample is required.
260 * Control BAM file represents a control dataset and are optional, but have to specified for all when used. 280 * Optional: Control BAM file representing a control dataset. If used, has to be specified for all samples. Note that the DiffBind authors say control reads are best utilized prior to running DiffBind, at the peak calling stage (e.g. with MACS2) and in blacklists, see this `Bioconductor post`_.
261 281
282 -----
262 283
263 **Outputs** 284 **Outputs**
264 285
286 This tool outputs
287
288 * differentially bound sites in BED, WIG or GFF format
289
290 Optionally, under **Output Options** you can choose to output
291
292 * a PDF of plots (Heatmap, PCA, MA, Volcano, Boxplots)
293 * a binding affinity matrix
294 * the R script used by this tool
295 * an RData file of the R objects generated
296 * a text file with information on the analysis (number of Intervals, FriP scores, method used)
297
298 **Differentially Bound Sites**
299
265 As output format you can choose BED, GFF, WIG. 300 As output format you can choose BED, GFF, WIG.
266 301
267 Example: 302 Example - BED format:
268 303
269 ======== ====== =======+ 304 ======== ====== ====== ===== ====== ===== =============== ============== ======= ======== ========
270 seqnames ranges strand Conc Conc_Resistant 305 seqnames start end width strand Conc Conc_Responsive Conc_Resistant Fold p.value **FDR**
271 306 ======== ====== ====== ===== ====== ===== =============== ============== ======= ======== ========
272 2452 chr18 [64490686, 64491186] * | 6.36 1.39 307 chr18 394600 396513 1914 * 7.15 5.55 7.89 -2.35 7.06e-24 9.84e-21
273 1291 chr18 [34597713, 34598213] * | 5.33 0.22 308 chr18 111567 112005 439 * 5.71 6.53 3.63 2.89 1.27e-08 8.88e-06
274 976 chr18 [26860997, 26861497] * | 7.3 3.13 309 chr18 346464 347342 879 * 5 5.77 3.24 2.52 6.51e-06 0.00303
275 2338 chr18 [60892900, 60893400] * | 7.13 1.84 310 chr18 399014 400382 1369 * 7.62 7 8.05 -1.04 1.04e-05 0.00364
276 2077 chr18 [55569087, 55569587] * | 5.52 1.89 311 chr18 371110 372102 993 * 4.63 3.07 5.36 -2.3 8.1e-05 0.0226
277 312 ======== ====== ====== ===== ====== ===== =============== ============== ======= ======== ========
278 Conc_Responsive Fold p-value FDR 313
279 <numeric> <numeric> <numeric> <numeric> 314 Columns contain the following data:
280 2452 7 -5.61 3.57e-10 1.02e-06 315
281 1291 5.97 -5.75 1.1e-09 1.57e-06 316 * **1st**: Chromosome name
282 976 7.92 -4.79 1.1e-08 1.05e-05 317 * **2nd**: Start position of site
283 2338 7.77 -5.93 1.68e-08 1.17e-05 318 * **3rd**: End position of site
284 2077 6.13 -4.23 2.36e-08 1.17e-05 319 * **4th**: Length of site
285 320 * **5th**: Strand
286 The value columns show the 321 * **6th**: Mean read concentration over all the samples (the default calculation uses log2 normalized ChIP read counts with control read counts subtracted)
287 Conc mean read concentration over all the samples (the default calculation uses log2 normalized ChIP read counts with control read counts subtracted) 322 * **7th**: Mean concentration over the first (e.g. Resistant) group
288 Conc_Resistant mean concentration over the first (Resistant) group 323 * **8th**: Mean concentration over second (e.g. Responsive) group
289 Conc_Responsive mean concentration over second (Responsive) group 324 * **9th**: Fold shows the difference in mean concentrations between the two groups (e.g. Resistant - Responsive), with a positive value indicating increased binding affinity in the first group and a negative value indicating increased binding affinity in the second group.
290 Fold column shows the difference in mean concentrations between the two groups (Conc_Resistant - Conc_Responsive), with a positive value indicating increased binding affinity in the Resistant group and a negative value indicating increased binding affinity in the Responsive group. 325 * **10th**: P-value confidence measure for identifying these sites as differentially bound
291 p-value confidence measure for identifying these sites as differentially bound 326 * **11th**: a multiple testing corrected FDR p-value
292 FDR a multiple testing corrected FDR p-value
293 327
294 328
295 **Binding Affinity Matrix** 329 **Binding Affinity Matrix**
296 330
297 The final result of counting is a binding affinity matrix containing a (normalized) read count for each sample at every potential binding site. With this matrix, the samples can be re-clustered using affinity, rather than occupancy, data. The binding affinity matrix can be used for QC plotting as well as for subsequent 331 The final result of counting is a binding affinity matrix containing a (normalized) read count for each sample at every potential binding site. With this matrix, the samples can be re-clustered using affinity, rather than occupancy, data. The binding affinity matrix can be used for QC plotting as well as for subsequent
298 differential analysis. 332 differential analysis.
299 333
300 Example: 334 Example:
301 335
302 ====== ====== ====== ========== ========== ========= ====== ========= ==== 336 ===== ====== ====== ================ ================ ================ ================
303 ID Tissue Factor Condition Treatment Replicate Caller Intervals FRiP 337 CHR START END MCF7_ER_1.bed MCF7_ER_2.bed BT474_ER_1.bed BT474_ER_2.bed
304 ====== ====== ====== ========== ========== ========= ====== ========= ==== 338 ===== ====== ====== ================ ================ ================ ================
305 BT4741 BT474 ER Resistant Full-Media 1 counts 2845 0.16 339 chr18 111567 112005 137.615208000375 59.878372946728 29.4139375878664 19.9594576489093
306 BT4742 BT474 ER Resistant Full-Media 2 counts 2845 0.15 340 chr18 189223 189652 19.9594576489093 12.6059732519427 11.5554754809475 23.110950961895
307 MCF71 MCF7 ER Responsive Full-Media 1 counts 2845 0.27 341 chr18 215232 216063 11.5554754809475 15.7574665649284 31.5149331298568 72.4843461986707
308 MCF72 MCF7 ER Responsive Full-Media 2 counts 2845 0.17 342 chr18 311530 312172 17.8584621069189 11.5554754809475 54.6258840917518 43.0704086108043
309 MCF73 MCF7 ER Responsive Full-Media 3 counts 2845 0.23 343 chr18 346464 347342 75.6358395116564 40.9694130688139 21.0099554199046 16.8079643359236
310 T47D1 T47D ER Responsive Full-Media 1 counts 2845 0.10 344 chr18 356560 357362 11.5554754809475 14.7069687939332 57.7773774047375 53.5753863207566
311 T47D2 T47D ER Responsive Full-Media 2 counts 2845 0.06 345 chr18 371110 372102 8.40398216796182 9.45447993895705 81.9388261376278 82.989323908623
312 MCF7r1 MCF7 ER Resistant Full-Media 1 counts 2845 0.20 346 chr18 394600 396513 56.7268796337423 43.0704086108043 510.541916703681 438.05757050501
313 MCF7r2 MCF7 ER Resistant Full-Media 2 counts 2845 0.13 347 chr18 399014 400382 156.524167878289 117.655750351465 558.864814169461 496.885445680743
314 ZR751 ZR75 ER Responsive Full-Media 1 counts 2845 0.32 348 chr18 498906 500200 767.913870597511 278.381909313735 196.443083176108 181.736114382174
315 ZR752 ZR75 ER Responsive Full-Media 2 counts 2845 0.22 349 ===== ====== ====== ================ ================ ================ ================
316 ====== ====== ====== ========== ========== ========= ====== ========= ==== 350
317 351 -----
318
319 352
320 **More Information** 353 **More Information**
321 354
322 Generally, processing data with DiffBind involves five phases: 355 Generally, processing data with DiffBind involves five phases:
323 356
326 #. Counting reads 359 #. Counting reads
327 #. Differential binding affinity analysis 360 #. Differential binding affinity analysis
328 #. Plotting and reporting 361 #. Plotting and reporting
329 362
330 363
331 * **Reading in peaksets**: 364 **Reading in peaksets**:
332 365
333 The first step is to read in a set of peaksets and associated 366 The first step is to read in a set of peaksets and associated
334 metadata. Peaksets are derived either from ChIP-Seq peak callers, such as MACS 367 metadata. Peaksets are derived either from ChIP-Seq peak callers, such as **MACS2**, or using some other criterion (e.g. genomic windows, or all the promoter regions
335 ([1]), or using some other criterion (e.g. genomic windows, or all the promoter regions 368 in a genome). A single experiment can have more than
336 in a genome). The easiest way to read in peaksets is using a comma-separated value
337 (csv) sample sheet with one line for each peakset. (Spreadsheets in Excel® format, with
338 a .xls or .xlsx suffix, are also accepted.) A single experiment can have more than
339 one associated peakset; e.g. if multiple peak callers are used for comparison purposes, 369 one associated peakset; e.g. if multiple peak callers are used for comparison purposes,
340 each sample would have more than one line in the sample sheet. Once the peaksets 370 each sample would have more than one line in the sample sheet. Once the peaksets
341 are read in, a merging function finds all overlapping peaks and derives a single set of 371 are read in, a merging function finds all overlapping peaks and derives a single set of
342 unique genomic intervals covering all the supplied peaks (a consensus peakset for the 372 unique genomic intervals covering all the supplied peaks (a consensus peakset for the
343 experiment). 373 experiment).
344 374
345 * **Occupancy analysis**: 375 **Occupancy analysis**:
346 376
347 Peaksets, especially those generated by peak callers, provide 377 Peaksets, especially those generated by peak callers, provide
348 an insight into the potential occupancy of the protein being ChIPed at specific 378 an insight into the potential occupancy of the protein being ChIPed at specific
349 genomic loci. After the peaksets have been loaded, it can be useful to perform some 379 genomic loci. After the peaksets have been loaded, it can be useful to perform some
350 exploratory plotting to determine how these occupancy maps agree with each other, 380 exploratory plotting to determine how these occupancy maps agree with each other,
354 overlaps to be examined, as well as functions to determine how well similar samples 384 overlaps to be examined, as well as functions to determine how well similar samples
355 cluster together. Beyond quality control, the product of an occupancy analysis may be 385 cluster together. Beyond quality control, the product of an occupancy analysis may be
356 a consensus peakset, representing an overall set of candidate binding sites to be used 386 a consensus peakset, representing an overall set of candidate binding sites to be used
357 in further analysis. 387 in further analysis.
358 388
359 * **Counting reads**: 389 **Counting reads**:
360 390
361 Once a consensus peakset has been derived, DiffBind can use the 391 Once a consensus peakset has been derived, DiffBind can use the
362 supplied sequence read files to count how many reads overlap each interval for each 392 supplied sequence read files to count how many reads overlap each interval for each
363 unique sample. The peaks in the consensus peakset may be re-centered and trimmed 393 unique sample. The peaks in the consensus peakset may be re-centered and trimmed
364 based on calculating their summits (point of greatest read overlap) in order to provide 394 based on calculating their summits (point of greatest read overlap) in order to provide
366 containing a (normalized) read count for each sample at every potential binding site. 396 containing a (normalized) read count for each sample at every potential binding site.
367 With this matrix, the samples can be re-clustered using affinity, rather than occupancy, 397 With this matrix, the samples can be re-clustered using affinity, rather than occupancy,
368 data. The binding affinity matrix is used for QC plotting as well as for subsequent 398 data. The binding affinity matrix is used for QC plotting as well as for subsequent
369 differential analysis. 399 differential analysis.
370 400
371 * **Differential binding affinity analysis**: 401 **Differential binding affinity analysis**:
372 402
373 The core functionality of DiffBind is the 403 The core functionality of DiffBind is the
374 differential binding affinity analysis, which enables binding sites to be identified that 404 differential binding affinity analysis, which enables binding sites to be identified that
375 are statistically significantly differentially bound between sample groups. To accomplish 405 are statistically significantly differentially bound between sample groups. To accomplish
376 this, first a contrast (or contrasts) is established, dividing the samples into groups to 406 this, first a contrast (or contrasts) is established, dividing the samples into groups to
377 be compared. Next the core analysis routines are executed, by default using DESeq2. 407 be compared. Next the core analysis routines are executed, by default using DESeq2.
378 This will assign a p-value and FDR to each candidate binding site indicating confidence 408 This will assign a p-value and FDR to each candidate binding site indicating confidence
379 that they are differentially bound. 409 that they are differentially bound.
380 410
381 * **Plotting and reporting**: 411 **Plotting and reporting**:
382 412
383 Once one or more contrasts have been run, DiffBind provides 413 Once one or more contrasts have been run, DiffBind provides
384 a number of functions for reporting and plotting the results. MA plots give an 414 a number of functions for reporting and plotting the results. MA plots give an
385 overview of the results of the analysis, while correlation heatmaps and PCA plots show 415 overview of the results of the analysis, while correlation heatmaps and PCA plots show
386 how the groups cluster based on differentially bound sites. Boxplots show the distribution 416 how the groups cluster based on differentially bound sites. Boxplots show the distribution
387 of reads within differentially bound sites corresponding to whether they gain or 417 of reads within differentially bound sites corresponding to whether they gain or
388 lose affinity between the two sample groups. A reporting mechanism enables differentially 418 lose affinity between the two sample groups. A reporting mechanism enables differentially
389 bound sites to be extracted for further processing, such as annotation, motif, and 419 bound sites to be extracted for further processing, such as annotation, motif, and
390 pathway analyses. 420 pathway analyses.
391 421
422 -----
423
392 **References** 424 **References**
393 425
394 DiffBind Authors: Rory Stark, Gordon Brown (2011) 426 DiffBind Authors: Rory Stark, Gordon Brown (2011)
395 Wrapper authors: Bjoern Gruening, Pavankumar Videm 427 Wrapper authors: Bjoern Gruening, Pavankumar Videm
428
429 .. _DiffBind: https://bioconductor.org/packages/release/bioc/html/DiffBind.html
430 .. _`Bioconductor package`: https://bioconductor.org/packages/release/bioc/html/DiffBind.html
431 .. _`DiffBind User Guide`: https://bioconductor.org/packages/release/bioc/vignettes/DiffBind/inst/doc/DiffBind.pdf
432 .. _`Bioconductor post`: https://support.bioconductor.org/p/69924/
396 433
397 ]]> 434 ]]>
398 </help> 435 </help>
399 <citations> 436 <citations>
400 <citation type="doi">doi:10.1038/nature10730</citation> 437 <citation type="doi">doi:10.1038/nature10730</citation>