77
|
1 <tool id="rgDifferentialCount" name="Differential_Count" version="0.22">
|
61
|
2 <description>models using BioConductor packages</description>
|
|
3 <requirements>
|
|
4 <requirement type="package" version="2.14">biocbasics</requirement>
|
|
5 <requirement type="package" version="3.0.2">r302</requirement>
|
|
6 <requirement type="package" version="1.3.18">graphicsmagick</requirement>
|
77
|
7 <requirement type="package" version="9.07">ghostscript</requirement>
|
61
|
8 </requirements>
|
|
9
|
|
10 <command interpreter="python">
|
|
11 rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "DifferentialCounts"
|
|
12 --output_dir "$html_file.files_path" --output_html "$html_file" --make_HTML "yes"
|
|
13 </command>
|
|
14 <inputs>
|
|
15 <param name="input1" type="data" format="tabular" label="Select an input matrix - rows are contigs, columns are counts for each sample"
|
|
16 help="Use the HTSeq based count matrix preparation tool to create these matrices from BAM/SAM files and a GTF file of genomic features"/>
|
|
17 <param name="title" type="text" value="Differential Counts" size="80" label="Title for job outputs"
|
|
18 help="Supply a meaningful name here to remind you what the outputs contain">
|
|
19 <sanitizer invalid_char="">
|
|
20 <valid initial="string.letters,string.digits"><add value="_" /> </valid>
|
|
21 </sanitizer>
|
|
22 </param>
|
|
23 <param name="treatment_name" type="text" value="Treatment" size="50" label="Treatment Name"/>
|
|
24 <param name="Treat_cols" label="Select columns containing treatment." type="data_column" data_ref="input1" numerical="True"
|
|
25 multiple="true" use_header_names="true" size="120" display="checkboxes">
|
|
26 <validator type="no_options" message="Please select at least one column."/>
|
|
27 </param>
|
|
28 <param name="control_name" type="text" value="Control" size="50" label="Control Name"/>
|
|
29 <param name="Control_cols" label="Select columns containing control." type="data_column" data_ref="input1" numerical="True"
|
|
30 multiple="true" use_header_names="true" size="120" display="checkboxes" optional="true">
|
|
31 </param>
|
|
32 <param name="subjectids" type="text" optional="true" size="120" value = ""
|
|
33 label="IF SUBJECTS NOT ALL INDEPENDENT! Enter comma separated strings to indicate sample labels for (eg) pairing - must be one for every column in input"
|
|
34 help="Leave blank if no pairing, but eg if data from sample id A99 is in columns 2,4 and id C21 is in 3,5 then enter 'A99,C21,A99,C21'">
|
|
35 <sanitizer>
|
|
36 <valid initial="string.letters,string.digits"><add value="," /> </valid>
|
|
37 </sanitizer>
|
|
38 </param>
|
|
39 <param name="fQ" type="float" value="0.3" size="5" label="Non-differential contig count quantile threshold - zero to analyze all non-zero read count contigs"
|
|
40 help="May be a good or a bad idea depending on the biology and the question. EG 0.3 = sparsest 30% of contigs with at least one read are removed before analysis"/>
|
|
41 <param name="useNDF" type="boolean" truevalue="T" falsevalue="F" checked="false" size="1"
|
|
42 label="Non differential filter - remove contigs below a threshold (1 per million) for half or more samples"
|
|
43 help="May be a good or a bad idea depending on the biology and the question. This was the old default. Quantile based is available as an alternative"/>
|
|
44
|
|
<conditional name="edgeR">
  <!-- Switch for the edgeR model; the options under when value="T" are only shown when edgeR is selected -->
  <param name="doedgeR" type="select"
         label="Run this model using edgeR"
         help="edgeR uses a negative binomial model and seems to be powerful, even with few replicates">
    <option value="F">Do not run edgeR</option>
    <option value="T" selected="true">Run edgeR</option>
  </param>
  <when value="T">
    <param name="edgeR_priordf" type="integer" value="10" size="3"
           label="prior.df for tagwise dispersion - larger value = more squeezing of tag dispersions to common dispersion. Replaces prior.n and prior.df = prior.n * residual.df"
           help="10 = edgeR default. Use a larger value to 'smooth' small samples. See edgeR docs and note below"/>
    <!-- The default for a select comes from the option marked selected="true";
         the stray value="20" size="3" copied from an integer param has been removed -->
    <param name="edgeR_robust_method" type="select"
           label="Use robust dispersion method"
           help="Use ordinary, anscombe or deviance robust deviance estimates">
      <option value="ordinary" selected="true">Use ordinary deviance estimates</option>
      <option value="deviance">Use robust deviance estimates</option>
      <option value="anscombe">use Anscombe robust deviance estimates</option>
    </param>
  </when>
  <when value="F"></when>
</conditional>
|
|
66 <conditional name="DESeq2">
|
|
67 <param name="doDESeq2" type="select"
|
|
68 label="Run the same model with DESeq2 and compare findings"
|
|
69 help="DESeq2 is an update to the DESeq package. It uses different assumptions and methods to edgeR">
|
|
70 <option value="F" selected="true">Do not run DESeq2</option>
|
|
71 <option value="T">Run DESeq2</option>
|
|
72 </param>
|
|
73 <when value="T">
|
|
74 <param name="DESeq_fitType" type="select">
|
|
75 <option value="parametric" selected="true">Parametric (default) fit for dispersions</option>
|
|
76 <option value="local">Local fit - this will automagically be used if parametric fit fails</option>
|
|
77 <option value="mean">Mean dispersion fit- use this if you really understand what you're doing - read the fine manual linked below in the documentation</option>
|
|
78 </param>
|
|
79 </when>
|
|
80 <when value="F"> </when>
|
|
81 </conditional>
|
|
82 <param name="doVoom" type="select"
|
|
83 label="Run the same model with Voom/limma and compare findings"
|
|
84 help="Voom uses counts per million and a precise transformation of variance so count data can be analysed using limma">
|
|
85 <option value="F" selected="true">Do not run VOOM</option>
|
|
86 <option value="T">Run VOOM</option>
|
|
87 </param>
|
|
88 <!--
|
|
89 <conditional name="camera">
|
|
90 <param name="doCamera" type="select" label="Run the edgeR implementation of Camera GSEA for up/down gene sets"
|
|
91 help="If yes, you can choose a set of genesets to test and/or supply a gmt format geneset collection from your history">
|
|
92 <option value="F" selected="true">Do not run GSEA tests with the Camera algorithm</option>
|
|
93 <option value="T">Run GSEA tests with the Camera algorithm</option>
|
|
94 </param>
|
|
95 <when value="T">
|
|
96 <conditional name="gmtSource">
|
|
97 <param name="refgmtSource" type="select"
|
|
98 label="Use a gene set (.gmt) from your history and/or use a built-in (MSigDB etc) gene set">
|
|
99 <option value="indexed" selected="true">Use a built-in gene set</option>
|
|
100 <option value="history">Use a gene set from my history</option>
|
|
101 <option value="both">Add a gene set from my history to a built in gene set</option>
|
|
102 </param>
|
|
103 <when value="indexed">
|
|
104 <param name="builtinGMT" type="select" label="Select a gene set matrix (.gmt) file to use for the analysis">
|
|
105 <options from_data_table="gseaGMT_3.1">
|
|
106 <filter type="sort_by" column="2" />
|
|
107 <validator type="no_options" message="No GMT v3.1 files are available - please install them"/>
|
|
108 </options>
|
|
109 </param>
|
|
110 </when>
|
|
111 <when value="history">
|
|
112 <param name="ownGMT" type="data" format="gmt" label="Select a Gene Set from your history" />
|
|
113 </when>
|
|
114 <when value="both">
|
|
115 <param name="ownGMT" type="data" format="gseagmt" label="Select a Gene Set from your history" />
|
|
116 <param name="builtinGMT" type="select" label="Select a gene set matrix (.gmt) file to use for the analysis">
|
|
117 <options from_data_table="gseaGMT_4">
|
|
118 <filter type="sort_by" column="2" />
|
|
119 <validator type="no_options" message="No GMT v4 files are available - please fix tool_data_table and loc files"/>
|
|
120 </options>
|
|
121 </param>
|
|
122 </when>
|
|
123 </conditional>
|
|
124 </when>
|
|
125 <when value="F">
|
|
126 </when>
|
|
127 </conditional>
|
|
128 -->
|
|
<!-- Adjusted p value cut-off used when reporting significant contigs -->
<param name="fdrthresh" type="float" value="0.05" size="5" label="P value threshold for FDR filtering for family wise error rate control"
       help="Conventional default value of 0.05 recommended"/>
|
|
<!-- Multiple testing correction; each option value is handed to the R script as the p.adjust method name -->
<param name="fdrtype" type="select" label="FDR (Type I error) control method"
       help="Use fdr or bh typically to control for the number of tests in a reliable way">
  <option value="fdr" selected="true">fdr</option>
  <option value="BH">Benjamini Hochberg</option>
  <option value="BY">Benjamini Yekutieli</option>
  <option value="bonferroni">Bonferroni</option>
  <option value="hochberg">Hochberg</option>
  <option value="holm">Holm</option>
  <option value="hommel">Hommel</option>
  <option value="none">no control for multiple tests</option>
</param>
|
|
142 </inputs>
|
|
143 <outputs>
|
|
144 <data format="tabular" name="out_edgeR" label="${title}_topTable_edgeR.xls">
|
|
145 <filter>edgeR['doedgeR'] == "T"</filter>
|
|
146 </data>
|
|
147 <data format="tabular" name="out_DESeq2" label="${title}_topTable_DESeq2.xls">
|
|
148 <filter>DESeq2['doDESeq2'] == "T"</filter>
|
|
149 </data>
|
|
150 <data format="tabular" name="out_VOOM" label="${title}_topTable_VOOM.xls">
|
|
151 <filter>doVoom == "T"</filter>
|
|
152 </data>
|
|
153 <data format="html" name="html_file" label="${title}.html"/>
|
|
154 </outputs>
|
|
155 <stdio>
|
|
156 <exit_code range="4" level="fatal" description="Number of subject ids must match total number of samples in the input matrix" />
|
|
157 </stdio>
|
|
<tests>
  <!-- Single functional test: liver versus heart columns of the bundled count matrix, all three models switched on -->
  <test>
    <param name='input1' value='test_bams2mx.xls' ftype='tabular' />
    <param name='treatment_name' value='liver' />
    <param name='title' value='edgeRtest' />
    <param name='useNDF' value='' />
    <param name='doedgeR' value='T' />
    <param name='doVoom' value='T' />
    <param name='doDESeq2' value='T' />
    <param name='fdrtype' value='fdr' />
    <param name='edgeR_priordf' value="8" />
    <!-- name must match the edgeR_robust_method input declared in the edgeR conditional -->
    <param name='edgeR_robust_method' value="ordinary" />
    <param name='fdrthresh' value="0.05" />
    <param name='control_name' value='heart' />
    <param name='subjectids' value='' />
    <param name='Control_cols' value='3,4,5,9' />
    <param name='Treat_cols' value='2,6,7,8' />
    <output name='out_edgeR' file='edgeRtest1out.xls' compare='diff' />
    <output name='html_file' file='edgeRtest1out.html' compare='diff' lines_diff='20' />
  </test>
</tests>
|
|
179
|
|
180 <configfiles>
|
|
181 <configfile name="runme">
|
|
182 <![CDATA[
|
|
183 #
|
|
184 # edgeR.Rscript
|
77
|
185 # updated feb 2014 adding outlier-robust deviance estimate options by ross for R 3.0.2/bioc 2.13
|
61
|
# updated nov 2011 for R 2.14.0 and edgeR 2.4.0 by ross
|
|
187 # Performs DGE on a count table containing n replicates of two conditions
|
|
188 #
|
|
189 # Parameters
|
|
190 #
|
|
191 # 1 - Output Dir
|
|
192
|
|
193 # Original edgeR code by: S.Lunke and A.Kaspi
|
|
# log10 of the largest and of the smallest positive normalised double;
# qqPlot substitutes these for infinite values before plotting
reallybig <- log10(.Machine\$double.xmax)
reallysmall <- log10(.Machine\$double.xmin)
|
|
196 library('stringr')
|
|
197 library('gplots')
|
|
198 library('edgeR')
|
|
hmap2 = function(cmat,nsamp=100,outpdfname='heatmap2.pdf', TName='Treatment',group=NA,myTitle='title goes here')
{
    # Write a heatmap (gplots heatmap.2) of the leading rows of cmat to a PDF.
    # Values are scaled by row, rows keep the order supplied (Rowv=FALSE) and
    # only the columns get a dendrogram.
    #
    # cmat       - matrix to plot; rows whose row name is NA are dropped
    # nsamp      - maximum number of rows drawn (the first nsamp are kept)
    # outpdfname - name of the PDF file to create
    # TName      - not used; retained so existing calls keep working
    # group      - one label per column of cmat, shown as the column colour bar
    # myTitle    - main title for the plot
    gu = unique(group)
    gn = rownames(cmat)
    if (length(gu) == 2) {
        # two groups: the first one encountered is red, the other blue
        pcols = ifelse(group == gu[1], "#FF0000", "#0000FF")
    } else {
        colours = rainbow(length(gu),start=0,end=4/6)
        pcols = colours[match(group,gu)]
    }
    # remove unlabelled hm rows
    dm = cmat[(! is.na(gn)),]
    nprobes = nrow(dm)
    if (nprobes > nsamp) {
        dm = dm[1:nsamp,]
    }
    # long sample names are cut to 20 characters so the axis labels fit
    colnames(dm) = substr(colnames(dm),1,20)
    pdf(outpdfname)
    # close the device even if heatmap.2 stops with an error, so a failed
    # plot cannot leave the PDF device open
    on.exit(dev.off())
    heatmap.2(dm,main=myTitle,ColSideColors=pcols,col=topo.colors(100),dendrogram="col",key=TRUE,density.info='none',
        Rowv=FALSE,scale='row',trace='none',margins=c(8,8),cexRow=0.4,cexCol=0.5)
}
|
|
226
|
|
hmap = function(cmat,nmeans=4,outpdfname="heatMap.pdf",nsamp=250,TName='Treatment',group=NA,myTitle="Title goes here")
{
    # Write a row-scaled base graphics heatmap of at most the first nsamp rows
    # of cmat to the PDF outpdfname. Each distinct value of group gets its own
    # colour in the column side bar. nmeans and TName are accepted but unused.
    groupLevels <- unique(group)
    groupPalette <- rainbow(length(groupLevels),start=0.3,end=0.6)
    sideColours <- groupPalette[match(group,groupLevels)]
    totalRows <- nrow(cmat)
    if (totalRows > nsamp) {
        # too many rows to draw: keep the leading nsamp and say so in the title
        cmat <- cmat[c(1:nsamp),]
        plotTitle <- paste('Heatmap: Top ',nsamp,' DE contigs (of ',totalRows,')',sep='')
    } else {
        plotTitle <- paste(myTitle,'Heatmap: n contigs =',totalRows)
    }
    # sample names are cut to 20 characters for the axis labels
    colnames(cmat) <- substr(colnames(cmat),1,20)
    pdf(outpdfname)
    heatmap(cmat,scale='row',main=plotTitle,cexRow=0.3,cexCol=0.4,Rowv=NA,ColSideColors=sideColours)
    dev.off()
}
|
|
247
|
|
qqPlot = function(descr='qqplot',pvector, outpdf='qqplot.pdf',...)
# adapted from https://gist.github.com/703512
{
    # Write a QQ plot of observed against expected -log10 p values to the PDF
    # outpdf. descr becomes the plot title and any extra arguments are passed
    # on to plot(). Uses the script level constants reallybig and reallysmall.
    observed <- -log10(sort(pvector,decreasing=F))
    nObserved <- length(observed)
    expected <- -log10( 1:nObserved/nObserved )
    # infinite values cannot be drawn, so swap in the finite stand-ins
    observed[observed==-Inf] <- reallysmall
    observed[observed==Inf] <- reallybig
    pdf(outpdf)
    plot(expected,observed,pch=19,cex=1, main=descr, ...,
        xlab=expression(Expected~~-log[10](italic(p))),
        ylab=expression(Observed~~-log[10](italic(p))),
        xlim=c(0,max(expected)), ylim=c(0,max(observed)))
    # identity line: where the points would fall if observed matched expected
    lines(expected,expected,col="red")
    grid(col = "lightgray", lty = "dotted")
    dev.off()
}
|
|
265
|
|
smearPlot = function(DGEList,deTags, outSmear, outMain)
{
    # Write an edgeR smear plot of DGEList to the PDF outSmear. deTags is
    # handed to plotSmear as de.tags and outMain is used as the plot title.
    pdf(file=outSmear)
    plotSmear(DGEList, main=outMain, de.tags=deTags)
    grid(lty="dotted", col="lightgray")
    dev.off()
}
|
|
273
|
|
boxPlot = function(rawrs,cleanrs,maint,myTitle,pdfname)
{
    # Produce two PDFs: side by side box plots of rawrs and cleanrs in pdfname,
    # and one histogram per column of rawrs in sample_counts_histogram.pdf.
    # Column summaries of both inputs are printed before anything is drawn.
    # Negative entries of rawrs are replaced by NA first.
    # myTitle is accepted but not used.
    nSamples <- ncol(rawrs)
    for (j in c(1:nSamples)) {
        isNegative <- rawrs[,j] < 0
        rawrs[isNegative,j] <- NA
    }
    # keep the untruncated names for the histogram subtitles
    fullnames <- colnames(rawrs)
    colnames(rawrs) <- substr(colnames(rawrs),1,20)
    colnames(cleanrs) <- substr(colnames(cleanrs),1,20)
    savedPar <- par(no.readonly=T)
    print.noquote('raw contig counts by sample:')
    print.noquote(summary(rawrs))
    print.noquote('normalised contig counts by sample:')
    print.noquote(summary(cleanrs))

    # first PDF: the two box plots next to each other
    pdf(pdfname)
    par(mfrow=c(1,2))
    boxplot(rawrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('Raw:',maint))
    grid(col="lightgray",lty="dotted")
    boxplot(cleanrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('After ',maint))
    grid(col="lightgray",lty="dotted")
    dev.off()

    # second PDF: a square grid of per sample histograms on a common y axis
    histName <- "sample_counts_histogram.pdf"
    nSamples <- ncol(rawrs)
    print.noquote(paste('Using ncol rawrs=',nSamples))
    # smallest grid side whose square holds every sample
    gridSide <- ceiling(sqrt(nSamples))
    tallestBin <- max(sapply(c(1:nSamples), function(j) {
        max(hist(rawrs[,j],breaks=100,plot=F)\$counts)
    }))
    nNames <- length(fullnames)
    if (nNames > 20) {
        # enlarge the page in proportion once there are more than 20 samples
        pageSide <- 7*nNames/20
        pdf(histName,width=pageSide,height=pageSide)
    } else {
        pdf(histName)
    }
    par(mfrow=c(gridSide,gridSide))
    for (j in c(1:nSamples)) {
        hist(rawrs[,j], main=paste("Contig logcount",j), xlab='log raw count', col="maroon",
            breaks=100,sub=fullnames[j],cex=0.8,ylim=c(0,tallestBin))
    }
    dev.off()
    par(savedPar)
}
|
|
323
|
|
cumPlot = function(rawrs,cleanrs,maint,myTitle)
{
    # Stack two histograms of log10 values in Filtering_rowsum_bar_charts.pdf:
    # rawrs on top (Before) and cleanrs underneath (After). Both panels use an
    # x axis from 0 to the largest log10 value found in rawrs. maint is added
    # to each panel title and myTitle is used as the subtitle.
    savedPar <- par(no.readonly=T)
    logRaw <- log(rawrs,10)
    xTop <- max(logRaw)
    drawPanel <- function(logSums, stage) {
        hist(logSums,breaks=100,main=paste(stage,maint),xlab="# Reads (log)",
            ylab="Count",col="maroon",sub=myTitle, xlim=c(0,xTop),las=1)
        grid(col="lightgray", lty="dotted")
    }
    pdf("Filtering_rowsum_bar_charts.pdf")
    par(mfrow=c(2,1))
    drawPanel(logRaw, 'Before:')
    drawPanel(log(cleanrs,10), 'After:')
    dev.off()
    par(savedPar)
}
|
|
342
|
|
cumPlot1 = function(rawrs,cleanrs,maint,myTitle)
{
    # Empirical cumulative distribution curves for rawrs (Before) and cleanrs
    # (After), stacked in one PDF. The file name is myTitle with its spaces
    # removed followed by _RowsumCum.pdf. Both panels use a log scaled x axis
    # running from 1 to the largest value in rawrs.
    outName <- paste(gsub(" ","", myTitle , fixed=TRUE),"RowsumCum.pdf",sep='_')
    pdf(outName)
    par(mfrow=c(2,1))
    xMax <- max(rawrs)
    rawSteps <- knots(ecdf(rawrs))
    cleanSteps <- knots(ecdf(cleanrs))
    drawCurve <- function(steps, stage) {
        # the k-th distinct value is plotted at height k/n
        cumProp <- 1:length(steps)/length(steps)
        plot(steps,cumProp,type='l',main=paste(stage,maint),xlab="Log Contig Total Reads",
            ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,xMax),sub=myTitle)
        grid(col="blue")
    }
    drawCurve(rawSteps, 'Before')
    drawCurve(cleanSteps, 'After')
    dev.off()
}
|
|
361
|
|
362
|
|
363
|
|
doGSEAold = function(y=NULL,design=NULL,histgmt="",
    bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
    ntest=0, myTitle="myTitle", outfname="GSEA.xls", minnin=5, maxnin=2000,fdrthresh=0.05,fdrtype="BH")
{
    # Older gene set test driver: calls camera() once per gene set.
    #
    # y         - object handed to camera(); its row names are matched, upper
    #             cased, against the gene symbols of each set
    # design    - design matrix handed to camera()
    # histgmt   - path of a tab separated gene set file, or "" for none
    # bigmt     - path of a second tab separated gene set file, or "" for none
    # ntest     - ignored; replaced by the number of gene sets read
    # myTitle   - text for the title line written to outfname
    # outfname  - name of the title file and suffix of the two result tables
    # minnin, maxnin - a set is tested only when the number of matching rows
    #             lies strictly between these two values
    # fdrthresh - adjusted p value cut-off for the number of sets echoed
    # fdrtype   - method name passed to p.adjust
    #
    # Writes camera_up_<outfname> and camera_down_<outfname>; everything
    # printed while the function runs goes to Camera.log.
    sink('Camera.log')
    # restore normal output even when a later step stops with an error
    on.exit(sink())
    genesets = c()
    if (bigmt > "")
    {
        genesets = readLines(bigmt)
    }
    if (histgmt > "")
    {
        # c() appends the history sets; rbind() would build a matrix and
        # recycle the shorter file, testing some sets more than once
        genesets = c(genesets,readLines(histgmt))
    }
    print.noquote(paste("@@@read",length(genesets), 'genesets from',histgmt,bigmt))
    # tabular. genesetid\tURLorwhatever\tgene_1\t..\tgene_n
    genesets = strsplit(as.character(genesets),'\t')
    write(paste(myTitle,'edgeR GSEA'),file=outfname,append=F)
    ntest = length(genesets)
    urownames = toupper(rownames(y))
    upcam = c()
    downcam = c()
    # seq_len copes with zero gene sets, where 1:ntest would count down
    for (i in seq_len(ntest)) {
        gs = unlist(genesets[i])
        g = gs[1] # geneset_id
        u = gs[2]
        # a line with fewer than two fields has no URL column at all
        if (!is.na(u) && u > "") { u = paste("<a href=\'",u,"\'>",u,"</a>",sep="") }
        glist = toupper(gs[-(1:2)]) # member gene symbols
        inglist = urownames %in% glist
        nin = sum(inglist)
        if ((nin > minnin) && (nin < maxnin)) {
            camres = camera(y=y,index=inglist,design=design)
            if (! is.null(camres)) {
                rownames(camres) = g # gene set name
                camres = cbind(GeneSet=g,URL=u,camres)
                if (camres\$Direction == "Up")
                {
                    upcam = rbind(upcam,camres)
                } else {
                    downcam = rbind(downcam,camres)
                }
            }
        }
    }
    # sort one direction by p value and add the adjusted p values
    rankSets = function(cam) {
        if (is.null(cam)) { return(data.frame()) }
        cam = cam[order(cam\$PValue),]
        cam\$adjPValue = p.adjust(cam\$PValue,method=fdrtype)
        cam
    }
    uscam = rankSets(upcam)
    nup = max(10,sum((uscam\$adjPValue < fdrthresh)))
    dscam = rankSets(downcam)
    ndown = max(10,sum((dscam\$adjPValue < fdrthresh)))
    write.table(uscam,file=paste('camera_up',outfname,sep='_'),quote=F,sep='\t',row.names=F)
    write.table(dscam,file=paste('camera_down',outfname,sep='_'),quote=F,sep='\t',row.names=F)
    print.noquote(paste('@@@@@ Camera up top',nup,'gene sets:'))
    write.table(head(uscam,nup),file="",quote=F,sep='\t',row.names=F)
    print.noquote(paste('@@@@@ Camera down top',ndown,'gene sets:'))
    write.table(head(dscam,ndown),file="",quote=F,sep='\t',row.names=F)
}
|
|
432
|
|
433
|
|
434
|
|
435
|
|
doGSEA = function(y=NULL,design=NULL,histgmt="",
    bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
    ntest=0, myTitle="myTitle", outfname="GSEA.xls", minnin=5, maxnin=2000,fdrthresh=0.05,fdrtype="BH")
{
    # Gene set test driver: collects every eligible gene set and tests them
    # all with a single camera() call.
    #
    # y         - object handed to camera(); its row names are matched, upper
    #             cased, against the gene symbols of each set
    # design    - design matrix handed to camera()
    # histgmt   - path of a tab separated gene set file, or "" for none
    # bigmt     - path of a second tab separated gene set file, or "" for none
    # ntest     - ignored; replaced by the number of gene sets read
    # myTitle   - text for the title line written to outfname
    # outfname  - name of the title file and suffix of the two result tables
    # minnin, maxnin - a set is tested only when the number of matching rows
    #             lies strictly between these two values
    # fdrthresh - adjusted p value cut-off for the number of sets echoed
    # fdrtype   - method name passed to p.adjust
    #
    # Writes camera_up_<outfname> and camera_down_<outfname>; everything
    # printed while the function runs goes to Camera.log.
    sink('Camera.log')
    # restore normal output even when a later step stops with an error
    on.exit(sink())
    genesets = c()
    if (bigmt > "")
    {
        genesets = readLines(bigmt)
    }
    if (histgmt > "")
    {
        # c() appends the history sets; rbind() would build a matrix and
        # recycle the shorter file, testing some sets more than once
        genesets = c(genesets,readLines(histgmt))
    }
    print.noquote(paste("@@@read",length(genesets), 'genesets from',histgmt,bigmt))
    # tabular. genesetid\tURLorwhatever\tgene_1\t..\tgene_n
    genesets = strsplit(as.character(genesets),'\t')
    write(paste(myTitle,'edgeR GSEA'),file=outfname,append=F)
    ntest = length(genesets)
    urownames = toupper(rownames(y))
    incam = list()
    urls = c()
    gsids = c()
    for (i in seq_len(ntest)) {
        gs = unlist(genesets[i])
        gsid = gs[1] # geneset_id
        url = gs[2]
        # a line with fewer than two fields has no URL column at all
        if (!is.na(url) && url > "") { url = paste("<a href=\'",url,"\'>",url,"</a>",sep="") }
        glist = toupper(gs[-(1:2)]) # member gene symbols
        inglist = urownames %in% glist
        nin = sum(inglist)
        if ((nin > minnin) && (nin < maxnin)) {
            # one list element per gene set; c(incam,inglist) would flatten
            # the index vectors into single logical values
            incam = c(incam,list(inglist))
            gsids = c(gsids,gsid)
            urls = c(urls,url)
        }
    }
    if (length(incam) == 0) {
        print.noquote('No gene sets passed the size filter - nothing to test')
        return(invisible(NULL))
    }
    names(incam) = gsids
    allcam = camera(y=y,index=incam,design=design)
    # camera may return its rows in a different order from incam, so URLs are
    # looked up through the gene set names carried in the row names
    testedids = rownames(allcam)
    allcamres = cbind(geneset=testedids,allcam,URL=urls[match(testedids,gsids)])
    isUp = (allcamres\$Direction == "Up")
    upcam = allcamres[isUp,]
    downcam = allcamres[!isUp,]
    # sort one direction by p value and add the adjusted p values
    rankSets = function(cam) {
        cam = cam[order(cam\$PValue),]
        cam\$adjPValue = p.adjust(cam\$PValue,method=fdrtype)
        cam
    }
    uscam = rankSets(upcam)
    nup = max(10,sum((uscam\$adjPValue < fdrthresh)))
    dscam = rankSets(downcam)
    ndown = max(10,sum((dscam\$adjPValue < fdrthresh)))
    write.table(uscam,file=paste('camera_up',outfname,sep='_'),quote=F,sep='\t',row.names=F)
    write.table(dscam,file=paste('camera_down',outfname,sep='_'),quote=F,sep='\t',row.names=F)
    print.noquote(paste('@@@@@ Camera up top',nup,'gene sets:'))
    write.table(head(uscam,nup),file="",quote=F,sep='\t',row.names=F)
    print.noquote(paste('@@@@@ Camera down top',ndown,'gene sets:'))
    write.table(head(dscam,ndown),file="",quote=F,sep='\t',row.names=F)
}
|
|
516
|
|
517
|
|
518 edgeIt = function (Count_Matrix=c(),group=c(),out_edgeR=F,out_VOOM=F,out_DESeq2=F,fdrtype='fdr',priordf=5,
|
|
519 fdrthresh=0.05,outputdir='.', myTitle='Differential Counts',libSize=c(),useNDF=F,
|
|
520 filterquantile=0.2, subjects=c(),mydesign=NULL,
|
|
521 doDESeq2=T,doVoom=T,doCamera=T,doedgeR=T,org='hg19',
|
|
522 histgmt="", bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
|
81
|
523 doCook=F,DESeq_fitType="parameteric",robust_meth='ordinary')
|
61
|
524 {
|
|
525 # Error handling
|
|
526 if (length(unique(group))!=2){
|
|
527 print("Number of conditions identified in experiment does not equal 2")
|
|
528 q()
|
|
529 }
|
|
530 require(edgeR)
|
|
531 options(width = 512)
|
|
532 mt = paste(unlist(strsplit(myTitle,'_')),collapse=" ")
|
|
533 allN = nrow(Count_Matrix)
|
|
534 nscut = round(ncol(Count_Matrix)/2)
|
|
535 colTotmillionreads = colSums(Count_Matrix)/1e6
|
|
536 counts.dataframe = as.data.frame(c())
|
|
537 rawrs = rowSums(Count_Matrix)
|
|
538 nonzerod = Count_Matrix[(rawrs > 0),] # remove all zero count genes
|
|
539 nzN = nrow(nonzerod)
|
|
540 nzrs = rowSums(nonzerod)
|
|
541 zN = allN - nzN
|
|
542 print('# Quantiles for non-zero row counts:',quote=F)
|
|
543 print(quantile(nzrs,probs=seq(0,1,0.1)),quote=F)
|
|
544 if (useNDF == T)
|
|
545 {
|
|
546 gt1rpin3 = rowSums(Count_Matrix/expandAsMatrix(colTotmillionreads,dim(Count_Matrix)) >= 1) >= nscut
|
|
547 lo = colSums(Count_Matrix[!gt1rpin3,])
|
|
548 workCM = Count_Matrix[gt1rpin3,]
|
|
549 cleanrs = rowSums(workCM)
|
|
550 cleanN = length(cleanrs)
|
|
551 meth = paste( "After removing",length(lo),"contigs with fewer than ",nscut," sample read counts >= 1 per million, there are",sep="")
|
|
552 print(paste("Read",allN,"contigs. Removed",zN,"contigs with no reads.",meth,cleanN,"contigs"),quote=F)
|
|
553 maint = paste('Filter >=1/million reads in >=',nscut,'samples')
|
|
554 } else {
|
|
555 useme = (nzrs > quantile(nzrs,filterquantile))
|
|
556 workCM = nonzerod[useme,]
|
|
557 lo = colSums(nonzerod[!useme,])
|
|
558 cleanrs = rowSums(workCM)
|
|
559 cleanN = length(cleanrs)
|
|
560 meth = paste("After filtering at count quantile =",filterquantile,", there are",sep="")
|
|
561 print(paste('Read',allN,"contigs. Removed",zN,"with no reads.",meth,cleanN,"contigs"),quote=F)
|
|
562 maint = paste('Filter below',filterquantile,'quantile')
|
|
563 }
|
|
564 cumPlot(rawrs=rawrs,cleanrs=cleanrs,maint=maint,myTitle=myTitle)
|
|
565 allgenes = rownames(workCM)
|
|
566 reg = "^chr([0-9]+):([0-9]+)-([0-9]+)"
|
|
567 genecards="<a href=\'http://www.genecards.org/index.php?path=/Search/keyword/"
|
|
568 ucsc = paste("<a href=\'http://genome.ucsc.edu/cgi-bin/hgTracks?db=",org,sep='')
|
|
569 testreg = str_match(allgenes,reg)
|
|
570 if (sum(!is.na(testreg[,1]))/length(testreg[,1]) > 0.8) # is ucsc style string
|
|
571 {
|
|
572 print("@@ using ucsc substitution for urls")
|
|
573 contigurls = paste0(ucsc,"&position=chr",testreg[,2],":",testreg[,3],"-",testreg[,4],"\'>",allgenes,"</a>")
|
|
574 } else {
|
|
575 print("@@ using genecards substitution for urls")
|
|
576 contigurls = paste0(genecards,allgenes,"\'>",allgenes,"</a>")
|
|
577 }
|
|
578 print.noquote("# urls")
|
|
579 print.noquote(head(contigurls))
|
|
580 print(paste("# Total low count contigs per sample = ",paste(lo,collapse=',')),quote=F)
|
|
581 cmrowsums = rowSums(workCM)
|
|
582 TName=unique(group)[1]
|
|
583 CName=unique(group)[2]
|
|
584 if (is.null(mydesign)) {
|
|
585 if (length(subjects) == 0)
|
|
586 {
|
|
587 mydesign = model.matrix(~group)
|
|
588 }
|
|
589 else {
|
|
590 subjf = factor(subjects)
|
|
591 mydesign = model.matrix(~subjf+group) # we block on subject so make group last to simplify finding it
|
|
592 }
|
|
593 }
|
|
594 print.noquote(paste('Using samples:',paste(colnames(workCM),collapse=',')))
|
|
595 print.noquote('Using design matrix:')
|
|
596 print.noquote(mydesign)
|
80
|
597 if (doedgeR == T) {
|
61
|
598 sink('edgeR.log')
|
|
599 #### Setup DGEList object
|
|
600 DGEList = DGEList(counts=workCM, group = group)
|
|
601 DGEList = calcNormFactors(DGEList)
|
77
|
602 if (robust_meth == 'ordinary') {
|
|
603 DGEList = estimateGLMCommonDisp(DGEList,mydesign)
|
|
604 DGEList = estimateGLMTrendedDisp(DGEList,mydesign)
|
|
605 DGEList = estimateGLMTagwiseDisp(DGEList,mydesign,prior.df = edgeR_priordf)
|
61
|
606
|
77
|
607 comdisp = DGEList\$common.dispersion
|
|
608 estpriorn = getPriorN(DGEList)
|
|
609 print(paste("Common Dispersion =",comdisp,"CV = ",sqrt(comdisp),"getPriorN = ",estpriorn),quote=F)
|
|
610 } else {
|
80
|
611 DGEList = estimateGLMRobustDisp(DGEList,design=mydesign, prior.df = edgeR_priordf, maxit = 6, residual.type = robust_meth)
|
77
|
612 }
|
80
|
613
|
|
614
|
61
|
615 DGLM = glmFit(DGEList,design=mydesign)
|
|
616 DE = glmLRT(DGLM,coef=ncol(DGLM\$design)) # always last one - subject is first if needed
|
|
617 efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors
|
|
618 normData = (1e+06*DGEList\$counts/efflib)
|
|
619 uoutput = cbind(
|
|
620 Name=as.character(rownames(DGEList\$counts)),
|
|
621 DE\$table,
|
|
622 adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
|
|
623 Dispersion=DGEList\$tagwise.dispersion,totreads=cmrowsums,normData,
|
|
624 DGEList\$counts
|
|
625 )
|
|
626 soutput = uoutput[order(DE\$table\$PValue),] # sorted into p value order - for quick toptable
|
|
627 goodness = gof(DGLM, pcutoff=fdrthresh)
|
|
628 if (sum(goodness\$outlier) > 0) {
|
|
629 print.noquote('GLM outliers:')
|
|
630 print(paste(rownames(DGLM)[(goodness\$outlier)],collapse=','),quote=F)
|
|
631 } else {
|
|
632 print('No GLM fit outlier genes found\n')
|
|
633 }
|
|
634 z = limma::zscoreGamma(goodness\$gof.statistic, shape=goodness\$df/2, scale=2)
|
|
635 pdf("edgeR_GoodnessofFit.pdf")
|
|
636 qq = qqnorm(z, panel.first=grid(), main="tagwise dispersion")
|
|
637 abline(0,1,lwd=3)
|
|
638 points(qq\$x[goodness\$outlier],qq\$y[goodness\$outlier], pch=16, col="maroon")
|
|
639 dev.off()
|
|
640 efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors
|
|
641 normData = (1e+06*DGEList\$counts/efflib)
|
|
642 uniqueg = unique(group)
|
|
643 #### Plot MDS
|
|
644 sample_colors = match(group,levels(group))
|
|
645 sampleTypes = levels(factor(group))
|
|
646 print.noquote(sampleTypes)
|
|
647 pdf("edgeR_MDSplot.pdf")
|
|
648 plotMDS.DGEList(DGEList,main=paste("edgeR MDS for",myTitle),cex=0.5,col=sample_colors,pch=sample_colors)
|
|
649 legend(x="topleft", legend = sampleTypes,col=c(1:length(sampleTypes)), pch=19)
|
|
650 grid(col="blue")
|
|
651 dev.off()
|
|
652 colnames(normData) = paste( colnames(normData),'N',sep="_")
|
|
653 print(paste('Raw sample read totals',paste(colSums(nonzerod,na.rm=T),collapse=',')))
|
|
654 nzd = data.frame(log(nonzerod + 1e-2,10))
|
|
655 try( boxPlot(rawrs=nzd,cleanrs=log(normData,10),maint='TMM Normalisation',myTitle=myTitle,pdfname="edgeR_raw_norm_counts_box.pdf") )
|
|
656 write.table(soutput,file=out_edgeR, quote=FALSE, sep="\t",row.names=F)
|
|
657 tt = cbind(
|
|
658 Name=as.character(rownames(DGEList\$counts)),
|
|
659 DE\$table,
|
|
660 adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
|
|
661 Dispersion=DGEList\$tagwise.dispersion,totreads=cmrowsums
|
|
662 )
|
|
663 print.noquote("# edgeR Top tags\n")
|
|
664 tt = cbind(tt,URL=contigurls) # add to end so table isn't laid out strangely
|
|
665 tt = tt[order(DE\$table\$PValue),]
|
|
666 print.noquote(tt[1:50,])
|
|
667 deTags = rownames(uoutput[uoutput\$adj.p.value < fdrthresh,])
|
|
668 nsig = length(deTags)
|
|
669 print(paste('#',nsig,'tags significant at adj p=',fdrthresh),quote=F)
|
|
670 deColours = ifelse(deTags,'red','black')
|
|
671 pdf("edgeR_BCV_vs_abundance.pdf")
|
|
672 plotBCV(DGEList, cex=0.3, main="Biological CV vs abundance")
|
|
673 dev.off()
|
|
674 dg = DGEList[order(DE\$table\$PValue),]
|
|
675 #normData = (1e+06 * dg\$counts/expandAsMatrix(dg\$samples\$lib.size, dim(dg)))
|
|
676 efflib = dg\$samples\$lib.size*dg\$samples\$norm.factors
|
|
677 normData = (1e+06*dg\$counts/efflib)
|
|
678 outpdfname="edgeR_top_100_heatmap.pdf"
|
|
679 hmap2(normData,nsamp=100,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste('edgeR Heatmap',myTitle))
|
|
680 outSmear = "edgeR_smearplot.pdf"
|
|
681 outMain = paste("Smear Plot for ",TName,' Vs ',CName,' (FDR@',fdrthresh,' N = ',nsig,')',sep='')
|
|
682 smearPlot(DGEList=DGEList,deTags=deTags, outSmear=outSmear, outMain = outMain)
|
|
683 qqPlot(descr=paste(myTitle,'edgeR adj p QQ plot'),pvector=tt\$adj.p.value,outpdf='edgeR_qqplot.pdf')
|
|
684 norm.factor = DGEList\$samples\$norm.factors
|
|
685 topresults.edgeR = soutput[which(soutput\$adj.p.value < fdrthresh), ]
|
|
686 edgeRcountsindex = which(allgenes %in% rownames(topresults.edgeR))
|
|
687 edgeRcounts = rep(0, length(allgenes))
|
|
688 edgeRcounts[edgeRcountsindex] = 1 # Create venn diagram of hits
|
|
689 sink()
|
|
690 } ### doedgeR
|
|
691 if (doDESeq2 == T)
|
|
692 {
|
|
693 sink("DESeq2.log")
|
|
694 # DESeq2
|
|
695 require('DESeq2')
|
|
696 library('RColorBrewer')
|
|
697 if (length(subjects) == 0)
|
|
698 {
|
|
699 pdata = data.frame(Name=colnames(workCM),Rx=group,row.names=colnames(workCM))
|
|
700 deSEQds = DESeqDataSetFromMatrix(countData = workCM, colData = pdata, design = formula(~ Rx))
|
|
701 } else {
|
|
702 pdata = data.frame(Name=colnames(workCM),Rx=group,subjects=subjects,row.names=colnames(workCM))
|
|
703 deSEQds = DESeqDataSetFromMatrix(countData = workCM, colData = pdata, design = formula(~ subjects + Rx))
|
|
704 }
|
|
705 #DESeq2 = DESeq(deSEQds,fitType='local',pAdjustMethod=fdrtype)
|
|
706 #rDESeq = results(DESeq2)
|
|
707 #newCountDataSet(workCM, group)
|
|
708 deSeqDatsizefac = estimateSizeFactors(deSEQds)
|
|
709 deSeqDatdisp = estimateDispersions(deSeqDatsizefac,fitType=DESeq_fitType)
|
77
|
710 resDESeq = nbinomWaldTest(deSeqDatdisp, pAdjustMethod=fdrtype)
|
61
|
711 rDESeq = as.data.frame(results(resDESeq))
|
|
712 rDESeq = cbind(Contig=rownames(workCM),rDESeq,NReads=cmrowsums,URL=contigurls)
|
|
713 srDESeq = rDESeq[order(rDESeq\$pvalue),]
|
|
714 qqPlot(descr=paste(myTitle,'DESeq2 adj p qq plot'),pvector=rDESeq\$padj,outpdf='DESeq2_qqplot.pdf')
|
|
715 cat("# DESeq top 50\n")
|
|
716 print.noquote(srDESeq[1:50,])
|
|
717 write.table(srDESeq,file=out_DESeq2, quote=FALSE, sep="\t",row.names=F)
|
|
718 topresults.DESeq = rDESeq[which(rDESeq\$padj < fdrthresh), ]
|
|
719 DESeqcountsindex = which(allgenes %in% rownames(topresults.DESeq))
|
|
720 DESeqcounts = rep(0, length(allgenes))
|
|
721 DESeqcounts[DESeqcountsindex] = 1
|
|
722 pdf("DESeq2_dispersion_estimates.pdf")
|
|
723 plotDispEsts(resDESeq)
|
|
724 dev.off()
|
|
725 ysmall = abs(min(rDESeq\$log2FoldChange))
|
|
726 ybig = abs(max(rDESeq\$log2FoldChange))
|
|
727 ylimit = min(4,ysmall,ybig)
|
|
728 pdf("DESeq2_MA_plot.pdf")
|
|
729 plotMA(resDESeq,main=paste(myTitle,"DESeq2 MA plot"),ylim=c(-ylimit,ylimit))
|
|
730 dev.off()
|
|
731 rlogres = rlogTransformation(resDESeq)
|
|
732 sampledists = dist( t( assay(rlogres) ) )
|
|
733 sdmat = as.matrix(sampledists)
|
|
734 pdf("DESeq2_sample_distance_plot.pdf")
|
|
735 heatmap.2(sdmat,trace="none",main=paste(myTitle,"DESeq2 sample distances"),
|
|
736 col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(255))
|
|
737 dev.off()
|
|
738 ###outpdfname="DESeq2_top50_heatmap.pdf"
|
|
739 ###hmap2(sresDESeq,nsamp=50,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste('DESeq2 vst rlog Heatmap',myTitle))
|
|
740 sink()
|
|
741 result = try( (ppca = plotPCA( varianceStabilizingTransformation(deSeqDatdisp,blind=T), intgroup=c("Rx","Name")) ) )
|
|
742 if ("try-error" %in% class(result)) {
|
|
743 print.noquote('DESeq2 plotPCA failed.')
|
|
744 } else {
|
|
745 pdf("DESeq2_PCA_plot.pdf")
|
|
746 #### wtf - print? Seems needed to get this to work
|
|
747 print(ppca)
|
|
748 dev.off()
|
|
749 }
|
|
750 }
|
|
751
|
|
752 if (doVoom == T) {
|
|
753 sink('VOOM.log')
|
|
754 if (doedgeR == F) {
|
|
755 #### Setup DGEList object
|
|
756 DGEList = DGEList(counts=workCM, group = group)
|
|
757 DGEList = estimateGLMCommonDisp(DGEList,mydesign)
|
|
758 DGEList = estimateGLMTrendedDisp(DGEList,mydesign)
|
|
759 DGEList = estimateGLMTagwiseDisp(DGEList,mydesign)
|
|
760 }
|
83
|
761 calcNormFactors(DGEList)
|
|
762 ls = colSums(DGEList\$counts) * DGEList\$samples\$norm.factors
|
61
|
763 pdf("VOOM_mean_variance_plot.pdf")
|
82
|
764 dat.voomed = voom(DGEList, mydesign, plot = TRUE, lib.size = ls)
|
61
|
765 dev.off()
|
|
766 # Use limma to fit data
|
|
767 fit = lmFit(dat.voomed, mydesign)
|
|
768 fit = eBayes(fit)
|
|
769 rvoom = topTable(fit, coef = length(colnames(mydesign)), adj = fdrtype, n = Inf, sort="none")
|
|
770 qqPlot(descr=paste(myTitle,'VOOM-limma adj p QQ plot'),pvector=rvoom\$adj.P.Val,outpdf='VOOM_qqplot.pdf')
|
|
771 rownames(rvoom) = rownames(workCM)
|
|
772 rvoom = cbind(rvoom,NReads=cmrowsums,URL=contigurls)
|
|
773 srvoom = rvoom[order(rvoom\$P.Value),]
|
|
774 cat("# VOOM top 50\n")
|
|
775 print(srvoom[1:50,])
|
|
776 write.table(srvoom,file=out_VOOM, quote=FALSE, sep="\t",row.names=F)
|
|
777 # Use an FDR cutoff to find interesting samples for edgeR, DESeq and voom/limma
|
|
778 topresults.voom = rvoom[which(rvoom\$adj.P.Val < fdrthresh), ]
|
|
779 voomcountsindex = which(allgenes %in% topresults.voom\$ID)
|
|
780 voomcounts = rep(0, length(allgenes))
|
|
781 voomcounts[voomcountsindex] = 1
|
|
782 sink()
|
|
783 }
|
|
784
|
|
785 if (doCamera) {
|
|
786 doGSEA(y=DGEList,design=mydesign,histgmt=histgmt,bigmt=bigmt,ntest=20,myTitle=myTitle,
|
|
787 outfname=paste(mt,"GSEA.xls",sep="_"),fdrthresh=fdrthresh,fdrtype=fdrtype)
|
|
788 }
|
|
789
|
|
790 if ((doDESeq2==T) || (doVoom==T) || (doedgeR==T)) {
|
|
791 if ((doVoom==T) && (doDESeq2==T) && (doedgeR==T)) {
|
|
792 vennmain = paste(mt,'Voom,edgeR and DESeq2 overlap at FDR=',fdrthresh)
|
|
793 counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts,
|
|
794 VOOM_limma = voomcounts, row.names = allgenes)
|
|
795 } else if ((doDESeq2==T) && (doedgeR==T)) {
|
|
796 vennmain = paste(mt,'DESeq2 and edgeR overlap at FDR=',fdrthresh)
|
|
797 counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts, row.names = allgenes)
|
|
798 } else if ((doVoom==T) && (doedgeR==T)) {
|
|
799 vennmain = paste(mt,'Voom and edgeR overlap at FDR=',fdrthresh)
|
|
800 counts.dataframe = data.frame(edgeR = edgeRcounts, VOOM_limma = voomcounts, row.names = allgenes)
|
|
801 }
|
|
802
|
|
803 if (nrow(counts.dataframe > 1)) {
|
|
804 counts.venn = vennCounts(counts.dataframe)
|
|
805 vennf = "Venn_significant_genes_overlap.pdf"
|
|
806 pdf(vennf)
|
|
807 vennDiagram(counts.venn,main=vennmain,col="maroon")
|
|
808 dev.off()
|
|
809 }
|
|
810 } #### doDESeq2 or doVoom
|
|
811
|
|
812 }
|
|
813 #### Done
|
|
814
|
|
815 ###sink(stdout(),append=T,type="message")
|
|
816 builtin_gmt = "" # Script settings start here; values with a dollar placeholder are substituted by Galaxy before R runs. Passed to edgeIt as bigmt, empty here
|
|
817 history_gmt = "" # passed to edgeIt as histgmt, empty here
|
|
818 history_gmt_name = "" # not referenced again in the visible part of this script
|
|
819 out_edgeR = F # placeholder until the edgeR section below supplies an output path
|
|
820 out_DESeq2 = F # placeholder until the DESeq2 section below supplies an output path
|
|
821 out_VOOM = "$out_VOOM" # NOTE(review): set unconditionally, unlike the two outputs above; the VOOM conditional below assigns the same value again
|
78
|
822 edgeR_robust_meth = "ordinary" # control robust deviance options; default, replaced in the edgeR section below
|
80
|
823 doDESeq2 = $DESeq2.doDESeq2 # method switch taken from the form; presumably renders as T or F - confirm
|
61
|
824 doVoom = $doVoom # method switch taken from the form; presumably renders as T or F - confirm
|
|
825 doCamera = F # camera gene set testing is switched off here
|
|
826 doedgeR = $edgeR.doedgeR # method switch taken from the form; presumably renders as T or F - confirm
|
77
|
827 edgeR_priordf = 10 # default prior df, replaced in the edgeR section below
|
61
|
828
|
|
829
|
|
830 #if $doVoom == "T":
|
|
831 out_VOOM = "$out_VOOM" # output path for the VOOM results table
|
|
832 #end if
|
|
833
|
|
834 #if $DESeq2.doDESeq2 == "T":
|
|
835 out_DESeq2 = "$out_DESeq2" # output path for the DESeq2 results table
|
79
|
836 doDESeq2 = T # NOTE(review): looks redundant - doDESeq2 was already set from the same form value above
|
61
|
837 DESeq_fitType = "$DESeq2.DESeq_fitType" # NOTE(review): only defined when DESeq2 is selected, yet always passed to edgeIt at the end of the script - confirm this is safe
|
|
838 #end if
|
|
839
|
|
840 #if $edgeR.doedgeR == "T":
|
|
841 out_edgeR = "$out_edgeR" # output path for the edgeR results table
|
|
842 edgeR_priordf = $edgeR.edgeR_priordf # user supplied prior df replaces the default of 10
|
81
|
843 edgeR_robust_meth = "$edgeR.edgeR_robust_method" # user supplied robust method replaces the default
|
61
|
844 #end if
|
|
845
|
|
846
|
|
847 if (sum(c(doedgeR,doVoom,doDESeq2)) == 0) # Guard: stop early when none of the three method switches is set
|
|
848 {
|
|
849 write("No methods chosen - nothing to do! Please try again after choosing one or more methods", stderr())
|
|
850 quit(save="no",status=2) # leave R with exit status 2 and without saving the workspace
|
|
851 }
|
|
852
|
|
853 Out_Dir = "$html_file.files_path" # Collect the job settings substituted in from the tool form; this one is the directory for the HTML report files
|
|
854 Input = "$input1" # path of the count matrix, read with read.table further down
|
|
855 TreatmentName = "$treatment_name"
|
|
856 TreatmentCols = "$Treat_cols" # comma separated column numbers, split and converted further down
|
|
857 ControlName = "$control_name"
|
|
858 ControlCols= "$Control_cols" # comma separated column numbers, split and converted further down
|
|
859 org = "$input1.dbkey"
|
|
860 if (org == "") { org = "hg19"} # fall back to hg19 when the substituted value is empty
|
|
861 fdrtype = "$fdrtype" # multiple testing adjustment method chosen on the form
|
|
862 fdrthresh = $fdrthresh # adjusted p value cutoff, passed to edgeIt
|
|
863 useNDF = $useNDF # NOTE(review): the edgeIt call at the end of this script passes useNDF=F rather than this value - confirm intent
|
|
864 fQ = $fQ # non-differential centile cutoff
|
|
865 myTitle = "$title"
|
|
866 sids = strsplit("$subjectids",',') # optional comma separated subject ids
|
|
867 subjects = unlist(sids)
|
|
868 nsubj = length(subjects) # number of subject ids supplied
|
|
869 TCols = as.numeric(strsplit(TreatmentCols,",")[[1]])-1 # shifted down by one, presumably because read.table below uses the first file column as row names - confirm
|
|
870 CCols = as.numeric(strsplit(ControlCols,",")[[1]])-1 # same shift as for the treatment columns
|
|
871 cat('Got TCols=') # echo the parsed column indices to the job output
|
|
872 cat(TCols)
|
|
873 cat('; CCols=')
|
|
874 cat(CCols)
|
|
875 cat('\n')
|
|
876 useCols = c(TCols,CCols) # treatment columns first, then control columns
|
|
877 if (file.exists(Out_Dir) == F) dir.create(Out_Dir) # create the output directory when it is missing
|
|
878 Count_Matrix = read.table(Input,header=T,row.names=1,sep='\t') # Load the tab separated count matrix; a header row is assumed and the first column supplies the row names
|
|
879 snames = colnames(Count_Matrix)
|
|
880 nsamples = length(snames) # sample count before any column selection
|
|
881 if (nsubj > 0 & nsubj != nsamples) { # subject ids, when given, must match the number of sample columns in the file
|
|
882 options("show.error.messages"=T)
|
|
883 mess = paste('Fatal error: Supplied subject id list',paste(subjects,collapse=','),
|
|
884 'has length',nsubj,'but there are',nsamples,'samples',paste(snames,collapse=','))
|
|
885 write(mess, stderr())
|
|
886 quit(save="no",status=4) # leave R with exit status 4 and without saving the workspace
|
|
887 }
|
|
888 if (length(subjects) != 0) {subjects = subjects[useCols]} # keep subject ids in the same order as the selected columns
|
|
889 Count_Matrix = Count_Matrix[,useCols] ### keep only the selected columns, treatment first then control
|
|
890 rn = rownames(Count_Matrix)
|
|
891 islib = rn %in% c('librarySize','NotInBedRegions') # rows with these names are bookkeeping rows, not features
|
|
892 LibSizes = Count_Matrix[subset(rn,islib),][1] # take first; NOTE(review): LibSizes is not referenced again in the visible script and edgeIt is called with libSize=c() - confirm
|
|
893 Count_Matrix = Count_Matrix[subset(rn,! islib),] # drop the bookkeeping rows from the matrix
|
|
894 group = c(rep(TreatmentName,length(TCols)), rep(ControlName,length(CCols)) ) # Build a group descriptor in the same order as useCols
|
|
895 group = factor(group, levels=c(ControlName,TreatmentName)) # control is listed first so it becomes the first factor level
|
|
896 colnames(Count_Matrix) = paste(group,colnames(Count_Matrix),sep="_") # Relabel columns as group name, underscore, original column name
|
|
897 results = edgeIt(Count_Matrix=Count_Matrix,group=group, out_edgeR=out_edgeR, out_VOOM=out_VOOM, out_DESeq2=out_DESeq2, # Run the selected analyses on the prepared count matrix
|
|
898 fdrtype=fdrtype,mydesign=NULL,priordf=edgeR_priordf,fdrthresh=fdrthresh,outputdir='.', # pass the adjustment method chosen on the form; it was previously fixed at 'BH' so the form setting had no effect
|
|
899 myTitle=myTitle,useNDF=F,libSize=c(),filterquantile=fQ,subjects=subjects, # NOTE(review): useNDF and libSize are fixed here, so the useNDF form value and the parsed LibSizes are not used - confirm intent
|
|
900 doDESeq2=doDESeq2,doVoom=doVoom,doCamera=doCamera,doedgeR=doedgeR,org=org,
|
81
|
901 histgmt=history_gmt,bigmt=builtin_gmt,DESeq_fitType=DESeq_fitType,robust_meth=edgeR_robust_meth)
|
61
|
902 sessionInfo() # print R and package version details to the job output
|
|
903 ]]>
|
|
904 </configfile>
|
|
905 </configfiles>
|
|
906 <help>
|
|
907
|
|
908 **What it does**
|
|
909
|
|
910 Allows short read sequence counts from controlled experiments to be analysed for differentially expressed genes.
|
|
911 Optionally adds a term for subject if not all samples are independent or if some other factor needs to be blocked in the design.
|
|
912
|
|
913 **Input**
|
|
914
|
|
915 Requires a count matrix as a tabular file. These are best made using the companion HTSeq_ based counter Galaxy wrapper
|
|
916 and your favourite gene model to generate inputs. Each row is a genomic feature (eg a gene or exon) and each column the
|
|
917 non-negative integer count of reads from one sample overlapping the feature.
|
|
918 The matrix must have a header row uniquely identifying the source samples, and unique row names in
|
|
919 the first column. Typically the row names are gene symbols or probe ids for downstream use in GSEA and other methods.
|
|
920
|
|
921 **Specifying comparisons**
|
|
922
|
|
923 This is deliberately simplified to a single factor with two levels - case vs control.
|
|
924
|
|
925 More complex interfaces are possible but painful at present.
|
|
926 Probably need to specify a phenotype file to do this better.
|
|
927 Work in progress. Send code.
|
|
928
|
|
929 If you have (eg) paired samples and wish to include a term in the GLM to account for some other factor (subject in the case of paired samples),
|
|
930 put a comma separated list of indicators for every sample (whether modelled or not!) indicating (eg) the subject number.
|
|
931 Supply a list of integers, one for each sample, or an empty string if samples are all independent.
|
|
932 If not empty, there must be exactly as many integers in the supplied integer list as there are columns (samples) in the count matrix.
|
|
933 Integers for samples that are not in the analysis *must* be present in the string as filler even if not used.
|
|
934
|
|
935 So if you have 2 pairs out of 6 samples, you need to put in unique integers for the unpaired ones
|
|
936 eg if you had 6 samples with the first two independent, and the second and third pairs each coming from a single subject (a different subject for each pair), you might use
|
|
937 8,9,1,1,2,2
|
|
938 as subject IDs to indicate two paired samples from the same subject in columns 3/4 and 5/6
|
|
939
|
|
940 **Methods available**
|
|
941
|
|
942 You can run 3 popular Bioconductor packages available for count data.
|
|
943
|
|
944 edgeR - see edgeR_ for details
|
|
945
|
|
946 VOOM/limma - see limma_VOOM_ for details
|
|
947
|
|
948 DESeq2 - see DESeq2_ for details
|
|
949
|
|
950 and optionally camera in edgeR which works better if MSigDB is installed.
|
|
951
|
|
952 **Outputs**
|
|
953
|
|
954 Some helpful plots and analysis results. Note that most of these are produced using R code
|
|
955 suggested by the excellent documentation and vignettes for the Bioconductor
|
|
956 packages invoked. The Tool Factory is used to automatically lay these out for you to enjoy.
|
|
957
|
|
958 **Note on Voom**
|
|
959
|
|
960 The voom from limma version 3.16.6 help in R includes this from the authors - but you should read the paper to interpret this method.
|
|
961
|
|
962 This function is intended to process RNA-Seq or ChIP-Seq data prior to linear modelling in limma.
|
|
963
|
|
964 voom is an acronym for mean-variance modelling at the observational level.
|
|
965 The key concern is to estimate the mean-variance relationship in the data, then use this to compute appropriate weights for each observation.
|
|
966 Count data almost always show non-trivial mean-variance relationships. Raw counts show increasing variance with increasing count size, while log-counts typically show a decreasing mean-variance trend.
|
|
967 This function estimates the mean-variance trend for log-counts, then assigns a weight to each observation based on its predicted variance.
|
|
968 The weights are then used in the linear modelling process to adjust for heteroscedasticity.
|
|
969
|
|
970 In an experiment, a count value is observed for each tag in each sample. A tag-wise mean-variance trend is computed using lowess.
|
|
971 The tag-wise mean is the mean log2 count with an offset of 0.5, across samples for a given tag.
|
|
972 The tag-wise variance is the quarter-root-variance of normalized log2 counts per million values with an offset of 0.5, across samples for a given tag.
|
|
973 Tags with zero counts across all samples are not included in the lowess fit. Optional normalization is performed using normalizeBetweenArrays.
|
|
974 Using fitted values of log2 counts from a linear model fit by lmFit, variances from the mean-variance trend were interpolated for each observation.
|
|
975 This was carried out by approxfun. Inverse variance weights can be used to correct for mean-variance trend in the count data.
|
|
976
|
|
977
|
|
978 Author(s)
|
|
979
|
|
980 Charity Law and Gordon Smyth
|
|
981
|
|
982 References
|
|
983
|
|
984 Law, CW (2013). Precision weights for gene expression analysis. PhD Thesis. University of Melbourne, Australia.
|
|
985
|
|
986 Law, CW, Chen, Y, Shi, W, Smyth, GK (2013). Voom! Precision weights unlock linear model analysis tools for RNA-seq read counts.
|
|
987 Technical Report 1 May 2013, Bioinformatics Division, Walter and Eliza Hall Institute of Medical Research, Melbourne, Australia.
|
|
988 http://www.statsci.org/smyth/pubs/VoomPreprint.pdf
|
|
989
|
|
990 See Also
|
|
991
|
|
992 A voom case study is given in the edgeR User's Guide.
|
|
993
|
|
994 vooma is a similar function but for microarrays instead of RNA-seq.
|
|
995
|
|
996
|
|
997 ***old rant on changes to Bioconductor package variable names between versions***
|
|
998
|
|
999 The edgeR authors made a small cosmetic change in the name of one important variable (from p.value to PValue)
|
|
1000 breaking this and all other code that assumed the old name for this variable,
|
|
1001 between edgeR2.4.4 and 2.4.6 (the version for R 2.14 as at the time of writing).
|
|
1002 This means that all code using edgeR is sensitive to the version. I think this was a very unwise thing
|
|
1003 to do because it wasted hours of my time to track down and will similarly cost other edgeR users dearly
|
|
1004 when their old scripts break. This tool now works with 2.4.6.
|
|
1005
|
|
1006 **Note on prior.N**
|
|
1007
|
|
1008 http://seqanswers.com/forums/showthread.php?t=5591 says:
|
|
1009
|
|
1010 *prior.n*
|
|
1011
|
|
1012 The value for prior.n determines the amount of smoothing of tagwise dispersions towards the common dispersion.
|
|
1013 You can think of it as like a "weight" for the common value. (It is actually the weight for the common likelihood
|
|
1014 in the weighted likelihood equation). The larger the value for prior.n, the more smoothing, i.e. the closer your
|
|
1015 tagwise dispersion estimates will be to the common dispersion. If you use a prior.n of 1, then that gives the
|
|
1016 common likelihood the weight of one observation.
|
|
1017
|
|
1018 In answer to your question, it is a good thing to squeeze the tagwise dispersions towards a common value,
|
|
1019 or else you will be using very unreliable estimates of the dispersion. I would not recommend using the value that
|
|
1020 you obtained from estimateSmoothing()---this is far too small and would result in virtually no moderation
|
|
1021 (squeezing) of the tagwise dispersions. How many samples do you have in your experiment?
|
|
1022 What is the experimental design? If you have few samples (less than 6) then I would suggest a prior.n of at least 10.
|
|
1023 If you have more samples, then the tagwise dispersion estimates will be more reliable,
|
|
1024 so you could consider using a smaller prior.n, although I would hesitate to use a prior.n less than 5.
|
|
1025
|
|
1026
|
|
1027 From Bioconductor Digest, Vol 118, Issue 5, Gordon writes:
|
|
1028
|
|
1029 Dear Dorota,
|
|
1030
|
|
1031 The important settings are prior.df and trend.
|
|
1032
|
|
1033 prior.n and prior.df are related through prior.df = prior.n * residual.df,
|
|
1034 and your experiment has residual.df = 36 - 12 = 24. So the old setting of
|
|
1035 prior.n=10 is equivalent for your data to prior.df = 240, a very large
|
|
1036 value. Going the other way, the new setting of prior.df=10 is equivalent
|
|
1037 to prior.n=10/24.
|
|
1038
|
|
1039 To recover old results with the current software you would use
|
|
1040
|
|
1041 estimateTagwiseDisp(object, prior.df=240, trend="none")
|
|
1042
|
|
1043 To get the new default from old software you would use
|
|
1044
|
|
1045 estimateTagwiseDisp(object, prior.n=10/24, trend=TRUE)
|
|
1046
|
|
1047 Actually the old trend method is equivalent to trend="loess" in the new
|
|
1048 software. You should use plotBCV(object) to see whether a trend is
|
|
1049 required.
|
|
1050
|
|
1051 Note you could also use
|
|
1052
|
|
1053 prior.n = getPriorN(object, prior.df=10)
|
|
1054
|
|
1055 to map between prior.df and prior.n.
|
|
1056
|
|
1057 ----
|
|
1058
|
|
1059 **Attributions**
|
|
1060
|
|
1061 edgeR - edgeR_
|
|
1062
|
|
1063 VOOM/limma - limma_VOOM_
|
|
1064
|
|
1065 DESeq2 - DESeq2_
|
|
1066
|
|
1067 See above for Bioconductor package documentation for packages exposed in Galaxy by this tool and app store package.
|
|
1068
|
|
1069 Galaxy_ (that's what you are using right now!) for gluing everything together
|
|
1070
|
|
1071 Otherwise, all code and documentation comprising this tool was written by Ross Lazarus and is
|
|
1072 licensed to you under the LGPL_ like other rgenetics artefacts
|
|
1073
|
|
1074 .. _LGPL: http://www.gnu.org/copyleft/lesser.html
|
|
1075 .. _HTSeq: http://www-huber.embl.de/users/anders/HTSeq/doc/index.html
|
|
1076 .. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html
|
|
1077 .. _DESeq2: http://www.bioconductor.org/packages/release/bioc/html/DESeq2.html
|
|
1078 .. _limma_VOOM: http://www.bioconductor.org/packages/release/bioc/html/limma.html
|
|
1079 .. _Galaxy: http://getgalaxy.org
|
|
1080 </help>
|
|
1081
|
|
1082 </tool>
|
|
1083
|
|
1084
|