comparison rgedgeRpaired_nocamera.xml @ 61:dfc1046c8806 draft

Uploaded
author fubar
date Mon, 10 Feb 2014 05:47:52 -0500
parents
children baa72cf05e58
comparison
equal deleted inserted replaced
60:9d75852bf019 61:dfc1046c8806
1 <tool id="rgDifferentialCount" name="Differential_Count" version="0.21">
2 <description>models using BioConductor packages</description>
3 <requirements> <!-- packages this tool declares as requirements; names and versions are exactly as listed -->
4 <requirement type="package" version="2.14">biocbasics</requirement>
5 <requirement type="package" version="3.0.2">r302</requirement>
6 <requirement type="package" version="1.3.18">graphicsmagick</requirement>
7 <requirement type="package" version="3.11.11">atlas</requirement>
8 <requirement type="package" version="9.10">ghostscript</requirement>
9 </requirements>
10 <!-- The command passes the "runme" configfile as script_path, names Rscript as the interpreter, and points the HTML report at the html_file output and its files_path directory. -->
11 <command interpreter="python">
12 rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "DifferentialCounts"
13 --output_dir "$html_file.files_path" --output_html "$html_file" --make_HTML "yes"
14 </command>
15 <inputs>
16 <param name="input1" type="data" format="tabular" label="Select an input matrix - rows are contigs, columns are counts for each sample"
17 help="Use the HTSeq based count matrix preparation tool to create these matrices from BAM/SAM files and a GTF file of genomic features"/>
18 <param name="title" type="text" value="Differential Counts" size="80" label="Title for job outputs"
19 help="Supply a meaningful name here to remind you what the outputs contain">
20 <sanitizer invalid_char="">
21 <valid initial="string.letters,string.digits"><add value="_" /> </valid>
22 </sanitizer>
23 </param> <!-- title: only letters, digits and "_" are declared valid; invalid_char is empty, so other characters are presumably dropped (confirm against the Galaxy sanitizer docs) -->
24 <param name="treatment_name" type="text" value="Treatment" size="50" label="Treatment Name"/>
25 <param name="Treat_cols" label="Select columns containing treatment." type="data_column" data_ref="input1" numerical="True"
26 multiple="true" use_header_names="true" size="120" display="checkboxes">
27 <validator type="no_options" message="Please select at least one column."/>
28 </param> <!-- Treat_cols: columns are chosen from input1 and carry a validator; Control_cols below is optional and has none -->
29 <param name="control_name" type="text" value="Control" size="50" label="Control Name"/>
30 <param name="Control_cols" label="Select columns containing control." type="data_column" data_ref="input1" numerical="True"
31 multiple="true" use_header_names="true" size="120" display="checkboxes" optional="true">
32 </param>
33 <param name="subjectids" type="text" optional="true" size="120" value = ""
34 label="IF SUBJECTS NOT ALL INDEPENDENT! Enter comma separated strings to indicate sample labels for (eg) pairing - must be one for every column in input"
35 help="Leave blank if no pairing, but eg if data from sample id A99 is in columns 2,4 and id C21 is in 3,5 then enter 'A99,C21,A99,C21'">
36 <sanitizer>
37 <valid initial="string.letters,string.digits"><add value="," /> </valid>
38 </sanitizer>
39 </param> <!-- subjectids: only letters, digits and "," are declared valid. NOTE(review): ids containing other characters (for example "_" or ".") will not pass through unchanged; confirm this is intended -->
40 <param name="fQ" type="float" value="0.3" size="5" label="Non-differential contig count quantile threshold - zero to analyze all non-zero read count contigs"
41 help="May be a good or a bad idea depending on the biology and the question. EG 0.3 = sparsest 30% of contigs with at least one read are removed before analysis"/>
42 <param name="useNDF" type="boolean" truevalue="T" falsevalue="F" checked="false" size="1"
43 label="Non differential filter - remove contigs below a threshold (1 per million) for half or more samples"
44 help="May be a good or a bad idea depending on the biology and the question. This was the old default. Quantile based is available as an alternative"/> <!-- edgeIt applies its quantile filter only when its useNDF argument is false; presumably fQ and useNDF feed those arguments (the wiring is not visible in this part of the file) -->
45
46 <conditional name="edgeR"> <!-- edgeR is selected by default; prior.df is only offered when it is selected -->
47 <param name="doedgeR" type="select"
48 label="Run this model using edgeR"
49 help="edgeR uses a negative binomial model and seems to be powerful, even with few replicates">
50 <option value="F">Do not run edgeR</option>
51 <option value="T" selected="true">Run edgeR</option>
52 </param>
53 <when value="T">
54 <param name="edgeR_priordf" type="integer" value="20" size="3"
55 label="prior.df for tagwise dispersion - lower value = more emphasis on each tag's variance. Replaces prior.n and prior.df = prior.n * residual.df"
56 help="0 = Use edgeR default. Use a small value to 'smooth' small samples. See edgeR docs and note below"/>
57 </when>
58 <when value="F"></when> <!-- no further inputs when edgeR is switched off -->
59 </conditional>
60 <conditional name="DESeq2"> <!-- DESeq2 is off by default; the dispersion fit type is only offered when it is switched on -->
61 <param name="doDESeq2" type="select"
62 label="Run the same model with DESeq2 and compare findings"
63 help="DESeq2 is an update to the DESeq package. It uses different assumptions and methods to edgeR">
64 <option value="F" selected="true">Do not run DESeq2</option>
65 <option value="T">Run DESeq2</option>
66 </param>
67 <when value="T">
68 <param name="DESeq_fitType" type="select" label="Method DESeq2 uses to fit dispersions">
69 <option value="parametric" selected="true">Parametric (default) fit for dispersions</option>
70 <option value="local">Local fit - this will automagically be used if parametric fit fails</option>
71 <option value="mean">Mean dispersion fit - use this if you really understand what you're doing - read the fine manual linked below in the documentation</option>
72 </param>
73 </when>
74 <when value="F"> </when>
75 </conditional>
76 <param name="doVoom" type="select"
77 label="Run the same model with Voom/limma and compare findings"
78 help="Voom uses counts per million and a precise transformation of variance so count data can be analysed using limma">
79 <option value="F" selected="true">Do not run VOOM</option>
80 <option value="T">Run VOOM</option>
81 </param> <!-- a plain select rather than a conditional: no extra Voom inputs are defined; off by default -->
82 <!--
83 <conditional name="camera">
84 <param name="doCamera" type="select" label="Run the edgeR implementation of Camera GSEA for up/down gene sets"
85 help="If yes, you can choose a set of genesets to test and/or supply a gmt format geneset collection from your history">
86 <option value="F" selected="true">Do not run GSEA tests with the Camera algorithm</option>
87 <option value="T">Run GSEA tests with the Camera algorithm</option>
88 </param>
89 <when value="T">
90 <conditional name="gmtSource">
91 <param name="refgmtSource" type="select"
92 label="Use a gene set (.gmt) from your history and/or use a built-in (MSigDB etc) gene set">
93 <option value="indexed" selected="true">Use a built-in gene set</option>
94 <option value="history">Use a gene set from my history</option>
95 <option value="both">Add a gene set from my history to a built in gene set</option>
96 </param>
97 <when value="indexed">
98 <param name="builtinGMT" type="select" label="Select a gene set matrix (.gmt) file to use for the analysis">
99 <options from_data_table="gseaGMT_3.1">
100 <filter type="sort_by" column="2" />
101 <validator type="no_options" message="No GMT v3.1 files are available - please install them"/>
102 </options>
103 </param>
104 </when>
105 <when value="history">
106 <param name="ownGMT" type="data" format="gmt" label="Select a Gene Set from your history" />
107 </when>
108 <when value="both">
109 <param name="ownGMT" type="data" format="gseagmt" label="Select a Gene Set from your history" />
110 <param name="builtinGMT" type="select" label="Select a gene set matrix (.gmt) file to use for the analysis">
111 <options from_data_table="gseaGMT_4">
112 <filter type="sort_by" column="2" />
113 <validator type="no_options" message="No GMT v4 files are available - please fix tool_data_table and loc files"/>
114 </options>
115 </param>
116 </when>
117 </conditional>
118 </when>
119 <when value="F">
120 </when>
121 </conditional>
122 -->
123 <param name="fdrthresh" type="float" value="0.05" size="5" label="P value threshold for FDR filtering for family wise error rate control"
124 help="Conventional default value of 0.05 recommended"/>
125 <param name="fdrtype" type="select" label="FDR (Type I error) control method"
126 help="Use fdr or bh typically to control for the number of tests in a reliable way">
127 <option value="fdr" selected="true">fdr</option>
128 <option value="BH">Benjamini Hochberg</option>
129 <option value="BY">Benjamini Yekutieli</option>
130 <option value="bonferroni">Bonferroni</option>
131 <option value="hochberg">Hochberg</option>
132 <option value="holm">Holm</option>
133 <option value="hommel">Hommel</option>
134 <option value="none">no control for multiple tests</option>
135 </param> <!-- option values are unchanged; the R code passes fdrtype as the method of p.adjust(), so they are presumably p.adjust method names (confirm where the configfile calls edgeIt) -->
136 </inputs>
137 <outputs> <!-- one tabular table per method, each kept only when its filter expression is true, plus the HTML report which has no filter -->
138 <data format="tabular" name="out_edgeR" label="${title}_topTable_edgeR.xls">
139 <filter>edgeR['doedgeR'] == "T"</filter>
140 </data>
141 <data format="tabular" name="out_DESeq2" label="${title}_topTable_DESeq2.xls">
142 <filter>DESeq2['doDESeq2'] == "T"</filter>
143 </data>
144 <data format="tabular" name="out_VOOM" label="${title}_topTable_VOOM.xls">
145 <filter>doVoom == "T"</filter>
146 </data> <!-- doVoom is a top level param, so its filter needs no conditional prefix -->
147 <data format="html" name="html_file" label="${title}.html"/>
148 </outputs>
149 <stdio> <!-- only exit status 4 is given a meaning here; no other status is mapped in this element -->
150 <exit_code range="4" level="fatal" description="Number of subject ids must match total number of samples in the input matrix" />
151 </stdio>
152 <tests> <!-- one test: all three methods switched on for test_bams2mx.xls, no subject pairing -->
153 <test>
154 <param name='input1' value='test_bams2mx.xls' ftype='tabular' />
155 <param name='treatment_name' value='liver' />
156 <param name='title' value='edgeRtest' />
157 <param name='useNDF' value='' />
158 <param name='doedgeR' value='T' />
159 <param name='doVoom' value='T' />
160 <param name='doDESeq2' value='T' />
161 <param name='fdrtype' value='fdr' />
162 <param name='edgeR_priordf' value="8" />
163 <param name='fdrthresh' value="0.05" />
164 <param name='control_name' value='heart' />
165 <param name='subjectids' value='' />
166 <param name='Control_cols' value='3,4,5,9' />
167 <param name='Treat_cols' value='2,6,7,8' />
168 <output name='out_edgeR' file='edgeRtest1out.xls' compare='diff' />
169 <output name='html_file' file='edgeRtest1out.html' compare='diff' lines_diff='20' /> <!-- only the edgeR table and the HTML report are compared; the DESeq2 and VOOM tables are not checked -->
170 </test>
171 </tests>
172
173 <configfiles>
174 <configfile name="runme">
175 <![CDATA[
176 #
177 # edgeR.Rscript
178 # updated npv 2011 for R 2.14.0 and edgeR 2.4.0 by ross
179 # Performs DGE on a count table containing n replicates of two conditions
180 #
181 # Parameters
182 #
183 # 1 - Output Dir
184
185 # Original edgeR code by: S.Lunke and A.Kaspi
186 reallybig = log10(.Machine\$double.xmax) # log10 of the largest finite double; qqPlot substitutes it for Inf
187 reallysmall = log10(.Machine\$double.xmin) # log10 of the smallest positive normalised double; qqPlot substitutes it for -Inf
188 library('stringr')
189 library('gplots')
190 library('edgeR')
191 hmap2 = function(cmat,nsamp=100,outpdfname='heatmap2.pdf', TName='Treatment',group=NA,myTitle='title goes here')
192 {
193   # Write a row-scaled heatmap.2 of the leading rows of cmat to the PDF file outpdfname.
194   #  cmat - table with named rows and columns; at most the first nsamp named rows are drawn
195   #  group - one label per column of cmat, used only to colour the column side bar
196   #  myTitle - plot title. TName is accepted but not used in this body.
197   groupLevels = unique(group)
198   rowLabels = rownames(cmat)
199   if (length(groupLevels) != 2) {
200     rainbowCols = rainbow(length(groupLevels),start=0,end=4/6)
201     sideCols = rainbowCols[match(group,groupLevels)]
202   } else {
203     # exactly two groups: the first one seen is red, the other blue
204     pickCol = function(g) {if (g==groupLevels[1]) "#FF0000" else "#0000FF"}
205     sideCols = unlist(lapply(group,pickCol)) }
206   # rows whose name is NA are left out of the plot
207   shown = cmat[(! is.na(rowLabels)),]
208   if (nrow(shown) > nsamp) {
209     shown = shown[1:nsamp,]
210   }
211   # sample names are clipped to 20 characters for the axis labels
212   colnames(shown) = substr(colnames(shown),1,20)
213   pdf(outpdfname)
214   heatmap.2(shown,main=myTitle,ColSideColors=sideCols,col=topo.colors(100),dendrogram="col",key=T,density.info='none',
215     Rowv=F,scale='row',trace='none',margins=c(8,8),cexRow=0.4,cexCol=0.5)
216   dev.off()
217 }
218
219 hmap = function(cmat,nmeans=4,outpdfname="heatMap.pdf",nsamp=250,TName='Treatment',group=NA,myTitle="Title goes here")
220 {
221   # Write a row-scaled base heatmap() of at most nsamp leading rows of cmat to outpdfname.
222   # Columns get a side colour per distinct value of group (rainbow palette).
223   # nmeans and TName are accepted but not used in this body.
224   groupLevels = unique(group)
225   sideCols = rainbow(length(groupLevels),start=0.3,end=0.6)[match(group,groupLevels)]
226   totalRows = nrow(cmat)
227   if (totalRows > nsamp) {
228     # too many rows: keep the first nsamp and say so in the title
229     cmat = cmat[c(1:nsamp),]
230     plotTitle = paste('Heatmap: Top ',nsamp,' DE contigs (of ',totalRows,')',sep='')
231   } else {
232     plotTitle = paste(myTitle,'Heatmap: n contigs =',totalRows)
233   }
234   colnames(cmat) = substr(colnames(cmat),1,20)
235   pdf(outpdfname)
236   heatmap(cmat,main=plotTitle,scale='row',Rowv=NA,ColSideColors=sideCols,cexRow=0.3,cexCol=0.4)
237   dev.off()
238 }
239
240 qqPlot = function(descr='qqplot',pvector, outpdf='qqplot.pdf',...)
241 # QQ plot of observed against expected -log10(p), written to the PDF file outpdf. After https://gist.github.com/703512
242 { # descr is the plot title, pvector the p values; extra arguments in ... are handed to plot()
243   o = -log10(sort(pvector,decreasing=F)) # sort() discards NA p values
244   if (length(o) == 0) { print.noquote(paste('qqPlot: no non-missing p values, skipping',outpdf)); return(invisible(NULL)) }
245   e = -log10( seq_along(o)/length(o) ) # expected values under a uniform p value distribution
246   o[o==-Inf] = reallysmall # reallysmall and reallybig are script-level constants, not arguments
247   o[o==Inf] = reallybig # p = 0 gives Inf, which cannot serve as an axis limit
248   pdf(outpdf)
249   plot(e,o,pch=19,cex=1, main=descr, ...,
250     xlab=expression(Expected~~-log[10](italic(p))),
251     ylab=expression(Observed~~-log[10](italic(p))),
252     xlim=c(0,max(e)), ylim=c(0,max(o)))
253   lines(e,e,col="red")
254   grid(col = "lightgray", lty = "dotted")
255   dev.off()
256 }
257
258 smearPlot = function(DGEList,deTags, outSmear, outMain)
259 { # Write a plotSmear() plot for DGEList to the PDF file outSmear, with outMain as the plot title
260 pdf(outSmear)
261 plotSmear(DGEList,de.tags=deTags,main=outMain) # deTags is forwarded as de.tags; presumably the tags to highlight (see the edgeR plotSmear docs)
262 grid(col="lightgray", lty="dotted")
263 dev.off() # closes the PDF device opened above
264 }
265
266 boxPlot = function(rawrs,cleanrs,maint,myTitle,pdfname)
267 { # Box plots of raw and normalised counts per sample (to pdfname), then per-sample histograms of rawrs
268   # rawrs, cleanrs - tables with one column per sample, presumably log counts (the axis labels say so); maint labels the normalisation
269   # myTitle is accepted but not used in this body
270   nsamples = ncol(rawrs)
271   for (j in c(1:nsamples)) {rawrs[(rawrs[,j] < 0),j] = NA} # negative values are blanked out
272   fullnames = colnames(rawrs)
273   colnames(rawrs) = substr(fullnames,1,20) # clipped names for the plots; fullnames keeps the originals
274   colnames(cleanrs) = substr(colnames(cleanrs),1,20)
275   savedPar = par(no.readonly=T)
276   print.noquote('raw contig counts by sample:')
277   print.noquote(summary(rawrs))
278   print.noquote('normalised contig counts by sample:')
279   print.noquote(summary(cleanrs))
280   pdf(pdfname)
281   par(mfrow=c(1,2))
282   boxplot(rawrs,main=paste('Raw:',maint),ylab='log contig count',col="maroon",varwidth=T,notch=T,las=3,cex.axis=0.35)
283   grid(lty="dotted",col="lightgray")
284   boxplot(cleanrs,main=paste('After ',maint),ylab='log contig count',col="maroon",varwidth=T,notch=T,las=3,cex.axis=0.35)
285   grid(lty="dotted",col="lightgray")
286   dev.off()
287   # second file: one histogram per sample on a square grid of panels
288   histPdf = "sample_counts_histogram.pdf"
289   nsamples = ncol(rawrs)
290   print.noquote(paste('Using ncol rawrs=',nsamples))
291   gridSide = round(sqrt(nsamples))
292   if (gridSide*gridSide < nsamples) { gridSide = gridSide + 1 }
293   # tallest bin over all samples, so that every panel shares one y axis
294   peaks = c()
295   for (j in c(1:nsamples)) {
296     peaks = append(peaks,max(hist(rawrs[,j],breaks=100,plot=F)\$counts))
297   }
298   ymax = max(peaks)
299   nnames = length(fullnames)
300   if (nnames <= 20) {
301     pdf(histPdf)
302   } else {
303     # more than 20 samples: enlarge the page in proportion
304     pagesize = 7*nnames/20
305     pdf(histPdf,width=pagesize,height=pagesize)
306   }
307   par(mfrow=c(gridSide,gridSide))
308   for (j in c(1:nsamples)) {
309     hist(rawrs[,j], col="maroon", breaks=100, main=paste("Contig logcount",j), xlab='log raw count',
310       sub=fullnames[j],ylim=c(0,ymax),cex=0.8)
311   }
312   dev.off()
313   par(savedPar)
314 }
315
316 cumPlot = function(rawrs,cleanrs,maint,myTitle)
317 { # Two stacked histograms of log10 row read totals, before and after filtering, in one fixed-name PDF
318   outpdf = "Filtering_rowsum_bar_charts.pdf"
319   savedPar = par(no.readonly=T)
320   logRaw = log(rawrs,10)
321   xmax = max(logRaw) # both panels use the x range of the unfiltered totals
322   pdf(outpdf)
323   par(mfrow=c(2,1))
324   drawPanel = function(logTotals,when) {
325     hist(logTotals,breaks=100,main=paste(when,maint),xlab="# Reads (log)",
326       ylab="Count",col="maroon",sub=myTitle, xlim=c(0,xmax),las=1)
327     grid(col="lightgray", lty="dotted")
328   }
329   drawPanel(logRaw,'Before:')
330   drawPanel(log(cleanrs,10),'After:')
331   dev.off()
332   par(savedPar)
333 }
334
335 cumPlot1 = function(rawrs,cleanrs,maint,myTitle)
336 { # Two stacked empirical CDF plots of row read totals (before and after filtering) on a log x axis
337   pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"RowsumCum.pdf",sep='_') # file name from the title without spaces
338   pdf(pdfname)
339   par(mfrow=c(2,1))
340   lastx = max(rawrs) # both panels share the x range of the unfiltered totals
341   rawF = ecdf(rawrs)
342   cleanF = ecdf(cleanrs)
343   rawe = knots(rawF); ry = rawF(rawe) # ECDF evaluated at its knots, so tied totals carry their full weight
344   cleane = knots(cleanF); cy = cleanF(cleane)
345   plot(rawe,ry,type='l',main=paste('Before',maint),xlab="Log Contig Total Reads",
346     ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle)
347   grid(col="blue")
348   plot(cleane,cy,type='l',main=paste('After',maint),xlab="Log Contig Total Reads",
349     ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle)
350   grid(col="blue")
351   dev.off()
352 }
353
354
355
356 doGSEAold = function(y=NULL,design=NULL,histgmt="",
357 bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
358 ntest=0, myTitle="myTitle", outfname="GSEA.xls", minnin=5, maxnin=2000,fdrthresh=0.05,fdrtype="BH")
359 { # Older variant of doGSEA: calls camera() once per gene set. Console output goes to Camera.log.
360   sink('Camera.log'); on.exit(sink()) # restore console output even if a step below fails
361   genesets = character(0) # character(0) keeps strsplit() below valid when no gmt file is given
362   if (bigmt > "")
363   {
364     bigenesets = readLines(bigmt)
365     genesets = bigenesets
366   }
367   if (histgmt > "")
368   {
369     hgenesets = readLines(histgmt)
370     if (bigmt > "") {
371       genesets = c(genesets,hgenesets) # append the lines; rbind() here built a 2-row matrix with recycling
372     } else {
373       genesets = hgenesets
374     } # use only history if no bi
375   }
376   print.noquote(paste("@@@read",length(genesets), 'genesets from',histgmt,bigmt))
377   genesets = strsplit(genesets,'\t') # tabular. genesetid\tURLorwhatever\tgene_1\t..\tgene_n
378   # the ntest argument is ignored: it is reset below to the number of gene sets read
379   header=paste(myTitle,'edgeR GSEA')
380   write(header,file=outfname,append=F)
381   ntest=length(genesets)
382   urownames = toupper(rownames(y))
383   upcam = c()
384   downcam = c()
385   for (i in seq_len(ntest)) { # seq_len: no iterations when no gene sets were read
386     gs = unlist(genesets[i])
387     g = gs[1] # geneset_id
388     u = gs[2]
389     if (u > "") { u = paste("<a href=\'",u,"\'>",u,"</a>",sep="") }
390     glist = gs[3:length(gs)] # member gene symbols
391     glist = toupper(glist)
392     inglist = urownames %in% glist
393     nin = sum(inglist)
394     if ((nin > minnin) && (nin < maxnin)) {
395       ### print(paste('@@found',sum(inglist),'genes in glist'))
396       camres = camera(y=y,index=inglist,design=design)
397       if (! is.null(camres)) {
398         rownames(camres) = g # gene set name
399         camres = cbind(GeneSet=g,URL=u,camres)
400         if (camres\$Direction == "Up")
401         {
402           upcam = rbind(upcam,camres) } else {
403           downcam = rbind(downcam,camres)
404         }
405       }
406     }
407   }
408   uscam = upcam[order(upcam\$PValue),]
409   unadjp = uscam\$PValue
410   uscam\$adjPValue = p.adjust(unadjp,method=fdrtype) # adjusted within the up sets only
411   nup = max(10,sum((uscam\$adjPValue < fdrthresh)))
412   dscam = downcam[order(downcam\$PValue),]
413   unadjp = dscam\$PValue
414   dscam\$adjPValue = p.adjust(unadjp,method=fdrtype) # adjusted within the down sets only
415   ndown = max(10,sum((dscam\$adjPValue < fdrthresh)))
416   write.table(uscam,file=paste('camera_up',outfname,sep='_'),quote=F,sep='\t',row.names=F)
417   write.table(dscam,file=paste('camera_down',outfname,sep='_'),quote=F,sep='\t',row.names=F)
418   print.noquote(paste('@@@@@ Camera up top',nup,'gene sets:'))
419   write.table(head(uscam,nup),file="",quote=F,sep='\t',row.names=F)
420   print.noquote(paste('@@@@@ Camera down top',ndown,'gene sets:'))
421   write.table(head(dscam,ndown),file="",quote=F,sep='\t',row.names=F)
422   # the sink opened at the top is closed by on.exit
423 }
424
425
426
427
428 doGSEA = function(y=NULL,design=NULL,histgmt="",
429 bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
430 ntest=0, myTitle="myTitle", outfname="GSEA.xls", minnin=5, maxnin=2000,fdrthresh=0.05,fdrtype="BH")
431 {
432   # Camera gene set test: run camera() once over every usable gene set, then report the
433   # up and down regulated sets separately. Console output is captured in Camera.log.
434   #
435   # Arguments:
436   #  y, design  - expression object and design matrix, handed unchanged to camera()
437   #  histgmt, bigmt - paths of gmt files to read; an empty string skips that file
438   #  ntest      - ignored; kept so that existing calls still work
439   #  myTitle    - written as the first line of outfname
440   #  outfname   - base name; tables go to camera_up_<outfname> and camera_down_<outfname>
441   #  minnin, maxnin - a set is tested only if minnin < (rows of y it contains) < maxnin
442   #  fdrthresh, fdrtype - significance cut-off and p.adjust() method, applied per direction
443   # Files written: outfname (title line only), the two camera tables and Camera.log.
444   # Returns NULL invisibly.
445   sink('Camera.log')
446   on.exit(sink()) # restore console output even if a step below fails
447   genesets = character(0)
448   if (bigmt > "") {
449     genesets = readLines(bigmt)
450   }
451   if (histgmt > "") {
452     # append the history gene sets; rbind() here built a 2-row matrix with recycling
453     genesets = c(genesets,readLines(histgmt))
454   }
455   print.noquote(paste("@@@read",length(genesets), 'genesets from',histgmt,bigmt))
456   genesets = strsplit(genesets,'\t') # tabular. genesetid\tURLorwhatever\tgene_1\t..\tgene_n
457   write(paste(myTitle,'edgeR GSEA'),file=outfname,append=F)
458   urownames = toupper(rownames(y))
459   incam = list() # one logical row-membership vector per retained gene set
460   urls = c()
461   gsids = c()
462   for (gs in genesets) {
463     if (length(gs) < 3) { next } # id and url only, no member genes
464     gsid = gs[1] # geneset_id
465     url = gs[2]
466     # a non-empty url field becomes an HTML link in the output tables
467     if (url > "") { url = paste("<a href=\'",url,"\'>",url,"</a>",sep="") }
468     inglist = urownames %in% toupper(gs[3:length(gs)]) # member gene symbols
469     nin = sum(inglist)
470     if ((nin > minnin) && (nin < maxnin)) {
471       # keep each membership vector whole; c() would flatten them into one long vector
472       incam[[length(incam)+1]] = inglist
473       gsids = c(gsids,gsid)
474       urls = c(urls,url)
475     }
476   }
477   if (length(incam) == 0) {
478     print.noquote('No gene sets within the size limits - Camera not run')
479     return(invisible(NULL))
480   }
481   names(incam) = gsids
482   allcam = camera(y=y,index=incam,design=design)
483   # camera() may return its rows sorted by p value, so realign them on gene set name
484   # before the ids and urls are bound on
485   allcam = allcam[match(gsids,rownames(allcam)),]
486   allcamres = cbind(geneset=gsids,allcam,URL=urls)
487   # split by row on the reported direction (indexing with [i] selected columns, not gene sets)
488   isup = (allcamres\$Direction == "Up")
489   upcam = allcamres[isup,]
490   downcam = allcamres[!isup,]
491   uscam = upcam[order(upcam\$PValue),]
492   uscam\$adjPValue = p.adjust(uscam\$PValue,method=fdrtype)
493   nup = max(10,sum((uscam\$adjPValue < fdrthresh)))
494   dscam = downcam[order(downcam\$PValue),]
495   dscam\$adjPValue = p.adjust(dscam\$PValue,method=fdrtype)
496   ndown = max(10,sum((dscam\$adjPValue < fdrthresh)))
497   # full tables, sorted by p value within each direction
498   write.table(uscam,file=paste('camera_up',outfname,sep='_'),quote=F,sep='\t',row.names=F)
499   write.table(dscam,file=paste('camera_down',outfname,sep='_'),quote=F,sep='\t',row.names=F)
500   # the leading rows of each table are echoed into Camera.log
501   print.noquote(paste('@@@@@ Camera up top',nup,'gene sets:'))
502   write.table(head(uscam,nup),file="",quote=F,sep='\t',row.names=F)
503   print.noquote(paste('@@@@@ Camera down top',ndown,'gene sets:'))
504   write.table(head(dscam,ndown),file="",quote=F,sep='\t',row.names=F)
505   # the sink opened at the top is closed by on.exit
506   invisible(NULL)
507 }
508
509
510 edgeIt = function (Count_Matrix=c(),group=c(),out_edgeR=F,out_VOOM=F,out_DESeq2=F,fdrtype='fdr',priordf=5,
511 fdrthresh=0.05,outputdir='.', myTitle='Differential Counts',libSize=c(),useNDF=F,
512 filterquantile=0.2, subjects=c(),mydesign=NULL,
513 doDESeq2=T,doVoom=T,doCamera=T,doedgeR=T,org='hg19',
514 histgmt="", bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
515 doCook=F,DESeq_fitType="parameteric")
516 {
517 # Error handling
518 if (length(unique(group))!=2){
519 print("Number of conditions identified in experiment does not equal 2")
520 q()
521 }
522 require(edgeR)
523 options(width = 512)
524 mt = paste(unlist(strsplit(myTitle,'_')),collapse=" ")
525 allN = nrow(Count_Matrix)
526 nscut = round(ncol(Count_Matrix)/2)
527 colTotmillionreads = colSums(Count_Matrix)/1e6
528 counts.dataframe = as.data.frame(c())
529 rawrs = rowSums(Count_Matrix)
530 nonzerod = Count_Matrix[(rawrs > 0),] # remove all zero count genes
531 nzN = nrow(nonzerod)
532 nzrs = rowSums(nonzerod)
533 zN = allN - nzN
534 print('# Quantiles for non-zero row counts:',quote=F)
535 print(quantile(nzrs,probs=seq(0,1,0.1)),quote=F)
536 if (useNDF == T)
537 {
538 gt1rpin3 = rowSums(Count_Matrix/expandAsMatrix(colTotmillionreads,dim(Count_Matrix)) >= 1) >= nscut
539 lo = colSums(Count_Matrix[!gt1rpin3,])
540 workCM = Count_Matrix[gt1rpin3,]
541 cleanrs = rowSums(workCM)
542 cleanN = length(cleanrs)
543 meth = paste( "After removing",length(lo),"contigs with fewer than ",nscut," sample read counts >= 1 per million, there are",sep="")
544 print(paste("Read",allN,"contigs. Removed",zN,"contigs with no reads.",meth,cleanN,"contigs"),quote=F)
545 maint = paste('Filter >=1/million reads in >=',nscut,'samples')
546 } else {
547 useme = (nzrs > quantile(nzrs,filterquantile))
548 workCM = nonzerod[useme,]
549 lo = colSums(nonzerod[!useme,])
550 cleanrs = rowSums(workCM)
551 cleanN = length(cleanrs)
552 meth = paste("After filtering at count quantile =",filterquantile,", there are",sep="")
553 print(paste('Read',allN,"contigs. Removed",zN,"with no reads.",meth,cleanN,"contigs"),quote=F)
554 maint = paste('Filter below',filterquantile,'quantile')
555 }
556 cumPlot(rawrs=rawrs,cleanrs=cleanrs,maint=maint,myTitle=myTitle)
557 allgenes = rownames(workCM)
558 reg = "^chr([0-9]+):([0-9]+)-([0-9]+)"
559 genecards="<a href=\'http://www.genecards.org/index.php?path=/Search/keyword/"
560 ucsc = paste("<a href=\'http://genome.ucsc.edu/cgi-bin/hgTracks?db=",org,sep='')
561 testreg = str_match(allgenes,reg)
562 if (sum(!is.na(testreg[,1]))/length(testreg[,1]) > 0.8) # is ucsc style string
563 {
564 print("@@ using ucsc substitution for urls")
565 contigurls = paste0(ucsc,"&amp;position=chr",testreg[,2],":",testreg[,3],"-",testreg[,4],"\'>",allgenes,"</a>")
566 } else {
567 print("@@ using genecards substitution for urls")
568 contigurls = paste0(genecards,allgenes,"\'>",allgenes,"</a>")
569 }
570 print.noquote("# urls")
571 print.noquote(head(contigurls))
572 print(paste("# Total low count contigs per sample = ",paste(lo,collapse=',')),quote=F)
573 cmrowsums = rowSums(workCM)
574 TName=unique(group)[1]
575 CName=unique(group)[2]
576 if (is.null(mydesign)) {
577 if (length(subjects) == 0)
578 {
579 mydesign = model.matrix(~group)
580 }
581 else {
582 subjf = factor(subjects)
583 mydesign = model.matrix(~subjf+group) # we block on subject so make group last to simplify finding it
584 }
585 }
586 print.noquote(paste('Using samples:',paste(colnames(workCM),collapse=',')))
587 print.noquote('Using design matrix:')
588 print.noquote(mydesign)
589 if (doedgeR) {
590 sink('edgeR.log')
591 #### Setup DGEList object
592 DGEList = DGEList(counts=workCM, group = group)
593 DGEList = calcNormFactors(DGEList)
594
595 DGEList = estimateGLMCommonDisp(DGEList,mydesign)
596 comdisp = DGEList\$common.dispersion
597 DGEList = estimateGLMTrendedDisp(DGEList,mydesign)
598 if (edgeR_priordf > 0) {
599 print.noquote(paste("prior.df =",edgeR_priordf))
600 DGEList = estimateGLMTagwiseDisp(DGEList,mydesign,prior.df = edgeR_priordf)
601 } else {
602 DGEList = estimateGLMTagwiseDisp(DGEList,mydesign)
603 }
604 DGLM = glmFit(DGEList,design=mydesign)
605 DE = glmLRT(DGLM,coef=ncol(DGLM\$design)) # always last one - subject is first if needed
606 efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors
607 normData = (1e+06*DGEList\$counts/efflib)
608 uoutput = cbind(
609 Name=as.character(rownames(DGEList\$counts)),
610 DE\$table,
611 adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
612 Dispersion=DGEList\$tagwise.dispersion,totreads=cmrowsums,normData,
613 DGEList\$counts
614 )
615 soutput = uoutput[order(DE\$table\$PValue),] # sorted into p value order - for quick toptable
616 goodness = gof(DGLM, pcutoff=fdrthresh)
617 if (sum(goodness\$outlier) > 0) {
618 print.noquote('GLM outliers:')
619 print(paste(rownames(DGLM)[(goodness\$outlier)],collapse=','),quote=F)
620 } else {
621 print('No GLM fit outlier genes found\n')
622 }
623 z = limma::zscoreGamma(goodness\$gof.statistic, shape=goodness\$df/2, scale=2)
624 pdf("edgeR_GoodnessofFit.pdf")
625 qq = qqnorm(z, panel.first=grid(), main="tagwise dispersion")
626 abline(0,1,lwd=3)
627 points(qq\$x[goodness\$outlier],qq\$y[goodness\$outlier], pch=16, col="maroon")
628 dev.off()
629 estpriorn = getPriorN(DGEList)
630 print(paste("Common Dispersion =",comdisp,"CV = ",sqrt(comdisp),"getPriorN = ",estpriorn),quote=F)
631 efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors
632 normData = (1e+06*DGEList\$counts/efflib)
633 uniqueg = unique(group)
634 #### Plot MDS
635 sample_colors = match(group,levels(group))
636 sampleTypes = levels(factor(group))
637 print.noquote(sampleTypes)
638 pdf("edgeR_MDSplot.pdf")
639 plotMDS.DGEList(DGEList,main=paste("edgeR MDS for",myTitle),cex=0.5,col=sample_colors,pch=sample_colors)
640 legend(x="topleft", legend = sampleTypes,col=c(1:length(sampleTypes)), pch=19)
641 grid(col="blue")
642 dev.off()
643 colnames(normData) = paste( colnames(normData),'N',sep="_")
644 print(paste('Raw sample read totals',paste(colSums(nonzerod,na.rm=T),collapse=',')))
645 nzd = data.frame(log(nonzerod + 1e-2,10))
646 try( boxPlot(rawrs=nzd,cleanrs=log(normData,10),maint='TMM Normalisation',myTitle=myTitle,pdfname="edgeR_raw_norm_counts_box.pdf") )
647 write.table(soutput,file=out_edgeR, quote=FALSE, sep="\t",row.names=F)
648 tt = cbind(
649 Name=as.character(rownames(DGEList\$counts)),
650 DE\$table,
651 adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
652 Dispersion=DGEList\$tagwise.dispersion,totreads=cmrowsums
653 )
654 print.noquote("# edgeR Top tags\n")
655 tt = cbind(tt,URL=contigurls) # add to end so table isn't laid out strangely
656 tt = tt[order(DE\$table\$PValue),]
657 print.noquote(tt[1:50,])
658 deTags = rownames(uoutput[uoutput\$adj.p.value < fdrthresh,])
659 nsig = length(deTags)
660 print(paste('#',nsig,'tags significant at adj p=',fdrthresh),quote=F)
661 deColours = ifelse(deTags,'red','black')
662 pdf("edgeR_BCV_vs_abundance.pdf")
663 plotBCV(DGEList, cex=0.3, main="Biological CV vs abundance")
664 dev.off()
665 dg = DGEList[order(DE\$table\$PValue),]
666 #normData = (1e+06 * dg\$counts/expandAsMatrix(dg\$samples\$lib.size, dim(dg)))
667 efflib = dg\$samples\$lib.size*dg\$samples\$norm.factors
668 normData = (1e+06*dg\$counts/efflib)
669 outpdfname="edgeR_top_100_heatmap.pdf"
670 hmap2(normData,nsamp=100,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste('edgeR Heatmap',myTitle))
671 outSmear = "edgeR_smearplot.pdf"
672 outMain = paste("Smear Plot for ",TName,' Vs ',CName,' (FDR@',fdrthresh,' N = ',nsig,')',sep='')
673 smearPlot(DGEList=DGEList,deTags=deTags, outSmear=outSmear, outMain = outMain)
674 qqPlot(descr=paste(myTitle,'edgeR adj p QQ plot'),pvector=tt\$adj.p.value,outpdf='edgeR_qqplot.pdf')
675 norm.factor = DGEList\$samples\$norm.factors
676 topresults.edgeR = soutput[which(soutput\$adj.p.value < fdrthresh), ]
677 edgeRcountsindex = which(allgenes %in% rownames(topresults.edgeR))
678 edgeRcounts = rep(0, length(allgenes))
679 edgeRcounts[edgeRcountsindex] = 1 # Create venn diagram of hits
680 sink()
681 } ### doedgeR
682 if (doDESeq2 == T)
683 {
684 sink("DESeq2.log")
685 # DESeq2
686 require('DESeq2')
687 library('RColorBrewer')
688 if (length(subjects) == 0)
689 {
690 pdata = data.frame(Name=colnames(workCM),Rx=group,row.names=colnames(workCM))
691 deSEQds = DESeqDataSetFromMatrix(countData = workCM, colData = pdata, design = formula(~ Rx))
692 } else {
693 pdata = data.frame(Name=colnames(workCM),Rx=group,subjects=subjects,row.names=colnames(workCM))
694 deSEQds = DESeqDataSetFromMatrix(countData = workCM, colData = pdata, design = formula(~ subjects + Rx))
695 }
696 #DESeq2 = DESeq(deSEQds,fitType='local',pAdjustMethod=fdrtype)
697 #rDESeq = results(DESeq2)
698 #newCountDataSet(workCM, group)
699 deSeqDatsizefac = estimateSizeFactors(deSEQds)
700 deSeqDatdisp = estimateDispersions(deSeqDatsizefac,fitType=DESeq_fitType)
701 resDESeq = nbinomWaldTest(deSeqDatdisp, pAdjustMethod=fdrtype)
702 rDESeq = as.data.frame(results(resDESeq))
703 rDESeq = cbind(Contig=rownames(workCM),rDESeq,NReads=cmrowsums,URL=contigurls)
704 srDESeq = rDESeq[order(rDESeq\$pvalue),]
705 qqPlot(descr=paste(myTitle,'DESeq2 adj p qq plot'),pvector=rDESeq\$padj,outpdf='DESeq2_qqplot.pdf')
706 cat("# DESeq top 50\n")
707 print.noquote(srDESeq[1:50,])
708 write.table(srDESeq,file=out_DESeq2, quote=FALSE, sep="\t",row.names=F)
709 topresults.DESeq = rDESeq[which(rDESeq\$padj < fdrthresh), ]
710 DESeqcountsindex = which(allgenes %in% rownames(topresults.DESeq))
711 DESeqcounts = rep(0, length(allgenes))
712 DESeqcounts[DESeqcountsindex] = 1
713 pdf("DESeq2_dispersion_estimates.pdf")
714 plotDispEsts(resDESeq)
715 dev.off()
716 ysmall = abs(min(rDESeq\$log2FoldChange))
717 ybig = abs(max(rDESeq\$log2FoldChange))
718 ylimit = min(4,ysmall,ybig)
719 pdf("DESeq2_MA_plot.pdf")
720 plotMA(resDESeq,main=paste(myTitle,"DESeq2 MA plot"),ylim=c(-ylimit,ylimit))
721 dev.off()
722 rlogres = rlogTransformation(resDESeq)
723 sampledists = dist( t( assay(rlogres) ) )
724 sdmat = as.matrix(sampledists)
725 pdf("DESeq2_sample_distance_plot.pdf")
726 heatmap.2(sdmat,trace="none",main=paste(myTitle,"DESeq2 sample distances"),
727 col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(255))
728 dev.off()
729 ###outpdfname="DESeq2_top50_heatmap.pdf"
730 ###hmap2(sresDESeq,nsamp=50,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste('DESeq2 vst rlog Heatmap',myTitle))
731 sink()
732 result = try( (ppca = plotPCA( varianceStabilizingTransformation(deSeqDatdisp,blind=T), intgroup=c("Rx","Name")) ) )
733 if ("try-error" %in% class(result)) {
734 print.noquote('DESeq2 plotPCA failed.')
735 } else {
736 pdf("DESeq2_PCA_plot.pdf")
737 #### wtf - print? Seems needed to get this to work
738 print(ppca)
739 dev.off()
740 }
741 }
742
743 if (doVoom == T) {
744 sink('VOOM.log')
745 if (doedgeR == F) {
746 #### Setup DGEList object
747 DGEList = DGEList(counts=workCM, group = group)
748 DGEList = calcNormFactors(DGEList)
749 DGEList = estimateGLMCommonDisp(DGEList,mydesign)
750 DGEList = estimateGLMTrendedDisp(DGEList,mydesign)
751 DGEList = estimateGLMTagwiseDisp(DGEList,mydesign)
752 DGEList = estimateGLMTagwiseDisp(DGEList,mydesign)
753 norm.factor = DGEList\$samples\$norm.factors
754 }
755 pdf("VOOM_mean_variance_plot.pdf")
756 dat.voomed = voom(DGEList, mydesign, plot = TRUE, lib.size = colSums(workCM) * norm.factor)
757 dev.off()
758 # Use limma to fit data
759 fit = lmFit(dat.voomed, mydesign)
760 fit = eBayes(fit)
761 rvoom = topTable(fit, coef = length(colnames(mydesign)), adj = fdrtype, n = Inf, sort="none")
762 qqPlot(descr=paste(myTitle,'VOOM-limma adj p QQ plot'),pvector=rvoom\$adj.P.Val,outpdf='VOOM_qqplot.pdf')
763 rownames(rvoom) = rownames(workCM)
764 rvoom = cbind(rvoom,NReads=cmrowsums,URL=contigurls)
765 srvoom = rvoom[order(rvoom\$P.Value),]
766 cat("# VOOM top 50\n")
767 print(srvoom[1:50,])
768 write.table(srvoom,file=out_VOOM, quote=FALSE, sep="\t",row.names=F)
769 # Use an FDR cutoff to find interesting samples for edgeR, DESeq and voom/limma
770 topresults.voom = rvoom[which(rvoom\$adj.P.Val < fdrthresh), ]
771 voomcountsindex = which(allgenes %in% topresults.voom\$ID)
772 voomcounts = rep(0, length(allgenes))
773 voomcounts[voomcountsindex] = 1
774 sink()
775 }
776
777 if (doCamera) {
778 doGSEA(y=DGEList,design=mydesign,histgmt=histgmt,bigmt=bigmt,ntest=20,myTitle=myTitle,
779 outfname=paste(mt,"GSEA.xls",sep="_"),fdrthresh=fdrthresh,fdrtype=fdrtype)
780 }
781
782 if ((doDESeq2==T) || (doVoom==T) || (doedgeR==T)) {
783 if ((doVoom==T) && (doDESeq2==T) && (doedgeR==T)) {
784 vennmain = paste(mt,'Voom,edgeR and DESeq2 overlap at FDR=',fdrthresh)
785 counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts,
786 VOOM_limma = voomcounts, row.names = allgenes)
787 } else if ((doDESeq2==T) && (doedgeR==T)) {
788 vennmain = paste(mt,'DESeq2 and edgeR overlap at FDR=',fdrthresh)
789 counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts, row.names = allgenes)
790 } else if ((doVoom==T) && (doedgeR==T)) {
791 vennmain = paste(mt,'Voom and edgeR overlap at FDR=',fdrthresh)
792 counts.dataframe = data.frame(edgeR = edgeRcounts, VOOM_limma = voomcounts, row.names = allgenes)
793 }
794
795 if (nrow(counts.dataframe > 1)) {
796 counts.venn = vennCounts(counts.dataframe)
797 vennf = "Venn_significant_genes_overlap.pdf"
798 pdf(vennf)
799 vennDiagram(counts.venn,main=vennmain,col="maroon")
800 dev.off()
801 }
802 } #### doDESeq2 or doVoom
803
804 }
805 #### Done
806
807 ###sink(stdout(),append=T,type="message")
808 builtin_gmt = "" # driver section: template values are copied into R variables, the count matrix is prepared, then edgeIt is called
809 history_gmt = "" # both gmt settings stay empty; they are handed to edgeIt as bigmt and histgmt
810 history_gmt_name = ""
811 out_edgeR = F # stays F unless edgeR is selected below
812 out_DESeq2 = F # stays F unless DESeq2 is selected below
813 out_VOOM = "$out_VOOM" # assigned here unconditionally and again inside the doVoom conditional below
814 doDESeq2 = $DESeq2.doDESeq2 # make these T or F
815 doVoom = $doVoom
816 doCamera = F # fixed to F in this script
817 doedgeR = $edgeR.doedgeR
818 edgeR_priordf = 0 # default; replaced below when edgeR is selected
819
820
821 #if $doVoom == "T":
822 out_VOOM = "$out_VOOM"
823 #end if
824
825 #if $DESeq2.doDESeq2 == "T":
826 out_DESeq2 = "$out_DESeq2"
827 DESeq_fitType = "$DESeq2.DESeq_fitType" # only defined when DESeq2 is selected
828 #end if
829
830 #if $edgeR.doedgeR == "T":
831 out_edgeR = "$out_edgeR"
832 edgeR_priordf = $edgeR.edgeR_priordf
833 #end if
834
835
836 if (sum(c(doedgeR,doVoom,doDESeq2)) == 0) # true when none of the three methods was selected
837 {
838 write("No methods chosen - nothing to do! Please try again after choosing one or more methods", stderr())
839 quit(save="no",status=2) # stop with exit status 2
840 }
841
842 Out_Dir = "$html_file.files_path"
843 Input = "$input1"
844 TreatmentName = "$treatment_name"
845 TreatmentCols = "$Treat_cols"
846 ControlName = "$control_name"
847 ControlCols= "$Control_cols"
848 org = "$input1.dbkey"
849 if (org == "") { org = "hg19"} # fall back to hg19 when the dbkey string is empty
850 fdrtype = "$fdrtype" # NOTE(review): the edgeIt call below passes the literal 'BH', not this value - confirm intended
851 fdrthresh = $fdrthresh
852 useNDF = $useNDF # NOTE(review): the edgeIt call below passes the literal F, not this value - confirm intended
853 fQ = $fQ # non-differential centile cutoff; passed to edgeIt as filterquantile
854 myTitle = "$title"
855 sids = strsplit("$subjectids",',') # split the comma separated subject id string from the form
856 subjects = unlist(sids)
857 nsubj = length(subjects)
858 TCols = as.numeric(strsplit(TreatmentCols,",")[[1]])-1 # minus 1 - presumably because column 1 of the input becomes the row names when read below; confirm
859 CCols = as.numeric(strsplit(ControlCols,",")[[1]])-1 # same shift as for TCols
860 cat('Got TCols=')
861 cat(TCols)
862 cat('; CCols=')
863 cat(CCols)
864 cat('\n')
865 useCols = c(TCols,CCols) # treatment columns first, then control columns
866 if (file.exists(Out_Dir) == F) dir.create(Out_Dir)
867 Count_Matrix = read.table(Input,header=T,row.names=1,sep='\t') # load the tab separated input; first line is the header, first column the row names
868 snames = colnames(Count_Matrix)
869 nsamples = length(snames)
870 if (nsubj > 0 & nsubj != nsamples) { # a supplied subject list must have one entry per column of the matrix
871 options("show.error.messages"=T)
872 mess = paste('Fatal error: Supplied subject id list',paste(subjects,collapse=','),
873 'has length',nsubj,'but there are',nsamples,'samples',paste(snames,collapse=','))
874 write(mess, stderr())
875 quit(save="no",status=4) # stop with exit status 4
876 }
877 if (length(subjects) != 0) {subjects = subjects[useCols]} # keep subject ids for the selected columns only, in useCols order
878 Count_Matrix = Count_Matrix[,useCols] ### keep the selected columns only, treatment first then control
879 rn = rownames(Count_Matrix)
880 islib = rn %in% c('librarySize','NotInBedRegions') # flags rows carrying these two names
881 LibSizes = Count_Matrix[subset(rn,islib),][1] # take first; LibSizes is not referenced again in this section
882 Count_Matrix = Count_Matrix[subset(rn,! islib),] # drop the flagged rows from the count matrix
883 group = c(rep(TreatmentName,length(TCols)), rep(ControlName,length(CCols)) ) # build a group label per column, in the same order as useCols
884 group = factor(group, levels=c(ControlName,TreatmentName)) # control is the first factor level
885 colnames(Count_Matrix) = paste(group,colnames(Count_Matrix),sep="_") # relabel columns as group_originalname
886 results = edgeIt(Count_Matrix=Count_Matrix,group=group, out_edgeR=out_edgeR, out_VOOM=out_VOOM, out_DESeq2=out_DESeq2,
887 fdrtype='BH',mydesign=NULL,priordf=edgeR_priordf,fdrthresh=fdrthresh,outputdir='.',
888 myTitle=myTitle,useNDF=F,libSize=c(),filterquantile=fQ,subjects=subjects,
889 doDESeq2=doDESeq2,doVoom=doVoom,doCamera=doCamera,doedgeR=doedgeR,org=org,
890 histgmt=history_gmt,bigmt=builtin_gmt,DESeq_fitType=DESeq_fitType)
891 sessionInfo()
892 ]]>
893 </configfile>
894 </configfiles>
895 <help>
896
897 **What it does**
898
899 Allows short read sequence counts from controlled experiments to be analysed for differentially expressed genes.
900 Optionally adds a term for subject if not all samples are independent or if some other factor needs to be blocked in the design.
901
902 **Input**
903
904 Requires a count matrix as a tabular file. These are best made using the companion HTSeq_ based counter Galaxy wrapper
905 and your favourite gene model to generate inputs. Each row is a genomic feature (eg a gene or exon) and each column the
906 non-negative integer count of reads from one sample overlapping the feature.
907 The matrix must have a header row uniquely identifying the source samples, and unique row names in
908 the first column. Typically the row names are gene symbols or probe ids for downstream use in GSEA and other methods.
909
910 **Specifying comparisons**
911
912 This is basically dumbed down for two factors - case vs control.
913
914 More complex interfaces are possible but painful at present.
915 Probably need to specify a phenotype file to do this better.
916 Work in progress. Send code.
917
918 If you have (eg) paired samples and wish to include a term in the GLM to account for some other factor (subject in the case of paired samples),
919 supply a comma separated list of integer indicators, one for every sample (whether modelled or not!), giving (eg) the subject number,
920 or leave the field as an empty string if all samples are independent.
921 If not empty, there must be exactly as many integers in the supplied integer list as there are columns (samples) in the count matrix.
922 Integers for samples that are not in the analysis *must* be present in the string as filler even if not used.
923
924 So if you have 2 pairs out of 6 samples, you need to put in unique integers for the unpaired ones.
925 For example, if you had 6 samples with the first two independent but the second and third pairs each being from independent subjects, you might use
926 8,9,1,1,2,2
927 as subject IDs to indicate two paired samples from the same subject in columns 3/4 and 5/6
928
929 **Methods available**
930
931 You can run 3 popular Bioconductor packages available for count data.
932
933 edgeR - see edgeR_ for details
934
935 VOOM/limma - see limma_VOOM_ for details
936
937 DESeq2 - see DESeq2_ for details
938
939 and optionally camera in edgeR which works better if MSigDB is installed.
940
941 **Outputs**
942
943 Some helpful plots and analysis results. Note that most of these are produced using R code
944 suggested by the excellent documentation and vignettes for the Bioconductor
945 packages invoked. The Tool Factory is used to automatically lay these out for you to enjoy.
946
947 **Note on Voom**
948
949 The voom from limma version 3.16.6 help in R includes this from the authors - but you should read the paper to interpret this method.
950
951 This function is intended to process RNA-Seq or ChIP-Seq data prior to linear modelling in limma.
952
953 voom is an acronym for mean-variance modelling at the observational level.
954 The key concern is to estimate the mean-variance relationship in the data, then use this to compute appropriate weights for each observation.
955 Count data almost always show non-trivial mean-variance relationships. Raw counts show increasing variance with increasing count size, while log-counts typically show a decreasing mean-variance trend.
956 This function estimates the mean-variance trend for log-counts, then assigns a weight to each observation based on its predicted variance.
957 The weights are then used in the linear modelling process to adjust for heteroscedasticity.
958
959 In an experiment, a count value is observed for each tag in each sample. A tag-wise mean-variance trend is computed using lowess.
960 The tag-wise mean is the mean log2 count with an offset of 0.5, across samples for a given tag.
961 The tag-wise variance is the quarter-root-variance of normalized log2 counts per million values with an offset of 0.5, across samples for a given tag.
962 Tags with zero counts across all samples are not included in the lowess fit. Optional normalization is performed using normalizeBetweenArrays.
963 Using fitted values of log2 counts from a linear model fit by lmFit, variances from the mean-variance trend were interpolated for each observation.
964 This was carried out by approxfun. Inverse variance weights can be used to correct for mean-variance trend in the count data.
965
966
967 Author(s)
968
969 Charity Law and Gordon Smyth
970
971 References
972
973 Law, CW (2013). Precision weights for gene expression analysis. PhD Thesis. University of Melbourne, Australia.
974
975 Law, CW, Chen, Y, Shi, W, Smyth, GK (2013). Voom! Precision weights unlock linear model analysis tools for RNA-seq read counts.
976 Technical Report 1 May 2013, Bioinformatics Division, Walter and Eliza Hall Institute of Medical Research, Melbourne, Australia.
977 http://www.statsci.org/smyth/pubs/VoomPreprint.pdf
978
979 See Also
980
981 A voom case study is given in the edgeR User's Guide.
982
983 vooma is a similar function but for microarrays instead of RNA-seq.
984
985
986 ***old rant on changes to Bioconductor package variable names between versions***
987
988 The edgeR authors made a small cosmetic change in the name of one important variable (from p.value to PValue)
989 breaking this and all other code that assumed the old name for this variable,
990 between edgeR2.4.4 and 2.4.6 (the version for R 2.14 as at the time of writing).
991 This means that all code using edgeR is sensitive to the version. I think this was a very unwise thing
992 to do because it wasted hours of my time to track down and will similarly cost other edgeR users dearly
993 when their old scripts break. This tool currently works with 2.4.6.
994
995 **Note on prior.n**
996
997 http://seqanswers.com/forums/showthread.php?t=5591 says:
998
999 *prior.n*
1000
1001 The value for prior.n determines the amount of smoothing of tagwise dispersions towards the common dispersion.
1002 You can think of it as like a "weight" for the common value. (It is actually the weight for the common likelihood
1003 in the weighted likelihood equation). The larger the value for prior.n, the more smoothing, i.e. the closer your
1004 tagwise dispersion estimates will be to the common dispersion. If you use a prior.n of 1, then that gives the
1005 common likelihood the weight of one observation.
1006
1007 In answer to your question, it is a good thing to squeeze the tagwise dispersions towards a common value,
1008 or else you will be using very unreliable estimates of the dispersion. I would not recommend using the value that
1009 you obtained from estimateSmoothing()---this is far too small and would result in virtually no moderation
1010 (squeezing) of the tagwise dispersions. How many samples do you have in your experiment?
1011 What is the experimental design? If you have few samples (less than 6) then I would suggest a prior.n of at least 10.
1012 If you have more samples, then the tagwise dispersion estimates will be more reliable,
1013 so you could consider using a smaller prior.n, although I would hesitate to use a prior.n less than 5.
1014
1015
1016 From Bioconductor Digest, Vol 118, Issue 5, Gordon writes:
1017
1018 Dear Dorota,
1019
1020 The important settings are prior.df and trend.
1021
1022 prior.n and prior.df are related through prior.df = prior.n * residual.df,
1023 and your experiment has residual.df = 36 - 12 = 24. So the old setting of
1024 prior.n=10 is equivalent for your data to prior.df = 240, a very large
1025 value. Going the other way, the new setting of prior.df=10 is equivalent
1026 to prior.n=10/24.
1027
1028 To recover old results with the current software you would use
1029
1030 estimateTagwiseDisp(object, prior.df=240, trend="none")
1031
1032 To get the new default from old software you would use
1033
1034 estimateTagwiseDisp(object, prior.n=10/24, trend=TRUE)
1035
1036 Actually the old trend method is equivalent to trend="loess" in the new
1037 software. You should use plotBCV(object) to see whether a trend is
1038 required.
1039
1040 Note you could also use
1041
1042 prior.n = getPriorN(object, prior.df=10)
1043
1044 to map between prior.df and prior.n.
1045
1046 ----
1047
1048 **Attributions**
1049
1050 edgeR - edgeR_
1051
1052 VOOM/limma - limma_VOOM_
1053
1054 DESeq2 - DESeq2_ for details
1055
1056 See above for Bioconductor package documentation for packages exposed in Galaxy by this tool and app store package.
1057
1058 Galaxy_ (that's what you are using right now!) for gluing everything together
1059
1060 Otherwise, all code and documentation comprising this tool was written by Ross Lazarus and is
1061 licensed to you under the LGPL_ like other rgenetics artefacts
1062
1063 .. _LGPL: http://www.gnu.org/copyleft/lesser.html
1064 .. _HTSeq: http://www-huber.embl.de/users/anders/HTSeq/doc/index.html
1065 .. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html
1066 .. _DESeq2: http://www.bioconductor.org/packages/release/bioc/html/DESeq2.html
1067 .. _limma_VOOM: http://www.bioconductor.org/packages/release/bioc/html/limma.html
1068 .. _Galaxy: http://getgalaxy.org
1069 </help>
1070
1071 </tool>
1072
1073