comparison rgedgeRpaired_nocamera.xml @ 27:ddd76b6db251 draft

Uploaded
author fubar
date Wed, 07 Aug 2013 02:10:19 -0400
parents c0fa3dde02d9
children
comparison
equal deleted inserted replaced
26:3ae79b4f8767 27:ddd76b6db251
1 <tool id="rgDifferentialCount" name="Differential_Count" version="0.20">
2 <description>models using BioConductor packages</description>
3 <requirements>
4 <requirement type="package" version="2.12">biocbasics</requirement>
5 <requirement type="package" version="3.0.1">r3</requirement>
6 <requirement type="package" version="1.3.18">graphicsmagick</requirement>
7 <requirement type="package" version="9.07">ghostscript</requirement>
8 </requirements>
9
10 <command interpreter="python">
11 rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "DifferentialCounts"
12 --output_dir "$html_file.files_path" --output_html "$html_file" --make_HTML "yes"
13 </command>
14 <inputs>
15 <param name="input1" type="data" format="tabular" label="Select an input matrix - rows are contigs, columns are counts for each sample"
16 help="Use the HTSeq based count matrix preparation tool to create these matrices from BAM/SAM files and a GTF file of genomic features"/>
17 <param name="title" type="text" value="Differential Counts" size="80" label="Title for job outputs"
18 help="Supply a meaningful name here to remind you what the outputs contain">
19 <sanitizer invalid_char="">
20 <valid initial="string.letters,string.digits"><add value="_" /> </valid>
21 </sanitizer>
22 </param>
23 <param name="treatment_name" type="text" value="Treatment" size="50" label="Treatment Name"/>
24 <param name="Treat_cols" label="Select columns containing treatment." type="data_column" data_ref="input1" numerical="True"
25 multiple="true" use_header_names="true" size="120" display="checkboxes">
26 <validator type="no_options" message="Please select at least one column."/>
27 </param>
28 <param name="control_name" type="text" value="Control" size="50" label="Control Name"/>
29 <param name="Control_cols" label="Select columns containing control." type="data_column" data_ref="input1" numerical="True"
30 multiple="true" use_header_names="true" size="120" display="checkboxes" optional="true">
31 </param>
32 <param name="subjectids" type="text" optional="true" size="120" value = ""
33 label="IF SUBJECTS NOT ALL INDEPENDENT! Enter comma separated strings to indicate sample labels for (eg) pairing - must be one for every column in input"
34 help="Leave blank if no pairing, but eg if data from sample id A99 is in columns 2,4 and id C21 is in 3,5 then enter 'A99,C21,A99,C21'">
35 <sanitizer>
36 <valid initial="string.letters,string.digits"><add value="," /> </valid>
37 </sanitizer>
38 </param>
39 <param name="fQ" type="float" value="0.3" size="5" label="Non-differential contig count quantile threshold - zero to analyze all non-zero read count contigs"
40 help="May be a good or a bad idea depending on the biology and the question. EG 0.3 = sparsest 30% of contigs with at least one read are removed before analysis"/>
41 <param name="useNDF" type="boolean" truevalue="T" falsevalue="F" checked="false" size="1"
42 label="Non differential filter - remove contigs below a threshold (1 per million) for half or more samples"
43 help="May be a good or a bad idea depending on the biology and the question. This was the old default. Quantile based is available as an alternative"/>
44
45 <conditional name="edgeR">
46 <param name="doedgeR" type="select"
47 label="Run this model using edgeR"
48 help="edgeR uses a negative binomial model and seems to be powerful, even with few replicates">
49 <option value="F">Do not run edgeR</option>
50 <option value="T" selected="true">Run edgeR</option>
51 </param>
52 <when value="T">
53 <param name="edgeR_priordf" type="integer" value="20" size="3"
54 label="prior.df for tagwise dispersion - lower value = more emphasis on each tag's variance. Replaces prior.n and prior.df = prior.n * residual.df"
55 help="0 = Use edgeR default. Use a small value to 'smooth' small samples. See edgeR docs and note below"/>
56 </when>
57 <when value="F"></when>
58 </conditional>
59 <conditional name="DESeq2">
60 <param name="doDESeq2" type="select"
61 label="Run the same model with DESeq2 and compare findings"
62 help="DESeq2 is an update to the DESeq package. It uses different assumptions and methods to edgeR">
63 <option value="F" selected="true">Do not run DESeq2</option>
64 <option value="T">Run DESeq2</option>
65 </param>
66 <when value="T">
67 <param name="DESeq_fitType" type="select">
68 <option value="parametric" selected="true">Parametric (default) fit for dispersions</option>
69 <option value="local">Local fit - this will automagically be used if parametric fit fails</option>
70 <option value="mean">Mean dispersion fit- use this if you really understand what you're doing - read the fine manual linked below in the documentation</option>
71 </param>
72 </when>
73 <when value="F"> </when>
74 </conditional>
75 <param name="doVoom" type="select"
76 label="Run the same model with Voom/limma and compare findings"
77 help="Voom uses counts per million and a precise transformation of variance so count data can be analysed using limma">
78 <option value="F" selected="true">Do not run VOOM</option>
79 <option value="T">Run VOOM</option>
80 </param>
81 <!--
82 <conditional name="camera">
83 <param name="doCamera" type="select" label="Run the edgeR implementation of Camera GSEA for up/down gene sets"
84 help="If yes, you can choose a set of genesets to test and/or supply a gmt format geneset collection from your history">
85 <option value="F" selected="true">Do not run GSEA tests with the Camera algorithm</option>
86 <option value="T">Run GSEA tests with the Camera algorithm</option>
87 </param>
88 <when value="T">
89 <conditional name="gmtSource">
90 <param name="refgmtSource" type="select"
91 label="Use a gene set (.gmt) from your history and/or use a built-in (MSigDB etc) gene set">
92 <option value="indexed" selected="true">Use a built-in gene set</option>
93 <option value="history">Use a gene set from my history</option>
94 <option value="both">Add a gene set from my history to a built in gene set</option>
95 </param>
96 <when value="indexed">
97 <param name="builtinGMT" type="select" label="Select a gene set matrix (.gmt) file to use for the analysis">
98 <options from_data_table="gseaGMT_3.1">
99 <filter type="sort_by" column="2" />
100 <validator type="no_options" message="No GMT v3.1 files are available - please install them"/>
101 </options>
102 </param>
103 </when>
104 <when value="history">
105 <param name="ownGMT" type="data" format="gmt" label="Select a Gene Set from your history" />
106 </when>
107 <when value="both">
108 <param name="ownGMT" type="data" format="gseagmt" label="Select a Gene Set from your history" />
109 <param name="builtinGMT" type="select" label="Select a gene set matrix (.gmt) file to use for the analysis">
110 <options from_data_table="gseaGMT_4">
111 <filter type="sort_by" column="2" />
112 <validator type="no_options" message="No GMT v4 files are available - please fix tool_data_table and loc files"/>
113 </options>
114 </param>
115 </when>
116 </conditional>
117 </when>
118 <when value="F">
119 </when>
120 </conditional>
121 -->
122 <param name="fdrthresh" type="float" value="0.05" size="5" label="P value threshold for FDR filtering for family wise error rate control"
123 help="Conventional default value of 0.05 recommended"/>
124 <param name="fdrtype" type="select" label="FDR (Type I error) control method"
125 help="Use fdr or bh typically to control for the number of tests in a reliable way">
126 <option value="fdr" selected="true">fdr</option>
127 <option value="BH">Benjamini Hochberg</option>
128 <option value="BY">Benjamini Yekutieli</option>
129 <option value="bonferroni">Bonferroni</option>
130 <option value="hochberg">Hochberg</option>
131 <option value="holm">Holm</option>
132 <option value="hommel">Hommel</option>
133 <option value="none">no control for multiple tests</option>
134 </param>
135 </inputs>
136 <outputs>
137 <data format="tabular" name="out_edgeR" label="${title}_topTable_edgeR.xls">
138 <filter>edgeR['doedgeR'] == "T"</filter>
139 </data>
140 <data format="tabular" name="out_DESeq2" label="${title}_topTable_DESeq2.xls">
141 <filter>DESeq2['doDESeq2'] == "T"</filter>
142 </data>
143 <data format="tabular" name="out_VOOM" label="${title}_topTable_VOOM.xls">
144 <filter>doVoom == "T"</filter>
145 </data>
146 <data format="html" name="html_file" label="${title}.html"/>
147 </outputs>
148 <stdio>
149 <exit_code range="4" level="fatal" description="Number of subject ids must match total number of samples in the input matrix" />
150 </stdio>
151 <tests>
152 <test>
153 <param name='input1' value='test_bams2mx.xls' ftype='tabular' />
154 <param name='treatment_name' value='liver' />
155 <param name='title' value='edgeRtest' />
156 <param name='useNDF' value='' />
157 <param name='doedgeR' value='T' />
158 <param name='doVoom' value='T' />
159 <param name='doDESeq2' value='T' />
160 <param name='fdrtype' value='fdr' />
161 <param name='edgeR_priordf' value="8" />
162 <param name='fdrthresh' value="0.05" />
163 <param name='control_name' value='heart' />
164 <param name='subjectids' value='' />
165 <param name='Control_cols' value='3,4,5,9' />
166 <param name='Treat_cols' value='2,6,7,8' />
167 <output name='out_edgeR' file='edgeRtest1out.xls' compare='diff' />
168 <output name='html_file' file='edgeRtest1out.html' compare='diff' lines_diff='20' />
169 </test>
170 </tests>
171
172 <configfiles>
173 <configfile name="runme">
174 <![CDATA[
175 #
176 # edgeR.Rscript
177 # updated npv 2011 for R 2.14.0 and edgeR 2.4.0 by ross
178 # Performs DGE on a count table containing n replicates of two conditions
179 #
180 # Parameters
181 #
182 # 1 - Output Dir
183
184 # Original edgeR code by: S.Lunke and A.Kaspi
185 reallybig = log10(.Machine\$double.xmax)
186 reallysmall = log10(.Machine\$double.xmin)
187 library('stringr')
188 library('gplots')
189 library('edgeR')
hmap2 = function(cmat,nsamp=100,outpdfname='heatmap2.pdf', TName='Treatment',group=NA,myTitle='title goes here')
{
    # Draw a row-scaled heatmap.2 of at most the first nsamp rows of cmat into the pdf outpdfname.
    #   cmat       - matrix with one column per sample; rows are plotted in the order supplied
    #   nsamp      - maximum number of rows to show
    #   outpdfname - path of the pdf file to write
    #   TName      - kept so existing callers still work; not used in this function
    #   group      - one label per column of cmat, used to colour the column side bar
    #   myTitle    - main title of the plot
    # Returns the value of dev.off(), as before.
    gu = unique(group)
    gn = rownames(cmat)
    if (length(gu) == 2) {
        # exactly two groups: the first one seen is red, the other blue
        pcols = ifelse(group == gu[1], "#FF0000", "#0000FF")
    } else {
        colours = rainbow(length(gu),start=0,end=4/6)
        pcols = colours[match(group,gu)]
    }
    # remove unlabelled rows; a matrix without any rownames is kept whole
    # rather than being reduced to zero rows
    if (is.null(gn)) {
        keep = rep(TRUE,nrow(cmat))
    } else {
        keep = ! is.na(gn)
    }
    # drop=FALSE keeps a matrix even when a single row survives
    dm = cmat[keep,,drop=FALSE]
    if (nrow(dm) > nsamp) {
        dm = dm[1:nsamp,,drop=FALSE]
    }
    # long sample names are truncated so the column labels fit
    colnames(dm) = substr(colnames(dm),1,20)
    pdf(outpdfname)
    # close the pdf device before re-raising, so a failed plot does not leave it open
    tryCatch(
        heatmap.2(dm,main=myTitle,ColSideColors=pcols,col=topo.colors(100),dendrogram="col",key=T,density.info='none',
            Rowv=F,scale='row',trace='none',margins=c(8,8),cexRow=0.4,cexCol=0.5),
        error = function(e) { dev.off(); stop(e) })
    dev.off()
}
217
hmap = function(cmat,nmeans=4,outpdfname="heatMap.pdf",nsamp=250,TName='Treatment',group=NA,myTitle="Title goes here")
{
    # Write a row-scaled base heatmap of cmat (at most its first nsamp rows) to the pdf outpdfname.
    # Columns get a side colour by their entry in group; rows are not reordered (Rowv=NA).
    # nmeans and TName are accepted but not used here.
    levs = unique(group)
    huecols = rainbow(length(levs),start=0.3,end=0.6)
    sidecols = huecols[match(group,levs)]
    total = nrow(cmat)
    if (total > nsamp) {
        # too many rows to show: keep the leading nsamp and say so in the title
        plotmat = cmat[1:nsamp,]
        heading = paste('Heatmap: Top ',nsamp,' DE contigs (of ',total,')',sep='')
    } else {
        plotmat = cmat
        heading = paste(myTitle,'Heatmap: n contigs =',total)
    }
    # column labels are cut to 20 characters
    colnames(plotmat) = substr(colnames(plotmat),1,20)
    pdf(outpdfname)
    heatmap(plotmat,scale='row',main=heading,cexRow=0.3,cexCol=0.4,Rowv=NA,ColSideColors=sidecols)
    dev.off()
}
238
qqPlot = function(descr='qqplot',pvector, outpdf='qqplot.pdf',...)
# Write a QQ plot of -log10(p) for pvector to the pdf outpdf, titled descr.
# Extra arguments in ... are handed to plot().
# stolen from https://gist.github.com/703512
{
    observed = -log10(sort(pvector,decreasing=F))
    npts = length(observed)
    # expected quantiles when the sorted p values are uniform on (0,1]
    expected = -log10( (1:npts)/npts )
    # infinite values are swapped for the file-level reallysmall / reallybig constants
    observed[observed==-Inf] = reallysmall
    observed[observed==Inf] = reallybig
    pdf(outpdf)
    plot(expected,observed,pch=19,cex=1, main=descr, ...,
        xlab=expression(Expected~~-log[10](italic(p))),
        ylab=expression(Observed~~-log[10](italic(p))),
        xlim=c(0,max(expected)), ylim=c(0,max(observed)))
    # identity line: where the points fall if observed matches expected
    lines(expected,expected,col="red")
    grid(col = "lightgray", lty = "dotted")
    dev.off()
}
256
smearPlot = function(DGEList,deTags, outSmear, outMain)
{
    # Write a plotSmear plot of DGEList to the pdf outSmear, passing deTags as
    # de.tags and outMain as the title, then overlay a dotted grid.
    pdf(file=outSmear)
    plotSmear(DGEList,main=outMain,de.tags=deTags)
    grid(lty="dotted",col="lightgray")
    dev.off()
}
264
boxPlot = function(rawrs,cleanrs,maint,myTitle,pdfname)
{
    # Summarise and plot per-sample count distributions before and after normalisation.
    #   rawrs   - raw (log) counts, one column per sample; negative entries are set to NA
    #   cleanrs - normalised (log) counts, one column per sample
    #   maint   - text appended to the plot titles
    #   myTitle - accepted but not used in this function
    #   pdfname - pdf that receives the side by side raw / normalised boxplots
    # Also writes one histogram per raw column to "sample_counts_histogram.pdf".
    # Returns NULL invisibly.
    #
    # No par() save/restore is done: par() called while no device is open starts the
    # default device (an unwanted Rplots.pdf under Rscript), and the mfrow settings
    # below only ever apply to pdf devices that this function closes itself.
    nc = ncol(rawrs)
    # which() ignores NA comparisons, so columns that already hold NA are handled too
    for (i in 1:nc) { rawrs[which(rawrs[,i] < 0),i] = NA }
    fullnames = colnames(rawrs)
    colnames(rawrs) = substr(colnames(rawrs),1,20)
    colnames(cleanrs) = substr(colnames(cleanrs),1,20)
    print.noquote('raw contig counts by sample:')
    print.noquote(summary(rawrs))
    print.noquote('normalised contig counts by sample:')
    print.noquote(summary(cleanrs))
    pdf(pdfname)
    # on failure close the device before re-raising so it is not left open
    tryCatch({
        par(mfrow=c(1,2))
        boxplot(rawrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('Raw:',maint))
        grid(col="lightgray",lty="dotted")
        boxplot(cleanrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('After ',maint))
        grid(col="lightgray",lty="dotted")
    }, error = function(e) { dev.off(); stop(e) })
    dev.off()
    pdfname = "sample_counts_histogram.pdf"
    print.noquote(paste('Using ncol rawrs=',nc))
    # smallest square layout with room for one panel per sample
    ncroot = ceiling(sqrt(nc))
    # tallest bar over all samples, so every panel shares one y axis
    ymax = max(sapply(1:nc, function(i) max(hist(rawrs[,i],breaks=100,plot=F)\$counts)))
    ncols = length(fullnames)
    if (ncols > 20)
    {
        # enlarge the page in proportion to the number of samples
        scale = 7*ncols/20
        pdf(pdfname,width=scale,height=scale)
    } else {
        pdf(pdfname)
    }
    tryCatch({
        par(mfrow=c(ncroot,ncroot))
        for (i in 1:nc) {
            hist(rawrs[,i], main=paste("Contig logcount",i), xlab='log raw count', col="maroon",
                breaks=100,sub=fullnames[i],cex=0.8,ylim=c(0,ymax))
        }
    }, error = function(e) { dev.off(); stop(e) })
    dev.off()
    invisible(NULL)
}
314
cumPlot = function(rawrs,cleanrs,maint,myTitle)
{
    # Write two stacked histograms of log10 row read totals, before (rawrs) and after
    # (cleanrs) filtering, to "Filtering_rowsum_bar_charts.pdf".
    #   maint   - text appended to the 'Before:' / 'After:' titles
    #   myTitle - used as the sub title of both panels
    # Both panels share the x range 0 .. max(log10(rawrs)).
    # Returns NULL invisibly.
    #
    # No par() save/restore is done: par() called while no device is open starts the
    # default device (an unwanted Rplots.pdf under Rscript), and the mfrow setting
    # below only applies to the pdf device that this function closes itself.
    pdfname = "Filtering_rowsum_bar_charts.pdf"
    lraw = log(rawrs,10)
    lclean = log(cleanrs,10)
    lim = max(lraw)
    pdf(pdfname)
    # on failure close the device before re-raising so it is not left open
    tryCatch({
        par(mfrow=c(2,1))
        hist(lraw,breaks=100,main=paste('Before:',maint),xlab="# Reads (log)",
            ylab="Count",col="maroon",sub=myTitle, xlim=c(0,lim),las=1)
        grid(col="lightgray", lty="dotted")
        hist(lclean,breaks=100,main=paste('After:',maint),xlab="# Reads (log)",
            ylab="Count",col="maroon",sub=myTitle,xlim=c(0,lim),las=1)
        grid(col="lightgray", lty="dotted")
    }, error = function(e) { dev.off(); stop(e) })
    dev.off()
    invisible(NULL)
}
333
cumPlot1 = function(rawrs,cleanrs,maint,myTitle)
{
    # Write two stacked cumulative proportion curves of row read totals, before (rawrs)
    # and after (cleanrs) filtering, on a log x axis. The pdf is named from myTitle
    # with spaces removed, followed by "_RowsumCum.pdf".
    outfile = paste(gsub(" ","", myTitle , fixed=TRUE),"RowsumCum.pdf",sep='_')
    pdf(outfile)
    par(mfrow=c(2,1))
    xtop = max(rawrs)
    # knots of the ecdf are the distinct observed totals in increasing order
    rawknots = knots(ecdf(rawrs))
    cleanknots = knots(ecdf(cleanrs))
    cleanprop = (1:length(cleanknots))/length(cleanknots)
    rawprop = (1:length(rawknots))/length(rawknots)
    # one panel: curve plus blue grid, both panels share the same x range
    drawpanel = function(xs,props,when) {
        plot(xs,props,type='l',main=paste(when,maint),xlab="Log Contig Total Reads",
            ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,xtop),sub=myTitle)
        grid(col="blue")
    }
    drawpanel(rawknots,rawprop,'Before')
    drawpanel(cleanknots,cleanprop,'After')
    dev.off()
}
352
353
354
doGSEAold = function(y=NULL,design=NULL,histgmt="",
    bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
    ntest=0, myTitle="myTitle", outfname="GSEA.xls", minnin=5, maxnin=2000,fdrthresh=0.05,fdrtype="BH")
{
    # Older gene set test driver: calls camera() once per gene set.
    #   y, design  - passed unchanged to camera(); rownames(y) are matched, upper-cased,
    #                against the gene symbols of each set
    #   histgmt    - path of a gmt file from the history, "" for none
    #   bigmt      - path of a built in gmt file, "" for none
    #   ntest      - kept for interface compatibility; every gene set read is considered
    #   myTitle    - written as the first line of outfname
    #   outfname   - base name; results go to camera_up_<outfname> and camera_down_<outfname>
    #   minnin, maxnin - a set is tested only when minnin < matched genes < maxnin
    #   fdrthresh, fdrtype - threshold and p.adjust method for the adjusted p values
    # All printed output is diverted to Camera.log while the function runs.
    # Returns NULL invisibly.
    sink('Camera.log')
    # restore normal output even when something below fails
    on.exit(sink(), add=TRUE)
    genesets = c()
    if (bigmt > "")
    {
        genesets = readLines(bigmt)
    }
    if (histgmt > "")
    {
        # c() appends the history sets; rbind() on two character vectors would
        # build a recycled two row matrix instead of one longer list of lines
        genesets = c(genesets,readLines(histgmt))
    }
    print.noquote(paste("@@@read",length(genesets), 'genesets from',histgmt,bigmt))
    genesets = strsplit(genesets,'\t') # tabular. genesetid\tURLorwhatever\tgene_1\t..\tgene_n
    header = paste(myTitle,'edgeR GSEA')
    write(header,file=outfname,append=F)
    urownames = toupper(rownames(y))
    upcam = c()
    downcam = c()
    for (gs in genesets) {
        # a usable line has an id, a url field and at least one gene symbol
        if (length(gs) < 3) { next }
        g = gs[1] # geneset_id
        u = gs[2]
        if (u > "") { u = paste("<a href=\'",u,"\'>",u,"</a>",sep="") }
        glist = toupper(gs[3:length(gs)]) # member gene symbols
        inglist = urownames %in% glist
        nin = sum(inglist)
        if ((nin > minnin) && (nin < maxnin)) {
            camres = camera(y=y,index=inglist,design=design)
            if (! is.null(camres)) {
                rownames(camres) = g # gene set name
                camres = cbind(GeneSet=g,URL=u,camres)
                if (camres\$Direction == "Up")
                {
                    upcam = rbind(upcam,camres)
                } else {
                    downcam = rbind(downcam,camres)
                }
            }
        }
    }
    # sort one direction by p value and add adjusted p values;
    # a direction with no gene sets becomes an empty table instead of an error
    rankcam = function(cam) {
        if (is.null(cam)) { return(data.frame()) }
        cam = cam[order(cam\$PValue),]
        cam\$adjPValue = p.adjust(cam\$PValue,method=fdrtype)
        cam
    }
    uscam = rankcam(upcam)
    nup = max(10,sum((uscam\$adjPValue < fdrthresh)))
    dscam = rankcam(downcam)
    ndown = max(10,sum((dscam\$adjPValue < fdrthresh)))
    write.table(uscam,file=paste('camera_up',outfname,sep='_'),quote=F,sep='\t',row.names=F)
    write.table(dscam,file=paste('camera_down',outfname,sep='_'),quote=F,sep='\t',row.names=F)
    print.noquote(paste('@@@@@ Camera up top',nup,'gene sets:'))
    write.table(head(uscam,nup),file="",quote=F,sep='\t',row.names=F)
    print.noquote(paste('@@@@@ Camera down top',ndown,'gene sets:'))
    write.table(head(dscam,ndown),file="",quote=F,sep='\t',row.names=F)
    invisible(NULL)
}
423
424
425
426
doGSEA = function(y=NULL,design=NULL,histgmt="",
    bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
    ntest=0, myTitle="myTitle", outfname="GSEA.xls", minnin=5, maxnin=2000,fdrthresh=0.05,fdrtype="BH")
{
    # Gene set test driver: collects every eligible gene set and calls camera() once.
    #   y, design  - passed unchanged to camera(); rownames(y) are matched, upper-cased,
    #                against the gene symbols of each set
    #   histgmt    - path of a gmt file from the history, "" for none
    #   bigmt      - path of a built in gmt file, "" for none
    #   ntest      - kept for interface compatibility; every gene set read is considered
    #   myTitle    - written as the first line of outfname
    #   outfname   - base name; results go to camera_up_<outfname> and camera_down_<outfname>
    #   minnin, maxnin - a set is tested only when minnin < matched genes < maxnin
    #   fdrthresh, fdrtype - threshold and p.adjust method for the adjusted p values
    # All printed output is diverted to Camera.log while the function runs.
    # Returns NULL invisibly.
    sink('Camera.log')
    # restore normal output even when something below fails
    on.exit(sink(), add=TRUE)
    genesets = c()
    if (bigmt > "")
    {
        genesets = readLines(bigmt)
    }
    if (histgmt > "")
    {
        # c() appends the history sets; rbind() on two character vectors would
        # build a recycled two row matrix instead of one longer list of lines
        genesets = c(genesets,readLines(histgmt))
    }
    print.noquote(paste("@@@read",length(genesets), 'genesets from',histgmt,bigmt))
    genesets = strsplit(genesets,'\t') # tabular. genesetid\tURLorwhatever\tgene_1\t..\tgene_n
    header = paste(myTitle,'edgeR GSEA')
    write(header,file=outfname,append=F)
    urownames = toupper(rownames(y))
    incam = list() # one logical membership vector per eligible gene set
    urls = c()
    gsids = c()
    for (gs in genesets) {
        # a usable line has an id, a url field and at least one gene symbol
        if (length(gs) < 3) { next }
        gsid = gs[1] # geneset_id
        url = gs[2]
        if (url > "") { url = paste("<a href=\'",url,"\'>",url,"</a>",sep="") }
        glist = toupper(gs[3:length(gs)]) # member gene symbols
        inglist = urownames %in% glist
        nin = sum(inglist)
        if ((nin > minnin) && (nin < maxnin)) {
            # keep each membership vector as its own list element; c() would
            # flatten them all into one long logical vector
            incam[[length(incam)+1]] = inglist
            gsids = c(gsids,gsid)
            urls = c(urls,url)
        }
    }
    if (length(incam) == 0) {
        print.noquote('@@@ no gene sets passed the size filter - Camera not run')
        return(invisible(NULL))
    }
    # unique names so every result row can be traced back to its id and url
    setnames = make.unique(gsids)
    names(incam) = setnames
    allcam = camera(y=y,index=incam,design=design)
    # align ids and urls by row name instead of assuming camera kept the input order
    hit = match(rownames(allcam),setnames)
    allcamres = cbind(geneset=gsids[hit],allcam,URL=urls[hit])
    isup = (allcamres\$Direction == "Up")
    upcam = allcamres[isup,]
    downcam = allcamres[!isup,]
    # sort one direction by p value and add adjusted p values
    rankcam = function(cam) {
        cam = cam[order(cam\$PValue),]
        cam\$adjPValue = p.adjust(cam\$PValue,method=fdrtype)
        cam
    }
    uscam = rankcam(upcam)
    nup = max(10,sum((uscam\$adjPValue < fdrthresh)))
    dscam = rankcam(downcam)
    ndown = max(10,sum((dscam\$adjPValue < fdrthresh)))
    write.table(uscam,file=paste('camera_up',outfname,sep='_'),quote=F,sep='\t',row.names=F)
    write.table(dscam,file=paste('camera_down',outfname,sep='_'),quote=F,sep='\t',row.names=F)
    print.noquote(paste('@@@@@ Camera up top',nup,'gene sets:'))
    write.table(head(uscam,nup),file="",quote=F,sep='\t',row.names=F)
    print.noquote(paste('@@@@@ Camera down top',ndown,'gene sets:'))
    write.table(head(dscam,ndown),file="",quote=F,sep='\t',row.names=F)
    invisible(NULL)
}
507
508
509 edgeIt = function (Count_Matrix=c(),group=c(),out_edgeR=F,out_VOOM=F,out_DESeq2=F,fdrtype='fdr',priordf=5,
510 fdrthresh=0.05,outputdir='.', myTitle='Differential Counts',libSize=c(),useNDF=F,
511 filterquantile=0.2, subjects=c(),mydesign=NULL,
512 doDESeq2=T,doVoom=T,doCamera=T,doedgeR=T,org='hg19',
513 histgmt="", bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
514 doCook=F,DESeq_fitType="parameteric")
515 {
516 # Error handling
517 if (length(unique(group))!=2){
518 print("Number of conditions identified in experiment does not equal 2")
519 q()
520 }
521 require(edgeR)
522 options(width = 512)
523 mt = paste(unlist(strsplit(myTitle,'_')),collapse=" ")
524 allN = nrow(Count_Matrix)
525 nscut = round(ncol(Count_Matrix)/2)
526 colTotmillionreads = colSums(Count_Matrix)/1e6
527 counts.dataframe = as.data.frame(c())
528 rawrs = rowSums(Count_Matrix)
529 nonzerod = Count_Matrix[(rawrs > 0),] # remove all zero count genes
530 nzN = nrow(nonzerod)
531 nzrs = rowSums(nonzerod)
532 zN = allN - nzN
533 print('# Quantiles for non-zero row counts:',quote=F)
534 print(quantile(nzrs,probs=seq(0,1,0.1)),quote=F)
535 if (useNDF == T)
536 {
537 gt1rpin3 = rowSums(Count_Matrix/expandAsMatrix(colTotmillionreads,dim(Count_Matrix)) >= 1) >= nscut
538 lo = colSums(Count_Matrix[!gt1rpin3,])
539 workCM = Count_Matrix[gt1rpin3,]
540 cleanrs = rowSums(workCM)
541 cleanN = length(cleanrs)
542 meth = paste( "After removing",length(lo),"contigs with fewer than ",nscut," sample read counts >= 1 per million, there are",sep="")
543 print(paste("Read",allN,"contigs. Removed",zN,"contigs with no reads.",meth,cleanN,"contigs"),quote=F)
544 maint = paste('Filter >=1/million reads in >=',nscut,'samples')
545 } else {
546 useme = (nzrs > quantile(nzrs,filterquantile))
547 workCM = nonzerod[useme,]
548 lo = colSums(nonzerod[!useme,])
549 cleanrs = rowSums(workCM)
550 cleanN = length(cleanrs)
551 meth = paste("After filtering at count quantile =",filterquantile,", there are",sep="")
552 print(paste('Read',allN,"contigs. Removed",zN,"with no reads.",meth,cleanN,"contigs"),quote=F)
553 maint = paste('Filter below',filterquantile,'quantile')
554 }
555 cumPlot(rawrs=rawrs,cleanrs=cleanrs,maint=maint,myTitle=myTitle)
556 allgenes = rownames(workCM)
557 reg = "^chr([0-9]+):([0-9]+)-([0-9]+)"
558 genecards="<a href=\'http://www.genecards.org/index.php?path=/Search/keyword/"
559 ucsc = paste("<a href=\'http://genome.ucsc.edu/cgi-bin/hgTracks?db=",org,sep='')
560 testreg = str_match(allgenes,reg)
561 if (sum(!is.na(testreg[,1]))/length(testreg[,1]) > 0.8) # is ucsc style string
562 {
563 print("@@ using ucsc substitution for urls")
564 contigurls = paste0(ucsc,"&amp;position=chr",testreg[,2],":",testreg[,3],"-",testreg[,4],"\'>",allgenes,"</a>")
565 } else {
566 print("@@ using genecards substitution for urls")
567 contigurls = paste0(genecards,allgenes,"\'>",allgenes,"</a>")
568 }
569 print.noquote("# urls")
570 print.noquote(head(contigurls))
571 print(paste("# Total low count contigs per sample = ",paste(lo,collapse=',')),quote=F)
572 cmrowsums = rowSums(workCM)
573 TName=unique(group)[1]
574 CName=unique(group)[2]
575 if (is.null(mydesign)) {
576 if (length(subjects) == 0)
577 {
578 mydesign = model.matrix(~group)
579 }
580 else {
581 subjf = factor(subjects)
582 mydesign = model.matrix(~subjf+group) # we block on subject so make group last to simplify finding it
583 }
584 }
585 print.noquote(paste('Using samples:',paste(colnames(workCM),collapse=',')))
586 print.noquote('Using design matrix:')
587 print.noquote(mydesign)
588 if (doedgeR) {
589 sink('edgeR.log')
590 #### Setup DGEList object
591 DGEList = DGEList(counts=workCM, group = group)
592 DGEList = calcNormFactors(DGEList)
593
594 DGEList = estimateGLMCommonDisp(DGEList,mydesign)
595 comdisp = DGEList\$common.dispersion
596 DGEList = estimateGLMTrendedDisp(DGEList,mydesign)
597 if (edgeR_priordf > 0) {
598 print.noquote(paste("prior.df =",edgeR_priordf))
599 DGEList = estimateGLMTagwiseDisp(DGEList,mydesign,prior.df = edgeR_priordf)
600 } else {
601 DGEList = estimateGLMTagwiseDisp(DGEList,mydesign)
602 }
603 DGLM = glmFit(DGEList,design=mydesign)
604 DE = glmLRT(DGLM,coef=ncol(DGLM\$design)) # always last one - subject is first if needed
605 efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors
606 normData = (1e+06*DGEList\$counts/efflib)
607 uoutput = cbind(
608 Name=as.character(rownames(DGEList\$counts)),
609 DE\$table,
610 adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
611 Dispersion=DGEList\$tagwise.dispersion,totreads=cmrowsums,normData,
612 DGEList\$counts
613 )
614 soutput = uoutput[order(DE\$table\$PValue),] # sorted into p value order - for quick toptable
615 goodness = gof(DGLM, pcutoff=fdrthresh)
616 if (sum(goodness\$outlier) > 0) {
617 print.noquote('GLM outliers:')
618 print(paste(rownames(DGLM)[(goodness\$outlier)],collapse=','),quote=F)
619 } else {
620 print('No GLM fit outlier genes found\n')
621 }
622 z = limma::zscoreGamma(goodness\$gof.statistic, shape=goodness\$df/2, scale=2)
623 pdf("edgeR_GoodnessofFit.pdf")
624 qq = qqnorm(z, panel.first=grid(), main="tagwise dispersion")
625 abline(0,1,lwd=3)
626 points(qq\$x[goodness\$outlier],qq\$y[goodness\$outlier], pch=16, col="maroon")
627 dev.off()
628 estpriorn = getPriorN(DGEList)
629 print(paste("Common Dispersion =",comdisp,"CV = ",sqrt(comdisp),"getPriorN = ",estpriorn),quote=F)
630 efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors
631 normData = (1e+06*DGEList\$counts/efflib)
632 uniqueg = unique(group)
633 #### Plot MDS
634 sample_colors = match(group,levels(group))
635 sampleTypes = levels(factor(group))
636 print.noquote(sampleTypes)
637 pdf("edgeR_MDSplot.pdf")
638 plotMDS.DGEList(DGEList,main=paste("edgeR MDS for",myTitle),cex=0.5,col=sample_colors,pch=sample_colors)
639 legend(x="topleft", legend = sampleTypes,col=c(1:length(sampleTypes)), pch=19)
640 grid(col="blue")
641 dev.off()
642 colnames(normData) = paste( colnames(normData),'N',sep="_")
643 print(paste('Raw sample read totals',paste(colSums(nonzerod,na.rm=T),collapse=',')))
644 nzd = data.frame(log(nonzerod + 1e-2,10))
645 try( boxPlot(rawrs=nzd,cleanrs=log(normData,10),maint='TMM Normalisation',myTitle=myTitle,pdfname="edgeR_raw_norm_counts_box.pdf") )
646 write.table(soutput,file=out_edgeR, quote=FALSE, sep="\t",row.names=F)
647 tt = cbind(
648 Name=as.character(rownames(DGEList\$counts)),
649 DE\$table,
650 adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
651 Dispersion=DGEList\$tagwise.dispersion,totreads=cmrowsums
652 )
653 print.noquote("# edgeR Top tags\n")
654 tt = cbind(tt,URL=contigurls) # add to end so table isn't laid out strangely
655 tt = tt[order(DE\$table\$PValue),]
656 print.noquote(tt[1:50,])
657 deTags = rownames(uoutput[uoutput\$adj.p.value < fdrthresh,])
658 nsig = length(deTags)
659 print(paste('#',nsig,'tags significant at adj p=',fdrthresh),quote=F)
660 deColours = ifelse(deTags,'red','black')
661 pdf("edgeR_BCV_vs_abundance.pdf")
662 plotBCV(DGEList, cex=0.3, main="Biological CV vs abundance")
663 dev.off()
664 dg = DGEList[order(DE\$table\$PValue),]
665 #normData = (1e+06 * dg\$counts/expandAsMatrix(dg\$samples\$lib.size, dim(dg)))
666 efflib = dg\$samples\$lib.size*dg\$samples\$norm.factors
667 normData = (1e+06*dg\$counts/efflib)
668 outpdfname="edgeR_top_100_heatmap.pdf"
669 hmap2(normData,nsamp=100,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste('edgeR Heatmap',myTitle))
670 outSmear = "edgeR_smearplot.pdf"
671 outMain = paste("Smear Plot for ",TName,' Vs ',CName,' (FDR@',fdrthresh,' N = ',nsig,')',sep='')
672 smearPlot(DGEList=DGEList,deTags=deTags, outSmear=outSmear, outMain = outMain)
673 qqPlot(descr=paste(myTitle,'edgeR adj p QQ plot'),pvector=tt\$adj.p.value,outpdf='edgeR_qqplot.pdf')
674 norm.factor = DGEList\$samples\$norm.factors
675 topresults.edgeR = soutput[which(soutput\$adj.p.value < fdrthresh), ]
676 edgeRcountsindex = which(allgenes %in% rownames(topresults.edgeR))
677 edgeRcounts = rep(0, length(allgenes))
678 edgeRcounts[edgeRcountsindex] = 1 # Create venn diagram of hits
679 sink()
680 } ### doedgeR
681 if (doDESeq2 == T)
682 {
683 sink("DESeq2.log")
684 # DESeq2
685 require('DESeq2')
686 library('RColorBrewer')
687 if (length(subjects) == 0)
688 {
689 pdata = data.frame(Name=colnames(workCM),Rx=group,row.names=colnames(workCM))
690 deSEQds = DESeqDataSetFromMatrix(countData = workCM, colData = pdata, design = formula(~ Rx))
691 } else {
692 pdata = data.frame(Name=colnames(workCM),Rx=group,subjects=subjects,row.names=colnames(workCM))
693 deSEQds = DESeqDataSetFromMatrix(countData = workCM, colData = pdata, design = formula(~ subjects + Rx))
694 }
695 #DESeq2 = DESeq(deSEQds,fitType='local',pAdjustMethod=fdrtype)
696 #rDESeq = results(DESeq2)
697 #newCountDataSet(workCM, group)
698 deSeqDatsizefac = estimateSizeFactors(deSEQds)
699 deSeqDatdisp = estimateDispersions(deSeqDatsizefac,fitType=DESeq_fitType)
700 resDESeq = nbinomWaldTest(deSeqDatdisp, pAdjustMethod=fdrtype)
701 rDESeq = as.data.frame(results(resDESeq))
702 rDESeq = cbind(Contig=rownames(workCM),rDESeq,NReads=cmrowsums,URL=contigurls)
703 srDESeq = rDESeq[order(rDESeq\$pvalue),]
704 qqPlot(descr=paste(myTitle,'DESeq2 adj p qq plot'),pvector=rDESeq\$padj,outpdf='DESeq2_qqplot.pdf')
705 cat("# DESeq top 50\n")
706 print.noquote(srDESeq[1:50,])
707 write.table(srDESeq,file=out_DESeq2, quote=FALSE, sep="\t",row.names=F)
708 topresults.DESeq = rDESeq[which(rDESeq\$padj < fdrthresh), ]
709 DESeqcountsindex = which(allgenes %in% rownames(topresults.DESeq))
710 DESeqcounts = rep(0, length(allgenes))
711 DESeqcounts[DESeqcountsindex] = 1
712 pdf("DESeq2_dispersion_estimates.pdf")
713 plotDispEsts(resDESeq)
714 dev.off()
715 ysmall = abs(min(rDESeq\$log2FoldChange))
716 ybig = abs(max(rDESeq\$log2FoldChange))
717 ylimit = min(4,ysmall,ybig)
718 pdf("DESeq2_MA_plot.pdf")
719 plotMA(resDESeq,main=paste(myTitle,"DESeq2 MA plot"),ylim=c(-ylimit,ylimit))
720 dev.off()
721 rlogres = rlogTransformation(resDESeq)
722 sampledists = dist( t( assay(rlogres) ) )
723 sdmat = as.matrix(sampledists)
724 pdf("DESeq2_sample_distance_plot.pdf")
725 heatmap.2(sdmat,trace="none",main=paste(myTitle,"DESeq2 sample distances"),
726 col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(255))
727 dev.off()
728 ###outpdfname="DESeq2_top50_heatmap.pdf"
729 ###hmap2(sresDESeq,nsamp=50,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste('DESeq2 vst rlog Heatmap',myTitle))
730 sink()
731 result = try( (ppca = plotPCA( varianceStabilizingTransformation(deSeqDatdisp,blind=T), intgroup=c("Rx","Name")) ) )
732 if ("try-error" %in% class(result)) {
733 print.noquote('DESeq2 plotPCA failed.')
734 } else {
735 pdf("DESeq2_PCA_plot.pdf")
736 #### wtf - print? Seems needed to get this to work
737 print(ppca)
738 dev.off()
739 }
740 }
741
742 if (doVoom == T) {
743 sink('VOOM.log')
744 if (doedgeR == F) {
745 #### Setup DGEList object
746 DGEList = DGEList(counts=workCM, group = group)
747 DGEList = calcNormFactors(DGEList)
748 DGEList = estimateGLMCommonDisp(DGEList,mydesign)
749 DGEList = estimateGLMTrendedDisp(DGEList,mydesign)
750 DGEList = estimateGLMTagwiseDisp(DGEList,mydesign)
751 DGEList = estimateGLMTagwiseDisp(DGEList,mydesign)
752 norm.factor = DGEList\$samples\$norm.factors
753 }
754 pdf("VOOM_mean_variance_plot.pdf")
755 dat.voomed = voom(DGEList, mydesign, plot = TRUE, lib.size = colSums(workCM) * norm.factor)
756 dev.off()
757 # Use limma to fit data
758 fit = lmFit(dat.voomed, mydesign)
759 fit = eBayes(fit)
760 rvoom = topTable(fit, coef = length(colnames(mydesign)), adj = fdrtype, n = Inf, sort="none")
761 qqPlot(descr=paste(myTitle,'VOOM-limma adj p QQ plot'),pvector=rvoom\$adj.P.Val,outpdf='VOOM_qqplot.pdf')
762 rownames(rvoom) = rownames(workCM)
763 rvoom = cbind(rvoom,NReads=cmrowsums,URL=contigurls)
764 srvoom = rvoom[order(rvoom\$P.Value),]
765 cat("# VOOM top 50\n")
766 print(srvoom[1:50,])
767 write.table(srvoom,file=out_VOOM, quote=FALSE, sep="\t",row.names=F)
768 # Use an FDR cutoff to find interesting samples for edgeR, DESeq and voom/limma
769 topresults.voom = rvoom[which(rvoom\$adj.P.Val < fdrthresh), ]
770 voomcountsindex = which(allgenes %in% topresults.voom\$ID)
771 voomcounts = rep(0, length(allgenes))
772 voomcounts[voomcountsindex] = 1
773 sink()
774 }
775
776 if (doCamera) {
777 doGSEA(y=DGEList,design=mydesign,histgmt=histgmt,bigmt=bigmt,ntest=20,myTitle=myTitle,
778 outfname=paste(mt,"GSEA.xls",sep="_"),fdrthresh=fdrthresh,fdrtype=fdrtype)
779 }
780
781 if ((doDESeq2==T) || (doVoom==T) || (doedgeR==T)) {
782 if ((doVoom==T) && (doDESeq2==T) && (doedgeR==T)) {
783 vennmain = paste(mt,'Voom,edgeR and DESeq2 overlap at FDR=',fdrthresh)
784 counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts,
785 VOOM_limma = voomcounts, row.names = allgenes)
786 } else if ((doDESeq2==T) && (doedgeR==T)) {
787 vennmain = paste(mt,'DESeq2 and edgeR overlap at FDR=',fdrthresh)
788 counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts, row.names = allgenes)
789 } else if ((doVoom==T) && (doedgeR==T)) {
790 vennmain = paste(mt,'Voom and edgeR overlap at FDR=',fdrthresh)
791 counts.dataframe = data.frame(edgeR = edgeRcounts, VOOM_limma = voomcounts, row.names = allgenes)
792 }
793
794 if (nrow(counts.dataframe > 1)) {
795 counts.venn = vennCounts(counts.dataframe)
796 vennf = "Venn_significant_genes_overlap.pdf"
797 pdf(vennf)
798 vennDiagram(counts.venn,main=vennmain,col="maroon")
799 dev.off()
800 }
801 } #### doDESeq2 or doVoom
802
803 }
804 #### Done
805 # Driver section: copies the Galaxy form values into R variables, subsets the count matrix and calls edgeIt.
806 ###sink(stdout(),append=T,type="message")
807 builtin_gmt = ""
808 history_gmt = ""
809 history_gmt_name = ""
810 out_edgeR = F # stays F unless edgeR is enabled below
811 out_DESeq2 = F # stays F unless DESeq2 is enabled below
812 out_VOOM = "$out_VOOM"
813 doDESeq2 = $DESeq2.doDESeq2 # make these T or F
814 doVoom = $doVoom
815 doCamera = F # camera gene set testing is switched off in this wrapper
816 doedgeR = $edgeR.doedgeR
817 edgeR_priordf = 0 # placeholder; overwritten below when edgeR is enabled
818
819
820 #if $doVoom == "T":
821 out_VOOM = "$out_VOOM"
822 #end if
823
824 #if $DESeq2.doDESeq2 == "T":
825 out_DESeq2 = "$out_DESeq2"
826 DESeq_fitType = "$DESeq2.DESeq_fitType"
827 #end if
828
829 #if $edgeR.doedgeR == "T":
830 out_edgeR = "$out_edgeR"
831 edgeR_priordf = $edgeR.edgeR_priordf
832 #end if
833
834
835 if (sum(c(doedgeR,doVoom,doDESeq2)) == 0) # the T/F flags sum to zero only when no method was selected
836 {
837 write("No methods chosen - nothing to do! Please try again after choosing one or more methods", stderr())
838 quit(save="no",status=2)
839 }
840
841 Out_Dir = "$html_file.files_path"
842 Input = "$input1"
843 TreatmentName = "$treatment_name"
844 TreatmentCols = "$Treat_cols"
845 ControlName = "$control_name"
846 ControlCols= "$Control_cols"
847 org = "$input1.dbkey"
848 if (org == "") { org = "hg19"} # fall back to hg19 when the input has no dbkey
849 fdrtype = "$fdrtype"
850 fdrthresh = $fdrthresh
851 useNDF = $useNDF
852 fQ = $fQ # non-differential centile cutoff
853 myTitle = "$title"
854 sids = strsplit("$subjectids",',')
855 subjects = unlist(sids)
856 nsubj = length(subjects)
857 TCols = as.numeric(strsplit(TreatmentCols,",")[[1]])-1 # shift by one because read.table below consumes column 1 as row names
858 CCols = as.numeric(strsplit(ControlCols,",")[[1]])-1 # same shift for the control columns
859 cat('Got TCols=')
860 cat(TCols)
861 cat('; CCols=')
862 cat(CCols)
863 cat('\n')
864 useCols = c(TCols,CCols) # treatment columns first, then control columns
865 if (file.exists(Out_Dir) == F) dir.create(Out_Dir)
866 Count_Matrix = read.table(Input,header=T,row.names=1,sep='\t') #Load tab file assume header
867 snames = colnames(Count_Matrix)
868 nsamples = length(snames)
869 if (nsubj > 0 && nsubj != nsamples) { # a non-empty subject list must have one entry per sample column
870 options("show.error.messages"=T)
871 mess = paste('Fatal error: Supplied subject id list',paste(subjects,collapse=','),
872 'has length',nsubj,'but there are',nsamples,'samples',paste(snames,collapse=','))
873 write(mess, stderr())
874 quit(save="no",status=4)
875 }
876 if (length(subjects) != 0) {subjects = subjects[useCols]} # keep subject ids aligned with the selected columns
877 Count_Matrix = Count_Matrix[,useCols] ### reorder columns
878 rn = rownames(Count_Matrix)
879 islib = rn %in% c('librarySize','NotInBedRegions') # bookkeeping rows that are removed from the matrix below
880 LibSizes = Count_Matrix[subset(rn,islib),][1] # take first
881 Count_Matrix = Count_Matrix[subset(rn,! islib),]
882 group = c(rep(TreatmentName,length(TCols)), rep(ControlName,length(CCols)) ) #Build a group descriptor
883 group = factor(group, levels=c(ControlName,TreatmentName)) # control is listed first, so it is the first factor level
884 colnames(Count_Matrix) = paste(group,colnames(Count_Matrix),sep="_") #Relable columns
885 results = edgeIt(Count_Matrix=Count_Matrix,group=group, out_edgeR=out_edgeR, out_VOOM=out_VOOM, out_DESeq2=out_DESeq2, # the FDR method and filter switch chosen on the form are passed through
886 fdrtype=fdrtype,mydesign=NULL,priordf=edgeR_priordf,fdrthresh=fdrthresh,outputdir='.',
887 myTitle=myTitle,useNDF=useNDF,libSize=c(),filterquantile=fQ,subjects=subjects,
888 doDESeq2=doDESeq2,doVoom=doVoom,doCamera=doCamera,doedgeR=doedgeR,org=org,
889 histgmt=history_gmt,bigmt=builtin_gmt,DESeq_fitType=DESeq_fitType)
890 sessionInfo()
891 ]]>
892 </configfile>
893 </configfiles>
894 <help>
895
896 **What it does**
897
898 Allows short read sequence counts from controlled experiments to be analysed for differentially expressed genes.
899 Optionally adds a term for subject if not all samples are independent or if some other factor needs to be blocked in the design.
900
901 **Input**
902
903 Requires a count matrix as a tabular file. These are best made using the companion HTSeq_ based counter Galaxy wrapper
904 and your favourite gene model to generate inputs. Each row is a genomic feature (eg a gene or exon) and each column the
905 non-negative integer count of reads from one sample overlapping the feature.
906 The matrix must have a header row uniquely identifying the source samples, and unique row names in
907 the first column. Typically the row names are gene symbols or probe ids for downstream use in GSEA and other methods.
908
909 **Specifying comparisons**
910
911 This is deliberately kept simple: a single factor with two levels - case vs control.
912
913 More complex interfaces are possible but painful at present.
914 Probably need to specify a phenotype file to do this better.
915 Work in progress. Send code.
916
917 If you have (eg) paired samples and wish to include a term in the GLM to account for some other factor (subject in the case of paired samples),
918 put in a comma separated list of integer indicators, one for every sample (whether modelled or not!), giving (eg) the subject number,
919 or leave the list as an empty string if the samples are all independent.
920 If not empty, there must be exactly as many integers in the supplied integer list as there are columns (samples) in the count matrix.
921 Integers for samples that are not in the analysis *must* be present in the string as filler even if not used.
922
923 So if you have 2 pairs out of 6 samples, you need to put in unique integers for the unpaired ones.
924 For example, if you had 6 samples with the first two independent, and the second and third pairs each coming from a different subject, you might use
925 8,9,1,1,2,2
926 as subject IDs to indicate paired samples from the same subject in columns 3/4 and in columns 5/6.
927
928 **Methods available**
929
930 You can run 3 popular Bioconductor packages available for count data.
931
932 edgeR - see edgeR_ for details
933
934 VOOM/limma - see limma_VOOM_ for details
935
936 DESeq2 - see DESeq2_ for details
937
938 and optionally camera in edgeR which works better if MSigDB is installed.
939
940 **Outputs**
941
942 Some helpful plots and analysis results. Note that most of these are produced using R code
943 suggested by the excellent documentation and vignettes for the Bioconductor
944 packages invoked. The Tool Factory is used to automatically lay these out for you to enjoy.
945
946 **Note on Voom**
947
948 The R help for voom in limma version 3.16.6 includes the following from the authors - but you should read the paper to interpret this method.
949
950 This function is intended to process RNA-Seq or ChIP-Seq data prior to linear modelling in limma.
951
952 voom is an acronym for mean-variance modelling at the observational level.
953 The key concern is to estimate the mean-variance relationship in the data, then use this to compute appropriate weights for each observation.
954 Count data almost always show non-trivial mean-variance relationships. Raw counts show increasing variance with increasing count size, while log-counts typically show a decreasing mean-variance trend.
955 This function estimates the mean-variance trend for log-counts, then assigns a weight to each observation based on its predicted variance.
956 The weights are then used in the linear modelling process to adjust for heteroscedasticity.
957
958 In an experiment, a count value is observed for each tag in each sample. A tag-wise mean-variance trend is computed using lowess.
959 The tag-wise mean is the mean log2 count with an offset of 0.5, across samples for a given tag.
960 The tag-wise variance is the quarter-root-variance of normalized log2 counts per million values with an offset of 0.5, across samples for a given tag.
961 Tags with zero counts across all samples are not included in the lowess fit. Optional normalization is performed using normalizeBetweenArrays.
962 Using fitted values of log2 counts from a linear model fit by lmFit, variances from the mean-variance trend were interpolated for each observation.
963 This was carried out by approxfun. Inverse variance weights can be used to correct for mean-variance trend in the count data.
964
965
966 Author(s)
967
968 Charity Law and Gordon Smyth
969
970 References
971
972 Law, CW (2013). Precision weights for gene expression analysis. PhD Thesis. University of Melbourne, Australia.
973
974 Law, CW, Chen, Y, Shi, W, Smyth, GK (2013). Voom! Precision weights unlock linear model analysis tools for RNA-seq read counts.
975 Technical Report 1 May 2013, Bioinformatics Division, Walter and Eliza Hall Institute of Medical Research, Melbourne, Australia.
976 http://www.statsci.org/smyth/pubs/VoomPreprint.pdf
977
978 See Also
979
980 A voom case study is given in the edgeR User's Guide.
981
982 vooma is a similar function but for microarrays instead of RNA-seq.
983
984
985 **Old rant on changes to Bioconductor package variable names between versions**
986
987 The edgeR authors made a small cosmetic change in the name of one important variable (from p.value to PValue)
988 breaking this and all other code that assumed the old name for this variable,
989 between edgeR 2.4.4 and 2.4.6 (the version for R 2.14 as at the time of writing).
990 This means that all code using edgeR is sensitive to the version. I think this was a very unwise thing
991 to do because it wasted hours of my time to track down and will similarly cost other edgeR users dearly
992 when their old scripts break. This tool currently works with 2.4.6.
993
994 **Note on prior.n**
995
996 http://seqanswers.com/forums/showthread.php?t=5591 says:
997
998 *prior.n*
999
1000 The value for prior.n determines the amount of smoothing of tagwise dispersions towards the common dispersion.
1001 You can think of it as like a "weight" for the common value. (It is actually the weight for the common likelihood
1002 in the weighted likelihood equation). The larger the value for prior.n, the more smoothing, i.e. the closer your
1003 tagwise dispersion estimates will be to the common dispersion. If you use a prior.n of 1, then that gives the
1004 common likelihood the weight of one observation.
1005
1006 In answer to your question, it is a good thing to squeeze the tagwise dispersions towards a common value,
1007 or else you will be using very unreliable estimates of the dispersion. I would not recommend using the value that
1008 you obtained from estimateSmoothing()---this is far too small and would result in virtually no moderation
1009 (squeezing) of the tagwise dispersions. How many samples do you have in your experiment?
1010 What is the experimental design? If you have few samples (less than 6) then I would suggest a prior.n of at least 10.
1011 If you have more samples, then the tagwise dispersion estimates will be more reliable,
1012 so you could consider using a smaller prior.n, although I would hesitate to use a prior.n less than 5.
1013
1014
1015 From Bioconductor Digest, Vol 118, Issue 5, Gordon writes:
1016
1017 Dear Dorota,
1018
1019 The important settings are prior.df and trend.
1020
1021 prior.n and prior.df are related through prior.df = prior.n * residual.df,
1022 and your experiment has residual.df = 36 - 12 = 24. So the old setting of
1023 prior.n=10 is equivalent for your data to prior.df = 240, a very large
1024 value. Going the other way, the new setting of prior.df=10 is equivalent
1025 to prior.n=10/24.
1026
1027 To recover old results with the current software you would use
1028
1029 estimateTagwiseDisp(object, prior.df=240, trend="none")
1030
1031 To get the new default from old software you would use
1032
1033 estimateTagwiseDisp(object, prior.n=10/24, trend=TRUE)
1034
1035 Actually the old trend method is equivalent to trend="loess" in the new
1036 software. You should use plotBCV(object) to see whether a trend is
1037 required.
1038
1039 Note you could also use
1040
1041 prior.n = getPriorN(object, prior.df=10)
1042
1043 to map between prior.df and prior.n.
1044
1045 ----
1046
1047 **Attributions**
1048
1049 edgeR - edgeR_
1050
1051 VOOM/limma - limma_VOOM_
1052
1053 DESeq2 - DESeq2_
1054
1055 See above for Bioconductor package documentation for packages exposed in Galaxy by this tool and app store package.
1056
1057 Galaxy_ (that's what you are using right now!) for gluing everything together
1058
1059 Otherwise, all code and documentation comprising this tool was written by Ross Lazarus and is
1060 licensed to you under the LGPL_ like other rgenetics artefacts
1061
1062 .. _LGPL: http://www.gnu.org/copyleft/lesser.html
1063 .. _HTSeq: http://www-huber.embl.de/users/anders/HTSeq/doc/index.html
1064 .. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html
1065 .. _DESeq2: http://www.bioconductor.org/packages/release/bioc/html/DESeq2.html
1066 .. _limma_VOOM: http://www.bioconductor.org/packages/release/bioc/html/limma.html
1067 .. _Galaxy: http://getgalaxy.org
1068 </help>
1069
1070 </tool>
1071
1072