comparison rgedgeRpaired.xml @ 27:ddd76b6db251 draft

Uploaded
author fubar
date Wed, 07 Aug 2013 02:10:19 -0400
parents c0fa3dde02d9
children
comparison
equal deleted inserted replaced
26:3ae79b4f8767 27:ddd76b6db251
1 <tool id="rgDifferentialCount" name="Differential_Count" version="0.20">
2 <description>models using BioConductor packages</description>
3 <requirements>
4 <requirement type="package" version="2.12">biocbasics</requirement>
5 <requirement type="package" version="3.0.1">r3</requirement>
6 <requirement type="package" version="1.3.18">graphicsmagick</requirement>
7 <requirement type="package" version="9.07">ghostscript</requirement>
8 </requirements>
9
10 <command interpreter="python">
11 rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "DifferentialCounts"
12 --output_dir "$html_file.files_path" --output_html "$html_file" --make_HTML "yes"
13 </command>
14 <inputs>
15 <param name="input1" type="data" format="tabular" label="Select an input matrix - rows are contigs, columns are counts for each sample"
16 help="Use the HTSeq based count matrix preparation tool to create these matrices from BAM/SAM files and a GTF file of genomic features"/>
17 <param name="title" type="text" value="Differential Counts" size="80" label="Title for job outputs"
18 help="Supply a meaningful name here to remind you what the outputs contain">
19 <sanitizer invalid_char="">
20 <valid initial="string.letters,string.digits"><add value="_" /> </valid>
21 </sanitizer>
22 </param>
23 <param name="treatment_name" type="text" value="Treatment" size="50" label="Treatment Name"/>
24 <param name="Treat_cols" label="Select columns containing treatment." type="data_column" data_ref="input1" numerical="True"
25 multiple="true" use_header_names="true" size="120" display="checkboxes">
26 <validator type="no_options" message="Please select at least one column."/>
27 </param>
28 <param name="control_name" type="text" value="Control" size="50" label="Control Name"/>
29 <param name="Control_cols" label="Select columns containing control." type="data_column" data_ref="input1" numerical="True"
30 multiple="true" use_header_names="true" size="120" display="checkboxes" optional="true">
31 </param>
32 <param name="subjectids" type="text" optional="true" size="120" value = ""
33 label="IF SUBJECTS NOT ALL INDEPENDENT! Enter comma separated strings to indicate sample labels for (eg) pairing - must be one for every column in input"
34 help="Leave blank if no pairing, but eg if data from sample id A99 is in columns 2,4 and id C21 is in 3,5 then enter 'A99,C21,A99,C21'">
35 <sanitizer>
36 <valid initial="string.letters,string.digits"><add value="," /> </valid>
37 </sanitizer>
38 </param>
39 <param name="fQ" type="float" value="0.3" size="5" label="Non-differential contig count quantile threshold - zero to analyze all non-zero read count contigs"
40 help="May be a good or a bad idea depending on the biology and the question. EG 0.3 = sparsest 30% of contigs with at least one read are removed before analysis"/>
41 <param name="useNDF" type="boolean" truevalue="T" falsevalue="F" checked="false" size="1"
42 label="Non differential filter - remove contigs below a threshold (1 per million) for half or more samples"
43 help="May be a good or a bad idea depending on the biology and the question. This was the old default. Quantile based is available as an alternative"/>
44
45 <conditional name="edgeR">
46 <param name="doedgeR" type="select"
47 label="Run this model using edgeR"
48 help="edgeR uses a negative binomial model and seems to be powerful, even with few replicates">
49 <option value="F">Do not run edgeR</option>
50 <option value="T" selected="true">Run edgeR</option>
51 </param>
52 <when value="T">
53 <param name="edgeR_priordf" type="integer" value="20" size="3"
54 label="prior.df for tagwise dispersion - lower value = more emphasis on each tag's variance. Replaces prior.n and prior.df = prior.n * residual.df"
55 help="0 = Use edgeR default. Use a small value to 'smooth' small samples. See edgeR docs and note below"/>
56 </when>
57 <when value="F"></when>
58 </conditional>
59 <conditional name="DESeq2">
60 <param name="doDESeq2" type="select"
61 label="Run the same model with DESeq2 and compare findings"
62 help="DESeq2 is an update to the DESeq package. It uses different assumptions and methods to edgeR">
63 <option value="F" selected="true">Do not run DESeq2</option>
64 <option value="T">Run DESeq2</option>
65 </param>
66 <when value="T">
67 <param name="DESeq_fitType" type="select">
68 <option value="parametric" selected="true">Parametric (default) fit for dispersions</option>
69 <option value="local">Local fit - this will automagically be used if parametric fit fails</option>
70 <option value="mean">Mean dispersion fit- use this if you really understand what you're doing - read the fine manual linked below in the documentation</option>
71 </param>
72 </when>
73 <when value="F"> </when>
74 </conditional>
75 <param name="doVoom" type="select"
76 label="Run the same model with Voom/limma and compare findings"
77 help="Voom uses counts per million and a precise transformation of variance so count data can be analysed using limma">
78 <option value="F" selected="true">Do not run VOOM</option>
79 <option value="T">Run VOOM</option>
80 </param>
81 <!--
82 <conditional name="camera">
83 <param name="doCamera" type="select" label="Run the edgeR implementation of Camera GSEA for up/down gene sets"
84 help="If yes, you can choose a set of genesets to test and/or supply a gmt format geneset collection from your history">
85 <option value="F" selected="true">Do not run GSEA tests with the Camera algorithm</option>
86 <option value="T">Run GSEA tests with the Camera algorithm</option>
87 </param>
88 <when value="T">
89 <conditional name="gmtSource">
90 <param name="refgmtSource" type="select"
91 label="Use a gene set (.gmt) from your history and/or use a built-in (MSigDB etc) gene set">
92 <option value="indexed" selected="true">Use a built-in gene set</option>
93 <option value="history">Use a gene set from my history</option>
94 <option value="both">Add a gene set from my history to a built in gene set</option>
95 </param>
96 <when value="indexed">
97 <param name="builtinGMT" type="select" label="Select a gene set matrix (.gmt) file to use for the analysis">
98 <options from_data_table="gseaGMT_3.1">
99 <filter type="sort_by" column="2" />
100 <validator type="no_options" message="No GMT v3.1 files are available - please install them"/>
101 </options>
102 </param>
103 </when>
104 <when value="history">
105 <param name="ownGMT" type="data" format="gmt" label="Select a Gene Set from your history" />
106 </when>
107 <when value="both">
108 <param name="ownGMT" type="data" format="gseagmt" label="Select a Gene Set from your history" />
109 <param name="builtinGMT" type="select" label="Select a gene set matrix (.gmt) file to use for the analysis">
110 <options from_data_table="gseaGMT_4">
111 <filter type="sort_by" column="2" />
112 <validator type="no_options" message="No GMT v4 files are available - please fix tool_data_table and loc files"/>
113 </options>
114 </param>
115 </when>
116 </conditional>
117 </when>
118 <when value="F">
119 </when>
120 </conditional>
121 -->
122 <param name="fdrthresh" type="float" value="0.05" size="5" label="P value threshold for FDR filtering for family wise error rate control"
123 help="Conventional default value of 0.05 recommended"/>
124 <param name="fdrtype" type="select" label="FDR (Type I error) control method"
125 help="Use fdr or bh typically to control for the number of tests in a reliable way">
126 <option value="fdr" selected="true">fdr</option>
127 <option value="BH">Benjamini Hochberg</option>
128 <option value="BY">Benjamini Yekutieli</option>
129 <option value="bonferroni">Bonferroni</option>
130 <option value="hochberg">Hochberg</option>
131 <option value="holm">Holm</option>
132 <option value="hommel">Hommel</option>
133 <option value="none">no control for multiple tests</option>
134 </param>
135 </inputs>
136 <outputs>
137 <data format="tabular" name="out_edgeR" label="${title}_topTable_edgeR.xls">
138 <filter>edgeR['doedgeR'] == "T"</filter>
139 </data>
140 <data format="tabular" name="out_DESeq2" label="${title}_topTable_DESeq2.xls">
141 <filter>DESeq2['doDESeq2'] == "T"</filter>
142 </data>
143 <data format="tabular" name="out_VOOM" label="${title}_topTable_VOOM.xls">
144 <filter>doVoom == "T"</filter>
145 </data>
146 <data format="html" name="html_file" label="${title}.html"/>
147 </outputs>
148 <stdio>
149 <exit_code range="4" level="fatal" description="Number of subject ids must match total number of samples in the input matrix" />
150 </stdio>
151 <tests>
152 <test>
153 <param name='input1' value='test_bams2mx.xls' ftype='tabular' />
154 <param name='treatment_name' value='liver' />
155 <param name='title' value='edgeRtest' />
156 <param name='useNDF' value='' />
157 <param name='doedgeR' value='T' />
158 <param name='doVoom' value='T' />
159 <param name='doDESeq2' value='T' />
160 <param name='fdrtype' value='fdr' />
161 <param name='edgeR_priordf' value="8" />
162 <param name='fdrthresh' value="0.05" />
163 <param name='control_name' value='heart' />
164 <param name='subjectids' value='' />
165 <param name='Control_cols' value='3,4,5,9' />
166 <param name='Treat_cols' value='2,6,7,8' />
167 <output name='out_edgeR' file='edgeRtest1out.xls' compare='diff' />
168 <output name='html_file' file='edgeRtest1out.html' compare='diff' lines_diff='20' />
169 </test>
170 </tests>
171
172 <configfiles>
173 <configfile name="runme">
174 <![CDATA[
175 #
176 # edgeR.Rscript
177 # updated npv 2011 for R 2.14.0 and edgeR 2.4.0 by ross
178 # Performs DGE on a count table containing n replicates of two conditions
179 #
180 # Parameters
181 #
182 # 1 - Output Dir
183
184 # Original edgeR code by: S.Lunke and A.Kaspi
# Limits of the double type on a log10 scale: log10 of the largest finite
# double and log10 of the smallest positive normalised double.
# qqPlot substitutes these for infinite values before plotting.
reallybig <- log10(.Machine\$double.xmax)
reallysmall <- log10(.Machine\$double.xmin)
187 library('stringr')
188 library('gplots')
189 library('edgeR')
hmap2 = function(cmat,nsamp=100,outpdfname='heatmap2.pdf', TName='Treatment',group=NA,myTitle='title goes here')
{
  # Draw a gplots heatmap.2 of the first nsamp rows of cmat into outpdfname.
  #   cmat       - numeric matrix, rows are contigs and columns are samples;
  #                rows are taken in the order supplied by the caller
  #   nsamp      - maximum number of rows to plot
  #   outpdfname - path of the PDF to write
  #   TName      - unused, kept so existing callers keep working
  #   group      - one label per column of cmat, used for the column colour bar
  #   myTitle    - main title of the plot
  # Nothing is plotted if fewer than 2 rows or 2 columns remain.
  gu = unique(group)
  if (length(gu) == 2) {
    # two groups: red for the first label seen, blue for the other
    pcols = ifelse(group == gu[1], "#FF0000", "#0000FF")
  } else {
    colours = rainbow(length(gu),start=0,end=4/6)
    pcols = colours[match(group,gu)]
  }
  gn = rownames(cmat)
  if (is.null(gn)) {
    # no row names at all - there is nothing to filter on, keep every row
    dm = cmat
  } else {
    # remove unlabelled rows; drop=FALSE keeps a matrix even if one row is left
    dm = cmat[(! is.na(gn)),,drop=FALSE]
  }
  nprobes = nrow(dm)
  if (nprobes > nsamp) {
    dm = dm[seq_len(nsamp),,drop=FALSE]
  }
  if ((nrow(dm) < 2) || (ncol(dm) < 2)) {
    # heatmap.2 needs at least a 2 x 2 matrix
    print.noquote(paste('hmap2: fewer than 2 rows or columns to plot - skipping',outpdfname))
    return(invisible(NULL))
  }
  # long sample names make the column labels unreadable
  colnames(dm) = substr(colnames(dm),1,20)
  pdf(outpdfname)
  # close the device even if heatmap.2 fails
  on.exit(dev.off(), add=TRUE)
  heatmap.2(dm,main=myTitle,ColSideColors=pcols,col=topo.colors(100),dendrogram="col",key=T,density.info='none',
            Rowv=F,scale='row',trace='none',margins=c(8,8),cexRow=0.4,cexCol=0.5)
  invisible(NULL)
}
217
hmap = function(cmat,nmeans=4,outpdfname="heatMap.pdf",nsamp=250,TName='Treatment',group=NA,myTitle="Title goes here")
{
  # Base-graphics heatmap of cmat written to outpdfname.
  # At most the first nsamp rows are drawn; columns get a colour bar keyed on
  # group. nmeans and TName are accepted but not used.
  groupLevels = unique(group)
  groupPalette = rainbow(length(groupLevels),start=0.3,end=0.6)
  sideCols = groupPalette[match(group,groupLevels)]
  totalRows = nrow(cmat)
  if (totalRows > nsamp) {
    # too many rows to show - keep the leading nsamp and say so in the title
    cmat = cmat[c(1:nsamp),]
    plotTitle = paste('Heatmap: Top ',nsamp,' DE contigs (of ',totalRows,')',sep='')
  } else {
    plotTitle = paste(myTitle,'Heatmap: n contigs =',totalRows)
  }
  # shorten sample names to 20 characters for the axis labels
  colnames(cmat) = substr(colnames(cmat),1,20)
  pdf(outpdfname)
  heatmap(cmat,scale='row',main=plotTitle,cexRow=0.3,cexCol=0.4,Rowv=NA,ColSideColors=sideCols)
  dev.off()
}
238
qqPlot = function(descr='qqplot',pvector, outpdf='qqplot.pdf',...)
# adapted from https://gist.github.com/703512
{
  # Write a QQ plot of observed against expected -log10 p values to outpdf.
  #   descr   - main title of the plot
  #   pvector - p values; missing values are dropped by sort()
  #   outpdf  - path of the PDF to write
  #   ...     - further arguments passed on to plot()
  # Relies on the script-level reallybig and reallysmall constants.
  # Nothing is written when pvector holds no non-missing values.
  o = -log10(sort(pvector,decreasing=F))
  n = length(o)
  if (n == 0) {
    # 1:0 would count backwards and give vectors of the wrong length
    print.noquote(paste('qqPlot: no non-missing p values for',descr,'- skipping',outpdf))
    return(invisible(NULL))
  }
  e = -log10( seq_len(n)/n )
  # replace infinite values so the axis limits stay finite
  o[o==-Inf] = reallysmall
  o[o==Inf] = reallybig
  maint = descr
  pdf(outpdf)
  # close the device even if plotting fails
  on.exit(dev.off(), add=TRUE)
  plot(e,o,pch=19,cex=1, main=maint, ...,
       xlab=expression(Expected~~-log[10](italic(p))),
       ylab=expression(Observed~~-log[10](italic(p))),
       xlim=c(0,max(e)), ylim=c(0,max(o)))
  lines(e,e,col="red")
  grid(col = "lightgray", lty = "dotted")
  invisible(NULL)
}
256
smearPlot = function(DGEList,deTags, outSmear, outMain)
{
  # Write a plotSmear plot of DGEList to the PDF outSmear, passing deTags as
  # the de.tags argument and outMain as the title, then overlay a dotted grid.
  pdf(file=outSmear)
  plotSmear(DGEList, main=outMain, de.tags=deTags)
  grid(lty="dotted", col="lightgray")
  dev.off()
}
264
boxPlot = function(rawrs,cleanrs,maint,myTitle,pdfname)
{
  # Write two PDFs describing per-sample count distributions:
  #   pdfname                       - side by side boxplots of rawrs and cleanrs
  #   sample_counts_histogram.pdf   - one histogram of rawrs per sample
  #   rawrs   - per-sample values before normalisation (columns are samples)
  #   cleanrs - per-sample values after normalisation
  #   maint   - text used in the plot titles
  #   myTitle - unused, kept so existing callers keep working
  # Summaries of both inputs are printed before plotting.
  # Every device opened here is closed, and par is restored, even when a
  # plotting call fails - callers wrap this function in try().
  nc = ncol(rawrs)
  # values below zero are set to NA; which() keeps the assignment valid when a
  # column already contains NA
  for (i in seq_len(nc)) { rawrs[which(rawrs[,i] < 0),i] = NA }
  fullnames = colnames(rawrs)
  # shorten sample names to 20 characters for the axis labels
  colnames(rawrs) = substr(colnames(rawrs),1,20)
  colnames(cleanrs) = substr(colnames(cleanrs),1,20)
  defpar = par(no.readonly=T)
  on.exit(par(defpar), add=TRUE)
  print.noquote('raw contig counts by sample:')
  print.noquote(summary(rawrs))
  print.noquote('normalised contig counts by sample:')
  print.noquote(summary(cleanrs))
  pdf(pdfname)
  tryCatch({
    par(mfrow=c(1,2))
    boxplot(rawrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('Raw:',maint))
    grid(col="lightgray",lty="dotted")
    boxplot(cleanrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('After ',maint))
    grid(col="lightgray",lty="dotted")
  }, finally = dev.off())
  histpdfname = "sample_counts_histogram.pdf"
  print.noquote(paste('Using ncol rawrs=',nc))
  # smallest square grid of panels that holds one histogram per sample
  ncroot = ceiling(sqrt(nc))
  # tallest bin over all samples, so every panel shares one y axis
  m = c()
  for (i in seq_len(nc)) {
    rhist = hist(rawrs[,i],breaks=100,plot=F)
    m = append(m,max(rhist\$counts))
  }
  ymax = max(m)
  ncols = length(fullnames)
  if (ncols > 20)
  {
    # enlarge the page when there are many samples
    scale = 7*ncols/20
    pdf(histpdfname,width=scale,height=scale)
  } else {
    pdf(histpdfname)
  }
  tryCatch({
    par(mfrow=c(ncroot,ncroot))
    for (i in seq_len(nc)) {
      hist(rawrs[,i], main=paste("Contig logcount",i), xlab='log raw count', col="maroon",
           breaks=100,sub=fullnames[i],cex=0.8,ylim=c(0,ymax))
    }
  }, finally = dev.off())
  invisible(NULL)
}
314
cumPlot = function(rawrs,cleanrs,maint,myTitle)
{
  # Write Filtering_rowsum_bar_charts.pdf: two stacked histograms of log10
  # values, rawrs on top ("Before") and cleanrs below ("After"), sharing the
  # x range of the rawrs panel. maint goes into the titles, myTitle is the
  # subtitle. The graphics parameters in force on entry are restored at the end.
  outPdf = "Filtering_rowsum_bar_charts.pdf"
  savedPar = par(no.readonly=T)
  logVals = log(rawrs,10)
  xTop = max(logVals)
  # one panel: histogram plus dotted grid
  drawPanel = function(vals,panelTitle) {
    hist(vals,breaks=100,main=panelTitle,xlab="# Reads (log)",
         ylab="Count",col="maroon",sub=myTitle, xlim=c(0,xTop),las=1)
    grid(col="lightgray", lty="dotted")
  }
  pdf(outPdf)
  par(mfrow=c(2,1))
  drawPanel(logVals,paste('Before:',maint))
  logVals = log(cleanrs,10)
  drawPanel(logVals,paste('After:',maint))
  dev.off()
  par(savedPar)
}
333
cumPlot1 = function(rawrs,cleanrs,maint,myTitle)
{
  # Write two stacked cumulative-proportion curves, built from the knots of
  # the empirical CDFs of rawrs ("Before") and cleanrs ("After"), on a log x
  # axis. The PDF name is myTitle with spaces removed plus "_RowsumCum.pdf".
  outPdf = paste(gsub(" ","", myTitle , fixed=TRUE),"RowsumCum.pdf",sep='_')
  pdf(outPdf)
  par(mfrow=c(2,1))
  xTop = max(rawrs)
  rawKnots = knots(ecdf(rawrs))
  cleanKnots = knots(ecdf(cleanrs))
  cleanProp = 1:length(cleanKnots)/length(cleanKnots)
  rawProp = 1:length(rawKnots)/length(rawKnots)
  # one panel: cumulative curve plus blue grid
  drawCurve = function(xs,ys,panelTitle) {
    plot(xs,ys,type='l',main=panelTitle,xlab="Log Contig Total Reads",
         ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,xTop),sub=myTitle)
    grid(col="blue")
  }
  drawCurve(rawKnots,rawProp,paste('Before',maint))
  drawCurve(cleanKnots,cleanProp,paste('After',maint))
  dev.off()
}
352
353
354
doGSEAold = function(y=NULL,design=NULL,histgmt="",
                     bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
                     ntest=0, myTitle="myTitle", outfname="GSEA.xls", minnin=5, maxnin=2000,fdrthresh=0.05,fdrtype="BH")
{
  # Run camera once per gene set and report up and down regulated sets.
  #   y, design - passed straight to camera()
  #   histgmt   - path of a gmt file from the history, "" for none
  #   bigmt     - path of a built-in gmt file, "" for none
  #   ntest     - unused, kept so existing callers keep working
  #   myTitle   - written as a header line to outfname
  #   outfname  - header file name; result tables go to camera_up_<outfname>
  #               and camera_down_<outfname>
  #   minnin, maxnin - a gene set is tested only if the number of its genes
  #               found in rownames(y) lies strictly between these
  #   fdrthresh, fdrtype - threshold and p.adjust method for adjusted p values
  # Console output is diverted to Camera.log for the duration of the call.
  sink('Camera.log')
  # undo the diversion even if camera or a file read fails
  on.exit(sink(), add=TRUE)
  genesets = c()
  if (bigmt > "")
  {
    genesets = readLines(bigmt)
  }
  if (histgmt > "")
  {
    # concatenate: each element is one gmt line (rbind would build a matrix)
    genesets = c(genesets,readLines(histgmt))
  }
  print.noquote(paste("@@@read",length(genesets), 'genesets from',histgmt,bigmt))
  if (length(genesets) == 0) {
    print.noquote('@@@@@ Camera: no gene sets supplied - nothing to test')
    return(invisible(NULL))
  }
  genesets = strsplit(genesets,'\t') # tabular. genesetid\tURLorwhatever\tgene_1\t..\tgene_n
  header = paste(myTitle,'edgeR GSEA')
  write(header,file=outfname,append=F)
  urownames = toupper(rownames(y))
  upcam = c()
  downcam = c()
  for (gs in genesets) {
    # need an id, a url and at least one gene
    if (length(gs) < 3) { next }
    g = gs[1] # geneset_id
    u = gs[2]
    if (u > "") { u = paste("<a href=\'",u,"\'>",u,"</a>",sep="") }
    glist = toupper(gs[3:length(gs)]) # member gene symbols
    inglist = urownames %in% glist
    nin = sum(inglist)
    if ((nin > minnin) && (nin < maxnin)) {
      camres = camera(y=y,index=inglist,design=design)
      if (! is.null(camres)) {
        rownames(camres) = g # gene set name
        camres = cbind(GeneSet=g,URL=u,camres)
        if (camres\$Direction == "Up")
        {
          upcam = rbind(upcam,camres)
        } else {
          downcam = rbind(downcam,camres)
        }
      }
    }
  }
  # sort one direction by p value, adjust, write the table and print the top
  reportCam = function(cam,direction) {
    if (is.null(cam)) {
      print.noquote(paste('@@@@@ Camera found no',direction,'gene sets'))
      return(invisible(NULL))
    }
    scam = cam[order(cam\$PValue),]
    scam\$adjPValue = p.adjust(scam\$PValue,method=fdrtype)
    ntop = max(10,sum((scam\$adjPValue < fdrthresh)))
    write.table(scam,file=paste(paste('camera',direction,sep='_'),outfname,sep='_'),quote=F,sep='\t',row.names=F)
    print.noquote(paste('@@@@@ Camera',direction,'top',ntop,'gene sets:'))
    write.table(head(scam,ntop),file="",quote=F,sep='\t',row.names=F)
  }
  reportCam(upcam,'up')
  reportCam(downcam,'down')
  invisible(NULL)
}
423
424
425
426
doGSEA = function(y=NULL,design=NULL,histgmt="",
                  bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
                  ntest=0, myTitle="myTitle", outfname="GSEA.xls", minnin=5, maxnin=2000,fdrthresh=0.05,fdrtype="BH")
{
  # Run camera once over all eligible gene sets and report up and down
  # regulated sets.
  #   y, design - passed straight to camera()
  #   histgmt   - path of a gmt file from the history, "" for none
  #   bigmt     - path of a built-in gmt file, "" for none
  #   ntest     - unused, kept so existing callers keep working
  #   myTitle   - written as a header line to outfname
  #   outfname  - header file name; result tables go to camera_up_<outfname>
  #               and camera_down_<outfname>
  #   minnin, maxnin - a gene set is tested only if the number of its genes
  #               found in rownames(y) lies strictly between these
  #   fdrthresh, fdrtype - threshold and p.adjust method for adjusted p values
  # Console output is diverted to Camera.log for the duration of the call.
  sink('Camera.log')
  # undo the diversion even if camera or a file read fails
  on.exit(sink(), add=TRUE)
  genesets = c()
  if (bigmt > "")
  {
    genesets = readLines(bigmt)
  }
  if (histgmt > "")
  {
    # concatenate: each element is one gmt line (rbind would build a matrix)
    genesets = c(genesets,readLines(histgmt))
  }
  print.noquote(paste("@@@read",length(genesets), 'genesets from',histgmt,bigmt))
  if (length(genesets) == 0) {
    print.noquote('@@@@@ Camera: no gene sets supplied - nothing to test')
    return(invisible(NULL))
  }
  genesets = strsplit(genesets,'\t') # tabular. genesetid\tURLorwhatever\tgene_1\t..\tgene_n
  header = paste(myTitle,'edgeR GSEA')
  write(header,file=outfname,append=F)
  urownames = toupper(rownames(y))
  # one logical membership vector per retained gene set, kept as list
  # elements so that names(incam) lines up with gsids
  incam = list()
  urls = c()
  gsids = c()
  for (gs in genesets) {
    # need an id, a url and at least one gene
    if (length(gs) < 3) { next }
    gsid = gs[1] # geneset_id
    url = gs[2]
    if (url > "") { url = paste("<a href=\'",url,"\'>",url,"</a>",sep="") }
    glist = toupper(gs[3:length(gs)]) # member gene symbols
    inglist = urownames %in% glist
    nin = sum(inglist)
    if ((nin > minnin) && (nin < maxnin)) {
      incam[[length(incam) + 1]] = inglist
      gsids = c(gsids,gsid)
      urls = c(urls,url)
    }
  }
  if (length(incam) == 0) {
    print.noquote('@@@@@ Camera: no gene set passed the size filter - nothing to test')
    return(invisible(NULL))
  }
  names(incam) = gsids
  allcam = camera(y=y,index=incam,design=design)
  # NOTE(review): camera may return rows in a different order from the index
  # list, so ids and urls are matched on row names rather than bound by
  # position. Assumes gene set ids are unique - confirm for merged gmt files.
  ord = match(rownames(allcam),gsids)
  allcamres = cbind(geneset=gsids[ord],allcam,URL=urls[ord])
  isup = (allcamres\$Direction == "Up")
  upcam = allcamres[isup,]
  downcam = allcamres[!isup,]
  uscam = upcam[order(upcam\$PValue),]
  uscam\$adjPValue = p.adjust(uscam\$PValue,method=fdrtype)
  nup = max(10,sum((uscam\$adjPValue < fdrthresh)))
  dscam = downcam[order(downcam\$PValue),]
  dscam\$adjPValue = p.adjust(dscam\$PValue,method=fdrtype)
  ndown = max(10,sum((dscam\$adjPValue < fdrthresh)))
  write.table(uscam,file=paste('camera_up',outfname,sep='_'),quote=F,sep='\t',row.names=F)
  write.table(dscam,file=paste('camera_down',outfname,sep='_'),quote=F,sep='\t',row.names=F)
  print.noquote(paste('@@@@@ Camera up top',nup,'gene sets:'))
  write.table(head(uscam,nup),file="",quote=F,sep='\t',row.names=F)
  print.noquote(paste('@@@@@ Camera down top',ndown,'gene sets:'))
  write.table(head(dscam,ndown),file="",quote=F,sep='\t',row.names=F)
  invisible(NULL)
}
507
508
509 edgeIt = function (Count_Matrix=c(),group=c(),out_edgeR=F,out_VOOM=F,out_DESeq2=F,fdrtype='fdr',priordf=5,
510 fdrthresh=0.05,outputdir='.', myTitle='Differential Counts',libSize=c(),useNDF=F,
511 filterquantile=0.2, subjects=c(),mydesign=NULL,
512 doDESeq2=T,doVoom=T,doCamera=T,doedgeR=T,org='hg19',
513 histgmt="", bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
514 doCook=F,DESeq_fitType="parameteric")
515 {
516 # Error handling
517 if (length(unique(group))!=2){
518 print("Number of conditions identified in experiment does not equal 2")
519 q()
520 }
521 require(edgeR)
522 options(width = 512)
523 mt = paste(unlist(strsplit(myTitle,'_')),collapse=" ")
524 allN = nrow(Count_Matrix)
525 nscut = round(ncol(Count_Matrix)/2)
526 colTotmillionreads = colSums(Count_Matrix)/1e6
527 counts.dataframe = as.data.frame(c())
528 rawrs = rowSums(Count_Matrix)
529 nonzerod = Count_Matrix[(rawrs > 0),] # remove all zero count genes
530 nzN = nrow(nonzerod)
531 nzrs = rowSums(nonzerod)
532 zN = allN - nzN
533 print('# Quantiles for non-zero row counts:',quote=F)
534 print(quantile(nzrs,probs=seq(0,1,0.1)),quote=F)
535 if (useNDF == T)
536 {
537 gt1rpin3 = rowSums(Count_Matrix/expandAsMatrix(colTotmillionreads,dim(Count_Matrix)) >= 1) >= nscut
538 lo = colSums(Count_Matrix[!gt1rpin3,])
539 workCM = Count_Matrix[gt1rpin3,]
540 cleanrs = rowSums(workCM)
541 cleanN = length(cleanrs)
542 meth = paste( "After removing",length(lo),"contigs with fewer than ",nscut," sample read counts >= 1 per million, there are",sep="")
543 print(paste("Read",allN,"contigs. Removed",zN,"contigs with no reads.",meth,cleanN,"contigs"),quote=F)
544 maint = paste('Filter >=1/million reads in >=',nscut,'samples')
545 } else {
546 useme = (nzrs > quantile(nzrs,filterquantile))
547 workCM = nonzerod[useme,]
548 lo = colSums(nonzerod[!useme,])
549 cleanrs = rowSums(workCM)
550 cleanN = length(cleanrs)
551 meth = paste("After filtering at count quantile =",filterquantile,", there are",sep="")
552 print(paste('Read',allN,"contigs. Removed",zN,"with no reads.",meth,cleanN,"contigs"),quote=F)
553 maint = paste('Filter below',filterquantile,'quantile')
554 }
555 cumPlot(rawrs=rawrs,cleanrs=cleanrs,maint=maint,myTitle=myTitle)
556 allgenes = rownames(workCM)
557 reg = "^chr([0-9]+):([0-9]+)-([0-9]+)"
558 genecards="<a href=\'http://www.genecards.org/index.php?path=/Search/keyword/"
559 ucsc = paste("<a href=\'http://genome.ucsc.edu/cgi-bin/hgTracks?db=",org,sep='')
560 testreg = str_match(allgenes,reg)
561 if (sum(!is.na(testreg[,1]))/length(testreg[,1]) > 0.8) # is ucsc style string
562 {
563 print("@@ using ucsc substitution for urls")
564 contigurls = paste0(ucsc,"&amp;position=chr",testreg[,2],":",testreg[,3],"-",testreg[,4],"\'>",allgenes,"</a>")
565 } else {
566 print("@@ using genecards substitution for urls")
567 contigurls = paste0(genecards,allgenes,"\'>",allgenes,"</a>")
568 }
569 print.noquote("# urls")
570 print.noquote(head(contigurls))
571 print(paste("# Total low count contigs per sample = ",paste(lo,collapse=',')),quote=F)
572 cmrowsums = rowSums(workCM)
573 TName=unique(group)[1]
574 CName=unique(group)[2]
575 if (is.null(mydesign)) {
576 if (length(subjects) == 0)
577 {
578 mydesign = model.matrix(~group)
579 }
580 else {
581 subjf = factor(subjects)
582 mydesign = model.matrix(~subjf+group) # we block on subject so make group last to simplify finding it
583 }
584 }
585 print.noquote(paste('Using samples:',paste(colnames(workCM),collapse=',')))
586 print.noquote('Using design matrix:')
587 print.noquote(mydesign)
588 if (doedgeR) {
589 sink('edgeR.log')
590 #### Setup DGEList object
591 DGEList = DGEList(counts=workCM, group = group)
592 DGEList = calcNormFactors(DGEList)
593
594 DGEList = estimateGLMCommonDisp(DGEList,mydesign)
595 comdisp = DGEList\$common.dispersion
596 DGEList = estimateGLMTrendedDisp(DGEList,mydesign)
597 if (edgeR_priordf > 0) {
598 print.noquote(paste("prior.df =",edgeR_priordf))
599 DGEList = estimateGLMTagwiseDisp(DGEList,mydesign,prior.df = edgeR_priordf)
600 } else {
601 DGEList = estimateGLMTagwiseDisp(DGEList,mydesign)
602 }
603 DGLM = glmFit(DGEList,design=mydesign)
604 DE = glmLRT(DGLM,coef=ncol(DGLM\$design)) # always last one - subject is first if needed
605 efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors
606 normData = (1e+06*DGEList\$counts/efflib)
607 uoutput = cbind(
608 Name=as.character(rownames(DGEList\$counts)),
609 DE\$table,
610 adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
611 Dispersion=DGEList\$tagwise.dispersion,totreads=cmrowsums,normData,
612 DGEList\$counts
613 )
614 soutput = uoutput[order(DE\$table\$PValue),] # sorted into p value order - for quick toptable
615 goodness = gof(DGLM, pcutoff=fdrthresh)
616 if (sum(goodness\$outlier) > 0) {
617 print.noquote('GLM outliers:')
618 print(paste(rownames(DGLM)[(goodness\$outlier)],collapse=','),quote=F)
619 } else {
620 print('No GLM fit outlier genes found\n')
621 }
622 z = limma::zscoreGamma(goodness\$gof.statistic, shape=goodness\$df/2, scale=2)
623 pdf("edgeR_GoodnessofFit.pdf")
624 qq = qqnorm(z, panel.first=grid(), main="tagwise dispersion")
625 abline(0,1,lwd=3)
626 points(qq\$x[goodness\$outlier],qq\$y[goodness\$outlier], pch=16, col="maroon")
627 dev.off()
628 estpriorn = getPriorN(DGEList)
629 print(paste("Common Dispersion =",comdisp,"CV = ",sqrt(comdisp),"getPriorN = ",estpriorn),quote=F)
630 efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors
631 normData = (1e+06*DGEList\$counts/efflib)
632 uniqueg = unique(group)
633 #### Plot MDS
634 sample_colors = match(group,levels(group))
635 sampleTypes = levels(factor(group))
636 print.noquote(sampleTypes)
637 pdf("edgeR_MDSplot.pdf")
638 plotMDS.DGEList(DGEList,main=paste("edgeR MDS for",myTitle),cex=0.5,col=sample_colors,pch=sample_colors)
639 legend(x="topleft", legend = sampleTypes,col=c(1:length(sampleTypes)), pch=19)
640 grid(col="blue")
641 dev.off()
642 colnames(normData) = paste( colnames(normData),'N',sep="_")
643 print(paste('Raw sample read totals',paste(colSums(nonzerod,na.rm=T),collapse=',')))
644 nzd = data.frame(log(nonzerod + 1e-2,10))
645 try( boxPlot(rawrs=nzd,cleanrs=log(normData,10),maint='TMM Normalisation',myTitle=myTitle,pdfname="edgeR_raw_norm_counts_box.pdf") )
646 write.table(soutput,file=out_edgeR, quote=FALSE, sep="\t",row.names=F)
647 tt = cbind(
648 Name=as.character(rownames(DGEList\$counts)),
649 DE\$table,
650 adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
651 Dispersion=DGEList\$tagwise.dispersion,totreads=cmrowsums
652 )
653 print.noquote("# edgeR Top tags\n")
654 tt = cbind(tt,URL=contigurls) # add to end so table isn't laid out strangely
655 tt = tt[order(DE\$table\$PValue),]
656 print.noquote(tt[1:50,])
657 deTags = rownames(uoutput[uoutput\$adj.p.value < fdrthresh,])
658 nsig = length(deTags)
659 print(paste('#',nsig,'tags significant at adj p=',fdrthresh),quote=F)
660 deColours = ifelse(deTags,'red','black')
661 pdf("edgeR_BCV_vs_abundance.pdf")
662 plotBCV(DGEList, cex=0.3, main="Biological CV vs abundance")
663 dev.off()
664 dg = DGEList[order(DE\$table\$PValue),]
665 #normData = (1e+06 * dg\$counts/expandAsMatrix(dg\$samples\$lib.size, dim(dg)))
666 efflib = dg\$samples\$lib.size*dg\$samples\$norm.factors
667 normData = (1e+06*dg\$counts/efflib)
668 outpdfname="edgeR_top_100_heatmap.pdf"
669 hmap2(normData,nsamp=100,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste('edgeR Heatmap',myTitle))
670 outSmear = "edgeR_smearplot.pdf"
671 outMain = paste("Smear Plot for ",TName,' Vs ',CName,' (FDR@',fdrthresh,' N = ',nsig,')',sep='')
672 smearPlot(DGEList=DGEList,deTags=deTags, outSmear=outSmear, outMain = outMain)
673 qqPlot(descr=paste(myTitle,'edgeR adj p QQ plot'),pvector=tt\$adj.p.value,outpdf='edgeR_qqplot.pdf')
674 norm.factor = DGEList\$samples\$norm.factors
675 topresults.edgeR = soutput[which(soutput\$adj.p.value < fdrthresh), ]
676 edgeRcountsindex = which(allgenes %in% rownames(topresults.edgeR))
677 edgeRcounts = rep(0, length(allgenes))
678 edgeRcounts[edgeRcountsindex] = 1 # Create venn diagram of hits
679 sink()
680 } ### doedgeR
681 if (doDESeq2 == T)
682 {
683 sink("DESeq2.log")
684 # DESeq2
685 require('DESeq2')
686 library('RColorBrewer')
687 if (length(subjects) == 0)
688 {
689 pdata = data.frame(Name=colnames(workCM),Rx=group,row.names=colnames(workCM))
690 deSEQds = DESeqDataSetFromMatrix(countData = workCM, colData = pdata, design = formula(~ Rx))
691 } else {
692 pdata = data.frame(Name=colnames(workCM),Rx=group,subjects=subjects,row.names=colnames(workCM))
693 deSEQds = DESeqDataSetFromMatrix(countData = workCM, colData = pdata, design = formula(~ subjects + Rx))
694 }
695 #DESeq2 = DESeq(deSEQds,fitType='local',pAdjustMethod=fdrtype)
696 #rDESeq = results(DESeq2)
697 #newCountDataSet(workCM, group)
698 deSeqDatsizefac = estimateSizeFactors(deSEQds)
699 deSeqDatdisp = estimateDispersions(deSeqDatsizefac,fitType=DESeq_fitType)
700 resDESeq = nbinomWaldTest(deSeqDatdisp, pAdjustMethod=fdrtype)
701 rDESeq = as.data.frame(results(resDESeq))
702 rDESeq = cbind(Contig=rownames(workCM),rDESeq,NReads=cmrowsums,URL=contigurls)
703 srDESeq = rDESeq[order(rDESeq\$pvalue),]
704 qqPlot(descr=paste(myTitle,'DESeq2 adj p qq plot'),pvector=rDESeq\$padj,outpdf='DESeq2_qqplot.pdf')
705 cat("# DESeq top 50\n")
706 print.noquote(srDESeq[1:50,])
707 write.table(srDESeq,file=out_DESeq2, quote=FALSE, sep="\t",row.names=F)
708 topresults.DESeq = rDESeq[which(rDESeq\$padj < fdrthresh), ]
709 DESeqcountsindex = which(allgenes %in% rownames(topresults.DESeq))
710 DESeqcounts = rep(0, length(allgenes))
711 DESeqcounts[DESeqcountsindex] = 1
712 pdf("DESeq2_dispersion_estimates.pdf")
713 plotDispEsts(resDESeq)
714 dev.off()
715 ysmall = abs(min(rDESeq\$log2FoldChange))
716 ybig = abs(max(rDESeq\$log2FoldChange))
717 ylimit = min(4,ysmall,ybig)
718 pdf("DESeq2_MA_plot.pdf")
719 plotMA(resDESeq,main=paste(myTitle,"DESeq2 MA plot"),ylim=c(-ylimit,ylimit))
720 dev.off()
721 rlogres = rlogTransformation(resDESeq)
722 sampledists = dist( t( assay(rlogres) ) )
723 sdmat = as.matrix(sampledists)
724 pdf("DESeq2_sample_distance_plot.pdf")
725 heatmap.2(sdmat,trace="none",main=paste(myTitle,"DESeq2 sample distances"),
726 col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(255))
727 dev.off()
728 ###outpdfname="DESeq2_top50_heatmap.pdf"
729 ###hmap2(sresDESeq,nsamp=50,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste('DESeq2 vst rlog Heatmap',myTitle))
730 sink()
731 result = try( (ppca = plotPCA( varianceStabilizingTransformation(deSeqDatdisp,blind=T), intgroup=c("Rx","Name")) ) )
732 if ("try-error" %in% class(result)) {
733 print.noquote('DESeq2 plotPCA failed.')
734 } else {
735 pdf("DESeq2_PCA_plot.pdf")
736 #### wtf - print? Seems needed to get this to work
737 print(ppca)
738 dev.off()
739 }
740 }
741
742 if (doVoom == T) {
743 sink('VOOM.log')
744 if (doedgeR == F) {
745 #### Setup DGEList object
746 DGEList = DGEList(counts=workCM, group = group)
747 DGEList = calcNormFactors(DGEList)
748 DGEList = estimateGLMCommonDisp(DGEList,mydesign)
749 DGEList = estimateGLMTrendedDisp(DGEList,mydesign)
750 DGEList = estimateGLMTagwiseDisp(DGEList,mydesign)
751 DGEList = estimateGLMTagwiseDisp(DGEList,mydesign)
752 norm.factor = DGEList\$samples\$norm.factors
753 }
754 pdf("VOOM_mean_variance_plot.pdf")
755 dat.voomed = voom(DGEList, mydesign, plot = TRUE, lib.size = colSums(workCM) * norm.factor)
756 dev.off()
757 # Use limma to fit data
758 fit = lmFit(dat.voomed, mydesign)
759 fit = eBayes(fit)
760 rvoom = topTable(fit, coef = length(colnames(mydesign)), adj = fdrtype, n = Inf, sort="none")
761 qqPlot(descr=paste(myTitle,'VOOM-limma adj p QQ plot'),pvector=rvoom\$adj.P.Val,outpdf='VOOM_qqplot.pdf')
762 rownames(rvoom) = rownames(workCM)
763 rvoom = cbind(rvoom,NReads=cmrowsums,URL=contigurls)
764 srvoom = rvoom[order(rvoom\$P.Value),]
765 cat("# VOOM top 50\n")
766 print(srvoom[1:50,])
767 write.table(srvoom,file=out_VOOM, quote=FALSE, sep="\t",row.names=F)
768 # Use an FDR cutoff to find interesting samples for edgeR, DESeq and voom/limma
769 topresults.voom = rvoom[which(rvoom\$adj.P.Val < fdrthresh), ]
770 voomcountsindex = which(allgenes %in% topresults.voom\$ID)
771 voomcounts = rep(0, length(allgenes))
772 voomcounts[voomcountsindex] = 1
773 sink()
774 }
775
776 if (doCamera) {
777 doGSEA(y=DGEList,design=mydesign,histgmt=histgmt,bigmt=bigmt,ntest=20,myTitle=myTitle,
778 outfname=paste(mt,"GSEA.xls",sep="_"),fdrthresh=fdrthresh,fdrtype=fdrtype)
779 }
780
781 if ((doDESeq2==T) || (doVoom==T) || (doedgeR==T)) {
782 if ((doVoom==T) && (doDESeq2==T) && (doedgeR==T)) {
783 vennmain = paste(mt,'Voom,edgeR and DESeq2 overlap at FDR=',fdrthresh)
784 counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts,
785 VOOM_limma = voomcounts, row.names = allgenes)
786 } else if ((doDESeq2==T) && (doedgeR==T)) {
787 vennmain = paste(mt,'DESeq2 and edgeR overlap at FDR=',fdrthresh)
788 counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts, row.names = allgenes)
789 } else if ((doVoom==T) && (doedgeR==T)) {
790 vennmain = paste(mt,'Voom and edgeR overlap at FDR=',fdrthresh)
791 counts.dataframe = data.frame(edgeR = edgeRcounts, VOOM_limma = voomcounts, row.names = allgenes)
792 }
793
794 if (nrow(counts.dataframe > 1)) {
795 counts.venn = vennCounts(counts.dataframe)
796 vennf = "Venn_significant_genes_overlap.pdf"
797 pdf(vennf)
798 vennDiagram(counts.venn,main=vennmain,col="maroon")
799 dev.off()
800 }
801 } #### doDESeq2 or doVoom
802
803 }
804 #### Done
805 #### Driver section: Cheetah substitutes the tool parameters below, then the chosen columns of the count matrix are loaded and handed to edgeIt
806 ###sink(stdout(),append=T,type="message")
807 builtin_gmt = ""
808 history_gmt = ""
809 history_gmt_name = ""
810 out_edgeR = F
811 out_DESeq2 = F
812 out_VOOM = "$out_VOOM"
813 doDESeq2 = $DESeq2.doDESeq2 # make these T or F
814 doVoom = $doVoom
815 doCamera = F
816 doedgeR = $edgeR.doedgeR
817 edgeR_priordf = 0
818 #### fallback fit type so the edgeIt call below always receives a defined value; overridden when DESeq2 is selected
819 DESeq_fitType = "parametric"
820 #if $doVoom == "T":
821 out_VOOM = "$out_VOOM"
822 #end if
823
824 #if $DESeq2.doDESeq2 == "T":
825 out_DESeq2 = "$out_DESeq2"
826 DESeq_fitType = "$DESeq2.DESeq_fitType"
827 #end if
828
829 #if $edgeR.doedgeR == "T":
830 out_edgeR = "$out_edgeR"
831 edgeR_priordf = $edgeR.edgeR_priordf
832 #end if
833
834 #### Camera settings are disabled (doCamera is fixed to F above). They are kept as Cheetah comments because an XML comment marker is not a comment inside this script
835 ## #if $camera.doCamera == 'T'
836 ## doCamera = $camera.doCamera
837 ## #if $camera.gmtSource.refgmtSource == "indexed" or $camera.gmtSource.refgmtSource == "both":
838 ## builtin_gmt = "${camera.gmtSource.builtinGMT.fields.path}"
839 ## #end if
840 ## #if $camera.gmtSource.refgmtSource == "history" or $camera.gmtSource.refgmtSource == "both":
841 ## history_gmt = "${camera.gmtSource.ownGMT}"
842 ## history_gmt_name = "${camera.gmtSource.ownGMT.name}"
843 ## #end if
844 ## #end if
845 #### end of disabled camera settings
846
847 if (sum(c(doedgeR,doVoom,doDESeq2)) == 0)
848 {
849 write("No methods chosen - nothing to do! Please try again after choosing one or more methods", stderr())
850 quit(save="no",status=2)
851 }
852
853 Out_Dir = "$html_file.files_path"
854 Input = "$input1"
855 TreatmentName = "$treatment_name"
856 TreatmentCols = "$Treat_cols"
857 ControlName = "$control_name"
858 ControlCols= "$Control_cols"
859 org = "$input1.dbkey"
860 if (org == "" || org == "?") { org = "hg19"} # Galaxy reports an unset build as ?
861 fdrtype = "$fdrtype"
862 fdrthresh = $fdrthresh
863 useNDF = $useNDF
864 fQ = $fQ # non-differential centile cutoff
865 myTitle = "$title"
866 sids = strsplit("$subjectids",',')
867 subjects = unlist(sids)
868 nsubj = length(subjects)
869 TCols = as.numeric(strsplit(TreatmentCols,",")[[1]])-1 # shifted by 1 because column 1 becomes the row names on load
870 CCols = as.numeric(strsplit(ControlCols,",")[[1]])-1
871 cat('Got TCols=')
872 cat(TCols)
873 cat('; CCols=')
874 cat(CCols)
875 cat('\n')
876 useCols = c(TCols,CCols)
877 if (file.exists(Out_Dir) == F) dir.create(Out_Dir)
878 Count_Matrix = read.table(Input,header=T,row.names=1,sep='\t') #Load tab file assume header
879 snames = colnames(Count_Matrix)
880 nsamples = length(snames)
881 if (nsubj > 0 && nsubj != nsamples) {
882 options("show.error.messages"=T)
883 mess = paste('Fatal error: Supplied subject id list',paste(subjects,collapse=','),
884 'has length',nsubj,'but there are',nsamples,'samples',paste(snames,collapse=','))
885 write(mess, stderr())
886 quit(save="no",status=4)
887 }
888 if (length(subjects) != 0) {subjects = subjects[useCols]}
889 Count_Matrix = Count_Matrix[,useCols] ### reorder columns
890 rn = rownames(Count_Matrix)
891 islib = rn %in% c('librarySize','NotInBedRegions')
892 LibSizes = Count_Matrix[subset(rn,islib),][1] # take first
893 Count_Matrix = Count_Matrix[subset(rn,! islib),]
894 group = c(rep(TreatmentName,length(TCols)), rep(ControlName,length(CCols)) ) #Build a group descriptor
895 group = factor(group, levels=c(ControlName,TreatmentName))
896 colnames(Count_Matrix) = paste(group,colnames(Count_Matrix),sep="_") #Relabel columns
897 results = edgeIt(Count_Matrix=Count_Matrix,group=group, out_edgeR=out_edgeR, out_VOOM=out_VOOM, out_DESeq2=out_DESeq2,
898 fdrtype=fdrtype,mydesign=NULL,priordf=edgeR_priordf,fdrthresh=fdrthresh,outputdir='.',
899 myTitle=myTitle,useNDF=F,libSize=c(),filterquantile=fQ,subjects=subjects,
900 doDESeq2=doDESeq2,doVoom=doVoom,doCamera=doCamera,doedgeR=doedgeR,org=org,
901 histgmt=history_gmt,bigmt=builtin_gmt,DESeq_fitType=DESeq_fitType)
902 sessionInfo()
903 ]]>
904 </configfile>
905 </configfiles>
906 <help>
907
908 **What it does**
909
910 Allows short read sequence counts from controlled experiments to be analysed for differentially expressed genes.
911 Optionally adds a term for subject if not all samples are independent or if some other factor needs to be blocked in the design.
912
913 **Input**
914
915 Requires a count matrix as a tabular file. These are best made using the companion HTSeq_ based counter Galaxy wrapper
916 and your fave gene model to generate inputs. Each row is a genomic feature (gene or exon eg) and each column the
917 non-negative integer count of reads from one sample overlapping the feature.
918 The matrix must have a header row uniquely identifying the source samples, and unique row names in
919 the first column. Typically the row names are gene symbols or probe ids for downstream use in GSEA and other methods.
920
921 **Specifying comparisons**
922
923 This is basically dumbed down for two factors - case vs control.
924
925 More complex interfaces are possible but painful at present.
926 Probably need to specify a phenotype file to do this better.
927 Work in progress. Send code.
928
929 If you have (eg) paired samples and wish to include a term in the GLM to account for some other factor (subject in the case of paired samples),
930 put a comma separated list of indicators for every sample (whether modelled or not!) indicating (eg) the subject number.
931 Supply a list of integers, one for each sample, or an empty string if samples are all independent.
932 If not empty, there must be exactly as many integers in the supplied integer list as there are columns (samples) in the count matrix.
933 Integers for samples that are not in the analysis *must* be present in the string as filler even if not used.
934
935 So if you have 2 pairs out of 6 samples, you need to put in unique integers for the unpaired ones
936 eg if you had 6 samples with the first two independent but the second and third pairs each being from independent subjects, you might use
937 8,9,1,1,2,2
938 as subject IDs to indicate two paired samples from the same subject in columns 3/4 and 5/6
939
940 **Methods available**
941
942 You can run 3 popular Bioconductor packages available for count data.
943
944 edgeR - see edgeR_ for details
945
946 VOOM/limma - see limma_VOOM_ for details
947
948 DESeq2 - see DESeq2_ for details
949
950 and optionally camera in edgeR which works better if MSigDB is installed.
951
952 **Outputs**
953
954 Some helpful plots and analysis results. Note that most of these are produced using R code
955 suggested by the excellent documentation and vignettes for the Bioconductor
956 packages invoked. The Tool Factory is used to automatically lay these out for you to enjoy.
957
958 **Note on Voom**
959
960 The voom from limma version 3.16.6 help in R includes this from the authors - but you should read the paper to interpret this method.
961
962 This function is intended to process RNA-Seq or ChIP-Seq data prior to linear modelling in limma.
963
964 voom is an acronym for mean-variance modelling at the observational level.
965 The key concern is to estimate the mean-variance relationship in the data, then use this to compute appropriate weights for each observation.
966 Count data almost always show non-trivial mean-variance relationships. Raw counts show increasing variance with increasing count size, while log-counts typically show a decreasing mean-variance trend.
967 This function estimates the mean-variance trend for log-counts, then assigns a weight to each observation based on its predicted variance.
968 The weights are then used in the linear modelling process to adjust for heteroscedasticity.
969
970 In an experiment, a count value is observed for each tag in each sample. A tag-wise mean-variance trend is computed using lowess.
971 The tag-wise mean is the mean log2 count with an offset of 0.5, across samples for a given tag.
972 The tag-wise variance is the quarter-root-variance of normalized log2 counts per million values with an offset of 0.5, across samples for a given tag.
973 Tags with zero counts across all samples are not included in the lowess fit. Optional normalization is performed using normalizeBetweenArrays.
974 Using fitted values of log2 counts from a linear model fit by lmFit, variances from the mean-variance trend were interpolated for each observation.
975 This was carried out by approxfun. Inverse variance weights can be used to correct for mean-variance trend in the count data.
976
977
978 Author(s)
979
980 Charity Law and Gordon Smyth
981
982 References
983
984 Law, CW (2013). Precision weights for gene expression analysis. PhD Thesis. University of Melbourne, Australia.
985
986 Law, CW, Chen, Y, Shi, W, Smyth, GK (2013). Voom! Precision weights unlock linear model analysis tools for RNA-seq read counts.
987 Technical Report 1 May 2013, Bioinformatics Division, Walter and Eliza Hall Institute of Medical Research, Melbourne, Australia.
988 http://www.statsci.org/smyth/pubs/VoomPreprint.pdf
989
990 See Also
991
992 A voom case study is given in the edgeR User's Guide.
993
994 vooma is a similar function but for microarrays instead of RNA-seq.
995
996
997 ***old rant on changes to Bioconductor package variable names between versions***
998
999 The edgeR authors made a small cosmetic change in the name of one important variable (from p.value to PValue)
1000 breaking this and all other code that assumed the old name for this variable,
1001 between edgeR2.4.4 and 2.4.6 (the version for R 2.14 as at the time of writing).
1002 This means that all code using edgeR is sensitive to the version. I think this was a very unwise thing
1003 to do because it wasted hours of my time to track down and will similarly cost other edgeR users dearly
1004 when their old scripts break. This tool currently works with 2.4.6.
1005
1006 **Note on prior.N**
1007
1008 http://seqanswers.com/forums/showthread.php?t=5591 says:
1009
1010 *prior.n*
1011
1012 The value for prior.n determines the amount of smoothing of tagwise dispersions towards the common dispersion.
1013 You can think of it as like a "weight" for the common value. (It is actually the weight for the common likelihood
1014 in the weighted likelihood equation). The larger the value for prior.n, the more smoothing, i.e. the closer your
1015 tagwise dispersion estimates will be to the common dispersion. If you use a prior.n of 1, then that gives the
1016 common likelihood the weight of one observation.
1017
1018 In answer to your question, it is a good thing to squeeze the tagwise dispersions towards a common value,
1019 or else you will be using very unreliable estimates of the dispersion. I would not recommend using the value that
1020 you obtained from estimateSmoothing()---this is far too small and would result in virtually no moderation
1021 (squeezing) of the tagwise dispersions. How many samples do you have in your experiment?
1022 What is the experimental design? If you have few samples (less than 6) then I would suggest a prior.n of at least 10.
1023 If you have more samples, then the tagwise dispersion estimates will be more reliable,
1024 so you could consider using a smaller prior.n, although I would hesitate to use a prior.n less than 5.
1025
1026
1027 From Bioconductor Digest, Vol 118, Issue 5, Gordon writes:
1028
1029 Dear Dorota,
1030
1031 The important settings are prior.df and trend.
1032
1033 prior.n and prior.df are related through prior.df = prior.n * residual.df,
1034 and your experiment has residual.df = 36 - 12 = 24. So the old setting of
1035 prior.n=10 is equivalent for your data to prior.df = 240, a very large
1036 value. Going the other way, the new setting of prior.df=10 is equivalent
1037 to prior.n=10/24.
1038
1039 To recover old results with the current software you would use
1040
1041 estimateTagwiseDisp(object, prior.df=240, trend="none")
1042
1043 To get the new default from old software you would use
1044
1045 estimateTagwiseDisp(object, prior.n=10/24, trend=TRUE)
1046
1047 Actually the old trend method is equivalent to trend="loess" in the new
1048 software. You should use plotBCV(object) to see whether a trend is
1049 required.
1050
1051 Note you could also use
1052
1053 prior.n = getPriorN(object, prior.df=10)
1054
1055 to map between prior.df and prior.n.
1056
1057 ----
1058
1059 **Attributions**
1060
1061 edgeR - edgeR_
1062
1063 VOOM/limma - limma_VOOM_
1064
1065 DESeq2 - DESeq2_ for details
1066
1067 See above for Bioconductor package documentation for packages exposed in Galaxy by this tool and app store package.
1068
1069 Galaxy_ (that's what you are using right now!) for gluing everything together
1070
1071 Otherwise, all code and documentation comprising this tool was written by Ross Lazarus and is
1072 licensed to you under the LGPL_ like other rgenetics artefacts
1073
1074 .. _LGPL: http://www.gnu.org/copyleft/lesser.html
1075 .. _HTSeq: http://www-huber.embl.de/users/anders/HTSeq/doc/index.html
1076 .. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html
1077 .. _DESeq2: http://www.bioconductor.org/packages/release/bioc/html/DESeq2.html
1078 .. _limma_VOOM: http://www.bioconductor.org/packages/release/bioc/html/limma.html
1079 .. _Galaxy: http://getgalaxy.org
1080 </help>
1081
1082 </tool>
1083
1084