comparison rgedgeRpaired_nocamera.xml~ @ 32:8051d14cf819 draft

update with better filtering
author fubar
date Sun, 22 Dec 2013 01:32:29 -0500
parents
children
comparison
equal deleted inserted replaced
31:5b6af671d985 32:8051d14cf819
1 <tool id="rgDifferentialCount" name="Differential_Count" version="0.22">
2 <description>models using BioConductor packages</description>
3 <requirements>
4 <requirement type="package" version="3.0.1">r3</requirement>
5 <requirement type="package" version="1.3.18">graphicsmagick</requirement>
6 <requirement type="package" version="9.07">ghostscript</requirement>
7 <requirement type="package" version="2.12">biocbasics</requirement>
8 </requirements>
9
10 <command interpreter="python">
11 rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "DifferentialCounts"
12 --output_dir "$html_file.files_path" --output_html "$html_file" --make_HTML "yes"
13 </command>
14 <inputs>
15 <param name="input1" type="data" format="tabular" label="Select an input matrix - rows are contigs, columns are counts for each sample"
16 help="Use the HTSeq based count matrix preparation tool to create these matrices from BAM/SAM files and a GTF file of genomic features"/>
17 <param name="title" type="text" value="Differential Counts" size="80" label="Title for job outputs"
18 help="Supply a meaningful name here to remind you what the outputs contain">
19 <sanitizer invalid_char="">
20 <valid initial="string.letters,string.digits"><add value="_" /> </valid>
21 </sanitizer>
22 </param>
23 <param name="treatment_name" type="text" value="Treatment" size="50" label="Treatment Name"/>
24 <param name="Treat_cols" label="Select columns containing treatment." type="data_column" data_ref="input1" numerical="True"
25 multiple="true" use_header_names="true" size="120" display="checkboxes">
26 <validator type="no_options" message="Please select at least one column."/>
27 <sanitizer invalid_char="">
28 <valid initial="string.letters,string.digits"><add value="_" /> </valid>
29 </sanitizer>
30 </param>
31 <param name="control_name" type="text" value="Control" size="50" label="Control Name"/>
32 <param name="Control_cols" label="Select columns containing control." type="data_column" data_ref="input1" numerical="True"
33 multiple="true" use_header_names="true" size="120" display="checkboxes" optional="true">
34 <validator type="no_options" message="Please select at least one column."/>
35 <sanitizer invalid_char="">
36 <valid initial="string.letters,string.digits"><add value="_" /> </valid>
37 </sanitizer>
38 </param>
39 <param name="subjectids" type="text" optional="true" size="120" value = ""
40 label="IF SUBJECTS NOT ALL INDEPENDENT! Enter comma separated strings to indicate sample labels for (eg) pairing - must be one for every column in input"
41 help="Leave blank if no pairing, but eg if data from sample id A99 is in columns 2,4 and id C21 is in 3,5 then enter 'A99,C21,A99,C21'">
42 <sanitizer>
43 <valid initial="string.letters,string.digits"><add value="," /> </valid>
44 </sanitizer>
45 </param>
46 <param name="fQ" type="float" value="0.3" size="5" label="Non-differential contig count quantile threshold - zero to analyze all non-zero read count contigs"
47 help="May be a good or a bad idea depending on the biology and the question. EG 0.3 = sparsest 30% of contigs with at least one read are removed before analysis"/>
48 <param name="useNDF" type="boolean" truevalue="T" falsevalue="F" checked="false" size="1"
49 label="Non differential filter - remove contigs below a threshold (1 per million) for half or more samples"
50 help="May be a good or a bad idea depending on the biology and the question. This was the old default. Quantile based is available as an alternative"/>
51
52 <conditional name="edgeR">
53 <param name="doedgeR" type="select"
54 label="Run this model using edgeR"
55 help="edgeR uses a negative binomial model and seems to be powerful, even with few replicates">
56 <option value="F">Do not run edgeR</option>
57 <option value="T" selected="true">Run edgeR</option>
58 </param>
59 <when value="T">
60 <param name="edgeR_priordf" type="integer" value="20" size="3"
61 label="prior.df for tagwise dispersion - lower value = more emphasis on each tag's variance. Replaces prior.n and prior.df = prior.n * residual.df"
62 help="0 = Use edgeR default. Use a small value to 'smooth' small samples. See edgeR docs and note below"/>
63 </when>
64 <when value="F"></when>
65 </conditional>
66 <conditional name="DESeq2">
67 <param name="doDESeq2" type="select"
68 label="Run the same model with DESeq2 and compare findings"
69 help="DESeq2 is an update to the DESeq package. It uses different assumptions and methods to edgeR">
70 <option value="F" selected="true">Do not run DESeq2</option>
71 <option value="T">Run DESeq2</option>
72 </param>
73 <when value="T">
74 <param name="DESeq_fitType" type="select">
75 <option value="parametric" selected="true">Parametric (default) fit for dispersions</option>
76 <option value="local">Local fit - this will automagically be used if parametric fit fails</option>
77 <option value="mean">Mean dispersion fit- use this if you really understand what you're doing - read the fine manual linked below in the documentation</option>
78 </param>
79 </when>
80 <when value="F"> </when>
81 </conditional>
82 <param name="doVoom" type="select"
83 label="Run the same model with Voom/limma and compare findings"
84 help="Voom uses counts per million and a precise transformation of variance so count data can be analysed using limma">
85 <option value="F" selected="true">Do not run VOOM</option>
86 <option value="T">Run VOOM</option>
87 </param>
88 <!--
89 <conditional name="camera">
90 <param name="doCamera" type="select" label="Run the edgeR implementation of Camera GSEA for up/down gene sets"
91 help="If yes, you can choose a set of genesets to test and/or supply a gmt format geneset collection from your history">
92 <option value="F" selected="true">Do not run GSEA tests with the Camera algorithm</option>
93 <option value="T">Run GSEA tests with the Camera algorithm</option>
94 </param>
95 <when value="T">
96 <conditional name="gmtSource">
97 <param name="refgmtSource" type="select"
98 label="Use a gene set (.gmt) from your history and/or use a built-in (MSigDB etc) gene set">
99 <option value="indexed" selected="true">Use a built-in gene set</option>
100 <option value="history">Use a gene set from my history</option>
101 <option value="both">Add a gene set from my history to a built in gene set</option>
102 </param>
103 <when value="indexed">
104 <param name="builtinGMT" type="select" label="Select a gene set matrix (.gmt) file to use for the analysis">
105 <options from_data_table="gseaGMT_3.1">
106 <filter type="sort_by" column="2" />
107 <validator type="no_options" message="No GMT v3.1 files are available - please install them"/>
108 </options>
109 </param>
110 </when>
111 <when value="history">
112 <param name="ownGMT" type="data" format="gmt" label="Select a Gene Set from your history" />
113 </when>
114 <when value="both">
115 <param name="ownGMT" type="data" format="gseagmt" label="Select a Gene Set from your history" />
116 <param name="builtinGMT" type="select" label="Select a gene set matrix (.gmt) file to use for the analysis">
117 <options from_data_table="gseaGMT_4">
118 <filter type="sort_by" column="2" />
119 <validator type="no_options" message="No GMT v4 files are available - please fix tool_data_table and loc files"/>
120 </options>
121 </param>
122 </when>
123 </conditional>
124 </when>
125 <when value="F">
126 </when>
127 </conditional>
128 -->
129 <param name="fdrthresh" type="float" value="0.05" size="5" label="P value threshold for FDR filtering for amily wise error rate control"
130 help="Conventional default value of 0.05 recommended"/>
131 <param name="fdrtype" type="select" label="FDR (Type II error) control method"
132 help="Use fdr or bh typically to control for the number of tests in a reliable way">
133 <option value="fdr" selected="true">fdr</option>
134 <option value="BH">Benjamini Hochberg</option>
135 <option value="BY">Benjamini Yukateli</option>
136 <option value="bonferroni">Bonferroni</option>
137 <option value="hochberg">Hochberg</option>
138 <option value="holm">Holm</option>
139 <option value="hommel">Hommel</option>
140 <option value="none">no control for multiple tests</option>
141 </param>
142 </inputs>
143 <outputs>
144 <data format="tabular" name="out_edgeR" label="${title}_topTable_edgeR.xls">
145 <filter>edgeR['doedgeR'] == "T"</filter>
146 </data>
147 <data format="tabular" name="out_DESeq2" label="${title}_topTable_DESeq2.xls">
148 <filter>DESeq2['doDESeq2'] == "T"</filter>
149 </data>
150 <data format="tabular" name="out_VOOM" label="${title}_topTable_VOOM.xls">
151 <filter>doVoom == "T"</filter>
152 </data>
153 <data format="html" name="html_file" label="${title}.html"/>
154 </outputs>
155 <stdio>
156 <exit_code range="4" level="fatal" description="Number of subject ids must match total number of samples in the input matrix" />
157 </stdio>
158 <tests>
159 <test>
160 <param name='input1' value='test_bams2mx.xls' ftype='tabular' />
161 <param name='treatment_name' value='liver' />
162 <param name='title' value='edgeRtest' />
163 <param name='useNDF' value='' />
164 <param name='doedgeR' value='T' />
165 <param name='doVoom' value='T' />
166 <param name='doDESeq2' value='T' />
167 <param name='fdrtype' value='fdr' />
168 <param name='edgeR_priordf' value="8" />
169 <param name='fdrthresh' value="0.05" />
170 <param name='control_name' value='heart' />
171 <param name='subjectids' value='' />
172 <param name='Control_cols' value='3,4,5,9' />
173 <param name='Treat_cols' value='2,6,7,8' />
174 <output name='out_edgeR' file='edgeRtest1out.xls' compare='diff' />
175 <output name='html_file' file='edgeRtest1out.html' compare='diff' lines_diff='20' />
176 </test>
177 </tests>
178
179 <configfiles>
180 <configfile name="runme">
181 <![CDATA[
182 #
183 # edgeR.Rscript
184 # updated npv 2011 for R 2.14.0 and edgeR 2.4.0 by ross
185 # Performs DGE on a count table containing n replicates of two conditions
186 #
187 # Parameters
188 #
189 # 1 - Output Dir
190
191 # Original edgeR code by: S.Lunke and A.Kaspi
192 reallybig = log10(.Machine\$double.xmax)
193 reallysmall = log10(.Machine\$double.xmin)
194 library('stringr')
195 library('gplots')
196 library('edgeR')
197 hmap2 = function(cmat,nsamp=100,outpdfname='heatmap2.pdf', TName='Treatment',group=NA,myTitle='title goes here')
198 {
199 # Perform clustering for significant pvalues after controlling FWER
200 samples = colnames(cmat)
201 gu = unique(group)
202 gn = rownames(cmat)
203 if (length(gu) == 2) {
204 col.map = function(g) {if (g==gu[1]) "#FF0000" else "#0000FF"}
205 pcols = unlist(lapply(group,col.map))
206 } else {
207 colours = rainbow(length(gu),start=0,end=4/6)
208 pcols = colours[match(group,gu)] }
209 dm = cmat[(! is.na(gn)),]
210 # remove unlabelled hm rows
211 nprobes = nrow(dm)
212 # sub = paste('Showing',nprobes,'contigs ranked for evidence of differential abundance')
213 if (nprobes > nsamp) {
214 dm =dm[1:nsamp,]
215 #sub = paste('Showing',nsamp,'contigs ranked for evidence for differential abundance out of',nprobes,'total')
216 }
217 newcolnames = substr(colnames(dm),1,20)
218 colnames(dm) = newcolnames
219 pdf(outpdfname)
220 heatmap.2(dm,main=myTitle,ColSideColors=pcols,col=topo.colors(100),dendrogram="col",key=T,density.info='none',
221 Rowv=F,scale='row',trace='none',margins=c(8,8),cexRow=0.4,cexCol=0.5)
222 dev.off()
223 }
224
225 hmap = function(cmat,nmeans=4,outpdfname="heatMap.pdf",nsamp=250,TName='Treatment',group=NA,myTitle="Title goes here")
226 {
227 # for 2 groups only was
228 #col.map = function(g) {if (g==TName) "#FF0000" else "#0000FF"}
229 #pcols = unlist(lapply(group,col.map))
230 gu = unique(group)
231 colours = rainbow(length(gu),start=0.3,end=0.6)
232 pcols = colours[match(group,gu)]
233 nrows = nrow(cmat)
234 mtitle = paste(myTitle,'Heatmap: n contigs =',nrows)
235 if (nrows > nsamp) {
236 cmat = cmat[c(1:nsamp),]
237 mtitle = paste('Heatmap: Top ',nsamp,' DE contigs (of ',nrows,')',sep='')
238 }
239 newcolnames = substr(colnames(cmat),1,20)
240 colnames(cmat) = newcolnames
241 pdf(outpdfname)
242 heatmap(cmat,scale='row',main=mtitle,cexRow=0.3,cexCol=0.4,Rowv=NA,ColSideColors=pcols)
243 dev.off()
244 }
245
246 qqPlot = function(descr='qqplot',pvector, outpdf='qqplot.pdf',...)
247 # stolen from https://gist.github.com/703512
248 {
249 o = -log10(sort(pvector,decreasing=F))
250 e = -log10( 1:length(o)/length(o) )
251 o[o==-Inf] = reallysmall
252 o[o==Inf] = reallybig
253 maint = descr
254 pdf(outpdf)
255 plot(e,o,pch=19,cex=1, main=maint, ...,
256 xlab=expression(Expected~~-log[10](italic(p))),
257 ylab=expression(Observed~~-log[10](italic(p))),
258 xlim=c(0,max(e)), ylim=c(0,max(o)))
259 lines(e,e,col="red")
260 grid(col = "lightgray", lty = "dotted")
261 dev.off()
262 }
263
264 smearPlot = function(DGEList,deTags, outSmear, outMain)
265 {
266 pdf(outSmear)
267 plotSmear(DGEList,de.tags=deTags,main=outMain)
268 grid(col="lightgray", lty="dotted")
269 dev.off()
270 }
271
272 boxPlot = function(rawrs,cleanrs,maint,myTitle,pdfname)
273 { #
274 nc = ncol(rawrs)
275 for (i in c(1:nc)) {rawrs[(rawrs[,i] < 0),i] = NA}
276 fullnames = colnames(rawrs)
277 newcolnames = substr(colnames(rawrs),1,20)
278 colnames(rawrs) = newcolnames
279 newcolnames = substr(colnames(cleanrs),1,20)
280 colnames(cleanrs) = newcolnames
281 defpar = par(no.readonly=T)
282 print.noquote('raw contig counts by sample:')
283 print.noquote(summary(rawrs))
284 print.noquote('normalised contig counts by sample:')
285 print.noquote(summary(cleanrs))
286 pdf(pdfname)
287 par(mfrow=c(1,2))
288 boxplot(rawrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('Raw:',maint))
289 grid(col="lightgray",lty="dotted")
290 boxplot(cleanrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('After ',maint))
291 grid(col="lightgray",lty="dotted")
292 dev.off()
293 pdfname = "sample_counts_histogram.pdf"
294 nc = ncol(rawrs)
295 print.noquote(paste('Using ncol rawrs=',nc))
296 ncroot = round(sqrt(nc))
297 if (ncroot*ncroot < nc) { ncroot = ncroot + 1 }
298 m = c()
299 for (i in c(1:nc)) {
300 rhist = hist(rawrs[,i],breaks=100,plot=F)
301 m = append(m,max(rhist\$counts))
302 }
303 ymax = max(m)
304 ncols = length(fullnames)
305 if (ncols > 20)
306 {
307 scale = 7*ncols/20
308 pdf(pdfname,width=scale,height=scale)
309 } else {
310 pdf(pdfname)
311 }
312 par(mfrow=c(ncroot,ncroot))
313 for (i in c(1:nc)) {
314 hist(rawrs[,i], main=paste("Contig logcount",i), xlab='log raw count', col="maroon",
315 breaks=100,sub=fullnames[i],cex=0.8,ylim=c(0,ymax))
316 }
317 dev.off()
318 par(defpar)
319
320 }
321
322 cumPlot = function(rawrs,cleanrs,maint,myTitle)
323 { # updated to use ecdf
324 pdfname = "Filtering_rowsum_bar_charts.pdf"
325 defpar = par(no.readonly=T)
326 lrs = log(rawrs,10)
327 lim = max(lrs)
328 pdf(pdfname)
329 par(mfrow=c(2,1))
330 hist(lrs,breaks=100,main=paste('Before:',maint),xlab="# Reads (log)",
331 ylab="Count",col="maroon",sub=myTitle, xlim=c(0,lim),las=1)
332 grid(col="lightgray", lty="dotted")
333 lrs = log(cleanrs,10)
334 hist(lrs,breaks=100,main=paste('After:',maint),xlab="# Reads (log)",
335 ylab="Count",col="maroon",sub=myTitle,xlim=c(0,lim),las=1)
336 grid(col="lightgray", lty="dotted")
337 dev.off()
338 par(defpar)
339 }
340
341 cumPlot1 = function(rawrs,cleanrs,maint,myTitle)
342 { # updated to use ecdf
343 pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"RowsumCum.pdf",sep='_')
344 pdf(pdfname)
345 par(mfrow=c(2,1))
346 lastx = max(rawrs)
347 rawe = knots(ecdf(rawrs))
348 cleane = knots(ecdf(cleanrs))
349 cy = 1:length(cleane)/length(cleane)
350 ry = 1:length(rawe)/length(rawe)
351 plot(rawe,ry,type='l',main=paste('Before',maint),xlab="Log Contig Total Reads",
352 ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle)
353 grid(col="blue")
354 plot(cleane,cy,type='l',main=paste('After',maint),xlab="Log Contig Total Reads",
355 ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle)
356 grid(col="blue")
357 dev.off()
358 }
359
360
361
362 doGSEA = function(y=NULL,design=NULL,histgmt="",
363 bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
364 ntest=0, myTitle="myTitle", outfname="GSEA.xls", minnin=5, maxnin=2000,fdrthresh=0.05,fdrtype="BH")
365 {
366 sink('Camera.log')
367 genesets = c()
368 if (bigmt > "")
369 {
370 bigenesets = readLines(bigmt)
371 genesets = bigenesets
372 }
373 if (histgmt > "")
374 {
375 hgenesets = readLines(histgmt)
376 if (bigmt > "") {
377 genesets = rbind(genesets,hgenesets)
378 } else {
379 genesets = hgenesets
380 } # use only history if no bi
381 }
382 print.noquote(paste("@@@read",length(genesets), 'genesets from',histgmt,bigmt))
383 genesets = strsplit(genesets,'\t') # tabular. genesetid\tURLorwhatever\tgene_1\t..\tgene_n
384 outf = outfname
385 head=paste(myTitle,'edgeR GSEA')
386 write(head,file=outfname,append=F)
387 ntest=length(genesets)
388 urownames = toupper(rownames(y))
389 upcam = c()
390 downcam = c()
391 for (i in 1:ntest) {
392 gs = unlist(genesets[i])
393 g = gs[1] # geneset_id
394 u = gs[2]
395 if (u > "") { u = paste("<a href=\'",u,"\'>",u,"</a>",sep="") }
396 glist = gs[3:length(gs)] # member gene symbols
397 glist = toupper(glist)
398 inglist = urownames %in% glist
399 nin = sum(inglist)
400 if ((nin > minnin) && (nin < maxnin)) {
401 ### print(paste('@@found',sum(inglist),'genes in glist'))
402 camres = camera(y=y,index=inglist,design=design)
403 if (! is.null(camres)) {
404 rownames(camres) = g # gene set name
405 camres = cbind(GeneSet=g,URL=u,camres)
406 if (camres\$Direction == "Up")
407 {
408 upcam = rbind(upcam,camres) } else {
409 downcam = rbind(downcam,camres)
410 }
411 }
412 }
413 }
414 uscam = upcam[order(upcam\$PValue),]
415 unadjp = uscam\$PValue
416 uscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
417 nup = max(10,sum((uscam\$adjPValue < fdrthresh)))
418 dscam = downcam[order(downcam\$PValue),]
419 unadjp = dscam\$PValue
420 dscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
421 ndown = max(10,sum((dscam\$adjPValue < fdrthresh)))
422 write.table(uscam,file=paste('camera_up',outfname,sep='_'),quote=F,sep='\t',row.names=F)
423 write.table(dscam,file=paste('camera_down',outfname,sep='_'),quote=F,sep='\t',row.names=F)
424 print.noquote(paste('@@@@@ Camera up top',nup,'gene sets:'))
425 write.table(head(uscam,nup),file="",quote=F,sep='\t',row.names=F)
426 print.noquote(paste('@@@@@ Camera down top',ndown,'gene sets:'))
427 write.table(head(dscam,ndown),file="",quote=F,sep='\t',row.names=F)
428 sink()
429 }
430
431
432
433
434 doGSEAatonce = function(y=NULL,design=NULL,histgmt="",
435 bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
436 ntest=0, myTitle="myTitle", outfname="GSEA.xls", minnin=5, maxnin=2000,fdrthresh=0.05,fdrtype="BH")
437 {
438 sink('Camera.log')
439 genesets = c()
440 if (bigmt > "")
441 {
442 bigenesets = readLines(bigmt)
443 genesets = bigenesets
444 }
445 if (histgmt > "")
446 {
447 hgenesets = readLines(histgmt)
448 if (bigmt > "") {
449 genesets = rbind(genesets,hgenesets)
450 } else {
451 genesets = hgenesets
452 } # use only history if no bi
453 }
454 print.noquote(paste("@@@read",length(genesets), 'genesets from',histgmt,bigmt))
455 genesets = strsplit(genesets,'\t') # tabular. genesetid\tURLorwhatever\tgene_1\t..\tgene_n
456 outf = outfname
457 head=paste(myTitle,'edgeR GSEA')
458 write(head,file=outfname,append=F)
459 ntest=length(genesets)
460 urownames = toupper(rownames(y))
461 upcam = c()
462 downcam = c()
463 incam = c()
464 urls = c()
465 gsids = c()
466 for (i in 1:ntest) {
467 gs = unlist(genesets[i])
468 gsid = gs[1] # geneset_id
469 url = gs[2]
470 if (url > "") { url = paste("<a href=\'",url,"\'>",url,"</a>",sep="") }
471 glist = gs[3:length(gs)] # member gene symbols
472 glist = toupper(glist)
473 inglist = urownames %in% glist
474 nin = sum(inglist)
475 if ((nin > minnin) && (nin < maxnin)) {
476 incam = c(incam,inglist)
477 gsids = c(gsids,gsid)
478 urls = c(urls,url)
479 }
480 }
481 incam = as.list(incam)
482 names(incam) = gsids
483 allcam = camera(y=y,index=incam,design=design)
484 allcamres = cbind(geneset=gsids,allcam,URL=urls)
485 for (i in 1:ntest) {
486 camres = allcamres[i]
487 res = try(test = (camres\$Direction == "Up"))
488 if ("try-error" %in% class(res)) {
489 cat("test failed, camres = :")
490 print.noquote(camres)
491 } else { if (camres\$Direction == "Up")
492 { upcam = rbind(upcam,camres)
493 } else { downcam = rbind(downcam,camres)
494 }
495
496 }
497 }
498 uscam = upcam[order(upcam\$PValue),]
499 unadjp = uscam\$PValue
500 uscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
501 nup = max(10,sum((uscam\$adjPValue < fdrthresh)))
502 dscam = downcam[order(downcam\$PValue),]
503 unadjp = dscam\$PValue
504 dscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
505 ndown = max(10,sum((dscam\$adjPValue < fdrthresh)))
506 write.table(uscam,file=paste('camera_up',outfname,sep='_'),quote=F,sep='\t',row.names=F)
507 write.table(dscam,file=paste('camera_down',outfname,sep='_'),quote=F,sep='\t',row.names=F)
508 print.noquote(paste('@@@@@ Camera up top',nup,'gene sets:'))
509 write.table(head(uscam,nup),file="",quote=F,sep='\t',row.names=F)
510 print.noquote(paste('@@@@@ Camera down top',ndown,'gene sets:'))
511 write.table(head(dscam,ndown),file="",quote=F,sep='\t',row.names=F)
512 sink()
513 }
514
515
516 edgeIt = function (Count_Matrix=c(),group=c(),out_edgeR=F,out_VOOM=F,out_DESeq2=F,fdrtype='fdr',priordf=5,
517 fdrthresh=0.05,outputdir='.', myTitle='Differential Counts',libSize=c(),useNDF=F,
518 filterquantile=0.2, subjects=c(),mydesign=NULL,
519 doDESeq2=T,doVoom=T,doCamera=T,doedgeR=T,org='hg19',
520 histgmt="", bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
521 doCook=F,DESeq_fitType="parameteric")
522 {
523 # Error handling
524 if (length(unique(group))!=2){
525 print("Number of conditions identified in experiment does not equal 2")
526 q()
527 }
528 require(edgeR)
529 options(width = 512)
530 mt = paste(unlist(strsplit(myTitle,'_')),collapse=" ")
531 allN = nrow(Count_Matrix)
532 nscut = round(ncol(Count_Matrix)/2)
533 colTotmillionreads = colSums(Count_Matrix)/1e6
534 counts.dataframe = as.data.frame(c())
535 rawrs = rowSums(Count_Matrix)
536 nonzerod = Count_Matrix[(rawrs > 0),] # remove all zero count genes
537 nzN = nrow(nonzerod)
538 nzrs = rowSums(nonzerod)
539 zN = allN - nzN
540 print('# Quantiles for non-zero row counts:',quote=F)
541 print(quantile(nzrs,probs=seq(0,1,0.1)),quote=F)
542 if (useNDF == T)
543 {
544 gt1rpin3 = rowSums(Count_Matrix/expandAsMatrix(colTotmillionreads,dim(Count_Matrix)) >= 1) >= nscut
545 lo = colSums(Count_Matrix[!gt1rpin3,])
546 workCM = Count_Matrix[gt1rpin3,]
547 cleanrs = rowSums(workCM)
548 cleanN = length(cleanrs)
549 meth = paste( "After removing",length(lo),"contigs with fewer than ",nscut," sample read counts >= 1 per million, there are",sep="")
550 print(paste("Read",allN,"contigs. Removed",zN,"contigs with no reads.",meth,cleanN,"contigs"),quote=F)
551 maint = paste('Filter >=1/million reads in >=',nscut,'samples')
552 } else {
553 useme = (nzrs > quantile(nzrs,filterquantile))
554 workCM = nonzerod[useme,]
555 lo = colSums(nonzerod[!useme,])
556 cleanrs = rowSums(workCM)
557 cleanN = length(cleanrs)
558 meth = paste("After filtering at count quantile =",filterquantile,", there are",sep="")
559 print(paste('Read',allN,"contigs. Removed",zN,"with no reads.",meth,cleanN,"contigs"),quote=F)
560 maint = paste('Filter below',filterquantile,'quantile')
561 }
562 cumPlot(rawrs=rawrs,cleanrs=cleanrs,maint=maint,myTitle=myTitle)
563 allgenes = rownames(workCM)
564 reg = "^chr([0-9]+):([0-9]+)-([0-9]+)"
565 genecards="<a href=\'http://www.genecards.org/index.php?path=/Search/keyword/"
566 ucsc = paste("<a href=\'http://genome.ucsc.edu/cgi-bin/hgTracks?db=",org,sep='')
567 testreg = str_match(allgenes,reg)
568 if (sum(!is.na(testreg[,1]))/length(testreg[,1]) > 0.8) # is ucsc style string
569 {
570 print("@@ using ucsc substitution for urls")
571 contigurls = paste0(ucsc,"&amp;position=chr",testreg[,2],":",testreg[,3],"-",testreg[,4],"\'>",allgenes,"</a>")
572 } else {
573 print.noquote("@@ using genecards substitution for urls")
574 contigurls = paste0(genecards,allgenes,"\'>",allgenes,"</a>")
575 }
576 print(paste("# Total low count contigs per sample = ",paste(lo,collapse=',')),quote=F)
577 cmrowsums = rowSums(workCM)
578 TName=unique(group)[1]
579 CName=unique(group)[2]
580 if (is.null(mydesign)) {
581 if (length(subjects) == 0)
582 {
583 mydesign = model.matrix(~group)
584 }
585 else {
586 subjf = factor(subjects)
587 mydesign = model.matrix(~subjf+group) # we block on subject so make group last to simplify finding it
588 }
589 }
590 print.noquote(paste('Using samples:',paste(colnames(workCM),collapse=',')))
591 print.noquote('Using design matrix:')
592 print.noquote(mydesign)
593 if (doedgeR) {
594 sink('edgeR.log')
595 #### Setup DGEList object
596 DGEList = DGEList(counts=workCM, group = group)
597 DGEList = calcNormFactors(DGEList)
598
599 DGEList = estimateGLMCommonDisp(DGEList,mydesign)
600 comdisp = DGEList\$common.dispersion
601 DGEList = estimateGLMTrendedDisp(DGEList,mydesign)
602 if (edgeR_priordf > 0) {
603 print.noquote(paste("prior.df =",edgeR_priordf))
604 DGEList = estimateGLMTagwiseDisp(DGEList,mydesign,prior.df = edgeR_priordf)
605 } else {
606 DGEList = estimateGLMTagwiseDisp(DGEList,mydesign)
607 }
608 DGLM = glmFit(DGEList,design=mydesign)
609 DE = glmLRT(DGLM,coef=ncol(DGLM\$design)) # always last one - subject is first if needed
610 efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors
611 normData = (1e+06*DGEList\$counts/efflib)
612 uoutput = cbind(
613 Name=as.character(rownames(DGEList\$counts)),
614 DE\$table,
615 adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
616 Dispersion=DGEList\$tagwise.dispersion,totreads=cmrowsums,normData,
617 DGEList\$counts
618 )
619 soutput = uoutput[order(DE\$table\$PValue),] # sorted into p value order - for quick toptable
620 goodness = gof(DGLM, pcutoff=fdrthresh)
621 if (sum(goodness\$outlier) > 0) {
622 print.noquote('GLM outliers:')
623 print(paste(rownames(DGLM)[(goodness\$outlier)],collapse=','),quote=F)
624 } else {
625 print('No GLM fit outlier genes found\n')
626 }
627 z = limma::zscoreGamma(goodness\$gof.statistic, shape=goodness\$df/2, scale=2)
628 pdf("edgeR_GoodnessofFit.pdf")
629 qq = qqnorm(z, panel.first=grid(), main="tagwise dispersion")
630 abline(0,1,lwd=3)
631 points(qq\$x[goodness\$outlier],qq\$y[goodness\$outlier], pch=16, col="maroon")
632 dev.off()
633 estpriorn = getPriorN(DGEList)
634 print(paste("Common Dispersion =",comdisp,"CV = ",sqrt(comdisp),"getPriorN = ",estpriorn),quote=F)
635 efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors
636 normData = (1e+06*DGEList\$counts/efflib)
637 uniqueg = unique(group)
638 #### Plot MDS
639 sample_colors = match(group,levels(group))
640 sampleTypes = levels(factor(group))
641 print.noquote(sampleTypes)
642 pdf("edgeR_MDSplot.pdf")
643 plotMDS.DGEList(DGEList,main=paste("edgeR MDS for",myTitle),cex=0.5,col=sample_colors,pch=sample_colors)
644 legend(x="topleft", legend = sampleTypes,col=c(1:length(sampleTypes)), pch=19)
645 grid(col="blue")
646 dev.off()
647 colnames(normData) = paste( colnames(normData),'N',sep="_")
648 print(paste('Raw sample read totals',paste(colSums(nonzerod,na.rm=T),collapse=',')))
649 nzd = data.frame(log(nonzerod + 1e-2,10))
650 try( boxPlot(rawrs=nzd,cleanrs=log(normData,10),maint='TMM Normalisation',myTitle=myTitle,pdfname="edgeR_raw_norm_counts_box.pdf") )
651 write.table(soutput,file=out_edgeR, quote=FALSE, sep="\t",row.names=F)
652 tt = cbind(
653 Name=as.character(rownames(DGEList\$counts)),
654 DE\$table,
655 adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
656 Dispersion=DGEList\$tagwise.dispersion,totreads=cmrowsums
657 )
658 print.noquote("# edgeR Top tags\n")
659 tt = cbind(tt,URL=contigurls) # add to end so table isn't laid out strangely
660 tt = tt[order(DE\$table\$PValue),]
661 print.noquote(tt[1:50,])
662 deTags = rownames(uoutput[uoutput\$adj.p.value < fdrthresh,])
663 nsig = length(deTags)
664 print(paste('#',nsig,'tags significant at adj p=',fdrthresh),quote=F)
665 deColours = ifelse(deTags,'red','black')
666 pdf("edgeR_BCV_vs_abundance.pdf")
667 plotBCV(DGEList, cex=0.3, main="Biological CV vs abundance")
668 dev.off()
669 dg = DGEList[order(DE\$table\$PValue),]
670 #normData = (1e+06 * dg\$counts/expandAsMatrix(dg\$samples\$lib.size, dim(dg)))
671 efflib = dg\$samples\$lib.size*dg\$samples\$norm.factors
672 normData = (1e+06*dg\$counts/efflib)
673 outpdfname="edgeR_top_100_heatmap.pdf"
674 hmap2(normData,nsamp=100,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste('edgeR Heatmap',myTitle))
675 outSmear = "edgeR_smearplot.pdf"
676 outMain = paste("Smear Plot for ",TName,' Vs ',CName,' (FDR@',fdrthresh,' N = ',nsig,')',sep='')
677 smearPlot(DGEList=DGEList,deTags=deTags, outSmear=outSmear, outMain = outMain)
678 qqPlot(descr=paste(myTitle,'edgeR adj p QQ plot'),pvector=tt\$adj.p.value,outpdf='edgeR_qqplot.pdf')
679 norm.factor = DGEList\$samples\$norm.factors
680 topresults.edgeR = soutput[which(soutput\$adj.p.value < fdrthresh), ]
681 edgeRcountsindex = which(allgenes %in% rownames(topresults.edgeR))
682 edgeRcounts = rep(0, length(allgenes))
683 edgeRcounts[edgeRcountsindex] = 1 # Create venn diagram of hits
684 sink()
685 } ### doedgeR
686 if (doDESeq2 == T)
687 {
688 sink("DESeq2.log")
689 # DESeq2
690 require('DESeq2')
691 library('RColorBrewer')
692 if (length(subjects) == 0)
693 {
694 pdata = data.frame(Name=colnames(workCM),Rx=group,row.names=colnames(workCM))
695 deSEQds = DESeqDataSetFromMatrix(countData = workCM, colData = pdata, design = formula(~ Rx))
696 } else {
697 pdata = data.frame(Name=colnames(workCM),Rx=group,subjects=subjects,row.names=colnames(workCM))
698 deSEQds = DESeqDataSetFromMatrix(countData = workCM, colData = pdata, design = formula(~ subjects + Rx))
699 }
700 #DESeq2 = DESeq(deSEQds,fitType='local',pAdjustMethod=fdrtype)
701 #rDESeq = results(DESeq2)
702 #newCountDataSet(workCM, group)
703 deSeqDatsizefac = estimateSizeFactors(deSEQds)
704 deSeqDatdisp = estimateDispersions(deSeqDatsizefac,fitType=DESeq_fitType)
705 resDESeq = nbinomWaldTest(deSeqDatdisp, pAdjustMethod=fdrtype)
706 rDESeq = as.data.frame(results(resDESeq))
707 rDESeq = cbind(Contig=rownames(workCM),rDESeq,NReads=cmrowsums)
708 srDESeq = rDESeq[order(rDESeq\$pvalue),]
709 write.table(srDESeq,file=out_DESeq2, quote=FALSE, sep="\t",row.names=F)
710 qqPlot(descr=paste(myTitle,'DESeq2 adj p qq plot'),pvector=rDESeq\$padj,outpdf='DESeq2_qqplot.pdf')
711 cat("# DESeq top 50\n")
712 rDESeq = cbind(Contig=rownames(workCM),rDESeq,NReads=cmrowsums,URL=contigurls)
713 srDESeq = rDESeq[order(rDESeq\$pvalue),]
714 print.noquote(srDESeq[1:50,])
715 topresults.DESeq = rDESeq[which(rDESeq\$padj < fdrthresh), ]
716 DESeqcountsindex = which(allgenes %in% rownames(topresults.DESeq))
717 DESeqcounts = rep(0, length(allgenes))
718 DESeqcounts[DESeqcountsindex] = 1
719 pdf("DESeq2_dispersion_estimates.pdf")
720 plotDispEsts(resDESeq)
721 dev.off()
722 ysmall = abs(min(rDESeq\$log2FoldChange))
723 ybig = abs(max(rDESeq\$log2FoldChange))
724 ylimit = min(4,ysmall,ybig)
725 pdf("DESeq2_MA_plot.pdf")
726 plotMA(resDESeq,main=paste(myTitle,"DESeq2 MA plot"),ylim=c(-ylimit,ylimit))
727 dev.off()
728 rlogres = rlogTransformation(resDESeq)
729 sampledists = dist( t( assay(rlogres) ) )
730 sdmat = as.matrix(sampledists)
731 pdf("DESeq2_sample_distance_plot.pdf")
732 heatmap.2(sdmat,trace="none",main=paste(myTitle,"DESeq2 sample distances"),
733 col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(255))
734 dev.off()
735 ###outpdfname="DESeq2_top50_heatmap.pdf"
736 ###hmap2(sresDESeq,nsamp=50,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste('DESeq2 vst rlog Heatmap',myTitle))
737 sink()
738 result = try( (ppca = plotPCA( varianceStabilizingTransformation(deSeqDatdisp,blind=T), intgroup=c("Rx","Name")) ) )
739 if ("try-error" %in% class(result)) {
740 print.noquote('DESeq2 plotPCA failed.')
741 } else {
742 pdf("DESeq2_PCA_plot.pdf")
743 #### wtf - print? Seems needed to get this to work
744 print(ppca)
745 dev.off()
746 }
747 }
748
749 if (doVoom == T) {
750 sink('Voom.log')
751 if (doedgeR == F) {
752 #### Setup DGEList object
753 DGEList = DGEList(counts=workCM, group = group)
754 DGEList = calcNormFactors(DGEList)
755 DGEList = estimateGLMCommonDisp(DGEList,mydesign)
756 DGEList = estimateGLMTrendedDisp(DGEList,mydesign)
757 DGEList = estimateGLMTagwiseDisp(DGEList,mydesign)
758 DGEList = estimateGLMTagwiseDisp(DGEList,mydesign)
759 norm.factor = DGEList\$samples\$norm.factors
760 }
761 pdf("Voom_mean_variance_plot.pdf")
762 dat.voomed = voom(DGEList, mydesign, plot = TRUE, lib.size = colSums(workCM) * norm.factor)
763 dev.off()
764 # Use limma to fit data
765 fit = lmFit(dat.voomed, mydesign)
766 fit = eBayes(fit)
767 rvoom = topTable(fit, coef = length(colnames(mydesign)), adj = fdrtype, n = Inf, sort="none")
768 qqPlot(descr=paste(myTitle,'Voom-limma adj p QQ plot'),pvector=rvoom\$adj.P.Val,outpdf='Voom_qqplot.pdf')
769 rownames(rvoom) = rownames(workCM)
770 rvoom = cbind(rvoom,NReads=cmrowsums)
771 srvoom = rvoom[order(rvoom\$P.Value),]
772 write.table(srvoom,file=out_VOOM, quote=FALSE, sep="\t",row.names=F)
773 rvoom = cbind(rvoom,NReads=cmrowsums,URL=contigurls)
774 srvoom = rvoom[order(rvoom\$P.Value),]
775 cat("# Voom top 50\n")
776 print(srvoom[1:50,])
777 # Use an FDR cutoff to find interesting samples for edgeR, DESeq and voom/limma
778 topresults.voom = rvoom[which(rvoom\$adj.P.Val < fdrthresh), ]
779 voomcountsindex = which(allgenes %in% topresults.voom\$ID)
780 voomcounts = rep(0, length(allgenes))
781 voomcounts[voomcountsindex] = 1
782 sink()
783 }
784
785 if (doCamera) {
786 doGSEA(y=DGEList,design=mydesign,histgmt=histgmt,bigmt=bigmt,ntest=20,myTitle=myTitle,
787 outfname=paste(mt,"GSEA.xls",sep="_"),fdrthresh=fdrthresh,fdrtype=fdrtype)
788 }
789
790 if ((doDESeq2==T) || (doVoom==T) || (doedgeR==T)) {
791 if ((doVoom==T) && (doDESeq2==T) && (doedgeR==T)) {
792 vennmain = paste(mt,'Voom,edgeR and DESeq2 overlap at FDR=',fdrthresh)
793 counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts,
794 VOOM_limma = voomcounts, row.names = allgenes)
795 } else if ((doDESeq2==T) && (doedgeR==T)) {
796 vennmain = paste(mt,'DESeq2 and edgeR overlap at FDR=',fdrthresh)
797 counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts, row.names = allgenes)
798 } else if ((doVoom==T) && (doedgeR==T)) {
799 vennmain = paste(mt,'Voom and edgeR overlap at FDR=',fdrthresh)
800 counts.dataframe = data.frame(edgeR = edgeRcounts, VOOM_limma = voomcounts, row.names = allgenes)
801 }
802
803 if (nrow(counts.dataframe > 1)) {
804 counts.venn = vennCounts(counts.dataframe)
805 vennf = "Venn_significant_genes_overlap.pdf"
806 pdf(vennf)
807 vennDiagram(counts.venn,main=vennmain,col="maroon")
808 dev.off()
809 }
810 } #### doDESeq2 or doVoom
811
812 }
813 #### Done
814
815 ###sink(stdout(),append=T,type="message")
816 builtin_gmt = ""
817 history_gmt = ""
818 history_gmt_name = ""
819 out_edgeR = F
820 out_DESeq2 = F
821 out_VOOM = "$out_VOOM"
822 doDESeq2 = $DESeq2.doDESeq2 # make these T or F
823 doVoom = $doVoom
824 doCamera = F
825 doedgeR = $edgeR.doedgeR
826 edgeR_priordf = 0
827
828
829 #if $doVoom == "T":
830 out_VOOM = "$out_VOOM"
831 #end if
832
833 #if $DESeq2.doDESeq2 == "T":
834 out_DESeq2 = "$out_DESeq2"
835 DESeq_fitType = "$DESeq2.DESeq_fitType"
836 #end if
837
838 #if $edgeR.doedgeR == "T":
839 out_edgeR = "$out_edgeR"
840 edgeR_priordf = $edgeR.edgeR_priordf
841 #end if
842
843
844 if (sum(c(doedgeR,doVoom,doDESeq2)) == 0)
845 {
846 write("No methods chosen - nothing to do! Please try again after choosing one or more methods", stderr())
847 quit(save="no",status=2)
848 }
849
850 Out_Dir = "$html_file.files_path"
851 Input = "$input1"
852 TreatmentName = "$treatment_name"
853 TreatmentCols = "$Treat_cols"
854 ControlName = "$control_name"
855 ControlCols= "$Control_cols"
856 org = "$input1.dbkey"
857 if (org == "") { org = "hg19"}
858 fdrtype = "$fdrtype"
859 fdrthresh = $fdrthresh
860 useNDF = $useNDF
861 fQ = $fQ # non-differential centile cutoff
862 myTitle = "$title"
863 sids = strsplit("$subjectids",',')
864 subjects = unlist(sids)
865 nsubj = length(subjects)
866 TCols = as.numeric(strsplit(TreatmentCols,",")[[1]])-1
867 CCols = as.numeric(strsplit(ControlCols,",")[[1]])-1
868 cat('Got TCols=')
869 cat(TCols)
870 cat('; CCols=')
871 cat(CCols)
872 cat('\n')
873 useCols = c(TCols,CCols)
874 if (file.exists(Out_Dir) == F) dir.create(Out_Dir)
875 Count_Matrix = read.table(Input,header=T,row.names=1,sep='\t') #Load tab file assume header
876 snames = colnames(Count_Matrix)
877 nsamples = length(snames)
878 if (nsubj > 0 & nsubj != nsamples) {
879 options("show.error.messages"=T)
880 mess = paste('Fatal error: Supplied subject id list',paste(subjects,collapse=','),
881 'has length',nsubj,'but there are',nsamples,'samples',paste(snames,collapse=','))
882 write(mess, stderr())
883 quit(save="no",status=4)
884 }
885 if (length(subjects) != 0) {subjects = subjects[useCols]}
886 Count_Matrix = Count_Matrix[,useCols] ### reorder columns
887 rn = rownames(Count_Matrix)
888 islib = rn %in% c('librarySize','NotInBedRegions')
889 LibSizes = Count_Matrix[subset(rn,islib),][1] # take first
890 Count_Matrix = Count_Matrix[subset(rn,! islib),]
891 group = c(rep(TreatmentName,length(TCols)), rep(ControlName,length(CCols)) ) #Build a group descriptor
892 group = factor(group, levels=c(ControlName,TreatmentName))
893 colnames(Count_Matrix) = paste(group,colnames(Count_Matrix),sep="_") #Relable columns
894 results = edgeIt(Count_Matrix=Count_Matrix,group=group, out_edgeR=out_edgeR, out_VOOM=out_VOOM, out_DESeq2=out_DESeq2,
895 fdrtype='BH',mydesign=NULL,priordf=edgeR_priordf,fdrthresh=fdrthresh,outputdir='.',
896 myTitle=myTitle,useNDF=F,libSize=c(),filterquantile=fQ,subjects=subjects,
897 doDESeq2=doDESeq2,doVoom=doVoom,doCamera=doCamera,doedgeR=doedgeR,org=org,
898 histgmt=history_gmt,bigmt=builtin_gmt,DESeq_fitType=DESeq_fitType)
899 sessionInfo()
900 ]]>
901 </configfile>
902 </configfiles>
903 <help>
904
905 **What it does**
906
907 Allows short read sequence counts from controlled experiments to be analysed for differentially expressed genes.
908 Optionally adds a term for subject if not all samples are independent or if some other factor needs to be blocked in the design.
909
910 **Input**
911
912 Requires a count matrix as a tabular file. These are best made using the companion HTSeq_ based counter Galaxy wrapper
913 and your fave gene model to generate inputs. Each row is a genomic feature (gene or exon eg) and each column the
914 non-negative integer count of reads from one sample overlapping the feature.
915 The matrix must have a header row uniquely identifying the source samples, and unique row names in
916 the first column. Typically the row names are gene symbols or probe ids for downstream use in GSEA and other methods.
917
918 **Specifying comparisons**
919
920 This is basically dumbed down for two factors - case vs control.
921
922 More complex interfaces are possible but painful at present.
923 Probably need to specify a phenotype file to do this better.
924 Work in progress. Send code.
925
926 If you have (eg) paired samples and wish to include a term in the GLM to account for some other factor (subject in the case of paired samples),
927 put a comma separated list of indicators for every sample (whether modelled or not!) indicating (eg) the subject number or
928 A list of integers, one for each subject or an empty string if samples are all independent.
929 If not empty, there must be exactly as many integers in the supplied integer list as there are columns (samples) in the count matrix.
930 Integers for samples that are not in the analysis *must* be present in the string as filler even if not used.
931
932 So if you have 2 pairs out of 6 samples, you need to put in unique integers for the unpaired ones
933 eg if you had 6 samples with the first two independent but the second and third pairs each being from independent subjects. you might use
934 8,9,1,1,2,2
935 as subject IDs to indicate two paired samples from the same subject in columns 3/4 and 5/6
936
937 **Methods available**
938
939 You can run 3 popular Bioconductor packages available for count data.
940
941 edgeR - see edgeR_ for details
942
943 VOOM/limma - see limma_VOOM_ for details
944
945 DESeq2 - see DESeq2_ for details
946
947 and optionally camera in edgeR which works better if MSigDB is installed.
948
949 **Outputs**
950
951 Some helpful plots and analysis results. Note that most of these are produced using R code
952 suggested by the excellent documentation and vignettes for the Bioconductor
953 packages invoked. The Tool Factory is used to automatically lay these out for you to enjoy.
954
955 **Note on Voom**
956
957 The voom from limma version 3.16.6 help in R includes this from the authors - but you should read the paper to interpret this method.
958
959 This function is intended to process RNA-Seq or ChIP-Seq data prior to linear modelling in limma.
960
961 voom is an acronym for mean-variance modelling at the observational level.
962 The key concern is to estimate the mean-variance relationship in the data, then use this to compute appropriate weights for each observation.
963 Count data almost show non-trivial mean-variance relationships. Raw counts show increasing variance with increasing count size, while log-counts typically show a decreasing mean-variance trend.
964 This function estimates the mean-variance trend for log-counts, then assigns a weight to each observation based on its predicted variance.
965 The weights are then used in the linear modelling process to adjust for heteroscedasticity.
966
967 In an experiment, a count value is observed for each tag in each sample. A tag-wise mean-variance trend is computed using lowess.
968 The tag-wise mean is the mean log2 count with an offset of 0.5, across samples for a given tag.
969 The tag-wise variance is the quarter-root-variance of normalized log2 counts per million values with an offset of 0.5, across samples for a given tag.
970 Tags with zero counts across all samples are not included in the lowess fit. Optional normalization is performed using normalizeBetweenArrays.
971 Using fitted values of log2 counts from a linear model fit by lmFit, variances from the mean-variance trend were interpolated for each observation.
972 This was carried out by approxfun. Inverse variance weights can be used to correct for mean-variance trend in the count data.
973
974
975 Author(s)
976
977 Charity Law and Gordon Smyth
978
979 References
980
981 Law, CW (2013). Precision weights for gene expression analysis. PhD Thesis. University of Melbourne, Australia.
982
983 Law, CW, Chen, Y, Shi, W, Smyth, GK (2013). Voom! Precision weights unlock linear model analysis tools for RNA-seq read counts.
984 Technical Report 1 May 2013, Bioinformatics Division, Walter and Eliza Hall Institute of Medical Reseach, Melbourne, Australia.
985 http://www.statsci.org/smyth/pubs/VoomPreprint.pdf
986
987 See Also
988
989 A voom case study is given in the edgeR User's Guide.
990
991 vooma is a similar function but for microarrays instead of RNA-seq.
992
993
994 ***old rant on changes to Bioconductor package variable names between versions***
995
996 The edgeR authors made a small cosmetic change in the name of one important variable (from p.value to PValue)
997 breaking this and all other code that assumed the old name for this variable,
998 between edgeR2.4.4 and 2.4.6 (the version for R 2.14 as at the time of writing).
999 This means that all code using edgeR is sensitive to the version. I think this was a very unwise thing
1000 to do because it wasted hours of my time to track down and will similarly cost other edgeR users dearly
1001 when their old scripts break. This tool currently now works with 2.4.6.
1002
1003 **Note on prior.N**
1004
1005 http://seqanswers.com/forums/showthread.php?t=5591 says:
1006
1007 *prior.n*
1008
1009 The value for prior.n determines the amount of smoothing of tagwise dispersions towards the common dispersion.
1010 You can think of it as like a "weight" for the common value. (It is actually the weight for the common likelihood
1011 in the weighted likelihood equation). The larger the value for prior.n, the more smoothing, i.e. the closer your
1012 tagwise dispersion estimates will be to the common dispersion. If you use a prior.n of 1, then that gives the
1013 common likelihood the weight of one observation.
1014
1015 In answer to your question, it is a good thing to squeeze the tagwise dispersions towards a common value,
1016 or else you will be using very unreliable estimates of the dispersion. I would not recommend using the value that
1017 you obtained from estimateSmoothing()---this is far too small and would result in virtually no moderation
1018 (squeezing) of the tagwise dispersions. How many samples do you have in your experiment?
1019 What is the experimental design? If you have few samples (less than 6) then I would suggest a prior.n of at least 10.
1020 If you have more samples, then the tagwise dispersion estimates will be more reliable,
1021 so you could consider using a smaller prior.n, although I would hesitate to use a prior.n less than 5.
1022
1023
1024 From Bioconductor Digest, Vol 118, Issue 5, Gordon writes:
1025
1026 Dear Dorota,
1027
1028 The important settings are prior.df and trend.
1029
1030 prior.n and prior.df are related through prior.df = prior.n * residual.df,
1031 and your experiment has residual.df = 36 - 12 = 24. So the old setting of
1032 prior.n=10 is equivalent for your data to prior.df = 240, a very large
1033 value. Going the other way, the new setting of prior.df=10 is equivalent
1034 to prior.n=10/24.
1035
1036 To recover old results with the current software you would use
1037
1038 estimateTagwiseDisp(object, prior.df=240, trend="none")
1039
1040 To get the new default from old software you would use
1041
1042 estimateTagwiseDisp(object, prior.n=10/24, trend=TRUE)
1043
1044 Actually the old trend method is equivalent to trend="loess" in the new
1045 software. You should use plotBCV(object) to see whether a trend is
1046 required.
1047
1048 Note you could also use
1049
1050 prior.n = getPriorN(object, prior.df=10)
1051
1052 to map between prior.df and prior.n.
1053
1054 ----
1055
1056 **Attributions**
1057
1058 edgeR - edgeR_
1059
1060 VOOM/limma - limma_VOOM_
1061
1062 DESeq2 - DESeq2_ for details
1063
1064 See above for Bioconductor package documentation for packages exposed in Galaxy by this tool and app store package.
1065
1066 Galaxy_ (that's what you are using right now!) for gluing everything together
1067
1068 Otherwise, all code and documentation comprising this tool was written by Ross Lazarus and is
1069 licensed to you under the LGPL_ like other rgenetics artefacts
1070
1071 .. _LGPL: http://www.gnu.org/copyleft/lesser.html
1072 .. _HTSeq: http://www-huber.embl.de/users/anders/HTSeq/doc/index.html
1073 .. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html
1074 .. _DESeq2: http://www.bioconductor.org/packages/release/bioc/html/DESeq2.html
1075 .. _limma_VOOM: http://www.bioconductor.org/packages/release/bioc/html/limma.html
1076 .. _Galaxy: http://getgalaxy.org
1077 </help>
1078
1079 </tool>
1080
1081