comparison rgedgeRpaired_nocamera.xml @ 38:2f7573aacac3 draft

Uploaded
author fubar
date Sun, 22 Dec 2013 02:57:05 -0500
parents 8051d14cf819
children
comparison
equal deleted inserted replaced
37:9cec7795c52f 38:2f7573aacac3
1 <tool id="rgDifferentialCount" name="Differential_Count" version="0.22">
2 <description>models using BioConductor packages</description>
3 <requirements>
4 <requirement type="package" version="3.0.1">r3</requirement>
5 <requirement type="package" version="1.3.18">graphicsmagick</requirement>
6 <requirement type="package" version="9.07">ghostscript</requirement>
7 <requirement type="package" version="2.12">biocbasics</requirement>
8 </requirements>
9
10 <command interpreter="python">
11 rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "DifferentialCounts"
12 --output_dir "$html_file.files_path" --output_html "$html_file" --make_HTML "yes"
13 </command>
14 <inputs>
15 <param name="input1" type="data" format="tabular" label="Select an input matrix - rows are contigs, columns are counts for each sample"
16 help="Use the HTSeq based count matrix preparation tool to create these matrices from BAM/SAM files and a GTF file of genomic features"/>
17 <param name="title" type="text" value="Differential Counts" size="80" label="Title for job outputs"
18 help="Supply a meaningful name here to remind you what the outputs contain">
19 <sanitizer invalid_char="">
20 <valid initial="string.letters,string.digits"><add value="_" /> </valid>
21 </sanitizer>
22 </param>
23 <param name="treatment_name" type="text" value="Treatment" size="50" label="Treatment Name"/>
24 <param name="Treat_cols" label="Select columns containing treatment." type="data_column" data_ref="input1" numerical="True"
25 multiple="true" use_header_names="true" size="120" display="checkboxes">
26 <validator type="no_options" message="Please select at least one column."/>
27 <sanitizer invalid_char="">
28 <valid initial="string.letters,string.digits"><add value="_" /> </valid>
29 </sanitizer>
30 </param>
31 <param name="control_name" type="text" value="Control" size="50" label="Control Name"/>
32 <param name="Control_cols" label="Select columns containing control." type="data_column" data_ref="input1" numerical="True"
33 multiple="true" use_header_names="true" size="120" display="checkboxes" optional="true">
34 <validator type="no_options" message="Please select at least one column."/>
35 <sanitizer invalid_char="">
36 <valid initial="string.letters,string.digits"><add value="_" /> </valid>
37 </sanitizer> <validator type="no_options" message="Please select at least one column."/>
38 <sanitizer invalid_char="">
39 <valid initial="string.letters,string.digits"><add value="_" /> </valid>
40 </sanitizer>
41
42 </param>
43 <param name="subjectids" type="text" optional="true" size="120" value = ""
44 label="IF SUBJECTS NOT ALL INDEPENDENT! Enter comma separated strings to indicate sample labels for (eg) pairing - must be one for every column in input"
45 help="Leave blank if no pairing, but eg if data from sample id A99 is in columns 2,4 and id C21 is in 3,5 then enter 'A99,C21,A99,C21'">
46 <sanitizer>
47 <valid initial="string.letters,string.digits"><add value="," /> </valid>
48 </sanitizer>
49 </param>
50 <param name="fQ" type="float" value="0.3" size="5" label="Non-differential contig count quantile threshold - zero to analyze all non-zero read count contigs"
51 help="May be a good or a bad idea depending on the biology and the question. EG 0.3 = sparsest 30% of contigs with at least one read are removed before analysis"/>
52 <param name="useNDF" type="boolean" truevalue="T" falsevalue="F" checked="false" size="1"
53 label="Non differential filter - remove contigs below a threshold (1 per million) for half or more samples"
54 help="May be a good or a bad idea depending on the biology and the question. This was the old default. Quantile based is available as an alternative"/>
55
56 <conditional name="edgeR">
57 <param name="doedgeR" type="select"
58 label="Run this model using edgeR"
59 help="edgeR uses a negative binomial model and seems to be powerful, even with few replicates">
60 <option value="F">Do not run edgeR</option>
61 <option value="T" selected="true">Run edgeR</option>
62 </param>
63 <when value="T">
64 <param name="edgeR_priordf" type="integer" value="20" size="3"
65 label="prior.df for tagwise dispersion - lower value = more emphasis on each tag's variance. Replaces prior.n and prior.df = prior.n * residual.df"
66 help="0 = Use edgeR default. Use a small value to 'smooth' small samples. See edgeR docs and note below"/>
67 </when>
68 <when value="F"></when>
69 </conditional>
70 <conditional name="DESeq2">
71 <param name="doDESeq2" type="select"
72 label="Run the same model with DESeq2 and compare findings"
73 help="DESeq2 is an update to the DESeq package. It uses different assumptions and methods to edgeR">
74 <option value="F" selected="true">Do not run DESeq2</option>
75 <option value="T">Run DESeq2</option>
76 </param>
77 <when value="T">
78 <param name="DESeq_fitType" type="select">
79 <option value="parametric" selected="true">Parametric (default) fit for dispersions</option>
80 <option value="local">Local fit - this will automagically be used if parametric fit fails</option>
81 <option value="mean">Mean dispersion fit- use this if you really understand what you're doing - read the fine manual linked below in the documentation</option>
82 </param>
83 </when>
84 <when value="F"> </when>
85 </conditional>
86 <param name="doVoom" type="select"
87 label="Run the same model with Voom/limma and compare findings"
88 help="Voom uses counts per million and a precise transformation of variance so count data can be analysed using limma">
89 <option value="F" selected="true">Do not run VOOM</option>
90 <option value="T">Run VOOM</option>
91 </param>
92 <!--
93 <conditional name="camera">
94 <param name="doCamera" type="select" label="Run the edgeR implementation of Camera GSEA for up/down gene sets"
95 help="If yes, you can choose a set of genesets to test and/or supply a gmt format geneset collection from your history">
96 <option value="F" selected="true">Do not run GSEA tests with the Camera algorithm</option>
97 <option value="T">Run GSEA tests with the Camera algorithm</option>
98 </param>
99 <when value="T">
100 <conditional name="gmtSource">
101 <param name="refgmtSource" type="select"
102 label="Use a gene set (.gmt) from your history and/or use a built-in (MSigDB etc) gene set">
103 <option value="indexed" selected="true">Use a built-in gene set</option>
104 <option value="history">Use a gene set from my history</option>
105 <option value="both">Add a gene set from my history to a built in gene set</option>
106 </param>
107 <when value="indexed">
108 <param name="builtinGMT" type="select" label="Select a gene set matrix (.gmt) file to use for the analysis">
109 <options from_data_table="gseaGMT_3.1">
110 <filter type="sort_by" column="2" />
111 <validator type="no_options" message="No GMT v3.1 files are available - please install them"/>
112 </options>
113 </param>
114 </when>
115 <when value="history">
116 <param name="ownGMT" type="data" format="gmt" label="Select a Gene Set from your history" />
117 </when>
118 <when value="both">
119 <param name="ownGMT" type="data" format="gseagmt" label="Select a Gene Set from your history" />
120 <param name="builtinGMT" type="select" label="Select a gene set matrix (.gmt) file to use for the analysis">
121 <options from_data_table="gseaGMT_4">
122 <filter type="sort_by" column="2" />
123 <validator type="no_options" message="No GMT v4 files are available - please fix tool_data_table and loc files"/>
124 </options>
125 </param>
126 </when>
127 </conditional>
128 </when>
129 <when value="F">
130 </when>
131 </conditional>
132 -->
133 <param name="fdrthresh" type="float" value="0.05" size="5" label="P value threshold for FDR filtering for amily wise error rate control"
134 help="Conventional default value of 0.05 recommended"/>
135 <param name="fdrtype" type="select" label="FDR (Type II error) control method"
136 help="Use fdr or bh typically to control for the number of tests in a reliable way">
137 <option value="fdr" selected="true">fdr</option>
138 <option value="BH">Benjamini Hochberg</option>
139 <option value="BY">Benjamini Yukateli</option>
140 <option value="bonferroni">Bonferroni</option>
141 <option value="hochberg">Hochberg</option>
142 <option value="holm">Holm</option>
143 <option value="hommel">Hommel</option>
144 <option value="none">no control for multiple tests</option>
145 </param>
146 </inputs>
147 <outputs>
148 <data format="tabular" name="out_edgeR" label="${title}_topTable_edgeR.xls">
149 <filter>edgeR['doedgeR'] == "T"</filter>
150 </data>
151 <data format="tabular" name="out_DESeq2" label="${title}_topTable_DESeq2.xls">
152 <filter>DESeq2['doDESeq2'] == "T"</filter>
153 </data>
154 <data format="tabular" name="out_VOOM" label="${title}_topTable_VOOM.xls">
155 <filter>doVoom == "T"</filter>
156 </data>
157 <data format="html" name="html_file" label="${title}.html"/>
158 </outputs>
159 <stdio>
160 <exit_code range="4" level="fatal" description="Number of subject ids must match total number of samples in the input matrix" />
161 </stdio>
162 <tests>
163 <test>
164 <param name='input1' value='test_bams2mx.xls' ftype='tabular' />
165 <param name='treatment_name' value='liver' />
166 <param name='title' value='edgeRtest' />
167 <param name='useNDF' value='' />
168 <param name='doedgeR' value='T' />
169 <param name='doVoom' value='T' />
170 <param name='doDESeq2' value='T' />
171 <param name='fdrtype' value='fdr' />
172 <param name='edgeR_priordf' value="8" />
173 <param name='fdrthresh' value="0.05" />
174 <param name='control_name' value='heart' />
175 <param name='subjectids' value='' />
176 <param name='Control_cols' value='3,4,5,9' />
177 <param name='Treat_cols' value='2,6,7,8' />
178 <output name='out_edgeR' file='edgeRtest1out.xls' compare='diff' />
179 <output name='html_file' file='edgeRtest1out.html' compare='diff' lines_diff='20' />
180 </test>
181 </tests>
182
183 <configfiles>
184 <configfile name="runme">
185 <![CDATA[
186 #
187 # edgeR.Rscript
188 # updated npv 2011 for R 2.14.0 and edgeR 2.4.0 by ross
189 # Performs DGE on a count table containing n replicates of two conditions
190 #
191 # Parameters
192 #
193 # 1 - Output Dir
194
195 # Original edgeR code by: S.Lunke and A.Kaspi
196 reallybig = log10(.Machine\$double.xmax)
197 reallysmall = log10(.Machine\$double.xmin)
198 library('stringr')
199 library('gplots')
200 library('edgeR')
201 hmap2 = function(cmat,nsamp=100,outpdfname='heatmap2.pdf', TName='Treatment',group=NA,myTitle='title goes here')
202 {
203 # Perform clustering for significant pvalues after controlling FWER
204 samples = colnames(cmat)
205 gu = unique(group)
206 gn = rownames(cmat)
207 if (length(gu) == 2) {
208 col.map = function(g) {if (g==gu[1]) "#FF0000" else "#0000FF"}
209 pcols = unlist(lapply(group,col.map))
210 } else {
211 colours = rainbow(length(gu),start=0,end=4/6)
212 pcols = colours[match(group,gu)] }
213 dm = cmat[(! is.na(gn)),]
214 # remove unlabelled hm rows
215 nprobes = nrow(dm)
216 # sub = paste('Showing',nprobes,'contigs ranked for evidence of differential abundance')
217 if (nprobes > nsamp) {
218 dm =dm[1:nsamp,]
219 #sub = paste('Showing',nsamp,'contigs ranked for evidence for differential abundance out of',nprobes,'total')
220 }
221 newcolnames = substr(colnames(dm),1,20)
222 colnames(dm) = newcolnames
223 pdf(outpdfname)
224 heatmap.2(dm,main=myTitle,ColSideColors=pcols,col=topo.colors(100),dendrogram="col",key=T,density.info='none',
225 Rowv=F,scale='row',trace='none',margins=c(8,8),cexRow=0.4,cexCol=0.5)
226 dev.off()
227 }
228
229 hmap = function(cmat,nmeans=4,outpdfname="heatMap.pdf",nsamp=250,TName='Treatment',group=NA,myTitle="Title goes here")
230 {
231 # for 2 groups only was
232 #col.map = function(g) {if (g==TName) "#FF0000" else "#0000FF"}
233 #pcols = unlist(lapply(group,col.map))
234 gu = unique(group)
235 colours = rainbow(length(gu),start=0.3,end=0.6)
236 pcols = colours[match(group,gu)]
237 nrows = nrow(cmat)
238 mtitle = paste(myTitle,'Heatmap: n contigs =',nrows)
239 if (nrows > nsamp) {
240 cmat = cmat[c(1:nsamp),]
241 mtitle = paste('Heatmap: Top ',nsamp,' DE contigs (of ',nrows,')',sep='')
242 }
243 newcolnames = substr(colnames(cmat),1,20)
244 colnames(cmat) = newcolnames
245 pdf(outpdfname)
246 heatmap(cmat,scale='row',main=mtitle,cexRow=0.3,cexCol=0.4,Rowv=NA,ColSideColors=pcols)
247 dev.off()
248 }
249
250 qqPlot = function(descr='qqplot',pvector, outpdf='qqplot.pdf',...)
251 # stolen from https://gist.github.com/703512
252 {
253 o = -log10(sort(pvector,decreasing=F))
254 e = -log10( 1:length(o)/length(o) )
255 o[o==-Inf] = reallysmall
256 o[o==Inf] = reallybig
257 maint = descr
258 pdf(outpdf)
259 plot(e,o,pch=19,cex=1, main=maint, ...,
260 xlab=expression(Expected~~-log[10](italic(p))),
261 ylab=expression(Observed~~-log[10](italic(p))),
262 xlim=c(0,max(e)), ylim=c(0,max(o)))
263 lines(e,e,col="red")
264 grid(col = "lightgray", lty = "dotted")
265 dev.off()
266 }
267
268 smearPlot = function(DGEList,deTags, outSmear, outMain)
269 {
270 pdf(outSmear)
271 plotSmear(DGEList,de.tags=deTags,main=outMain)
272 grid(col="lightgray", lty="dotted")
273 dev.off()
274 }
275
276 boxPlot = function(rawrs,cleanrs,maint,myTitle,pdfname)
277 { #
278 nc = ncol(rawrs)
279 for (i in c(1:nc)) {rawrs[(rawrs[,i] < 0),i] = NA}
280 fullnames = colnames(rawrs)
281 newcolnames = substr(colnames(rawrs),1,20)
282 colnames(rawrs) = newcolnames
283 newcolnames = substr(colnames(cleanrs),1,20)
284 colnames(cleanrs) = newcolnames
285 defpar = par(no.readonly=T)
286 print.noquote('raw contig counts by sample:')
287 print.noquote(summary(rawrs))
288 print.noquote('normalised contig counts by sample:')
289 print.noquote(summary(cleanrs))
290 pdf(pdfname)
291 par(mfrow=c(1,2))
292 boxplot(rawrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('Raw:',maint))
293 grid(col="lightgray",lty="dotted")
294 boxplot(cleanrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('After ',maint))
295 grid(col="lightgray",lty="dotted")
296 dev.off()
297 pdfname = "sample_counts_histogram.pdf"
298 nc = ncol(rawrs)
299 print.noquote(paste('Using ncol rawrs=',nc))
300 ncroot = round(sqrt(nc))
301 if (ncroot*ncroot < nc) { ncroot = ncroot + 1 }
302 m = c()
303 for (i in c(1:nc)) {
304 rhist = hist(rawrs[,i],breaks=100,plot=F)
305 m = append(m,max(rhist\$counts))
306 }
307 ymax = max(m)
308 ncols = length(fullnames)
309 if (ncols > 20)
310 {
311 scale = 7*ncols/20
312 pdf(pdfname,width=scale,height=scale)
313 } else {
314 pdf(pdfname)
315 }
316 par(mfrow=c(ncroot,ncroot))
317 for (i in c(1:nc)) {
318 hist(rawrs[,i], main=paste("Contig logcount",i), xlab='log raw count', col="maroon",
319 breaks=100,sub=fullnames[i],cex=0.8,ylim=c(0,ymax))
320 }
321 dev.off()
322 par(defpar)
323
324 }
325
326 cumPlot = function(rawrs,cleanrs,maint,myTitle)
327 { # updated to use ecdf
328 pdfname = "Filtering_rowsum_bar_charts.pdf"
329 defpar = par(no.readonly=T)
330 lrs = log(rawrs,10)
331 lim = max(lrs)
332 pdf(pdfname)
333 par(mfrow=c(2,1))
334 hist(lrs,breaks=100,main=paste('Before:',maint),xlab="# Reads (log)",
335 ylab="Count",col="maroon",sub=myTitle, xlim=c(0,lim),las=1)
336 grid(col="lightgray", lty="dotted")
337 lrs = log(cleanrs,10)
338 hist(lrs,breaks=100,main=paste('After:',maint),xlab="# Reads (log)",
339 ylab="Count",col="maroon",sub=myTitle,xlim=c(0,lim),las=1)
340 grid(col="lightgray", lty="dotted")
341 dev.off()
342 par(defpar)
343 }
344
345 cumPlot1 = function(rawrs,cleanrs,maint,myTitle)
346 { # updated to use ecdf
347 pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"RowsumCum.pdf",sep='_')
348 pdf(pdfname)
349 par(mfrow=c(2,1))
350 lastx = max(rawrs)
351 rawe = knots(ecdf(rawrs))
352 cleane = knots(ecdf(cleanrs))
353 cy = 1:length(cleane)/length(cleane)
354 ry = 1:length(rawe)/length(rawe)
355 plot(rawe,ry,type='l',main=paste('Before',maint),xlab="Log Contig Total Reads",
356 ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle)
357 grid(col="blue")
358 plot(cleane,cy,type='l',main=paste('After',maint),xlab="Log Contig Total Reads",
359 ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle)
360 grid(col="blue")
361 dev.off()
362 }
363
364
365
366 doGSEA = function(y=NULL,design=NULL,histgmt="",
367 bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
368 ntest=0, myTitle="myTitle", outfname="GSEA.xls", minnin=5, maxnin=2000,fdrthresh=0.05,fdrtype="BH")
369 {
370 sink('Camera.log')
371 genesets = c()
372 if (bigmt > "")
373 {
374 bigenesets = readLines(bigmt)
375 genesets = bigenesets
376 }
377 if (histgmt > "")
378 {
379 hgenesets = readLines(histgmt)
380 if (bigmt > "") {
381 genesets = rbind(genesets,hgenesets)
382 } else {
383 genesets = hgenesets
384 } # use only history if no bi
385 }
386 print.noquote(paste("@@@read",length(genesets), 'genesets from',histgmt,bigmt))
387 genesets = strsplit(genesets,'\t') # tabular. genesetid\tURLorwhatever\tgene_1\t..\tgene_n
388 outf = outfname
389 head=paste(myTitle,'edgeR GSEA')
390 write(head,file=outfname,append=F)
391 ntest=length(genesets)
392 urownames = toupper(rownames(y))
393 upcam = c()
394 downcam = c()
395 for (i in 1:ntest) {
396 gs = unlist(genesets[i])
397 g = gs[1] # geneset_id
398 u = gs[2]
399 if (u > "") { u = paste("<a href=\'",u,"\'>",u,"</a>",sep="") }
400 glist = gs[3:length(gs)] # member gene symbols
401 glist = toupper(glist)
402 inglist = urownames %in% glist
403 nin = sum(inglist)
404 if ((nin > minnin) && (nin < maxnin)) {
405 ### print(paste('@@found',sum(inglist),'genes in glist'))
406 camres = camera(y=y,index=inglist,design=design)
407 if (! is.null(camres)) {
408 rownames(camres) = g # gene set name
409 camres = cbind(GeneSet=g,URL=u,camres)
410 if (camres\$Direction == "Up")
411 {
412 upcam = rbind(upcam,camres) } else {
413 downcam = rbind(downcam,camres)
414 }
415 }
416 }
417 }
418 uscam = upcam[order(upcam\$PValue),]
419 unadjp = uscam\$PValue
420 uscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
421 nup = max(10,sum((uscam\$adjPValue < fdrthresh)))
422 dscam = downcam[order(downcam\$PValue),]
423 unadjp = dscam\$PValue
424 dscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
425 ndown = max(10,sum((dscam\$adjPValue < fdrthresh)))
426 write.table(uscam,file=paste('camera_up',outfname,sep='_'),quote=F,sep='\t',row.names=F)
427 write.table(dscam,file=paste('camera_down',outfname,sep='_'),quote=F,sep='\t',row.names=F)
428 print.noquote(paste('@@@@@ Camera up top',nup,'gene sets:'))
429 write.table(head(uscam,nup),file="",quote=F,sep='\t',row.names=F)
430 print.noquote(paste('@@@@@ Camera down top',ndown,'gene sets:'))
431 write.table(head(dscam,ndown),file="",quote=F,sep='\t',row.names=F)
432 sink()
433 }
434
435
436
437
438 doGSEAatonce = function(y=NULL,design=NULL,histgmt="",
439 bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
440 ntest=0, myTitle="myTitle", outfname="GSEA.xls", minnin=5, maxnin=2000,fdrthresh=0.05,fdrtype="BH")
441 {
442 sink('Camera.log')
443 genesets = c()
444 if (bigmt > "")
445 {
446 bigenesets = readLines(bigmt)
447 genesets = bigenesets
448 }
449 if (histgmt > "")
450 {
451 hgenesets = readLines(histgmt)
452 if (bigmt > "") {
453 genesets = rbind(genesets,hgenesets)
454 } else {
455 genesets = hgenesets
456 } # use only history if no bi
457 }
458 print.noquote(paste("@@@read",length(genesets), 'genesets from',histgmt,bigmt))
459 genesets = strsplit(genesets,'\t') # tabular. genesetid\tURLorwhatever\tgene_1\t..\tgene_n
460 outf = outfname
461 head=paste(myTitle,'edgeR GSEA')
462 write(head,file=outfname,append=F)
463 ntest=length(genesets)
464 urownames = toupper(rownames(y))
465 upcam = c()
466 downcam = c()
467 incam = c()
468 urls = c()
469 gsids = c()
470 for (i in 1:ntest) {
471 gs = unlist(genesets[i])
472 gsid = gs[1] # geneset_id
473 url = gs[2]
474 if (url > "") { url = paste("<a href=\'",url,"\'>",url,"</a>",sep="") }
475 glist = gs[3:length(gs)] # member gene symbols
476 glist = toupper(glist)
477 inglist = urownames %in% glist
478 nin = sum(inglist)
479 if ((nin > minnin) && (nin < maxnin)) {
480 incam = c(incam,inglist)
481 gsids = c(gsids,gsid)
482 urls = c(urls,url)
483 }
484 }
485 incam = as.list(incam)
486 names(incam) = gsids
487 allcam = camera(y=y,index=incam,design=design)
488 allcamres = cbind(geneset=gsids,allcam,URL=urls)
489 for (i in 1:ntest) {
490 camres = allcamres[i]
491 res = try(test = (camres\$Direction == "Up"))
492 if ("try-error" %in% class(res)) {
493 cat("test failed, camres = :")
494 print.noquote(camres)
495 } else { if (camres\$Direction == "Up")
496 { upcam = rbind(upcam,camres)
497 } else { downcam = rbind(downcam,camres)
498 }
499
500 }
501 }
502 uscam = upcam[order(upcam\$PValue),]
503 unadjp = uscam\$PValue
504 uscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
505 nup = max(10,sum((uscam\$adjPValue < fdrthresh)))
506 dscam = downcam[order(downcam\$PValue),]
507 unadjp = dscam\$PValue
508 dscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
509 ndown = max(10,sum((dscam\$adjPValue < fdrthresh)))
510 write.table(uscam,file=paste('camera_up',outfname,sep='_'),quote=F,sep='\t',row.names=F)
511 write.table(dscam,file=paste('camera_down',outfname,sep='_'),quote=F,sep='\t',row.names=F)
512 print.noquote(paste('@@@@@ Camera up top',nup,'gene sets:'))
513 write.table(head(uscam,nup),file="",quote=F,sep='\t',row.names=F)
514 print.noquote(paste('@@@@@ Camera down top',ndown,'gene sets:'))
515 write.table(head(dscam,ndown),file="",quote=F,sep='\t',row.names=F)
516 sink()
517 }
518
519
520 edgeIt = function (Count_Matrix=c(),group=c(),out_edgeR=F,out_VOOM=F,out_DESeq2=F,fdrtype='fdr',priordf=5,
521 fdrthresh=0.05,outputdir='.', myTitle='Differential Counts',libSize=c(),useNDF=F,
522 filterquantile=0.2, subjects=c(),mydesign=NULL,
523 doDESeq2=T,doVoom=T,doCamera=T,doedgeR=T,org='hg19',
524 histgmt="", bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
525 doCook=F,DESeq_fitType="parameteric")
526 {
527 # Error handling
528 if (length(unique(group))!=2){
529 print("Number of conditions identified in experiment does not equal 2")
530 q()
531 }
532 require(edgeR)
533 options(width = 512)
534 mt = paste(unlist(strsplit(myTitle,'_')),collapse=" ")
535 allN = nrow(Count_Matrix)
536 nscut = round(ncol(Count_Matrix)/2)
537 colTotmillionreads = colSums(Count_Matrix)/1e6
538 counts.dataframe = as.data.frame(c())
539 rawrs = rowSums(Count_Matrix)
540 nonzerod = Count_Matrix[(rawrs > 0),] # remove all zero count genes
541 nzN = nrow(nonzerod)
542 nzrs = rowSums(nonzerod)
543 zN = allN - nzN
544 print('# Quantiles for non-zero row counts:',quote=F)
545 print(quantile(nzrs,probs=seq(0,1,0.1)),quote=F)
546 if (useNDF == T)
547 {
548 gt1rpin3 = rowSums(Count_Matrix/expandAsMatrix(colTotmillionreads,dim(Count_Matrix)) >= 1) >= nscut
549 lo = colSums(Count_Matrix[!gt1rpin3,])
550 workCM = Count_Matrix[gt1rpin3,]
551 cleanrs = rowSums(workCM)
552 cleanN = length(cleanrs)
553 meth = paste( "After removing",length(lo),"contigs with fewer than ",nscut," sample read counts >= 1 per million, there are",sep="")
554 print(paste("Read",allN,"contigs. Removed",zN,"contigs with no reads.",meth,cleanN,"contigs"),quote=F)
555 maint = paste('Filter >=1/million reads in >=',nscut,'samples')
556 } else {
557 useme = (nzrs > quantile(nzrs,filterquantile))
558 workCM = nonzerod[useme,]
559 lo = colSums(nonzerod[!useme,])
560 cleanrs = rowSums(workCM)
561 cleanN = length(cleanrs)
562 meth = paste("After filtering at count quantile =",filterquantile,", there are",sep="")
563 print(paste('Read',allN,"contigs. Removed",zN,"with no reads.",meth,cleanN,"contigs"),quote=F)
564 maint = paste('Filter below',filterquantile,'quantile')
565 }
566 cumPlot(rawrs=rawrs,cleanrs=cleanrs,maint=maint,myTitle=myTitle)
567 allgenes = rownames(workCM)
568 reg = "^chr([0-9]+):([0-9]+)-([0-9]+)"
569 genecards="<a href=\'http://www.genecards.org/index.php?path=/Search/keyword/"
570 ucsc = paste("<a href=\'http://genome.ucsc.edu/cgi-bin/hgTracks?db=",org,sep='')
571 testreg = str_match(allgenes,reg)
572 if (sum(!is.na(testreg[,1]))/length(testreg[,1]) > 0.8) # is ucsc style string
573 {
574 print("@@ using ucsc substitution for urls")
575 contigurls = paste0(ucsc,"&amp;position=chr",testreg[,2],":",testreg[,3],"-",testreg[,4],"\'>",allgenes,"</a>")
576 } else {
577 print.noquote("@@ using genecards substitution for urls")
578 contigurls = paste0(genecards,allgenes,"\'>",allgenes,"</a>")
579 }
580 print(paste("# Total low count contigs per sample = ",paste(lo,collapse=',')),quote=F)
581 cmrowsums = rowSums(workCM)
582 TName=unique(group)[1]
583 CName=unique(group)[2]
584 if (is.null(mydesign)) {
585 if (length(subjects) == 0)
586 {
587 mydesign = model.matrix(~group)
588 }
589 else {
590 subjf = factor(subjects)
591 mydesign = model.matrix(~subjf+group) # we block on subject so make group last to simplify finding it
592 }
593 }
594 print.noquote(paste('Using samples:',paste(colnames(workCM),collapse=',')))
595 print.noquote('Using design matrix:')
596 print.noquote(mydesign)
597 if (doedgeR) {
598 sink('edgeR.log')
599 #### Setup DGEList object
600 DGEList = DGEList(counts=workCM, group = group)
601 DGEList = calcNormFactors(DGEList)
602
603 DGEList = estimateGLMCommonDisp(DGEList,mydesign)
604 comdisp = DGEList\$common.dispersion
605 DGEList = estimateGLMTrendedDisp(DGEList,mydesign)
606 if (edgeR_priordf > 0) {
607 print.noquote(paste("prior.df =",edgeR_priordf))
608 DGEList = estimateGLMTagwiseDisp(DGEList,mydesign,prior.df = edgeR_priordf)
609 } else {
610 DGEList = estimateGLMTagwiseDisp(DGEList,mydesign)
611 }
612 DGLM = glmFit(DGEList,design=mydesign)
613 DE = glmLRT(DGLM,coef=ncol(DGLM\$design)) # always last one - subject is first if needed
614 efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors
615 normData = (1e+06*DGEList\$counts/efflib)
616 uoutput = cbind(
617 Name=as.character(rownames(DGEList\$counts)),
618 DE\$table,
619 adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
620 Dispersion=DGEList\$tagwise.dispersion,totreads=cmrowsums,normData,
621 DGEList\$counts
622 )
623 soutput = uoutput[order(DE\$table\$PValue),] # sorted into p value order - for quick toptable
624 goodness = gof(DGLM, pcutoff=fdrthresh)
625 if (sum(goodness\$outlier) > 0) {
626 print.noquote('GLM outliers:')
627 print(paste(rownames(DGLM)[(goodness\$outlier)],collapse=','),quote=F)
628 } else {
629 print('No GLM fit outlier genes found\n')
630 }
631 z = limma::zscoreGamma(goodness\$gof.statistic, shape=goodness\$df/2, scale=2)
632 pdf("edgeR_GoodnessofFit.pdf")
633 qq = qqnorm(z, panel.first=grid(), main="tagwise dispersion")
634 abline(0,1,lwd=3)
635 points(qq\$x[goodness\$outlier],qq\$y[goodness\$outlier], pch=16, col="maroon")
636 dev.off()
637 estpriorn = getPriorN(DGEList)
638 print(paste("Common Dispersion =",comdisp,"CV = ",sqrt(comdisp),"getPriorN = ",estpriorn),quote=F)
639 efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors
640 normData = (1e+06*DGEList\$counts/efflib)
641 uniqueg = unique(group)
642 #### Plot MDS
643 sample_colors = match(group,levels(group))
644 sampleTypes = levels(factor(group))
645 print.noquote(sampleTypes)
646 pdf("edgeR_MDSplot.pdf")
647 plotMDS.DGEList(DGEList,main=paste("edgeR MDS for",myTitle),cex=0.5,col=sample_colors,pch=sample_colors)
648 legend(x="topleft", legend = sampleTypes,col=c(1:length(sampleTypes)), pch=19)
649 grid(col="blue")
650 dev.off()
651 colnames(normData) = paste( colnames(normData),'N',sep="_")
652 print(paste('Raw sample read totals',paste(colSums(nonzerod,na.rm=T),collapse=',')))
653 nzd = data.frame(log(nonzerod + 1e-2,10))
654 try( boxPlot(rawrs=nzd,cleanrs=log(normData,10),maint='TMM Normalisation',myTitle=myTitle,pdfname="edgeR_raw_norm_counts_box.pdf") )
655 write.table(soutput,file=out_edgeR, quote=FALSE, sep="\t",row.names=F)
656 tt = cbind(
657 Name=as.character(rownames(DGEList\$counts)),
658 DE\$table,
659 adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
660 Dispersion=DGEList\$tagwise.dispersion,totreads=cmrowsums
661 )
662 print.noquote("# edgeR Top tags\n")
663 tt = cbind(tt,URL=contigurls) # add to end so table isn't laid out strangely
664 tt = tt[order(DE\$table\$PValue),]
665 print.noquote(tt[1:50,])
666 deTags = rownames(uoutput[uoutput\$adj.p.value < fdrthresh,])
667 nsig = length(deTags)
668 print(paste('#',nsig,'tags significant at adj p=',fdrthresh),quote=F)
669 deColours = ifelse(deTags,'red','black')
670 pdf("edgeR_BCV_vs_abundance.pdf")
671 plotBCV(DGEList, cex=0.3, main="Biological CV vs abundance")
672 dev.off()
673 dg = DGEList[order(DE\$table\$PValue),]
674 #normData = (1e+06 * dg\$counts/expandAsMatrix(dg\$samples\$lib.size, dim(dg)))
675 efflib = dg\$samples\$lib.size*dg\$samples\$norm.factors
676 normData = (1e+06*dg\$counts/efflib)
677 outpdfname="edgeR_top_100_heatmap.pdf"
678 hmap2(normData,nsamp=100,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste('edgeR Heatmap',myTitle))
679 outSmear = "edgeR_smearplot.pdf"
680 outMain = paste("Smear Plot for ",TName,' Vs ',CName,' (FDR@',fdrthresh,' N = ',nsig,')',sep='')
681 smearPlot(DGEList=DGEList,deTags=deTags, outSmear=outSmear, outMain = outMain)
682 qqPlot(descr=paste(myTitle,'edgeR adj p QQ plot'),pvector=tt\$adj.p.value,outpdf='edgeR_qqplot.pdf')
683 norm.factor = DGEList\$samples\$norm.factors
684 topresults.edgeR = soutput[which(soutput\$adj.p.value < fdrthresh), ]
685 edgeRcountsindex = which(allgenes %in% rownames(topresults.edgeR))
686 edgeRcounts = rep(0, length(allgenes))
687 edgeRcounts[edgeRcountsindex] = 1 # Create venn diagram of hits
688 sink()
689 } ### doedgeR
690 if (doDESeq2 == T)
691 {
692 sink("DESeq2.log")
693 # DESeq2
694 require('DESeq2')
695 library('RColorBrewer')
696 if (length(subjects) == 0)
697 {
698 pdata = data.frame(Name=colnames(workCM),Rx=group,row.names=colnames(workCM))
699 deSEQds = DESeqDataSetFromMatrix(countData = workCM, colData = pdata, design = formula(~ Rx))
700 } else {
701 pdata = data.frame(Name=colnames(workCM),Rx=group,subjects=subjects,row.names=colnames(workCM))
702 deSEQds = DESeqDataSetFromMatrix(countData = workCM, colData = pdata, design = formula(~ subjects + Rx))
703 }
704 #DESeq2 = DESeq(deSEQds,fitType='local',pAdjustMethod=fdrtype)
705 #rDESeq = results(DESeq2)
706 #newCountDataSet(workCM, group)
707 deSeqDatsizefac = estimateSizeFactors(deSEQds)
708 deSeqDatdisp = estimateDispersions(deSeqDatsizefac,fitType=DESeq_fitType)
709 resDESeq = nbinomWaldTest(deSeqDatdisp, pAdjustMethod=fdrtype)
710 rDESeq = as.data.frame(results(resDESeq))
711 rDESeq = cbind(Contig=rownames(workCM),rDESeq,NReads=cmrowsums)
712 srDESeq = rDESeq[order(rDESeq\$pvalue),]
713 write.table(srDESeq,file=out_DESeq2, quote=FALSE, sep="\t",row.names=F)
714 qqPlot(descr=paste(myTitle,'DESeq2 adj p qq plot'),pvector=rDESeq\$padj,outpdf='DESeq2_qqplot.pdf')
715 cat("# DESeq top 50\n")
716 rDESeq = cbind(Contig=rownames(workCM),rDESeq,NReads=cmrowsums,URL=contigurls)
717 srDESeq = rDESeq[order(rDESeq\$pvalue),]
718 print.noquote(srDESeq[1:50,])
719 topresults.DESeq = rDESeq[which(rDESeq\$padj < fdrthresh), ]
720 DESeqcountsindex = which(allgenes %in% rownames(topresults.DESeq))
721 DESeqcounts = rep(0, length(allgenes))
722 DESeqcounts[DESeqcountsindex] = 1
723 pdf("DESeq2_dispersion_estimates.pdf")
724 plotDispEsts(resDESeq)
725 dev.off()
726 ysmall = abs(min(rDESeq\$log2FoldChange))
727 ybig = abs(max(rDESeq\$log2FoldChange))
728 ylimit = min(4,ysmall,ybig)
729 pdf("DESeq2_MA_plot.pdf")
730 plotMA(resDESeq,main=paste(myTitle,"DESeq2 MA plot"),ylim=c(-ylimit,ylimit))
731 dev.off()
732 rlogres = rlogTransformation(resDESeq)
733 sampledists = dist( t( assay(rlogres) ) )
734 sdmat = as.matrix(sampledists)
735 pdf("DESeq2_sample_distance_plot.pdf")
736 heatmap.2(sdmat,trace="none",main=paste(myTitle,"DESeq2 sample distances"),
737 col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(255))
738 dev.off()
739 ###outpdfname="DESeq2_top50_heatmap.pdf"
740 ###hmap2(sresDESeq,nsamp=50,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste('DESeq2 vst rlog Heatmap',myTitle))
741 sink()
742 result = try( (ppca = plotPCA( varianceStabilizingTransformation(deSeqDatdisp,blind=T), intgroup=c("Rx","Name")) ) )
743 if ("try-error" %in% class(result)) {
744 print.noquote('DESeq2 plotPCA failed.')
745 } else {
746 pdf("DESeq2_PCA_plot.pdf")
747 #### wtf - print? Seems needed to get this to work
748 print(ppca)
749 dev.off()
750 }
751 }
752
753 if (doVoom == T) {
754 sink('Voom.log')
755 if (doedgeR == F) {
756 #### Setup DGEList object
757 DGEList = DGEList(counts=workCM, group = group)
758 DGEList = calcNormFactors(DGEList)
759 DGEList = estimateGLMCommonDisp(DGEList,mydesign)
760 DGEList = estimateGLMTrendedDisp(DGEList,mydesign)
761 DGEList = estimateGLMTagwiseDisp(DGEList,mydesign)
762 DGEList = estimateGLMTagwiseDisp(DGEList,mydesign)
763 norm.factor = DGEList\$samples\$norm.factors
764 }
765 pdf("Voom_mean_variance_plot.pdf")
766 dat.voomed = voom(DGEList, mydesign, plot = TRUE, lib.size = colSums(workCM) * norm.factor)
767 dev.off()
768 # Use limma to fit data
769 fit = lmFit(dat.voomed, mydesign)
770 fit = eBayes(fit)
771 rvoom = topTable(fit, coef = length(colnames(mydesign)), adj = fdrtype, n = Inf, sort="none")
772 qqPlot(descr=paste(myTitle,'Voom-limma adj p QQ plot'),pvector=rvoom\$adj.P.Val,outpdf='Voom_qqplot.pdf')
773 rownames(rvoom) = rownames(workCM)
774 rvoom = cbind(rvoom,NReads=cmrowsums)
775 srvoom = rvoom[order(rvoom\$P.Value),]
776 write.table(srvoom,file=out_VOOM, quote=FALSE, sep="\t",row.names=F)
777 rvoom = cbind(rvoom,NReads=cmrowsums,URL=contigurls)
778 srvoom = rvoom[order(rvoom\$P.Value),]
779 cat("# Voom top 50\n")
780 print(srvoom[1:50,])
781 # Use an FDR cutoff to find interesting samples for edgeR, DESeq and voom/limma
782 topresults.voom = rvoom[which(rvoom\$adj.P.Val < fdrthresh), ]
783 voomcountsindex = which(allgenes %in% topresults.voom\$ID)
784 voomcounts = rep(0, length(allgenes))
785 voomcounts[voomcountsindex] = 1
786 sink()
787 }
788
789 if (doCamera) {
790 doGSEA(y=DGEList,design=mydesign,histgmt=histgmt,bigmt=bigmt,ntest=20,myTitle=myTitle,
791 outfname=paste(mt,"GSEA.xls",sep="_"),fdrthresh=fdrthresh,fdrtype=fdrtype)
792 }
793
794 if ((doDESeq2==T) || (doVoom==T) || (doedgeR==T)) {
795 if ((doVoom==T) && (doDESeq2==T) && (doedgeR==T)) {
796 vennmain = paste(mt,'Voom,edgeR and DESeq2 overlap at FDR=',fdrthresh)
797 counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts,
798 VOOM_limma = voomcounts, row.names = allgenes)
799 } else if ((doDESeq2==T) && (doedgeR==T)) {
800 vennmain = paste(mt,'DESeq2 and edgeR overlap at FDR=',fdrthresh)
801 counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts, row.names = allgenes)
802 } else if ((doVoom==T) && (doedgeR==T)) {
803 vennmain = paste(mt,'Voom and edgeR overlap at FDR=',fdrthresh)
804 counts.dataframe = data.frame(edgeR = edgeRcounts, VOOM_limma = voomcounts, row.names = allgenes)
805 }
806
807 if (nrow(counts.dataframe > 1)) {
808 counts.venn = vennCounts(counts.dataframe)
809 vennf = "Venn_significant_genes_overlap.pdf"
810 pdf(vennf)
811 vennDiagram(counts.venn,main=vennmain,col="maroon")
812 dev.off()
813 }
814 } #### doDESeq2 or doVoom
815
816 }
817 #### Done
818
819 ###sink(stdout(),append=T,type="message")
820 builtin_gmt = ""
821 history_gmt = ""
822 history_gmt_name = ""
823 out_edgeR = F
824 out_DESeq2 = F
825 out_VOOM = "$out_VOOM"
826 doDESeq2 = $DESeq2.doDESeq2 # make these T or F
827 doVoom = $doVoom
828 doCamera = F
829 doedgeR = $edgeR.doedgeR
830 edgeR_priordf = 0
831
832
833 #if $doVoom == "T":
834 out_VOOM = "$out_VOOM"
835 #end if
836
837 #if $DESeq2.doDESeq2 == "T":
838 out_DESeq2 = "$out_DESeq2"
839 DESeq_fitType = "$DESeq2.DESeq_fitType"
840 #end if
841
842 #if $edgeR.doedgeR == "T":
843 out_edgeR = "$out_edgeR"
844 edgeR_priordf = $edgeR.edgeR_priordf
845 #end if
846
847
848 if (sum(c(doedgeR,doVoom,doDESeq2)) == 0)
849 {
850 write("No methods chosen - nothing to do! Please try again after choosing one or more methods", stderr())
851 quit(save="no",status=2)
852 }
853
854 Out_Dir = "$html_file.files_path"
855 Input = "$input1"
856 TreatmentName = "$treatment_name"
857 TreatmentCols = "$Treat_cols"
858 ControlName = "$control_name"
859 ControlCols= "$Control_cols"
860 org = "$input1.dbkey"
861 if (org == "") { org = "hg19"}
862 fdrtype = "$fdrtype"
863 fdrthresh = $fdrthresh
864 useNDF = $useNDF
865 fQ = $fQ # non-differential centile cutoff
866 myTitle = "$title"
867 sids = strsplit("$subjectids",',')
868 subjects = unlist(sids)
869 nsubj = length(subjects)
870 TCols = as.numeric(strsplit(TreatmentCols,",")[[1]])-1
871 CCols = as.numeric(strsplit(ControlCols,",")[[1]])-1
872 cat('Got TCols=')
873 cat(TCols)
874 cat('; CCols=')
875 cat(CCols)
876 cat('\n')
877 useCols = c(TCols,CCols)
878 if (file.exists(Out_Dir) == F) dir.create(Out_Dir)
879 Count_Matrix = read.table(Input,header=T,row.names=1,sep='\t') #Load tab file assume header
880 snames = colnames(Count_Matrix)
881 nsamples = length(snames)
882 if (nsubj > 0 & nsubj != nsamples) {
883 options("show.error.messages"=T)
884 mess = paste('Fatal error: Supplied subject id list',paste(subjects,collapse=','),
885 'has length',nsubj,'but there are',nsamples,'samples',paste(snames,collapse=','))
886 write(mess, stderr())
887 quit(save="no",status=4)
888 }
889 if (length(subjects) != 0) {subjects = subjects[useCols]}
890 Count_Matrix = Count_Matrix[,useCols] ### reorder columns
891 rn = rownames(Count_Matrix)
892 islib = rn %in% c('librarySize','NotInBedRegions')
893 LibSizes = Count_Matrix[subset(rn,islib),][1] # take first
894 Count_Matrix = Count_Matrix[subset(rn,! islib),]
895 group = c(rep(TreatmentName,length(TCols)), rep(ControlName,length(CCols)) ) #Build a group descriptor
896 group = factor(group, levels=c(ControlName,TreatmentName))
897 colnames(Count_Matrix) = paste(group,colnames(Count_Matrix),sep="_") #Relable columns
898 results = edgeIt(Count_Matrix=Count_Matrix,group=group, out_edgeR=out_edgeR, out_VOOM=out_VOOM, out_DESeq2=out_DESeq2,
899 fdrtype='BH',mydesign=NULL,priordf=edgeR_priordf,fdrthresh=fdrthresh,outputdir='.',
900 myTitle=myTitle,useNDF=F,libSize=c(),filterquantile=fQ,subjects=subjects,
901 doDESeq2=doDESeq2,doVoom=doVoom,doCamera=doCamera,doedgeR=doedgeR,org=org,
902 histgmt=history_gmt,bigmt=builtin_gmt,DESeq_fitType=DESeq_fitType)
903 sessionInfo()
904 ]]>
905 </configfile>
906 </configfiles>
907 <help>
908
909 **What it does**
910
911 Allows short read sequence counts from controlled experiments to be analysed for differentially expressed genes.
912 Optionally adds a term for subject if not all samples are independent or if some other factor needs to be blocked in the design.
913
914 **Input**
915
916 Requires a count matrix as a tabular file. These are best made using the companion HTSeq_ based counter Galaxy wrapper
917 and your fave gene model to generate inputs. Each row is a genomic feature (gene or exon eg) and each column the
918 non-negative integer count of reads from one sample overlapping the feature.
919 The matrix must have a header row uniquely identifying the source samples, and unique row names in
920 the first column. Typically the row names are gene symbols or probe ids for downstream use in GSEA and other methods.
921
922 **Specifying comparisons**
923
924 This is basically dumbed down for two factors - case vs control.
925
926 More complex interfaces are possible but painful at present.
927 Probably need to specify a phenotype file to do this better.
928 Work in progress. Send code.
929
930 If you have (eg) paired samples and wish to include a term in the GLM to account for some other factor (subject in the case of paired samples),
931 put a comma separated list of indicators for every sample (whether modelled or not!) indicating (eg) the subject number or
932 A list of integers, one for each subject or an empty string if samples are all independent.
933 If not empty, there must be exactly as many integers in the supplied integer list as there are columns (samples) in the count matrix.
934 Integers for samples that are not in the analysis *must* be present in the string as filler even if not used.
935
936 So if you have 2 pairs out of 6 samples, you need to put in unique integers for the unpaired ones
937 eg if you had 6 samples with the first two independent but the second and third pairs each being from independent subjects. you might use
938 8,9,1,1,2,2
939 as subject IDs to indicate two paired samples from the same subject in columns 3/4 and 5/6
940
941 **Methods available**
942
943 You can run 3 popular Bioconductor packages available for count data.
944
945 edgeR - see edgeR_ for details
946
947 VOOM/limma - see limma_VOOM_ for details
948
949 DESeq2 - see DESeq2_ for details
950
951 and optionally camera in edgeR which works better if MSigDB is installed.
952
953 **Outputs**
954
955 Some helpful plots and analysis results. Note that most of these are produced using R code
956 suggested by the excellent documentation and vignettes for the Bioconductor
957 packages invoked. The Tool Factory is used to automatically lay these out for you to enjoy.
958
959 **Note on Voom**
960
961 The voom from limma version 3.16.6 help in R includes this from the authors - but you should read the paper to interpret this method.
962
963 This function is intended to process RNA-Seq or ChIP-Seq data prior to linear modelling in limma.
964
965 voom is an acronym for mean-variance modelling at the observational level.
966 The key concern is to estimate the mean-variance relationship in the data, then use this to compute appropriate weights for each observation.
967 Count data almost show non-trivial mean-variance relationships. Raw counts show increasing variance with increasing count size, while log-counts typically show a decreasing mean-variance trend.
968 This function estimates the mean-variance trend for log-counts, then assigns a weight to each observation based on its predicted variance.
969 The weights are then used in the linear modelling process to adjust for heteroscedasticity.
970
971 In an experiment, a count value is observed for each tag in each sample. A tag-wise mean-variance trend is computed using lowess.
972 The tag-wise mean is the mean log2 count with an offset of 0.5, across samples for a given tag.
973 The tag-wise variance is the quarter-root-variance of normalized log2 counts per million values with an offset of 0.5, across samples for a given tag.
974 Tags with zero counts across all samples are not included in the lowess fit. Optional normalization is performed using normalizeBetweenArrays.
975 Using fitted values of log2 counts from a linear model fit by lmFit, variances from the mean-variance trend were interpolated for each observation.
976 This was carried out by approxfun. Inverse variance weights can be used to correct for mean-variance trend in the count data.
977
978
979 Author(s)
980
981 Charity Law and Gordon Smyth
982
983 References
984
985 Law, CW (2013). Precision weights for gene expression analysis. PhD Thesis. University of Melbourne, Australia.
986
987 Law, CW, Chen, Y, Shi, W, Smyth, GK (2013). Voom! Precision weights unlock linear model analysis tools for RNA-seq read counts.
988 Technical Report 1 May 2013, Bioinformatics Division, Walter and Eliza Hall Institute of Medical Reseach, Melbourne, Australia.
989 http://www.statsci.org/smyth/pubs/VoomPreprint.pdf
990
991 See Also
992
993 A voom case study is given in the edgeR User's Guide.
994
995 vooma is a similar function but for microarrays instead of RNA-seq.
996
997
998 ***old rant on changes to Bioconductor package variable names between versions***
999
1000 The edgeR authors made a small cosmetic change in the name of one important variable (from p.value to PValue)
1001 breaking this and all other code that assumed the old name for this variable,
1002 between edgeR2.4.4 and 2.4.6 (the version for R 2.14 as at the time of writing).
1003 This means that all code using edgeR is sensitive to the version. I think this was a very unwise thing
1004 to do because it wasted hours of my time to track down and will similarly cost other edgeR users dearly
1005 when their old scripts break. This tool currently now works with 2.4.6.
1006
1007 **Note on prior.N**
1008
1009 http://seqanswers.com/forums/showthread.php?t=5591 says:
1010
1011 *prior.n*
1012
1013 The value for prior.n determines the amount of smoothing of tagwise dispersions towards the common dispersion.
1014 You can think of it as like a "weight" for the common value. (It is actually the weight for the common likelihood
1015 in the weighted likelihood equation). The larger the value for prior.n, the more smoothing, i.e. the closer your
1016 tagwise dispersion estimates will be to the common dispersion. If you use a prior.n of 1, then that gives the
1017 common likelihood the weight of one observation.
1018
1019 In answer to your question, it is a good thing to squeeze the tagwise dispersions towards a common value,
1020 or else you will be using very unreliable estimates of the dispersion. I would not recommend using the value that
1021 you obtained from estimateSmoothing()---this is far too small and would result in virtually no moderation
1022 (squeezing) of the tagwise dispersions. How many samples do you have in your experiment?
1023 What is the experimental design? If you have few samples (less than 6) then I would suggest a prior.n of at least 10.
1024 If you have more samples, then the tagwise dispersion estimates will be more reliable,
1025 so you could consider using a smaller prior.n, although I would hesitate to use a prior.n less than 5.
1026
1027
1028 From Bioconductor Digest, Vol 118, Issue 5, Gordon writes:
1029
1030 Dear Dorota,
1031
1032 The important settings are prior.df and trend.
1033
1034 prior.n and prior.df are related through prior.df = prior.n * residual.df,
1035 and your experiment has residual.df = 36 - 12 = 24. So the old setting of
1036 prior.n=10 is equivalent for your data to prior.df = 240, a very large
1037 value. Going the other way, the new setting of prior.df=10 is equivalent
1038 to prior.n=10/24.
1039
1040 To recover old results with the current software you would use
1041
1042 estimateTagwiseDisp(object, prior.df=240, trend="none")
1043
1044 To get the new default from old software you would use
1045
1046 estimateTagwiseDisp(object, prior.n=10/24, trend=TRUE)
1047
1048 Actually the old trend method is equivalent to trend="loess" in the new
1049 software. You should use plotBCV(object) to see whether a trend is
1050 required.
1051
1052 Note you could also use
1053
1054 prior.n = getPriorN(object, prior.df=10)
1055
1056 to map between prior.df and prior.n.
1057
1058 ----
1059
1060 **Attributions**
1061
1062 edgeR - edgeR_
1063
1064 VOOM/limma - limma_VOOM_
1065
1066 DESeq2 - DESeq2_ for details
1067
1068 See above for Bioconductor package documentation for packages exposed in Galaxy by this tool and app store package.
1069
1070 Galaxy_ (that's what you are using right now!) for gluing everything together
1071
1072 Otherwise, all code and documentation comprising this tool was written by Ross Lazarus and is
1073 licensed to you under the LGPL_ like other rgenetics artefacts
1074
1075 .. _LGPL: http://www.gnu.org/copyleft/lesser.html
1076 .. _HTSeq: http://www-huber.embl.de/users/anders/HTSeq/doc/index.html
1077 .. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html
1078 .. _DESeq2: http://www.bioconductor.org/packages/release/bioc/html/DESeq2.html
1079 .. _limma_VOOM: http://www.bioconductor.org/packages/release/bioc/html/limma.html
1080 .. _Galaxy: http://getgalaxy.org
1081 </help>
1082
1083 </tool>
1084
1085