comparison rgedgeRpaired_nocamera.xml @ 148:1e20061decdd draft

Uploaded
author iuc
date Wed, 29 Apr 2015 12:07:19 -0400
parents 474c08e747b6
children 3107df74056e
comparison
equal deleted inserted replaced
147:34eedc5fe099 148:1e20061decdd
1 <?xml version="1.0"?>
2 <tool id="rgdifferentialcount" name="Differential_Count" version="0.28">
3 <description>models using BioConductor packages</description>
4 <requirements>
5 <requirement type="package" version="3.1.2">R</requirement>
6 <requirement type="package" version="1.3.18">graphicsmagick</requirement>
7 <requirement type="package" version="9.10">ghostscript</requirement>
8 <requirement type="package" version="2.14">biocbasics</requirement>
9 </requirements>
10 <command interpreter="python">
11 rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "Differential_Counts"
12 --output_dir "$html_file.files_path" --output_html "$html_file" --make_HTML "yes"
13 </command>
14 <inputs>
15 <param name="input1" type="data" format="tabular" label="Select an input matrix - rows are contigs, columns are counts for each sample" help="Use the HTSeq based count matrix preparation tool to create these matrices from BAM/SAM files and a GTF file of genomic features"/>
16 <param name="title" type="text" value="Differential Counts" size="80" label="Title for job outputs" help="Supply a meaningful name here to remind you what the outputs contain">
17 <sanitizer invalid_char="">
18 <valid initial="string.letters,string.digits">
19 <add value="_"/>
20 </valid>
21 </sanitizer>
22 </param>
23 <param name="treatment_name" type="text" value="Treatment" size="50" label="Treatment Name"/>
24 <param name="Treat_cols" label="Select columns containing treatment." type="data_column" data_ref="input1" numerical="True" multiple="true" use_header_names="true" size="120" display="checkboxes" force_select="True">
25 <validator type="no_options" message="Please select at least one column."/>
26 </param>
27 <param name="control_name" type="text" value="Control" size="50" label="Control Name"/>
<!-- Control columns: same "at least one column" validation as Treat_cols so an empty selection is rejected in the form -->
<param name="Control_cols" label="Select columns containing control." type="data_column" data_ref="input1" numerical="True" multiple="true" use_header_names="true" size="120" display="checkboxes" force_select="True">
    <validator type="no_options" message="Please select at least one column."/>
</param>
30 <param name="subjectids" type="text" optional="true" size="120" value="" label="IF SUBJECTS NOT ALL INDEPENDENT! Enter comma separated strings to indicate sample labels for (eg) pairing - must be one for every column in input" help="Leave blank if no pairing, but eg if data from sample id A99 is in columns 2,4 and id C21 is in 3,5 then enter 'A99,C21,A99,C21'">
31 <sanitizer>
32 <valid initial="string.letters,string.digits">
33 <add value=","/>
34 </valid>
35 </sanitizer>
36 </param>
37 <param name="fQ" type="float" value="0.3" size="5" label="Non-differential contig count quantile threshold - zero to analyze all non-zero read count contigs" help="May be a good or a bad idea depending on the biology and the question. EG 0.3 = sparsest 30% of contigs with at least one read are removed before analysis"/>
38 <param name="useNDF" type="boolean" truevalue="T" falsevalue="F" checked="false" size="1" label="Non differential filter - remove contigs below a threshold (1 per million) for half or more samples" help="May be a good or a bad idea depending on the biology and the question. This was the old default. Quantile based is available as an alternative"/>
39 <conditional name="edgeR">
40 <param name="doedgeR" type="select" label="Run this model using edgeR" help="edgeR uses a negative binomial model and seems to be powerful, even with few replicates">
41 <option value="F">Do not run edgeR</option>
42 <option value="T" selected="true">Run edgeR</option>
43 </param>
44 <when value="T">
45 <param name="edgeR_priordf" type="integer" value="10" size="3" label="prior.df for tagwise dispersion - larger value = more squeezing of tag dispersions to common dispersion. Replaces prior.n and prior.df = prior.n * residual.df" help="10 = edgeR default. Use a larger value to 'smooth' small samples. See edgeR docs and note below"/>
<!-- Select control: the default comes from selected="true" on an option, so the stray value/size attributes are dropped -->
<param name="edgeR_robust_method" type="select" label="Use robust dispersion method" help="Use ordinary, anscombe or deviance robust deviance estimates">
    <option value="ordinary" selected="true">Use ordinary deviance estimates</option>
    <option value="deviance">Use robust deviance estimates</option>
    <option value="anscombe">use Anscombe robust deviance estimates</option>
</param>
51 </when>
52 <when value="F"/>
53 </conditional>
54 <conditional name="DESeq2">
55 <param name="doDESeq2" type="select" label="Run the same model with DESeq2 and compare findings" help="DESeq2 is an update to the DESeq package. It uses different assumptions and methods to edgeR">
56 <option value="F" selected="true">Do not run DESeq2</option>
57 <option value="T">Run DESeq2</option>
58 </param>
59 <when value="T">
<!-- Dispersion fit type passed to DESeq2; labelled so the control is identifiable in the tool form -->
<param name="DESeq_fitType" type="select" label="DESeq2 dispersion fit type">
    <option value="parametric" selected="true">Parametric (default) fit for dispersions</option>
    <option value="local">Local fit - this will automagically be used if parametric fit fails</option>
    <option value="mean">Mean dispersion fit- use this if you really understand what you're doing - read the fine manual linked below in the documentation</option>
</param>
65 </when>
66 <when value="F"> </when>
67 </conditional>
68 <param name="doVoom" type="select" label="Run the same model with Voom/limma and compare findings" help="Voom uses counts per million and a precise transformation of variance so count data can be analysed using limma">
69 <option value="F" selected="true">Do not run VOOM</option>
70 <option value="T">Run VOOM</option>
71 </param>
<param name="fdrthresh" type="float" value="0.05" size="5" label="P value threshold for FDR filtering for family wise error rate control" help="Conventional default value of 0.05 recommended"/>
<!-- Option values are passed to the script as the p.adjust method name; only the human-readable text is corrected here -->
<param name="fdrtype" type="select" label="Multiple testing (Type I error) control method" help="Use fdr or bh typically to control for the number of tests in a reliable way">
    <option value="fdr" selected="true">fdr</option>
    <option value="BH">Benjamini Hochberg</option>
    <option value="BY">Benjamini Yekutieli</option>
    <option value="bonferroni">Bonferroni</option>
    <option value="hochberg">Hochberg</option>
    <option value="holm">Holm</option>
    <option value="hommel">Hommel</option>
    <option value="none">no control for multiple tests</option>
</param>
83 </inputs>
84 <outputs>
85 <data format="tabular" name="out_edgeR" label="${title}_topTable_edgeR.xls">
86 <filter>edgeR['doedgeR'] == "T"</filter>
87 </data>
88 <data format="tabular" name="out_DESeq2" label="${title}_topTable_DESeq2.xls">
89 <filter>DESeq2['doDESeq2'] == "T"</filter>
90 </data>
91 <data format="tabular" name="out_VOOM" label="${title}_topTable_VOOM.xls">
92 <filter>doVoom == "T"</filter>
93 </data>
94 <data format="html" name="html_file" label="${title}.html"/>
95 </outputs>
96 <stdio>
97 <exit_code range="4" level="fatal" description="Number of subject ids must match total number of samples in the input matrix"/>
98 </stdio>
99 <tests>
<test>
    <param name="input1" value="test_bams2mx.xls" ftype="tabular"/>
    <param name="treatment_name" value="liver"/>
    <param name="title" value="edgeRtest"/>
    <param name="useNDF" value=""/>
    <param name="doedgeR" value="T"/>
    <param name="doVoom" value="T"/>
    <param name="doDESeq2" value="T"/>
    <param name="fdrtype" value="fdr"/>
    <param name="edgeR_priordf" value="8"/>
    <!-- must match the declared input name edgeR_robust_method, otherwise the value is not applied -->
    <param name="edgeR_robust_method" value="ordinary"/>
    <param name="fdrthresh" value="0.05"/>
    <param name="control_name" value="heart"/>
    <param name="subjectids" value=""/>
    <param name="Control_cols" value="3,4,5,9"/>
    <param name="Treat_cols" value="2,6,7,8"/>
    <output name="out_edgeR" file="edgeRtest1out.xls" compare="diff" lines_diff="20"/>
    <output name="html_file" file="edgeRtest1out.html" compare="diff" lines_diff="20"/>
</test>
119 </tests>
120 <configfiles>
121 <configfile name="runme"><![CDATA[
122 #
123 # edgeR.Rscript
124 # updated feb 2014 adding outlier-robust deviance estimate options by ross for R 3.0.2/bioc 2.13
125 # updated npv 2011 for R 2.14.0 and edgeR 2.4.0 by ross
126 # Performs DGE on a count table containing n replicates of two conditions
127 #
128 # Parameters
129 #
130 # 1 - Output Dir
131
132 # Original edgeR code by: S.Lunke and A.Kaspi
133 reallybig = log10(.Machine\$double.xmax)
134 reallysmall = log10(.Machine\$double.xmin)
135 library("gplots")
136 library("edgeR")
137 library('stringr')
hmap2 = function(cmat,nsamp=100,outpdfname='heatmap2.pdf', TName='Treatment',group=NA,myTitle='title goes here')
{
    # Write a row-scaled gplots heatmap.2 of (at most) the first nsamp rows of cmat to outpdfname.
    #   cmat       : matrix with one row per contig and one column per sample; rows are used in the order given
    #   nsamp      : maximum number of rows drawn
    #   outpdfname : path of the PDF to create
    #   TName      : accepted for interface compatibility; not used in this function
    #   group      : one label per column of cmat, used to colour the column side bar
    #   myTitle    : main title of the plot
    # Called for its side effect (the PDF file); returns NULL invisibly.
    gu = unique(group)
    if (length(gu) == 2) {
        # two groups: the first label seen is red, the other one blue
        pcols = ifelse(group == gu[1], "#FF0000", "#0000FF")
    } else {
        colours = rainbow(length(gu),start=0,end=4/6)
        pcols = colours[match(group,gu)]
    }
    gn = rownames(cmat)
    if (is.null(gn)) {
        # no row names at all: nothing can be identified as unlabelled, so keep every row
        dm = cmat
    } else {
        # remove unlabelled hm rows; drop=FALSE keeps a matrix even when a single row survives
        dm = cmat[(! is.na(gn)), , drop=FALSE]
    }
    if (nrow(dm) > nsamp) {
        dm = dm[1:nsamp, , drop=FALSE]
    }
    # long sample names are truncated so the column labels stay readable
    colnames(dm) = substr(colnames(dm),1,20)
    pdf(outpdfname)
    # close the device even when heatmap.2 fails so later plots do not land in this file
    on.exit(dev.off(), add=TRUE)
    heatmap.2(dm,main=myTitle,ColSideColors=pcols,col=topo.colors(100),dendrogram="col",key=T,density.info='none',
        Rowv=F,scale='row',trace='none',margins=c(8,8),cexRow=0.4,cexCol=0.5)
    invisible(NULL)
}
165
hmap = function(cmat,nmeans=4,outpdfname="heatMap.pdf",nsamp=250,TName='Treatment',group=NA,myTitle="Title goes here")
{
    # Base graphics heatmap of (at most) the first nsamp rows of cmat, written to outpdfname.
    # The column side bar gets one rainbow colour per distinct value of group.
    # nmeans and TName are accepted but not used by this function.
    groupLevels = unique(group)
    shades = rainbow(length(groupLevels),start=0.3,end=0.6)
    sideCols = shades[match(group,groupLevels)]
    totalRows = nrow(cmat)
    plotTitle = paste(myTitle,'Heatmap: n contigs =',totalRows)
    if (totalRows > nsamp) {
        # too many rows to show: keep the leading nsamp and say so in the title
        plotTitle = paste('Heatmap: Top ',nsamp,' DE contigs (of ',totalRows,')',sep='')
        cmat = cmat[c(1:nsamp),]
    }
    # column labels are cut to 20 characters
    colnames(cmat) = substr(colnames(cmat),1,20)
    pdf(outpdfname)
    heatmap(cmat,scale='row',main=plotTitle,cexRow=0.3,cexCol=0.4,Rowv=NA,ColSideColors=sideCols)
    dev.off()
}
186
qqPlot = function(descr='qqplot',pvector, outpdf='qqplot.pdf',...)
# stolen from https://gist.github.com/703512
# Write a QQ plot of observed against expected -log10 p values to outpdf.
#   descr   : main title of the plot
#   pvector : p values; sort() drops missing values before plotting
#   outpdf  : path of the PDF to create
#   ...     : further arguments handed to plot()
# Infinite transformed values are replaced by the script-level constants reallysmall and reallybig.
# Returns NULL invisibly; when no p value remains after sorting nothing is plotted and a note is printed.
{
    obs = -log10(sort(pvector,decreasing=F))
    nobs = length(obs)
    if (nobs == 0) {
        # 1:length(o) would count down from 1 to 0 here and the axis limits would be undefined
        print.noquote(paste('qqPlot: no non-missing p values for',descr,'- plot not written'))
        return(invisible(NULL))
    }
    expd = -log10( seq_len(nobs)/nobs )
    obs[obs==-Inf] = reallysmall
    obs[obs==Inf] = reallybig
    pdf(outpdf)
    # make sure the device is closed even if plotting stops with an error
    on.exit(dev.off(), add=TRUE)
    plot(expd,obs,pch=19,cex=1, main=descr, ...,
        xlab=expression(Expected~~-log[10](italic(p))),
        ylab=expression(Observed~~-log[10](italic(p))),
        xlim=c(0,max(expd)), ylim=c(0,max(obs)))
    # identity line: where the points fall when observed matches expected
    lines(expd,expd,col="red")
    grid(col = "lightgray", lty = "dotted")
    invisible(NULL)
}
204
smearPlot = function(myDGEList,deTags, outSmear, outMain)
{
    # Save an edgeR smear plot of myDGEList to the PDF outSmear.
    # deTags is handed to plotSmear as de.tags and outMain becomes the plot title.
    pdf(file=outSmear)
    plotSmear(object=myDGEList, main=outMain, de.tags=deTags)
    grid(lty="dotted", col="lightgray")
    dev.off()
}
212
boxPlot = function(rawrs,cleanrs,maint,myTitle,pdfname)
{
    # Write two PDFs describing per-sample count distributions:
    #   pdfname                       : side by side boxplots of rawrs and cleanrs
    #   sample_counts_histogram.pdf   : one histogram of rawrs per sample on a square grid
    #   rawrs, cleanrs : matrices with one column per sample; axis labels describe them as log counts
    #   maint          : description of the normalisation, used in the second boxplot title
    #   myTitle        : accepted for interface compatibility; not used in this function
    # Summaries of both matrices are printed before plotting. Returns NULL invisibly.
    fullnames = colnames(rawrs)
    colnames(rawrs) = substr(colnames(rawrs),1,20)
    colnames(cleanrs) = substr(colnames(cleanrs),1,20)
    print.noquote('@@@ Raw contig counts by sample:')
    print.noquote(summary(rawrs))
    print.noquote('@@@ Library size contig counts by sample:')
    print.noquote(summary(cleanrs))
    # The caller wraps this function in try(), so a failure must not leave a PDF device open:
    # on exit close every device that was opened in here and is still alive.
    devsBefore = dev.list()
    on.exit(for (d in setdiff(dev.list(), devsBefore)) dev.off(d), add=TRUE)
    # par() settings belong to the device they are set on, so they are applied after pdf()
    # and vanish with dev.off(); no save/restore of par on a default device is needed.
    pdf(pdfname)
    par(mfrow=c(1,2))
    boxplot(rawrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main='log2 raw counts')
    grid(col="lightgray",lty="dotted")
    boxplot(cleanrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('log2 counts after ',maint))
    grid(col="lightgray",lty="dotted")
    dev.off()
    pdfname = "sample_counts_histogram.pdf"
    nc = ncol(rawrs)
    print.noquote(paste('Using ncol rawrs=',nc))
    # smallest square grid that holds one panel per sample
    ncroot = ceiling(sqrt(nc))
    # common y limit: the tallest bin over all samples, so panels are comparable
    ymax = max(vapply(seq_len(nc), function(i) {
        max(hist(rawrs[,i],breaks=100,plot=F)\$counts)
    }, numeric(1)))
    ncols = length(fullnames)
    if (ncols > 20) {
        # enlarge the page in proportion to the number of samples
        scale = 7*ncols/20
        pdf(pdfname,width=scale,height=scale)
    } else {
        pdf(pdfname)
    }
    par(mfrow=c(ncroot,ncroot))
    for (i in seq_len(nc)) {
        hist(rawrs[,i], main=paste("Contig logcount",i), xlab='log raw count', col="maroon",
            breaks=100,sub=fullnames[i],cex=0.8,ylim=c(0,ymax))
    }
    dev.off()
    invisible(NULL)
}
262
cumPlot = function(rawrs,cleanrs,maint,myTitle)
{
    # Write Differential_rowsum_bar_charts.pdf: two stacked histograms of log10 row sums,
    # the upper one for rawrs (before filtering) and the lower one for cleanrs (after).
    #   maint   : description of the filter, used in both panel titles
    #   myTitle : shown as the subtitle of both panels
    # Both panels share the x range 0..max(log10(rawrs)). Returns NULL invisibly.
    pdfname = "Differential_rowsum_bar_charts.pdf"
    lrs = log(rawrs,10)
    lim = max(lrs)
    pdf(pdfname)
    # close the device even if a hist() call fails; par() below only affects this device,
    # so nothing has to be saved or restored on another device
    on.exit(dev.off(), add=TRUE)
    par(mfrow=c(2,1))
    hist(lrs,breaks=100,main=paste('Before:',maint),xlab="# Reads (log)",
        ylab="Count",col="maroon",sub=myTitle, xlim=c(0,lim),las=1)
    grid(col="lightgray", lty="dotted")
    lrs = log(cleanrs,10)
    hist(lrs,breaks=100,main=paste('After:',maint),xlab="# Reads (log)",
        ylab="Count",col="maroon",sub=myTitle,xlim=c(0,lim),las=1)
    grid(col="lightgray", lty="dotted")
    invisible(NULL)
}
281
cumPlot1 = function(rawrs,cleanrs,maint,myTitle)
{
    # Empirical cumulative distribution of row sums before (rawrs) and after (cleanrs) filtering,
    # drawn as two stacked panels on a log x axis. The PDF name is myTitle with spaces removed
    # followed by _RowsumCum.pdf.
    outName = paste(gsub(" ","", myTitle , fixed=TRUE),"RowsumCum.pdf",sep='_')
    pdf(outName)
    par(mfrow=c(2,1))
    xTop = max(rawrs)
    # knots() gives the distinct values at which each ecdf steps
    rawKnots = knots(ecdf(rawrs))
    cleanKnots = knots(ecdf(cleanrs))
    cleanProp = 1:length(cleanKnots)/length(cleanKnots)
    rawProp = 1:length(rawKnots)/length(rawKnots)
    plot(rawKnots,rawProp,type='l',main=paste('Before',maint),xlab="Log Contig Total Reads",
        ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,xTop),sub=myTitle)
    grid(col="blue")
    plot(cleanKnots,cleanProp,type='l',main=paste('After',maint),xlab="Log Contig Total Reads",
        ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,xTop),sub=myTitle)
    grid(col="blue")
    dev.off()
}
300
301
302
doGSEAold = function(y=NULL,design=NULL,histgmt="",
    bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
    ntest=0, myTitle="myTitle", outfname="GSEA.xls", minnin=5, maxnin=2000,fdrthresh=0.05,fdrtype="BH")
{
    # Older gene set test: one camera() call per gene set, results split by direction.
    #   y, design       : passed to camera(); rownames(y) are matched (upper-cased) against gene symbols
    #   histgmt, bigmt  : paths of tab separated gene set files (id, url, gene_1 .. gene_n); "" to skip
    #   ntest           : accepted for interface compatibility; every gene set read is tested
    #   minnin, maxnin  : a set is tested only when more than minnin and fewer than maxnin of its genes are in y
    #   fdrthresh       : adjusted p value cutoff used to decide how many top sets are echoed
    #   fdrtype         : p.adjust method
    # Output: a title line in outfname, camera_up_<outfname> and camera_down_<outfname>, and a
    # log of the top sets in Camera.log. Returns NULL invisibly.
    sink('Camera.log')
    # always give the console back, also when camera() or a file read fails
    on.exit(sink(), add=TRUE)
    genesets = c()
    if (bigmt > "") { genesets = readLines(bigmt) }
    # concatenate; rbind() on two character vectors would build a recycled two row matrix
    if (histgmt > "") { genesets = c(genesets,readLines(histgmt)) }
    print.noquote(paste("@@@read",length(genesets), 'genesets from',histgmt,bigmt))
    write(paste(myTitle,'edgeR GSEA'),file=outfname,append=F)
    if (length(genesets) == 0) {
        print.noquote('@@@ no gene sets to test')
        return(invisible(NULL))
    }
    genesets = strsplit(genesets,'\t') # tabular. genesetid\tURLorwhatever\tgene_1\t..\tgene_n
    urownames = toupper(rownames(y))
    upcam = c()
    downcam = c()
    for (gs in genesets) {
        if (length(gs) < 3) { next } # no member genes on this line
        g = gs[1] # geneset_id
        u = gs[2]
        if (u > "") { u = paste("<a href=\'",u,"\'>",u,"</a>",sep="") }
        glist = toupper(gs[3:length(gs)]) # member gene symbols
        inglist = urownames %in% glist
        nin = sum(inglist)
        if ((nin > minnin) && (nin < maxnin)) {
            camres = camera(y=y,index=inglist,design=design)
            if (! is.null(camres)) {
                rownames(camres) = g # gene set name
                camres = cbind(GeneSet=g,URL=u,camres)
                if (camres\$Direction == "Up") {
                    upcam = rbind(upcam,camres)
                } else {
                    downcam = rbind(downcam,camres)
                }
            }
        }
    }
    # sort, adjust, save and echo one direction; tolerates a direction with no gene sets
    reportCam = function(cam,label) {
        if (is.null(cam)) {
            print.noquote(paste('@@@@@ Camera found no',label,'gene sets'))
            return(invisible(NULL))
        }
        cam = cam[order(cam\$PValue),]
        cam\$adjPValue = p.adjust(cam\$PValue,method=fdrtype)
        ntop = max(10,sum((cam\$adjPValue < fdrthresh)))
        write.table(cam,file=paste(paste('camera',label,sep='_'),outfname,sep='_'),quote=F,sep='\t',row.names=F)
        print.noquote(paste('@@@@@ Camera',label,'top',ntop,'gene sets:'))
        write.table(head(cam,ntop),file="",quote=F,sep='\t',row.names=F)
        invisible(NULL)
    }
    reportCam(upcam,'up')
    reportCam(downcam,'down')
    invisible(NULL)
}
371
372
373
374
doGSEA = function(y=NULL,design=NULL,histgmt="",
    bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
    ntest=0, myTitle="myTitle", outfname="GSEA.xls", minnin=5, maxnin=2000,fdrthresh=0.05,fdrtype="BH")
{
    # Gene set test using a single camera() call over all eligible gene sets.
    #   y, design       : passed to camera(); rownames(y) are matched (upper-cased) against gene symbols
    #   histgmt, bigmt  : paths of tab separated gene set files (id, url, gene_1 .. gene_n); "" to skip
    #   ntest           : accepted for interface compatibility; every gene set read is considered
    #   minnin, maxnin  : a set is tested only when more than minnin and fewer than maxnin of its genes are in y
    #   fdrthresh       : adjusted p value cutoff used to decide how many top sets are echoed
    #   fdrtype         : p.adjust method
    # Output: a title line in outfname, camera_up_<outfname> and camera_down_<outfname>, and a
    # log of the top sets in Camera.log. Returns NULL invisibly.
    sink('Camera.log')
    # always give the console back, also when camera() or a file read fails
    on.exit(sink(), add=TRUE)
    genesets = c()
    if (bigmt > "") { genesets = readLines(bigmt) }
    # concatenate; rbind() on two character vectors would build a recycled two row matrix
    if (histgmt > "") { genesets = c(genesets,readLines(histgmt)) }
    print.noquote(paste("@@@read",length(genesets), 'genesets from',histgmt,bigmt))
    write(paste(myTitle,'edgeR GSEA'),file=outfname,append=F)
    if (length(genesets) == 0) {
        print.noquote('@@@ no gene sets to test')
        return(invisible(NULL))
    }
    genesets = strsplit(genesets,'\t') # tabular. genesetid\tURLorwhatever\tgene_1\t..\tgene_n
    urownames = toupper(rownames(y))
    incam = list() # one logical row selector per eligible gene set, named by gene set id
    urls = c()     # html link per eligible gene set, named by gene set id
    for (gs in genesets) {
        if (length(gs) < 3) { next } # no member genes on this line
        gsid = gs[1] # geneset_id
        url = gs[2]
        if (url > "") { url = paste("<a href=\'",url,"\'>",url,"</a>",sep="") }
        glist = toupper(gs[3:length(gs)]) # member gene symbols
        inglist = urownames %in% glist
        nin = sum(inglist)
        if ((nin > minnin) && (nin < maxnin)) {
            incam[[gsid]] = inglist
            urls[gsid] = url
        }
    }
    if (length(incam) == 0) {
        print.noquote('@@@ no gene set passed the size filter')
        return(invisible(NULL))
    }
    allcam = camera(y=y,index=incam,design=design)
    # camera may return the sets in a different order than supplied, so ids and
    # links are looked up through the row names instead of being bound by position
    setids = rownames(allcam)
    allcamres = cbind(geneset=setids,allcam,URL=unname(urls[setids]))
    isup = (allcamres\$Direction == "Up")
    # sort, adjust, save and echo one direction
    reportCam = function(cam,label) {
        cam = cam[order(cam\$PValue),]
        cam\$adjPValue = p.adjust(cam\$PValue,method=fdrtype)
        ntop = max(10,sum((cam\$adjPValue < fdrthresh)))
        write.table(cam,file=paste(paste('camera',label,sep='_'),outfname,sep='_'),quote=F,sep='\t',row.names=F)
        print.noquote(paste('@@@@@ Camera',label,'top',ntop,'gene sets:'))
        write.table(head(cam,ntop),file="",quote=F,sep='\t',row.names=F)
        invisible(NULL)
    }
    reportCam(allcamres[which(isup), , drop=FALSE],'up')
    reportCam(allcamres[which(!isup), , drop=FALSE],'down')
    invisible(NULL)
}
455
456
457 edgeIt = function (Count_Matrix=c(),group=c(),out_edgeR=F,out_Voom=F,out_DESeq2=F,fdrtype='fdr',priordf=5,
458 fdrthresh=0.05,outputdir='.', myTitle='Differential Counts',libSize=c(),useNDF=F,
459 filterquantile=0.2, subjects=c(),TreatmentName="Rx",ControlName="Ctrl",mydesign=NULL,
460 doDESeq2=T,doVoom=T,doCamera=T,doedgeR=T,org='hg19',
461 histgmt="", bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
462 doCook=F,DESeq_fitType="parameteric",robust_meth='ordinary')
463 {
464
465 logf = file('Differential.log', open = "a")
466 sink(logf,type = c("output", "message"))
467
468
run_edgeR = function(workCM,pdata,subjects,group,priordf,robust_meth,mydesign,mt,cmrowsums,out_edgeR,nonzerod)
{
    # Fit the edgeR GLM on workCM, write the sorted result table to out_edgeR and produce the
    # edgeR diagnostic PDFs (goodness of fit, BCV, top 100 heatmap, smear plot, QQ plot).
    #   workCM      : filtered count matrix, contigs in rows
    #   group       : treatment label per column of workCM
    #   priordf     : prior.df for the tagwise dispersion; values <= 0 leave the edgeR default
    #   robust_meth : 'ordinary' or a residual.type for estimateGLMRobustDisp
    #   mydesign    : design matrix; the last coefficient is the one tested
    #   mt          : title fragment used in the PDF file names
    #   cmrowsums   : total reads per row of workCM
    #   pdata, subjects, nonzerod : accepted but not used in this function
    # fdrtype, fdrthresh, myTitle, TName, CName, allgenes and contigurls are not parameters;
    # they are resolved from the enclosing function.
    # Returns list(myDGEList=, edgeRcounts=) where edgeRcounts flags the rows of allgenes
    # that pass fdrthresh.
    logf = file('edgeR.log', open = "a")
    sink(logf,type = c("output", "message"))
    # restore the previous sink and release the log connection on every exit path
    on.exit({ sink(); close(logf) }, add=TRUE)
    # Setup myDGEList object
    myDGEList = DGEList(counts=workCM, group = group)
    myDGEList = calcNormFactors(myDGEList)
    if (robust_meth == 'ordinary') {
        myDGEList = estimateGLMCommonDisp(myDGEList,mydesign)
        myDGEList = estimateGLMTrendedDisp(myDGEList,mydesign)
        if (priordf > 0) {
            myDGEList = estimateGLMTagwiseDisp(myDGEList,mydesign,prior.df = priordf)
        } else {
            myDGEList = estimateGLMTagwiseDisp(myDGEList,mydesign)
        }
        comdisp = myDGEList\$common.dispersion
        estpriorn = getPriorN(myDGEList)
        print(paste("Common Dispersion =",comdisp,"CV = ",sqrt(comdisp),"getPriorN = ",estpriorn),quote=F)
    } else {
        myDGEList = estimateGLMRobustDisp(myDGEList,design=mydesign, prior.df = priordf, maxit = 6, residual.type = robust_meth)
    }

    DGLM = glmFit(myDGEList,design=mydesign)
    DE = glmLRT(DGLM,coef=ncol(DGLM\$design)) # always last one - subject is first if needed
    normData = cpm(myDGEList)
    adjp = p.adjust(DE\$table\$PValue, method=fdrtype)
    porder = order(DE\$table\$PValue)
    uoutput = cbind(
        Name=as.character(rownames(myDGEList\$counts)),
        DE\$table,
        adj.p.value=adjp,
        Dispersion=myDGEList\$tagwise.dispersion,totreads=cmrowsums,normData,
        myDGEList\$counts
    )
    soutput = uoutput[porder,] # sorted into p value order - for quick toptable
    goodness = gof(DGLM, pcutoff=fdrthresh)
    if (sum(goodness\$outlier) > 0) {
        print.noquote('GLM outliers:')
        print(paste(rownames(DGLM)[(goodness\$outlier)],collapse=','),quote=F)
    } else {
        print('No GLM fit outlier genes found\n')
    }
    z = limma::zscoreGamma(goodness\$gof.statistic, shape=goodness\$df/2, scale=2)
    pdf(paste("edgeR",mt,"GoodnessofFit.pdf",sep='_'))
    qq = qqnorm(z, panel.first=grid(), main="tagwise dispersion")
    abline(0,1,lwd=3)
    points(qq\$x[goodness\$outlier],qq\$y[goodness\$outlier], pch=16, col="maroon")
    dev.off()
    write.table(soutput,file=out_edgeR, quote=FALSE, sep="\t",row.names=F)
    tt = cbind(
        Name=as.character(rownames(myDGEList)),
        DE\$table,
        adj.p.value=adjp,
        Dispersion=myDGEList\$tagwise.dispersion,totreads=cmrowsums
    )
    tt = cbind(tt,URL=contigurls) # add to end so table isn't laid out strangely
    stt = tt[porder,]
    print.noquote("@@ edgeR Top tags\n")
    # head() copes with fewer than 50 rows; [1:50,] would pad the listing with NA rows
    print.noquote(head(stt,50))
    deTags = rownames(uoutput)[which(uoutput\$adj.p.value < fdrthresh)]
    nsig = length(deTags)
    print.noquote(paste('@@',nsig,'tags significant at adj p=',fdrthresh))
    pdf(paste("edgeR",mt,"BCV_vs_abundance.pdf",sep="_"))
    plotBCV(myDGEList, cex=0.3, main="Biological CV vs abundance")
    dev.off()
    outpdfname= paste("edgeR",mt,"top_100_heatmap.pdf",sep="_")
    # up to 100 top ranked rows; [c(1:100),] fails when fewer rows are available
    ocpm = head(normData[porder, , drop=FALSE],100)
    hmap2(ocpm,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste(myTitle,'Heatmap'))
    outSmear = paste("edgeR",mt,"smearplot.pdf",sep="_")
    outMain = paste("Smear Plot for ",TName,' Vs ',CName,' (FDR@',fdrthresh,' N = ',nsig,')',sep='')
    smearPlot(myDGEList=myDGEList,deTags=deTags, outSmear=outSmear, outMain = outMain)
    qqPlot(descr=paste(myTitle,'edgeR adj p QQ plot'),pvector=tt\$adj.p.value,outpdf=paste('edgeR',mt,'qqplot.pdf',sep='_'))
    topresults.edgeR = soutput[which(soutput\$adj.p.value < fdrthresh), ]
    edgeRcountsindex = which(allgenes %in% rownames(topresults.edgeR))
    edgeRcounts = rep(0, length(allgenes))
    edgeRcounts[edgeRcountsindex] = 1 # Create venn diagram of hits
    return(list(myDGEList=myDGEList,edgeRcounts=edgeRcounts))
} # run_edgeR
548
549
run_DESeq2 = function(workCM,pdata,subjects,group,out_DESeq2,mt,DESeq_fitType)
{
    # Run the DESeq2 Wald test on workCM, write the sorted result table to out_DESeq2 and
    # produce the DESeq2 PDFs (QQ plot, dispersion estimates, MA plot, sample distances, PCA).
    #   workCM        : filtered count matrix, contigs in rows
    #   pdata         : ignored on input; the sample table is rebuilt here from workCM and group
    #   subjects      : subject label per sample; when empty the design is ~ Rx, else ~ subjects + Rx
    #   group         : treatment label per column of workCM
    #   mt            : title fragment used in the PDF file names
    #   DESeq_fitType : fitType handed to estimateDispersions
    # fdrthresh, myTitle, cmrowsums, allgenes and contigurls are not parameters; they are
    # resolved from the enclosing function.
    # Returns a 0/1 vector flagging the rows of allgenes whose padj is below fdrthresh.
    logf = file("DESeq2.log", open = "a")
    sink(logf,type = c("output", "message"))
    # restore the previous sink and release the log connection on every exit path
    on.exit({ sink(); close(logf) }, add=TRUE)
    # DESeq2
    require('DESeq2')
    library('RColorBrewer')
    if (length(subjects) == 0) {
        pdata = data.frame(Name=colnames(workCM),Rx=group,row.names=colnames(workCM))
        deSEQds = DESeqDataSetFromMatrix(countData = workCM, colData = pdata, design = formula(~ Rx))
    } else {
        pdata = data.frame(Name=colnames(workCM),Rx=group,subjects=subjects,row.names=colnames(workCM))
        deSEQds = DESeqDataSetFromMatrix(countData = workCM, colData = pdata, design = formula(~ subjects + Rx))
    }
    deSeqDatsizefac = estimateSizeFactors(deSEQds)
    deSeqDatdisp = estimateDispersions(deSeqDatsizefac,fitType=DESeq_fitType)
    resDESeq = nbinomWaldTest(deSeqDatdisp)
    rDESeq = as.data.frame(results(resDESeq))
    rDESeq = cbind(Contig=rownames(workCM),rDESeq,NReads=cmrowsums,URL=contigurls)
    srDESeq = rDESeq[order(rDESeq\$pvalue),]
    qqPlot(descr=paste(myTitle,'DESeq2 adj p qq plot'),pvector=rDESeq\$padj,outpdf=paste('DESeq2',mt,'qqplot.pdf',sep="_"))
    cat("# DESeq top 50\n")
    # head() copes with fewer than 50 rows; [1:50,] would pad the listing with NA rows
    print.noquote(head(srDESeq,50))
    write.table(srDESeq,file=out_DESeq2, quote=FALSE, sep="\t",row.names=F)
    topresults.DESeq = rDESeq[which(rDESeq\$padj < fdrthresh), ]
    DESeqcountsindex = which(allgenes %in% rownames(topresults.DESeq))
    DESeqcounts = rep(0, length(allgenes))
    DESeqcounts[DESeqcountsindex] = 1
    pdf(paste("DESeq2",mt,"dispersion_estimates.pdf",sep='_'))
    plotDispEsts(resDESeq)
    dev.off()
    # symmetric y range capped at 4; na.rm so a missing fold change cannot turn the limit into NA
    ysmall = abs(min(rDESeq\$log2FoldChange,na.rm=T))
    ybig = abs(max(rDESeq\$log2FoldChange,na.rm=T))
    ylimit = min(4,ysmall,ybig)
    pdf(paste("DESeq2",mt,"MA_plot.pdf",sep="_"))
    plotMA(resDESeq,main=paste(myTitle,"DESeq2 MA plot"),ylim=c(-ylimit,ylimit))
    dev.off()
    rlogres = rlogTransformation(resDESeq)
    sampledists = dist( t( assay(rlogres) ) )
    sdmat = as.matrix(sampledists)
    pdf(paste("DESeq2",mt,"sample_distance_plot.pdf",sep="_"))
    heatmap.2(sdmat,trace="none",main=paste(myTitle,"DESeq2 sample distances"),
        col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(255))
    dev.off()
    # the PCA plot is best effort: a failure is reported in the log and the run continues
    result = try( (ppca = plotPCA( varianceStabilizingTransformation(deSeqDatdisp,blind=T), intgroup=c("Rx","Name")) ) )
    if ("try-error" %in% class(result)) {
        print.noquote('DESeq2 plotPCA failed.')
    } else {
        pdf(paste("DESeq2",mt,"PCA_plot.pdf",sep="_"))
        # plotPCA returns a plot object; it is only drawn on the device when printed
        print(ppca)
        dev.off()
    }
    return(DESeqcounts)
}
608
609
run_Voom = function(workCM,pdata,subjects,group,mydesign,mt,out_Voom)
{
    # Run voom/limma on the count data, write the sorted result table to out_Voom and produce
    # the mean-variance and QQ plot PDFs.
    #   workCM   : filtered count matrix, contigs in rows
    #   group    : treatment label per column of workCM
    #   mydesign : design matrix; the last coefficient is the one tested
    #   mt       : title fragment used in the PDF file names
    #   pdata, subjects : accepted but not used in this function
    # doedgeR, fdrtype, fdrthresh, myTitle, cmrowsums, allgenes and contigurls are not
    # parameters; they are resolved from the enclosing function.
    # Returns a 0/1 vector flagging the rows of allgenes whose adjusted p value is below fdrthresh.
    logf = file('VOOM.log', open = "a")
    sink(logf,type = c("output", "message"))
    # restore the previous sink and release the log connection on every exit path
    on.exit({ sink(); close(logf) }, add=TRUE)
    if (doedgeR == F) {
        # Setup myDGEList object; when edgeR has been run, myDGEList is not built here and
        # is taken from the enclosing function instead
        myDGEList = DGEList(counts=workCM, group = group)
        myDGEList = calcNormFactors(myDGEList)
        myDGEList = estimateGLMCommonDisp(myDGEList,mydesign)
        myDGEList = estimateGLMTrendedDisp(myDGEList,mydesign)
        myDGEList = estimateGLMTagwiseDisp(myDGEList,mydesign)
    }
    pdf(paste("VOOM",mt,"mean_variance_plot.pdf",sep='_'))
    # method name spelled out in full instead of relying on partial matching of "quantil"
    dat.voomed <- voom(myDGEList, mydesign, plot = TRUE, normalize.method="quantile", lib.size = NULL)
    dev.off()
    # Use limma to fit data
    fit = lmFit(dat.voomed, mydesign)
    fit = eBayes(fit)
    rvoom = topTable(fit, coef = length(colnames(mydesign)), adj = fdrtype, n = Inf, sort="none")
    qqPlot(descr=paste(myTitle,'VOOM-limma adj p QQ plot'),pvector=rvoom\$adj.P.Val,outpdf=paste('VOOM',mt,'qqplot.pdf',sep='_'))
    rownames(rvoom) = rownames(workCM)
    rvoom = cbind(Contig=rownames(workCM),rvoom,NReads=cmrowsums,URL=contigurls)
    srvoom = rvoom[order(rvoom\$P.Value),]
    cat("# VOOM top 50\n")
    # head() copes with fewer than 50 rows; [1:50,] would pad the listing with NA rows
    print(head(srvoom,50))
    write.table(srvoom,file=out_Voom, quote=FALSE, sep="\t",row.names=F)
    # Use an FDR cutoff to find interesting samples for edgeR, DESeq and voom/limma
    topresults.voom = rvoom[which(rvoom\$adj.P.Val < fdrthresh), ]
    voomcountsindex <- which(allgenes %in% rownames(topresults.voom))
    voomcounts = rep(0, length(allgenes))
    voomcounts[voomcountsindex] = 1
    return(voomcounts)
}
644
645
646 #### data cleaning and analsis control starts here
647
648
649 # Error handling
650 nugroup = length(unique(group))
651 if (nugroup!=2){
652 print("Number of conditions identified in experiment does not equal 2")
653 q()
654 }
655 require(edgeR)
656 options(width = 512)
657 mt = paste(unlist(strsplit(myTitle,'_')),collapse=" ")
658 allN = nrow(Count_Matrix)
659 nscut = round(ncol(Count_Matrix)/2) # half samples
660 colTotmillionreads = colSums(Count_Matrix)/1e6
661 counts.dataframe = as.data.frame(c())
662 rawrs = rowSums(Count_Matrix)
663 nonzerod = Count_Matrix[(rawrs > 0),] # remove all zero count genes
664 nzN = nrow(nonzerod)
665 nzrs = rowSums(nonzerod)
666 zN = allN - nzN
667 print('@@@ Quantiles for non-zero row counts:',quote=F)
668 print(quantile(nzrs,probs=seq(0,1,0.1)),quote=F)
669 if (useNDF == T)
670 {
671 gt1rpin3 = rowSums(Count_Matrix/expandAsMatrix(colTotmillionreads,dim(Count_Matrix)) >= 1) >= nscut
672 lo = colSums(Count_Matrix[!gt1rpin3,])
673 workCM = Count_Matrix[gt1rpin3,]
674 cleanrs = rowSums(workCM)
675 cleanN = length(cleanrs)
676 meth = paste( "After removing",length(lo),"contigs with fewer than ",nscut," sample read counts >= 1 per million, there are",sep="")
677 print(paste("Read",allN,"contigs. Removed",zN,"contigs with no reads.",meth,cleanN,"contigs"),quote=F)
678 maint = paste('Filter >=1/million reads in >=',nscut,'samples')
679 } else {
680 useme = (nzrs > quantile(nzrs,filterquantile))
681 workCM = nonzerod[useme,]
682 lo = colSums(nonzerod[!useme,])
683 cleanrs = rowSums(workCM)
684 cleanN = length(cleanrs)
685 meth = paste("After filtering at count quantile =",filterquantile,", there are",sep="")
686 print(paste('Read',allN,"contigs. Removed",zN,"with no reads.",meth,cleanN,"contigs"),quote=F)
687 maint = paste('Filter below',filterquantile,'quantile')
688 }
689 cumPlot(rawrs=rawrs,cleanrs=cleanrs,maint=maint,myTitle=myTitle)
690 allgenes = rownames(workCM)
691 reg = "^chr([0-9]+):([0-9]+)-([0-9]+)" # ucsc chr:start-end regexp
692 genecards="<a href=\'http://www.genecards.org/index.php?path=/Search/keyword/"
693 ucsc = paste("<a href=\'http://genome.ucsc.edu/cgi-bin/hgTracks?db=",org,sep='')
694 testreg = str_match(allgenes,reg)
695 if (sum(!is.na(testreg[,1]))/length(testreg[,1]) > 0.8) # is ucsc style string
696 {
697 print("@@ using ucsc substitution for urls")
698 contigurls = paste0(ucsc,"&amp;position=chr",testreg[,2],":",testreg[,3],"-",testreg[,4],"\'>",allgenes,"</a>")
699 } else {
700 print("@@ using genecards substitution for urls")
701 contigurls = paste0(genecards,allgenes,"\'>",allgenes,"</a>")
702 }
703 print.noquote(paste("@@ Total low count contigs per sample = ",paste(table(lo),collapse=',')))
704 cmrowsums = rowSums(workCM)
705 TName=unique(group)[1]
706 CName=unique(group)[2]
707 if (is.null(mydesign)) {
708 if (length(subjects) == 0)
709 {
710 mydesign = model.matrix(~group)
711 }
712 else {
713 subjf = factor(subjects)
714 mydesign = model.matrix(~subjf+group) # we block on subject so make group last to simplify finding it
715 }
716 }
717 print.noquote(paste('Using samples:',paste(colnames(workCM),collapse=',')))
718 print.noquote('Using design matrix:')
719 print.noquote(mydesign)
720 normData = cpm(workCM)*1e6
721 colnames(normData) = paste( colnames(workCM),'N',sep="_")
722 print(paste('Raw sample read totals',paste(colSums(nonzerod,na.rm=T),collapse=',')))
723
724 if (doedgeR == T) {
725 eres = run_edgeR(workCM,pdata,subjects,group,priordf,robust_meth,mydesign,mt,cmrowsums,out_edgeR,nonzerod)
726 myDGEList = eres\$myDGEList
727 edgeRcounts = eres\$edgeRcounts
728 #### Plot MDS
729 sample_colors = match(group,levels(group))
730 sampleTypes = levels(factor(group))
731 print.noquote(sampleTypes)
732 pdf(paste("edgeR",mt,"MDSplot.pdf",sep='_'))
733 plotMDS.DGEList(myDGEList,main=paste("MDS for",myTitle),cex=0.5,col=sample_colors,pch=sample_colors)
734 legend(x="topleft", legend = sampleTypes,col=c(1:length(sampleTypes)), pch=19)
735 grid(col="blue")
736 dev.off()
737 scale <- myDGEList\$samples\$lib.size*myDGEList\$samples\$norm.factors
738 normCounts <- round(t(t(myDGEList\$counts)/scale)*mean(scale))
739 try({boxPlot(rawrs=nzd,cleanrs=log2(normCounts+1),maint='Effects of TMM size normalisation',myTitle=myTitle,pdfname=paste("edgeR",mt,"raw_norm_counts_box.pdf",sep='_'))},T)
740 }
741 if (doDESeq2 == T) { DESeqcounts = run_DESeq2(workCM,pdata,subjects,group,out_DESeq2,mt,DESeq_fitType) }
742 if (doVoom == T) { voomcounts = run_Voom(workCM,pdata,subjects,group,mydesign,mt,out_Voom) }
743
744
745 if (doCamera) {
746 doGSEA(y=myDGEList,design=mydesign,histgmt=histgmt,bigmt=bigmt,ntest=20,myTitle=myTitle,
747 outfname=paste("GSEA_Camera",mt,"table.xls",sep="_"),fdrthresh=fdrthresh,fdrtype=fdrtype)
748 }
749 counts.dataframe = c()
750 vennmain = 'no venn'
751 if ((doDESeq2==T) || (doVoom==T) || (doedgeR==T)) {
752 if ((doVoom==T) && (doDESeq2==T) && (doedgeR==T)) {
753 vennmain = paste(mt,'Voom,edgeR and DESeq2 overlap at FDR=',fdrthresh)
754 counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts,
755 VOOM_limma = voomcounts, row.names = allgenes)
756 } else if ((doDESeq2==T) && (doedgeR==T)) {
757 vennmain = paste(mt,'DESeq2 and edgeR overlap at FDR=',fdrthresh)
758 counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts, row.names = allgenes)
759 } else if ((doVoom==T) && (doedgeR==T)) {
760 vennmain = paste(mt,'Voom and edgeR overlap at FDR=',fdrthresh)
761 counts.dataframe = data.frame(edgeR = edgeRcounts, VOOM_limma = voomcounts, row.names = allgenes)
762 }
763
764 if (nrow(counts.dataframe > 1)) {
765 counts.venn = vennCounts(counts.dataframe)
766 vennf = paste("Differential_venn",mt,"significant_genes_overlap.pdf",sep="_")
767 pdf(vennf)
768 vennDiagram(counts.venn,main=vennmain,col="maroon")
769 dev.off()
770 }
771 } #### doDESeq2 or doVoom
772 sink()
773 }
774 #### Done
775 ]]>
776 builtin_gmt = "" # driver section: bind tool parameters to R variables, prepare the count matrix, then call edgeIt; this empty value is passed to edgeIt as bigmt
777 history_gmt = "" # passed to edgeIt as histgmt
778 history_gmt_name = "" # not referenced again in this section of the script
779 out_edgeR = F # default; replaced by the output path below when edgeR is selected
780 out_DESeq2 = F # default; replaced by the output path below when DESeq2 is selected
781 out_Voom = "$out_VOOM" # assigned unconditionally here and once more in the doVoom branch below
782 edgeR_robust_meth = "ordinary" # default; replaced below when edgeR is selected
783 doDESeq2 = $DESeq2.doDESeq2
784 doVoom = $doVoom
785 doCamera = F # the camera step is switched off in this script
786 doedgeR = $edgeR.doedgeR
787 edgeR_priordf = 10 # default; replaced below when edgeR is selected
788
789
790 #if $doVoom == "T":
791 out_Voom = "$out_VOOM"
792 #end if
793
794 #if $DESeq2.doDESeq2 == "T":
795 out_DESeq2 = "$out_DESeq2"
796 doDESeq2 = T
797 DESeq_fitType = "$DESeq2.DESeq_fitType"
798 #end if
799
800 #if $edgeR.doedgeR == "T":
801 out_edgeR = "$out_edgeR"
802 edgeR_priordf = $edgeR.edgeR_priordf
803 edgeR_robust_meth = "$edgeR.edgeR_robust_method"
804 #end if
805
806
807 if (sum(c(doedgeR,doVoom,doDESeq2)) == 0) # the three flags sum to zero only when no method was selected
808 {
809 write("No methods chosen - nothing to do! Please try again after choosing one or more methods", stderr())
810 quit(save="no",status=2) # stop with exit status 2 after reporting on stderr
811 }
812
813 Out_Dir = "$html_file.files_path"
814 Input = "$input1"
815 TreatmentName = "$treatment_name"
816 TreatmentCols = "$Treat_cols"
817 ControlName = "$control_name"
818 ControlCols= "$Control_cols"
819 org = "$input1.dbkey"
820 if (org == "") { org = "hg19"} # default build used when the dbkey renders as an empty string
821 fdrtype = "$fdrtype"
822 fdrthresh = $fdrthresh
823 useNDF = $useNDF
824 fQ = $fQ # non-differential centile cutoff
825 myTitle = "$title"
826 sids = strsplit("$subjectids",',') # comma separated subject ids; an empty string gives no subjects
827 subjects = unlist(sids)
828 nsubj = length(subjects)
829 TCols = as.numeric(strsplit(TreatmentCols,",")[[1]])-1 # shifted down by one, presumably since read.table below uses column 1 as row names
830 CCols = as.numeric(strsplit(ControlCols,",")[[1]])-1 # same shift as TCols
831 cat('Got TCols=')
832 cat(TCols)
833 cat('; CCols=')
834 cat(CCols)
835 cat('\n')
836 <![CDATA[
837 useCols = c(TCols,CCols) # treatment columns first, then control columns
838 if (file.exists(Out_Dir) == F) dir.create(Out_Dir)
839 Count_Matrix = read.table(Input,header=T,row.names=1,sep='\t') # first column supplies the row names
840 snames = colnames(Count_Matrix)
841 nsamples = length(snames)
842 if (nsubj > 0 & nsubj != nsamples) { # a non-empty subject list must have one entry per sample column
843 options("show.error.messages"=T)
844 mess = paste('Fatal error: Supplied subject id list',paste(subjects,collapse=','),
845 'has length',nsubj,'but there are',nsamples,'samples',paste(snames,collapse=','))
846 write(mess, stderr())
847 quit(save="no",status=4) # stop with exit status 4 after reporting on stderr
848 }
849 if (length(subjects) != 0) {subjects = subjects[useCols]} # keep only the ids of the selected columns, in the same order
850 Count_Matrix = Count_Matrix[,useCols] ### reorder columns
851 rn = rownames(Count_Matrix)
852 islib = rn %in% c('librarySize','NotInBedRegions') # flag rows carrying these two names so they can be split off
853 LibSizes = Count_Matrix[subset(rn,islib),][1] # take first; LibSizes is not referenced again in this section
854 Count_Matrix = Count_Matrix[subset(rn,! islib),] # drop the flagged rows
855 group = c(rep(TreatmentName,length(TCols)), rep(ControlName,length(CCols)) ) # one label per selected column, matching the useCols order
856 group = factor(group, levels=c(ControlName,TreatmentName)) # control is the first factor level
857 colnames(Count_Matrix) = paste(group,colnames(Count_Matrix),sep="_") # prefix each sample name with its group label
858 results = edgeIt(Count_Matrix=Count_Matrix,group=group, out_edgeR=out_edgeR, out_Voom=out_Voom, out_DESeq2=out_DESeq2,
859 fdrtype='BH',mydesign=NULL,priordf=edgeR_priordf,fdrthresh=fdrthresh,outputdir='.', # NOTE(review): fdrtype is a literal here; the fdrtype variable bound above is not passed - confirm this is intended
860 myTitle=myTitle,useNDF=F,libSize=c(),filterquantile=fQ,subjects=subjects,TreatmentName=TreatmentName,ControlName=ControlName, # NOTE(review): useNDF is a literal here; the useNDF variable bound above is not passed - confirm this is intended
861 doDESeq2=doDESeq2,doVoom=doVoom,doCamera=doCamera,doedgeR=doedgeR,org=org,
862 histgmt=history_gmt,bigmt=builtin_gmt,DESeq_fitType=DESeq_fitType,robust_meth=edgeR_robust_meth) # DESeq_fitType is only assigned above when DESeq2 is selected
863 sessionInfo()
864
865 sink()
866 ]]>
867 </configfile>
868 </configfiles>
869 <help>
870
871 **What it does**
872
873 Allows short read sequence counts from controlled experiments to be analysed for differentially expressed genes.
874 Optionally adds a term for subject if not all samples are independent or if some other factor needs to be blocked in the design.
875
876 **Input**
877
878 Requires a count matrix as a tabular file. These are best made using the companion HTSeq_ based counter Galaxy wrapper
879 and your preferred gene model to generate inputs. Each row is a genomic feature (e.g. a gene or exon) and each column the
880 non-negative integer count of reads from one sample overlapping the feature.
881
882 The matrix must have a header row uniquely identifying the source samples, and unique row names in
883 the first column. Typically the row names are gene symbols or probe ids for downstream use in GSEA and other methods.
884 They must be unique and valid R names or they will be mangled - please read the fine R docs for the rules on identifiers.
885
886 **Specifying comparisons**
887
888 This is basically dumbed down for a single factor with two levels - case vs control.
889
890 More complex interfaces are possible but painful at present.
891 Probably need to specify a phenotype file to do this better.
892 Work in progress. Send code.
893
894 If you have (eg) paired samples and wish to include a term in the GLM to account for some other factor (subject in the case of paired samples),
895 put a comma separated list of indicators for every sample (whether modelled or not!) indicating (eg) the subject number or other blocking factor.
896 Supply a list of integers, one for each sample, or an empty string if samples are all independent.
897 If not empty, there must be exactly as many integers in the supplied integer list as there are columns (samples) in the count matrix.
898 Integers for samples that are not in the analysis *must* be present in the string as filler even if not used.
899
900 So if you have 2 pairs out of 6 samples, you need to put in unique integers for the unpaired ones
901 eg if you had 6 samples with the first two independent but the second and third pairs each being from independent subjects, you might use
902 8,9,1,1,2,2
903 as subject IDs to indicate two paired samples from the same subject in columns 3/4 and 5/6
904
905 **Methods available**
906
907 You can run 3 popular Bioconductor packages available for count data.
908
909 edgeR - see edgeR_ for details
910
911 VOOM/limma - see limma_VOOM_ for details
912
913 DESeq2 - see DESeq2_ for details
914
915 and optionally camera in edgeR which works better if MSigDB is installed.
916
917 **Outputs**
918
919 Some helpful plots and analysis results. Note that most of these are produced using R code
920 suggested by the excellent documentation and vignettes for the Bioconductor
921 packages invoked. The Tool Factory is used to automatically lay these out for you to enjoy.
922
923 **Note on Voom**
924
925 The R help for voom in limma version 3.16.6 includes this from the authors - but you should read the paper to interpret this method.
926
927 This function is intended to process RNA-Seq or ChIP-Seq data prior to linear modelling in limma.
928
929 voom is an acronym for mean-variance modelling at the observational level.
930 The key concern is to estimate the mean-variance relationship in the data, then use this to compute appropriate weights for each observation.
931 Count data almost always show non-trivial mean-variance relationships. Raw counts show increasing variance with increasing count size, while log-counts typically show a decreasing mean-variance trend.
932 This function estimates the mean-variance trend for log-counts, then assigns a weight to each observation based on its predicted variance.
933 The weights are then used in the linear modelling process to adjust for heteroscedasticity.
934
935 In an experiment, a count value is observed for each tag in each sample. A tag-wise mean-variance trend is computed using lowess.
936 The tag-wise mean is the mean log2 count with an offset of 0.5, across samples for a given tag.
937 The tag-wise variance is the quarter-root-variance of normalized log2 counts per million values with an offset of 0.5, across samples for a given tag.
938 Tags with zero counts across all samples are not included in the lowess fit. Optional normalization is performed using normalizeBetweenArrays.
939 Using fitted values of log2 counts from a linear model fit by lmFit, variances from the mean-variance trend were interpolated for each observation.
940 This was carried out by approxfun. Inverse variance weights can be used to correct for mean-variance trend in the count data.
941
942
943 Author(s)
944
945 Charity Law and Gordon Smyth
946
947 References
948
949 Law, CW (2013). Precision weights for gene expression analysis. PhD Thesis. University of Melbourne, Australia.
950
951 Law, CW, Chen, Y, Shi, W, Smyth, GK (2013). Voom! Precision weights unlock linear model analysis tools for RNA-seq read counts.
952 Technical Report 1 May 2013, Bioinformatics Division, Walter and Eliza Hall Institute of Medical Research, Melbourne, Australia.
953 http://www.statsci.org/smyth/pubs/VoomPreprint.pdf
954
955 See Also
956
957 A voom case study is given in the edgeR User's Guide.
958
959 vooma is a similar function but for microarrays instead of RNA-seq.
960
961
962 ***old rant on changes to Bioconductor package variable names between versions***
963
964 The edgeR authors made a small cosmetic change in the name of one important variable (from p.value to PValue)
965 breaking this and all other code that assumed the old name for this variable,
966 between edgeR2.4.4 and 2.4.6 (the version for R 2.14 as at the time of writing).
967 This means that all code using edgeR is sensitive to the version. I think this was a very unwise thing
968 to do because it wasted hours of my time to track down and will similarly cost other edgeR users dearly
969 when their old scripts break. This tool currently works with 2.4.6.
970
971 **Note on prior.N**
972
973 http://seqanswers.com/forums/showthread.php?t=5591 says:
974
975 *prior.n*
976
977 The value for prior.n determines the amount of smoothing of tagwise dispersions towards the common dispersion.
978 You can think of it as like a "weight" for the common value. (It is actually the weight for the common likelihood
979 in the weighted likelihood equation). The larger the value for prior.n, the more smoothing, i.e. the closer your
980 tagwise dispersion estimates will be to the common dispersion. If you use a prior.n of 1, then that gives the
981 common likelihood the weight of one observation.
982
983 In answer to your question, it is a good thing to squeeze the tagwise dispersions towards a common value,
984 or else you will be using very unreliable estimates of the dispersion. I would not recommend using the value that
985 you obtained from estimateSmoothing()---this is far too small and would result in virtually no moderation
986 (squeezing) of the tagwise dispersions. How many samples do you have in your experiment?
987 What is the experimental design? If you have few samples (less than 6) then I would suggest a prior.n of at least 10.
988 If you have more samples, then the tagwise dispersion estimates will be more reliable,
989 so you could consider using a smaller prior.n, although I would hesitate to use a prior.n less than 5.
990
991
992 From Bioconductor Digest, Vol 118, Issue 5, Gordon writes:
993
994 Dear Dorota,
995
996 The important settings are prior.df and trend.
997
998 prior.n and prior.df are related through prior.df = prior.n * residual.df,
999 and your experiment has residual.df = 36 - 12 = 24. So the old setting of
1000 prior.n=10 is equivalent for your data to prior.df = 240, a very large
1001 value. Going the other way, the new setting of prior.df=10 is equivalent
1002 to prior.n=10/24.
1003
1004 To recover old results with the current software you would use
1005
1006 estimateTagwiseDisp(object, prior.df=240, trend="none")
1007
1008 To get the new default from old software you would use
1009
1010 estimateTagwiseDisp(object, prior.n=10/24, trend=TRUE)
1011
1012 Actually the old trend method is equivalent to trend="loess" in the new
1013 software. You should use plotBCV(object) to see whether a trend is
1014 required.
1015
1016 Note you could also use
1017
1018 prior.n = getPriorN(object, prior.df=10)
1019
1020 to map between prior.df and prior.n.
1021
1022 ----
1023
1024 **Attributions**
1025
1026 edgeR - edgeR_
1027
1028 VOOM/limma - limma_VOOM_
1029
1030 DESeq2 - DESeq2_ for details
1031
1032 See above for Bioconductor package documentation for packages exposed in Galaxy by this tool and app store package.
1033
1034 Galaxy_ (that's what you are using right now!) for gluing everything together
1035
1036 Otherwise, all code and documentation comprising this tool was written by Ross Lazarus and is
1037 licensed to you under the LGPL_ like other rgenetics artefacts
1038
1039 .. _LGPL: http://www.gnu.org/copyleft/lesser.html
1040 .. _HTSeq: http://www-huber.embl.de/users/anders/HTSeq/doc/index.html
1041 .. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html
1042 .. _DESeq2: http://www.bioconductor.org/packages/release/bioc/html/DESeq2.html
1043 .. _limma_VOOM: http://www.bioconductor.org/packages/release/bioc/html/limma.html
1044 .. _Galaxy: http://getgalaxy.org
1045 </help>
1046 <citations>
1047 <citation type="doi">10.1093/bioinformatics/btp616</citation> <!-- bare DOI: a "doi: " prefix is not part of the identifier and breaks DOI resolution -->
1048 </citations>
1049 </tool>