comparison rgedgeRpaired_nocamera.xml~ @ 45:6d9d2c9679ec draft

Uploaded
author fubar
date Mon, 23 Dec 2013 16:42:58 -0500
parents
children
comparison
equal deleted inserted replaced
44:bdb19fdbd679 45:6d9d2c9679ec
1 <tool id="rgDifferentialCount" name="Differential_Count" version="0.22">
2 <description>models using BioConductor packages</description>
3 <requirements>
4 <requirement type="package" version="3.0.1">r3</requirement>
5 <requirement type="package" version="1.3.18">graphicsmagick</requirement>
6 <requirement type="package" version="9.07">ghostscript</requirement>
7 <requirement type="package" version="2.12">biocbasics</requirement>
8 </requirements>
9
10 <command interpreter="python">
11 rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "DifferentialCounts"
12 --output_dir "$html_file.files_path" --output_html "$html_file" --make_HTML "yes"
13 </command>
14 <inputs>
15 <param name="input1" type="data" format="tabular" label="Select an input matrix - rows are contigs, columns are counts for each sample"
16 help="Use the HTSeq based count matrix preparation tool to create these matrices from BAM/SAM files and a GTF file of genomic features"/>
17 <param name="title" type="text" value="Differential Counts" size="80" label="Title for job outputs"
18 help="Supply a meaningful name here to remind you what the outputs contain">
19 <sanitizer invalid_char="">
20 <valid initial="string.letters,string.digits"><add value="_" /> </valid>
21 </sanitizer>
22 </param>
23 <param name="treatment_name" type="text" value="Treatment" size="50" label="Treatment Name"/>
24 <param name="Treat_cols" label="Select columns containing treatment." type="data_column" data_ref="input1" numerical="True"
25 multiple="true" use_header_names="true" size="120" display="checkboxes">
26 <validator type="no_options" message="Please select at least one column."/>
27 <sanitizer invalid_char="">
28 <valid initial="string.letters,string.digits"><add value="_" /> </valid>
29 </sanitizer>
30 </param>
31 <param name="control_name" type="text" value="Control" size="50" label="Control Name"/>
32 <param name="Control_cols" label="Select columns containing control." type="data_column" data_ref="input1" numerical="True"
33 multiple="true" use_header_names="true" size="120" display="checkboxes" optional="true">
34 <validator type="no_options" message="Please select at least one column."/>
35 <sanitizer invalid_char="">
36 <valid initial="string.letters,string.digits"><add value="_" /> </valid>
37 </sanitizer> <validator type="no_options" message="Please select at least one column."/>
38 <sanitizer invalid_char="">
39 <valid initial="string.letters,string.digits"><add value="_" /> </valid>
40 </sanitizer>
41
42 </param>
43 <param name="subjectids" type="text" optional="true" size="120" value = ""
44 label="IF SUBJECTS NOT ALL INDEPENDENT! Enter comma separated strings to indicate sample labels for (eg) pairing - must be one for every column in input"
45 help="Leave blank if no pairing, but eg if data from sample id A99 is in columns 2,4 and id C21 is in 3,5 then enter 'A99,C21,A99,C21'">
46 <sanitizer>
47 <valid initial="string.letters,string.digits"><add value="," /> </valid>
48 </sanitizer>
49 </param>
50 <param name="fQ" type="float" value="0.3" size="5" label="Non-differential contig count quantile threshold - zero to analyze all non-zero read count contigs"
51 help="May be a good or a bad idea depending on the biology and the question. EG 0.3 = sparsest 30% of contigs with at least one read are removed before analysis"/>
52 <param name="useNDF" type="boolean" truevalue="T" falsevalue="F" checked="false" size="1"
53 label="Non differential filter - remove contigs below a threshold (1 per million) for half or more samples"
54 help="May be a good or a bad idea depending on the biology and the question. This was the old default. Quantile based is available as an alternative"/>
55
56 <conditional name="edgeR">
57 <param name="doedgeR" type="select"
58 label="Run this model using edgeR"
59 help="edgeR uses a negative binomial model and seems to be powerful, even with few replicates">
60 <option value="F">Do not run edgeR</option>
61 <option value="T" selected="true">Run edgeR</option>
62 </param>
63 <when value="T">
64 <param name="edgeR_priordf" type="integer" value="20" size="3"
65 label="prior.df for tagwise dispersion - lower value = more emphasis on each tag's variance. Replaces prior.n and prior.df = prior.n * residual.df"
66 help="0 = Use edgeR default. Use a small value to 'smooth' small samples. See edgeR docs and note below"/>
67 </when>
68 <when value="F"></when>
69 </conditional>
70 <conditional name="DESeq2">
71 <param name="doDESeq2" type="select"
72 label="Run the same model with DESeq2 and compare findings"
73 help="DESeq2 is an update to the DESeq package. It uses different assumptions and methods to edgeR">
74 <option value="F" selected="true">Do not run DESeq2</option>
75 <option value="T">Run DESeq2</option>
76 </param>
77 <when value="T">
78 <param name="DESeq_fitType" type="select">
79 <option value="parametric" selected="true">Parametric (default) fit for dispersions</option>
80 <option value="local">Local fit - this will automagically be used if parametric fit fails</option>
81 <option value="mean">Mean dispersion fit- use this if you really understand what you're doing - read the fine manual linked below in the documentation</option>
82 </param>
83 </when>
84 <when value="F"> </when>
85 </conditional>
86 <param name="doVoom" type="select"
87 label="Run the same model with Voom/limma and compare findings"
88 help="Voom uses counts per million and a precise transformation of variance so count data can be analysed using limma">
89 <option value="F" selected="true">Do not run VOOM</option>
90 <option value="T">Run VOOM</option>
91 </param>
92 <!--
93 <conditional name="camera">
94 <param name="doCamera" type="select" label="Run the edgeR implementation of Camera GSEA for up/down gene sets"
95 help="If yes, you can choose a set of genesets to test and/or supply a gmt format geneset collection from your history">
96 <option value="F" selected="true">Do not run GSEA tests with the Camera algorithm</option>
97 <option value="T">Run GSEA tests with the Camera algorithm</option>
98 </param>
99 <when value="T">
100 <conditional name="gmtSource">
101 <param name="refgmtSource" type="select"
102 label="Use a gene set (.gmt) from your history and/or use a built-in (MSigDB etc) gene set">
103 <option value="indexed" selected="true">Use a built-in gene set</option>
104 <option value="history">Use a gene set from my history</option>
105 <option value="both">Add a gene set from my history to a built in gene set</option>
106 </param>
107 <when value="indexed">
108 <param name="builtinGMT" type="select" label="Select a gene set matrix (.gmt) file to use for the analysis">
109 <options from_data_table="gseaGMT_3.1">
110 <filter type="sort_by" column="2" />
111 <validator type="no_options" message="No GMT v3.1 files are available - please install them"/>
112 </options>
113 </param>
114 </when>
115 <when value="history">
116 <param name="ownGMT" type="data" format="gmt" label="Select a Gene Set from your history" />
117 </when>
118 <when value="both">
119 <param name="ownGMT" type="data" format="gseagmt" label="Select a Gene Set from your history" />
120 <param name="builtinGMT" type="select" label="Select a gene set matrix (.gmt) file to use for the analysis">
121 <options from_data_table="gseaGMT_4">
122 <filter type="sort_by" column="2" />
123 <validator type="no_options" message="No GMT v4 files are available - please fix tool_data_table and loc files"/>
124 </options>
125 </param>
126 </when>
127 </conditional>
128 </when>
129 <when value="F">
130 </when>
131 </conditional>
132 -->
133 <param name="fdrthresh" type="float" value="0.05" size="5" label="P value threshold for FDR filtering for amily wise error rate control"
134 help="Conventional default value of 0.05 recommended"/>
135 <param name="fdrtype" type="select" label="FDR (Type II error) control method"
136 help="Use fdr or bh typically to control for the number of tests in a reliable way">
137 <option value="fdr" selected="true">fdr</option>
138 <option value="BH">Benjamini Hochberg</option>
139 <option value="BY">Benjamini Yukateli</option>
140 <option value="bonferroni">Bonferroni</option>
141 <option value="hochberg">Hochberg</option>
142 <option value="holm">Holm</option>
143 <option value="hommel">Hommel</option>
144 <option value="none">no control for multiple tests</option>
145 </param>
146 </inputs>
147 <outputs>
148 <data format="tabular" name="out_edgeR" label="${title}_topTable_edgeR.xls">
149 <filter>edgeR['doedgeR'] == "T"</filter>
150 </data>
151 <data format="tabular" name="out_DESeq2" label="${title}_topTable_DESeq2.xls">
152 <filter>DESeq2['doDESeq2'] == "T"</filter>
153 </data>
154 <data format="tabular" name="out_VOOM" label="${title}_topTable_VOOM.xls">
155 <filter>doVoom == "T"</filter>
156 </data>
157 <data format="html" name="html_file" label="${title}.html"/>
158 </outputs>
159 <stdio>
160 <exit_code range="4" level="fatal" description="Number of subject ids must match total number of samples in the input matrix" />
161 </stdio>
162 <tests>
163 <test>
164 <param name='input1' value='test_bams2mx.xls' ftype='tabular' />
165 <param name='treatment_name' value='liver' />
166 <param name='title' value='edgeRtest' />
167 <param name='useNDF' value='' />
168 <param name='doedgeR' value='T' />
169 <param name='doVoom' value='T' />
170 <param name='doDESeq2' value='T' />
171 <param name='fdrtype' value='fdr' />
172 <param name='edgeR_priordf' value="8" />
173 <param name='fdrthresh' value="0.05" />
174 <param name='control_name' value='heart' />
175 <param name='subjectids' value='' />
176 <param name='Control_cols' value='3,4,5,9' />
177 <param name='Treat_cols' value='2,6,7,8' />
178 <output name='out_edgeR' file='edgeRtest1out.xls' compare='diff' />
179 <output name='html_file' file='edgeRtest1out.html' compare='diff' lines_diff='20' />
180 </test>
181 </tests>
182
183 <configfiles>
184 <configfile name="runme">
185 <![CDATA[
186 #
187 # edgeR.Rscript
188 # updated npv 2011 for R 2.14.0 and edgeR 2.4.0 by ross
189 # Performs DGE on a count table containing n replicates of two conditions
190 #
191 # Parameters
192 #
193 # 1 - Output Dir
194
195 # Original edgeR code by: S.Lunke and A.Kaspi
196 reallybig = log10(.Machine\$double.xmax)
197 reallysmall = log10(.Machine\$double.xmin)
198 library('stringr')
199 library('gplots')
200 library('edgeR')
201 hmap2 = function(cmat,nsamp=100,outpdfname='heatmap2.pdf', TName='Treatment',group=NA,myTitle='title goes here')
202 {
203 # Perform clustering for significant pvalues after controlling FWER
204 samples = colnames(cmat)
205 gu = unique(group)
206 gn = rownames(cmat)
207 if (length(gu) == 2) {
208 col.map = function(g) {if (g==gu[1]) "#FF0000" else "#0000FF"}
209 pcols = unlist(lapply(group,col.map))
210 } else {
211 colours = rainbow(length(gu),start=0,end=4/6)
212 pcols = colours[match(group,gu)] }
213 dm = cmat[(! is.na(gn)),]
214 # remove unlabelled hm rows
215 nprobes = nrow(dm)
216 # sub = paste('Showing',nprobes,'contigs ranked for evidence of differential abundance')
217 if (nprobes > nsamp) {
218 dm =dm[1:nsamp,]
219 #sub = paste('Showing',nsamp,'contigs ranked for evidence for differential abundance out of',nprobes,'total')
220 }
221 newcolnames = substr(colnames(dm),1,20)
222 colnames(dm) = newcolnames
223 pdf(outpdfname)
224 heatmap.2(dm,main=myTitle,ColSideColors=pcols,col=topo.colors(100),dendrogram="col",key=T,density.info='none',
225 Rowv=F,scale='row',trace='none',margins=c(8,8),cexRow=0.4,cexCol=0.5)
226 dev.off()
227 }
228
229 hmap = function(cmat,nmeans=4,outpdfname="heatMap.pdf",nsamp=250,TName='Treatment',group=NA,myTitle="Title goes here")
230 {
231 # for 2 groups only was
232 #col.map = function(g) {if (g==TName) "#FF0000" else "#0000FF"}
233 #pcols = unlist(lapply(group,col.map))
234 gu = unique(group)
235 colours = rainbow(length(gu),start=0.3,end=0.6)
236 pcols = colours[match(group,gu)]
237 nrows = nrow(cmat)
238 mtitle = paste(myTitle,'Heatmap: n contigs =',nrows)
239 if (nrows > nsamp) {
240 cmat = cmat[c(1:nsamp),]
241 mtitle = paste('Heatmap: Top ',nsamp,' DE contigs (of ',nrows,')',sep='')
242 }
243 newcolnames = substr(colnames(cmat),1,20)
244 colnames(cmat) = newcolnames
245 pdf(outpdfname)
246 heatmap(cmat,scale='row',main=mtitle,cexRow=0.3,cexCol=0.4,Rowv=NA,ColSideColors=pcols)
247 dev.off()
248 }
249
250 qqPlot = function(descr='qqplot',pvector, outpdf='qqplot.pdf',...)
251 # stolen from https://gist.github.com/703512
252 {
253 o = -log10(sort(pvector,decreasing=F))
254 e = -log10( 1:length(o)/length(o) )
255 o[o==-Inf] = reallysmall
256 o[o==Inf] = reallybig
257 maint = descr
258 pdf(outpdf)
259 plot(e,o,pch=19,cex=1, main=maint, ...,
260 xlab=expression(Expected~~-log[10](italic(p))),
261 ylab=expression(Observed~~-log[10](italic(p))),
262 xlim=c(0,max(e)), ylim=c(0,max(o)))
263 lines(e,e,col="red")
264 grid(col = "lightgray", lty = "dotted")
265 dev.off()
266 }
267
268 smearPlot = function(DGEList,deTags, outSmear, outMain)
269 {
270 pdf(outSmear)
271 plotSmear(DGEList,de.tags=deTags,main=outMain)
272 grid(col="lightgray", lty="dotted")
273 dev.off()
274 }
275
276 boxPlot = function(rawrs,cleanrs,maint,myTitle,pdfname)
277 { #
278 nc = ncol(rawrs)
279 #### for (i in c(1:nc)) {rawrs[(rawrs[,i] < 0),i] = NA}
280 fullnames = colnames(rawrs)
281 newcolnames = substr(colnames(rawrs),1,20)
282 colnames(rawrs) = newcolnames
283 newcolnames = substr(colnames(cleanrs),1,20)
284 colnames(cleanrs) = newcolnames
285 defpar = par(no.readonly=T)
286 print.noquote('raw contig counts by sample:')
287 print.noquote(summary(rawrs))
288 print.noquote('normalised contig counts by sample:')
289 print.noquote(summary(cleanrs))
290 pdf(pdfname)
291 par(mfrow=c(1,2))
292 boxplot(rawrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('Raw:',maint))
293 grid(col="lightgray",lty="dotted")
294 boxplot(cleanrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('After ',maint))
295 grid(col="lightgray",lty="dotted")
296 dev.off()
297 pdfname = "sample_counts_histogram.pdf"
298 nc = ncol(rawrs)
299 print.noquote(paste('Using ncol rawrs=',nc))
300 ncroot = round(sqrt(nc))
301 if (ncroot*ncroot < nc) { ncroot = ncroot + 1 }
302 m = c()
303 for (i in c(1:nc)) {
304 rhist = hist(rawrs[,i],breaks=100,plot=F)
305 m = append(m,max(rhist\$counts))
306 }
307 ymax = max(m)
308 ncols = length(fullnames)
309 if (ncols > 20)
310 {
311 scale = 7*ncols/20
312 pdf(pdfname,width=scale,height=scale)
313 } else {
314 pdf(pdfname)
315 }
316 par(mfrow=c(ncroot,ncroot))
317 for (i in c(1:nc)) {
318 hist(rawrs[,i], main=paste("Contig logcount",i), xlab='log raw count', col="maroon",
319 breaks=100,sub=fullnames[i],cex=0.8,ylim=c(0,ymax))
320 }
321 dev.off()
322 par(defpar)
323
324 }
325
326 cumPlot = function(rawrs,cleanrs,maint,myTitle)
327 { # updated to use ecdf
328 pdfname = "Filtering_rowsum_bar_charts.pdf"
329 defpar = par(no.readonly=T)
330 lrs = log(rawrs,10)
331 lim = max(lrs)
332 pdf(pdfname)
333 par(mfrow=c(2,1))
334 hist(lrs,breaks=100,main=paste('Before:',maint),xlab="# Reads (log)",
335 ylab="Count",col="maroon",sub=myTitle, xlim=c(0,lim),las=1)
336 grid(col="lightgray", lty="dotted")
337 lrs = log(cleanrs,10)
338 hist(lrs,breaks=100,main=paste('After:',maint),xlab="# Reads (log)",
339 ylab="Count",col="maroon",sub=myTitle,xlim=c(0,lim),las=1)
340 grid(col="lightgray", lty="dotted")
341 dev.off()
342 par(defpar)
343 }
344
345 cumPlot1 = function(rawrs,cleanrs,maint,myTitle)
346 { # updated to use ecdf
347 pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"RowsumCum.pdf",sep='_')
348 pdf(pdfname)
349 par(mfrow=c(2,1))
350 lastx = max(rawrs)
351 rawe = knots(ecdf(rawrs))
352 cleane = knots(ecdf(cleanrs))
353 cy = 1:length(cleane)/length(cleane)
354 ry = 1:length(rawe)/length(rawe)
355 plot(rawe,ry,type='l',main=paste('Before',maint),xlab="Log Contig Total Reads",
356 ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle)
357 grid(col="blue")
358 plot(cleane,cy,type='l',main=paste('After',maint),xlab="Log Contig Total Reads",
359 ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle)
360 grid(col="blue")
361 dev.off()
362 }
363
364
365
366 doGSEA = function(y=NULL,design=NULL,histgmt="",
367 bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
368 ntest=0, myTitle="myTitle", outfname="GSEA.xls", minnin=5, maxnin=2000,fdrthresh=0.05,fdrtype="BH")
369 {
370 sink('Camera.log')
371 genesets = c()
372 if (bigmt > "")
373 {
374 bigenesets = readLines(bigmt)
375 genesets = bigenesets
376 }
377 if (histgmt > "")
378 {
379 hgenesets = readLines(histgmt)
380 if (bigmt > "") {
381 genesets = rbind(genesets,hgenesets)
382 } else {
383 genesets = hgenesets
384 } # use only history if no bi
385 }
386 print.noquote(paste("@@@read",length(genesets), 'genesets from',histgmt,bigmt))
387 genesets = strsplit(genesets,'\t') # tabular. genesetid\tURLorwhatever\tgene_1\t..\tgene_n
388 outf = outfname
389 head=paste(myTitle,'edgeR GSEA')
390 write(head,file=outfname,append=F)
391 ntest=length(genesets)
392 urownames = toupper(rownames(y))
393 upcam = c()
394 downcam = c()
395 for (i in 1:ntest) {
396 gs = unlist(genesets[i])
397 g = gs[1] # geneset_id
398 u = gs[2]
399 if (u > "") { u = paste("<a href=\'",u,"\'>",u,"</a>",sep="") }
400 glist = gs[3:length(gs)] # member gene symbols
401 glist = toupper(glist)
402 inglist = urownames %in% glist
403 nin = sum(inglist)
404 if ((nin > minnin) && (nin < maxnin)) {
405 ### print(paste('@@found',sum(inglist),'genes in glist'))
406 camres = camera(y=y,index=inglist,design=design)
407 if (! is.null(camres)) {
408 rownames(camres) = g # gene set name
409 camres = cbind(GeneSet=g,URL=u,camres)
410 if (camres\$Direction == "Up")
411 {
412 upcam = rbind(upcam,camres) } else {
413 downcam = rbind(downcam,camres)
414 }
415 }
416 }
417 }
418 uscam = upcam[order(upcam\$PValue),]
419 unadjp = uscam\$PValue
420 uscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
421 nup = max(10,sum((uscam\$adjPValue < fdrthresh)))
422 dscam = downcam[order(downcam\$PValue),]
423 unadjp = dscam\$PValue
424 dscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
425 ndown = max(10,sum((dscam\$adjPValue < fdrthresh)))
426 write.table(uscam,file=paste('camera_up',outfname,sep='_'),quote=F,sep='\t',row.names=F)
427 write.table(dscam,file=paste('camera_down',outfname,sep='_'),quote=F,sep='\t',row.names=F)
428 print.noquote(paste('@@@@@ Camera up top',nup,'gene sets:'))
429 write.table(head(uscam,nup),file="",quote=F,sep='\t',row.names=F)
430 print.noquote(paste('@@@@@ Camera down top',ndown,'gene sets:'))
431 write.table(head(dscam,ndown),file="",quote=F,sep='\t',row.names=F)
432 sink()
433 }
434
435
436
437
438 doGSEAatonce = function(y=NULL,design=NULL,histgmt="",
439 bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
440 ntest=0, myTitle="myTitle", outfname="GSEA.xls", minnin=5, maxnin=2000,fdrthresh=0.05,fdrtype="BH")
441 {
442 sink('Camera.log')
443 genesets = c()
444 if (bigmt > "")
445 {
446 bigenesets = readLines(bigmt)
447 genesets = bigenesets
448 }
449 if (histgmt > "")
450 {
451 hgenesets = readLines(histgmt)
452 if (bigmt > "") {
453 genesets = rbind(genesets,hgenesets)
454 } else {
455 genesets = hgenesets
456 } # use only history if no bi
457 }
458 print.noquote(paste("@@@read",length(genesets), 'genesets from',histgmt,bigmt))
459 genesets = strsplit(genesets,'\t') # tabular. genesetid\tURLorwhatever\tgene_1\t..\tgene_n
460 outf = outfname
461 head=paste(myTitle,'edgeR GSEA')
462 write(head,file=outfname,append=F)
463 ntest=length(genesets)
464 urownames = toupper(rownames(y))
465 upcam = c()
466 downcam = c()
467 incam = c()
468 urls = c()
469 gsids = c()
470 for (i in 1:ntest) {
471 gs = unlist(genesets[i])
472 gsid = gs[1] # geneset_id
473 url = gs[2]
474 if (url > "") { url = paste("<a href=\'",url,"\'>",url,"</a>",sep="") }
475 glist = gs[3:length(gs)] # member gene symbols
476 glist = toupper(glist)
477 inglist = urownames %in% glist
478 nin = sum(inglist)
479 if ((nin > minnin) && (nin < maxnin)) {
480 incam = c(incam,inglist)
481 gsids = c(gsids,gsid)
482 urls = c(urls,url)
483 }
484 }
485 incam = as.list(incam)
486 names(incam) = gsids
487 allcam = camera(y=y,index=incam,design=design)
488 allcamres = cbind(geneset=gsids,allcam,URL=urls)
489 for (i in 1:ntest) {
490 camres = allcamres[i]
491 res = try(test = (camres\$Direction == "Up"))
492 if ("try-error" %in% class(res)) {
493 cat("test failed, camres = :")
494 print.noquote(camres)
495 } else { if (camres\$Direction == "Up")
496 { upcam = rbind(upcam,camres)
497 } else { downcam = rbind(downcam,camres)
498 }
499
500 }
501 }
502 uscam = upcam[order(upcam\$PValue),]
503 unadjp = uscam\$PValue
504 uscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
505 nup = max(10,sum((uscam\$adjPValue < fdrthresh)))
506 dscam = downcam[order(downcam\$PValue),]
507 unadjp = dscam\$PValue
508 dscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
509 ndown = max(10,sum((dscam\$adjPValue < fdrthresh)))
510 write.table(uscam,file=paste('camera_up',outfname,sep='_'),quote=F,sep='\t',row.names=F)
511 write.table(dscam,file=paste('camera_down',outfname,sep='_'),quote=F,sep='\t',row.names=F)
512 print.noquote(paste('@@@@@ Camera up top',nup,'gene sets:'))
513 write.table(head(uscam,nup),file="",quote=F,sep='\t',row.names=F)
514 print.noquote(paste('@@@@@ Camera down top',ndown,'gene sets:'))
515 write.table(head(dscam,ndown),file="",quote=F,sep='\t',row.names=F)
516 sink()
517 }
518
519
520 edgeIt = function (Count_Matrix=c(),group=c(),out_edgeR=F,out_VOOM=F,out_DESeq2=F,fdrtype='fdr',priordf=5,
521 fdrthresh=0.05,outputdir='.', myTitle='Differential Counts',libSize=c(),useNDF=F,
522 filterquantile=0.2, subjects=c(),mydesign=NULL,
523 doDESeq2=T,doVoom=T,doCamera=T,doedgeR=T,org='hg19',
524 histgmt="", bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
525 doCook=F,DESeq_fitType="parameteric")
526 {
527 # Error handling
528 if (length(unique(group))!=2){
529 print("Number of conditions identified in experiment does not equal 2")
530 q()
531 }
532 require(edgeR)
533 options(width = 512)
534 mt = paste(unlist(strsplit(myTitle,'_')),collapse=" ")
535 allN = nrow(Count_Matrix)
536 nscut = round(ncol(Count_Matrix)/2)
537 colTotmillionreads = colSums(Count_Matrix)/1e6
538 counts.dataframe = as.data.frame(c())
539 rawrs = rowSums(Count_Matrix)
540 nonzerod = Count_Matrix[(rawrs > 0),] # remove all zero count genes
541 nzN = nrow(nonzerod)
542 nzrs = rowSums(nonzerod)
543 zN = allN - nzN
544 print('# Quantiles for non-zero row counts:',quote=F)
545 print(quantile(nzrs,probs=seq(0,1,0.1)),quote=F)
546 if (useNDF == T)
547 {
548 gt1rpin3 = rowSums(Count_Matrix/expandAsMatrix(colTotmillionreads,dim(Count_Matrix)) >= 1) >= nscut
549 lo = colSums(Count_Matrix[!gt1rpin3,])
550 workCM = Count_Matrix[gt1rpin3,]
551 cleanrs = rowSums(workCM)
552 cleanN = length(cleanrs)
553 meth = paste( "After removing",length(lo),"contigs with fewer than ",nscut," sample read counts >= 1 per million, there are",sep="")
554 print(paste("Read",allN,"contigs. Removed",zN,"contigs with no reads.",meth,cleanN,"contigs"),quote=F)
555 maint = paste('Filter >=1/million reads in >=',nscut,'samples')
556 } else {
557 useme = (nzrs > quantile(nzrs,filterquantile))
558 workCM = nonzerod[useme,]
559 lo = colSums(nonzerod[!useme,])
560 cleanrs = rowSums(workCM)
561 cleanN = length(cleanrs)
562 meth = paste("After filtering at count quantile =",filterquantile,", there are",sep="")
563 print(paste('Read',allN,"contigs. Removed",zN,"with no reads.",meth,cleanN,"contigs"),quote=F)
564 maint = paste('Filter below',filterquantile,'quantile')
565 }
566 cumPlot(rawrs=rawrs,cleanrs=cleanrs,maint=maint,myTitle=myTitle)
567 allgenes = rownames(workCM)
568 reg = "^chr([0-9]+):([0-9]+)-([0-9]+)"
569 genecards="<a href=\'http://www.genecards.org/index.php?path=/Search/keyword/"
570 ucsc = paste("<a href=\'http://genome.ucsc.edu/cgi-bin/hgTracks?db=",org,sep='')
571 testreg = str_match(allgenes,reg)
572 if (sum(!is.na(testreg[,1]))/length(testreg[,1]) > 0.8) # is ucsc style string
573 {
574 print("@@ using ucsc substitution for urls")
575 contigurls = paste0(ucsc,"&amp;position=chr",testreg[,2],":",testreg[,3],"-",testreg[,4],"\'>",allgenes,"</a>")
576 } else {
577 print.noquote("@@ using genecards substitution for urls")
578 contigurls = paste0(genecards,allgenes,"\'>",allgenes,"</a>")
579 }
580 print(paste("# Total low count contigs per sample = ",paste(lo,collapse=',')),quote=F)
581 cmrowsums = rowSums(workCM)
582 TName=unique(group)[1]
583 CName=unique(group)[2]
584 if (is.null(mydesign)) {
585 if (length(subjects) == 0)
586 {
587 mydesign = model.matrix(~group)
588 }
589 else {
590 subjf = factor(subjects)
591 mydesign = model.matrix(~subjf+group) # we block on subject so make group last to simplify finding it
592 }
593 }
594 print.noquote(paste('Using samples:',paste(colnames(workCM),collapse=',')))
595 print.noquote('Using design matrix:')
596 print.noquote(mydesign)
597 if (doedgeR) {
598 sink('edgeR.log')
599 #### Setup DGEList object
600 DGEList = DGEList(counts=workCM, group = group)
601 DGEList = calcNormFactors(DGEList)
602
603 DGEList = estimateGLMCommonDisp(DGEList,mydesign)
604 comdisp = DGEList\$common.dispersion
605 DGEList = estimateGLMTrendedDisp(DGEList,mydesign)
606 if (edgeR_priordf > 0) {
607 print.noquote(paste("prior.df =",edgeR_priordf))
608 DGEList = estimateGLMTagwiseDisp(DGEList,mydesign,prior.df = edgeR_priordf)
609 } else {
610 DGEList = estimateGLMTagwiseDisp(DGEList,mydesign)
611 }
612 DGLM = glmFit(DGEList,design=mydesign)
613 DE = glmLRT(DGLM,coef=ncol(DGLM\$design)) # always last one - subject is first if needed
614 efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors
615 normData = (1e+06*DGEList\$counts/efflib)
616 uoutput = cbind(
617 Name=as.character(rownames(DGEList\$counts)),
618 DE\$table,
619 adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
620 Dispersion=DGEList\$tagwise.dispersion,totreads=cmrowsums,normData,
621 DGEList\$counts
622 )
623 soutput = uoutput[order(DE\$table\$PValue),] # sorted into p value order - for quick toptable
624 goodness = gof(DGLM, pcutoff=fdrthresh)
625 if (sum(goodness\$outlier) > 0) {
626 print.noquote('GLM outliers:')
627 print(paste(rownames(DGLM)[(goodness\$outlier)],collapse=','),quote=F)
628 } else {
629 print('No GLM fit outlier genes found\n')
630 }
631 z = limma::zscoreGamma(goodness\$gof.statistic, shape=goodness\$df/2, scale=2)
632 pdf("edgeR_GoodnessofFit.pdf")
633 qq = qqnorm(z, panel.first=grid(), main="tagwise dispersion")
634 abline(0,1,lwd=3)
635 points(qq\$x[goodness\$outlier],qq\$y[goodness\$outlier], pch=16, col="maroon")
636 dev.off()
637 estpriorn = getPriorN(DGEList)
638 print(paste("Common Dispersion =",comdisp,"CV = ",sqrt(comdisp),"getPriorN = ",estpriorn),quote=F)
639 efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors
640 normData = (1e+06*DGEList\$counts)/efflib
641 lnormData = log(normData + 1e-6,10)
642 uniqueg = unique(group)
643 #### Plot MDS
644 sample_colors = match(group,levels(group))
645 sampleTypes = levels(factor(group))
646 print.noquote(sampleTypes)
647 pdf("edgeR_MDSplot.pdf")
648 plotMDS.DGEList(DGEList,main=paste("edgeR MDS for",myTitle),cex=0.5,col=sample_colors,pch=sample_colors)
649 legend(x="topleft", legend = sampleTypes,col=c(1:length(sampleTypes)), pch=19)
650 grid(col="blue")
651 dev.off()
652 colnames(normData) = paste( colnames(normData),'N',sep="_")
653 print(paste('Raw sample read totals',paste(colSums(nonzerod,na.rm=T),collapse=',')))
654 nzd = data.frame(log(nonzerod + 1e-2,10))
655 try( boxPlot(rawrs=nzd,cleanrs=lnormData,maint='TMM Normalisation',myTitle=myTitle,pdfname="edgeR_raw_norm_counts_box.pdf") )
656 write.table(soutput,file=out_edgeR, quote=FALSE, sep="\t",row.names=F)
657 tt = cbind(
658 Name=as.character(rownames(DGEList\$counts)),
659 DE\$table,
660 adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
661 Dispersion=DGEList\$tagwise.dispersion,totreads=cmrowsums
662 )
663 print.noquote("# edgeR Top tags\n")
664 tt = cbind(tt,URL=contigurls) # add to end so table isn't laid out strangely
665 tt = tt[order(DE\$table\$PValue),]
666 print.noquote(tt[1:50,])
667 deTags = rownames(uoutput[uoutput\$adj.p.value < fdrthresh,])
668 nsig = length(deTags)
669 print(paste('#',nsig,'tags significant at adj p=',fdrthresh),quote=F)
670 deColours = ifelse(deTags,'red','black')
671 pdf("edgeR_BCV_vs_abundance.pdf")
672 plotBCV(DGEList, cex=0.3, main="Biological CV vs abundance",col.tagwise=deColours)
673 dev.off()
674 dg = DGEList[order(DE\$table\$PValue),]
675 #normData = (1e+06 * dg\$counts/expandAsMatrix(dg\$samples\$lib.size, dim(dg)))
676 efflib = dg\$samples\$lib.size*dg\$samples\$norm.factors
677 normData = (1e+06*dg\$counts/efflib)
678 outpdfname="edgeR_top_100_heatmap.pdf"
679 hmap2(normData,nsamp=100,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste('edgeR Heatmap',myTitle))
680 outSmear = "edgeR_smearplot.pdf"
681 outMain = paste("Smear Plot for ",TName,' Vs ',CName,' (FDR@',fdrthresh,' N = ',nsig,')',sep='')
682 smearPlot(DGEList=DGEList,deTags=deTags, outSmear=outSmear, outMain = outMain)
683 qqPlot(descr=paste(myTitle,'edgeR adj p QQ plot'),pvector=tt\$adj.p.value,outpdf='edgeR_qqplot.pdf')
684 norm.factor = DGEList\$samples\$norm.factors
685 topresults.edgeR = soutput[which(soutput\$adj.p.value < fdrthresh), ]
686 edgeRcountsindex = which(allgenes %in% rownames(topresults.edgeR))
687 edgeRcounts = rep(0, length(allgenes))
688 edgeRcounts[edgeRcountsindex] = 1 # Create venn diagram of hits
689 sink()
690 } ### doedgeR
691 if (doDESeq2 == T)
692 {
693 sink("DESeq2.log")
694 # DESeq2
695 require('DESeq2')
696 library('RColorBrewer')
697 if (length(subjects) == 0)
698 {
699 pdata = data.frame(Name=colnames(workCM),Rx=group,row.names=colnames(workCM))
700 deSEQds = DESeqDataSetFromMatrix(countData = workCM, colData = pdata, design = formula(~ Rx))
701 } else {
702 pdata = data.frame(Name=colnames(workCM),Rx=group,subjects=subjects,row.names=colnames(workCM))
703 deSEQds = DESeqDataSetFromMatrix(countData = workCM, colData = pdata, design = formula(~ subjects + Rx))
704 }
705 #DESeq2 = DESeq(deSEQds,fitType='local',pAdjustMethod=fdrtype)
706 #rDESeq = results(DESeq2)
707 #newCountDataSet(workCM, group)
708 deSeqDatsizefac = estimateSizeFactors(deSEQds)
709 deSeqDatdisp = estimateDispersions(deSeqDatsizefac,fitType=DESeq_fitType)
710 resDESeq = nbinomWaldTest(deSeqDatdisp, pAdjustMethod=fdrtype)
711 rDESeq = as.data.frame(results(resDESeq))
712 rDESeq = cbind(Contig=rownames(workCM),rDESeq,NReads=cmrowsums)
713 srDESeq = rDESeq[order(rDESeq\$pvalue),]
714 write.table(srDESeq,file=out_DESeq2, quote=FALSE, sep="\t",row.names=F)
715 qqPlot(descr=paste(myTitle,'DESeq2 adj p qq plot'),pvector=rDESeq\$padj,outpdf='DESeq2_qqplot.pdf')
716 cat("# DESeq top 50\n")
717 rDESeq = cbind(Contig=rownames(workCM),rDESeq,NReads=cmrowsums,URL=contigurls)
718 srDESeq = rDESeq[order(rDESeq\$pvalue),]
719 print.noquote(srDESeq[1:50,])
720 topresults.DESeq = rDESeq[which(rDESeq\$padj < fdrthresh), ]
721 DESeqcountsindex = which(allgenes %in% rownames(topresults.DESeq))
722 DESeqcounts = rep(0, length(allgenes))
723 DESeqcounts[DESeqcountsindex] = 1
724 pdf("DESeq2_dispersion_estimates.pdf")
725 plotDispEsts(resDESeq)
726 dev.off()
727 ysmall = abs(min(rDESeq\$log2FoldChange))
728 ybig = abs(max(rDESeq\$log2FoldChange))
729 ylimit = min(4,ysmall,ybig)
730 pdf("DESeq2_MA_plot.pdf")
731 plotMA(resDESeq,main=paste(myTitle,"DESeq2 MA plot"),ylim=c(-ylimit,ylimit))
732 dev.off()
733 rlogres = rlogTransformation(resDESeq)
734 sampledists = dist( t( assay(rlogres) ) )
735 sdmat = as.matrix(sampledists)
736 pdf("DESeq2_sample_distance_plot.pdf")
737 heatmap.2(sdmat,trace="none",main=paste(myTitle,"DESeq2 sample distances"),
738 col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(255))
739 dev.off()
740 ###outpdfname="DESeq2_top50_heatmap.pdf"
741 ###hmap2(sresDESeq,nsamp=50,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste('DESeq2 vst rlog Heatmap',myTitle))
742 sink()
743 result = try( (ppca = plotPCA( varianceStabilizingTransformation(deSeqDatdisp,blind=T), intgroup=c("Rx","Name")) ) )
744 if ("try-error" %in% class(result)) {
745 print.noquote('DESeq2 plotPCA failed.')
746 } else {
747 pdf("DESeq2_PCA_plot.pdf")
748 #### wtf - print? Seems needed to get this to work
749 print(ppca)
750 dev.off()
751 }
752 }
753
754 if (doVoom == T) {
755 sink('Voom.log')
756 if (doedgeR == F) {
757 #### Setup DGEList object
758 DGEList = DGEList(counts=workCM, group = group)
759 DGEList = calcNormFactors(DGEList)
760 DGEList = estimateGLMCommonDisp(DGEList,mydesign)
761 DGEList = estimateGLMTrendedDisp(DGEList,mydesign)
762 DGEList = estimateGLMTagwiseDisp(DGEList,mydesign)
763 DGEList = estimateGLMTagwiseDisp(DGEList,mydesign)
764 norm.factor = DGEList\$samples\$norm.factors
765 }
766 pdf("Voom_mean_variance_plot.pdf")
767 dat.voomed = voom(DGEList, mydesign, plot = TRUE, lib.size = colSums(workCM) * norm.factor)
768 dev.off()
769 # Use limma to fit data
770 fit = lmFit(dat.voomed, mydesign)
771 fit = eBayes(fit)
772 rvoom = topTable(fit, coef = length(colnames(mydesign)), adj = fdrtype, n = Inf, sort="none")
773 qqPlot(descr=paste(myTitle,'Voom-limma adj p QQ plot'),pvector=rvoom\$adj.P.Val,outpdf='Voom_qqplot.pdf')
774 rownames(rvoom) = rownames(workCM)
775 rvoom = cbind(rvoom,NReads=cmrowsums)
776 srvoom = rvoom[order(rvoom\$P.Value),]
777 write.table(srvoom,file=out_VOOM, quote=FALSE, sep="\t",row.names=F)
778 rvoom = cbind(rvoom,URL=contigurls)
779 deTags = rownames(rvoom[rvoom\$adj.p.value < fdrthresh,])
780 nsig = length(deTags)
781 srvoom = rvoom[order(rvoom\$P.Value),]
782 cat("# Voom top 50\n")
783 print(srvoom[1:50,])
784 normData = srvoom\$E
785 outpdfname="Voom_top_100_heatmap.pdf"
786 hmap2(normData,nsamp=100,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste('VOOM Heatmap',myTitle))
787 outSmear = "Voom_smearplot.pdf"
788 outMain = paste("Smear Plot for ",TName,' Vs ',CName,' (FDR@',fdrthresh,' N = ',nsig,')',sep='')
789 smearPlot(DGEList=rvoom,deTags=deTags, outSmear=outSmear, outMain = outMain)
790 qqPlot(descr=paste(myTitle,'VOOM adj p QQ plot'),pvector=srvoom\$adj.P.Val,outpdf='Voom_qqplot.pdf')
791 # Use an FDR cutoff to find interesting samples for edgeR, DESeq and voom/limma
792 topresults.voom = rvoom[which(rvoom\$adj.P.Val < fdrthresh), ]
793 voomcountsindex = which(allgenes %in% topresults.voom\$ID)
794 voomcounts = rep(0, length(allgenes))
795 voomcounts[voomcountsindex] = 1
796 sink()
797 }
798
799 if (doCamera) {
800 doGSEA(y=DGEList,design=mydesign,histgmt=histgmt,bigmt=bigmt,ntest=20,myTitle=myTitle,
801 outfname=paste(mt,"GSEA.xls",sep="_"),fdrthresh=fdrthresh,fdrtype=fdrtype)
802 }
803
804 if ((doDESeq2==T) || (doVoom==T) || (doedgeR==T)) {
805 if ((doVoom==T) && (doDESeq2==T) && (doedgeR==T)) {
806 vennmain = paste(mt,'Voom,edgeR and DESeq2 overlap at FDR=',fdrthresh)
807 counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts,
808 VOOM_limma = voomcounts, row.names = allgenes)
809 } else if ((doDESeq2==T) && (doedgeR==T)) {
810 vennmain = paste(mt,'DESeq2 and edgeR overlap at FDR=',fdrthresh)
811 counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts, row.names = allgenes)
812 } else if ((doVoom==T) && (doedgeR==T)) {
813 vennmain = paste(mt,'Voom and edgeR overlap at FDR=',fdrthresh)
814 counts.dataframe = data.frame(edgeR = edgeRcounts, VOOM_limma = voomcounts, row.names = allgenes)
815 }
816
817 if (nrow(counts.dataframe > 1)) {
818 counts.venn = vennCounts(counts.dataframe)
819 vennf = "Venn_significant_genes_overlap.pdf"
820 pdf(vennf)
821 vennDiagram(counts.venn,main=vennmain,col="maroon")
822 dev.off()
823 }
824 } #### doDESeq2 or doVoom
825
826 }
827 #### Done
828
829 ###sink(stdout(),append=T,type="message")
830 builtin_gmt = ""
831 history_gmt = ""
832 history_gmt_name = ""
833 out_edgeR = F
834 out_DESeq2 = F
835 out_VOOM = "$out_VOOM"
836 doDESeq2 = $DESeq2.doDESeq2 # make these T or F
837 doVoom = $doVoom
838 doCamera = F
839 doedgeR = $edgeR.doedgeR
840 edgeR_priordf = 0
841
842
843 #if $doVoom == "T":
844 out_VOOM = "$out_VOOM"
845 #end if
846
847 #if $DESeq2.doDESeq2 == "T":
848 out_DESeq2 = "$out_DESeq2"
849 DESeq_fitType = "$DESeq2.DESeq_fitType"
850 #end if
851
852 #if $edgeR.doedgeR == "T":
853 out_edgeR = "$out_edgeR"
854 edgeR_priordf = $edgeR.edgeR_priordf
855 #end if
856
857
858 if (sum(c(doedgeR,doVoom,doDESeq2)) == 0)
859 {
860 write("No methods chosen - nothing to do! Please try again after choosing one or more methods", stderr())
861 quit(save="no",status=2)
862 }
863
864 Out_Dir = "$html_file.files_path"
865 Input = "$input1"
866 TreatmentName = "$treatment_name"
867 TreatmentCols = "$Treat_cols"
868 ControlName = "$control_name"
869 ControlCols= "$Control_cols"
870 org = "$input1.dbkey"
871 if (org == "") { org = "hg19"}
872 fdrtype = "$fdrtype"
873 fdrthresh = $fdrthresh
874 useNDF = $useNDF
875 fQ = $fQ # non-differential centile cutoff
876 myTitle = "$title"
877 sids = strsplit("$subjectids",',')
878 subjects = unlist(sids)
879 nsubj = length(subjects)
880 TCols = as.numeric(strsplit(TreatmentCols,",")[[1]])-1
881 CCols = as.numeric(strsplit(ControlCols,",")[[1]])-1
882 cat('Got TCols=')
883 cat(TCols)
884 cat('; CCols=')
885 cat(CCols)
886 cat('\n')
887 useCols = c(TCols,CCols)
888 if (file.exists(Out_Dir) == F) dir.create(Out_Dir)
889 Count_Matrix = read.table(Input,header=T,row.names=1,sep='\t') #Load tab file assume header
890 snames = colnames(Count_Matrix)
891 nsamples = length(snames)
892 if (nsubj > 0 & nsubj != nsamples) {
893 options("show.error.messages"=T)
894 mess = paste('Fatal error: Supplied subject id list',paste(subjects,collapse=','),
895 'has length',nsubj,'but there are',nsamples,'samples',paste(snames,collapse=','))
896 write(mess, stderr())
897 quit(save="no",status=4)
898 }
899 if (length(subjects) != 0) {subjects = subjects[useCols]}
900 Count_Matrix = Count_Matrix[,useCols] ### reorder columns
901 rn = rownames(Count_Matrix)
902 islib = rn %in% c('librarySize','NotInBedRegions')
903 LibSizes = Count_Matrix[subset(rn,islib),][1] # take first
904 Count_Matrix = Count_Matrix[subset(rn,! islib),]
905 group = c(rep(TreatmentName,length(TCols)), rep(ControlName,length(CCols)) ) #Build a group descriptor
906 group = factor(group, levels=c(ControlName,TreatmentName))
907 colnames(Count_Matrix) = paste(group,colnames(Count_Matrix),sep="_") #Relable columns
908 results = edgeIt(Count_Matrix=Count_Matrix,group=group, out_edgeR=out_edgeR, out_VOOM=out_VOOM, out_DESeq2=out_DESeq2,
909 fdrtype='BH',mydesign=NULL,priordf=edgeR_priordf,fdrthresh=fdrthresh,outputdir='.',
910 myTitle=myTitle,useNDF=F,libSize=c(),filterquantile=fQ,subjects=subjects,
911 doDESeq2=doDESeq2,doVoom=doVoom,doCamera=doCamera,doedgeR=doedgeR,org=org,
912 histgmt=history_gmt,bigmt=builtin_gmt,DESeq_fitType=DESeq_fitType)
913 sessionInfo()
914 ]]>
915 </configfile>
916 </configfiles>
917 <help>
918
919 **What it does**
920
921 Allows short read sequence counts from controlled experiments to be analysed for differentially expressed genes.
922 Optionally adds a term for subject if not all samples are independent or if some other factor needs to be blocked in the design.
923
924 **Input**
925
926 Requires a count matrix as a tabular file. These are best made using the companion HTSeq_ based counter Galaxy wrapper
927 and your fave gene model to generate inputs. Each row is a genomic feature (gene or exon eg) and each column the
928 non-negative integer count of reads from one sample overlapping the feature.
929 The matrix must have a header row uniquely identifying the source samples, and unique row names in
930 the first column. Typically the row names are gene symbols or probe ids for downstream use in GSEA and other methods.
931
932 **Specifying comparisons**
933
934 This is basically dumbed down for two factors - case vs control.
935
936 More complex interfaces are possible but painful at present.
937 Probably need to specify a phenotype file to do this better.
938 Work in progress. Send code.
939
940 If you have (eg) paired samples and wish to include a term in the GLM to account for some other factor (subject in the case of paired samples),
941 put a comma separated list of indicators for every sample (whether modelled or not!) indicating (eg) the subject number or
942 A list of integers, one for each subject or an empty string if samples are all independent.
943 If not empty, there must be exactly as many integers in the supplied integer list as there are columns (samples) in the count matrix.
944 Integers for samples that are not in the analysis *must* be present in the string as filler even if not used.
945
946 So if you have 2 pairs out of 6 samples, you need to put in unique integers for the unpaired ones
947 eg if you had 6 samples with the first two independent but the second and third pairs each being from independent subjects. you might use
948 8,9,1,1,2,2
949 as subject IDs to indicate two paired samples from the same subject in columns 3/4 and 5/6
950
951 **Methods available**
952
953 You can run 3 popular Bioconductor packages available for count data.
954
955 edgeR - see edgeR_ for details
956
957 VOOM/limma - see limma_VOOM_ for details
958
959 DESeq2 - see DESeq2_ for details
960
961 and optionally camera in edgeR which works better if MSigDB is installed.
962
963 **Outputs**
964
965 Some helpful plots and analysis results. Note that most of these are produced using R code
966 suggested by the excellent documentation and vignettes for the Bioconductor
967 packages invoked. The Tool Factory is used to automatically lay these out for you to enjoy.
968
969 **Note on Voom**
970
971 The voom from limma version 3.16.6 help in R includes this from the authors - but you should read the paper to interpret this method.
972
973 This function is intended to process RNA-Seq or ChIP-Seq data prior to linear modelling in limma.
974
975 voom is an acronym for mean-variance modelling at the observational level.
976 The key concern is to estimate the mean-variance relationship in the data, then use this to compute appropriate weights for each observation.
977 Count data almost show non-trivial mean-variance relationships. Raw counts show increasing variance with increasing count size, while log-counts typically show a decreasing mean-variance trend.
978 This function estimates the mean-variance trend for log-counts, then assigns a weight to each observation based on its predicted variance.
979 The weights are then used in the linear modelling process to adjust for heteroscedasticity.
980
981 In an experiment, a count value is observed for each tag in each sample. A tag-wise mean-variance trend is computed using lowess.
982 The tag-wise mean is the mean log2 count with an offset of 0.5, across samples for a given tag.
983 The tag-wise variance is the quarter-root-variance of normalized log2 counts per million values with an offset of 0.5, across samples for a given tag.
984 Tags with zero counts across all samples are not included in the lowess fit. Optional normalization is performed using normalizeBetweenArrays.
985 Using fitted values of log2 counts from a linear model fit by lmFit, variances from the mean-variance trend were interpolated for each observation.
986 This was carried out by approxfun. Inverse variance weights can be used to correct for mean-variance trend in the count data.
987
988
989 Author(s)
990
991 Charity Law and Gordon Smyth
992
993 References
994
995 Law, CW (2013). Precision weights for gene expression analysis. PhD Thesis. University of Melbourne, Australia.
996
997 Law, CW, Chen, Y, Shi, W, Smyth, GK (2013). Voom! Precision weights unlock linear model analysis tools for RNA-seq read counts.
998 Technical Report 1 May 2013, Bioinformatics Division, Walter and Eliza Hall Institute of Medical Reseach, Melbourne, Australia.
999 http://www.statsci.org/smyth/pubs/VoomPreprint.pdf
1000
1001 See Also
1002
1003 A voom case study is given in the edgeR User's Guide.
1004
1005 vooma is a similar function but for microarrays instead of RNA-seq.
1006
1007
1008 ***old rant on changes to Bioconductor package variable names between versions***
1009
1010 The edgeR authors made a small cosmetic change in the name of one important variable (from p.value to PValue)
1011 breaking this and all other code that assumed the old name for this variable,
1012 between edgeR2.4.4 and 2.4.6 (the version for R 2.14 as at the time of writing).
1013 This means that all code using edgeR is sensitive to the version. I think this was a very unwise thing
1014 to do because it wasted hours of my time to track down and will similarly cost other edgeR users dearly
1015 when their old scripts break. This tool currently now works with 2.4.6.
1016
1017 **Note on prior.N**
1018
1019 http://seqanswers.com/forums/showthread.php?t=5591 says:
1020
1021 *prior.n*
1022
1023 The value for prior.n determines the amount of smoothing of tagwise dispersions towards the common dispersion.
1024 You can think of it as like a "weight" for the common value. (It is actually the weight for the common likelihood
1025 in the weighted likelihood equation). The larger the value for prior.n, the more smoothing, i.e. the closer your
1026 tagwise dispersion estimates will be to the common dispersion. If you use a prior.n of 1, then that gives the
1027 common likelihood the weight of one observation.
1028
1029 In answer to your question, it is a good thing to squeeze the tagwise dispersions towards a common value,
1030 or else you will be using very unreliable estimates of the dispersion. I would not recommend using the value that
1031 you obtained from estimateSmoothing()---this is far too small and would result in virtually no moderation
1032 (squeezing) of the tagwise dispersions. How many samples do you have in your experiment?
1033 What is the experimental design? If you have few samples (less than 6) then I would suggest a prior.n of at least 10.
1034 If you have more samples, then the tagwise dispersion estimates will be more reliable,
1035 so you could consider using a smaller prior.n, although I would hesitate to use a prior.n less than 5.
1036
1037
1038 From Bioconductor Digest, Vol 118, Issue 5, Gordon writes:
1039
1040 Dear Dorota,
1041
1042 The important settings are prior.df and trend.
1043
1044 prior.n and prior.df are related through prior.df = prior.n * residual.df,
1045 and your experiment has residual.df = 36 - 12 = 24. So the old setting of
1046 prior.n=10 is equivalent for your data to prior.df = 240, a very large
1047 value. Going the other way, the new setting of prior.df=10 is equivalent
1048 to prior.n=10/24.
1049
1050 To recover old results with the current software you would use
1051
1052 estimateTagwiseDisp(object, prior.df=240, trend="none")
1053
1054 To get the new default from old software you would use
1055
1056 estimateTagwiseDisp(object, prior.n=10/24, trend=TRUE)
1057
1058 Actually the old trend method is equivalent to trend="loess" in the new
1059 software. You should use plotBCV(object) to see whether a trend is
1060 required.
1061
1062 Note you could also use
1063
1064 prior.n = getPriorN(object, prior.df=10)
1065
1066 to map between prior.df and prior.n.
1067
1068 ----
1069
1070 **Attributions**
1071
1072 edgeR - edgeR_
1073
1074 VOOM/limma - limma_VOOM_
1075
1076 DESeq2 - DESeq2_ for details
1077
1078 See above for Bioconductor package documentation for packages exposed in Galaxy by this tool and app store package.
1079
1080 Galaxy_ (that's what you are using right now!) for gluing everything together
1081
1082 Otherwise, all code and documentation comprising this tool was written by Ross Lazarus and is
1083 licensed to you under the LGPL_ like other rgenetics artefacts
1084
1085 .. _LGPL: http://www.gnu.org/copyleft/lesser.html
1086 .. _HTSeq: http://www-huber.embl.de/users/anders/HTSeq/doc/index.html
1087 .. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html
1088 .. _DESeq2: http://www.bioconductor.org/packages/release/bioc/html/DESeq2.html
1089 .. _limma_VOOM: http://www.bioconductor.org/packages/release/bioc/html/limma.html
1090 .. _Galaxy: http://getgalaxy.org
1091 </help>
1092
1093 </tool>
1094
1095