comparison rgedgeRpaired_nocamera.xml~ @ 39:571b5a524d6b draft

Uploaded
author fubar
date Sun, 22 Dec 2013 05:53:33 -0500
parents
children
comparison
equal deleted inserted replaced
38:2f7573aacac3 39:571b5a524d6b
1 <tool id="rgDifferentialCount" name="Differential_Count" version="0.22">
2 <description>models using BioConductor packages</description>
3 <requirements>
4 <requirement type="package" version="3.0.1">r3</requirement>
5 <requirement type="package" version="1.3.18">graphicsmagick</requirement>
6 <requirement type="package" version="9.07">ghostscript</requirement>
7 <requirement type="package" version="2.12">biocbasics</requirement>
8 </requirements>
9
10 <command interpreter="python">
11 rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "DifferentialCounts"
12 --output_dir "$html_file.files_path" --output_html "$html_file" --make_HTML "yes"
13 </command>
14 <inputs>
15 <param name="input1" type="data" format="tabular" label="Select an input matrix - rows are contigs, columns are counts for each sample"
16 help="Use the HTSeq based count matrix preparation tool to create these matrices from BAM/SAM files and a GTF file of genomic features"/>
17 <param name="title" type="text" value="Differential Counts" size="80" label="Title for job outputs"
18 help="Supply a meaningful name here to remind you what the outputs contain">
19 <sanitizer invalid_char="">
20 <valid initial="string.letters,string.digits"><add value="_" /> </valid>
21 </sanitizer>
22 </param>
23 <param name="treatment_name" type="text" value="Treatment" size="50" label="Treatment Name"/>
24 <param name="Treat_cols" label="Select columns containing treatment." type="data_column" data_ref="input1" numerical="True"
25 multiple="true" use_header_names="true" size="120" display="checkboxes">
26 <validator type="no_options" message="Please select at least one column."/>
27 <sanitizer invalid_char="">
28 <valid initial="string.letters,string.digits"><add value="_" /> </valid>
29 </sanitizer>
30 </param>
31 <param name="control_name" type="text" value="Control" size="50" label="Control Name"/>
32 <param name="Control_cols" label="Select columns containing control." type="data_column" data_ref="input1" numerical="True"
33 multiple="true" use_header_names="true" size="120" display="checkboxes" optional="true">
34 <validator type="no_options" message="Please select at least one column."/>
35 <sanitizer invalid_char="">
36 <valid initial="string.letters,string.digits"><add value="_" /> </valid>
37 </sanitizer> <validator type="no_options" message="Please select at least one column."/>
38 <sanitizer invalid_char="">
39 <valid initial="string.letters,string.digits"><add value="_" /> </valid>
40 </sanitizer>
41
42 </param>
43 <param name="subjectids" type="text" optional="true" size="120" value = ""
44 label="IF SUBJECTS NOT ALL INDEPENDENT! Enter comma separated strings to indicate sample labels for (eg) pairing - must be one for every column in input"
45 help="Leave blank if no pairing, but eg if data from sample id A99 is in columns 2,4 and id C21 is in 3,5 then enter 'A99,C21,A99,C21'">
46 <sanitizer>
47 <valid initial="string.letters,string.digits"><add value="," /> </valid>
48 </sanitizer>
49 </param>
50 <param name="fQ" type="float" value="0.3" size="5" label="Non-differential contig count quantile threshold - zero to analyze all non-zero read count contigs"
51 help="May be a good or a bad idea depending on the biology and the question. EG 0.3 = sparsest 30% of contigs with at least one read are removed before analysis"/>
52 <param name="useNDF" type="boolean" truevalue="T" falsevalue="F" checked="false" size="1"
53 label="Non differential filter - remove contigs below a threshold (1 per million) for half or more samples"
54 help="May be a good or a bad idea depending on the biology and the question. This was the old default. Quantile based is available as an alternative"/>
55
56 <conditional name="edgeR">
57 <param name="doedgeR" type="select"
58 label="Run this model using edgeR"
59 help="edgeR uses a negative binomial model and seems to be powerful, even with few replicates">
60 <option value="F">Do not run edgeR</option>
61 <option value="T" selected="true">Run edgeR</option>
62 </param>
63 <when value="T">
64 <param name="edgeR_priordf" type="integer" value="20" size="3"
65 label="prior.df for tagwise dispersion - lower value = more emphasis on each tag's variance. Replaces prior.n and prior.df = prior.n * residual.df"
66 help="0 = Use edgeR default. Use a small value to 'smooth' small samples. See edgeR docs and note below"/>
67 </when>
68 <when value="F"></when>
69 </conditional>
70 <conditional name="DESeq2">
71 <param name="doDESeq2" type="select"
72 label="Run the same model with DESeq2 and compare findings"
73 help="DESeq2 is an update to the DESeq package. It uses different assumptions and methods to edgeR">
74 <option value="F" selected="true">Do not run DESeq2</option>
75 <option value="T">Run DESeq2</option>
76 </param>
77 <when value="T">
78 <param name="DESeq_fitType" type="select">
79 <option value="parametric" selected="true">Parametric (default) fit for dispersions</option>
80 <option value="local">Local fit - this will automagically be used if parametric fit fails</option>
81 <option value="mean">Mean dispersion fit- use this if you really understand what you're doing - read the fine manual linked below in the documentation</option>
82 </param>
83 </when>
84 <when value="F"> </when>
85 </conditional>
86 <param name="doVoom" type="select"
87 label="Run the same model with Voom/limma and compare findings"
88 help="Voom uses counts per million and a precise transformation of variance so count data can be analysed using limma">
89 <option value="F" selected="true">Do not run VOOM</option>
90 <option value="T">Run VOOM</option>
91 </param>
92 <!--
93 <conditional name="camera">
94 <param name="doCamera" type="select" label="Run the edgeR implementation of Camera GSEA for up/down gene sets"
95 help="If yes, you can choose a set of genesets to test and/or supply a gmt format geneset collection from your history">
96 <option value="F" selected="true">Do not run GSEA tests with the Camera algorithm</option>
97 <option value="T">Run GSEA tests with the Camera algorithm</option>
98 </param>
99 <when value="T">
100 <conditional name="gmtSource">
101 <param name="refgmtSource" type="select"
102 label="Use a gene set (.gmt) from your history and/or use a built-in (MSigDB etc) gene set">
103 <option value="indexed" selected="true">Use a built-in gene set</option>
104 <option value="history">Use a gene set from my history</option>
105 <option value="both">Add a gene set from my history to a built in gene set</option>
106 </param>
107 <when value="indexed">
108 <param name="builtinGMT" type="select" label="Select a gene set matrix (.gmt) file to use for the analysis">
109 <options from_data_table="gseaGMT_3.1">
110 <filter type="sort_by" column="2" />
111 <validator type="no_options" message="No GMT v3.1 files are available - please install them"/>
112 </options>
113 </param>
114 </when>
115 <when value="history">
116 <param name="ownGMT" type="data" format="gmt" label="Select a Gene Set from your history" />
117 </when>
118 <when value="both">
119 <param name="ownGMT" type="data" format="gseagmt" label="Select a Gene Set from your history" />
120 <param name="builtinGMT" type="select" label="Select a gene set matrix (.gmt) file to use for the analysis">
121 <options from_data_table="gseaGMT_4">
122 <filter type="sort_by" column="2" />
123 <validator type="no_options" message="No GMT v4 files are available - please fix tool_data_table and loc files"/>
124 </options>
125 </param>
126 </when>
127 </conditional>
128 </when>
129 <when value="F">
130 </when>
131 </conditional>
132 -->
133 <param name="fdrthresh" type="float" value="0.05" size="5" label="P value threshold for FDR filtering for amily wise error rate control"
134 help="Conventional default value of 0.05 recommended"/>
135 <param name="fdrtype" type="select" label="FDR (Type II error) control method"
136 help="Use fdr or bh typically to control for the number of tests in a reliable way">
137 <option value="fdr" selected="true">fdr</option>
138 <option value="BH">Benjamini Hochberg</option>
139 <option value="BY">Benjamini Yukateli</option>
140 <option value="bonferroni">Bonferroni</option>
141 <option value="hochberg">Hochberg</option>
142 <option value="holm">Holm</option>
143 <option value="hommel">Hommel</option>
144 <option value="none">no control for multiple tests</option>
145 </param>
146 </inputs>
147 <outputs>
148 <data format="tabular" name="out_edgeR" label="${title}_topTable_edgeR.xls">
149 <filter>edgeR['doedgeR'] == "T"</filter>
150 </data>
151 <data format="tabular" name="out_DESeq2" label="${title}_topTable_DESeq2.xls">
152 <filter>DESeq2['doDESeq2'] == "T"</filter>
153 </data>
154 <data format="tabular" name="out_VOOM" label="${title}_topTable_VOOM.xls">
155 <filter>doVoom == "T"</filter>
156 </data>
157 <data format="html" name="html_file" label="${title}.html"/>
158 </outputs>
159 <stdio>
160 <exit_code range="4" level="fatal" description="Number of subject ids must match total number of samples in the input matrix" />
161 </stdio>
162 <tests>
163 <test>
164 <param name='input1' value='test_bams2mx.xls' ftype='tabular' />
165 <param name='treatment_name' value='liver' />
166 <param name='title' value='edgeRtest' />
167 <param name='useNDF' value='' />
168 <param name='doedgeR' value='T' />
169 <param name='doVoom' value='T' />
170 <param name='doDESeq2' value='T' />
171 <param name='fdrtype' value='fdr' />
172 <param name='edgeR_priordf' value="8" />
173 <param name='fdrthresh' value="0.05" />
174 <param name='control_name' value='heart' />
175 <param name='subjectids' value='' />
176 <param name='Control_cols' value='3,4,5,9' />
177 <param name='Treat_cols' value='2,6,7,8' />
178 <output name='out_edgeR' file='edgeRtest1out.xls' compare='diff' />
179 <output name='html_file' file='edgeRtest1out.html' compare='diff' lines_diff='20' />
180 </test>
181 </tests>
182
183 <configfiles>
184 <configfile name="runme">
185 <![CDATA[
186 #
187 # edgeR.Rscript
188 # updated npv 2011 for R 2.14.0 and edgeR 2.4.0 by ross
189 # Performs DGE on a count table containing n replicates of two conditions
190 #
191 # Parameters
192 #
193 # 1 - Output Dir
194
195 # Original edgeR code by: S.Lunke and A.Kaspi
196 reallybig = log10(.Machine\$double.xmax)
197 reallysmall = log10(.Machine\$double.xmin)
198 library('stringr')
199 library('gplots')
200 library('edgeR')
201 hmap2 = function(cmat,nsamp=100,outpdfname='heatmap2.pdf', TName='Treatment',group=NA,myTitle='title goes here')
202 {
203 # Perform clustering for significant pvalues after controlling FWER
204 samples = colnames(cmat)
205 gu = unique(group)
206 gn = rownames(cmat)
207 if (length(gu) == 2) {
208 col.map = function(g) {if (g==gu[1]) "#FF0000" else "#0000FF"}
209 pcols = unlist(lapply(group,col.map))
210 } else {
211 colours = rainbow(length(gu),start=0,end=4/6)
212 pcols = colours[match(group,gu)] }
213 dm = cmat[(! is.na(gn)),]
214 # remove unlabelled hm rows
215 nprobes = nrow(dm)
216 # sub = paste('Showing',nprobes,'contigs ranked for evidence of differential abundance')
217 if (nprobes > nsamp) {
218 dm =dm[1:nsamp,]
219 #sub = paste('Showing',nsamp,'contigs ranked for evidence for differential abundance out of',nprobes,'total')
220 }
221 newcolnames = substr(colnames(dm),1,20)
222 colnames(dm) = newcolnames
223 pdf(outpdfname)
224 heatmap.2(dm,main=myTitle,ColSideColors=pcols,col=topo.colors(100),dendrogram="col",key=T,density.info='none',
225 Rowv=F,scale='row',trace='none',margins=c(8,8),cexRow=0.4,cexCol=0.5)
226 dev.off()
227 }
228
229 hmap = function(cmat,nmeans=4,outpdfname="heatMap.pdf",nsamp=250,TName='Treatment',group=NA,myTitle="Title goes here")
230 {
231 # for 2 groups only was
232 #col.map = function(g) {if (g==TName) "#FF0000" else "#0000FF"}
233 #pcols = unlist(lapply(group,col.map))
234 gu = unique(group)
235 colours = rainbow(length(gu),start=0.3,end=0.6)
236 pcols = colours[match(group,gu)]
237 nrows = nrow(cmat)
238 mtitle = paste(myTitle,'Heatmap: n contigs =',nrows)
239 if (nrows > nsamp) {
240 cmat = cmat[c(1:nsamp),]
241 mtitle = paste('Heatmap: Top ',nsamp,' DE contigs (of ',nrows,')',sep='')
242 }
243 newcolnames = substr(colnames(cmat),1,20)
244 colnames(cmat) = newcolnames
245 pdf(outpdfname)
246 heatmap(cmat,scale='row',main=mtitle,cexRow=0.3,cexCol=0.4,Rowv=NA,ColSideColors=pcols)
247 dev.off()
248 }
249
250 qqPlot = function(descr='qqplot',pvector, outpdf='qqplot.pdf',...)
251 # stolen from https://gist.github.com/703512
252 {
253 o = -log10(sort(pvector,decreasing=F))
254 e = -log10( 1:length(o)/length(o) )
255 o[o==-Inf] = reallysmall
256 o[o==Inf] = reallybig
257 maint = descr
258 pdf(outpdf)
259 plot(e,o,pch=19,cex=1, main=maint, ...,
260 xlab=expression(Expected~~-log[10](italic(p))),
261 ylab=expression(Observed~~-log[10](italic(p))),
262 xlim=c(0,max(e)), ylim=c(0,max(o)))
263 lines(e,e,col="red")
264 grid(col = "lightgray", lty = "dotted")
265 dev.off()
266 }
267
268 smearPlot = function(DGEList,deTags, outSmear, outMain)
269 {
270 pdf(outSmear)
271 plotSmear(DGEList,de.tags=deTags,main=outMain)
272 grid(col="lightgray", lty="dotted")
273 dev.off()
274 }
275
276 boxPlot = function(rawrs,cleanrs,maint,myTitle,pdfname)
277 { #
278 nc = ncol(rawrs)
279 #### for (i in c(1:nc)) {rawrs[(rawrs[,i] < 0),i] = NA}
280 fullnames = colnames(rawrs)
281 newcolnames = substr(colnames(rawrs),1,20)
282 colnames(rawrs) = newcolnames
283 newcolnames = substr(colnames(cleanrs),1,20)
284 colnames(cleanrs) = newcolnames
285 defpar = par(no.readonly=T)
286 print.noquote('raw contig counts by sample:')
287 print.noquote(summary(rawrs))
288 print.noquote('normalised contig counts by sample:')
289 print.noquote(summary(cleanrs))
290 pdf(pdfname)
291 par(mfrow=c(1,2))
292 boxplot(rawrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('Raw:',maint))
293 grid(col="lightgray",lty="dotted")
294 boxplot(cleanrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('After ',maint))
295 grid(col="lightgray",lty="dotted")
296 dev.off()
297 pdfname = "sample_counts_histogram.pdf"
298 nc = ncol(rawrs)
299 print.noquote(paste('Using ncol rawrs=',nc))
300 ncroot = round(sqrt(nc))
301 if (ncroot*ncroot < nc) { ncroot = ncroot + 1 }
302 m = c()
303 for (i in c(1:nc)) {
304 rhist = hist(rawrs[,i],breaks=100,plot=F)
305 m = append(m,max(rhist\$counts))
306 }
307 ymax = max(m)
308 ncols = length(fullnames)
309 if (ncols > 20)
310 {
311 scale = 7*ncols/20
312 pdf(pdfname,width=scale,height=scale)
313 } else {
314 pdf(pdfname)
315 }
316 par(mfrow=c(ncroot,ncroot))
317 for (i in c(1:nc)) {
318 hist(rawrs[,i], main=paste("Contig logcount",i), xlab='log raw count', col="maroon",
319 breaks=100,sub=fullnames[i],cex=0.8,ylim=c(0,ymax))
320 }
321 dev.off()
322 par(defpar)
323
324 }
325
326 cumPlot = function(rawrs,cleanrs,maint,myTitle)
327 { # updated to use ecdf
328 pdfname = "Filtering_rowsum_bar_charts.pdf"
329 defpar = par(no.readonly=T)
330 lrs = log(rawrs,10)
331 lim = max(lrs)
332 pdf(pdfname)
333 par(mfrow=c(2,1))
334 hist(lrs,breaks=100,main=paste('Before:',maint),xlab="# Reads (log)",
335 ylab="Count",col="maroon",sub=myTitle, xlim=c(0,lim),las=1)
336 grid(col="lightgray", lty="dotted")
337 lrs = log(cleanrs,10)
338 hist(lrs,breaks=100,main=paste('After:',maint),xlab="# Reads (log)",
339 ylab="Count",col="maroon",sub=myTitle,xlim=c(0,lim),las=1)
340 grid(col="lightgray", lty="dotted")
341 dev.off()
342 par(defpar)
343 }
344
345 cumPlot1 = function(rawrs,cleanrs,maint,myTitle)
346 { # updated to use ecdf
347 pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"RowsumCum.pdf",sep='_')
348 pdf(pdfname)
349 par(mfrow=c(2,1))
350 lastx = max(rawrs)
351 rawe = knots(ecdf(rawrs))
352 cleane = knots(ecdf(cleanrs))
353 cy = 1:length(cleane)/length(cleane)
354 ry = 1:length(rawe)/length(rawe)
355 plot(rawe,ry,type='l',main=paste('Before',maint),xlab="Log Contig Total Reads",
356 ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle)
357 grid(col="blue")
358 plot(cleane,cy,type='l',main=paste('After',maint),xlab="Log Contig Total Reads",
359 ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle)
360 grid(col="blue")
361 dev.off()
362 }
363
364
365
366 doGSEA = function(y=NULL,design=NULL,histgmt="",
367 bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
368 ntest=0, myTitle="myTitle", outfname="GSEA.xls", minnin=5, maxnin=2000,fdrthresh=0.05,fdrtype="BH")
369 {
370 sink('Camera.log')
371 genesets = c()
372 if (bigmt > "")
373 {
374 bigenesets = readLines(bigmt)
375 genesets = bigenesets
376 }
377 if (histgmt > "")
378 {
379 hgenesets = readLines(histgmt)
380 if (bigmt > "") {
381 genesets = rbind(genesets,hgenesets)
382 } else {
383 genesets = hgenesets
384 } # use only history if no bi
385 }
386 print.noquote(paste("@@@read",length(genesets), 'genesets from',histgmt,bigmt))
387 genesets = strsplit(genesets,'\t') # tabular. genesetid\tURLorwhatever\tgene_1\t..\tgene_n
388 outf = outfname
389 head=paste(myTitle,'edgeR GSEA')
390 write(head,file=outfname,append=F)
391 ntest=length(genesets)
392 urownames = toupper(rownames(y))
393 upcam = c()
394 downcam = c()
395 for (i in 1:ntest) {
396 gs = unlist(genesets[i])
397 g = gs[1] # geneset_id
398 u = gs[2]
399 if (u > "") { u = paste("<a href=\'",u,"\'>",u,"</a>",sep="") }
400 glist = gs[3:length(gs)] # member gene symbols
401 glist = toupper(glist)
402 inglist = urownames %in% glist
403 nin = sum(inglist)
404 if ((nin > minnin) && (nin < maxnin)) {
405 ### print(paste('@@found',sum(inglist),'genes in glist'))
406 camres = camera(y=y,index=inglist,design=design)
407 if (! is.null(camres)) {
408 rownames(camres) = g # gene set name
409 camres = cbind(GeneSet=g,URL=u,camres)
410 if (camres\$Direction == "Up")
411 {
412 upcam = rbind(upcam,camres) } else {
413 downcam = rbind(downcam,camres)
414 }
415 }
416 }
417 }
418 uscam = upcam[order(upcam\$PValue),]
419 unadjp = uscam\$PValue
420 uscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
421 nup = max(10,sum((uscam\$adjPValue < fdrthresh)))
422 dscam = downcam[order(downcam\$PValue),]
423 unadjp = dscam\$PValue
424 dscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
425 ndown = max(10,sum((dscam\$adjPValue < fdrthresh)))
426 write.table(uscam,file=paste('camera_up',outfname,sep='_'),quote=F,sep='\t',row.names=F)
427 write.table(dscam,file=paste('camera_down',outfname,sep='_'),quote=F,sep='\t',row.names=F)
428 print.noquote(paste('@@@@@ Camera up top',nup,'gene sets:'))
429 write.table(head(uscam,nup),file="",quote=F,sep='\t',row.names=F)
430 print.noquote(paste('@@@@@ Camera down top',ndown,'gene sets:'))
431 write.table(head(dscam,ndown),file="",quote=F,sep='\t',row.names=F)
432 sink()
433 }
434
435
436
437
438 doGSEAatonce = function(y=NULL,design=NULL,histgmt="",
439 bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
440 ntest=0, myTitle="myTitle", outfname="GSEA.xls", minnin=5, maxnin=2000,fdrthresh=0.05,fdrtype="BH")
441 {
442 sink('Camera.log')
443 genesets = c()
444 if (bigmt > "")
445 {
446 bigenesets = readLines(bigmt)
447 genesets = bigenesets
448 }
449 if (histgmt > "")
450 {
451 hgenesets = readLines(histgmt)
452 if (bigmt > "") {
453 genesets = rbind(genesets,hgenesets)
454 } else {
455 genesets = hgenesets
456 } # use only history if no bi
457 }
458 print.noquote(paste("@@@read",length(genesets), 'genesets from',histgmt,bigmt))
459 genesets = strsplit(genesets,'\t') # tabular. genesetid\tURLorwhatever\tgene_1\t..\tgene_n
460 outf = outfname
461 head=paste(myTitle,'edgeR GSEA')
462 write(head,file=outfname,append=F)
463 ntest=length(genesets)
464 urownames = toupper(rownames(y))
465 upcam = c()
466 downcam = c()
467 incam = c()
468 urls = c()
469 gsids = c()
470 for (i in 1:ntest) {
471 gs = unlist(genesets[i])
472 gsid = gs[1] # geneset_id
473 url = gs[2]
474 if (url > "") { url = paste("<a href=\'",url,"\'>",url,"</a>",sep="") }
475 glist = gs[3:length(gs)] # member gene symbols
476 glist = toupper(glist)
477 inglist = urownames %in% glist
478 nin = sum(inglist)
479 if ((nin > minnin) && (nin < maxnin)) {
480 incam = c(incam,inglist)
481 gsids = c(gsids,gsid)
482 urls = c(urls,url)
483 }
484 }
485 incam = as.list(incam)
486 names(incam) = gsids
487 allcam = camera(y=y,index=incam,design=design)
488 allcamres = cbind(geneset=gsids,allcam,URL=urls)
489 for (i in 1:ntest) {
490 camres = allcamres[i]
491 res = try(test = (camres\$Direction == "Up"))
492 if ("try-error" %in% class(res)) {
493 cat("test failed, camres = :")
494 print.noquote(camres)
495 } else { if (camres\$Direction == "Up")
496 { upcam = rbind(upcam,camres)
497 } else { downcam = rbind(downcam,camres)
498 }
499
500 }
501 }
502 uscam = upcam[order(upcam\$PValue),]
503 unadjp = uscam\$PValue
504 uscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
505 nup = max(10,sum((uscam\$adjPValue < fdrthresh)))
506 dscam = downcam[order(downcam\$PValue),]
507 unadjp = dscam\$PValue
508 dscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
509 ndown = max(10,sum((dscam\$adjPValue < fdrthresh)))
510 write.table(uscam,file=paste('camera_up',outfname,sep='_'),quote=F,sep='\t',row.names=F)
511 write.table(dscam,file=paste('camera_down',outfname,sep='_'),quote=F,sep='\t',row.names=F)
512 print.noquote(paste('@@@@@ Camera up top',nup,'gene sets:'))
513 write.table(head(uscam,nup),file="",quote=F,sep='\t',row.names=F)
514 print.noquote(paste('@@@@@ Camera down top',ndown,'gene sets:'))
515 write.table(head(dscam,ndown),file="",quote=F,sep='\t',row.names=F)
516 sink()
517 }
518
519
520 edgeIt = function (Count_Matrix=c(),group=c(),out_edgeR=F,out_VOOM=F,out_DESeq2=F,fdrtype='fdr',priordf=5,
521 fdrthresh=0.05,outputdir='.', myTitle='Differential Counts',libSize=c(),useNDF=F,
522 filterquantile=0.2, subjects=c(),mydesign=NULL,
523 doDESeq2=T,doVoom=T,doCamera=T,doedgeR=T,org='hg19',
524 histgmt="", bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
525 doCook=F,DESeq_fitType="parameteric")
526 {
527 # Error handling
528 if (length(unique(group))!=2){
529 print("Number of conditions identified in experiment does not equal 2")
530 q()
531 }
532 require(edgeR)
533 options(width = 512)
534 mt = paste(unlist(strsplit(myTitle,'_')),collapse=" ")
535 allN = nrow(Count_Matrix)
536 nscut = round(ncol(Count_Matrix)/2)
537 colTotmillionreads = colSums(Count_Matrix)/1e6
538 counts.dataframe = as.data.frame(c())
539 rawrs = rowSums(Count_Matrix)
540 nonzerod = Count_Matrix[(rawrs > 0),] # remove all zero count genes
541 nzN = nrow(nonzerod)
542 nzrs = rowSums(nonzerod)
543 zN = allN - nzN
544 print('# Quantiles for non-zero row counts:',quote=F)
545 print(quantile(nzrs,probs=seq(0,1,0.1)),quote=F)
546 if (useNDF == T)
547 {
548 gt1rpin3 = rowSums(Count_Matrix/expandAsMatrix(colTotmillionreads,dim(Count_Matrix)) >= 1) >= nscut
549 lo = colSums(Count_Matrix[!gt1rpin3,])
550 workCM = Count_Matrix[gt1rpin3,]
551 cleanrs = rowSums(workCM)
552 cleanN = length(cleanrs)
553 meth = paste( "After removing",length(lo),"contigs with fewer than ",nscut," sample read counts >= 1 per million, there are",sep="")
554 print(paste("Read",allN,"contigs. Removed",zN,"contigs with no reads.",meth,cleanN,"contigs"),quote=F)
555 maint = paste('Filter >=1/million reads in >=',nscut,'samples')
556 } else {
557 useme = (nzrs > quantile(nzrs,filterquantile))
558 workCM = nonzerod[useme,]
559 lo = colSums(nonzerod[!useme,])
560 cleanrs = rowSums(workCM)
561 cleanN = length(cleanrs)
562 meth = paste("After filtering at count quantile =",filterquantile,", there are",sep="")
563 print(paste('Read',allN,"contigs. Removed",zN,"with no reads.",meth,cleanN,"contigs"),quote=F)
564 maint = paste('Filter below',filterquantile,'quantile')
565 }
566 cumPlot(rawrs=rawrs,cleanrs=cleanrs,maint=maint,myTitle=myTitle)
567 allgenes = rownames(workCM)
568 reg = "^chr([0-9]+):([0-9]+)-([0-9]+)"
569 genecards="<a href=\'http://www.genecards.org/index.php?path=/Search/keyword/"
570 ucsc = paste("<a href=\'http://genome.ucsc.edu/cgi-bin/hgTracks?db=",org,sep='')
571 testreg = str_match(allgenes,reg)
572 if (sum(!is.na(testreg[,1]))/length(testreg[,1]) > 0.8) # is ucsc style string
573 {
574 print("@@ using ucsc substitution for urls")
575 contigurls = paste0(ucsc,"&amp;position=chr",testreg[,2],":",testreg[,3],"-",testreg[,4],"\'>",allgenes,"</a>")
576 } else {
577 print.noquote("@@ using genecards substitution for urls")
578 contigurls = paste0(genecards,allgenes,"\'>",allgenes,"</a>")
579 }
580 print(paste("# Total low count contigs per sample = ",paste(lo,collapse=',')),quote=F)
581 cmrowsums = rowSums(workCM)
582 TName=unique(group)[1]
583 CName=unique(group)[2]
584 if (is.null(mydesign)) {
585 if (length(subjects) == 0)
586 {
587 mydesign = model.matrix(~group)
588 }
589 else {
590 subjf = factor(subjects)
591 mydesign = model.matrix(~subjf+group) # we block on subject so make group last to simplify finding it
592 }
593 }
594 print.noquote(paste('Using samples:',paste(colnames(workCM),collapse=',')))
595 print.noquote('Using design matrix:')
596 print.noquote(mydesign)
597 if (doedgeR) {
598 sink('edgeR.log')
599 #### Setup DGEList object
600 DGEList = DGEList(counts=workCM, group = group)
601 DGEList = calcNormFactors(DGEList)
602
603 DGEList = estimateGLMCommonDisp(DGEList,mydesign)
604 comdisp = DGEList\$common.dispersion
605 DGEList = estimateGLMTrendedDisp(DGEList,mydesign)
606 if (edgeR_priordf > 0) {
607 print.noquote(paste("prior.df =",edgeR_priordf))
608 DGEList = estimateGLMTagwiseDisp(DGEList,mydesign,prior.df = edgeR_priordf)
609 } else {
610 DGEList = estimateGLMTagwiseDisp(DGEList,mydesign)
611 }
612 DGLM = glmFit(DGEList,design=mydesign)
613 DE = glmLRT(DGLM,coef=ncol(DGLM\$design)) # always last one - subject is first if needed
614 efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors
615 normData = (1e+06*DGEList\$counts/efflib)
616 uoutput = cbind(
617 Name=as.character(rownames(DGEList\$counts)),
618 DE\$table,
619 adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
620 Dispersion=DGEList\$tagwise.dispersion,totreads=cmrowsums,normData,
621 DGEList\$counts
622 )
623 soutput = uoutput[order(DE\$table\$PValue),] # sorted into p value order - for quick toptable
624 goodness = gof(DGLM, pcutoff=fdrthresh)
625 if (sum(goodness\$outlier) > 0) {
626 print.noquote('GLM outliers:')
627 print(paste(rownames(DGLM)[(goodness\$outlier)],collapse=','),quote=F)
628 } else {
629 print('No GLM fit outlier genes found\n')
630 }
631 z = limma::zscoreGamma(goodness\$gof.statistic, shape=goodness\$df/2, scale=2)
632 pdf("edgeR_GoodnessofFit.pdf")
633 qq = qqnorm(z, panel.first=grid(), main="tagwise dispersion")
634 abline(0,1,lwd=3)
635 points(qq\$x[goodness\$outlier],qq\$y[goodness\$outlier], pch=16, col="maroon")
636 dev.off()
637 estpriorn = getPriorN(DGEList)
638 print(paste("Common Dispersion =",comdisp,"CV = ",sqrt(comdisp),"getPriorN = ",estpriorn),quote=F)
639 efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors
640 normData = ((1e+06*DGEList\$counts)+1e-7)/efflib)
641 lnormData = log(normData,10)
642 uniqueg = unique(group)
643 #### Plot MDS
644 sample_colors = match(group,levels(group))
645 sampleTypes = levels(factor(group))
646 print.noquote(sampleTypes)
647 pdf("edgeR_MDSplot.pdf")
648 plotMDS.DGEList(DGEList,main=paste("edgeR MDS for",myTitle),cex=0.5,col=sample_colors,pch=sample_colors)
649 legend(x="topleft", legend = sampleTypes,col=c(1:length(sampleTypes)), pch=19)
650 grid(col="blue")
651 dev.off()
652 colnames(normData) = paste( colnames(normData),'N',sep="_")
653 print(paste('Raw sample read totals',paste(colSums(nonzerod,na.rm=T),collapse=',')))
654 nzd = data.frame(log(nonzerod + 1e-2,10))
655 try( boxPlot(rawrs=nzd,cleanrs=lnormData,maint='TMM Normalisation',myTitle=myTitle,pdfname="edgeR_raw_norm_counts_box.pdf") )
656 write.table(soutput,file=out_edgeR, quote=FALSE, sep="\t",row.names=F)
657 tt = cbind(
658 Name=as.character(rownames(DGEList\$counts)),
659 DE\$table,
660 adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
661 Dispersion=DGEList\$tagwise.dispersion,totreads=cmrowsums
662 )
663 print.noquote("# edgeR Top tags\n")
664 tt = cbind(tt,URL=contigurls) # add to end so table isn't laid out strangely
665 tt = tt[order(DE\$table\$PValue),]
666 print.noquote(tt[1:50,])
667 deTags = rownames(uoutput[uoutput\$adj.p.value < fdrthresh,])
668 nsig = length(deTags)
669 print(paste('#',nsig,'tags significant at adj p=',fdrthresh),quote=F)
670 deColours = ifelse(deTags,'red','black')
671 pdf("edgeR_BCV_vs_abundance.pdf")
672 plotBCV(DGEList, cex=0.3, main="Biological CV vs abundance",col.tagwise=deColours)
673 dev.off()
674 dg = DGEList[order(DE\$table\$PValue),]
675 #normData = (1e+06 * dg\$counts/expandAsMatrix(dg\$samples\$lib.size, dim(dg)))
676 efflib = dg\$samples\$lib.size*dg\$samples\$norm.factors
677 normData = (1e+06*dg\$counts/efflib)
678 outpdfname="edgeR_top_100_heatmap.pdf"
679 hmap2(normData,nsamp=100,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste('edgeR Heatmap',myTitle))
680 outSmear = "edgeR_smearplot.pdf"
681 outMain = paste("Smear Plot for ",TName,' Vs ',CName,' (FDR@',fdrthresh,' N = ',nsig,')',sep='')
682 smearPlot(DGEList=DGEList,deTags=deTags, outSmear=outSmear, outMain = outMain)
683 qqPlot(descr=paste(myTitle,'edgeR adj p QQ plot'),pvector=tt\$adj.p.value,outpdf='edgeR_qqplot.pdf')
684 norm.factor = DGEList\$samples\$norm.factors
685 topresults.edgeR = soutput[which(soutput\$adj.p.value < fdrthresh), ]
686 edgeRcountsindex = which(allgenes %in% rownames(topresults.edgeR))
687 edgeRcounts = rep(0, length(allgenes))
688 edgeRcounts[edgeRcountsindex] = 1 # Create venn diagram of hits
689 sink()
690 } ### doedgeR
691 if (doDESeq2 == T)
692 {
693 sink("DESeq2.log")
694 # DESeq2
695 require('DESeq2')
696 library('RColorBrewer')
697 if (length(subjects) == 0)
698 {
699 pdata = data.frame(Name=colnames(workCM),Rx=group,row.names=colnames(workCM))
700 deSEQds = DESeqDataSetFromMatrix(countData = workCM, colData = pdata, design = formula(~ Rx))
701 } else {
702 pdata = data.frame(Name=colnames(workCM),Rx=group,subjects=subjects,row.names=colnames(workCM))
703 deSEQds = DESeqDataSetFromMatrix(countData = workCM, colData = pdata, design = formula(~ subjects + Rx))
704 }
705 #DESeq2 = DESeq(deSEQds,fitType='local',pAdjustMethod=fdrtype)
706 #rDESeq = results(DESeq2)
707 #newCountDataSet(workCM, group)
708 deSeqDatsizefac = estimateSizeFactors(deSEQds)
709 deSeqDatdisp = estimateDispersions(deSeqDatsizefac,fitType=DESeq_fitType)
710 resDESeq = nbinomWaldTest(deSeqDatdisp, pAdjustMethod=fdrtype)
711 rDESeq = as.data.frame(results(resDESeq))
712 rDESeq = cbind(Contig=rownames(workCM),rDESeq,NReads=cmrowsums)
713 srDESeq = rDESeq[order(rDESeq\$pvalue),]
714 write.table(srDESeq,file=out_DESeq2, quote=FALSE, sep="\t",row.names=F)
715 qqPlot(descr=paste(myTitle,'DESeq2 adj p qq plot'),pvector=rDESeq\$padj,outpdf='DESeq2_qqplot.pdf')
716 cat("# DESeq top 50\n")
717 rDESeq = cbind(Contig=rownames(workCM),rDESeq,NReads=cmrowsums,URL=contigurls)
718 srDESeq = rDESeq[order(rDESeq\$pvalue),]
719 print.noquote(srDESeq[1:50,])
720 topresults.DESeq = rDESeq[which(rDESeq\$padj < fdrthresh), ]
721 DESeqcountsindex = which(allgenes %in% rownames(topresults.DESeq))
722 DESeqcounts = rep(0, length(allgenes))
723 DESeqcounts[DESeqcountsindex] = 1
724 pdf("DESeq2_dispersion_estimates.pdf")
725 plotDispEsts(resDESeq)
726 dev.off()
727 ysmall = abs(min(rDESeq\$log2FoldChange))
728 ybig = abs(max(rDESeq\$log2FoldChange))
729 ylimit = min(4,ysmall,ybig)
730 pdf("DESeq2_MA_plot.pdf")
731 plotMA(resDESeq,main=paste(myTitle,"DESeq2 MA plot"),ylim=c(-ylimit,ylimit))
732 dev.off()
733 rlogres = rlogTransformation(resDESeq)
734 sampledists = dist( t( assay(rlogres) ) )
735 sdmat = as.matrix(sampledists)
736 pdf("DESeq2_sample_distance_plot.pdf")
737 heatmap.2(sdmat,trace="none",main=paste(myTitle,"DESeq2 sample distances"),
738 col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(255))
739 dev.off()
740 ###outpdfname="DESeq2_top50_heatmap.pdf"
741 ###hmap2(sresDESeq,nsamp=50,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste('DESeq2 vst rlog Heatmap',myTitle))
742 sink()
743 result = try( (ppca = plotPCA( varianceStabilizingTransformation(deSeqDatdisp,blind=T), intgroup=c("Rx","Name")) ) )
744 if ("try-error" %in% class(result)) {
745 print.noquote('DESeq2 plotPCA failed.')
746 } else {
747 pdf("DESeq2_PCA_plot.pdf")
748 #### wtf - print? Seems needed to get this to work
749 print(ppca)
750 dev.off()
751 }
752 }
753
754 if (doVoom == T) {
755 sink('Voom.log')
756 if (doedgeR == F) {
757 #### Setup DGEList object
758 DGEList = DGEList(counts=workCM, group = group)
759 DGEList = calcNormFactors(DGEList)
760 DGEList = estimateGLMCommonDisp(DGEList,mydesign)
761 DGEList = estimateGLMTrendedDisp(DGEList,mydesign)
762 DGEList = estimateGLMTagwiseDisp(DGEList,mydesign)
763 DGEList = estimateGLMTagwiseDisp(DGEList,mydesign)
764 norm.factor = DGEList\$samples\$norm.factors
765 }
766 pdf("Voom_mean_variance_plot.pdf")
767 dat.voomed = voom(DGEList, mydesign, plot = TRUE, lib.size = colSums(workCM) * norm.factor)
768 dev.off()
769 # Use limma to fit data
770 fit = lmFit(dat.voomed, mydesign)
771 fit = eBayes(fit)
772 rvoom = topTable(fit, coef = length(colnames(mydesign)), adj = fdrtype, n = Inf, sort="none")
773 qqPlot(descr=paste(myTitle,'Voom-limma adj p QQ plot'),pvector=rvoom\$adj.P.Val,outpdf='Voom_qqplot.pdf')
774 rownames(rvoom) = rownames(workCM)
775 rvoom = cbind(rvoom,NReads=cmrowsums)
776 srvoom = rvoom[order(rvoom\$P.Value),]
777 write.table(srvoom,file=out_VOOM, quote=FALSE, sep="\t",row.names=F)
778 rvoom = cbind(rvoom,URL=contigurls)
779 deTags = rownames(rvoom[rvoom\$adj.p.value < fdrthresh,])
780 srvoom = rvoom[order(rvoom\$P.Value),]
781 cat("# Voom top 50\n")
782 print(srvoom[1:50,])
783 normData = srvoom\$E
784 outpdfname="VOOM_top_100_heatmap.pdf"
785 hmap2(normData,nsamp=100,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste('VOOM Heatmap',myTitle))
786 outSmear = "VOOM_smearplot.pdf"
787 outMain = paste("Smear Plot for ",TName,' Vs ',CName,' (FDR@',fdrthresh,' N = ',nsig,')',sep='')
788 smearPlot(DGEList=rvoom,deTags=deTags, outSmear=outSmear, outMain = outMain)
789 qqPlot(descr=paste(myTitle,'edgeR adj p QQ plot'),pvector=tt\$adj.p.value,outpdf='VOOM_qqplot.pdf')
790 # Use an FDR cutoff to find interesting samples for edgeR, DESeq and voom/limma
791 topresults.voom = rvoom[which(rvoom\$adj.P.Val < fdrthresh), ]
792 voomcountsindex = which(allgenes %in% topresults.voom\$ID)
793 voomcounts = rep(0, length(allgenes))
794 voomcounts[voomcountsindex] = 1
795 sink()
796 }
797
798 if (doCamera) {
799 doGSEA(y=DGEList,design=mydesign,histgmt=histgmt,bigmt=bigmt,ntest=20,myTitle=myTitle,
800 outfname=paste(mt,"GSEA.xls",sep="_"),fdrthresh=fdrthresh,fdrtype=fdrtype)
801 }
802
803 if ((doDESeq2==T) || (doVoom==T) || (doedgeR==T)) {
804 if ((doVoom==T) && (doDESeq2==T) && (doedgeR==T)) {
805 vennmain = paste(mt,'Voom,edgeR and DESeq2 overlap at FDR=',fdrthresh)
806 counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts,
807 VOOM_limma = voomcounts, row.names = allgenes)
808 } else if ((doDESeq2==T) && (doedgeR==T)) {
809 vennmain = paste(mt,'DESeq2 and edgeR overlap at FDR=',fdrthresh)
810 counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts, row.names = allgenes)
811 } else if ((doVoom==T) && (doedgeR==T)) {
812 vennmain = paste(mt,'Voom and edgeR overlap at FDR=',fdrthresh)
813 counts.dataframe = data.frame(edgeR = edgeRcounts, VOOM_limma = voomcounts, row.names = allgenes)
814 }
815
816 if (nrow(counts.dataframe > 1)) {
817 counts.venn = vennCounts(counts.dataframe)
818 vennf = "Venn_significant_genes_overlap.pdf"
819 pdf(vennf)
820 vennDiagram(counts.venn,main=vennmain,col="maroon")
821 dev.off()
822 }
823 } #### doDESeq2 or doVoom
824
825 }
826 #### Done
827
828 ###sink(stdout(),append=T,type="message")
829 builtin_gmt = ""
830 history_gmt = ""
831 history_gmt_name = ""
832 out_edgeR = F
833 out_DESeq2 = F
834 out_VOOM = "$out_VOOM"
835 doDESeq2 = $DESeq2.doDESeq2 # make these T or F
836 doVoom = $doVoom
837 doCamera = F
838 doedgeR = $edgeR.doedgeR
839 edgeR_priordf = 0
840
841
842 #if $doVoom == "T":
843 out_VOOM = "$out_VOOM"
844 #end if
845
846 #if $DESeq2.doDESeq2 == "T":
847 out_DESeq2 = "$out_DESeq2"
848 DESeq_fitType = "$DESeq2.DESeq_fitType"
849 #end if
850
851 #if $edgeR.doedgeR == "T":
852 out_edgeR = "$out_edgeR"
853 edgeR_priordf = $edgeR.edgeR_priordf
854 #end if
855
856
857 if (sum(c(doedgeR,doVoom,doDESeq2)) == 0)
858 {
859 write("No methods chosen - nothing to do! Please try again after choosing one or more methods", stderr())
860 quit(save="no",status=2)
861 }
862
863 Out_Dir = "$html_file.files_path"
864 Input = "$input1"
865 TreatmentName = "$treatment_name"
866 TreatmentCols = "$Treat_cols"
867 ControlName = "$control_name"
868 ControlCols= "$Control_cols"
869 org = "$input1.dbkey"
870 if (org == "") { org = "hg19"}
871 fdrtype = "$fdrtype"
872 fdrthresh = $fdrthresh
873 useNDF = $useNDF
874 fQ = $fQ # non-differential centile cutoff
875 myTitle = "$title"
876 sids = strsplit("$subjectids",',')
877 subjects = unlist(sids)
878 nsubj = length(subjects)
879 TCols = as.numeric(strsplit(TreatmentCols,",")[[1]])-1
880 CCols = as.numeric(strsplit(ControlCols,",")[[1]])-1
881 cat('Got TCols=')
882 cat(TCols)
883 cat('; CCols=')
884 cat(CCols)
885 cat('\n')
886 useCols = c(TCols,CCols)
887 if (file.exists(Out_Dir) == F) dir.create(Out_Dir)
888 Count_Matrix = read.table(Input,header=T,row.names=1,sep='\t') #Load tab file assume header
889 snames = colnames(Count_Matrix)
890 nsamples = length(snames)
891 if (nsubj > 0 & nsubj != nsamples) {
892 options("show.error.messages"=T)
893 mess = paste('Fatal error: Supplied subject id list',paste(subjects,collapse=','),
894 'has length',nsubj,'but there are',nsamples,'samples',paste(snames,collapse=','))
895 write(mess, stderr())
896 quit(save="no",status=4)
897 }
898 if (length(subjects) != 0) {subjects = subjects[useCols]}
899 Count_Matrix = Count_Matrix[,useCols] ### reorder columns
900 rn = rownames(Count_Matrix)
901 islib = rn %in% c('librarySize','NotInBedRegions')
902 LibSizes = Count_Matrix[subset(rn,islib),][1] # take first
903 Count_Matrix = Count_Matrix[subset(rn,! islib),]
904 group = c(rep(TreatmentName,length(TCols)), rep(ControlName,length(CCols)) ) #Build a group descriptor
905 group = factor(group, levels=c(ControlName,TreatmentName))
906 colnames(Count_Matrix) = paste(group,colnames(Count_Matrix),sep="_") #Relable columns
907 results = edgeIt(Count_Matrix=Count_Matrix,group=group, out_edgeR=out_edgeR, out_VOOM=out_VOOM, out_DESeq2=out_DESeq2,
908 fdrtype='BH',mydesign=NULL,priordf=edgeR_priordf,fdrthresh=fdrthresh,outputdir='.',
909 myTitle=myTitle,useNDF=F,libSize=c(),filterquantile=fQ,subjects=subjects,
910 doDESeq2=doDESeq2,doVoom=doVoom,doCamera=doCamera,doedgeR=doedgeR,org=org,
911 histgmt=history_gmt,bigmt=builtin_gmt,DESeq_fitType=DESeq_fitType)
912 sessionInfo()
913 ]]>
914 </configfile>
915 </configfiles>
916 <help>
917
918 **What it does**
919
920 Allows short read sequence counts from controlled experiments to be analysed for differentially expressed genes.
921 Optionally adds a term for subject if not all samples are independent or if some other factor needs to be blocked in the design.
922
923 **Input**
924
925 Requires a count matrix as a tabular file. These are best made using the companion HTSeq_ based counter Galaxy wrapper
926 and your fave gene model to generate inputs. Each row is a genomic feature (gene or exon eg) and each column the
927 non-negative integer count of reads from one sample overlapping the feature.
928 The matrix must have a header row uniquely identifying the source samples, and unique row names in
929 the first column. Typically the row names are gene symbols or probe ids for downstream use in GSEA and other methods.
930
931 **Specifying comparisons**
932
933 This is basically dumbed down for two factors - case vs control.
934
935 More complex interfaces are possible but painful at present.
936 Probably need to specify a phenotype file to do this better.
937 Work in progress. Send code.
938
939 If you have (eg) paired samples and wish to include a term in the GLM to account for some other factor (subject in the case of paired samples),
940 put a comma separated list of indicators for every sample (whether modelled or not!) indicating (eg) the subject number or
941 A list of integers, one for each subject or an empty string if samples are all independent.
942 If not empty, there must be exactly as many integers in the supplied integer list as there are columns (samples) in the count matrix.
943 Integers for samples that are not in the analysis *must* be present in the string as filler even if not used.
944
945 So if you have 2 pairs out of 6 samples, you need to put in unique integers for the unpaired ones
946 eg if you had 6 samples with the first two independent but the second and third pairs each being from independent subjects. you might use
947 8,9,1,1,2,2
948 as subject IDs to indicate two paired samples from the same subject in columns 3/4 and 5/6
949
950 **Methods available**
951
952 You can run 3 popular Bioconductor packages available for count data.
953
954 edgeR - see edgeR_ for details
955
956 VOOM/limma - see limma_VOOM_ for details
957
958 DESeq2 - see DESeq2_ for details
959
960 and optionally camera in edgeR which works better if MSigDB is installed.
961
962 **Outputs**
963
964 Some helpful plots and analysis results. Note that most of these are produced using R code
965 suggested by the excellent documentation and vignettes for the Bioconductor
966 packages invoked. The Tool Factory is used to automatically lay these out for you to enjoy.
967
968 **Note on Voom**
969
970 The voom from limma version 3.16.6 help in R includes this from the authors - but you should read the paper to interpret this method.
971
972 This function is intended to process RNA-Seq or ChIP-Seq data prior to linear modelling in limma.
973
974 voom is an acronym for mean-variance modelling at the observational level.
975 The key concern is to estimate the mean-variance relationship in the data, then use this to compute appropriate weights for each observation.
976 Count data almost show non-trivial mean-variance relationships. Raw counts show increasing variance with increasing count size, while log-counts typically show a decreasing mean-variance trend.
977 This function estimates the mean-variance trend for log-counts, then assigns a weight to each observation based on its predicted variance.
978 The weights are then used in the linear modelling process to adjust for heteroscedasticity.
979
980 In an experiment, a count value is observed for each tag in each sample. A tag-wise mean-variance trend is computed using lowess.
981 The tag-wise mean is the mean log2 count with an offset of 0.5, across samples for a given tag.
982 The tag-wise variance is the quarter-root-variance of normalized log2 counts per million values with an offset of 0.5, across samples for a given tag.
983 Tags with zero counts across all samples are not included in the lowess fit. Optional normalization is performed using normalizeBetweenArrays.
984 Using fitted values of log2 counts from a linear model fit by lmFit, variances from the mean-variance trend were interpolated for each observation.
985 This was carried out by approxfun. Inverse variance weights can be used to correct for mean-variance trend in the count data.
986
987
988 Author(s)
989
990 Charity Law and Gordon Smyth
991
992 References
993
994 Law, CW (2013). Precision weights for gene expression analysis. PhD Thesis. University of Melbourne, Australia.
995
996 Law, CW, Chen, Y, Shi, W, Smyth, GK (2013). Voom! Precision weights unlock linear model analysis tools for RNA-seq read counts.
997 Technical Report 1 May 2013, Bioinformatics Division, Walter and Eliza Hall Institute of Medical Reseach, Melbourne, Australia.
998 http://www.statsci.org/smyth/pubs/VoomPreprint.pdf
999
1000 See Also
1001
1002 A voom case study is given in the edgeR User's Guide.
1003
1004 vooma is a similar function but for microarrays instead of RNA-seq.
1005
1006
1007 ***old rant on changes to Bioconductor package variable names between versions***
1008
1009 The edgeR authors made a small cosmetic change in the name of one important variable (from p.value to PValue)
1010 breaking this and all other code that assumed the old name for this variable,
1011 between edgeR2.4.4 and 2.4.6 (the version for R 2.14 as at the time of writing).
1012 This means that all code using edgeR is sensitive to the version. I think this was a very unwise thing
1013 to do because it wasted hours of my time to track down and will similarly cost other edgeR users dearly
1014 when their old scripts break. This tool currently now works with 2.4.6.
1015
1016 **Note on prior.N**
1017
1018 http://seqanswers.com/forums/showthread.php?t=5591 says:
1019
1020 *prior.n*
1021
1022 The value for prior.n determines the amount of smoothing of tagwise dispersions towards the common dispersion.
1023 You can think of it as like a "weight" for the common value. (It is actually the weight for the common likelihood
1024 in the weighted likelihood equation). The larger the value for prior.n, the more smoothing, i.e. the closer your
1025 tagwise dispersion estimates will be to the common dispersion. If you use a prior.n of 1, then that gives the
1026 common likelihood the weight of one observation.
1027
1028 In answer to your question, it is a good thing to squeeze the tagwise dispersions towards a common value,
1029 or else you will be using very unreliable estimates of the dispersion. I would not recommend using the value that
1030 you obtained from estimateSmoothing()---this is far too small and would result in virtually no moderation
1031 (squeezing) of the tagwise dispersions. How many samples do you have in your experiment?
1032 What is the experimental design? If you have few samples (less than 6) then I would suggest a prior.n of at least 10.
1033 If you have more samples, then the tagwise dispersion estimates will be more reliable,
1034 so you could consider using a smaller prior.n, although I would hesitate to use a prior.n less than 5.
1035
1036
1037 From Bioconductor Digest, Vol 118, Issue 5, Gordon writes:
1038
1039 Dear Dorota,
1040
1041 The important settings are prior.df and trend.
1042
1043 prior.n and prior.df are related through prior.df = prior.n * residual.df,
1044 and your experiment has residual.df = 36 - 12 = 24. So the old setting of
1045 prior.n=10 is equivalent for your data to prior.df = 240, a very large
1046 value. Going the other way, the new setting of prior.df=10 is equivalent
1047 to prior.n=10/24.
1048
1049 To recover old results with the current software you would use
1050
1051 estimateTagwiseDisp(object, prior.df=240, trend="none")
1052
1053 To get the new default from old software you would use
1054
1055 estimateTagwiseDisp(object, prior.n=10/24, trend=TRUE)
1056
1057 Actually the old trend method is equivalent to trend="loess" in the new
1058 software. You should use plotBCV(object) to see whether a trend is
1059 required.
1060
1061 Note you could also use
1062
1063 prior.n = getPriorN(object, prior.df=10)
1064
1065 to map between prior.df and prior.n.
1066
1067 ----
1068
1069 **Attributions**
1070
1071 edgeR - edgeR_
1072
1073 VOOM/limma - limma_VOOM_
1074
1075 DESeq2 - DESeq2_ for details
1076
1077 See above for Bioconductor package documentation for packages exposed in Galaxy by this tool and app store package.
1078
1079 Galaxy_ (that's what you are using right now!) for gluing everything together
1080
1081 Otherwise, all code and documentation comprising this tool was written by Ross Lazarus and is
1082 licensed to you under the LGPL_ like other rgenetics artefacts
1083
1084 .. _LGPL: http://www.gnu.org/copyleft/lesser.html
1085 .. _HTSeq: http://www-huber.embl.de/users/anders/HTSeq/doc/index.html
1086 .. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html
1087 .. _DESeq2: http://www.bioconductor.org/packages/release/bioc/html/DESeq2.html
1088 .. _limma_VOOM: http://www.bioconductor.org/packages/release/bioc/html/limma.html
1089 .. _Galaxy: http://getgalaxy.org
1090 </help>
1091
1092 </tool>
1093
1094