comparison rgedgeRpaired_nocamera.xml @ 108:96ebf676f6c0 draft

consolidated from baker version. includes anscombe robust fit option. Heatmap is wrong still and the r packages won't export with all their dependencies so testing is painful
author fubar
date Sun, 06 Jul 2014 02:36:47 -0400
parents
children d7e2a0c0cce9
comparison
equal deleted inserted replaced
107:d6d45ba6f9d8 108:96ebf676f6c0
1 <tool id="rgDifferentialCount" name="Differential_Count" version="0.25">
2 <description>models using BioConductor packages</description>
3 <requirements>
4 <requirement type="package" version="2.14">biocbasics</requirement>
5 <requirement type="package" version="3.0.3">r303</requirement>
6 <requirement type="package" version="1.3.18">graphicsmagick</requirement>
7 <requirement type="package" version="9.10">ghostscript</requirement>
8 </requirements>
9
10 <command interpreter="python">
11 rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "DifferentialCounts"
12 --output_dir "$html_file.files_path" --output_html "$html_file" --make_HTML "yes"
13 </command>
14 <inputs>
15 <param name="input1" type="data" format="tabular" label="Select an input matrix - rows are contigs, columns are counts for each sample"
16 help="Use the HTSeq based count matrix preparation tool to create these matrices from BAM/SAM files and a GTF file of genomic features"/>
17 <param name="title" type="text" value="Differential Counts" size="80" label="Title for job outputs"
18 help="Supply a meaningful name here to remind you what the outputs contain">
19 <sanitizer invalid_char="">
20 <valid initial="string.letters,string.digits"><add value="_" /> </valid>
21 </sanitizer>
22 </param>
23 <param name="treatment_name" type="text" value="Treatment" size="50" label="Treatment Name"/>
24 <param name="Treat_cols" label="Select columns containing treatment." type="data_column" data_ref="input1" numerical="True"
25 multiple="true" use_header_names="true" size="120" display="checkboxes">
26 <validator type="no_options" message="Please select at least one column."/>
27 </param>
28 <param name="control_name" type="text" value="Control" size="50" label="Control Name"/>
29 <param name="Control_cols" label="Select columns containing control." type="data_column" data_ref="input1" numerical="True"
30 multiple="true" use_header_names="true" size="120" display="checkboxes" optional="true">
31 </param>
32 <param name="subjectids" type="text" optional="true" size="120" value = ""
33 label="IF SUBJECTS NOT ALL INDEPENDENT! Enter comma separated strings to indicate sample labels for (eg) pairing - must be one for every column in input"
34 help="Leave blank if no pairing, but eg if data from sample id A99 is in columns 2,4 and id C21 is in 3,5 then enter 'A99,C21,A99,C21'">
35 <sanitizer>
36 <valid initial="string.letters,string.digits"><add value="," /> </valid>
37 </sanitizer>
38 </param>
39 <param name="fQ" type="float" value="0.3" size="5" label="Non-differential contig count quantile threshold - zero to analyze all non-zero read count contigs"
40 help="May be a good or a bad idea depending on the biology and the question. EG 0.3 = sparsest 30% of contigs with at least one read are removed before analysis"/>
41 <param name="useNDF" type="boolean" truevalue="T" falsevalue="F" checked="false" size="1"
42 label="Non differential filter - remove contigs below a threshold (1 per million) for half or more samples"
43 help="May be a good or a bad idea depending on the biology and the question. This was the old default. Quantile based is available as an alternative"/>
44
45 <conditional name="edgeR">
46 <param name="doedgeR" type="select"
47 label="Run this model using edgeR"
48 help="edgeR uses a negative binomial model and seems to be powerful, even with few replicates">
49 <option value="F">Do not run edgeR</option>
50 <option value="T" selected="true">Run edgeR</option>
51 </param>
52 <when value="T">
53 <param name="edgeR_priordf" type="integer" value="10" size="3"
54 label="prior.df for tagwise dispersion - larger value = more squeezing of tag dispersions to common dispersion. Replaces prior.n and prior.df = prior.n * residual.df"
55 help="10 = edgeR default. Use a larger value to 'smooth' small samples. See edgeR docs and note below"/>
56 <param name="edgeR_robust_method" type="select" value="20" size="3"
57 label="Use robust dispersion method"
58 help="Use ordinary, anscombe or deviance robust deviance estimates">
59 <option value="ordinary" selected="true">Use ordinary deviance estimates</option>
60 <option value="deviance">Use robust deviance estimates</option>
61 <option value="anscombe">use Anscombe robust deviance estimates</option>
62 </param>
63 </when>
64 <when value="F"></when>
65 </conditional>
66 <conditional name="DESeq2">
67 <param name="doDESeq2" type="select"
68 label="Run the same model with DESeq2 and compare findings"
69 help="DESeq2 is an update to the DESeq package. It uses different assumptions and methods to edgeR">
70 <option value="F" selected="true">Do not run DESeq2</option>
71 <option value="T">Run DESeq2</option>
72 </param>
73 <when value="T">
74 <param name="DESeq_fitType" type="select">
75 <option value="parametric" selected="true">Parametric (default) fit for dispersions</option>
76 <option value="local">Local fit - this will automagically be used if parametric fit fails</option>
77 <option value="mean">Mean dispersion fit- use this if you really understand what you're doing - read the fine manual linked below in the documentation</option>
78 </param>
79 </when>
80 <when value="F"> </when>
81 </conditional>
82 <param name="doVoom" type="select"
83 label="Run the same model with Voom/limma and compare findings"
84 help="Voom uses counts per million and a precise transformation of variance so count data can be analysed using limma">
85 <option value="F" selected="true">Do not run VOOM</option>
86 <option value="T">Run VOOM</option>
87 </param>
88 <!--
89 <conditional name="camera">
90 <param name="doCamera" type="select" label="Run the edgeR implementation of Camera GSEA for up/down gene sets"
91 help="If yes, you can choose a set of genesets to test and/or supply a gmt format geneset collection from your history">
92 <option value="F" selected="true">Do not run GSEA tests with the Camera algorithm</option>
93 <option value="T">Run GSEA tests with the Camera algorithm</option>
94 </param>
95 <when value="T">
96 <conditional name="gmtSource">
97 <param name="refgmtSource" type="select"
98 label="Use a gene set (.gmt) from your history and/or use a built-in (MSigDB etc) gene set">
99 <option value="indexed" selected="true">Use a built-in gene set</option>
100 <option value="history">Use a gene set from my history</option>
101 <option value="both">Add a gene set from my history to a built in gene set</option>
102 </param>
103 <when value="indexed">
104 <param name="builtinGMT" type="select" label="Select a gene set matrix (.gmt) file to use for the analysis">
105 <options from_data_table="gseaGMT_3.1">
106 <filter type="sort_by" column="2" />
107 <validator type="no_options" message="No GMT v3.1 files are available - please install them"/>
108 </options>
109 </param>
110 </when>
111 <when value="history">
112 <param name="ownGMT" type="data" format="gmt" label="Select a Gene Set from your history" />
113 </when>
114 <when value="both">
115 <param name="ownGMT" type="data" format="gseagmt" label="Select a Gene Set from your history" />
116 <param name="builtinGMT" type="select" label="Select a gene set matrix (.gmt) file to use for the analysis">
117 <options from_data_table="gseaGMT_4">
118 <filter type="sort_by" column="2" />
119 <validator type="no_options" message="No GMT v4 files are available - please fix tool_data_table and loc files"/>
120 </options>
121 </param>
122 </when>
123 </conditional>
124 </when>
125 <when value="F">
126 </when>
127 </conditional>
128 -->
129 <param name="fdrthresh" type="float" value="0.05" size="5" label="P value threshold for FDR filtering for family wise error rate control"
130 help="Conventional default value of 0.05 recommended"/>
131 <param name="fdrtype" type="select" label="FDR (Type I error) control method"
132 help="Use fdr or bh typically to control for the number of tests in a reliable way">
133 <option value="fdr" selected="true">fdr</option>
134 <option value="BH">Benjamini Hochberg</option>
135 <option value="BY">Benjamini Yekutieli</option>
136 <option value="bonferroni">Bonferroni</option>
137 <option value="hochberg">Hochberg</option>
138 <option value="holm">Holm</option>
139 <option value="hommel">Hommel</option>
140 <option value="none">no control for multiple tests</option>
141 </param>
142 </inputs>
143 <outputs>
144 <data format="tabular" name="out_edgeR" label="${title}_topTable_edgeR.xls">
145 <filter>edgeR['doedgeR'] == "T"</filter>
146 </data>
147 <data format="tabular" name="out_DESeq2" label="${title}_topTable_DESeq2.xls">
148 <filter>DESeq2['doDESeq2'] == "T"</filter>
149 </data>
150 <data format="tabular" name="out_VOOM" label="${title}_topTable_VOOM.xls">
151 <filter>doVoom == "T"</filter>
152 </data>
153 <data format="html" name="html_file" label="${title}.html"/>
154 </outputs>
155 <stdio>
156 <exit_code range="4" level="fatal" description="Number of subject ids must match total number of samples in the input matrix" />
157 </stdio>
<tests>
  <!-- Functional test: liver (treatment) vs heart (control) on the bundled count matrix,
       running edgeR, DESeq2 and VOOM. Only the edgeR table and the HTML report are compared. -->
  <test>
    <param name='input1' value='test_bams2mx.xls' ftype='tabular' />
    <param name='treatment_name' value='liver' />
    <param name='title' value='edgeRtest' />
    <param name='useNDF' value='' />
    <param name='doedgeR' value='T' />
    <param name='doVoom' value='T' />
    <param name='doDESeq2' value='T' />
    <param name='fdrtype' value='fdr' />
    <param name='edgeR_priordf' value="8" />
    <!-- must match the input name edgeR_robust_method declared in the edgeR conditional -->
    <param name='edgeR_robust_method' value="ordinary" />
    <param name='fdrthresh' value="0.05" />
    <param name='control_name' value='heart' />
    <param name='subjectids' value='' />
    <param name='Control_cols' value='3,4,5,9' />
    <param name='Treat_cols' value='2,6,7,8' />
    <output name='out_edgeR' file='edgeRtest1out.xls' compare='diff' />
    <output name='html_file' file='edgeRtest1out.html' compare='diff' lines_diff='20' />
  </test>
</tests>
179
180 <configfiles>
181 <configfile name="runme">
182 <![CDATA[
183 #
184 # edgeR.Rscript
185 # updated feb 2014 adding outlier-robust deviance estimate options by ross for R 3.0.2/bioc 2.13
186 # updated nov 2011 for R 2.14.0 and edgeR 2.4.0 by ross
187 # Performs DGE on a count table containing n replicates of two conditions
188 #
189 # Parameters
190 #
191 # 1 - Output Dir
192
193 # Original edgeR code by: S.Lunke and A.Kaspi
# log10 of the largest and of the smallest positive normalised double on this
# machine. qqPlot substitutes these values where -log10(p) is infinite.
reallybig = log10(.Machine[['double.xmax']])
reallysmall = log10(.Machine[['double.xmin']])
196 library('stringr')
197 library('gplots')
198 library('edgeR')
hmap2 = function(cmat,nsamp=100,outpdfname='heatmap2.pdf', TName='Treatment',group=NA,myTitle='title goes here')
{
# Draw a row-scaled gplots heatmap.2 of the first nsamp rows of cmat into outpdfname.
#   cmat       numeric matrix, one row per contig and one column per sample; the first
#              nsamp rows are plotted, so the caller decides the ranking
#   nsamp      maximum number of rows to plot
#   outpdfname pdf file to create
#   TName      unused here; kept so existing calls keep working
#   group      one label per column of cmat, used to colour the column side bar
#   myTitle    main title of the plot
# Rows whose row name is NA are dropped. A matrix without row names is plotted whole.
# Nothing is drawn and no pdf is created when fewer than 2 rows or 2 columns remain,
# since heatmap.2 would otherwise stop the whole script.
    gu = unique(group)
    if (length(gu) == 2) {
        # exactly two groups: the first one seen is red, the other blue
        pcols = ifelse(group == gu[1], "#FF0000", "#0000FF")
    } else {
        pcols = rainbow(length(gu),start=0,end=4/6)[match(group,gu)]
    }
    gn = rownames(cmat)
    if (is.null(gn)) {
        dm = cmat
    } else {
        # drop=FALSE keeps a matrix even when a single row survives
        dm = cmat[!is.na(gn),,drop=FALSE]
    }
    if (nrow(dm) > nsamp) {
        dm = dm[seq_len(nsamp),,drop=FALSE]
    }
    if (nrow(dm) < 2 || ncol(dm) < 2) {
        print.noquote(paste('Heatmap',outpdfname,'skipped - at least 2 rows and 2 columns are needed'))
        return(invisible(NULL))
    }
    # long sample names make the column labels unreadable
    colnames(dm) = substr(colnames(dm),1,20)
    pdf(outpdfname)
    heatmap.2(dm,main=myTitle,ColSideColors=pcols,col=topo.colors(100),dendrogram="col",key=T,density.info='none',
        Rowv=F,scale='row',trace='none',margins=c(8,8),cexRow=0.4,cexCol=0.5)
    dev.off()
}
226
hmap = function(cmat,nmeans=4,outpdfname="heatMap.pdf",nsamp=250,TName='Treatment',group=NA,myTitle="Title goes here")
{
# Base graphics heatmap of at most nsamp leading rows of cmat, written to outpdfname.
# Columns are flagged with one rainbow colour per distinct value of group.
# nmeans and TName are accepted but not used.
    groupLevels = unique(group)
    shades = rainbow(length(groupLevels),start=0.3,end=0.6)
    sideColours = shades[match(group,groupLevels)]
    total = nrow(cmat)
    if (total > nsamp) {
        # too many rows to show: keep the leading nsamp and say so in the title
        cmat = cmat[c(1:nsamp),]
        plotTitle = paste('Heatmap: Top ',nsamp,' DE contigs (of ',total,')',sep='')
    } else {
        plotTitle = paste(myTitle,'Heatmap: n contigs =',total)
    }
    # keep column labels short enough to print
    colnames(cmat) = substr(colnames(cmat),1,20)
    pdf(outpdfname)
    heatmap(cmat,scale='row',main=plotTitle,cexRow=0.3,cexCol=0.4,Rowv=NA,ColSideColors=sideColours)
    dev.off()
}
247
qqPlot = function(descr='qqplot',pvector, outpdf='qqplot.pdf',...)
# stolen from https://gist.github.com/703512
{
# Write a QQ plot of observed against expected -log10 p values to outpdf.
#   descr    main title of the plot
#   pvector  numeric vector of p values; missing values are dropped by sort()
#   outpdf   pdf file to create
#   ...      passed through to plot()
# Expected values are -log10(i/n) over the n non-missing p values. Infinite observed
# values are replaced by the script-level constants reallybig and reallysmall.
# When no non-missing p value is available the plot is skipped with a message,
# because 1:0 would otherwise produce two bogus expected values and plot() would fail.
    o = -log10(sort(pvector,decreasing=F))
    n = length(o)
    if (n == 0) {
        print.noquote(paste('qqPlot: no non-missing p values supplied -',outpdf,'not created'))
        return(invisible(NULL))
    }
    e = -log10( seq_len(n)/n )
    o[o==-Inf] = reallysmall
    o[o==Inf] = reallybig
    pdf(outpdf)
    plot(e,o,pch=19,cex=1, main=descr, ...,
        xlab=expression(Expected~~-log[10](italic(p))),
        ylab=expression(Observed~~-log[10](italic(p))),
        xlim=c(0,max(e)), ylim=c(0,max(o)))
    # identity line marks where observed equals expected
    lines(e,e,col="red")
    grid(col = "lightgray", lty = "dotted")
    dev.off()
}
265
smearPlot = function(DGEList,deTags, outSmear, outMain)
{
# Save a plotSmear display of DGEList to the pdf outSmear, titled outMain, with the
# tags named in deTags highlighted and a dotted grey grid on top.
    pdf(file=outSmear)
    plotSmear(DGEList, main=outMain, de.tags=deTags)
    grid(lty="dotted", col="lightgray")
    dev.off()
}
273
boxPlot = function(rawrs,cleanrs,maint,myTitle,pdfname)
{
# Print summaries and plot per-sample log counts before and after normalisation.
#   rawrs    table of log raw counts, one column per sample; negative values are
#            set to NA before anything is summarised or plotted
#   cleanrs  table of log normalised counts, one column per sample
#   maint    normalisation description used in the plot titles
#   myTitle  accepted but not used
#   pdfname  pdf file receiving the side-by-side box plots
# A second file, sample_counts_histogram.pdf, gets one histogram per raw column on a
# square grid, all sharing the same y axis limit.
# Graphics parameters belong to each pdf device and disappear with dev.off(), so they
# are not saved and restored here: calling par() while no device is open starts a
# default device and leaves a stray Rplots.pdf behind.
    nc = ncol(rawrs)
    for (i in seq_len(nc)) {rawrs[(rawrs[,i] < 0),i] = NA}
    fullnames = colnames(rawrs)
    # shorten labels for the axes; the full names are kept for the histogram sub titles
    colnames(rawrs) = substr(colnames(rawrs),1,20)
    colnames(cleanrs) = substr(colnames(cleanrs),1,20)
    print.noquote('raw contig counts by sample:')
    print.noquote(summary(rawrs))
    print.noquote('normalised contig counts by sample:')
    print.noquote(summary(cleanrs))
    pdf(pdfname)
    par(mfrow=c(1,2))
    boxplot(rawrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('Raw:',maint))
    grid(col="lightgray",lty="dotted")
    boxplot(cleanrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('After ',maint))
    grid(col="lightgray",lty="dotted")
    dev.off()
    histname = "sample_counts_histogram.pdf"
    print.noquote(paste('Using ncol rawrs=',nc))
    # smallest square grid that holds one panel per sample
    ncroot = round(sqrt(nc))
    if (ncroot*ncroot < nc) { ncroot = ncroot + 1 }
    # tallest bar over all samples, so every panel can share one y axis
    m = c()
    for (i in seq_len(nc)) {
        rhist = hist(rawrs[,i],breaks=100,plot=F)
        m = append(m,max(rhist\$counts))
    }
    ymax = max(m)
    ncols = length(fullnames)
    if (ncols > 20)
    {
        # enlarge the page in proportion to the number of samples
        scale = 7*ncols/20
        pdf(histname,width=scale,height=scale)
    } else {
        pdf(histname)
    }
    par(mfrow=c(ncroot,ncroot))
    for (i in seq_len(nc)) {
        hist(rawrs[,i], main=paste("Contig logcount",i), xlab='log raw count', col="maroon",
            breaks=100,sub=fullnames[i],cex=0.8,ylim=c(0,ymax))
    }
    dev.off()
}
323
cumPlot = function(rawrs,cleanrs,maint,myTitle)
{
# Two stacked histograms of log10 row sums, before and after filtering, written to
# Filtering_rowsum_bar_charts.pdf.
#   rawrs    numeric vector of row sums before filtering
#   cleanrs  numeric vector of row sums after filtering
#   maint    filter description appended to the panel titles
#   myTitle  sub title printed under each panel
# Both panels use an x axis from 0 to the largest log10 value in rawrs.
# Graphics parameters belong to the pdf device and disappear with dev.off(), so they
# are not saved and restored here: calling par() while no device is open starts a
# default device and leaves a stray Rplots.pdf behind.
    pdfname = "Filtering_rowsum_bar_charts.pdf"
    lrs = log(rawrs,10)
    lim = max(lrs)
    pdf(pdfname)
    par(mfrow=c(2,1))
    hist(lrs,breaks=100,main=paste('Before:',maint),xlab="# Reads (log)",
        ylab="Count",col="maroon",sub=myTitle, xlim=c(0,lim),las=1)
    grid(col="lightgray", lty="dotted")
    lrs = log(cleanrs,10)
    hist(lrs,breaks=100,main=paste('After:',maint),xlab="# Reads (log)",
        ylab="Count",col="maroon",sub=myTitle,xlim=c(0,lim),las=1)
    grid(col="lightgray", lty="dotted")
    dev.off()
}
342
cumPlot1 = function(rawrs,cleanrs,maint,myTitle)
{
# Empirical cumulative distribution curves of row sums before and after filtering,
# stacked in one pdf. The file name is myTitle with spaces removed followed by
# _RowsumCum.pdf. Both panels use a log x axis running from 1 to max(rawrs).
    outname = paste(gsub(" ","", myTitle , fixed=TRUE),"RowsumCum.pdf",sep='_')
    pdf(outname)
    par(mfrow=c(2,1))
    xmax = max(rawrs)
    # distinct observed values, and the proportion of them reached at each one
    beforeKnots = knots(ecdf(rawrs))
    beforeProp = seq_along(beforeKnots)/length(beforeKnots)
    afterKnots = knots(ecdf(cleanrs))
    afterProp = seq_along(afterKnots)/length(afterKnots)
    plot(beforeKnots,beforeProp,type='l',main=paste('Before',maint),xlab="Log Contig Total Reads",
        ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,xmax),sub=myTitle)
    grid(col="blue")
    plot(afterKnots,afterProp,type='l',main=paste('After',maint),xlab="Log Contig Total Reads",
        ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,xmax),sub=myTitle)
    grid(col="blue")
    dev.off()
}
361
362
363
doGSEAold = function(y=NULL,design=NULL,histgmt="",
    bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
    ntest=0, myTitle="myTitle", outfname="GSEA.xls", minnin=5, maxnin=2000,fdrthresh=0.05,fdrtype="BH")
{
# Older gene set test: calls camera() once per gene set and collects the results.
#   y, design  passed unchanged to camera(); rownames(y) are matched, upper-cased,
#              against the gene symbols of each set
#   histgmt    path of a gmt file supplied by the user, or "" for none
#   bigmt      path of a built in gmt file, or "" for none
#   ntest      ignored; replaced by the number of gene set lines read
#   myTitle    used in the header line written to outfname
#   outfname   header file name, also the suffix of the camera_up_ and camera_down_ tables
#   minnin, maxnin  a set is tested only when more than minnin and fewer than maxnin
#              of its members are found in y
#   fdrthresh, fdrtype  threshold and p.adjust method for the adjusted p values
# All printed output goes to Camera.log while the function runs.
# gmt lines are tab separated: set id, url or description, then the member genes.
    sink('Camera.log')
    # restore normal output even when camera or file reading fails
    on.exit(sink())
    genesets = c()
    if (bigmt > "")
    {
        genesets = readLines(bigmt)
    }
    if (histgmt > "")
    {
        # concatenate the lines; rbind would build a recycled two row matrix instead
        genesets = c(genesets,readLines(histgmt))
    }
    print.noquote(paste("@@@read",length(genesets), 'genesets from',histgmt,bigmt))
    genesets = strsplit(genesets,'\t')
    header = paste(myTitle,'edgeR GSEA')
    write(header,file=outfname,append=F)
    urownames = toupper(rownames(y))
    upcam = c()
    downcam = c()
    for (i in seq_along(genesets)) {
        gs = unlist(genesets[i])
        # a usable line has an id, a url field and at least one gene
        if (length(gs) < 3) { next }
        g = gs[1]
        u = gs[2]
        if (u > "") { u = paste("<a href=\'",u,"\'>",u,"</a>",sep="") }
        glist = toupper(gs[3:length(gs)])
        inglist = urownames %in% glist
        nin = sum(inglist)
        if ((nin > minnin) && (nin < maxnin)) {
            camres = camera(y=y,index=inglist,design=design)
            if (! is.null(camres)) {
                rownames(camres) = g
                camres = cbind(GeneSet=g,URL=u,camres)
                if (camres\$Direction == "Up")
                {
                    upcam = rbind(upcam,camres)
                } else {
                    downcam = rbind(downcam,camres)
                }
            }
        }
    }
    uscam = upcam[order(upcam\$PValue),]
    unadjp = uscam\$PValue
    uscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
    # always show at least ten sets in the log
    nup = max(10,sum((uscam\$adjPValue < fdrthresh)))
    dscam = downcam[order(downcam\$PValue),]
    unadjp = dscam\$PValue
    dscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
    ndown = max(10,sum((dscam\$adjPValue < fdrthresh)))
    write.table(uscam,file=paste('camera_up',outfname,sep='_'),quote=F,sep='\t',row.names=F)
    write.table(dscam,file=paste('camera_down',outfname,sep='_'),quote=F,sep='\t',row.names=F)
    print.noquote(paste('@@@@@ Camera up top',nup,'gene sets:'))
    write.table(head(uscam,nup),file="",quote=F,sep='\t',row.names=F)
    print.noquote(paste('@@@@@ Camera down top',ndown,'gene sets:'))
    write.table(head(dscam,ndown),file="",quote=F,sep='\t',row.names=F)
}
432
433
434
435
doGSEA = function(y=NULL,design=NULL,histgmt="",
    bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
    ntest=0, myTitle="myTitle", outfname="GSEA.xls", minnin=5, maxnin=2000,fdrthresh=0.05,fdrtype="BH")
{
# Gene set test: builds one index vector per eligible gene set and calls camera() once.
#   y, design  passed unchanged to camera(); rownames(y) are matched, upper-cased,
#              against the gene symbols of each set
#   histgmt    path of a gmt file supplied by the user, or "" for none
#   bigmt      path of a built in gmt file, or "" for none
#   ntest      ignored; kept so existing calls keep working
#   myTitle    used in the header line written to outfname
#   outfname   header file name, also the suffix of the camera_up_ and camera_down_ tables
#   minnin, maxnin  a set is tested only when more than minnin and fewer than maxnin
#              of its members are found in y
#   fdrthresh, fdrtype  threshold and p.adjust method for the adjusted p values
# All printed output goes to Camera.log while the function runs.
# gmt lines are tab separated: set id, url or description, then the member genes.
# NOTE(review): assumes camera() returns one row per set, named after the index list,
# with Direction and PValue columns - confirm against the installed limma/edgeR.
    sink('Camera.log')
    # restore normal output on every exit path, including errors
    on.exit(sink())
    genesets = c()
    if (bigmt > "")
    {
        genesets = readLines(bigmt)
    }
    if (histgmt > "")
    {
        # concatenate the lines; rbind would build a recycled two row matrix instead
        genesets = c(genesets,readLines(histgmt))
    }
    print.noquote(paste("@@@read",length(genesets), 'genesets from',histgmt,bigmt))
    genesets = strsplit(genesets,'\t')
    header = paste(myTitle,'edgeR GSEA')
    write(header,file=outfname,append=F)
    urownames = toupper(rownames(y))
    incam = list()
    urls = c()
    gsids = c()
    for (i in seq_along(genesets)) {
        gs = unlist(genesets[i])
        # a usable line has an id, a url field and at least one gene
        if (length(gs) < 3) { next }
        gsid = gs[1]
        url = gs[2]
        if (url > "") { url = paste("<a href=\'",url,"\'>",url,"</a>",sep="") }
        glist = toupper(gs[3:length(gs)])
        inglist = urownames %in% glist
        nin = sum(inglist)
        if ((nin > minnin) && (nin < maxnin)) {
            # one list element per gene set; c() would flatten them into a single vector
            incam[[length(incam)+1]] = inglist
            gsids = c(gsids,gsid)
            urls = c(urls,url)
        }
    }
    if (length(incam) == 0) {
        print.noquote('No gene sets passed the size filter - camera not run')
        return(invisible(NULL))
    }
    # unique names so that result rows can be matched back to their urls
    gsids = make.unique(gsids)
    names(incam) = gsids
    allcam = camera(y=y,index=incam,design=design)
    # the result may be reordered, so align urls by row name rather than by position
    found = match(rownames(allcam),gsids)
    allcamres = cbind(geneset=rownames(allcam),allcam,URL=urls[found])
    isup = (allcamres\$Direction == "Up")
    upcam = allcamres[isup,]
    downcam = allcamres[!isup,]
    uscam = upcam[order(upcam\$PValue),]
    unadjp = uscam\$PValue
    uscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
    # always show at least ten sets in the log
    nup = max(10,sum((uscam\$adjPValue < fdrthresh)))
    dscam = downcam[order(downcam\$PValue),]
    unadjp = dscam\$PValue
    dscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
    ndown = max(10,sum((dscam\$adjPValue < fdrthresh)))
    write.table(uscam,file=paste('camera_up',outfname,sep='_'),quote=F,sep='\t',row.names=F)
    write.table(dscam,file=paste('camera_down',outfname,sep='_'),quote=F,sep='\t',row.names=F)
    print.noquote(paste('@@@@@ Camera up top',nup,'gene sets:'))
    write.table(head(uscam,nup),file="",quote=F,sep='\t',row.names=F)
    print.noquote(paste('@@@@@ Camera down top',ndown,'gene sets:'))
    write.table(head(dscam,ndown),file="",quote=F,sep='\t',row.names=F)
}
516
517
518 edgeIt = function (Count_Matrix=c(),group=c(),out_edgeR=F,out_VOOM=F,out_DESeq2=F,fdrtype='fdr',priordf=5,
519 fdrthresh=0.05,outputdir='.', myTitle='Differential Counts',libSize=c(),useNDF=F,
520 filterquantile=0.2, subjects=c(),mydesign=NULL,
521 doDESeq2=T,doVoom=T,doCamera=T,doedgeR=T,org='hg19',
522 histgmt="", bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
523 doCook=F,DESeq_fitType="parameteric",robust_meth='ordinary')
524 {
525 # Error handling
526 if (length(unique(group))!=2){
527 print("Number of conditions identified in experiment does not equal 2")
528 q()
529 }
530 require(edgeR)
531 options(width = 512)
532 mt = paste(unlist(strsplit(myTitle,'_')),collapse=" ")
533 allN = nrow(Count_Matrix)
534 nscut = round(ncol(Count_Matrix)/2)
535 colTotmillionreads = colSums(Count_Matrix)/1e6
536 counts.dataframe = as.data.frame(c())
537 rawrs = rowSums(Count_Matrix)
538 nonzerod = Count_Matrix[(rawrs > 0),] # remove all zero count genes
539 nzN = nrow(nonzerod)
540 nzrs = rowSums(nonzerod)
541 zN = allN - nzN
542 print('# Quantiles for non-zero row counts:',quote=F)
543 print(quantile(nzrs,probs=seq(0,1,0.1)),quote=F)
544 if (useNDF == T)
545 {
546 gt1rpin3 = rowSums(Count_Matrix/expandAsMatrix(colTotmillionreads,dim(Count_Matrix)) >= 1) >= nscut
547 lo = colSums(Count_Matrix[!gt1rpin3,])
548 workCM = Count_Matrix[gt1rpin3,]
549 cleanrs = rowSums(workCM)
550 cleanN = length(cleanrs)
551 meth = paste( "After removing",length(lo),"contigs with fewer than ",nscut," sample read counts >= 1 per million, there are",sep="")
552 print(paste("Read",allN,"contigs. Removed",zN,"contigs with no reads.",meth,cleanN,"contigs"),quote=F)
553 maint = paste('Filter >=1/million reads in >=',nscut,'samples')
554 } else {
555 useme = (nzrs > quantile(nzrs,filterquantile))
556 workCM = nonzerod[useme,]
557 lo = colSums(nonzerod[!useme,])
558 cleanrs = rowSums(workCM)
559 cleanN = length(cleanrs)
560 meth = paste("After filtering at count quantile =",filterquantile,", there are",sep="")
561 print(paste('Read',allN,"contigs. Removed",zN,"with no reads.",meth,cleanN,"contigs"),quote=F)
562 maint = paste('Filter below',filterquantile,'quantile')
563 }
564 cumPlot(rawrs=rawrs,cleanrs=cleanrs,maint=maint,myTitle=myTitle)
565 allgenes = rownames(workCM)
566 reg = "^chr([0-9]+):([0-9]+)-([0-9]+)"
567 genecards="<a href=\'http://www.genecards.org/index.php?path=/Search/keyword/"
568 ucsc = paste("<a href=\'http://genome.ucsc.edu/cgi-bin/hgTracks?db=",org,sep='')
569 testreg = str_match(allgenes,reg)
570 if (sum(!is.na(testreg[,1]))/length(testreg[,1]) > 0.8) # is ucsc style string
571 {
572 print("@@ using ucsc substitution for urls")
573 contigurls = paste0(ucsc,"&amp;position=chr",testreg[,2],":",testreg[,3],"-",testreg[,4],"\'>",allgenes,"</a>")
574 } else {
575 print("@@ using genecards substitution for urls")
576 contigurls = paste0(genecards,allgenes,"\'>",allgenes,"</a>")
577 }
578 print.noquote("# urls")
579 print.noquote(head(contigurls))
580 print(paste("# Total low count contigs per sample = ",paste(lo,collapse=',')),quote=F)
581 cmrowsums = rowSums(workCM)
582 TName=unique(group)[1]
583 CName=unique(group)[2]
584 if (is.null(mydesign)) {
585 if (length(subjects) == 0)
586 {
587 mydesign = model.matrix(~group)
588 }
589 else {
590 subjf = factor(subjects)
591 mydesign = model.matrix(~subjf+group) # we block on subject so make group last to simplify finding it
592 }
593 }
594 print.noquote(paste('Using samples:',paste(colnames(workCM),collapse=',')))
595 print.noquote('Using design matrix:')
596 print.noquote(mydesign)
597 if (doedgeR == T) {
598 sink('edgeR.log')
599 #### Setup DGEList object
600 DGEList = DGEList(counts=workCM, group = group)
601 DGEList = calcNormFactors(DGEList)
602 if (robust_meth == 'ordinary') {
603 DGEList = estimateGLMCommonDisp(DGEList,mydesign)
604 DGEList = estimateGLMTrendedDisp(DGEList,mydesign)
605 DGEList = estimateGLMTagwiseDisp(DGEList,mydesign,prior.df = edgeR_priordf)
606
607 comdisp = DGEList\$common.dispersion
608 estpriorn = getPriorN(DGEList)
609 print(paste("Common Dispersion =",comdisp,"CV = ",sqrt(comdisp),"getPriorN = ",estpriorn),quote=F)
610 } else {
611 DGEList = estimateGLMRobustDisp(DGEList,design=mydesign, prior.df = edgeR_priordf, maxit = 6, residual.type = robust_meth)
612 }
613
614
615 DGLM = glmFit(DGEList,design=mydesign)
616 DE = glmLRT(DGLM,coef=ncol(DGLM\$design)) # always last one - subject is first if needed
617 efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors
618 normData = (1e+06*DGEList\$counts/efflib)
619 uoutput = cbind(
620 Name=as.character(rownames(DGEList\$counts)),
621 DE\$table,
622 adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
623 Dispersion=DGEList\$tagwise.dispersion,totreads=cmrowsums,normData,
624 DGEList\$counts
625 )
626 soutput = uoutput[order(DE\$table\$PValue),] # sorted into p value order - for quick toptable
627 goodness = gof(DGLM, pcutoff=fdrthresh)
628 if (sum(goodness\$outlier) > 0) {
629 print.noquote('GLM outliers:')
630 print(paste(rownames(DGLM)[(goodness\$outlier)],collapse=','),quote=F)
631 } else {
632 print('No GLM fit outlier genes found\n')
633 }
634 z = limma::zscoreGamma(goodness\$gof.statistic, shape=goodness\$df/2, scale=2)
635 pdf("edgeR_GoodnessofFit.pdf")
636 qq = qqnorm(z, panel.first=grid(), main="tagwise dispersion")
637 abline(0,1,lwd=3)
638 points(qq\$x[goodness\$outlier],qq\$y[goodness\$outlier], pch=16, col="maroon")
639 dev.off()
640 efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors
641 normData = (1e+06*DGEList\$counts/efflib)
642 uniqueg = unique(group)
643 #### Plot MDS
644 sample_colors = match(group,levels(group))
645 sampleTypes = levels(factor(group))
646 print.noquote(sampleTypes)
647 pdf("edgeR_MDSplot.pdf")
648 plotMDS.DGEList(DGEList,main=paste("edgeR MDS for",myTitle),cex=0.5,col=sample_colors,pch=sample_colors)
649 legend(x="topleft", legend = sampleTypes,col=c(1:length(sampleTypes)), pch=19)
650 grid(col="blue")
651 dev.off()
652 colnames(normData) = paste( colnames(normData),'N',sep="_")
653 print(paste('Raw sample read totals',paste(colSums(nonzerod,na.rm=T),collapse=',')))
654 nzd = data.frame(log(nonzerod + 1e-2,10))
655 try( boxPlot(rawrs=nzd,cleanrs=log(normData,10),maint='TMM Normalisation',myTitle=myTitle,pdfname="edgeR_raw_norm_counts_box.pdf") )
656 write.table(soutput,file=out_edgeR, quote=FALSE, sep="\t",row.names=F)
657 tt = cbind(
658 Name=as.character(rownames(DGEList\$counts)),
659 DE\$table,
660 adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
661 Dispersion=DGEList\$tagwise.dispersion,totreads=cmrowsums
662 )
663 print.noquote("# edgeR Top tags\n")
664 tt = cbind(tt,URL=contigurls) # add to end so table isn't laid out strangely
665 tt = tt[order(DE\$table\$PValue),]
666 print.noquote(tt[1:50,])
667 deTags = rownames(uoutput[uoutput\$adj.p.value < fdrthresh,])
668 nsig = length(deTags)
669 print(paste('#',nsig,'tags significant at adj p=',fdrthresh),quote=F)
670 deColours = ifelse(deTags,'red','black')
671 pdf("edgeR_BCV_vs_abundance.pdf")
672 plotBCV(DGEList, cex=0.3, main="Biological CV vs abundance")
673 dev.off()
674 dg = DGEList[order(DE\$table\$PValue),]
675 #normData = (1e+06 * dg\$counts/expandAsMatrix(dg\$samples\$lib.size, dim(dg)))
676 efflib = dg\$samples\$lib.size*dg\$samples\$norm.factors
677 normData = (1e+06*dg\$counts/efflib)
678 outpdfname="edgeR_top_100_heatmap.pdf"
679 hmap2(normData,nsamp=100,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste('edgeR Heatmap',myTitle))
680 outSmear = "edgeR_smearplot.pdf"
681 outMain = paste("Smear Plot for ",TName,' Vs ',CName,' (FDR@',fdrthresh,' N = ',nsig,')',sep='')
682 smearPlot(DGEList=DGEList,deTags=deTags, outSmear=outSmear, outMain = outMain)
683 qqPlot(descr=paste(myTitle,'edgeR adj p QQ plot'),pvector=tt\$adj.p.value,outpdf='edgeR_qqplot.pdf')
684 norm.factor = DGEList\$samples\$norm.factors
685 topresults.edgeR = soutput[which(soutput\$adj.p.value < fdrthresh), ]
686 edgeRcountsindex = which(allgenes %in% rownames(topresults.edgeR))
687 edgeRcounts = rep(0, length(allgenes))
688 edgeRcounts[edgeRcountsindex] = 1 # Create venn diagram of hits
689 sink()
690 } ### doedgeR
691 if (doDESeq2 == T)
692 {
693 sink("DESeq2.log")
694 # DESeq2
695 require('DESeq2')
696 library('RColorBrewer')
697 if (length(subjects) == 0)
698 {
699 pdata = data.frame(Name=colnames(workCM),Rx=group,row.names=colnames(workCM))
700 deSEQds = DESeqDataSetFromMatrix(countData = workCM, colData = pdata, design = formula(~ Rx))
701 } else {
702 pdata = data.frame(Name=colnames(workCM),Rx=group,subjects=subjects,row.names=colnames(workCM))
703 deSEQds = DESeqDataSetFromMatrix(countData = workCM, colData = pdata, design = formula(~ subjects + Rx))
704 }
705 #DESeq2 = DESeq(deSEQds,fitType='local',pAdjustMethod=fdrtype)
706 #rDESeq = results(DESeq2)
707 #newCountDataSet(workCM, group)
708 deSeqDatsizefac = estimateSizeFactors(deSEQds)
709 deSeqDatdisp = estimateDispersions(deSeqDatsizefac,fitType=DESeq_fitType)
710 resDESeq = nbinomWaldTest(deSeqDatdisp, pAdjustMethod=fdrtype)
711 rDESeq = as.data.frame(results(resDESeq))
712 rDESeq = cbind(Contig=rownames(workCM),rDESeq,NReads=cmrowsums,URL=contigurls)
713 srDESeq = rDESeq[order(rDESeq\$pvalue),]
714 qqPlot(descr=paste(myTitle,'DESeq2 adj p qq plot'),pvector=rDESeq\$padj,outpdf='DESeq2_qqplot.pdf')
715 cat("# DESeq top 50\n")
716 print.noquote(srDESeq[1:50,])
717 write.table(srDESeq,file=out_DESeq2, quote=FALSE, sep="\t",row.names=F)
718 topresults.DESeq = rDESeq[which(rDESeq\$padj < fdrthresh), ]
719 DESeqcountsindex = which(allgenes %in% rownames(topresults.DESeq))
720 DESeqcounts = rep(0, length(allgenes))
721 DESeqcounts[DESeqcountsindex] = 1
722 pdf("DESeq2_dispersion_estimates.pdf")
723 plotDispEsts(resDESeq)
724 dev.off()
725 ysmall = abs(min(rDESeq\$log2FoldChange))
726 ybig = abs(max(rDESeq\$log2FoldChange))
727 ylimit = min(4,ysmall,ybig)
728 pdf("DESeq2_MA_plot.pdf")
729 plotMA(resDESeq,main=paste(myTitle,"DESeq2 MA plot"),ylim=c(-ylimit,ylimit))
730 dev.off()
731 rlogres = rlogTransformation(resDESeq)
732 sampledists = dist( t( assay(rlogres) ) )
733 sdmat = as.matrix(sampledists)
734 pdf("DESeq2_sample_distance_plot.pdf")
735 heatmap.2(sdmat,trace="none",main=paste(myTitle,"DESeq2 sample distances"),
736 col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(255))
737 dev.off()
738 ###outpdfname="DESeq2_top50_heatmap.pdf"
739 ###hmap2(sresDESeq,nsamp=50,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste('DESeq2 vst rlog Heatmap',myTitle))
740 sink()
741 result = try( (ppca = plotPCA( varianceStabilizingTransformation(deSeqDatdisp,blind=T), intgroup=c("Rx","Name")) ) )
742 if ("try-error" %in% class(result)) {
743 print.noquote('DESeq2 plotPCA failed.')
744 } else {
745 pdf("DESeq2_PCA_plot.pdf")
746 #### wtf - print? Seems needed to get this to work
747 print(ppca)
748 dev.off()
749 }
750 }
751
752 if (doVoom == T) {
753 sink('VOOM.log')
754 if (doedgeR == F) {
755 #### Setup DGEList object
756 DGEList = DGEList(counts=workCM, group = group)
757 DGEList = estimateGLMCommonDisp(DGEList,mydesign)
758 DGEList = estimateGLMTrendedDisp(DGEList,mydesign)
759 DGEList = estimateGLMTagwiseDisp(DGEList,mydesign)
760 }
761 calcNormFactors(DGEList)
762 ls = colSums(DGEList\$counts) * DGEList\$samples\$norm.factors
763 pdf("VOOM_mean_variance_plot.pdf")
764 #dat.voomed = voom(DGEList, mydesign, plot = TRUE, lib.size = ls)
765 dat.voomed <- voom(DGEList, mydesign, plot = TRUE, normalize.method="quantil", lib.size = NULL)
766 dev.off()
767 # Use limma to fit data
768 fit = lmFit(dat.voomed, mydesign)
769 fit = eBayes(fit)
770 rvoom = topTable(fit, coef = length(colnames(mydesign)), adj = fdrtype, n = Inf, sort="none")
771 qqPlot(descr=paste(myTitle,'VOOM-limma adj p QQ plot'),pvector=rvoom\$adj.P.Val,outpdf='VOOM_qqplot.pdf')
772 rownames(rvoom) = rownames(workCM)
773 rvoom = cbind(rvoom,NReads=cmrowsums,URL=contigurls)
774 srvoom = rvoom[order(rvoom\$P.Value),]
775 cat("# VOOM top 50\n")
776 print(srvoom[1:50,])
777 write.table(srvoom,file=out_VOOM, quote=FALSE, sep="\t",row.names=F)
778 # Use an FDR cutoff to find interesting samples for edgeR, DESeq and voom/limma
779 topresults.voom = rvoom[which(rvoom\$adj.P.Val < fdrthresh), ]
780 voomcountsindex <- which(allgenes %in% rownames(topresults.voom))
781 voomcounts = rep(0, length(allgenes))
782 voomcounts[voomcountsindex] = 1
783 sink()
784 }
785
786 if (doCamera) {
787 doGSEA(y=DGEList,design=mydesign,histgmt=histgmt,bigmt=bigmt,ntest=20,myTitle=myTitle,
788 outfname=paste(mt,"GSEA.xls",sep="_"),fdrthresh=fdrthresh,fdrtype=fdrtype)
789 }
790 counts.dataframe = c()
791 vennmain = 'no venn'
792 if ((doDESeq2==T) || (doVoom==T) || (doedgeR==T)) {
793 if ((doVoom==T) && (doDESeq2==T) && (doedgeR==T)) {
794 vennmain = paste(mt,'Voom,edgeR and DESeq2 overlap at FDR=',fdrthresh)
795 counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts,
796 VOOM_limma = voomcounts, row.names = allgenes)
797 } else if ((doDESeq2==T) && (doedgeR==T)) {
798 vennmain = paste(mt,'DESeq2 and edgeR overlap at FDR=',fdrthresh)
799 counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts, row.names = allgenes)
800 } else if ((doVoom==T) && (doedgeR==T)) {
801 vennmain = paste(mt,'Voom and edgeR overlap at FDR=',fdrthresh)
802 counts.dataframe = data.frame(edgeR = edgeRcounts, VOOM_limma = voomcounts, row.names = allgenes)
803 }
804
805 if (nrow(counts.dataframe > 1)) {
806 counts.venn = vennCounts(counts.dataframe)
807 vennf = "Venn_significant_genes_overlap.pdf"
808 pdf(vennf)
809 vennDiagram(counts.venn,main=vennmain,col="maroon")
810 dev.off()
811 }
812 } #### doDESeq2 or doVoom
813
814 }
815 #### Done
816
817 ###sink(stdout(),append=T,type="message")
# Driver for the Differential_Count tool. Galaxy substitutes the Cheetah
# placeholders with the tool-form values before Rscript runs this file; the
# script then reads the count matrix, keeps the selected treatment and control
# columns, and calls edgeIt() to run whichever methods were requested.

# Defaults; the Cheetah conditionals further down override them for every
# method that was switched on in the tool form.
builtin_gmt = ""
history_gmt = ""
history_gmt_name = ""
out_edgeR = F
out_DESeq2 = F
out_VOOM = "$out_VOOM"
edgeR_robust_meth = "ordinary" # control robust deviance options
doDESeq2 = $DESeq2.doDESeq2
doVoom = $doVoom
doCamera = F # camera gene set testing is switched off in this variant of the tool
doedgeR = $edgeR.doedgeR
edgeR_priordf = 10
# Defined up front so the edgeIt() call below never names a missing object when
# DESeq2 is not selected. "parametric" is the default fit type in DESeq2.
DESeq_fitType = "parametric"


#if $doVoom == "T":
out_VOOM = "$out_VOOM"
#end if

#if $DESeq2.doDESeq2 == "T":
out_DESeq2 = "$out_DESeq2"
doDESeq2 = T
DESeq_fitType = "$DESeq2.DESeq_fitType"
#end if

#if $edgeR.doedgeR == "T":
out_edgeR = "$out_edgeR"
edgeR_priordf = $edgeR.edgeR_priordf
edgeR_robust_meth = "$edgeR.edgeR_robust_method"
#end if


# Nothing selected: report on stderr and stop with a non-zero exit status.
if (sum(c(doedgeR,doVoom,doDESeq2)) == 0)
{
    write("No methods chosen - nothing to do! Please try again after choosing one or more methods", stderr())
    quit(save="no",status=2)
}

Out_Dir = "$html_file.files_path"
Input = "$input1"
TreatmentName = "$treatment_name"
TreatmentCols = "$Treat_cols"
ControlName = "$control_name"
ControlCols= "$Control_cols"
org = "$input1.dbkey"
if (org == "") { org = "hg19"}
fdrtype = "$fdrtype"
# Fall back to the previously hard-coded method if the form value is not one
# that p.adjust() understands.
if (!(fdrtype %in% p.adjust.methods)) { fdrtype = 'BH' }
fdrthresh = $fdrthresh
useNDF = $useNDF
fQ = $fQ # non-differential centile cutoff
myTitle = "$title"
sids = strsplit("$subjectids",',')
subjects = unlist(sids)
nsubj = length(subjects)
# The -1 compensates for read.table(row.names=1) below, which removes the first
# file column from the matrix.
TCols = as.numeric(strsplit(TreatmentCols,",")[[1]])-1
CCols = as.numeric(strsplit(ControlCols,",")[[1]])-1
cat('Got TCols=')
cat(TCols)
cat('; CCols=')
cat(CCols)
cat('\n')
useCols = c(TCols,CCols)
if (!file.exists(Out_Dir)) dir.create(Out_Dir)
Count_Matrix = read.table(Input,header=T,row.names=1,sep='\t') # Load tab file, assume header
snames = colnames(Count_Matrix)
nsamples = length(snames)
# A subject id list, when given, must have one entry per column of the input
# matrix (not just the selected columns); otherwise stop with exit status 4.
if (nsubj > 0 && nsubj != nsamples) {
    options("show.error.messages"=T)
    mess = paste('Fatal error: Supplied subject id list',paste(subjects,collapse=','),
        'has length',nsubj,'but there are',nsamples,'samples',paste(snames,collapse=','))
    write(mess, stderr())
    quit(save="no",status=4)
}
# Keep subject ids and matrix columns in the same treatment-then-control order.
if (length(subjects) != 0) {subjects = subjects[useCols]}
Count_Matrix = Count_Matrix[,useCols] # reorder columns
# Rows named librarySize or NotInBedRegions are split off from the contig rows.
rn = rownames(Count_Matrix)
islib = rn %in% c('librarySize','NotInBedRegions')
LibSizes = Count_Matrix[subset(rn,islib),][1] # take first
Count_Matrix = Count_Matrix[subset(rn,! islib),]
group = c(rep(TreatmentName,length(TCols)), rep(ControlName,length(CCols)) ) # Build a group descriptor
# Control is the first factor level.
group = factor(group, levels=c(ControlName,TreatmentName))
colnames(Count_Matrix) = paste(group,colnames(Count_Matrix),sep="_") # Relabel columns
# fdrtype is now forwarded from the tool form instead of being fixed at 'BH'.
# NOTE(review): useNDF read from the form above is still not forwarded
# (useNDF=F stays hard-coded) - confirm whether that is deliberate.
results = edgeIt(Count_Matrix=Count_Matrix,group=group, out_edgeR=out_edgeR, out_VOOM=out_VOOM, out_DESeq2=out_DESeq2,
    fdrtype=fdrtype,mydesign=NULL,priordf=edgeR_priordf,fdrthresh=fdrthresh,outputdir='.',
    myTitle=myTitle,useNDF=F,libSize=c(),filterquantile=fQ,subjects=subjects,
    doDESeq2=doDESeq2,doVoom=doVoom,doCamera=doCamera,doedgeR=doedgeR,org=org,
    histgmt=history_gmt,bigmt=builtin_gmt,DESeq_fitType=DESeq_fitType,robust_meth=edgeR_robust_meth)
sessionInfo()
905 ]]>
906 </configfile>
907 </configfiles>
908 <help>
909
910 **What it does**
911
912 Allows short read sequence counts from controlled experiments to be analysed for differentially expressed genes.
913 Optionally adds a term for subject if not all samples are independent or if some other factor needs to be blocked in the design.
914
915 **Input**
916
917 Requires a count matrix as a tabular file. These are best made using the companion HTSeq_ based counter Galaxy wrapper
918 and your fave gene model to generate inputs. Each row is a genomic feature (gene or exon eg) and each column the
919 non-negative integer count of reads from one sample overlapping the feature.
920 The matrix must have a header row uniquely identifying the source samples, and unique row names in
921 the first column. Typically the row names are gene symbols or probe ids for downstream use in GSEA and other methods.
922
923 **Specifying comparisons**
924
925 This is basically dumbed down to a single factor with two levels - case vs control.
926
927 More complex interfaces are possible but painful at present.
928 Probably need to specify a phenotype file to do this better.
929 Work in progress. Send code.
930
931 If you have (eg) paired samples and wish to include a term in the GLM to account for some other factor (subject in the case of paired samples),
932 supply a comma separated list of integers, one for every sample (whether modelled or not!), indicating (eg) the subject number.
933 Leave the list empty if all samples are independent.
934 If not empty, there must be exactly as many integers in the supplied integer list as there are columns (samples) in the count matrix.
935 Integers for samples that are not in the analysis *must* be present in the string as filler even if not used.
936
937 So if you have 2 pairs out of 6 samples, you need to put in unique integers for the unpaired ones
938 eg if you had 6 samples with the first two independent but the second and third pairs each being from independent subjects, you might use
939 8,9,1,1,2,2
940 as subject IDs to indicate two paired samples from the same subject in columns 3/4 and 5/6
941
942 **Methods available**
943
944 You can run 3 popular Bioconductor packages available for count data.
945
946 edgeR - see edgeR_ for details
947
948 VOOM/limma - see limma_VOOM_ for details
949
950 DESeq2 - see DESeq2_ for details
951
952 and optionally camera in edgeR, which works better if MSigDB is installed (camera is switched off in this version of the tool).
953
954 **Outputs**
955
956 Some helpful plots and analysis results. Note that most of these are produced using R code
957 suggested by the excellent documentation and vignettes for the Bioconductor
958 packages invoked. The Tool Factory is used to automatically lay these out for you to enjoy.
959
960 **Note on Voom**
961
962 The R help page for voom in limma version 3.16.6 includes the following from the authors - but you should read the paper to interpret this method.
963
964 This function is intended to process RNA-Seq or ChIP-Seq data prior to linear modelling in limma.
965
966 voom is an acronym for mean-variance modelling at the observational level.
967 The key concern is to estimate the mean-variance relationship in the data, then use this to compute appropriate weights for each observation.
968 Count data almost always show non-trivial mean-variance relationships. Raw counts show increasing variance with increasing count size, while log-counts typically show a decreasing mean-variance trend.
969 This function estimates the mean-variance trend for log-counts, then assigns a weight to each observation based on its predicted variance.
970 The weights are then used in the linear modelling process to adjust for heteroscedasticity.
971
972 In an experiment, a count value is observed for each tag in each sample. A tag-wise mean-variance trend is computed using lowess.
973 The tag-wise mean is the mean log2 count with an offset of 0.5, across samples for a given tag.
974 The tag-wise variance is the quarter-root-variance of normalized log2 counts per million values with an offset of 0.5, across samples for a given tag.
975 Tags with zero counts across all samples are not included in the lowess fit. Optional normalization is performed using normalizeBetweenArrays.
976 Using fitted values of log2 counts from a linear model fit by lmFit, variances from the mean-variance trend were interpolated for each observation.
977 This was carried out by approxfun. Inverse variance weights can be used to correct for mean-variance trend in the count data.
978
979
980 Author(s)
981
982 Charity Law and Gordon Smyth
983
984 References
985
986 Law, CW (2013). Precision weights for gene expression analysis. PhD Thesis. University of Melbourne, Australia.
987
988 Law, CW, Chen, Y, Shi, W, Smyth, GK (2013). Voom! Precision weights unlock linear model analysis tools for RNA-seq read counts.
989 Technical Report 1 May 2013, Bioinformatics Division, Walter and Eliza Hall Institute of Medical Research, Melbourne, Australia.
990 http://www.statsci.org/smyth/pubs/VoomPreprint.pdf
991
992 See Also
993
994 A voom case study is given in the edgeR User's Guide.
995
996 vooma is a similar function but for microarrays instead of RNA-seq.
997
998
999 **Old rant on changes to Bioconductor package variable names between versions**
1000
1001 The edgeR authors made a small cosmetic change in the name of one important variable (from p.value to PValue)
1002 breaking this and all other code that assumed the old name for this variable,
1003 between edgeR2.4.4 and 2.4.6 (the version for R 2.14 as at the time of writing).
1004 This means that all code using edgeR is sensitive to the version. I think this was a very unwise thing
1005 to do because it wasted hours of my time to track down and will similarly cost other edgeR users dearly
1006 when their old scripts break. This tool currently now works with 2.4.6.
1007
1008 **Note on prior.N**
1009
1010 http://seqanswers.com/forums/showthread.php?t=5591 says:
1011
1012 *prior.n*
1013
1014 The value for prior.n determines the amount of smoothing of tagwise dispersions towards the common dispersion.
1015 You can think of it as like a "weight" for the common value. (It is actually the weight for the common likelihood
1016 in the weighted likelihood equation). The larger the value for prior.n, the more smoothing, i.e. the closer your
1017 tagwise dispersion estimates will be to the common dispersion. If you use a prior.n of 1, then that gives the
1018 common likelihood the weight of one observation.
1019
1020 In answer to your question, it is a good thing to squeeze the tagwise dispersions towards a common value,
1021 or else you will be using very unreliable estimates of the dispersion. I would not recommend using the value that
1022 you obtained from estimateSmoothing()---this is far too small and would result in virtually no moderation
1023 (squeezing) of the tagwise dispersions. How many samples do you have in your experiment?
1024 What is the experimental design? If you have few samples (less than 6) then I would suggest a prior.n of at least 10.
1025 If you have more samples, then the tagwise dispersion estimates will be more reliable,
1026 so you could consider using a smaller prior.n, although I would hesitate to use a prior.n less than 5.
1027
1028
1029 From Bioconductor Digest, Vol 118, Issue 5, Gordon writes:
1030
1031 Dear Dorota,
1032
1033 The important settings are prior.df and trend.
1034
1035 prior.n and prior.df are related through prior.df = prior.n * residual.df,
1036 and your experiment has residual.df = 36 - 12 = 24. So the old setting of
1037 prior.n=10 is equivalent for your data to prior.df = 240, a very large
1038 value. Going the other way, the new setting of prior.df=10 is equivalent
1039 to prior.n=10/24.
1040
1041 To recover old results with the current software you would use
1042
1043 estimateTagwiseDisp(object, prior.df=240, trend="none")
1044
1045 To get the new default from old software you would use
1046
1047 estimateTagwiseDisp(object, prior.n=10/24, trend=TRUE)
1048
1049 Actually the old trend method is equivalent to trend="loess" in the new
1050 software. You should use plotBCV(object) to see whether a trend is
1051 required.
1052
1053 Note you could also use
1054
1055 prior.n = getPriorN(object, prior.df=10)
1056
1057 to map between prior.df and prior.n.
1058
1059 ----
1060
1061 **Attributions**
1062
1063 edgeR - edgeR_
1064
1065 VOOM/limma - limma_VOOM_
1066
1067 DESeq2 - DESeq2_ for details
1068
1069 See above for Bioconductor package documentation for packages exposed in Galaxy by this tool and app store package.
1070
1071 Galaxy_ (that's what you are using right now!) for gluing everything together
1072
1073 Otherwise, all code and documentation comprising this tool was written by Ross Lazarus and is
1074 licensed to you under the LGPL_ like other rgenetics artefacts
1075
1076 .. _LGPL: http://www.gnu.org/copyleft/lesser.html
1077 .. _HTSeq: http://www-huber.embl.de/users/anders/HTSeq/doc/index.html
1078 .. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html
1079 .. _DESeq2: http://www.bioconductor.org/packages/release/bioc/html/DESeq2.html
1080 .. _limma_VOOM: http://www.bioconductor.org/packages/release/bioc/html/limma.html
1081 .. _Galaxy: http://getgalaxy.org
1082 </help>
1083
1084 </tool>
1085
1086