comparison rgedgeR/rgedgeRpaired.xml @ 0:2122e630b13a draft

Initial commit of replacement for edger_test
author fubar
date Fri, 26 Jul 2013 23:50:59 -0400
parents
children df4fea3be1b3
comparison
equal deleted inserted replaced
-1:000000000000 0:2122e630b13a
1 <tool id="rgDifferentialCount" name="Differential_Count" version="0.20">
2 <description>models using BioConductor packages</description>
3 <requirements>
4 <requirement type="package" version="2.12">biocbasics</requirement>
5 <requirement type="package" version="3.0.1">r3</requirement>
6 </requirements>
7
8 <command interpreter="python">
9 rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "DifferentialCounts"
10 --output_dir "$html_file.files_path" --output_html "$html_file" --output_tab "$outtab" --make_HTML "yes"
11 </command>
12 <inputs>
13 <param name="input1" type="data" format="tabular" label="Select an input matrix - rows are contigs, columns are counts for each sample"
14 help="Use the HTSeq based count matrix preparation tool to create these matrices from BAM/SAM files and a GTF file of genomic features"/>
15 <param name="title" type="text" value="Differential Counts" size="80" label="Title for job outputs"
16 help="Supply a meaningful name here to remind you what the outputs contain">
17 <sanitizer invalid_char="">
18 <valid initial="string.letters,string.digits"><add value="_" /> </valid>
19 </sanitizer>
20 </param>
21 <param name="treatment_name" type="text" value="Treatment" size="50" label="Treatment Name"/>
22 <param name="Treat_cols" label="Select columns containing treatment." type="data_column" data_ref="input1" numerical="True"
23 multiple="true" use_header_names="true" size="120" display="checkboxes">
24 <validator type="no_options" message="Please select at least one column."/>
25 </param>
26 <param name="control_name" type="text" value="Control" size="50" label="Control Name"/>
27 <param name="Control_cols" label="Select columns containing control." type="data_column" data_ref="input1" numerical="True"
28 multiple="true" use_header_names="true" size="120" display="checkboxes" optional="true">
29 </param>
30 <param name="subjectids" type="text" optional="true" size="120" value = ""
31 label="IF SUBJECTS NOT ALL INDEPENDENT! Enter integers to indicate sample pairing for every column in input"
32 help="Leave blank if no pairing, but eg if data from sample id A99 is in columns 2,4 and id C21 is in 3,5 then enter '1,2,1,2'">
33 <sanitizer>
34 <valid initial="string.digits"><add value="," /> </valid>
35 </sanitizer>
36 </param>
37 <param name="fQ" type="float" value="0.3" size="5" label="Non-differential contig count quantile threshold - zero to analyze all non-zero read count contigs"
38 help="May be a good or a bad idea depending on the biology and the question. EG 0.3 = sparsest 30% of contigs with at least one read are removed before analysis"/>
39 <param name="useNDF" type="boolean" truevalue="T" falsevalue="F" checked="false" size="1"
40 label="Non differential filter - remove contigs below a threshold (1 per million) for half or more samples"
41 help="May be a good or a bad idea depending on the biology and the question. This was the old default. Quantile based is available as an alternative"/>
42
43 <conditional name="edgeR">
44 <param name="doedgeR" type="select"
45 label="Run this model using edgeR"
46 help="edgeR uses a negative binomial model and seems to be powerful, even with few replicates">
47 <option value="F">Do not run edgeR</option>
48 <option value="T" selected="true">Run edgeR</option>
49 </param>
50 <when value="T">
51 <param name="edgeR_priordf" type="integer" value="20" size="3"
52 label="prior.df for tagwise dispersion - lower value = more emphasis on each tag's variance. Replaces prior.n and prior.df = prior.n * residual.df"
53 help="0 = Use edgeR default. Use a small value to 'smooth' small samples. See edgeR docs and note below"/>
54 </when>
55 <when value="F"> </when>
56 </conditional>
57 <conditional name="DESeq2">
58 <param name="doDESeq2" type="select"
59 label="Run the same model with DESeq2 and compare findings"
60 help="DESeq2 is an update to the DESeq package. It uses different assumptions and methods to edgeR">
61 <option value="F" selected="true">Do not run DESeq2</option>
62 <option value="T">Run DESeq2 (only works if NO second GLM factor supplied at present)</option>
63 </param>
64 <when value="T">
65 <param name="DESeq_fitType" type="select">
66 <option value="parametric" selected="true">Parametric (default) fit for dispersions</option>
67 <option value="local">Local fit - use this if parametric fails</option>
68 <option value="mean">Mean dispersion fit- use this if you really understand what you're doing - read the fine manual</option>
69 </param>
70 </when>
71 <when value="F"> </when>
72 </conditional>
73 <param name="doVoom" type="select"
74 label="Run the same model with Voom/limma and compare findings"
75 help="The VOOM transformation allows analysis of count data using limma">
76 <option value="F" selected="true">Do not run VOOM</option>
77 <option value="T">Run VOOM</option>
78 </param>
79 <param name="fdrthresh" type="float" value="0.05" size="5" label="P value threshold for FDR filtering for family wise error rate control"
80 help="Conventional default value of 0.05 recommended"/>
81 <param name="fdrtype" type="select" label="FDR (Type I error) control method"
82 help="Use fdr or bh typically to control for the number of tests in a reliable way">
83 <option value="fdr" selected="true">fdr</option>
84 <option value="BH">Benjamini Hochberg</option>
85 <option value="BY">Benjamini Yekutieli</option>
86 <option value="bonferroni">Bonferroni</option>
87 <option value="hochberg">Hochberg</option>
88 <option value="holm">Holm</option>
89 <option value="hommel">Hommel</option>
90 <option value="none">no control for multiple tests</option>
91 </param>
92 </inputs>
93 <outputs>
94 <data format="tabular" name="outtab" label="${title}.xls"/>
95 <data format="html" name="html_file" label="${title}.html"/>
96 </outputs>
97 <stdio>
98 <exit_code range="4" level="fatal" description="Number of subject ids must match total number of samples in the input matrix" />
99 </stdio>
100 <tests>
101 <test>
102 <param name='input1' value='test_bams2mx.xls' ftype='tabular' />
103 <param name='treatment_name' value='case' />
104 <param name='title' value='edgeRtest' />
105 <param name='useNDF' value='' />
106 <param name='doedgeR' value='T' />
107 <param name='doVoom' value='T' />
108 <param name='doDESeq2' value='T' />
109 <param name='fdrtype' value='fdr' />
110 <param name='edgeR_priordf' value="8" />
111 <param name='fdrthresh' value="0.05" />
112 <param name='control_name' value='control' />
113 <param name='subjectids' value='' />
114 <param name='Treat_cols' value='3,4,5,9' />
115 <param name='Control_cols' value='2,6,7,8' />
116 <output name='outtab' file='edgeRtest1out.xls' compare='diff' />
117 <output name='html_file' file='edgeRtest1out.html' compare='diff' lines_diff='20' />
118 </test>
119 </tests>
120
121 <configfiles>
122 <configfile name="runme">
123 <![CDATA[
124 #
125 # edgeR.Rscript
126 # updated npv 2011 for R 2.14.0 and edgeR 2.4.0 by ross
127 # Performs DGE on a count table containing n replicates of two conditions
128 #
129 # Parameters
130 #
131 # 1 - Output Dir
132
133 # Original edgeR code by: S.Lunke and A.Kaspi
# Largest and smallest positive doubles on a log10 scale (roughly +308 and
# -308). qqPlot() substitutes these for infinite -log10(p) values.
reallybig <- log10(.Machine[['double.xmax']])
reallysmall <- log10(.Machine[['double.xmin']])
136 library('stringr')
137 library('gplots')
138 library('edgeR')
hmap2 = function(cmat,nsamp=100,outpdfname='heatmap2.pdf', TName='Treatment',group=NA,myTitle='title goes here')
{
  # Write a row-scaled heatmap.2 of the leading rows of cmat to outpdfname,
  # with a colour bar above the columns marking each sample's group.
  #   cmat       - matrix with one row per contig and one column per sample;
  #                rows are drawn in the order given (no row dendrogram)
  #   nsamp      - at most this many rows are drawn
  #   outpdfname - name of the PDF file to create
  #   TName      - not used in the body
  #   group      - group label for every column of cmat
  #   myTitle    - main title of the plot
  grpLevels <- unique(group)
  if (length(grpLevels) != 2) {
    rainbowCols <- rainbow(length(grpLevels), start = 0, end = 4/6)
    sideCols <- rainbowCols[match(group, grpLevels)]
  } else {
    # exactly two groups: the first one seen is red, the other one blue
    redOrBlue <- function(g) {
      if (g == grpLevels[1]) "#FF0000" else "#0000FF"
    }
    sideCols <- unlist(lapply(group, redOrBlue))
  }
  # rows without a name cannot be labelled on the plot, so drop them
  hasName <- !is.na(rownames(cmat))
  shown <- cmat[hasName, ]
  if (nrow(shown) > nsamp) {
    shown <- shown[1:nsamp, ]
  }
  # keep the column labels short enough to fit the margin
  colnames(shown) <- substr(colnames(shown), 1, 20)
  pdf(outpdfname)
  heatmap.2(shown, main = myTitle, ColSideColors = sideCols, col = topo.colors(100),
            dendrogram = "col", key = TRUE, density.info = 'none',
            Rowv = FALSE, scale = 'row', trace = 'none', margins = c(8, 8),
            cexRow = 0.4, cexCol = 0.5)
  dev.off()
}
166
hmap = function(cmat,nmeans=4,outpdfname="heatMap.pdf",nsamp=250,TName='Treatment',group=NA,myTitle="Title goes here")
{
  # Write a row-scaled base-graphics heatmap of the leading rows of cmat to
  # outpdfname, with one side colour per group above the columns.
  #   cmat       - matrix with one row per contig and one column per sample
  #   nsamp      - at most this many rows are drawn
  #   group      - group label for every column of cmat
  #   myTitle    - used in the title when all rows fit on the plot
  #   nmeans, TName - not used in the body
  grpLevels <- unique(group)
  sideCols <- rainbow(length(grpLevels), start = 0.3, end = 0.6)[match(group, grpLevels)]
  totalRows <- nrow(cmat)
  if (totalRows > nsamp) {
    cmat <- cmat[c(1:nsamp), ]
    plotTitle <- paste('Heatmap: Top ', nsamp, ' DE contigs (of ', totalRows, ')', sep = '')
  } else {
    plotTitle <- paste(myTitle, 'Heatmap: n contigs =', totalRows)
  }
  # keep the column labels short enough to fit the margin
  colnames(cmat) <- substr(colnames(cmat), 1, 20)
  pdf(outpdfname)
  heatmap(cmat, scale = 'row', main = plotTitle, cexRow = 0.3, cexCol = 0.4,
          Rowv = NA, ColSideColors = sideCols)
  dev.off()
}
187
qqPlot = function(descr='qqplot',pvector, outpdf='qqplot.pdf',...)
# adapted from https://gist.github.com/703512
{
  # Write a QQ plot of observed against expected -log10(p) to outpdf.
  #   descr   - main title of the plot
  #   pvector - p values to plot
  #   outpdf  - name of the PDF file to create
  #   ...     - passed on to plot()
  # Uses the script-level constants reallysmall and reallybig to replace
  # infinite values on the observed axis.
  observed <- -log10(sort(pvector, decreasing = FALSE))
  nPvals <- length(observed)
  expected <- -log10((1:nPvals) / nPvals)
  observed[observed == -Inf] <- reallysmall
  observed[observed == Inf] <- reallybig
  pdf(outpdf)
  plot(expected, observed, pch = 19, cex = 1, main = descr, ...,
       xlab = expression(Expected~~-log[10](italic(p))),
       ylab = expression(Observed~~-log[10](italic(p))),
       xlim = c(0, max(expected)), ylim = c(0, max(observed)))
  # identity line: where the points would fall with no signal
  lines(expected, expected, col = "red")
  grid(col = "lightgray", lty = "dotted")
  dev.off()
}
205
smearPlot = function(DGEList,deTags, outSmear, outMain)
{
  # Write an edgeR smear plot to the PDF file outSmear.
  #   DGEList  - object handed to plotSmear()
  #   deTags   - tags to highlight (passed as de.tags)
  #   outMain  - main title of the plot
  pdf(file = outSmear)
  plotSmear(DGEList, de.tags = deTags, main = outMain)
  grid(lty = "dotted", col = "lightgray")
  dev.off()
}
213
boxPlot = function(rawrs,cleanrs,maint,myTitle,pdfname)
{
  # Write two PDFs describing per-sample count distributions:
  #   1. pdfname - side by side box plots of rawrs and cleanrs
  #   2. "sample_counts_histogram.pdf" - one histogram of rawrs per sample,
  #      all drawn with the same y axis limit
  # Summaries of both tables are printed as well.
  #   rawrs, cleanrs - tables with one column per sample
  #   maint          - text appended to the box plot titles
  #   myTitle        - not used in the body
  nSamples <- ncol(rawrs)
  # negative values are blanked out before plotting
  for (j in c(1:nSamples)) {
    negative <- rawrs[, j] < 0
    rawrs[negative, j] <- NA
  }
  # remember the full sample names for the histogram subtitles
  fullnames <- colnames(rawrs)
  colnames(rawrs) <- substr(fullnames, 1, 20)
  colnames(cleanrs) <- substr(colnames(cleanrs), 1, 20)
  savedPar <- par(no.readonly = TRUE)
  print.noquote('raw contig counts by sample:')
  print.noquote(summary(rawrs))
  print.noquote('normalised contig counts by sample:')
  print.noquote(summary(cleanrs))

  pdf(pdfname)
  par(mfrow = c(1, 2))
  boxplot(rawrs, varwidth = TRUE, notch = TRUE, ylab = 'log contig count', col = "maroon",
          las = 3, cex.axis = 0.35, main = paste('Raw:', maint))
  grid(col = "lightgray", lty = "dotted")
  boxplot(cleanrs, varwidth = TRUE, notch = TRUE, ylab = 'log contig count', col = "maroon",
          las = 3, cex.axis = 0.35, main = paste('After ', maint))
  grid(col = "lightgray", lty = "dotted")
  dev.off()

  print.noquote(paste('Using ncol rawrs=', nSamples))
  # smallest square grid of panels that holds one histogram per sample
  gridSide <- round(sqrt(nSamples))
  if (gridSide * gridSide < nSamples) {
    gridSide <- gridSide + 1
  }
  # tallest bar over all samples, so every panel shares one y axis limit
  tallest <- sapply(c(1:nSamples), function(j) {
    max(hist(rawrs[, j], breaks = 100, plot = FALSE)\$counts)
  })
  ymax <- max(tallest)
  pdf("sample_counts_histogram.pdf")
  par(mfrow = c(gridSide, gridSide))
  for (j in c(1:nSamples)) {
    hist(rawrs[, j], main = paste("Contig logcount", j), xlab = 'log raw count', col = "maroon",
         breaks = 100, sub = fullnames[j], cex = 0.8, ylim = c(0, ymax))
  }
  dev.off()
  par(savedPar)
}
256
cumPlot = function(rawrs,cleanrs,maint,myTitle)
{
  # Write "Filtering_rowsum_bar_charts.pdf": two stacked histograms of log10
  # row sums, before (rawrs) and after (cleanrs) filtering, on one x range.
  #   rawrs, cleanrs - row sums before and after filtering
  #   maint          - text appended to the panel titles
  #   myTitle        - sub title of both panels
  savedPar <- par(no.readonly = TRUE)
  logRaw <- log(rawrs, 10)
  xTop <- max(logRaw)
  drawPanel <- function(values, prefix) {
    hist(values, breaks = 100, main = paste(prefix, maint), xlab = "# Reads (log)",
         ylab = "Count", col = "maroon", sub = myTitle, xlim = c(0, xTop), las = 1)
    grid(col = "lightgray", lty = "dotted")
  }
  pdf("Filtering_rowsum_bar_charts.pdf")
  par(mfrow = c(2, 1))
  drawPanel(logRaw, 'Before:')
  drawPanel(log(cleanrs, 10), 'After:')
  dev.off()
  par(savedPar)
}
275
cumPlot1 = function(rawrs,cleanrs,maint,myTitle)
{
  # Write "<myTitle without spaces>_RowsumCum.pdf": two stacked cumulative
  # proportion curves (log x axis) built from the ecdf knots of rawrs and
  # cleanrs, both drawn over the x range 1 .. max(rawrs).
  outName <- paste(gsub(" ", "", myTitle, fixed = TRUE), "RowsumCum.pdf", sep = '_')
  pdf(outName)
  par(mfrow = c(2, 1))
  xTop <- max(rawrs)
  rawKnots <- knots(ecdf(rawrs))
  cleanKnots <- knots(ecdf(cleanrs))
  drawCurve <- function(knotValues, prefix) {
    proportion <- 1:length(knotValues) / length(knotValues)
    plot(knotValues, proportion, type = 'l', main = paste(prefix, maint),
         xlab = "Log Contig Total Reads", ylab = "Cumulative proportion",
         col = "maroon", log = 'x', xlim = c(1, xTop), sub = myTitle)
    grid(col = "blue")
  }
  drawCurve(rawKnots, 'Before')
  drawCurve(cleanKnots, 'After')
  dev.off()
}
294
295
296
doGSEA = function(y=NULL,design=NULL,histgmt="",
                  bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
                  ntest=0, myTitle="myTitle", outfname="GSEA.xls", minnin=5, maxnin=2000,fdrthresh=0.05,fdrtype="BH")
{
  # Run camera() once per gene set and report the sets ranked by p value,
  # separately for sets called "Up" and for all others.
  #   y, design - passed to camera(); rownames(y) are matched (upper-cased)
  #               against the gene symbols of each set
  #   histgmt   - optional gmt file name; "" to skip
  #   bigmt     - optional gmt file name; "" to skip
  #   ntest     - ignored: every gene set that was read is tested
  #   myTitle   - used in the header line written to outfname
  #   outfname  - header file; results go to camera_up_<outfname> and
  #               camera_down_<outfname>
  #   minnin, maxnin - a set is tested only when the number of matching
  #               rows is strictly between these bounds
  #   fdrthresh, fdrtype - threshold and p.adjust method for the adjusted p
  # Everything printed is captured in Camera.log.
  sink('Camera.log')
  # restore normal output even when camera() or a file read fails
  on.exit(sink(), add=TRUE)
  genesets = character(0)
  if (bigmt > "")
  {
    genesets = readLines(bigmt)
  }
  if (histgmt > "")
  {
    # concatenate: each element is one line, i.e. one gene set
    genesets = c(genesets,readLines(histgmt))
  }
  print.noquote(paste("@@@read",length(genesets), 'genesets from',histgmt,bigmt))
  genesets = strsplit(genesets,'\t') # tabular. genesetid\tURLorwhatever\tgene_1\t..\tgene_n
  header = paste(myTitle,'edgeR GSEA')
  write(header,file=outfname,append=F)
  urownames = toupper(rownames(y))
  upcam = NULL
  downcam = NULL
  for (gs in genesets) {
    # a usable line has an id, a url field and at least one gene
    if (length(gs) < 3) { next }
    g = gs[1] # geneset_id
    u = gs[2]
    if (u > "") { u = paste("<a href=\'",u,"\'>",u,"</a>",sep="") }
    glist = toupper(gs[3:length(gs)]) # member gene symbols
    inglist = urownames %in% glist
    nin = sum(inglist)
    if ((nin > minnin) && (nin < maxnin)) {
      camres = camera(y=y,index=inglist,design=design)
      if (! is.null(camres)) {
        rownames(camres) = g # gene set name
        camres = cbind(GeneSet=g,URL=u,camres)
        if (camres\$Direction == "Up")
        {
          upcam = rbind(upcam,camres)
        } else {
          downcam = rbind(downcam,camres)
        }
      }
    }
  }
  # sort one direction's results by p value and add the adjusted p value;
  # an empty result becomes an empty table rather than an error
  rankCamera = function(cam) {
    if (is.null(cam)) {
      return(data.frame(PValue=numeric(0),adjPValue=numeric(0)))
    }
    cam = cam[order(cam\$PValue),]
    cam\$adjPValue = p.adjust(cam\$PValue,method=fdrtype)
    cam
  }
  uscam = rankCamera(upcam)
  # always show at least the top 10 sets, more if more are significant
  nup = max(10,sum((uscam\$adjPValue < fdrthresh)))
  dscam = rankCamera(downcam)
  ndown = max(10,sum((dscam\$adjPValue < fdrthresh)))
  write.table(uscam,file=paste('camera_up',outfname,sep='_'),quote=F,sep='\t',row.names=F)
  write.table(dscam,file=paste('camera_down',outfname,sep='_'),quote=F,sep='\t',row.names=F)
  print.noquote(paste('@@@@@ Camera up top',nup,'gene sets:'))
  write.table(head(uscam,nup),file="",quote=F,sep='\t',row.names=F)
  print.noquote(paste('@@@@@ Camera down top',ndown,'gene sets:'))
  write.table(head(dscam,ndown),file="",quote=F,sep='\t',row.names=F)
}
365
366
367
edgeIt = function (Count_Matrix,group,outputfilename,fdrtype='fdr',priordf=5,
        fdrthresh=0.05,outputdir='.', myTitle='Differential Counts',libSize=c(),useNDF=F,
        filterquantile=0.2, subjects=c(),mydesign=NULL,
        doDESeq2=T,doVoom=T,doCamera=T,doedgeR=T,org='hg19',
        histgmt="", bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
        doCook=F,DESeq_fitType="parametric")
{
  # Two-group differential count analysis with any of edgeR, DESeq2 and
  # VOOM/limma, plus optional camera gene set tests and a Venn diagram of
  # the contigs each method calls significant.
  #   Count_Matrix   - counts, one row per contig and one column per sample
  #   group          - group label per column; exactly two distinct values
  #   outputfilename - file for the edgeR table sorted by p value
  #   fdrtype        - p.adjust method name
  #   priordf        - prior.df for the edgeR tagwise dispersion; 0 or less
  #                    leaves the edgeR default in place
  #   fdrthresh      - adjusted p value cutoff for calling a contig
  #   useNDF         - TRUE or "T": keep contigs with >= 1 read per million
  #                    in at least half the samples; otherwise drop contigs
  #                    at or below the filterquantile of non-zero row sums
  #   subjects       - optional subject id per column; when given, subject
  #                    is added to the design as a blocking factor
  #   mydesign       - optional ready-made design matrix
  #   doDESeq2, doVoom, doCamera, doedgeR - which analyses to run
  #   org            - genome build used in UCSC links
  #   histgmt, bigmt - gmt files handed to doGSEA()
  #   DESeq_fitType  - fitType for the DESeq2 dispersion estimate
  #   outputdir, libSize, doCook - not used in the body
  # Returns the unsorted edgeR result table, or NULL when edgeR was not run.
  # Side effects: log files, PDF plots and tables written to the working
  # directory.
  if (length(unique(group))!=2){
    # fail loudly: q() would end the job with a success exit status
    stop("Number of conditions identified in experiment does not equal 2")
  }
  require(edgeR)
  options(width = 512)
  mt = paste(unlist(strsplit(myTitle,'_')),collapse=" ")
  allN = nrow(Count_Matrix)
  nscut = round(ncol(Count_Matrix)/2)
  colTotmillionreads = colSums(Count_Matrix)/1e6
  counts.dataframe = as.data.frame(c())
  uoutput = NULL
  # counts per million of effective (normalisation-factor scaled) library
  # size. The division is done on the transpose so each sample column is
  # divided by its own library size; counts/efflib would recycle the
  # per-sample vector down the rows instead.
  cpmEff = function(d) {
    efflib = d\$samples\$lib.size*d\$samples\$norm.factors
    1e+06*t(t(d\$counts)/efflib)
  }
  rawrs = rowSums(Count_Matrix)
  nonzerod = Count_Matrix[(rawrs > 0),] # remove all zero count genes
  nzN = nrow(nonzerod)
  nzrs = rowSums(nonzerod)
  zN = allN - nzN
  print('# Quantiles for non-zero row counts:',quote=F)
  print(quantile(nzrs,probs=seq(0,1,0.1)),quote=F)
  # accept both a logical and the "T"/"F" strings produced by the tool form
  wantNDF = isTRUE(useNDF) ||
    (is.character(useNDF) && length(useNDF) == 1 && toupper(useNDF) %in% c("T","TRUE"))
  if (wantNDF)
  {
    gt1rpin3 = rowSums(Count_Matrix/expandAsMatrix(colTotmillionreads,dim(Count_Matrix)) >= 1) >= nscut
    lo = colSums(Count_Matrix[!gt1rpin3,])
    workCM = Count_Matrix[gt1rpin3,]
    cleanrs = rowSums(workCM)
    cleanN = length(cleanrs)
    # contigs dropped by this filter, not counting the all-zero ones
    nlow = sum(!gt1rpin3) - zN
    meth = paste("After removing",nlow,"contigs with fewer than",nscut,"sample read counts >= 1 per million, there are")
    print(paste("Read",allN,"contigs. Removed",zN,"contigs with no reads.",meth,cleanN,"contigs"),quote=F)
    maint = paste('Filter >=1/million reads in >=',nscut,'samples')
  } else {
    useme = (nzrs > quantile(nzrs,filterquantile))
    workCM = nonzerod[useme,]
    lo = colSums(nonzerod[!useme,])
    cleanrs = rowSums(workCM)
    cleanN = length(cleanrs)
    meth = paste("After filtering at count quantile =",filterquantile,", there are",sep="")
    print(paste('Read',allN,"contigs. Removed",zN,"with no reads.",meth,cleanN,"contigs"),quote=F)
    maint = paste('Filter below',filterquantile,'quantile')
  }
  cumPlot(rawrs=rawrs,cleanrs=cleanrs,maint=maint,myTitle=myTitle)
  allgenes = rownames(workCM)
  reg = "^chr([0-9]+):([0-9]+)-([0-9]+)"
  genecards="<a href=\'http://www.genecards.org/index.php?path=/Search/keyword/"
  ucsc = paste("<a href=\'http://genome.ucsc.edu/cgi-bin/hgTracks?db=",org,sep='')
  testreg = str_match(allgenes,reg)
  if (sum(!is.na(testreg[,1]))/length(testreg[,1]) > 0.8) # is ucsc style string
  {
    print("@@ using ucsc substitution for urls")
    contigurls = paste0(ucsc,"&amp;position=chr",testreg[,2],":",testreg[,3],"-",testreg[,4],"\'>",allgenes,"</a>")
  } else {
    print("@@ using genecards substitution for urls")
    contigurls = paste0(genecards,allgenes,"\'>",allgenes,"</a>")
  }
  print.noquote("# urls")
  print.noquote(head(contigurls))
  print(paste("# Total low count contigs per sample = ",paste(lo,collapse=',')),quote=F)
  cmrowsums = rowSums(workCM)
  TName=unique(group)[1]
  CName=unique(group)[2]
  if (is.null(mydesign)) {
    if (length(subjects) == 0)
    {
      mydesign = model.matrix(~group)
    }
    else {
      subjf = factor(subjects)
      mydesign = model.matrix(~subjf+group) # we block on subject so make group last to simplify finding it
    }
  }
  print.noquote(paste('Using samples:',paste(colnames(workCM),collapse=',')))
  print.noquote('Using design matrix:')
  print.noquote(mydesign)
  # edgeR, VOOM and camera all work from the same normalised count object,
  # so build it once here rather than inside the edgeR branch
  if (doedgeR || doVoom || doCamera) {
    dge = DGEList(counts=workCM, group = group)
    dge = calcNormFactors(dge)
    norm.factor = dge\$samples\$norm.factors
  }
  if (doedgeR) {
    sink('edgeR.log')
    dge = estimateGLMCommonDisp(dge,mydesign)
    comdisp = dge\$common.dispersion
    dge = estimateGLMTrendedDisp(dge,mydesign)
    if (priordf > 0) {
      print.noquote(paste("prior.df =",priordf))
      dge = estimateGLMTagwiseDisp(dge,mydesign,prior.df = priordf)
    } else {
      dge = estimateGLMTagwiseDisp(dge,mydesign)
    }
    DGLM = glmFit(dge,design=mydesign)
    DE = glmLRT(DGLM,coef=ncol(DGLM\$design)) # always last one - subject is first if needed
    normData = cpmEff(dge)
    uoutput = cbind(
      Name=as.character(rownames(dge\$counts)),
      DE\$table,
      adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
      Dispersion=dge\$tagwise.dispersion,totreads=cmrowsums,normData,
      dge\$counts
    )
    soutput = uoutput[order(DE\$table\$PValue),] # sorted into p value order - for quick toptable
    goodness = gof(DGLM, pcutoff=fdrthresh)
    if (sum(goodness\$outlier) > 0) {
      print.noquote('GLM outliers:')
      print(paste(rownames(DGLM)[(goodness\$outlier)],collapse=','),quote=F)
    } else {
      print('No GLM fit outlier genes found\n')
    }
    z = limma::zscoreGamma(goodness\$gof.statistic, shape=goodness\$df/2, scale=2)
    pdf("edgeR_GoodnessofFit.pdf")
    qq = qqnorm(z, panel.first=grid(), main="tagwise dispersion")
    abline(0,1,lwd=3)
    points(qq\$x[goodness\$outlier],qq\$y[goodness\$outlier], pch=16, col="maroon")
    dev.off()
    estpriorn = getPriorN(dge)
    print(paste("Common Dispersion =",comdisp,"CV = ",sqrt(comdisp),"getPriorN = ",estpriorn),quote=F)
    #### Plot MDS
    groupf = factor(group) # works whether group arrives as factor or character
    sample_colors = as.integer(groupf)
    sampleTypes = levels(groupf)
    print.noquote(sampleTypes)
    pdf("edgeR_MDSplot.pdf")
    plotMDS.DGEList(dge,main=paste("edgeR MDS for",myTitle),cex=0.5,col=sample_colors,pch=sample_colors)
    legend(x="topleft", legend = sampleTypes,col=c(1:length(sampleTypes)), pch=19)
    grid(col="blue")
    dev.off()
    colnames(normData) = paste( colnames(normData),'N',sep="_")
    print(paste('Raw sample read totals',paste(colSums(nonzerod,na.rm=T),collapse=',')))
    nzd = data.frame(log(nonzerod + 1e-2,10))
    boxPlot(rawrs=nzd,cleanrs=log(normData,10),maint='TMM Normalisation',myTitle=myTitle,pdfname="edgeR_raw_norm_counts_box.pdf")
    write.table(soutput,outputfilename, quote=FALSE, sep="\t",row.names=F)
    tt = cbind(
      Name=as.character(rownames(dge\$counts)),
      DE\$table,
      adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
      Dispersion=dge\$tagwise.dispersion,totreads=cmrowsums
    )
    print.noquote("# edgeR Top tags\n")
    tt = cbind(tt,URL=contigurls) # add to end so table isn't laid out strangely
    tt = tt[order(DE\$table\$PValue),]
    # head() avoids NA filler rows when fewer than 50 contigs remain
    print.noquote(head(tt,50))
    deTags = rownames(uoutput[uoutput\$adj.p.value < fdrthresh,])
    nsig = length(deTags)
    print(paste('#',nsig,'tags significant at adj p=',fdrthresh),quote=F)
    pdf("edgeR_BCV_vs_abundance.pdf")
    plotBCV(dge, cex=0.3, main="Biological CV vs abundance")
    dev.off()
    dg = dge[order(DE\$table\$PValue),]
    normData = cpmEff(dg)
    outpdfname="edgeR_heatmap.pdf"
    hmap2(normData,nsamp=100,TName=TName,group=group,outpdfname=outpdfname,myTitle=myTitle)
    outSmear = "edgeR_smearplot.pdf"
    outMain = paste("Smear Plot for ",TName,' Vs ',CName,' (FDR@',fdrthresh,' N = ',nsig,')',sep='')
    smearPlot(DGEList=dge,deTags=deTags, outSmear=outSmear, outMain = outMain)
    qqPlot(descr=paste(myTitle,'edgeR QQ plot'),pvector=DE\$table\$PValue,outpdf='edgeR_qqplot.pdf')
    topresults.edgeR = soutput[which(soutput\$adj.p.value < fdrthresh), ]
    edgeRcountsindex = which(allgenes %in% rownames(topresults.edgeR))
    edgeRcounts = rep(0, length(allgenes))
    edgeRcounts[edgeRcountsindex] = 1 # Create venn diagram of hits
    sink()
  } ### doedgeR
  if (doDESeq2 == T)
  {
    sink("DESeq2.log")
    # DESeq2
    require('DESeq2')
    library('RColorBrewer')
    pdata = data.frame(Name=colnames(workCM),Rx=group,row.names=colnames(workCM))
    deSEQds = DESeqDataSetFromMatrix(countData = workCM, colData = pdata, design = formula(~ Rx))
    deSeqDatsizefac = estimateSizeFactors(deSEQds)
    deSeqDatdisp = estimateDispersions(deSeqDatsizefac,fitType=DESeq_fitType)
    resDESeq = nbinomWaldTest(deSeqDatdisp, pAdjustMethod=fdrtype)
    rDESeq = as.data.frame(results(resDESeq))
    rDESeq = cbind(Contig=rownames(workCM),rDESeq,NReads=cmrowsums,URL=contigurls)
    srDESeq = rDESeq[order(rDESeq\$pvalue),]
    qqPlot(descr=paste(myTitle,'DESeq2 qqplot'),pvector=rDESeq\$pvalue,outpdf='DESeq2_qqplot.pdf')
    cat("# DESeq top 50\n")
    print.noquote(head(srDESeq,50))
    write.table(srDESeq,paste(mt,'DESeq2_TopTable.xls',sep='_'), quote=FALSE, sep="\t",row.names=F)
    topresults.DESeq = rDESeq[which(rDESeq\$padj < fdrthresh), ]
    DESeqcountsindex = which(allgenes %in% rownames(topresults.DESeq))
    DESeqcounts = rep(0, length(allgenes))
    DESeqcounts[DESeqcountsindex] = 1
    pdf("DESeq2_dispersion_estimates.pdf")
    plotDispEsts(resDESeq)
    dev.off()
    # na.rm so a missing fold change cannot turn the axis limit into NA
    ysmall = abs(min(rDESeq\$log2FoldChange,na.rm=T))
    ybig = abs(max(rDESeq\$log2FoldChange,na.rm=T))
    ylimit = min(4,ysmall,ybig)
    pdf("DESeq2_MA_plot.pdf")
    plotMA(resDESeq,main=paste(myTitle,"DESeq2 MA plot"),ylim=c(-ylimit,ylimit))
    dev.off()
    rlogres = rlogTransformation(resDESeq)
    sampledists = dist( t( assay(rlogres) ) )
    sdmat = as.matrix(sampledists)
    pdf("DESeq2_sample_distance_plot.pdf")
    heatmap.2(sdmat,trace="none",main=paste(myTitle,"DESeq2 sample distances"),
              col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(255))
    dev.off()
    sink()
    result = try( (ppca = plotPCA( varianceStabilizingTransformation(deSeqDatdisp,blind=T), intgroup=c("Rx","Name")) ) )
    if ("try-error" %in% class(result)) {
      print.noquote('DESeq2 plotPCA failed.')
    } else {
      pdf("DESeq2_PCA_plot.pdf")
      # the plot object has to be printed explicitly to reach the device
      print(ppca)
      dev.off()
    }
  }

  if (doVoom == T) {
    sink('VOOM.log')
    pdf("VOOM_mean_variance_plot.pdf")
    dat.voomed = voom(dge, mydesign, plot = TRUE, lib.size = colSums(workCM) * norm.factor)
    dev.off()
    # Use limma to fit data
    fit = lmFit(dat.voomed, mydesign)
    fit = eBayes(fit)
    rvoom = topTable(fit, coef = length(colnames(mydesign)), adjust.method = fdrtype, number = Inf, sort.by="none")
    qqPlot(descr=paste(myTitle,'VOOM-limma QQ plot'),pvector=rvoom\$P.Value,outpdf='VOOM_qqplot.pdf')
    rownames(rvoom) = rownames(workCM)
    rvoom = cbind(rvoom,NReads=cmrowsums,URL=contigurls)
    srvoom = rvoom[order(rvoom\$P.Value),]
    write.table(srvoom,paste(mt,'VOOM_topTable.xls',sep='_'), quote=FALSE, sep="\t",row.names=F)
    # Use an FDR cutoff to find interesting samples for edgeR, DESeq and voom/limma.
    # The mask is computed on the unsorted table, so it must index that same
    # table; contigs are then identified by row name.
    topresults.voom = rvoom[which(rvoom\$adj.P.Val < fdrthresh), ]
    voomcountsindex = which(allgenes %in% rownames(topresults.voom))
    voomcounts = rep(0, length(allgenes))
    voomcounts[voomcountsindex] = 1
    cat("# VOOM top 50\n")
    print(head(srvoom,50))
    sink()
  }

  if (doCamera) {
    doGSEA(y=dge,design=mydesign,histgmt=histgmt,bigmt=bigmt,ntest=20,myTitle=myTitle,
           outfname=paste(mt,"GSEA.xls",sep="_"),fdrthresh=fdrthresh,fdrtype=fdrtype)
  }

  if ((doDESeq2==T) || (doVoom==T) || (doedgeR==T)) {
    if ((doVoom==T) && (doDESeq2==T) && (doedgeR==T)) {
      vennmain = paste(mt,'Voom,edgeR and DESeq2 overlap at FDR=',fdrthresh)
      counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts,
                                    VOOM_limma = voomcounts, row.names = allgenes)
    } else if ((doDESeq2==T) && (doedgeR==T)) {
      vennmain = paste(mt,'DESeq2 and edgeR overlap at FDR=',fdrthresh)
      counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts, row.names = allgenes)
    } else if ((doVoom==T) && (doedgeR==T)) {
      vennmain = paste(mt,'Voom and edgeR overlap at FDR=',fdrthresh)
      counts.dataframe = data.frame(edgeR = edgeRcounts, VOOM_limma = voomcounts, row.names = allgenes)
    } else if ((doVoom==T) && (doDESeq2==T)) {
      vennmain = paste(mt,'Voom and DESeq2 overlap at FDR=',fdrthresh)
      counts.dataframe = data.frame(DESeq2 = DESeqcounts, VOOM_limma = voomcounts, row.names = allgenes)
    }

    # a Venn diagram needs the hit columns of at least two methods
    if (ncol(counts.dataframe) > 1) {
      counts.venn = vennCounts(counts.dataframe)
      vennf = "Venn_significant_genes_overlap.pdf"
      pdf(vennf)
      vennDiagram(counts.venn,main=vennmain,col="maroon")
      dev.off()
    }
  } #### doDESeq2 or doVoom
  # Return our main table (NULL when edgeR was not run)
  uoutput

}
647 #### Done
648
649 ###sink(stdout(),append=T,type="message")
# Job settings. The dollar-prefixed placeholders below are filled in by
# Galaxy before R runs this script.
builtin_gmt=""
history_gmt=""
doDESeq2 = $DESeq2.doDESeq2 # make these T or F
doVoom = $doVoom
doCamera = F
doedgeR = $edgeR.doedgeR
edgeR_priordf = 0
# fallback so the variable always exists, even when DESeq2 is switched off
DESeq_fitType = "parametric"

#if $DESeq2.doDESeq2 == "T"
DESeq_fitType = "$DESeq2.DESeq_fitType"
#end if
#if $edgeR.doedgeR == "T"
edgeR_priordf = $edgeR.edgeR_priordf
#end if


Out_Dir = "$html_file.files_path"
Input = "$input1"
TreatmentName = "$treatment_name"
TreatmentCols = "$Treat_cols"
ControlName = "$control_name"
ControlCols= "$Control_cols"
outputfilename = "$outtab"
org = "$input1.dbkey"
if (org == "") { org = "hg19"}
fdrtype = "$fdrtype"
fdrthresh = $fdrthresh
useNDF = "$useNDF"
fQ = $fQ # non-differential centile cutoff
myTitle = "$title"
subjects = c($subjectids)
nsubj = length(subjects)
if (nsubj > 0) {
  if (doDESeq2) {
    print('WARNING - cannot yet use DESeq2 for 2 way anova - see the docs')
    doDESeq2 = F
  }
}
# Column numbers from the form count the row name column as column 1, but
# read.table(row.names=1) below removes it, hence the -1.
TCols = suppressWarnings(as.numeric(strsplit(TreatmentCols,",")[[1]]))-1
CCols = suppressWarnings(as.numeric(strsplit(ControlCols,",")[[1]]))-1
cat('Got TCols=')
cat(TCols)
cat('; CCols=')
cat(CCols)
cat('\n')
# The control selection is optional on the form; without numeric columns for
# both groups there is nothing to compare.
if (length(TCols) == 0 || length(CCols) == 0 || any(is.na(c(TCols,CCols)))) {
  stop(paste('Treatment and control columns must both be selected. Got treatment =',
             TreatmentCols,'and control =',ControlCols))
}
useCols = c(TCols,CCols)
if (file.exists(Out_Dir) == F) dir.create(Out_Dir)
Count_Matrix = read.table(Input,header=T,row.names=1,sep='\t') #Load tab file assume header
snames = colnames(Count_Matrix)
nsamples = length(snames)
if (nsubj > 0 && nsubj != nsamples) {
  options("show.error.messages"=T)
  mess = paste('Fatal error: Supplied subject id list',paste(subjects,collapse=','),
               'has length',nsubj,'but there are',nsamples,'samples',paste(snames,collapse=','))
  write(mess, stderr())
  quit(save="no",status=4)
}

Count_Matrix = Count_Matrix[,useCols] ### reorder columns
# keep the subject ids aligned with the reordered columns
if (length(subjects) != 0) {subjects = subjects[useCols]}
rn = rownames(Count_Matrix)
# summary rows that are not contigs are split off before the analysis
islib = rn %in% c('librarySize','NotInBedRegions')
LibSizes = Count_Matrix[subset(rn,islib),][1] # take first
Count_Matrix = Count_Matrix[subset(rn,! islib),]
group = c(rep(TreatmentName,length(TCols)), rep(ControlName,length(CCols)) ) #Build a group descriptor
group = factor(group, levels=c(ControlName,TreatmentName))
colnames(Count_Matrix) = paste(group,colnames(Count_Matrix),sep="_") #Relable columns
# Pass on what the user chose on the form (FDR method and threshold, filter
# type, subject pairing) instead of fixed values.
results = edgeIt(Count_Matrix=Count_Matrix,group=group,outputfilename=outputfilename,
                 fdrtype=fdrtype,priordf=edgeR_priordf,fdrthresh=fdrthresh,outputdir='.',
                 myTitle=myTitle,useNDF=useNDF,libSize=c(),filterquantile=fQ,subjects=subjects,
                 doDESeq2=doDESeq2,doVoom=doVoom,doCamera=doCamera,doedgeR=doedgeR,org=org,
                 histgmt=history_gmt,bigmt=builtin_gmt,DESeq_fitType=DESeq_fitType)
sessionInfo()
723 ]]>
724 </configfile>
725 </configfiles>
726 <help>
727
728 ----
729
730 **What it does**
731
732 Allows short read sequence counts from controlled experiments to be analysed for differentially expressed genes.
733 Optionally adds a term for subject if not all samples are independent or if some other factor needs to be blocked in the design.
734
735 **Input**
736
737 Requires a count matrix as a tabular file. These are best made using the companion HTSeq_ based counter Galaxy wrapper
738 and your fave gene model to generate inputs. Each row is a genomic feature (gene or exon eg) and each column the
739 non-negative integer count of reads from one sample overlapping the feature.
740 The matrix must have a header row uniquely identifying the source samples, and unique row names in
741 the first column. Typically the row names are gene symbols or probe ids for downstream use in GSEA and other methods.
742
743 **Specifying comparisons**
744
745 This is deliberately simplified to a single factor with two levels - case vs control.
746
747 More complex interfaces are possible but painful at present.
748 Probably need to specify a phenotype file to do this better.
749 Work in progress. Send code.
750
751 If you have (eg) paired samples and wish to include a term in the GLM to account for some other factor (subject in the case of paired samples),
752 supply a comma separated list of integer indicators, one for every sample (whether modelled or not!), giving (eg) the subject number,
753 or leave the list empty if all samples are independent.
754 If not empty, there must be exactly as many integers in the supplied list as there are columns (samples) in the count matrix.
755 Integers for samples that are not in the analysis *must* be present in the list as filler even if not used.
756
757 So if you have 2 pairs out of 6 samples, you need to put in unique integers for the unpaired ones.
758 For example, if you had 6 samples with the first two independent, and the second and third pairs each coming from a different subject, you might use
759 8,9,1,1,2,2
760 as subject IDs to indicate that columns 3/4 are paired samples from one subject and columns 5/6 are paired samples from another.
761
762 **Methods available**
763
764 You can run 3 popular Bioconductor packages available for count data.
765
766 edgeR - see edgeR_ for details
767
768 VOOM/limma - see limma_VOOM_ for details
769
770 DESeq2 - see DESeq2_ for details
771
772 and optionally camera in edgeR which works better if MSigDB is installed.
773
774 **Outputs**
775
776 Some helpful plots and analysis results. Note that most of these are produced using R code
777 suggested by the excellent documentation and vignettes for the Bioconductor
778 packages invoked. The Tool Factory is used to automatically lay these out for you to enjoy.
779
780 ***old rant on changes to Bioconductor package variable names between versions***
781
782 The edgeR authors made a small cosmetic change in the name of one important variable (from p.value to PValue)
783 breaking this and all other code that assumed the old name for this variable,
784 between edgeR2.4.4 and 2.4.6 (the version for R 2.14 as at the time of writing).
785 This means that all code using edgeR is sensitive to the version. I think this was a very unwise thing
786 to do because it wasted hours of my time to track down and will similarly cost other edgeR users dearly
787 when their old scripts break. This tool currently works with 2.4.6.
788
789 **Note on prior.n**
790
791 http://seqanswers.com/forums/showthread.php?t=5591 says:
792
793 *prior.n*
794
795 The value for prior.n determines the amount of smoothing of tagwise dispersions towards the common dispersion.
796 You can think of it as like a "weight" for the common value. (It is actually the weight for the common likelihood
797 in the weighted likelihood equation). The larger the value for prior.n, the more smoothing, i.e. the closer your
798 tagwise dispersion estimates will be to the common dispersion. If you use a prior.n of 1, then that gives the
799 common likelihood the weight of one observation.
800
801 In answer to your question, it is a good thing to squeeze the tagwise dispersions towards a common value,
802 or else you will be using very unreliable estimates of the dispersion. I would not recommend using the value that
803 you obtained from estimateSmoothing()---this is far too small and would result in virtually no moderation
804 (squeezing) of the tagwise dispersions. How many samples do you have in your experiment?
805 What is the experimental design? If you have few samples (less than 6) then I would suggest a prior.n of at least 10.
806 If you have more samples, then the tagwise dispersion estimates will be more reliable,
807 so you could consider using a smaller prior.n, although I would hesitate to use a prior.n less than 5.
808
809
810 From Bioconductor Digest, Vol 118, Issue 5, Gordon writes:
811
812 Dear Dorota,
813
814 The important settings are prior.df and trend.
815
816 prior.n and prior.df are related through prior.df = prior.n * residual.df,
817 and your experiment has residual.df = 36 - 12 = 24. So the old setting of
818 prior.n=10 is equivalent for your data to prior.df = 240, a very large
819 value. Going the other way, the new setting of prior.df=10 is equivalent
820 to prior.n=10/24.
821
822 To recover old results with the current software you would use
823
824 estimateTagwiseDisp(object, prior.df=240, trend="none")
825
826 To get the new default from old software you would use
827
828 estimateTagwiseDisp(object, prior.n=10/24, trend=TRUE)
829
830 Actually the old trend method is equivalent to trend="loess" in the new
831 software. You should use plotBCV(object) to see whether a trend is
832 required.
833
834 Note you could also use
835
836 prior.n = getPriorN(object, prior.df=10)
837
838 to map between prior.df and prior.n.
839
840 ----
841
842 **Attributions**
843
844 edgeR - edgeR_
845
846 VOOM/limma - limma_VOOM_
847
848 DESeq2 - DESeq2_
849
850 See above for Bioconductor package documentation for packages exposed in Galaxy by this tool and app store package.
851
852 Galaxy_ (that's what you are using right now!) for gluing everything together
853
854 Otherwise, all code and documentation comprising this tool was written by Ross Lazarus and is
855 licensed to you under the LGPL_ like other rgenetics artefacts
856
857 .. _LGPL: http://www.gnu.org/copyleft/lesser.html
858 .. _HTSeq: http://www-huber.embl.de/users/anders/HTSeq/doc/index.html
859 .. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html
860 .. _DESeq2: http://www.bioconductor.org/packages/release/bioc/html/DESeq2.html
861 .. _limma_VOOM: http://www.bioconductor.org/packages/release/bioc/html/limma.html
862 .. _Galaxy: http://getgalaxy.org
863 </help>
864
865 </tool>
866
867