comparison rgedgeRpaired_nocamera.xml @ 149:3107df74056e draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/differential_count_models commit 344140b8df53b8b7024618bb04594607a045c03a
author iuc
date Mon, 04 May 2015 22:47:36 -0400
parents 474c08e747b6
children
comparison
equal deleted inserted replaced
148:1e20061decdd 149:3107df74056e
5 <requirement type="package" version="3.1.2">R</requirement> 5 <requirement type="package" version="3.1.2">R</requirement>
6 <requirement type="package" version="1.3.18">graphicsmagick</requirement> 6 <requirement type="package" version="1.3.18">graphicsmagick</requirement>
7 <requirement type="package" version="9.10">ghostscript</requirement> 7 <requirement type="package" version="9.10">ghostscript</requirement>
8 <requirement type="package" version="2.14">biocbasics</requirement> 8 <requirement type="package" version="2.14">biocbasics</requirement>
9 </requirements> 9 </requirements>
10 <stdio>
11 <exit_code range="4" level="fatal" description="Number of subject ids must match total number of samples in the input matrix"/>
12 </stdio>
10 <command interpreter="python"> 13 <command interpreter="python">
11 rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "Differential_Counts" 14 rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "Differential_Counts"
12 --output_dir "$html_file.files_path" --output_html "$html_file" --make_HTML "yes" 15 --output_dir "$html_file.files_path" --output_html "$html_file" --make_HTML "yes"
13 </command> 16 </command>
17 <configfiles>
18 <configfile name="runme"><![CDATA[
19 #
20 # edgeR.Rscript
21 # updated feb 2014 adding outlier-robust deviance estimate options by ross for R 3.0.2/bioc 2.13
22 # updated npv 2011 for R 2.14.0 and edgeR 2.4.0 by ross
23 # Performs DGE on a count table containing n replicates of two conditions
24 #
25 # Parameters
26 #
27 # 1 - Output Dir
28
29 # Original edgeR code by: S.Lunke and A.Kaspi
30 reallybig = log10(.Machine\$double.xmax)
31 reallysmall = log10(.Machine\$double.xmin)
32 library("gplots")
33 library("edgeR")
34 library('stringr')
35 hmap2 = function(cmat,nsamp=100,outpdfname='heatmap2.pdf', TName='Treatment',group=NA,myTitle='title goes here')
36 {
37 # Perform clustering for significant pvalues after controlling FWER
38 samples = colnames(cmat)
39 gu = unique(group)
40 gn = rownames(cmat)
41 if (length(gu) == 2) {
42 col.map = function(g) {if (g==gu[1]) "#FF0000" else "#0000FF"}
43 pcols = unlist(lapply(group,col.map))
44 } else {
45 colours = rainbow(length(gu),start=0,end=4/6)
46 pcols = colours[match(group,gu)] }
47 dm = cmat[(! is.na(gn)),]
48 # remove unlabelled hm rows
49 nprobes = nrow(dm)
50 # sub = paste('Showing',nprobes,'contigs ranked for evidence of differential abundance')
51 if (nprobes > nsamp) {
52 dm =dm[1:nsamp,]
53 #sub = paste('Showing',nsamp,'contigs ranked for evidence for differential abundance out of',nprobes,'total')
54 }
55 newcolnames = substr(colnames(dm),1,20)
56 colnames(dm) = newcolnames
57 pdf(outpdfname)
58 heatmap.2(dm,main=myTitle,ColSideColors=pcols,col=topo.colors(100),dendrogram="col",key=T,density.info='none',
59 Rowv=F,scale='row',trace='none',margins=c(8,8),cexRow=0.4,cexCol=0.5)
60 dev.off()
61 }
62
63 hmap = function(cmat,nmeans=4,outpdfname="heatMap.pdf",nsamp=250,TName='Treatment',group=NA,myTitle="Title goes here")
64 {
65 # for 2 groups only was
66 #col.map = function(g) {if (g==TName) "#FF0000" else "#0000FF"}
67 #pcols = unlist(lapply(group,col.map))
68 gu = unique(group)
69 colours = rainbow(length(gu),start=0.3,end=0.6)
70 pcols = colours[match(group,gu)]
71 nrows = nrow(cmat)
72 mtitle = paste(myTitle,'Heatmap: n contigs =',nrows)
73 if (nrows > nsamp) {
74 cmat = cmat[c(1:nsamp),]
75 mtitle = paste('Heatmap: Top ',nsamp,' DE contigs (of ',nrows,')',sep='')
76 }
77 newcolnames = substr(colnames(cmat),1,20)
78 colnames(cmat) = newcolnames
79 pdf(outpdfname)
80 heatmap(cmat,scale='row',main=mtitle,cexRow=0.3,cexCol=0.4,Rowv=NA,ColSideColors=pcols)
81 dev.off()
82 }
83
84 qqPlot = function(descr='qqplot',pvector, outpdf='qqplot.pdf',...)
85 # stolen from https://gist.github.com/703512
86 {
87 o = -log10(sort(pvector,decreasing=F))
88 e = -log10( 1:length(o)/length(o) )
89 o[o==-Inf] = reallysmall
90 o[o==Inf] = reallybig
91 maint = descr
92 pdf(outpdf)
93 plot(e,o,pch=19,cex=1, main=maint, ...,
94 xlab=expression(Expected~~-log[10](italic(p))),
95 ylab=expression(Observed~~-log[10](italic(p))),
96 xlim=c(0,max(e)), ylim=c(0,max(o)))
97 lines(e,e,col="red")
98 grid(col = "lightgray", lty = "dotted")
99 dev.off()
100 }
101
102 smearPlot = function(myDGEList,deTags, outSmear, outMain)
103 {
104 pdf(outSmear)
105 plotSmear(myDGEList,de.tags=deTags,main=outMain)
106 grid(col="lightgray", lty="dotted")
107 dev.off()
108 }
109
110 boxPlot = function(rawrs,cleanrs,maint,myTitle,pdfname)
111 {
112 nc = ncol(rawrs)
113 ##### for (i in c(1:nc)) {rawrs[(rawrs[,i] < 0),i] = NA}
114 fullnames = colnames(rawrs)
115 newcolnames = substr(colnames(rawrs),1,20)
116 colnames(rawrs) = newcolnames
117 newcolnames = substr(colnames(cleanrs),1,20)
118 colnames(cleanrs) = newcolnames
119 defpar = par(no.readonly=T)
120 print.noquote('@@@ Raw contig counts by sample:')
121 print.noquote(summary(rawrs))
122 print.noquote('@@@ Library size contig counts by sample:')
123 print.noquote(summary(cleanrs))
124 pdf(pdfname)
125 par(mfrow=c(1,2))
126 boxplot(rawrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main='log2 raw counts')
127 grid(col="lightgray",lty="dotted")
128 boxplot(cleanrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('log2 counts after ',maint))
129 grid(col="lightgray",lty="dotted")
130 dev.off()
131 pdfname = "sample_counts_histogram.pdf"
132 nc = ncol(rawrs)
133 print.noquote(paste('Using ncol rawrs=',nc))
134 ncroot = round(sqrt(nc))
135 if (ncroot*ncroot < nc) { ncroot = ncroot + 1 }
136 m = c()
137 for (i in c(1:nc)) {
138 rhist = hist(rawrs[,i],breaks=100,plot=F)
139 m = append(m,max(rhist\$counts))
140 }
141 ymax = max(m)
142 ncols = length(fullnames)
143 if (ncols > 20)
144 {
145 scale = 7*ncols/20
146 pdf(pdfname,width=scale,height=scale)
147 } else {
148 pdf(pdfname)
149 }
150 par(mfrow=c(ncroot,ncroot))
151 for (i in c(1:nc)) {
152 hist(rawrs[,i], main=paste("Contig logcount",i), xlab='log raw count', col="maroon",
153 breaks=100,sub=fullnames[i],cex=0.8,ylim=c(0,ymax))
154 }
155 dev.off()
156 par(defpar)
157
158 }
159
160 cumPlot = function(rawrs,cleanrs,maint,myTitle)
161 { # updated to use ecdf
162 pdfname = "Differential_rowsum_bar_charts.pdf"
163 defpar = par(no.readonly=T)
164 lrs = log(rawrs,10)
165 lim = max(lrs)
166 pdf(pdfname)
167 par(mfrow=c(2,1))
168 hist(lrs,breaks=100,main=paste('Before:',maint),xlab="# Reads (log)",
169 ylab="Count",col="maroon",sub=myTitle, xlim=c(0,lim),las=1)
170 grid(col="lightgray", lty="dotted")
171 lrs = log(cleanrs,10)
172 hist(lrs,breaks=100,main=paste('After:',maint),xlab="# Reads (log)",
173 ylab="Count",col="maroon",sub=myTitle,xlim=c(0,lim),las=1)
174 grid(col="lightgray", lty="dotted")
175 dev.off()
176 par(defpar)
177 }
178
179 cumPlot1 = function(rawrs,cleanrs,maint,myTitle)
180 { # updated to use ecdf
181 pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"RowsumCum.pdf",sep='_')
182 pdf(pdfname)
183 par(mfrow=c(2,1))
184 lastx = max(rawrs)
185 rawe = knots(ecdf(rawrs))
186 cleane = knots(ecdf(cleanrs))
187 cy = 1:length(cleane)/length(cleane)
188 ry = 1:length(rawe)/length(rawe)
189 plot(rawe,ry,type='l',main=paste('Before',maint),xlab="Log Contig Total Reads",
190 ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle)
191 grid(col="blue")
192 plot(cleane,cy,type='l',main=paste('After',maint),xlab="Log Contig Total Reads",
193 ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle)
194 grid(col="blue")
195 dev.off()
196 }
197
198
199
200 doGSEAold = function(y=NULL,design=NULL,histgmt="",
201 bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
202 ntest=0, myTitle="myTitle", outfname="GSEA.xls", minnin=5, maxnin=2000,fdrthresh=0.05,fdrtype="BH")
203 {
204 sink('Camera.log')
205 genesets = c()
206 if (bigmt > "")
207 {
208 bigenesets = readLines(bigmt)
209 genesets = bigenesets
210 }
211 if (histgmt > "")
212 {
213 hgenesets = readLines(histgmt)
214 if (bigmt > "") {
215 genesets = rbind(genesets,hgenesets)
216 } else {
217 genesets = hgenesets
218 } # use only history if no bi
219 }
220 print.noquote(paste("@@@read",length(genesets), 'genesets from',histgmt,bigmt))
221 genesets = strsplit(genesets,'\t') # tabular. genesetid\tURLorwhatever\tgene_1\t..\tgene_n
222 outf = outfname
223 head=paste(myTitle,'edgeR GSEA')
224 write(head,file=outfname,append=F)
225 ntest=length(genesets)
226 urownames = toupper(rownames(y))
227 upcam = c()
228 downcam = c()
229 for (i in 1:ntest) {
230 gs = unlist(genesets[i])
231 g = gs[1] # geneset_id
232 u = gs[2]
233 if (u > "") { u = paste("<a href=\'",u,"\'>",u,"</a>",sep="") }
234 glist = gs[3:length(gs)] # member gene symbols
235 glist = toupper(glist)
236 inglist = urownames %in% glist
237 nin = sum(inglist)
238 if ((nin > minnin) && (nin < maxnin)) {
239 ### print(paste('@@found',sum(inglist),'genes in glist'))
240 camres = camera(y=y,index=inglist,design=design)
241 if (! is.null(camres)) {
242 rownames(camres) = g # gene set name
243 camres = cbind(GeneSet=g,URL=u,camres)
244 if (camres\$Direction == "Up")
245 {
246 upcam = rbind(upcam,camres) } else {
247 downcam = rbind(downcam,camres)
248 }
249 }
250 }
251 }
252 uscam = upcam[order(upcam\$PValue),]
253 unadjp = uscam\$PValue
254 uscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
255 nup = max(10,sum((uscam\$adjPValue < fdrthresh)))
256 dscam = downcam[order(downcam\$PValue),]
257 unadjp = dscam\$PValue
258 dscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
259 ndown = max(10,sum((dscam\$adjPValue < fdrthresh)))
260 write.table(uscam,file=paste('camera_up',outfname,sep='_'),quote=F,sep='\t',row.names=F)
261 write.table(dscam,file=paste('camera_down',outfname,sep='_'),quote=F,sep='\t',row.names=F)
262 print.noquote(paste('@@@@@ Camera up top',nup,'gene sets:'))
263 write.table(head(uscam,nup),file="",quote=F,sep='\t',row.names=F)
264 print.noquote(paste('@@@@@ Camera down top',ndown,'gene sets:'))
265 write.table(head(dscam,ndown),file="",quote=F,sep='\t',row.names=F)
266 sink()
267 }
268
269
270
271
272 doGSEA = function(y=NULL,design=NULL,histgmt="",
273 bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
274 ntest=0, myTitle="myTitle", outfname="GSEA.xls", minnin=5, maxnin=2000,fdrthresh=0.05,fdrtype="BH")
275 {
276 sink('Camera.log')
277 genesets = c()
278 if (bigmt > "")
279 {
280 bigenesets = readLines(bigmt)
281 genesets = bigenesets
282 }
283 if (histgmt > "")
284 {
285 hgenesets = readLines(histgmt)
286 if (bigmt > "") {
287 genesets = rbind(genesets,hgenesets)
288 } else {
289 genesets = hgenesets
290 } # use only history if no bi
291 }
292 print.noquote(paste("@@@read",length(genesets), 'genesets from',histgmt,bigmt))
293 genesets = strsplit(genesets,'\t') # tabular. genesetid\tURLorwhatever\tgene_1\t..\tgene_n
294 outf = outfname
295 head=paste(myTitle,'edgeR GSEA')
296 write(head,file=outfname,append=F)
297 ntest=length(genesets)
298 urownames = toupper(rownames(y))
299 upcam = c()
300 downcam = c()
301 incam = c()
302 urls = c()
303 gsids = c()
304 for (i in 1:ntest) {
305 gs = unlist(genesets[i])
306 gsid = gs[1] # geneset_id
307 url = gs[2]
308 if (url > "") { url = paste("<a href=\'",url,"\'>",url,"</a>",sep="") }
309 glist = gs[3:length(gs)] # member gene symbols
310 glist = toupper(glist)
311 inglist = urownames %in% glist
312 nin = sum(inglist)
313 if ((nin > minnin) && (nin < maxnin)) {
314 incam = c(incam,inglist)
315 gsids = c(gsids,gsid)
316 urls = c(urls,url)
317 }
318 }
319 incam = as.list(incam)
320 names(incam) = gsids
321 allcam = camera(y=y,index=incam,design=design)
322 allcamres = cbind(geneset=gsids,allcam,URL=urls)
323 for (i in 1:ntest) {
324 camres = allcamres[i]
325 res = try(test = (camres\$Direction == "Up"))
326 if ("try-error" %in% class(res)) {
327 cat("test failed, camres = :")
328 print.noquote(camres)
329 } else { if (camres\$Direction == "Up")
330 { upcam = rbind(upcam,camres)
331 } else { downcam = rbind(downcam,camres)
332 }
333
334 }
335 }
336 uscam = upcam[order(upcam\$PValue),]
337 unadjp = uscam\$PValue
338 uscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
339 nup = max(10,sum((uscam\$adjPValue < fdrthresh)))
340 dscam = downcam[order(downcam\$PValue),]
341 unadjp = dscam\$PValue
342 dscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
343 ndown = max(10,sum((dscam\$adjPValue < fdrthresh)))
344 write.table(uscam,file=paste('camera_up',outfname,sep='_'),quote=F,sep='\t',row.names=F)
345 write.table(dscam,file=paste('camera_down',outfname,sep='_'),quote=F,sep='\t',row.names=F)
346 print.noquote(paste('@@@@@ Camera up top',nup,'gene sets:'))
347 write.table(head(uscam,nup),file="",quote=F,sep='\t',row.names=F)
348 print.noquote(paste('@@@@@ Camera down top',ndown,'gene sets:'))
349 write.table(head(dscam,ndown),file="",quote=F,sep='\t',row.names=F)
350 sink()
351 }
352
353
354 edgeIt = function (Count_Matrix=c(),group=c(),out_edgeR=F,out_Voom=F,out_DESeq2=F,fdrtype='fdr',priordf=5,
355 fdrthresh=0.05,outputdir='.', myTitle='Differential Counts',libSize=c(),useNDF=F,
356 filterquantile=0.2, subjects=c(),TreatmentName="Rx",ControlName="Ctrl",mydesign=NULL,
357 doDESeq2=T,doVoom=T,doCamera=T,doedgeR=T,org='hg19',
358 histgmt="", bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
359 doCook=F,DESeq_fitType="parameteric",robust_meth='ordinary')
360 {
361
362 logf = file('Differential.log', open = "a")
363 sink(logf,type = c("output", "message"))
364
365
366 run_edgeR = function(workCM,pdata,subjects,group,priordf,robust_meth,mydesign,mt,cmrowsums,out_edgeR,nonzerod)
367 {
368 logf = file('edgeR.log', open = "a")
369 sink(logf,type = c("output", "message"))
370 #### Setup myDGEList object
371 myDGEList = DGEList(counts=workCM, group = group)
372 myDGEList = calcNormFactors(myDGEList)
373 if (robust_meth == 'ordinary') {
374 myDGEList = estimateGLMCommonDisp(myDGEList,mydesign)
375 myDGEList = estimateGLMTrendedDisp(myDGEList,mydesign)
376 if (priordf > 0) { myDGEList = estimateGLMTagwiseDisp(myDGEList,mydesign,prior.df = priordf)
377 } else { myDGEList = estimateGLMTagwiseDisp(myDGEList,mydesign) }
378 comdisp = myDGEList\$common.dispersion
379 estpriorn = getPriorN(myDGEList)
380 print(paste("Common Dispersion =",comdisp,"CV = ",sqrt(comdisp),"getPriorN = ",estpriorn),quote=F)
381 } else {
382 myDGEList = estimateGLMRobustDisp(myDGEList,design=mydesign, prior.df = priordf, maxit = 6, residual.type = robust_meth)
383 }
384
385
386 DGLM = glmFit(myDGEList,design=mydesign)
387 DE = glmLRT(DGLM,coef=ncol(DGLM\$design)) # always last one - subject is first if needed
388 normData = cpm(myDGEList)
389 uoutput = cbind(
390 Name=as.character(rownames(myDGEList\$counts)),
391 DE\$table,
392 adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
393 Dispersion=myDGEList\$tagwise.dispersion,totreads=cmrowsums,normData,
394 myDGEList\$counts
395 )
396 soutput = uoutput[order(DE\$table\$PValue),] # sorted into p value order - for quick toptable
397 goodness = gof(DGLM, pcutoff=fdrthresh)
398 if (sum(goodness\$outlier) > 0) {
399 print.noquote('GLM outliers:')
400 print(paste(rownames(DGLM)[(goodness\$outlier)],collapse=','),quote=F)
401 } else {
402 print('No GLM fit outlier genes found\n')
403 }
404 z = limma::zscoreGamma(goodness\$gof.statistic, shape=goodness\$df/2, scale=2)
405 pdf(paste("edgeR",mt,"GoodnessofFit.pdf",sep='_'))
406 qq = qqnorm(z, panel.first=grid(), main="tagwise dispersion")
407 abline(0,1,lwd=3)
408 points(qq\$x[goodness\$outlier],qq\$y[goodness\$outlier], pch=16, col="maroon")
409 dev.off()
410 uniqueg = unique(group)
411 write.table(soutput,file=out_edgeR, quote=FALSE, sep="\t",row.names=F)
412 tt = cbind(
413 Name=as.character(rownames(myDGEList)),
414 DE\$table,
415 adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
416 Dispersion=myDGEList\$tagwise.dispersion,totreads=cmrowsums
417 )
418 tt = cbind(tt,URL=contigurls) # add to end so table isn't laid out strangely
419 stt = tt[order(DE\$table\$PValue),]
420 print.noquote("@@ edgeR Top tags\n")
421 print.noquote(stt[1:50,])
422 deTags = rownames(uoutput[uoutput\$adj.p.value < fdrthresh,])
423 nsig = length(deTags)
424 print.noquote(paste('@@',nsig,'tags significant at adj p=',fdrthresh))
425 deColours = ifelse(deTags,'red','black')
426 pdf(paste("edgeR",mt,"BCV_vs_abundance.pdf",sep="_"))
427 plotBCV(myDGEList, cex=0.3, main="Biological CV vs abundance")
428 dev.off()
429 dg = myDGEList[order(DE\$table\$PValue),]
430 outpdfname= paste("edgeR",mt,"top_100_heatmap.pdf",sep="_")
431 ocpm = normData[order(DE\$table\$PValue),]
432 ocpm = ocpm[c(1:100),]
433 hmap2(ocpm,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste(myTitle,'Heatmap'))
434 outSmear = paste("edgeR",mt,"smearplot.pdf",sep="_")
435 outMain = paste("Smear Plot for ",TName,' Vs ',CName,' (FDR@',fdrthresh,' N = ',nsig,')',sep='')
436 smearPlot(myDGEList=myDGEList,deTags=deTags, outSmear=outSmear, outMain = outMain)
437 qqPlot(descr=paste(myTitle,'edgeR adj p QQ plot'),pvector=tt\$adj.p.value,outpdf=paste('edgeR',mt,'qqplot.pdf',sep='_'))
438 topresults.edgeR = soutput[which(soutput\$adj.p.value < fdrthresh), ]
439 edgeRcountsindex = which(allgenes %in% rownames(topresults.edgeR))
440 edgeRcounts = rep(0, length(allgenes))
441 edgeRcounts[edgeRcountsindex] = 1 # Create venn diagram of hits
442 sink()
443 return(list(myDGEList=myDGEList,edgeRcounts=edgeRcounts))
444 } ### run_edgeR
445
446
447 run_DESeq2 = function(workCM,pdata,subjects,group,out_DESeq2,mt,DESeq_fitType)
448
449 {
450 logf = file("DESeq2.log", open = "a")
451 sink(logf,type = c("output", "message"))
452 # DESeq2
453 require('DESeq2')
454 library('RColorBrewer')
455 if (length(subjects) == 0)
456 {
457 pdata = data.frame(Name=colnames(workCM),Rx=group,row.names=colnames(workCM))
458 deSEQds = DESeqDataSetFromMatrix(countData = workCM, colData = pdata, design = formula(~ Rx))
459 } else {
460 pdata = data.frame(Name=colnames(workCM),Rx=group,subjects=subjects,row.names=colnames(workCM))
461 deSEQds = DESeqDataSetFromMatrix(countData = workCM, colData = pdata, design = formula(~ subjects + Rx))
462 }
463 deSeqDatsizefac = estimateSizeFactors(deSEQds)
464 deSeqDatdisp = estimateDispersions(deSeqDatsizefac,fitType=DESeq_fitType)
465 resDESeq = nbinomWaldTest(deSeqDatdisp)
466 rDESeq = as.data.frame(results(resDESeq))
467 rDESeq = cbind(Contig=rownames(workCM),rDESeq,NReads=cmrowsums,URL=contigurls)
468 srDESeq = rDESeq[order(rDESeq\$pvalue),]
469 qqPlot(descr=paste(myTitle,'DESeq2 adj p qq plot'),pvector=rDESeq\$padj,outpdf=paste('DESeq2',mt,'qqplot.pdf',sep="_"))
470 cat("# DESeq top 50\n")
471 print.noquote(srDESeq[1:50,])
472 write.table(srDESeq,file=out_DESeq2, quote=FALSE, sep="\t",row.names=F)
473 topresults.DESeq = rDESeq[which(rDESeq\$padj < fdrthresh), ]
474 DESeqcountsindex = which(allgenes %in% rownames(topresults.DESeq))
475 DESeqcounts = rep(0, length(allgenes))
476 DESeqcounts[DESeqcountsindex] = 1
477 pdf(paste("DESeq2",mt,"dispersion_estimates.pdf",sep='_'))
478 plotDispEsts(resDESeq)
479 dev.off()
480 ysmall = abs(min(rDESeq\$log2FoldChange))
481 ybig = abs(max(rDESeq\$log2FoldChange))
482 ylimit = min(4,ysmall,ybig)
483 pdf(paste("DESeq2",mt,"MA_plot.pdf",sep="_"))
484 plotMA(resDESeq,main=paste(myTitle,"DESeq2 MA plot"),ylim=c(-ylimit,ylimit))
485 dev.off()
486 rlogres = rlogTransformation(resDESeq)
487 sampledists = dist( t( assay(rlogres) ) )
488 sdmat = as.matrix(sampledists)
489 pdf(paste("DESeq2",mt,"sample_distance_plot.pdf",sep="_"))
490 heatmap.2(sdmat,trace="none",main=paste(myTitle,"DESeq2 sample distances"),
491 col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(255))
492 dev.off()
493 result = try( (ppca = plotPCA( varianceStabilizingTransformation(deSeqDatdisp,blind=T), intgroup=c("Rx","Name")) ) )
494 if ("try-error" %in% class(result)) {
495 print.noquote('DESeq2 plotPCA failed.')
496 } else {
497 pdf(paste("DESeq2",mt,"PCA_plot.pdf",sep="_"))
498 #### wtf - print? Seems needed to get this to work
499 print(ppca)
500 dev.off()
501 }
502 sink()
503 return(DESeqcounts)
504 }
505
506
507 run_Voom = function(workCM,pdata,subjects,group,mydesign,mt,out_Voom)
508 {
509 logf = file('VOOM.log', open = "a")
510 sink(logf,type = c("output", "message"))
511 if (doedgeR == F) {
512 #### Setup myDGEList object
513 myDGEList = DGEList(counts=workCM, group = group)
514 myDGEList = calcNormFactors(myDGEList)
515 myDGEList = estimateGLMCommonDisp(myDGEList,mydesign)
516 myDGEList = estimateGLMTrendedDisp(myDGEList,mydesign)
517 myDGEList = estimateGLMTagwiseDisp(myDGEList,mydesign)
518 }
519 pdf(paste("VOOM",mt,"mean_variance_plot.pdf",sep='_'))
520 dat.voomed <- voom(myDGEList, mydesign, plot = TRUE, normalize.method="quantil", lib.size = NULL)
521 dev.off()
522 # Use limma to fit data
523 fit = lmFit(dat.voomed, mydesign)
524 fit = eBayes(fit)
525 rvoom = topTable(fit, coef = length(colnames(mydesign)), adj = fdrtype, n = Inf, sort="none")
526 qqPlot(descr=paste(myTitle,'VOOM-limma adj p QQ plot'),pvector=rvoom\$adj.P.Val,outpdf=paste('VOOM',mt,'qqplot.pdf',sep='_'))
527 rownames(rvoom) = rownames(workCM)
528 rvoom = cbind(Contig=rownames(workCM),rvoom,NReads=cmrowsums,URL=contigurls)
529 srvoom = rvoom[order(rvoom\$P.Value),]
530 cat("# VOOM top 50\n")
531 print(srvoom[1:50,])
532 write.table(srvoom,file=out_Voom, quote=FALSE, sep="\t",row.names=F)
533 # Use an FDR cutoff to find interesting samples for edgeR, DESeq and voom/limma
534 topresults.voom = rvoom[which(rvoom\$adj.P.Val < fdrthresh), ]
535 voomcountsindex <- which(allgenes %in% rownames(topresults.voom))
536 voomcounts = rep(0, length(allgenes))
537 voomcounts[voomcountsindex] = 1
538 sink()
539 return(voomcounts)
540 }
541
542
543 #### data cleaning and analsis control starts here
544
545
546 # Error handling
547 nugroup = length(unique(group))
548 if (nugroup!=2){
549 print("Number of conditions identified in experiment does not equal 2")
550 q()
551 }
552 require(edgeR)
553 options(width = 512)
554 mt = paste(unlist(strsplit(myTitle,'_')),collapse=" ")
555 allN = nrow(Count_Matrix)
556 nscut = round(ncol(Count_Matrix)/2) # half samples
557 colTotmillionreads = colSums(Count_Matrix)/1e6
558 counts.dataframe = as.data.frame(c())
559 rawrs = rowSums(Count_Matrix)
560 nonzerod = Count_Matrix[(rawrs > 0),] # remove all zero count genes
561 nzN = nrow(nonzerod)
562 nzrs = rowSums(nonzerod)
563 zN = allN - nzN
564 print('@@@ Quantiles for non-zero row counts:',quote=F)
565 print(quantile(nzrs,probs=seq(0,1,0.1)),quote=F)
566 if (useNDF == T)
567 {
568 gt1rpin3 = rowSums(Count_Matrix/expandAsMatrix(colTotmillionreads,dim(Count_Matrix)) >= 1) >= nscut
569 lo = colSums(Count_Matrix[!gt1rpin3,])
570 workCM = Count_Matrix[gt1rpin3,]
571 cleanrs = rowSums(workCM)
572 cleanN = length(cleanrs)
573 meth = paste( "After removing",length(lo),"contigs with fewer than ",nscut," sample read counts >= 1 per million, there are",sep="")
574 print(paste("Read",allN,"contigs. Removed",zN,"contigs with no reads.",meth,cleanN,"contigs"),quote=F)
575 maint = paste('Filter >=1/million reads in >=',nscut,'samples')
576 } else {
577 useme = (nzrs > quantile(nzrs,filterquantile))
578 workCM = nonzerod[useme,]
579 lo = colSums(nonzerod[!useme,])
580 cleanrs = rowSums(workCM)
581 cleanN = length(cleanrs)
582 meth = paste("After filtering at count quantile =",filterquantile,", there are",sep="")
583 print(paste('Read',allN,"contigs. Removed",zN,"with no reads.",meth,cleanN,"contigs"),quote=F)
584 maint = paste('Filter below',filterquantile,'quantile')
585 }
586 cumPlot(rawrs=rawrs,cleanrs=cleanrs,maint=maint,myTitle=myTitle)
587 allgenes = rownames(workCM)
588 reg = "^chr([0-9]+):([0-9]+)-([0-9]+)" # ucsc chr:start-end regexp
589 genecards="<a href=\'http://www.genecards.org/index.php?path=/Search/keyword/"
590 ucsc = paste("<a href=\'http://genome.ucsc.edu/cgi-bin/hgTracks?db=",org,sep='')
591 testreg = str_match(allgenes,reg)
592 if (sum(!is.na(testreg[,1]))/length(testreg[,1]) > 0.8) # is ucsc style string
593 {
594 print("@@ using ucsc substitution for urls")
595 contigurls = paste0(ucsc,"&amp;position=chr",testreg[,2],":",testreg[,3],"-",testreg[,4],"\'>",allgenes,"</a>")
596 } else {
597 print("@@ using genecards substitution for urls")
598 contigurls = paste0(genecards,allgenes,"\'>",allgenes,"</a>")
599 }
600 print.noquote(paste("@@ Total low count contigs per sample = ",paste(table(lo),collapse=',')))
601 cmrowsums = rowSums(workCM)
602 TName=unique(group)[1]
603 CName=unique(group)[2]
604 if (is.null(mydesign)) {
605 if (length(subjects) == 0)
606 {
607 mydesign = model.matrix(~group)
608 }
609 else {
610 subjf = factor(subjects)
611 mydesign = model.matrix(~subjf+group) # we block on subject so make group last to simplify finding it
612 }
613 }
614 print.noquote(paste('Using samples:',paste(colnames(workCM),collapse=',')))
615 print.noquote('Using design matrix:')
616 print.noquote(mydesign)
617 normData = cpm(workCM)*1e6
618 colnames(normData) = paste( colnames(workCM),'N',sep="_")
619 print(paste('Raw sample read totals',paste(colSums(nonzerod,na.rm=T),collapse=',')))
620
621 if (doedgeR == T) {
622 eres = run_edgeR(workCM,pdata,subjects,group,priordf,robust_meth,mydesign,mt,cmrowsums,out_edgeR,nonzerod)
623 myDGEList = eres\$myDGEList
624 edgeRcounts = eres\$edgeRcounts
625 #### Plot MDS
626 sample_colors = match(group,levels(group))
627 sampleTypes = levels(factor(group))
628 print.noquote(sampleTypes)
629 pdf(paste("edgeR",mt,"MDSplot.pdf",sep='_'))
630 plotMDS.DGEList(myDGEList,main=paste("MDS for",myTitle),cex=0.5,col=sample_colors,pch=sample_colors)
631 legend(x="topleft", legend = sampleTypes,col=c(1:length(sampleTypes)), pch=19)
632 grid(col="blue")
633 dev.off()
634 scale <- myDGEList\$samples\$lib.size*myDGEList\$samples\$norm.factors
635 normCounts <- round(t(t(myDGEList\$counts)/scale)*mean(scale))
636 try({boxPlot(rawrs=nzd,cleanrs=log2(normCounts+1),maint='Effects of TMM size normalisation',myTitle=myTitle,pdfname=paste("edgeR",mt,"raw_norm_counts_box.pdf",sep='_'))},T)
637 }
638 if (doDESeq2 == T) { DESeqcounts = run_DESeq2(workCM,pdata,subjects,group,out_DESeq2,mt,DESeq_fitType) }
639 if (doVoom == T) { voomcounts = run_Voom(workCM,pdata,subjects,group,mydesign,mt,out_Voom) }
640
641
642 if (doCamera) {
643 doGSEA(y=myDGEList,design=mydesign,histgmt=histgmt,bigmt=bigmt,ntest=20,myTitle=myTitle,
644 outfname=paste("GSEA_Camera",mt,"table.xls",sep="_"),fdrthresh=fdrthresh,fdrtype=fdrtype)
645 }
646 counts.dataframe = c()
647 vennmain = 'no venn'
648 if ((doDESeq2==T) || (doVoom==T) || (doedgeR==T)) {
649 if ((doVoom==T) && (doDESeq2==T) && (doedgeR==T)) {
650 vennmain = paste(mt,'Voom,edgeR and DESeq2 overlap at FDR=',fdrthresh)
651 counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts,
652 VOOM_limma = voomcounts, row.names = allgenes)
653 } else if ((doDESeq2==T) && (doedgeR==T)) {
654 vennmain = paste(mt,'DESeq2 and edgeR overlap at FDR=',fdrthresh)
655 counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts, row.names = allgenes)
656 } else if ((doVoom==T) && (doedgeR==T)) {
657 vennmain = paste(mt,'Voom and edgeR overlap at FDR=',fdrthresh)
658 counts.dataframe = data.frame(edgeR = edgeRcounts, VOOM_limma = voomcounts, row.names = allgenes)
659 }
660
661 if (nrow(counts.dataframe > 1)) {
662 counts.venn = vennCounts(counts.dataframe)
663 vennf = paste("Differential_venn",mt,"significant_genes_overlap.pdf",sep="_")
664 pdf(vennf)
665 vennDiagram(counts.venn,main=vennmain,col="maroon")
666 dev.off()
667 }
668 } #### doDESeq2 or doVoom
669 sink()
670 }
671 #### Done
672 ]]>
673 builtin_gmt = ""
674 history_gmt = ""
675 history_gmt_name = ""
676 out_edgeR = F
677 out_DESeq2 = F
678 out_Voom = "$out_VOOM"
679 edgeR_robust_meth = "ordinary"
680 doDESeq2 = $DESeq2.doDESeq2
681 doVoom = $doVoom
682 doCamera = F
683 doedgeR = $edgeR.doedgeR
684 edgeR_priordf = 10
685
686
687 #if $doVoom == "T":
688 out_Voom = "$out_VOOM"
689 #end if
690
691 #if $DESeq2.doDESeq2 == "T":
692 out_DESeq2 = "$out_DESeq2"
693 doDESeq2 = T
694 DESeq_fitType = "$DESeq2.DESeq_fitType"
695 #end if
696
697 #if $edgeR.doedgeR == "T":
698 out_edgeR = "$out_edgeR"
699 edgeR_priordf = $edgeR.edgeR_priordf
700 edgeR_robust_meth = "$edgeR.edgeR_robust_method"
701 #end if
702
703
704 if (sum(c(doedgeR,doVoom,doDESeq2)) == 0)
705 {
706 write("No methods chosen - nothing to do! Please try again after choosing one or more methods", stderr())
707 quit(save="no",status=2)
708 }
709
710 Out_Dir = "$html_file.files_path"
711 Input = "$input1"
712 TreatmentName = "$treatment_name"
713 TreatmentCols = "$Treat_cols"
714 ControlName = "$control_name"
715 ControlCols= "$Control_cols"
716 org = "$input1.dbkey"
717 if (org == "") { org = "hg19"}
718 fdrtype = "$fdrtype"
719 fdrthresh = $fdrthresh
720 useNDF = $useNDF
721 fQ = $fQ # non-differential centile cutoff
722 myTitle = "$title"
723 sids = strsplit("$subjectids",',')
724 subjects = unlist(sids)
725 nsubj = length(subjects)
726 TCols = as.numeric(strsplit(TreatmentCols,",")[[1]])-1
727 CCols = as.numeric(strsplit(ControlCols,",")[[1]])-1
728 cat('Got TCols=')
729 cat(TCols)
730 cat('; CCols=')
731 cat(CCols)
732 cat('\n')
733 <![CDATA[
734 useCols = c(TCols,CCols)
735 if (file.exists(Out_Dir) == F) dir.create(Out_Dir)
736 Count_Matrix = read.table(Input,header=T,row.names=1,sep='\t')
737 snames = colnames(Count_Matrix)
738 nsamples = length(snames)
739 if (nsubj > 0 & nsubj != nsamples) {
740 options("show.error.messages"=T)
741 mess = paste('Fatal error: Supplied subject id list',paste(subjects,collapse=','),
742 'has length',nsubj,'but there are',nsamples,'samples',paste(snames,collapse=','))
743 write(mess, stderr())
744 quit(save="no",status=4)
745 }
746 if (length(subjects) != 0) {subjects = subjects[useCols]}
747 Count_Matrix = Count_Matrix[,useCols] ### reorder columns
748 rn = rownames(Count_Matrix)
749 islib = rn %in% c('librarySize','NotInBedRegions')
750 LibSizes = Count_Matrix[subset(rn,islib),][1] # take first
751 Count_Matrix = Count_Matrix[subset(rn,! islib),]
752 group = c(rep(TreatmentName,length(TCols)), rep(ControlName,length(CCols)) )
753 group = factor(group, levels=c(ControlName,TreatmentName))
754 colnames(Count_Matrix) = paste(group,colnames(Count_Matrix),sep="_")
755 results = edgeIt(Count_Matrix=Count_Matrix,group=group, out_edgeR=out_edgeR, out_Voom=out_Voom, out_DESeq2=out_DESeq2,
756 fdrtype='BH',mydesign=NULL,priordf=edgeR_priordf,fdrthresh=fdrthresh,outputdir='.',
757 myTitle=myTitle,useNDF=F,libSize=c(),filterquantile=fQ,subjects=subjects,TreatmentName=TreatmentName,ControlName=ControlName,
758 doDESeq2=doDESeq2,doVoom=doVoom,doCamera=doCamera,doedgeR=doedgeR,org=org,
759 histgmt=history_gmt,bigmt=builtin_gmt,DESeq_fitType=DESeq_fitType,robust_meth=edgeR_robust_meth)
760 sessionInfo()
761
762 sink()
763 ]]>
764 </configfile>
765 </configfiles>
14 <inputs> 766 <inputs>
15 <param name="input1" type="data" format="tabular" label="Select an input matrix - rows are contigs, columns are counts for each sample" help="Use the HTSeq based count matrix preparation tool to create these matrices from BAM/SAM files and a GTF file of genomic features"/> 767 <param name="input1" type="data" format="tabular" label="Select an input matrix - rows are contigs, columns are counts for each sample" help="Use the HTSeq based count matrix preparation tool to create these matrices from BAM/SAM files and a GTF file of genomic features"/>
16 <param name="title" type="text" value="Differential Counts" size="80" label="Title for job outputs" help="Supply a meaningful name here to remind you what the outputs contain"> 768 <param name="title" type="text" value="Differential Counts" size="80" label="Title for job outputs" help="Supply a meaningful name here to remind you what the outputs contain">
17 <sanitizer invalid_char=""> 769 <sanitizer invalid_char="">
18 <valid initial="string.letters,string.digits"> 770 <valid initial="string.letters,string.digits">
91 <data format="tabular" name="out_VOOM" label="${title}_topTable_VOOM.xls"> 843 <data format="tabular" name="out_VOOM" label="${title}_topTable_VOOM.xls">
92 <filter>doVoom == "T"</filter> 844 <filter>doVoom == "T"</filter>
93 </data> 845 </data>
94 <data format="html" name="html_file" label="${title}.html"/> 846 <data format="html" name="html_file" label="${title}.html"/>
95 </outputs> 847 </outputs>
96 <stdio>
97 <exit_code range="4" level="fatal" description="Number of subject ids must match total number of samples in the input matrix"/>
98 </stdio>
99 <tests> 848 <tests>
100 <test> 849 <test>
101 <param name="input1" value="test_bams2mx.xls" ftype="tabular"/> 850 <param name="input1" value="test_bams2mx.xls" ftype="tabular"/>
102 <param name="treatment_name" value="liver"/> 851 <param name="treatment_name" value="liver"/>
103 <param name="title" value="edgeRtest"/> 852 <param name="title" value="edgeRtest"/>
115 <param name="Treat_cols" value="2,6,7,8"/> 864 <param name="Treat_cols" value="2,6,7,8"/>
116 <output name="out_edgeR" file="edgeRtest1out.xls" compare="diff" lines_diff="20"/> 865 <output name="out_edgeR" file="edgeRtest1out.xls" compare="diff" lines_diff="20"/>
117 <output name="html_file" file="edgeRtest1out.html" compare="diff" lines_diff="20"/> 866 <output name="html_file" file="edgeRtest1out.html" compare="diff" lines_diff="20"/>
118 </test> 867 </test>
119 </tests> 868 </tests>
120 <configfiles>
121 <configfile name="runme"><![CDATA[
122 #
123 # edgeR.Rscript
124 # updated feb 2014 adding outlier-robust deviance estimate options by ross for R 3.0.2/bioc 2.13
125 # updated npv 2011 for R 2.14.0 and edgeR 2.4.0 by ross
126 # Performs DGE on a count table containing n replicates of two conditions
127 #
128 # Parameters
129 #
130 # 1 - Output Dir
131
132 # Original edgeR code by: S.Lunke and A.Kaspi
133 reallybig = log10(.Machine\$double.xmax)
134 reallysmall = log10(.Machine\$double.xmin)
135 library("gplots")
136 library("edgeR")
137 library('stringr')
138 hmap2 = function(cmat,nsamp=100,outpdfname='heatmap2.pdf', TName='Treatment',group=NA,myTitle='title goes here')
139 {
140 # Perform clustering for significant pvalues after controlling FWER
141 samples = colnames(cmat)
142 gu = unique(group)
143 gn = rownames(cmat)
144 if (length(gu) == 2) {
145 col.map = function(g) {if (g==gu[1]) "#FF0000" else "#0000FF"}
146 pcols = unlist(lapply(group,col.map))
147 } else {
148 colours = rainbow(length(gu),start=0,end=4/6)
149 pcols = colours[match(group,gu)] }
150 dm = cmat[(! is.na(gn)),]
151 # remove unlabelled hm rows
152 nprobes = nrow(dm)
153 # sub = paste('Showing',nprobes,'contigs ranked for evidence of differential abundance')
154 if (nprobes > nsamp) {
155 dm =dm[1:nsamp,]
156 #sub = paste('Showing',nsamp,'contigs ranked for evidence for differential abundance out of',nprobes,'total')
157 }
158 newcolnames = substr(colnames(dm),1,20)
159 colnames(dm) = newcolnames
160 pdf(outpdfname)
161 heatmap.2(dm,main=myTitle,ColSideColors=pcols,col=topo.colors(100),dendrogram="col",key=T,density.info='none',
162 Rowv=F,scale='row',trace='none',margins=c(8,8),cexRow=0.4,cexCol=0.5)
163 dev.off()
164 }
165
166 hmap = function(cmat,nmeans=4,outpdfname="heatMap.pdf",nsamp=250,TName='Treatment',group=NA,myTitle="Title goes here")
167 {
168 # for 2 groups only was
169 #col.map = function(g) {if (g==TName) "#FF0000" else "#0000FF"}
170 #pcols = unlist(lapply(group,col.map))
171 gu = unique(group)
172 colours = rainbow(length(gu),start=0.3,end=0.6)
173 pcols = colours[match(group,gu)]
174 nrows = nrow(cmat)
175 mtitle = paste(myTitle,'Heatmap: n contigs =',nrows)
176 if (nrows > nsamp) {
177 cmat = cmat[c(1:nsamp),]
178 mtitle = paste('Heatmap: Top ',nsamp,' DE contigs (of ',nrows,')',sep='')
179 }
180 newcolnames = substr(colnames(cmat),1,20)
181 colnames(cmat) = newcolnames
182 pdf(outpdfname)
183 heatmap(cmat,scale='row',main=mtitle,cexRow=0.3,cexCol=0.4,Rowv=NA,ColSideColors=pcols)
184 dev.off()
185 }
186
187 qqPlot = function(descr='qqplot',pvector, outpdf='qqplot.pdf',...)
188 # stolen from https://gist.github.com/703512
189 {
190 o = -log10(sort(pvector,decreasing=F))
191 e = -log10( 1:length(o)/length(o) )
192 o[o==-Inf] = reallysmall
193 o[o==Inf] = reallybig
194 maint = descr
195 pdf(outpdf)
196 plot(e,o,pch=19,cex=1, main=maint, ...,
197 xlab=expression(Expected~~-log[10](italic(p))),
198 ylab=expression(Observed~~-log[10](italic(p))),
199 xlim=c(0,max(e)), ylim=c(0,max(o)))
200 lines(e,e,col="red")
201 grid(col = "lightgray", lty = "dotted")
202 dev.off()
203 }
204
205 smearPlot = function(myDGEList,deTags, outSmear, outMain)
206 {
207 pdf(outSmear)
208 plotSmear(myDGEList,de.tags=deTags,main=outMain)
209 grid(col="lightgray", lty="dotted")
210 dev.off()
211 }
212
213 boxPlot = function(rawrs,cleanrs,maint,myTitle,pdfname)
214 {
215 nc = ncol(rawrs)
216 ##### for (i in c(1:nc)) {rawrs[(rawrs[,i] < 0),i] = NA}
217 fullnames = colnames(rawrs)
218 newcolnames = substr(colnames(rawrs),1,20)
219 colnames(rawrs) = newcolnames
220 newcolnames = substr(colnames(cleanrs),1,20)
221 colnames(cleanrs) = newcolnames
222 defpar = par(no.readonly=T)
223 print.noquote('@@@ Raw contig counts by sample:')
224 print.noquote(summary(rawrs))
225 print.noquote('@@@ Library size contig counts by sample:')
226 print.noquote(summary(cleanrs))
227 pdf(pdfname)
228 par(mfrow=c(1,2))
229 boxplot(rawrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main='log2 raw counts')
230 grid(col="lightgray",lty="dotted")
231 boxplot(cleanrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('log2 counts after ',maint))
232 grid(col="lightgray",lty="dotted")
233 dev.off()
234 pdfname = "sample_counts_histogram.pdf"
235 nc = ncol(rawrs)
236 print.noquote(paste('Using ncol rawrs=',nc))
237 ncroot = round(sqrt(nc))
238 if (ncroot*ncroot < nc) { ncroot = ncroot + 1 }
239 m = c()
240 for (i in c(1:nc)) {
241 rhist = hist(rawrs[,i],breaks=100,plot=F)
242 m = append(m,max(rhist\$counts))
243 }
244 ymax = max(m)
245 ncols = length(fullnames)
246 if (ncols > 20)
247 {
248 scale = 7*ncols/20
249 pdf(pdfname,width=scale,height=scale)
250 } else {
251 pdf(pdfname)
252 }
253 par(mfrow=c(ncroot,ncroot))
254 for (i in c(1:nc)) {
255 hist(rawrs[,i], main=paste("Contig logcount",i), xlab='log raw count', col="maroon",
256 breaks=100,sub=fullnames[i],cex=0.8,ylim=c(0,ymax))
257 }
258 dev.off()
259 par(defpar)
260
261 }
262
263 cumPlot = function(rawrs,cleanrs,maint,myTitle)
264 { # updated to use ecdf
265 pdfname = "Differential_rowsum_bar_charts.pdf"
266 defpar = par(no.readonly=T)
267 lrs = log(rawrs,10)
268 lim = max(lrs)
269 pdf(pdfname)
270 par(mfrow=c(2,1))
271 hist(lrs,breaks=100,main=paste('Before:',maint),xlab="# Reads (log)",
272 ylab="Count",col="maroon",sub=myTitle, xlim=c(0,lim),las=1)
273 grid(col="lightgray", lty="dotted")
274 lrs = log(cleanrs,10)
275 hist(lrs,breaks=100,main=paste('After:',maint),xlab="# Reads (log)",
276 ylab="Count",col="maroon",sub=myTitle,xlim=c(0,lim),las=1)
277 grid(col="lightgray", lty="dotted")
278 dev.off()
279 par(defpar)
280 }
281
282 cumPlot1 = function(rawrs,cleanrs,maint,myTitle)
283 { # updated to use ecdf
284 pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"RowsumCum.pdf",sep='_')
285 pdf(pdfname)
286 par(mfrow=c(2,1))
287 lastx = max(rawrs)
288 rawe = knots(ecdf(rawrs))
289 cleane = knots(ecdf(cleanrs))
290 cy = 1:length(cleane)/length(cleane)
291 ry = 1:length(rawe)/length(rawe)
292 plot(rawe,ry,type='l',main=paste('Before',maint),xlab="Log Contig Total Reads",
293 ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle)
294 grid(col="blue")
295 plot(cleane,cy,type='l',main=paste('After',maint),xlab="Log Contig Total Reads",
296 ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle)
297 grid(col="blue")
298 dev.off()
299 }
300
301
302
303 doGSEAold = function(y=NULL,design=NULL,histgmt="",
304 bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
305 ntest=0, myTitle="myTitle", outfname="GSEA.xls", minnin=5, maxnin=2000,fdrthresh=0.05,fdrtype="BH")
306 {
307 sink('Camera.log')
308 genesets = c()
309 if (bigmt > "")
310 {
311 bigenesets = readLines(bigmt)
312 genesets = bigenesets
313 }
314 if (histgmt > "")
315 {
316 hgenesets = readLines(histgmt)
317 if (bigmt > "") {
318 genesets = rbind(genesets,hgenesets)
319 } else {
320 genesets = hgenesets
321 } # use only history if no bi
322 }
323 print.noquote(paste("@@@read",length(genesets), 'genesets from',histgmt,bigmt))
324 genesets = strsplit(genesets,'\t') # tabular. genesetid\tURLorwhatever\tgene_1\t..\tgene_n
325 outf = outfname
326 head=paste(myTitle,'edgeR GSEA')
327 write(head,file=outfname,append=F)
328 ntest=length(genesets)
329 urownames = toupper(rownames(y))
330 upcam = c()
331 downcam = c()
332 for (i in 1:ntest) {
333 gs = unlist(genesets[i])
334 g = gs[1] # geneset_id
335 u = gs[2]
336 if (u > "") { u = paste("<a href=\'",u,"\'>",u,"</a>",sep="") }
337 glist = gs[3:length(gs)] # member gene symbols
338 glist = toupper(glist)
339 inglist = urownames %in% glist
340 nin = sum(inglist)
341 if ((nin > minnin) && (nin < maxnin)) {
342 ### print(paste('@@found',sum(inglist),'genes in glist'))
343 camres = camera(y=y,index=inglist,design=design)
344 if (! is.null(camres)) {
345 rownames(camres) = g # gene set name
346 camres = cbind(GeneSet=g,URL=u,camres)
347 if (camres\$Direction == "Up")
348 {
349 upcam = rbind(upcam,camres) } else {
350 downcam = rbind(downcam,camres)
351 }
352 }
353 }
354 }
355 uscam = upcam[order(upcam\$PValue),]
356 unadjp = uscam\$PValue
357 uscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
358 nup = max(10,sum((uscam\$adjPValue < fdrthresh)))
359 dscam = downcam[order(downcam\$PValue),]
360 unadjp = dscam\$PValue
361 dscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
362 ndown = max(10,sum((dscam\$adjPValue < fdrthresh)))
363 write.table(uscam,file=paste('camera_up',outfname,sep='_'),quote=F,sep='\t',row.names=F)
364 write.table(dscam,file=paste('camera_down',outfname,sep='_'),quote=F,sep='\t',row.names=F)
365 print.noquote(paste('@@@@@ Camera up top',nup,'gene sets:'))
366 write.table(head(uscam,nup),file="",quote=F,sep='\t',row.names=F)
367 print.noquote(paste('@@@@@ Camera down top',ndown,'gene sets:'))
368 write.table(head(dscam,ndown),file="",quote=F,sep='\t',row.names=F)
369 sink()
370 }
371
372
373
374
375 doGSEA = function(y=NULL,design=NULL,histgmt="",
376 bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
377 ntest=0, myTitle="myTitle", outfname="GSEA.xls", minnin=5, maxnin=2000,fdrthresh=0.05,fdrtype="BH")
378 {
379 sink('Camera.log')
380 genesets = c()
381 if (bigmt > "")
382 {
383 bigenesets = readLines(bigmt)
384 genesets = bigenesets
385 }
386 if (histgmt > "")
387 {
388 hgenesets = readLines(histgmt)
389 if (bigmt > "") {
390 genesets = rbind(genesets,hgenesets)
391 } else {
392 genesets = hgenesets
393 } # use only history if no bi
394 }
395 print.noquote(paste("@@@read",length(genesets), 'genesets from',histgmt,bigmt))
396 genesets = strsplit(genesets,'\t') # tabular. genesetid\tURLorwhatever\tgene_1\t..\tgene_n
397 outf = outfname
398 head=paste(myTitle,'edgeR GSEA')
399 write(head,file=outfname,append=F)
400 ntest=length(genesets)
401 urownames = toupper(rownames(y))
402 upcam = c()
403 downcam = c()
404 incam = c()
405 urls = c()
406 gsids = c()
407 for (i in 1:ntest) {
408 gs = unlist(genesets[i])
409 gsid = gs[1] # geneset_id
410 url = gs[2]
411 if (url > "") { url = paste("<a href=\'",url,"\'>",url,"</a>",sep="") }
412 glist = gs[3:length(gs)] # member gene symbols
413 glist = toupper(glist)
414 inglist = urownames %in% glist
415 nin = sum(inglist)
416 if ((nin > minnin) && (nin < maxnin)) {
417 incam = c(incam,inglist)
418 gsids = c(gsids,gsid)
419 urls = c(urls,url)
420 }
421 }
422 incam = as.list(incam)
423 names(incam) = gsids
424 allcam = camera(y=y,index=incam,design=design)
425 allcamres = cbind(geneset=gsids,allcam,URL=urls)
426 for (i in 1:ntest) {
427 camres = allcamres[i]
428 res = try(test = (camres\$Direction == "Up"))
429 if ("try-error" %in% class(res)) {
430 cat("test failed, camres = :")
431 print.noquote(camres)
432 } else { if (camres\$Direction == "Up")
433 { upcam = rbind(upcam,camres)
434 } else { downcam = rbind(downcam,camres)
435 }
436
437 }
438 }
439 uscam = upcam[order(upcam\$PValue),]
440 unadjp = uscam\$PValue
441 uscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
442 nup = max(10,sum((uscam\$adjPValue < fdrthresh)))
443 dscam = downcam[order(downcam\$PValue),]
444 unadjp = dscam\$PValue
445 dscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
446 ndown = max(10,sum((dscam\$adjPValue < fdrthresh)))
447 write.table(uscam,file=paste('camera_up',outfname,sep='_'),quote=F,sep='\t',row.names=F)
448 write.table(dscam,file=paste('camera_down',outfname,sep='_'),quote=F,sep='\t',row.names=F)
449 print.noquote(paste('@@@@@ Camera up top',nup,'gene sets:'))
450 write.table(head(uscam,nup),file="",quote=F,sep='\t',row.names=F)
451 print.noquote(paste('@@@@@ Camera down top',ndown,'gene sets:'))
452 write.table(head(dscam,ndown),file="",quote=F,sep='\t',row.names=F)
453 sink()
454 }
455
456
457 edgeIt = function (Count_Matrix=c(),group=c(),out_edgeR=F,out_Voom=F,out_DESeq2=F,fdrtype='fdr',priordf=5,
458 fdrthresh=0.05,outputdir='.', myTitle='Differential Counts',libSize=c(),useNDF=F,
459 filterquantile=0.2, subjects=c(),TreatmentName="Rx",ControlName="Ctrl",mydesign=NULL,
460 doDESeq2=T,doVoom=T,doCamera=T,doedgeR=T,org='hg19',
461 histgmt="", bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
462 doCook=F,DESeq_fitType="parameteric",robust_meth='ordinary')
463 {
464
465 logf = file('Differential.log', open = "a")
466 sink(logf,type = c("output", "message"))
467
468
469 run_edgeR = function(workCM,pdata,subjects,group,priordf,robust_meth,mydesign,mt,cmrowsums,out_edgeR,nonzerod)
470 {
471 logf = file('edgeR.log', open = "a")
472 sink(logf,type = c("output", "message"))
473 #### Setup myDGEList object
474 myDGEList = DGEList(counts=workCM, group = group)
475 myDGEList = calcNormFactors(myDGEList)
476 if (robust_meth == 'ordinary') {
477 myDGEList = estimateGLMCommonDisp(myDGEList,mydesign)
478 myDGEList = estimateGLMTrendedDisp(myDGEList,mydesign)
479 if (priordf > 0) { myDGEList = estimateGLMTagwiseDisp(myDGEList,mydesign,prior.df = priordf)
480 } else { myDGEList = estimateGLMTagwiseDisp(myDGEList,mydesign) }
481 comdisp = myDGEList\$common.dispersion
482 estpriorn = getPriorN(myDGEList)
483 print(paste("Common Dispersion =",comdisp,"CV = ",sqrt(comdisp),"getPriorN = ",estpriorn),quote=F)
484 } else {
485 myDGEList = estimateGLMRobustDisp(myDGEList,design=mydesign, prior.df = priordf, maxit = 6, residual.type = robust_meth)
486 }
487
488
489 DGLM = glmFit(myDGEList,design=mydesign)
490 DE = glmLRT(DGLM,coef=ncol(DGLM\$design)) # always last one - subject is first if needed
491 normData = cpm(myDGEList)
492 uoutput = cbind(
493 Name=as.character(rownames(myDGEList\$counts)),
494 DE\$table,
495 adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
496 Dispersion=myDGEList\$tagwise.dispersion,totreads=cmrowsums,normData,
497 myDGEList\$counts
498 )
499 soutput = uoutput[order(DE\$table\$PValue),] # sorted into p value order - for quick toptable
500 goodness = gof(DGLM, pcutoff=fdrthresh)
501 if (sum(goodness\$outlier) > 0) {
502 print.noquote('GLM outliers:')
503 print(paste(rownames(DGLM)[(goodness\$outlier)],collapse=','),quote=F)
504 } else {
505 print('No GLM fit outlier genes found\n')
506 }
507 z = limma::zscoreGamma(goodness\$gof.statistic, shape=goodness\$df/2, scale=2)
508 pdf(paste("edgeR",mt,"GoodnessofFit.pdf",sep='_'))
509 qq = qqnorm(z, panel.first=grid(), main="tagwise dispersion")
510 abline(0,1,lwd=3)
511 points(qq\$x[goodness\$outlier],qq\$y[goodness\$outlier], pch=16, col="maroon")
512 dev.off()
513 uniqueg = unique(group)
514 write.table(soutput,file=out_edgeR, quote=FALSE, sep="\t",row.names=F)
515 tt = cbind(
516 Name=as.character(rownames(myDGEList)),
517 DE\$table,
518 adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
519 Dispersion=myDGEList\$tagwise.dispersion,totreads=cmrowsums
520 )
521 tt = cbind(tt,URL=contigurls) # add to end so table isn't laid out strangely
522 stt = tt[order(DE\$table\$PValue),]
523 print.noquote("@@ edgeR Top tags\n")
524 print.noquote(stt[1:50,])
525 deTags = rownames(uoutput[uoutput\$adj.p.value < fdrthresh,])
526 nsig = length(deTags)
527 print.noquote(paste('@@',nsig,'tags significant at adj p=',fdrthresh))
528 deColours = ifelse(deTags,'red','black')
529 pdf(paste("edgeR",mt,"BCV_vs_abundance.pdf",sep="_"))
530 plotBCV(myDGEList, cex=0.3, main="Biological CV vs abundance")
531 dev.off()
532 dg = myDGEList[order(DE\$table\$PValue),]
533 outpdfname= paste("edgeR",mt,"top_100_heatmap.pdf",sep="_")
534 ocpm = normData[order(DE\$table\$PValue),]
535 ocpm = ocpm[c(1:100),]
536 hmap2(ocpm,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste(myTitle,'Heatmap'))
537 outSmear = paste("edgeR",mt,"smearplot.pdf",sep="_")
538 outMain = paste("Smear Plot for ",TName,' Vs ',CName,' (FDR@',fdrthresh,' N = ',nsig,')',sep='')
539 smearPlot(myDGEList=myDGEList,deTags=deTags, outSmear=outSmear, outMain = outMain)
540 qqPlot(descr=paste(myTitle,'edgeR adj p QQ plot'),pvector=tt\$adj.p.value,outpdf=paste('edgeR',mt,'qqplot.pdf',sep='_'))
541 topresults.edgeR = soutput[which(soutput\$adj.p.value < fdrthresh), ]
542 edgeRcountsindex = which(allgenes %in% rownames(topresults.edgeR))
543 edgeRcounts = rep(0, length(allgenes))
544 edgeRcounts[edgeRcountsindex] = 1 # Create venn diagram of hits
545 sink()
546 return(list(myDGEList=myDGEList,edgeRcounts=edgeRcounts))
547 } ### run_edgeR
548
549
550 run_DESeq2 = function(workCM,pdata,subjects,group,out_DESeq2,mt,DESeq_fitType)
551
552 {
553 logf = file("DESeq2.log", open = "a")
554 sink(logf,type = c("output", "message"))
555 # DESeq2
556 require('DESeq2')
557 library('RColorBrewer')
558 if (length(subjects) == 0)
559 {
560 pdata = data.frame(Name=colnames(workCM),Rx=group,row.names=colnames(workCM))
561 deSEQds = DESeqDataSetFromMatrix(countData = workCM, colData = pdata, design = formula(~ Rx))
562 } else {
563 pdata = data.frame(Name=colnames(workCM),Rx=group,subjects=subjects,row.names=colnames(workCM))
564 deSEQds = DESeqDataSetFromMatrix(countData = workCM, colData = pdata, design = formula(~ subjects + Rx))
565 }
566 deSeqDatsizefac = estimateSizeFactors(deSEQds)
567 deSeqDatdisp = estimateDispersions(deSeqDatsizefac,fitType=DESeq_fitType)
568 resDESeq = nbinomWaldTest(deSeqDatdisp)
569 rDESeq = as.data.frame(results(resDESeq))
570 rDESeq = cbind(Contig=rownames(workCM),rDESeq,NReads=cmrowsums,URL=contigurls)
571 srDESeq = rDESeq[order(rDESeq\$pvalue),]
572 qqPlot(descr=paste(myTitle,'DESeq2 adj p qq plot'),pvector=rDESeq\$padj,outpdf=paste('DESeq2',mt,'qqplot.pdf',sep="_"))
573 cat("# DESeq top 50\n")
574 print.noquote(srDESeq[1:50,])
575 write.table(srDESeq,file=out_DESeq2, quote=FALSE, sep="\t",row.names=F)
576 topresults.DESeq = rDESeq[which(rDESeq\$padj < fdrthresh), ]
577 DESeqcountsindex = which(allgenes %in% rownames(topresults.DESeq))
578 DESeqcounts = rep(0, length(allgenes))
579 DESeqcounts[DESeqcountsindex] = 1
580 pdf(paste("DESeq2",mt,"dispersion_estimates.pdf",sep='_'))
581 plotDispEsts(resDESeq)
582 dev.off()
583 ysmall = abs(min(rDESeq\$log2FoldChange))
584 ybig = abs(max(rDESeq\$log2FoldChange))
585 ylimit = min(4,ysmall,ybig)
586 pdf(paste("DESeq2",mt,"MA_plot.pdf",sep="_"))
587 plotMA(resDESeq,main=paste(myTitle,"DESeq2 MA plot"),ylim=c(-ylimit,ylimit))
588 dev.off()
589 rlogres = rlogTransformation(resDESeq)
590 sampledists = dist( t( assay(rlogres) ) )
591 sdmat = as.matrix(sampledists)
592 pdf(paste("DESeq2",mt,"sample_distance_plot.pdf",sep="_"))
593 heatmap.2(sdmat,trace="none",main=paste(myTitle,"DESeq2 sample distances"),
594 col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(255))
595 dev.off()
596 result = try( (ppca = plotPCA( varianceStabilizingTransformation(deSeqDatdisp,blind=T), intgroup=c("Rx","Name")) ) )
597 if ("try-error" %in% class(result)) {
598 print.noquote('DESeq2 plotPCA failed.')
599 } else {
600 pdf(paste("DESeq2",mt,"PCA_plot.pdf",sep="_"))
601 #### wtf - print? Seems needed to get this to work
602 print(ppca)
603 dev.off()
604 }
605 sink()
606 return(DESeqcounts)
607 }
608
609
610 run_Voom = function(workCM,pdata,subjects,group,mydesign,mt,out_Voom)
611 {
612 logf = file('VOOM.log', open = "a")
613 sink(logf,type = c("output", "message"))
614 if (doedgeR == F) {
615 #### Setup myDGEList object
616 myDGEList = DGEList(counts=workCM, group = group)
617 myDGEList = calcNormFactors(myDGEList)
618 myDGEList = estimateGLMCommonDisp(myDGEList,mydesign)
619 myDGEList = estimateGLMTrendedDisp(myDGEList,mydesign)
620 myDGEList = estimateGLMTagwiseDisp(myDGEList,mydesign)
621 }
622 pdf(paste("VOOM",mt,"mean_variance_plot.pdf",sep='_'))
623 dat.voomed <- voom(myDGEList, mydesign, plot = TRUE, normalize.method="quantil", lib.size = NULL)
624 dev.off()
625 # Use limma to fit data
626 fit = lmFit(dat.voomed, mydesign)
627 fit = eBayes(fit)
628 rvoom = topTable(fit, coef = length(colnames(mydesign)), adj = fdrtype, n = Inf, sort="none")
629 qqPlot(descr=paste(myTitle,'VOOM-limma adj p QQ plot'),pvector=rvoom\$adj.P.Val,outpdf=paste('VOOM',mt,'qqplot.pdf',sep='_'))
630 rownames(rvoom) = rownames(workCM)
631 rvoom = cbind(Contig=rownames(workCM),rvoom,NReads=cmrowsums,URL=contigurls)
632 srvoom = rvoom[order(rvoom\$P.Value),]
633 cat("# VOOM top 50\n")
634 print(srvoom[1:50,])
635 write.table(srvoom,file=out_Voom, quote=FALSE, sep="\t",row.names=F)
636 # Use an FDR cutoff to find interesting samples for edgeR, DESeq and voom/limma
637 topresults.voom = rvoom[which(rvoom\$adj.P.Val < fdrthresh), ]
638 voomcountsindex <- which(allgenes %in% rownames(topresults.voom))
639 voomcounts = rep(0, length(allgenes))
640 voomcounts[voomcountsindex] = 1
641 sink()
642 return(voomcounts)
643 }
644
645
646 #### data cleaning and analsis control starts here
647
648
649 # Error handling
650 nugroup = length(unique(group))
651 if (nugroup!=2){
652 print("Number of conditions identified in experiment does not equal 2")
653 q()
654 }
655 require(edgeR)
656 options(width = 512)
657 mt = paste(unlist(strsplit(myTitle,'_')),collapse=" ")
658 allN = nrow(Count_Matrix)
659 nscut = round(ncol(Count_Matrix)/2) # half samples
660 colTotmillionreads = colSums(Count_Matrix)/1e6
661 counts.dataframe = as.data.frame(c())
662 rawrs = rowSums(Count_Matrix)
663 nonzerod = Count_Matrix[(rawrs > 0),] # remove all zero count genes
664 nzN = nrow(nonzerod)
665 nzrs = rowSums(nonzerod)
666 zN = allN - nzN
667 print('@@@ Quantiles for non-zero row counts:',quote=F)
668 print(quantile(nzrs,probs=seq(0,1,0.1)),quote=F)
669 if (useNDF == T)
670 {
671 gt1rpin3 = rowSums(Count_Matrix/expandAsMatrix(colTotmillionreads,dim(Count_Matrix)) >= 1) >= nscut
672 lo = colSums(Count_Matrix[!gt1rpin3,])
673 workCM = Count_Matrix[gt1rpin3,]
674 cleanrs = rowSums(workCM)
675 cleanN = length(cleanrs)
676 meth = paste( "After removing",length(lo),"contigs with fewer than ",nscut," sample read counts >= 1 per million, there are",sep="")
677 print(paste("Read",allN,"contigs. Removed",zN,"contigs with no reads.",meth,cleanN,"contigs"),quote=F)
678 maint = paste('Filter >=1/million reads in >=',nscut,'samples')
679 } else {
680 useme = (nzrs > quantile(nzrs,filterquantile))
681 workCM = nonzerod[useme,]
682 lo = colSums(nonzerod[!useme,])
683 cleanrs = rowSums(workCM)
684 cleanN = length(cleanrs)
685 meth = paste("After filtering at count quantile =",filterquantile,", there are",sep="")
686 print(paste('Read',allN,"contigs. Removed",zN,"with no reads.",meth,cleanN,"contigs"),quote=F)
687 maint = paste('Filter below',filterquantile,'quantile')
688 }
689 cumPlot(rawrs=rawrs,cleanrs=cleanrs,maint=maint,myTitle=myTitle)
690 allgenes = rownames(workCM)
691 reg = "^chr([0-9]+):([0-9]+)-([0-9]+)" # ucsc chr:start-end regexp
692 genecards="<a href=\'http://www.genecards.org/index.php?path=/Search/keyword/"
693 ucsc = paste("<a href=\'http://genome.ucsc.edu/cgi-bin/hgTracks?db=",org,sep='')
694 testreg = str_match(allgenes,reg)
695 if (sum(!is.na(testreg[,1]))/length(testreg[,1]) > 0.8) # is ucsc style string
696 {
697 print("@@ using ucsc substitution for urls")
698 contigurls = paste0(ucsc,"&amp;position=chr",testreg[,2],":",testreg[,3],"-",testreg[,4],"\'>",allgenes,"</a>")
699 } else {
700 print("@@ using genecards substitution for urls")
701 contigurls = paste0(genecards,allgenes,"\'>",allgenes,"</a>")
702 }
703 print.noquote(paste("@@ Total low count contigs per sample = ",paste(table(lo),collapse=',')))
704 cmrowsums = rowSums(workCM)
705 TName=unique(group)[1]
706 CName=unique(group)[2]
707 if (is.null(mydesign)) {
708 if (length(subjects) == 0)
709 {
710 mydesign = model.matrix(~group)
711 }
712 else {
713 subjf = factor(subjects)
714 mydesign = model.matrix(~subjf+group) # we block on subject so make group last to simplify finding it
715 }
716 }
717 print.noquote(paste('Using samples:',paste(colnames(workCM),collapse=',')))
718 print.noquote('Using design matrix:')
719 print.noquote(mydesign)
720 normData = cpm(workCM)*1e6
721 colnames(normData) = paste( colnames(workCM),'N',sep="_")
722 print(paste('Raw sample read totals',paste(colSums(nonzerod,na.rm=T),collapse=',')))
723
724 if (doedgeR == T) {
725 eres = run_edgeR(workCM,pdata,subjects,group,priordf,robust_meth,mydesign,mt,cmrowsums,out_edgeR,nonzerod)
726 myDGEList = eres\$myDGEList
727 edgeRcounts = eres\$edgeRcounts
728 #### Plot MDS
729 sample_colors = match(group,levels(group))
730 sampleTypes = levels(factor(group))
731 print.noquote(sampleTypes)
732 pdf(paste("edgeR",mt,"MDSplot.pdf",sep='_'))
733 plotMDS.DGEList(myDGEList,main=paste("MDS for",myTitle),cex=0.5,col=sample_colors,pch=sample_colors)
734 legend(x="topleft", legend = sampleTypes,col=c(1:length(sampleTypes)), pch=19)
735 grid(col="blue")
736 dev.off()
737 scale <- myDGEList\$samples\$lib.size*myDGEList\$samples\$norm.factors
738 normCounts <- round(t(t(myDGEList\$counts)/scale)*mean(scale))
739 try({boxPlot(rawrs=nzd,cleanrs=log2(normCounts+1),maint='Effects of TMM size normalisation',myTitle=myTitle,pdfname=paste("edgeR",mt,"raw_norm_counts_box.pdf",sep='_'))},T)
740 }
741 if (doDESeq2 == T) { DESeqcounts = run_DESeq2(workCM,pdata,subjects,group,out_DESeq2,mt,DESeq_fitType) }
742 if (doVoom == T) { voomcounts = run_Voom(workCM,pdata,subjects,group,mydesign,mt,out_Voom) }
743
744
745 if (doCamera) {
746 doGSEA(y=myDGEList,design=mydesign,histgmt=histgmt,bigmt=bigmt,ntest=20,myTitle=myTitle,
747 outfname=paste("GSEA_Camera",mt,"table.xls",sep="_"),fdrthresh=fdrthresh,fdrtype=fdrtype)
748 }
749 counts.dataframe = c()
750 vennmain = 'no venn'
751 if ((doDESeq2==T) || (doVoom==T) || (doedgeR==T)) {
752 if ((doVoom==T) && (doDESeq2==T) && (doedgeR==T)) {
753 vennmain = paste(mt,'Voom,edgeR and DESeq2 overlap at FDR=',fdrthresh)
754 counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts,
755 VOOM_limma = voomcounts, row.names = allgenes)
756 } else if ((doDESeq2==T) && (doedgeR==T)) {
757 vennmain = paste(mt,'DESeq2 and edgeR overlap at FDR=',fdrthresh)
758 counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts, row.names = allgenes)
759 } else if ((doVoom==T) && (doedgeR==T)) {
760 vennmain = paste(mt,'Voom and edgeR overlap at FDR=',fdrthresh)
761 counts.dataframe = data.frame(edgeR = edgeRcounts, VOOM_limma = voomcounts, row.names = allgenes)
762 }
763
764 if (nrow(counts.dataframe > 1)) {
765 counts.venn = vennCounts(counts.dataframe)
766 vennf = paste("Differential_venn",mt,"significant_genes_overlap.pdf",sep="_")
767 pdf(vennf)
768 vennDiagram(counts.venn,main=vennmain,col="maroon")
769 dev.off()
770 }
771 } #### doDESeq2 or doVoom
772 sink()
773 }
774 #### Done
775 ]]>
776 builtin_gmt = ""
777 history_gmt = ""
778 history_gmt_name = ""
779 out_edgeR = F
780 out_DESeq2 = F
781 out_Voom = "$out_VOOM"
782 edgeR_robust_meth = "ordinary"
783 doDESeq2 = $DESeq2.doDESeq2
784 doVoom = $doVoom
785 doCamera = F
786 doedgeR = $edgeR.doedgeR
787 edgeR_priordf = 10
788
789
790 #if $doVoom == "T":
791 out_Voom = "$out_VOOM"
792 #end if
793
794 #if $DESeq2.doDESeq2 == "T":
795 out_DESeq2 = "$out_DESeq2"
796 doDESeq2 = T
797 DESeq_fitType = "$DESeq2.DESeq_fitType"
798 #end if
799
800 #if $edgeR.doedgeR == "T":
801 out_edgeR = "$out_edgeR"
802 edgeR_priordf = $edgeR.edgeR_priordf
803 edgeR_robust_meth = "$edgeR.edgeR_robust_method"
804 #end if
805
806
807 if (sum(c(doedgeR,doVoom,doDESeq2)) == 0)
808 {
809 write("No methods chosen - nothing to do! Please try again after choosing one or more methods", stderr())
810 quit(save="no",status=2)
811 }
812
813 Out_Dir = "$html_file.files_path"
814 Input = "$input1"
815 TreatmentName = "$treatment_name"
816 TreatmentCols = "$Treat_cols"
817 ControlName = "$control_name"
818 ControlCols= "$Control_cols"
819 org = "$input1.dbkey"
820 if (org == "") { org = "hg19"}
821 fdrtype = "$fdrtype"
822 fdrthresh = $fdrthresh
823 useNDF = $useNDF
824 fQ = $fQ # non-differential centile cutoff
825 myTitle = "$title"
826 sids = strsplit("$subjectids",',')
827 subjects = unlist(sids)
828 nsubj = length(subjects)
829 TCols = as.numeric(strsplit(TreatmentCols,",")[[1]])-1
830 CCols = as.numeric(strsplit(ControlCols,",")[[1]])-1
831 cat('Got TCols=')
832 cat(TCols)
833 cat('; CCols=')
834 cat(CCols)
835 cat('\n')
836 <![CDATA[
837 useCols = c(TCols,CCols)
838 if (file.exists(Out_Dir) == F) dir.create(Out_Dir)
839 Count_Matrix = read.table(Input,header=T,row.names=1,sep='\t')
840 snames = colnames(Count_Matrix)
841 nsamples = length(snames)
842 if (nsubj > 0 & nsubj != nsamples) {
843 options("show.error.messages"=T)
844 mess = paste('Fatal error: Supplied subject id list',paste(subjects,collapse=','),
845 'has length',nsubj,'but there are',nsamples,'samples',paste(snames,collapse=','))
846 write(mess, stderr())
847 quit(save="no",status=4)
848 }
849 if (length(subjects) != 0) {subjects = subjects[useCols]}
850 Count_Matrix = Count_Matrix[,useCols] ### reorder columns
851 rn = rownames(Count_Matrix)
852 islib = rn %in% c('librarySize','NotInBedRegions')
853 LibSizes = Count_Matrix[subset(rn,islib),][1] # take first
854 Count_Matrix = Count_Matrix[subset(rn,! islib),]
855 group = c(rep(TreatmentName,length(TCols)), rep(ControlName,length(CCols)) )
856 group = factor(group, levels=c(ControlName,TreatmentName))
857 colnames(Count_Matrix) = paste(group,colnames(Count_Matrix),sep="_")
858 results = edgeIt(Count_Matrix=Count_Matrix,group=group, out_edgeR=out_edgeR, out_Voom=out_Voom, out_DESeq2=out_DESeq2,
859 fdrtype='BH',mydesign=NULL,priordf=edgeR_priordf,fdrthresh=fdrthresh,outputdir='.',
860 myTitle=myTitle,useNDF=F,libSize=c(),filterquantile=fQ,subjects=subjects,TreatmentName=TreatmentName,ControlName=ControlName,
861 doDESeq2=doDESeq2,doVoom=doVoom,doCamera=doCamera,doedgeR=doedgeR,org=org,
862 histgmt=history_gmt,bigmt=builtin_gmt,DESeq_fitType=DESeq_fitType,robust_meth=edgeR_robust_meth)
863 sessionInfo()
864
865 sink()
866 ]]>
867 </configfile>
868 </configfiles>
869 <help> 869 <help>
870 870
871 **What it does** 871 **What it does**
872 872
873 Allows short read sequence counts from controlled experiments to be analysed for differentially expressed genes. 873 Allows short read sequence counts from controlled experiments to be analysed for differentially expressed genes.
874 Optionally adds a term for subject if not all samples are independent or if some other factor needs to be blocked in the design. 874 Optionally adds a term for subject if not all samples are independent or if some other factor needs to be blocked in the design.
875 875
876 **Input** 876 **Input**
877 877
878 Requires a count matrix as a tabular file. These are best made using the companion HTSeq_ based counter Galaxy wrapper 878 Requires a count matrix as a tabular file. These are best made using the companion HTSeq_ based counter Galaxy wrapper
879 and your fave gene model to generate inputs. Each row is a genomic feature (gene or exon eg) and each column the 879 and your fave gene model to generate inputs. Each row is a genomic feature (gene or exon eg) and each column the
880 non-negative integer count of reads from one sample overlapping the feature. 880 non-negative integer count of reads from one sample overlapping the feature.
881 881
882 The matrix must have a header row uniquely identifying the source samples, and unique row names in 882 The matrix must have a header row uniquely identifying the source samples, and unique row names in
883 the first column. Typically the row names are gene symbols or probe ids for downstream use in GSEA and other methods. 883 the first column. Typically the row names are gene symbols or probe ids for downstream use in GSEA and other methods.
884 They must be unique and R names or they will be mangled - please read the fine R docs for the rules on identifiers. 884 They must be unique and R names or they will be mangled - please read the fine R docs for the rules on identifiers.
885 885
886 **Specifying comparisons** 886 **Specifying comparisons**
887 887
888 This is basically dumbed down for two factors - case vs control. 888 This is basically dumbed down for two factors - case vs control.
889 889
890 More complex interfaces are possible but painful at present. 890 More complex interfaces are possible but painful at present.
891 Probably need to specify a phenotype file to do this better. 891 Probably need to specify a phenotype file to do this better.
892 Work in progress. Send code. 892 Work in progress. Send code.
893 893
894 If you have (eg) paired samples and wish to include a term in the GLM to account for some other factor (subject in the case of paired samples), 894 If you have (eg) paired samples and wish to include a term in the GLM to account for some other factor (subject in the case of paired samples),
895 put a comma separated list of indicators for every sample (whether modelled or not!) indicating (eg) the subject number or 895 put a comma separated list of indicators for every sample (whether modelled or not!) indicating (eg) the subject number or
896 A list of integers, one for each subject or an empty string if samples are all independent. 896 A list of integers, one for each subject or an empty string if samples are all independent.
897 If not empty, there must be exactly as many integers in the supplied integer list as there are columns (samples) in the count matrix. 897 If not empty, there must be exactly as many integers in the supplied integer list as there are columns (samples) in the count matrix.
898 Integers for samples that are not in the analysis *must* be present in the string as filler even if not used. 898 Integers for samples that are not in the analysis *must* be present in the string as filler even if not used.
899 899
900 So if you have 2 pairs out of 6 samples, you need to put in unique integers for the unpaired ones 900 So if you have 2 pairs out of 6 samples, you need to put in unique integers for the unpaired ones
901 eg if you had 6 samples with the first two independent but the second and third pairs each being from independent subjects. you might use 901 eg if you had 6 samples with the first two independent but the second and third pairs each being from independent subjects. you might use
902 8,9,1,1,2,2 902 8,9,1,1,2,2
903 as subject IDs to indicate two paired samples from the same subject in columns 3/4 and 5/6 903 as subject IDs to indicate two paired samples from the same subject in columns 3/4 and 5/6
904 904
905 **Methods available** 905 **Methods available**
906 906
907 You can run 3 popular Bioconductor packages available for count data. 907 You can run 3 popular Bioconductor packages available for count data.
914 914
915 and optionally camera in edgeR which works better if MSigDB is installed. 915 and optionally camera in edgeR which works better if MSigDB is installed.
916 916
917 **Outputs** 917 **Outputs**
918 918
919 Some helpful plots and analysis results. Note that most of these are produced using R code 919 Some helpful plots and analysis results. Note that most of these are produced using R code
920 suggested by the excellent documentation and vignettes for the Bioconductor 920 suggested by the excellent documentation and vignettes for the Bioconductor
921 packages invoked. The Tool Factory is used to automatically lay these out for you to enjoy. 921 packages invoked. The Tool Factory is used to automatically lay these out for you to enjoy.
922 922
923 **Note on Voom** 923 **Note on Voom**
924 924
959 vooma is a similar function but for microarrays instead of RNA-seq. 959 vooma is a similar function but for microarrays instead of RNA-seq.
960 960
961 961
962 ***old rant on changes to Bioconductor package variable names between versions*** 962 ***old rant on changes to Bioconductor package variable names between versions***
963 963
964 The edgeR authors made a small cosmetic change in the name of one important variable (from p.value to PValue) 964 The edgeR authors made a small cosmetic change in the name of one important variable (from p.value to PValue)
965 breaking this and all other code that assumed the old name for this variable, 965 breaking this and all other code that assumed the old name for this variable,
966 between edgeR2.4.4 and 2.4.6 (the version for R 2.14 as at the time of writing). 966 between edgeR2.4.4 and 2.4.6 (the version for R 2.14 as at the time of writing).
967 This means that all code using edgeR is sensitive to the version. I think this was a very unwise thing 967 This means that all code using edgeR is sensitive to the version. I think this was a very unwise thing
968 to do because it wasted hours of my time to track down and will similarly cost other edgeR users dearly 968 to do because it wasted hours of my time to track down and will similarly cost other edgeR users dearly
969 when their old scripts break. This tool currently now works with 2.4.6. 969 when their old scripts break. This tool currently now works with 2.4.6.
970 970
971 **Note on prior.N** 971 **Note on prior.N**
972 972
973 http://seqanswers.com/forums/showthread.php?t=5591 says: 973 http://seqanswers.com/forums/showthread.php?t=5591 says:
974 974
975 *prior.n* 975 *prior.n*
976 976
977 The value for prior.n determines the amount of smoothing of tagwise dispersions towards the common dispersion. 977 The value for prior.n determines the amount of smoothing of tagwise dispersions towards the common dispersion.
978 You can think of it as like a "weight" for the common value. (It is actually the weight for the common likelihood 978 You can think of it as like a "weight" for the common value. (It is actually the weight for the common likelihood
979 in the weighted likelihood equation). The larger the value for prior.n, the more smoothing, i.e. the closer your 979 in the weighted likelihood equation). The larger the value for prior.n, the more smoothing, i.e. the closer your
980 tagwise dispersion estimates will be to the common dispersion. If you use a prior.n of 1, then that gives the 980 tagwise dispersion estimates will be to the common dispersion. If you use a prior.n of 1, then that gives the
981 common likelihood the weight of one observation. 981 common likelihood the weight of one observation.
982 982
983 In answer to your question, it is a good thing to squeeze the tagwise dispersions towards a common value, 983 In answer to your question, it is a good thing to squeeze the tagwise dispersions towards a common value,
984 or else you will be using very unreliable estimates of the dispersion. I would not recommend using the value that 984 or else you will be using very unreliable estimates of the dispersion. I would not recommend using the value that
985 you obtained from estimateSmoothing()---this is far too small and would result in virtually no moderation 985 you obtained from estimateSmoothing()---this is far too small and would result in virtually no moderation
986 (squeezing) of the tagwise dispersions. How many samples do you have in your experiment? 986 (squeezing) of the tagwise dispersions. How many samples do you have in your experiment?
987 What is the experimental design? If you have few samples (less than 6) then I would suggest a prior.n of at least 10. 987 What is the experimental design? If you have few samples (less than 6) then I would suggest a prior.n of at least 10.
988 If you have more samples, then the tagwise dispersion estimates will be more reliable, 988 If you have more samples, then the tagwise dispersion estimates will be more reliable,
989 so you could consider using a smaller prior.n, although I would hesitate to use a prior.n less than 5. 989 so you could consider using a smaller prior.n, although I would hesitate to use a prior.n less than 5.
990 990
991 991
992 From Bioconductor Digest, Vol 118, Issue 5, Gordon writes: 992 From Bioconductor Digest, Vol 118, Issue 5, Gordon writes:
993 993
994 Dear Dorota, 994 Dear Dorota,
1021 1021
1022 ---- 1022 ----
1023 1023
1024 **Attributions** 1024 **Attributions**
1025 1025
1026 edgeR - edgeR_ 1026 edgeR - edgeR_
1027 1027
1028 VOOM/limma - limma_VOOM_ 1028 VOOM/limma - limma_VOOM_
1029 1029
1030 DESeq2 - DESeq2_ for details 1030 DESeq2 - DESeq2_ for details
1031 1031
1032 See above for Bioconductor package documentation for packages exposed in Galaxy by this tool and app store package. 1032 See above for Bioconductor package documentation for packages exposed in Galaxy by this tool and app store package.
1033 1033
1034 Galaxy_ (that's what you are using right now!) for gluing everything together 1034 Galaxy_ (that's what you are using right now!) for gluing everything together
1035 1035
1036 Otherwise, all code and documentation comprising this tool was written by Ross Lazarus and is 1036 Otherwise, all code and documentation comprising this tool was written by Ross Lazarus and is
1037 licensed to you under the LGPL_ like other rgenetics artefacts 1037 licensed to you under the LGPL_ like other rgenetics artefacts
1038 1038
1039 .. _LGPL: http://www.gnu.org/copyleft/lesser.html 1039 .. _LGPL: http://www.gnu.org/copyleft/lesser.html
1040 .. _HTSeq: http://www-huber.embl.de/users/anders/HTSeq/doc/index.html 1040 .. _HTSeq: http://www-huber.embl.de/users/anders/HTSeq/doc/index.html
1041 .. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html 1041 .. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html