4
|
<tool id="rgedgeRpaired" name="edgeR" version="0.18">
  <description>1 or 2 level models for count data</description>
  <requirements>
    <requirement type="package" version="6.2">readline</requirement>
    <requirement type="package" version="3.0.1">package_R</requirement>
    <!-- NOTE(review): package name "npackage_BioCBasics" looks like a possible
         typo for "package_BioCBasics" - confirm against the toolshed repository -->
    <requirement type="package" version="2.12">npackage_BioCBasics</requirement>
  </requirements>
  <!-- Runs the generated R script ($runme, defined in configfiles below) via the
       tool-factory driver; HTML report and tabular top-table are collected. -->
  <command interpreter="python">
  rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "edgeR"
  --output_dir "$html_file.files_path" --output_html "$html_file" --output_tab "$outtab" --make_HTML "yes"
  </command>
13 <inputs>
|
|
14 <param name="input1" type="data" format="tabular" label="Select an input matrix - rows are contigs, columns are counts for each sample"
|
|
15 help="Use the HTSeq based count matrix preparation tool to create these matrices from BAM/SAM files and a GTF file of genomic features"/>
|
|
16 <param name="title" type="text" value="edgeR" size="80" label="Title for job outputs" help="Supply a meaningful name here to remind you what the outputs contain">
|
|
17 <sanitizer invalid_char="">
|
|
18 <valid initial="string.letters,string.digits"><add value="_" /> </valid>
|
|
19 </sanitizer>
|
|
20 </param>
|
|
21 <param name="treatment_name" type="text" value="Treatment" size="50" label="Treatment Name"/>
|
|
22 <param name="Treat_cols" label="Select columns containing treatment." type="data_column" data_ref="input1" numerical="True"
|
|
23 multiple="true" use_header_names="true" size="120" display="checkboxes">
|
|
24 <validator type="no_options" message="Please select at least one column."/>
|
|
25 </param>
|
|
26 <param name="control_name" type="text" value="Control" size="50" label="Control Name"/>
|
|
27 <param name="Control_cols" label="Select columns containing control." type="data_column" data_ref="input1" numerical="True"
|
|
28 multiple="true" use_header_names="true" size="120" display="checkboxes" optional="true">
|
|
29 </param>
|
|
30 <param name="subjectids" type="text" optional="true" size="120"
|
|
31 label="IF SUBJECTS NOT ALL INDEPENDENT! Enter integers to indicate sample pairing for every column in input"
|
|
32 help="Leave blank if no pairing, but eg if data from sample id A99 is in columns 2,4 and id C21 is in 3,5 then enter '1,2,1,2'">
|
|
33 <sanitizer>
|
|
34 <valid initial="string.digits"><add value="," /> </valid>
|
|
35 </sanitizer>
|
|
36 </param>
|
|
37 <param name="fQ" type="float" value="0.3" size="5" label="Non-differential contig count quantile threshold - zero to analyze all non-zero read count contigs"
|
|
38 help="May be a good or a bad idea depending on the biology and the question. EG 0.3 = sparsest 30% of contigs with at least one read are removed before analysis"/>
|
|
39 <param name="useNDF" type="boolean" truevalue="T" checked='false' falsevalue="" size="1" label="Non differential filter - remove contigs below a threshold (1 per million) for half or more samples"
|
|
40 help="May be a good or a bad idea depending on the biology and the question. This was the old default. Quantile based is available as an alternative"/>
|
|
41 <param name="priordf" type="integer" value="20" size="3" label="prior.df for tagwise dispersion - lower value = more emphasis on each tag's variance. Replaces prior.n and prior.df = prior.n * residual.df"
|
|
42 help="Zero = Use edgeR default. Use a small value to 'smooth' small samples. See edgeR docs and note below"/>
|
|
43 <param name="fdrthresh" type="float" value="0.05" size="5" label="P value threshold for FDR filtering for amily wise error rate control"
|
|
44 help="Conventional default value of 0.05 recommended"/>
|
|
45 <param name="fdrtype" type="select" label="FDR (Type II error) control method"
|
|
46 help="Use fdr or bh typically to control for the number of tests in a reliable way">
|
|
47 <option value="fdr" selected="true">fdr</option>
|
|
48 <option value="BH">Benjamini Hochberg</option>
|
|
49 <option value="BY">Benjamini Yukateli</option>
|
|
50 <option value="bonferroni">Bonferroni</option>
|
|
51 <option value="hochberg">Hochberg</option>
|
|
52 <option value="holm">Holm</option>
|
|
53 <option value="hommel">Hommel</option>
|
|
54 <option value="none">no control for multiple tests</option>
|
|
55 </param>
|
|
56 </inputs>
|
|
57 <outputs>
|
|
58 <data format="tabular" name="outtab" label="${title}.xls"/>
|
|
59 <data format="html" name="html_file" label="${title}.html"/>
|
|
60 </outputs>
|
|
61 <stdio>
|
|
62 <exit_code range="4" level="fatal" description="Number of subject ids must match total number of samples in the input matrix" />
|
|
63 </stdio>
|
|
64 <tests>
|
|
65 <test>
|
|
66 <param name='input1' value='test_bams2mx.xls' ftype='tabular' />
|
|
67 <param name='treatment_name' value='case' />
|
|
68 <param name='title' value='edgeRtest' />
|
|
69 <param name='fdrtype' value='fdr' />
|
|
70 <param name='priordf' value="0" />
|
|
71 <param name='fdrthresh' value="0.05" />
|
|
72 <param name='control_name' value='control' />
|
|
73 <param name='Treat_cols' value='3,4,5,9' />
|
|
74 <param name='Control_cols' value='2,6,7,8' />
|
|
75 <output name='outtab' file='edgeRtest1out.xls' ftype='tabular' compare='diff' />
|
|
76 <output name='html_file' file='edgeRtest1out.html' ftype='html' compare='diff' lines_diff='20' />
|
|
77 </test>
|
|
78 </tests>
|
|
79
|
|
80 <configfiles>
|
|
81 <configfile name="runme">
|
|
82 <![CDATA[
|
|
#
# edgeR.Rscript
# updated npv 2011 for R 2.14.0 and edgeR 2.4.0 by ross
# Performs DGE on a count table containing n replicates of two conditions
#
# Parameters
#
# 1 - Output Dir

# Original edgeR code by: S.Lunke and A.Kaspi
# Clamp values for -log10(p) transforms so 0/1 p-values do not become +/-Inf
reallybig = log10(.Machine\$double.xmax)
reallysmall = log10(.Machine\$double.xmin)
library('stringr')
library('gplots')
library('DESeq')
library('edgeR')
hmap2 = function(cmat,nsamp=100,outpdfname='heatmap2.pdf', TName='Treatment',group=NA,myTitle='title goes here')
{
  # Clustered heatmap (gplots::heatmap.2) of up to nsamp rows of cmat,
  # with a column side-bar coloured by group membership. Writes outpdfname.
  # Rows are assumed already ordered by evidence (top rows are kept).
  samples = colnames(cmat)
  gu = unique(group)
  if (length(gu) == 2) {
    # two groups: fixed red/blue colouring
    col.map = function(g) {if (g==gu[1]) "#FF0000" else "#0000FF"}
    pcols = unlist(lapply(group,col.map))
  } else {
    # >2 groups: spread colours over the rainbow
    colours = rainbow(length(gu),start=0,end=4/6)
    pcols = colours[match(group,gu)] }
  gn = rownames(cmat)
  # remove unlabelled heatmap rows
  dm = cmat[(! is.na(gn)),]
  nprobes = nrow(dm)
  # sub = paste('Showing',nprobes,'contigs ranked for evidence of differential abundance')
  if (nprobes > nsamp) {
    dm = dm[1:nsamp,]
    #sub = paste('Showing',nsamp,'contigs ranked for evidence for differential abundance out of',nprobes,'total')
  }
  # truncate long sample names so column labels stay legible
  newcolnames = substr(colnames(dm),1,20)
  colnames(dm) = newcolnames
  pdf(outpdfname)
  heatmap.2(dm,main=myTitle,ColSideColors=pcols,col=topo.colors(100),dendrogram="col",key=T,density.info='none',
       Rowv=F,scale='row',trace='none',margins=c(8,8),cexRow=0.4,cexCol=0.5)
  dev.off()
}
|
hmap = function(cmat,nmeans=4,outpdfname="heatMap.pdf",nsamp=250,TName='Treatment',group=NA,myTitle="Title goes here")
{
  # Simple base-graphics heatmap of up to nsamp rows of cmat (rows assumed
  # pre-ranked), group-coloured column side bar. Writes outpdfname.
  # nmeans is currently unused - kept for interface compatibility.
  # for 2 groups only was
  #col.map = function(g) {if (g==TName) "#FF0000" else "#0000FF"}
  #pcols = unlist(lapply(group,col.map))
  gu = unique(group)
  colours = rainbow(length(gu),start=0.3,end=0.6)
  pcols = colours[match(group,gu)]
  nrows = nrow(cmat)
  mtitle = paste(myTitle,'Heatmap: n contigs =',nrows)
  if (nrows > nsamp) {
    cmat = cmat[c(1:nsamp),]
    mtitle = paste('Heatmap: Top ',nsamp,' DE contigs (of ',nrows,')',sep='')
  }
  # truncate long sample names so column labels stay legible
  newcolnames = substr(colnames(cmat),1,20)
  colnames(cmat) = newcolnames
  pdf(outpdfname)
  heatmap(cmat,scale='row',main=mtitle,cexRow=0.3,cexCol=0.4,Rowv=NA,ColSideColors=pcols)
  dev.off()
}
qqPlot = function(descr='Title',pvector, ...)
# QQ plot of -log10(p) observed vs expected under the uniform null.
# Writes '<descr>_pval_qq.pdf' (spaces stripped from descr).
# stolen from https://gist.github.com/703512
{
  o = -log10(sort(pvector,decreasing=F))
  e = -log10( 1:length(o)/length(o) )
  # clamp p==1 / p==0 transforms (file-level globals reallysmall/reallybig)
  o[o==-Inf] = reallysmall
  o[o==Inf] = reallybig
  pdfname = paste(gsub(" ","", descr , fixed=TRUE),'pval_qq.pdf',sep='_')
  maint = paste(descr,'QQ Plot')
  pdf(pdfname)
  plot(e,o,pch=19,cex=1, main=maint, ...,
       xlab=expression(Expected~~-log[10](italic(p))),
       ylab=expression(Observed~~-log[10](italic(p))),
       xlim=c(0,max(e)), ylim=c(0,max(o)))
  # identity line: points above it are more significant than expected
  lines(e,e,col="red")
  grid(col = "lightgray", lty = "dotted")
  dev.off()
}
smearPlot = function(DGEList,deTags, outSmear, outMain)
{
  # edgeR smear (MA) plot with differentially-expressed tags highlighted.
  # Writes the plot to the PDF path outSmear with title outMain.
  pdf(outSmear)
  plotSmear(DGEList,de.tags=deTags,main=outMain)
  grid(col="blue")
  dev.off()
}
boxPlot = function(rawrs,cleanrs,maint,myTitle)
{ # Side-by-side boxplots of raw vs normalised (log) counts per sample,
  # plus a grid of per-sample histograms of the raw values.
  # Writes '<myTitle>_sampleBoxplot.pdf' and '<myTitle>_samplehistplot.pdf'.
  nc = ncol(rawrs)
  # negative values (log of zero-ish counts) are blanked before plotting
  for (i in c(1:nc)) {rawrs[(rawrs[,i] < 0),i] = NA}
  fullnames = colnames(rawrs)
  newcolnames = substr(colnames(rawrs),1,20)
  colnames(rawrs) = newcolnames
  newcolnames = substr(colnames(cleanrs),1,20)
  colnames(cleanrs) = newcolnames
  pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"sampleBoxplot.pdf",sep='_')
  defpar = par(no.readonly=T)
  pdf(pdfname,height=6,width=8)
  #par(mfrow=c(1,2)) # 1 rows 2 col
  l = layout(matrix(c(1,2),1,2,byrow=T))
  # summaries go to the captured log, not the pdf
  print.noquote('raw contig counts by sample:')
  print.noquote(summary(rawrs))
  print.noquote('normalised contig counts by sample:')
  print.noquote(summary(cleanrs))
  boxplot(rawrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('Raw:',maint))
  grid(col="blue")
  boxplot(cleanrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('After ',maint))
  grid(col="blue")
  dev.off()
  pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"samplehistplot.pdf",sep='_')
  nc = ncol(rawrs)
  print.noquote(paste('Using ncol rawrs=',nc))
  # lay histograms out in a near-square grid
  ncroot = round(sqrt(nc))
  if (ncroot*ncroot < nc) { ncroot = ncroot + 1 }
  # first pass: find a common y-axis maximum across all samples
  m = c()
  for (i in c(1:nc)) {
    rhist = hist(rawrs[,i],breaks=100,plot=F)
    m = append(m,max(rhist\$counts))
  }
  ymax = max(m)
  pdf(pdfname)
  par(mfrow=c(ncroot,ncroot))
  for (i in c(1:nc)) {
    hist(rawrs[,i], main=paste("Contig logcount",i), xlab='log raw count', col="maroon",
         breaks=100,sub=fullnames[i],cex=0.8,ylim=c(0,ymax))
  }
  dev.off()
  par(defpar)
}
cumPlot = function(rawrs,cleanrs,maint,myTitle)
{ # Before/after histograms of log10 row-sum read counts, sharing one x range.
  # Writes '<myTitle>_RowsumCum.pdf'. (Name kept for history; see cumPlot1
  # for an ecdf-based alternative.)
  pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"RowsumCum.pdf",sep='_')
  defpar = par(no.readonly=T)
  pdf(pdfname)
  par(mfrow=c(2,1))
  lrs = log(rawrs,10)
  lim = max(lrs)
  hist(lrs,breaks=100,main=paste('Before:',maint),xlab="# Reads (log)",
       ylab="Count",col="maroon",sub=myTitle, xlim=c(0,lim),las=1)
  grid(col="blue")
  lrs = log(cleanrs,10)
  hist(lrs,breaks=100,main=paste('After:',maint),xlab="# Reads (log)",
       ylab="Count",col="maroon",sub=myTitle,xlim=c(0,lim),las=1)
  grid(col="blue")
  dev.off()
  par(defpar)
}
cumPlot1 = function(rawrs,cleanrs,maint,myTitle)
{ # ecdf-based before/after cumulative plots of contig total reads.
  # NOTE(review): writes the same pdf name as cumPlot; appears to be an
  # unused alternative implementation - confirm before deleting.
  pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"RowsumCum.pdf",sep='_')
  pdf(pdfname)
  par(mfrow=c(2,1))
  lastx = max(rawrs)
  rawe = knots(ecdf(rawrs))
  cleane = knots(ecdf(cleanrs))
  cy = 1:length(cleane)/length(cleane)
  ry = 1:length(rawe)/length(rawe)
  plot(rawe,ry,type='l',main=paste('Before',maint),xlab="Log Contig Total Reads",
       ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle)
  grid(col="blue")
  plot(cleane,cy,type='l',main=paste('After',maint),xlab="Log Contig Total Reads",
       ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle)
  grid(col="blue")
  dev.off()
}
edgeIt = function (Count_Matrix,group,outputfilename,fdrtype='fdr',priordf=5,fdrthresh=0.05,outputdir='.',
 myTitle='edgeR',libSize=c(),useNDF="T",filterquantile=0.2,subjects=c()) {
# Main analysis: filters the count matrix, fits an edgeR GLM (optionally
# blocked on subject), runs parallel DESeq and voom/limma analyses, writes
# the ranked top table to outputfilename and a collection of diagnostic pdfs.
# Returns the unsorted annotated result table.
# Error handling: this tool only supports exactly two conditions
if (length(unique(group))!=2){
 print("Number of conditions identified in experiment does not equal 2")
 q()
 }
require(edgeR)
mt = paste(unlist(strsplit(myTitle,'_')),collapse=" ")
allN = nrow(Count_Matrix)
nscut = round(ncol(Count_Matrix)/2)
colTotmillionreads = colSums(Count_Matrix)/1e6
rawrs = rowSums(Count_Matrix)
nonzerod = Count_Matrix[(rawrs > 0),] # remove all zero count genes
nzN = nrow(nonzerod)
nzrs = rowSums(nonzerod)
zN = allN - nzN
print('# Quantiles for non-zero row counts:',quote=F)
print(quantile(nzrs,probs=seq(0,1,0.1)),quote=F)
if (useNDF == "T")
{
 # old-style non-differential filter: keep contigs with >=1 read/million
 # in at least half the samples
 gt1rpin3 = rowSums(Count_Matrix/expandAsMatrix(colTotmillionreads,dim(Count_Matrix)) >= 1) >= nscut
 lo = colSums(Count_Matrix[!gt1rpin3,])
 workCM = Count_Matrix[gt1rpin3,]
 cleanrs = rowSums(workCM)
 cleanN = length(cleanrs)
 meth = paste( "After removing",length(lo),"contigs with fewer than ",nscut," sample read counts >= 1 per million, there are",sep="")
 print(paste("Read",allN,"contigs. Removed",zN,"contigs with no reads.",meth,cleanN,"contigs"),quote=F)
 maint = paste('Filter >=1/million reads in >=',nscut,'samples')
}
else {
 # quantile filter: drop the sparsest filterquantile fraction of non-zero rows
 useme = (nzrs > quantile(nzrs,filterquantile))
 workCM = nonzerod[useme,]
 lo = colSums(nonzerod[!useme,])
 cleanrs = rowSums(workCM)
 cleanN = length(cleanrs)
 meth = paste("After filtering at count quantile =",filterquantile,", there are",sep="")
 print(paste('Read',allN,"contigs. Removed",zN,"with no reads.",meth,cleanN,"contigs"),quote=F)
 maint = paste('Filter below',filterquantile,'quantile')
}
cumPlot(rawrs=rawrs,cleanrs=cleanrs,maint=maint,myTitle=myTitle)
allgenes <- rownames(workCM)
print(paste("# Total low count contigs per sample = ",paste(lo,collapse=',')),quote=F)
rsums = rowSums(workCM)
TName=unique(group)[1]
CName=unique(group)[2]
# Setup DGEList object (shadows the edgeR constructor name - kept for history)
DGEList = DGEList(counts=workCM, group = group)
if (length(subjects) == 0)
{
 # independent samples: simple two-group design; DESeq comparison is valid
 doDESEQ = T
 mydesign = model.matrix(~group)
}
else {
 # paired/blocked: subject enters the design; group kept last so the
 # coefficient of interest is always the final column
 doDESEQ = F
 subjf = factor(subjects)
 mydesign = model.matrix(~subjf+group) # we block on subject so make group last to simplify finding it
}
print.noquote(paste('Using samples:',paste(colnames(workCM),collapse=',')))
print.noquote('Using design matrix:')
print.noquote(mydesign)
# NOTE(review): calcNormFactors is only called further below (before voom);
# these dispersion estimates therefore use default norm factors of 1.
# Reordering would change the regression-test outputs, so left as-is.
DGEList = estimateGLMCommonDisp(DGEList,mydesign)
comdisp = DGEList\$common.dispersion
DGEList = estimateGLMTrendedDisp(DGEList,mydesign)
if (priordf > 0) {
 print.noquote(paste("prior.df =",priordf))
 DGEList = estimateGLMTagwiseDisp(DGEList,mydesign,prior.df = priordf)
} else {
 # priordf == 0 means "use the edgeR default"
 DGEList = estimateGLMTagwiseDisp(DGEList,mydesign)
}
DGLM = glmFit(DGEList,design=mydesign)
efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors
normData = (1e+06*DGEList\$counts/efflib)
co = length(colnames(mydesign))
DE = glmLRT(DGLM,coef=co) # always last one - subject is first if needed
uoutput = cbind(
 Name=as.character(rownames(DGEList\$counts)),
 DE\$table,
 adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
 Dispersion=DGEList\$tagwise.dispersion,totreads=rsums,normData,
 DGEList\$counts
)
soutput = uoutput[order(DE\$table\$PValue),] # sorted into p value order - for quick toptable
# goodness-of-fit diagnostics; outliers plotted on a normal QQ plot
goodness = gof(DGLM, pcutoff=fdrthresh)
if (sum(goodness\$outlier) > 0) {
 print.noquote('GLM outliers:')
 print(paste(rownames(DGLM)[(goodness\$outlier != 0)],collapse=','),quote=F)
 z = limma::zscoreGamma(goodness\$gof.statistic, shape=goodness\$df/2, scale=2)
 pdf(paste(mt,"GoodnessofFit.pdf",sep='_'))
 qq = qqnorm(z, panel.first=grid(), main="tagwise dispersion")
 abline(0,1,lwd=3)
 points(qq\$x[goodness\$outlier],qq\$y[goodness\$outlier], pch=16, col="dodgerblue")
 dev.off()
} else { print('No GLM fit outlier genes found\n')}
estpriorn = getPriorN(DGEList)
print(paste("Common Dispersion =",comdisp,"CV = ",sqrt(comdisp),"getPriorN = ",estpriorn),quote=F)
efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors
normData = (1e+06*DGEList\$counts/efflib)
uniqueg = unique(group)
# Plot MDS
sample_colors = match(group,levels(group))
pdf(paste(mt,"MDSplot.pdf",sep='_'))
sampleTypes = levels(group)
plotMDS.DGEList(DGEList,main=paste("MDS Plot for",myTitle),cex=0.5,col=sample_colors,pch=sample_colors)
legend(x="topleft", legend = sampleTypes,col=c(1:length(sampleTypes)), pch=19)
grid(col="blue")
dev.off()
colnames(normData) = paste( colnames(normData),'N',sep="_")
print(paste('Raw sample read totals',paste(colSums(nonzerod,na.rm=T),collapse=',')))
nzd = data.frame(log(nonzerod + 1e-2,10))
boxPlot(rawrs=nzd,cleanrs=log(normData,10),maint='TMM Normalisation',myTitle=myTitle)
if (doDESEQ)
{
 # DESeq comparison run (only for the unpaired design)
 deSeqDatcount <- newCountDataSet(workCM, group)
 deSeqDatsizefac <- estimateSizeFactors(deSeqDatcount)
 deSeqDatdisp <- estimateDispersions(deSeqDatsizefac)
 rDESeq <- nbinomTest(deSeqDatdisp, levels(group)[1], levels(group)[2])
 rDESeq <- rDESeq[order(rDESeq\$pval), ]
 write.table(rDESeq,paste(mt,'DESeq_TopTable.xls',sep='_'), quote=FALSE, sep="\t",row.names=F)
 topresults.DESeq <- rDESeq[which(rDESeq\$padj < fdrthresh), ]
 DESeqcountsindex <- which(allgenes %in% topresults.DESeq\$id)
 DESeqcounts <- rep(0, length(allgenes))
 DESeqcounts[DESeqcountsindex] <- 1
}
DGEList = calcNormFactors(DGEList)
norm.factor = DGEList\$samples\$norm.factors
pdf(paste(mt,"voomplot.pdf",sep='_'))
dat.voomed <- voom(DGEList, mydesign, plot = TRUE, lib.size = colSums(workCM) * norm.factor)
dev.off()
# Use limma to fit data
fit <- lmFit(dat.voomed, mydesign)
fit <- eBayes(fit)
rvoom <- topTable(fit, coef = length(colnames(mydesign)), adj = "BH", n = Inf)
write.table(rvoom,paste(mt,'VOOM_topTable.xls',sep='_'), quote=FALSE, sep="\t",row.names=F)
# Use an FDR cutoff to find interesting samples for edgeR, DESeq and voom/limma
topresults.voom <- rvoom[which(rvoom\$adj.P.Val < fdrthresh), ]
topresults.edgeR <- soutput[which(soutput\$adj.p.value < fdrthresh), ]
# Create venn diagram of hits
edgeRcountsindex <- which(allgenes %in% rownames(topresults.edgeR))
voomcountsindex <- which(allgenes %in% topresults.voom\$ID)
edgeRcounts <- rep(0, length(allgenes))
edgeRcounts[edgeRcountsindex] <- 1
voomcounts <- rep(0, length(allgenes))
voomcounts[voomcountsindex] <- 1
if (doDESEQ) {
 vennmain = paste(mt,'Voom,edgeR and DESeq overlap at FDR=',fdrthresh)
 counts.dataframe <- data.frame(edgeRcounts = edgeRcounts, DESeqcounts = DESeqcounts,
      voomcounts = voomcounts, row.names = allgenes)
} else {
 vennmain = paste(mt,'Voom and edgeR overlap at FDR=',fdrthresh)
 counts.dataframe <- data.frame(edgeRcounts = edgeRcounts, voomcounts = voomcounts, row.names = allgenes)
}
counts.venn <- vennCounts(counts.dataframe)
vennf = paste(mt,'venn.pdf',sep='_')
pdf(vennf)
vennDiagram(counts.venn,main=vennmain,col="maroon")
dev.off()
#Prepare our output file
nreads = soutput\$totreads # ordered same way
print('# writing output',quote=F)
write.table(soutput,outputfilename, quote=FALSE, sep="\t",row.names=F)
rn = uoutput\$Name
# contig names of the form chrN:start-end get UCSC browser links,
# anything else gets a GeneCards keyword-search link
# NOTE(review): regex only matches numeric chromosomes (not chrX/chrY)
reg = "^chr([0-9]+):([0-9]+)-([0-9]+)"
org="hg19"
genecards="<a href='http://www.genecards.org/index.php?path=/Search/keyword/"
ucsc = paste("<a href='http://genome.ucsc.edu/cgi-bin/hgTracks?db=",org,sep='')
testreg = str_match(rn,reg)
nreads = uoutput\$totreads # ordered same way
if (sum(!is.na(testreg[,1]))/length(testreg[,1]) > 0.8) # is ucsc style string
{
 urls = paste(ucsc,'&position=chr',testreg[,2],':',testreg[,3],"-",testreg[,4],"'>",rn,'</a>',sep='')
} else {
 # was "'></a>" which closed the anchor before the link text, yielding an
 # empty link followed by a stray </a>
 urls = paste(genecards,rn,"'>",rn,'</a>',sep="")
}
print.noquote('# urls')
cat(head(urls))
tt = uoutput
cat("# Top tags\n")
tt = cbind(tt,ntotreads=nreads,URL=urls) # add to end so table isn't laid out strangely
tt = tt[order(DE\$table\$PValue),]
options(width = 500)
print.noquote(tt[1:50,])
pdf(paste(mt,"BCV_vs_abundance.pdf",sep='_'))
plotBCV(DGEList, cex=0.3, main="Biological CV vs abundance")
dev.off()
# Plot MAplot
deTags = rownames(uoutput[uoutput\$adj.p.value < fdrthresh,])
nsig = length(deTags)
print(paste('#',nsig,'tags significant at adj p=',fdrthresh),quote=F)
print('# deTags',quote=F)
print(head(deTags))
dg = DGEList[order(DE\$table\$PValue),]
#normData = (1e+06 * dg\$counts/expandAsMatrix(dg\$samples\$lib.size, dim(dg)))
efflib = dg\$samples\$lib.size*dg\$samples\$norm.factors
normData = (1e+06*dg\$counts/efflib)
outpdfname=paste(mt,"heatmap.pdf",sep='_')
hmap2(normData,nsamp=100,TName=TName,group=group,outpdfname=outpdfname,myTitle=myTitle)
outSmear = paste(mt,"Smearplot.pdf",sep='_')
outMain = paste("Smear Plot for ",TName,' Vs ',CName,' (FDR@',fdrthresh,' N = ',nsig,')',sep='')
smearPlot(DGEList=DGEList,deTags=deTags, outSmear=outSmear, outMain = outMain)
qqPlot(descr=myTitle,pvector=DE\$table\$PValue)
if (doDESEQ) {
 cat("# DESeq top 50\n")
 print(rDESeq[1:50,])
}
cat("# VOOM top 50\n")
print(rvoom[1:50,])
# need a design matrix and glm to use this
# NOTE(review): gof() was already run above; this second pass re-plots the
# same outliers to the same pdf name - candidate for removal, left as-is
goodness = gof(DGLM, pcutoff=fdrthresh)
nout = sum(goodness\$outlier)
if (nout > 0) {
 print.noquote(paste('Found',nout,'Goodness of fit outliers'))
 rownames(DGLM)[goodness\$outlier]
 z = limma::zscoreGamma(goodness\$gof.statistic, shape=goodness\$df/2, scale=2)
 pdf(paste(mt,"GoodnessofFit.pdf",sep='_'))
 qq = qqnorm(z, panel.first=grid(), main="tagwise dispersion")
 abline(0,1,lwd=3)
 points(qq\$x[goodness\$outlier],qq\$y[goodness\$outlier], pch=16, col="dodgerblue")
 dev.off()
}
#Return our main table
uoutput

} #Done
# ---- driver: Galaxy/Cheetah substitutes the $-parameters below at runtime ----
sink(stdout(),append=T,type="message")
options(width=512)
Out_Dir = "$html_file.files_path"
Input = "$input1"
TreatmentName = "$treatment_name"
TreatmentCols = "$Treat_cols"
ControlName = "$control_name"
ControlCols= "$Control_cols"
outputfilename = "$outtab"
fdrtype = "$fdrtype"
priordf = $priordf
fdrthresh = $fdrthresh
useNDF = "$useNDF"
fQ = $fQ # non-differential centile cutoff
myTitle = "$title"
subjects = c($subjectids)
nsubj = length(subjects)
#Set our columns: Galaxy column numbers are 1-based including the row-name
#column, which read.table consumes, hence the -1 shift
TCols = as.numeric(strsplit(TreatmentCols,",")[[1]])-1
CCols = as.numeric(strsplit(ControlCols,",")[[1]])-1
cat('# got TCols=')
cat(TCols)
cat('; CCols=')
cat(CCols)
cat('\n')
useCols = c(TCols,CCols)
# Create output dir if non existent
if (file.exists(Out_Dir) == F) dir.create(Out_Dir)

Count_Matrix = read.table(Input,header=T,row.names=1,sep='\t') #Load tab file assume header
snames = colnames(Count_Matrix)
nsamples = length(snames)
# subject ids, when supplied, must cover every sample column (exit code 4
# is declared fatal in the tool's <stdio> section)
if (nsubj > 0 & nsubj != nsamples) {
 options("show.error.messages"=T)
 mess = paste('Fatal error: Supplied subject id list',paste(subjects,collapse=','),'has length',nsubj,'but there are',nsamples,'samples',paste(snames,collapse=','))
 write(mess, stderr())
 #print(mess)
 quit(save="no",status=4)
}

Count_Matrix = Count_Matrix[,useCols] # reorder columns
# keep subject ids aligned with the reordered sample columns
if (length(subjects) != 0) {subjects = subjects[useCols]}
rn = rownames(Count_Matrix)
# pseudo-rows written by the upstream count tool are split out of the matrix
islib = rn %in% c('librarySize','NotInBedRegions')
LibSizes = Count_Matrix[subset(rn,islib),][1] # take first
Count_Matrix = Count_Matrix[subset(rn,! islib),]
group = c(rep(TreatmentName,length(TCols)), rep(ControlName,length(CCols)) ) #Build a group descriptor
group = factor(group, levels=c(ControlName,TreatmentName))
colnames(Count_Matrix) = paste(group,colnames(Count_Matrix),sep="_") #Relable columns
results = edgeIt(Count_Matrix=Count_Matrix,group=group,outputfilename=outputfilename,fdrtype=fdrtype,priordf=priordf,fdrthresh=fdrthresh,
   outputdir=Out_Dir,myTitle=myTitle,libSize=c(),useNDF=useNDF,filterquantile=fQ,subjects=subjects)
#Run the main function
# for the log
sessionInfo()
]]>
    </configfile>
  </configfiles>
  <help>

**What it does**

Performs digital gene expression analysis between a treatment and control on a count matrix.
Optionally adds a term for subject if not all samples are independent or if some other factor needs to be blocked in the design.

**Input**

A matrix consisting of non-negative integers. The matrix must have a unique header row identifying the samples, and a unique set of row names
as the first column. Typically the row names are gene symbols or probe id's for downstream use in GSEA and other methods.

If you have (eg) paired samples and wish to include a term in the GLM to account for some other factor (subject in the case of paired samples),
put a comma separated list of indicators for every sample (whether modelled or not!) indicating (eg) the subject number or
a list of integers, one for each subject, or an empty string if samples are all independent.
If not empty, there must be exactly as many integers in the supplied integer list as there are columns (samples) in the count matrix.
Integers for samples that are not in the analysis *must* be present in the string as filler even if not used.

So if you have 2 pairs out of 6 samples, you need to put in unique integers for the unpaired ones
eg if you had 6 samples with the first two independent but the second and third pairs each being from independent subjects, you might use
8,9,1,1,2,2
as subject IDs to indicate two paired samples from the same subject in columns 3/4 and 5/6

**Output**

A matrix which consists of the original data and relative expression levels and some helpful plots

**Note on edgeR versions**

The edgeR authors made a small cosmetic change in the name of one important variable (from p.value to PValue)
breaking this and all other code that assumed the old name for this variable,
between edgeR 2.4.4 and 2.4.6 (the version for R 2.14 as at the time of writing).
This means that all code using edgeR is sensitive to the version. I think this was a very unwise thing
to do because it wasted hours of my time to track down and will similarly cost other edgeR users dearly
when their old scripts break. This tool currently now works with 2.4.6.

**Note on prior.N**

http://seqanswers.com/forums/showthread.php?t=5591 says:

*prior.n*

The value for prior.n determines the amount of smoothing of tagwise dispersions towards the common dispersion.
You can think of it as like a "weight" for the common value. (It is actually the weight for the common likelihood
in the weighted likelihood equation). The larger the value for prior.n, the more smoothing, i.e. the closer your
tagwise dispersion estimates will be to the common dispersion. If you use a prior.n of 1, then that gives the
common likelihood the weight of one observation.

In answer to your question, it is a good thing to squeeze the tagwise dispersions towards a common value,
or else you will be using very unreliable estimates of the dispersion. I would not recommend using the value that
you obtained from estimateSmoothing()---this is far too small and would result in virtually no moderation
(squeezing) of the tagwise dispersions. How many samples do you have in your experiment?
What is the experimental design? If you have few samples (less than 6) then I would suggest a prior.n of at least 10.
If you have more samples, then the tagwise dispersion estimates will be more reliable,
so you could consider using a smaller prior.n, although I would hesitate to use a prior.n less than 5.


From Bioconductor Digest, Vol 118, Issue 5, Gordon writes:

Dear Dorota,

The important settings are prior.df and trend.

prior.n and prior.df are related through prior.df = prior.n * residual.df,
and your experiment has residual.df = 36 - 12 = 24. So the old setting of
prior.n=10 is equivalent for your data to prior.df = 240, a very large
value. Going the other way, the new setting of prior.df=10 is equivalent
to prior.n=10/24.

To recover old results with the current software you would use

estimateTagwiseDisp(object, prior.df=240, trend="none")

To get the new default from old software you would use

estimateTagwiseDisp(object, prior.n=10/24, trend=TRUE)

Actually the old trend method is equivalent to trend="loess" in the new
software. You should use plotBCV(object) to see whether a trend is
required.

Note you could also use

prior.n = getPriorN(object, prior.df=10)

to map between prior.df and prior.n.

  </help>
</tool>