0
<tool id="rgDifferentialCount" name="Differential_Count" version="0.20">
  <description>models using BioConductor packages</description>
  <requirements>
      <requirement type="package" version="2.12">biocbasics</requirement>
      <requirement type="package" version="3.0.1">r3</requirement>
      <requirement type="package" version="1.3.18">graphicsmagick</requirement>
      <requirement type="package" version="9.07">ghostscript</requirement>
  </requirements>
  <command interpreter="python">
     rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "DifferentialCounts"
    --output_dir "$html_file.files_path" --output_html "$html_file" --make_HTML "yes"
  </command>
  <inputs>
    <param name="input1" type="data" format="tabular" label="Select an input matrix - rows are contigs, columns are counts for each sample"
       help="Use the HTSeq based count matrix preparation tool to create these matrices from BAM/SAM files and a GTF file of genomic features"/>
    <param name="title" type="text" value="Differential Counts" size="80" label="Title for job outputs"
       help="Supply a meaningful name here to remind you what the outputs contain">
      <sanitizer invalid_char="">
        <valid initial="string.letters,string.digits"><add value="_" /> </valid>
      </sanitizer>
    </param>
    <param name="treatment_name" type="text" value="Treatment" size="50" label="Treatment Name"/>
    <param name="Treat_cols" label="Select columns containing treatment." type="data_column" data_ref="input1" numerical="True"
       multiple="true" use_header_names="true" size="120" display="checkboxes">
      <validator type="no_options" message="Please select at least one column."/>
    </param>
    <param name="control_name" type="text" value="Control" size="50" label="Control Name"/>
    <param name="Control_cols" label="Select columns containing control." type="data_column" data_ref="input1" numerical="True"
       multiple="true" use_header_names="true" size="120" display="checkboxes" optional="true">
    </param>
    <param name="subjectids" type="text" optional="true" size="120" value = ""
       label="IF SUBJECTS NOT ALL INDEPENDENT! Enter integers to indicate sample pairing for every column in input"
       help="Leave blank if no pairing, but eg if data from sample id A99 is in columns 2,4 and id C21 is in 3,5 then enter '1,2,1,2'">
      <sanitizer>
        <valid initial="string.digits"><add value="," /> </valid>
      </sanitizer>
    </param>
    <param name="fQ" type="float" value="0.3" size="5" label="Non-differential contig count quantile threshold - zero to analyze all non-zero read count contigs"
       help="May be a good or a bad idea depending on the biology and the question. EG 0.3 = sparsest 30% of contigs with at least one read are removed before analysis"/>
    <param name="useNDF" type="boolean" truevalue="T" falsevalue="F" checked="false" size="1"
       label="Non differential filter - remove contigs below a threshold (1 per million) for half or more samples"
       help="May be a good or a bad idea depending on the biology and the question. This was the old default. Quantile based is available as an alternative"/>
    <conditional name="edgeR">
      <param name="doedgeR" type="select"
         label="Run this model using edgeR"
         help="edgeR uses a negative binomial model and seems to be powerful, even with few replicates">
        <option value="F">Do not run edgeR</option>
        <option value="T" selected="true">Run edgeR</option>
      </param>
      <when value="T">
        <param name="edgeR_priordf" type="integer" value="20" size="3"
           label="prior.df for tagwise dispersion - lower value = more emphasis on each tag's variance. Replaces prior.n and prior.df = prior.n * residual.df"
           help="0 = Use edgeR default. Use a small value to 'smooth' small samples. See edgeR docs and note below"/>
      </when>
      <when value="F"> </when>
    </conditional>
    <conditional name="DESeq2">
      <param name="doDESeq2" type="select"
         label="Run the same model with DESeq2 and compare findings"
         help="DESeq2 is an update to the DESeq package. It uses different assumptions and methods to edgeR">
        <option value="F" selected="true">Do not run DESeq2</option>
        <option value="T">Run DESeq2 (only works if NO second GLM factor supplied at present)</option>
      </param>
      <when value="T">
        <param name="DESeq_fitType" type="select">
          <option value="parametric" selected="true">Parametric (default) fit for dispersions</option>
          <option value="local">Local fit - use this if parametric fails</option>
          <option value="mean">Mean dispersion fit- use this if you really understand what you're doing - read the fine manual</option>
        </param>
      </when>
      <when value="F"> </when>
    </conditional>
    <param name="doVoom" type="select"
       label="Run the same model with Voom/limma and compare findings"
       help="The VOOM transformation allows analysis of count data using limma">
      <option value="F" selected="true">Do not run VOOM</option>
      <option value="T">Run VOOM</option>
    </param>
    <!-- typo fixed: was "amily wise" -->
    <param name="fdrthresh" type="float" value="0.05" size="5" label="P value threshold for FDR filtering for family wise error rate control"
       help="Conventional default value of 0.05 recommended"/>
    <param name="fdrtype" type="select" label="FDR (Type II error) control method"
       help="Use fdr or bh typically to control for the number of tests in a reliable way">
      <option value="fdr" selected="true">fdr</option>
      <option value="BH">Benjamini Hochberg</option>
      <!-- spelling fixed: was "Yukateli" -->
      <option value="BY">Benjamini Yekutieli</option>
      <option value="bonferroni">Bonferroni</option>
      <option value="hochberg">Hochberg</option>
      <option value="holm">Holm</option>
      <option value="hommel">Hommel</option>
      <option value="none">no control for multiple tests</option>
    </param>
  </inputs>
  <outputs>
    <data format="tabular" name="out_edgeR" label="${title}_topTable_edgeR.xls">
      <filter>edgeR['doedgeR'] == "T"</filter>
    </data>
    <data format="tabular" name="out_DESeq2" label="${title}_topTable_DESeq2.xls">
      <filter>DESeq2['doDESeq2'] == "T"</filter>
    </data>
    <data format="tabular" name="out_VOOM" label="${title}_topTable_VOOM.xls">
      <filter>doVoom == "T"</filter>
    </data>
    <data format="html" name="html_file" label="${title}.html"/>
  </outputs>
  <stdio>
    <exit_code range="4" level="fatal" description="Number of subject ids must match total number of samples in the input matrix" />
  </stdio>
<!-- functional test: two-group comparison on test_bams2mx.xls exercising
     all three engines (edgeR, VOOM and DESeq2) at once -->
<tests>
<test>
<param name='input1' value='test_bams2mx.xls' ftype='tabular' />
<param name='treatment_name' value='case' />
<param name='title' value='edgeRtest' />
<param name='useNDF' value='' />
<param name='doedgeR' value='T' />
<param name='doVoom' value='T' />
<param name='doDESeq2' value='T' />
<param name='fdrtype' value='fdr' />
<param name='edgeR_priordf' value="8" />
<param name='fdrthresh' value="0.05" />
<param name='control_name' value='control' />
<param name='subjectids' value='' />
<param name='Treat_cols' value='3,4,5,9' />
<param name='Control_cols' value='2,6,7,8' />
<output name='out_edgeR' file='edgeRtest1out.xls' compare='diff' />
<!-- NOTE(review): lines_diff presumably tolerates run-dependent lines in the
     generated html (timestamps etc.) - confirm against the tool runner -->
<output name='html_file' file='edgeRtest1out.html' compare='diff' lines_diff='20' />
</test>
</tests>
+ − 130
<!-- the R script below is rendered by Cheetah into a file passed to Rscript;
     literal R dollar signs are escaped as backslash-dollar throughout -->
<configfiles>
<configfile name="runme">
<![CDATA[
#
# edgeR.Rscript
# updated npv 2011 for R 2.14.0 and edgeR 2.4.0 by ross
# Performs DGE on a count table containing n replicates of two conditions
#
# Parameters
#
# 1 - Output Dir

# Original edgeR code by: S.Lunke and A.Kaspi
# Finite bounds used by qqPlot below to clamp infinite -log10 p values
reallybig = log10(.Machine\$double.xmax)
reallysmall = log10(.Machine\$double.xmin)
library('stringr')
library('gplots')
library('edgeR')
# Clustered heatmap (gplots heatmap.2) of at most nsamp rows of cmat,
# with a column side bar coloured by group; written to outpdfname.
# NOTE(review): rows appear to be assumed pre-ranked by evidence (only the
# first nsamp are kept) - confirm with callers. samples and TName are unused.
hmap2 = function(cmat,nsamp=100,outpdfname='heatmap2.pdf', TName='Treatment',group=NA,myTitle='title goes here')
{
# Perform clustering for significant pvalues after controlling FWER
samples = colnames(cmat)
gu = unique(group)
if (length(gu) == 2) {
# exactly two groups get fixed red/blue side colours
col.map = function(g) {if (g==gu[1]) "#FF0000" else "#0000FF"}
pcols = unlist(lapply(group,col.map))
} else {
# more groups - spread side colours over a rainbow
colours = rainbow(length(gu),start=0,end=4/6)
pcols = colours[match(group,gu)] }
gn = rownames(cmat)
dm = cmat[(! is.na(gn)),]
# remove unlabelled hm rows
nprobes = nrow(dm)
# sub = paste('Showing',nprobes,'contigs ranked for evidence of differential abundance')
if (nprobes > nsamp) {
dm =dm[1:nsamp,]
#sub = paste('Showing',nsamp,'contigs ranked for evidence for differential abundance out of',nprobes,'total')
}
# truncate long sample names so column labels stay readable
newcolnames = substr(colnames(dm),1,20)
colnames(dm) = newcolnames
pdf(outpdfname)
heatmap.2(dm,main=myTitle,ColSideColors=pcols,col=topo.colors(100),dendrogram="col",key=T,density.info='none',
Rowv=F,scale='row',trace='none',margins=c(8,8),cexRow=0.4,cexCol=0.5)
dev.off()
}
+ − 176
# Basic stats::heatmap of at most nsamp rows of cmat, side bar coloured by
# group; written to outpdfname. NOTE(review): nmeans and TName are unused.
hmap = function(cmat,nmeans=4,outpdfname="heatMap.pdf",nsamp=250,TName='Treatment',group=NA,myTitle="Title goes here")
{
# for 2 groups only was
#col.map = function(g) {if (g==TName) "#FF0000" else "#0000FF"}
#pcols = unlist(lapply(group,col.map))
gu = unique(group)
colours = rainbow(length(gu),start=0.3,end=0.6)
pcols = colours[match(group,gu)]
nrows = nrow(cmat)
mtitle = paste(myTitle,'Heatmap: n contigs =',nrows)
if (nrows > nsamp) {
# keep only the top nsamp rows and say so in the title
cmat = cmat[c(1:nsamp),]
mtitle = paste('Heatmap: Top ',nsamp,' DE contigs (of ',nrows,')',sep='')
}
# truncate long sample names for readable column labels
newcolnames = substr(colnames(cmat),1,20)
colnames(cmat) = newcolnames
pdf(outpdfname)
heatmap(cmat,scale='row',main=mtitle,cexRow=0.3,cexCol=0.4,Rowv=NA,ColSideColors=pcols)
dev.off()
}
+ − 197
# QQ plot of observed vs expected -log10 p values, written to outpdf.
# Infinite transformed values are clamped to the module-level
# reallysmall/reallybig bounds. Extra arguments pass through to plot().
qqPlot = function(descr='qqplot',pvector, outpdf='qqplot.pdf',...)
# adapted from https://gist.github.com/703512
{
    observed = -log10(sort(pvector,decreasing=F))
    expected = -log10( 1:length(observed)/length(observed) )
    # clamp +/-Inf (from p == 1 or p == 0) to finite plotting bounds
    observed[observed==-Inf] = reallysmall
    observed[observed==Inf] = reallybig
    pdf(outpdf)
    plot(expected,observed,pch=19,cex=1, main=descr, ...,
        xlab=expression(Expected~~-log[10](italic(p))),
        ylab=expression(Observed~~-log[10](italic(p))),
        xlim=c(0,max(expected)), ylim=c(0,max(observed)))
    # identity line: points above it have smaller p than expected by chance
    lines(expected,expected,col="red")
    grid(col = "lightgray", lty = "dotted")
    dev.off()
}
+ − 215
# Smear (MA) plot of the DGEList with the de.tags rows highlighted;
# written to the outSmear pdf with title outMain.
smearPlot = function(DGEList,deTags, outSmear, outMain)
{
pdf(outSmear)
plotSmear(DGEList,de.tags=deTags,main=outMain)
grid(col="lightgray", lty="dotted")
dev.off()
}
+ − 223
# Side-by-side boxplots of raw vs cleaned (normalised) log counts written to
# pdfname, plus a grid of per-sample histograms of the raw values written to
# sample_counts_histogram.pdf. Negative raw values are blanked to NA first.
# NOTE(review): myTitle is unused in this function.
boxPlot = function(rawrs,cleanrs,maint,myTitle,pdfname)
{ #
nc = ncol(rawrs)
# mask negative entries so the log-scale summaries and plots ignore them
for (i in c(1:nc)) {rawrs[(rawrs[,i] < 0),i] = NA}
fullnames = colnames(rawrs)
# truncate long sample names for readable axis labels
newcolnames = substr(colnames(rawrs),1,20)
colnames(rawrs) = newcolnames
newcolnames = substr(colnames(cleanrs),1,20)
colnames(cleanrs) = newcolnames
defpar = par(no.readonly=T)
print.noquote('raw contig counts by sample:')
print.noquote(summary(rawrs))
print.noquote('normalised contig counts by sample:')
print.noquote(summary(cleanrs))
pdf(pdfname)
par(mfrow=c(1,2))
boxplot(rawrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('Raw:',maint))
grid(col="lightgray",lty="dotted")
boxplot(cleanrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('After ',maint))
grid(col="lightgray",lty="dotted")
dev.off()
pdfname = "sample_counts_histogram.pdf"
nc = ncol(rawrs)
print.noquote(paste('Using ncol rawrs=',nc))
# lay the histograms out on a near-square grid
ncroot = round(sqrt(nc))
if (ncroot*ncroot < nc) { ncroot = ncroot + 1 }
m = c()
# first pass finds a shared y limit across all samples
for (i in c(1:nc)) {
rhist = hist(rawrs[,i],breaks=100,plot=F)
m = append(m,max(rhist\$counts))
}
ymax = max(m)
pdf(pdfname)
par(mfrow=c(ncroot,ncroot))
for (i in c(1:nc)) {
hist(rawrs[,i], main=paste("Contig logcount",i), xlab='log raw count', col="maroon",
breaks=100,sub=fullnames[i],cex=0.8,ylim=c(0,ymax))
}
dev.off()
par(defpar)

}
+ − 266
# Histograms of log10 contig row sums before and after filtering, on a shared
# x range; writes Filtering_rowsum_bar_charts.pdf and restores graphics state.
cumPlot = function(rawrs,cleanrs,maint,myTitle)
{ # updated to use ecdf
pdfname = "Filtering_rowsum_bar_charts.pdf"
defpar = par(no.readonly=T)
lrs = log(rawrs,10)
# common x limit taken from the unfiltered data so panels are comparable
lim = max(lrs)
pdf(pdfname)
par(mfrow=c(2,1))
hist(lrs,breaks=100,main=paste('Before:',maint),xlab="# Reads (log)",
ylab="Count",col="maroon",sub=myTitle, xlim=c(0,lim),las=1)
grid(col="lightgray", lty="dotted")
lrs = log(cleanrs,10)
hist(lrs,breaks=100,main=paste('After:',maint),xlab="# Reads (log)",
ylab="Count",col="maroon",sub=myTitle,xlim=c(0,lim),las=1)
grid(col="lightgray", lty="dotted")
dev.off()
par(defpar)
}
+ − 285
# Cumulative-proportion (ecdf knot) curves of contig totals before and after
# filtering, log x axis; writes a pdf named from myTitle.
# NOTE(review): looks unused by the rest of this script - confirm before removal.
cumPlot1 = function(rawrs,cleanrs,maint,myTitle)
{ # updated to use ecdf
pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"RowsumCum.pdf",sep='_')
pdf(pdfname)
par(mfrow=c(2,1))
# shared x limit from the unfiltered data keeps both panels comparable
lastx = max(rawrs)
rawe = knots(ecdf(rawrs))
cleane = knots(ecdf(cleanrs))
cy = 1:length(cleane)/length(cleane)
ry = 1:length(rawe)/length(rawe)
plot(rawe,ry,type='l',main=paste('Before',maint),xlab="Log Contig Total Reads",
ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle)
grid(col="blue")
plot(cleane,cy,type='l',main=paste('After',maint),xlab="Log Contig Total Reads",
ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle)
grid(col="blue")
dev.off()
}
+ − 304
+ − 305
+ − 306
# Gene set enrichment using limma::camera on the fitted DGEList y and design.
# Reads gene sets from a builtin gmt (bigmt) and/or a history gmt (histgmt),
# keeps sets with between minnin and maxnin matching rownames of y, and writes
# camera_up_<outfname> / camera_down_<outfname> tables; progress goes to
# Camera.log. The ntest argument is overwritten internally (kept for
# interface compatibility).
doGSEA = function(y=NULL,design=NULL,histgmt="",
   bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
   ntest=0, myTitle="myTitle", outfname="GSEA.xls", minnin=5, maxnin=2000,fdrthresh=0.05,fdrtype="BH")
{
  sink('Camera.log')
  genesets = c()
  if (bigmt > "")
  {
    bigenesets = readLines(bigmt)
    genesets = bigenesets
  }
  if (histgmt > "")
  {
    hgenesets = readLines(histgmt)
    if (bigmt > "") {
      # NOTE(review): was rbind(), which builds a recycled character matrix
      # and scrambles the set order - plain concatenation is what is wanted
      genesets = c(genesets,hgenesets)
    } else {
      genesets = hgenesets
    } # use only history if no builtin
  }
  print.noquote(paste("@@@read",length(genesets), 'genesets from',histgmt,bigmt))
  genesets = strsplit(genesets,'\t') # tabular. genesetid\tURLorwhatever\tgene_1\t..\tgene_n
  outf = outfname
  # renamed from 'head' so utils::head is not shadowed below
  heading = paste(myTitle,'edgeR GSEA')
  write(heading,file=outfname,append=F)
  ntest = length(genesets)
  urownames = toupper(rownames(y))
  upcam = c()
  downcam = c()
  # seq_len avoids the 1:0 trap when no gene sets were read
  for (i in seq_len(ntest)) {
    gs = unlist(genesets[i])
    g = gs[1] # geneset_id
    u = gs[2]
    if (u > "") { u = paste("<a href=\'",u,"\'>",u,"</a>",sep="") }
    glist = gs[3:length(gs)] # member gene symbols
    glist = toupper(glist)
    inglist = urownames %in% glist
    nin = sum(inglist)
    # only test sets of a sensible size relative to the data
    if ((nin > minnin) && (nin < maxnin)) {
      camres = camera(y=y,index=inglist,design=design)
      if (! is.null(camres)) {
        rownames(camres) = g # gene set name
        camres = cbind(GeneSet=g,URL=u,camres)
        if (camres\$Direction == "Up")
        {
          upcam = rbind(upcam,camres)
        } else {
          downcam = rbind(downcam,camres)
        }
      }
    }
  }
  # guard each direction - nothing may have passed the filters, and the old
  # unconditional adjPValue assignment crashed on a NULL data frame
  if (! is.null(upcam)) {
    uscam = upcam[order(upcam\$PValue),]
    uscam\$adjPValue = p.adjust(uscam\$PValue,method=fdrtype)
    nup = max(10,sum((uscam\$adjPValue < fdrthresh)))
    write.table(uscam,file=paste('camera_up',outfname,sep='_'),quote=F,sep='\t',row.names=F)
    print.noquote(paste('@@@@@ Camera up top',nup,'gene sets:'))
    write.table(head(uscam,nup),file="",quote=F,sep='\t',row.names=F)
  } else {
    print.noquote('@@@@@ Camera found no up gene sets')
  }
  if (! is.null(downcam)) {
    dscam = downcam[order(downcam\$PValue),]
    dscam\$adjPValue = p.adjust(dscam\$PValue,method=fdrtype)
    ndown = max(10,sum((dscam\$adjPValue < fdrthresh)))
    write.table(dscam,file=paste('camera_down',outfname,sep='_'),quote=F,sep='\t',row.names=F)
    print.noquote(paste('@@@@@ Camera down top',ndown,'gene sets:'))
    write.table(head(dscam,ndown),file="",quote=F,sep='\t',row.names=F)
  } else {
    print.noquote('@@@@@ Camera found no down gene sets')
  }
  sink()
}
+ − 375
+ − 376
+ − 377
7
# Differential abundance on a two-condition contig count matrix.
# Filters low-count contigs, builds (or accepts) a design matrix, then runs
# any of edgeR, DESeq2 and voom/limma on the same filtered matrix, writing
# tabular toptables (out_edgeR/out_DESeq2/out_VOOM paths) and diagnostic pdfs.
# Returns the unsorted edgeR result table.
# NOTE(review): the body reads the module-level edgeR_priordf rather than the
# priordf parameter, and libSize/outputdir/doCook are unused - kept as-is for
# interface compatibility with callers outside this view.
edgeIt = function (Count_Matrix,group,out_edgeR=F,out_VOOM=F,out_DESeq2=F,fdrtype='fdr',priordf=5,
        fdrthresh=0.05,outputdir='.', myTitle='Differential Counts',libSize=c(),useNDF=F,
        filterquantile=0.2, subjects=c(),mydesign=NULL,
        doDESeq2=T,doVoom=T,doCamera=T,doedgeR=T,org='hg19',
        histgmt="", bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
        doCook=F,DESeq_fitType="parametric")
{
  # NOTE(review): default fitType was misspelled "parameteric", which would
  # make estimateDispersions fail whenever the default was actually used
  # Error handling - this model only supports exactly two conditions
  if (length(unique(group))!=2){
    print("Number of conditions identified in experiment does not equal 2")
    q()
  }
  require(edgeR)
  options(width = 512)
  mt = paste(unlist(strsplit(myTitle,'_')),collapse=" ")
  allN = nrow(Count_Matrix)
  nscut = round(ncol(Count_Matrix)/2)
  colTotmillionreads = colSums(Count_Matrix)/1e6
  counts.dataframe = as.data.frame(c())
  rawrs = rowSums(Count_Matrix)
  nonzerod = Count_Matrix[(rawrs > 0),] # remove all zero count genes
  nzN = nrow(nonzerod)
  nzrs = rowSums(nonzerod)
  zN = allN - nzN
  print('# Quantiles for non-zero row counts:',quote=F)
  print(quantile(nzrs,probs=seq(0,1,0.1)),quote=F)
  if (useNDF == "T")
  {
    # old-style non-differential filter: keep contigs with at least one read
    # per million in at least half of the samples
    gt1rpin3 = rowSums(Count_Matrix/expandAsMatrix(colTotmillionreads,dim(Count_Matrix)) >= 1) >= nscut
    lo = colSums(Count_Matrix[!gt1rpin3,])
    workCM = Count_Matrix[gt1rpin3,]
    cleanrs = rowSums(workCM)
    cleanN = length(cleanrs)
    meth = paste( "After removing",length(lo),"contigs with fewer than ",nscut," sample read counts >= 1 per million, there are",sep="")
    print(paste("Read",allN,"contigs. Removed",zN,"contigs with no reads.",meth,cleanN,"contigs"),quote=F)
    maint = paste('Filter >=1/million reads in >=',nscut,'samples')
  } else {
    # quantile filter: drop the sparsest contigs below the row-sum quantile
    useme = (nzrs > quantile(nzrs,filterquantile))
    workCM = nonzerod[useme,]
    lo = colSums(nonzerod[!useme,])
    cleanrs = rowSums(workCM)
    cleanN = length(cleanrs)
    meth = paste("After filtering at count quantile =",filterquantile,", there are",sep="")
    print(paste('Read',allN,"contigs. Removed",zN,"with no reads.",meth,cleanN,"contigs"),quote=F)
    maint = paste('Filter below',filterquantile,'quantile')
  }
  cumPlot(rawrs=rawrs,cleanrs=cleanrs,maint=maint,myTitle=myTitle)
  allgenes = rownames(workCM)
  # build per-contig hyperlinks - ucsc position links when most rownames look
  # like chrN:start-end, genecards keyword links otherwise
  reg = "^chr([0-9]+):([0-9]+)-([0-9]+)"
  genecards="<a href=\'http://www.genecards.org/index.php?path=/Search/keyword/"
  ucsc = paste("<a href=\'http://genome.ucsc.edu/cgi-bin/hgTracks?db=",org,sep='')
  testreg = str_match(allgenes,reg)
  if (sum(!is.na(testreg[,1]))/length(testreg[,1]) > 0.8) # is ucsc style string
  {
    print("@@ using ucsc substitution for urls")
    contigurls = paste0(ucsc,"&position=chr",testreg[,2],":",testreg[,3],"-",testreg[,4],"\'>",allgenes,"</a>")
  } else {
    print("@@ using genecards substitution for urls")
    contigurls = paste0(genecards,allgenes,"\'>",allgenes,"</a>")
  }
  print.noquote("# urls")
  print.noquote(head(contigurls))
  print(paste("# Total low count contigs per sample = ",paste(lo,collapse=',')),quote=F)
  cmrowsums = rowSums(workCM)
  TName=unique(group)[1]
  CName=unique(group)[2]
  if (is.null(mydesign)) {
    if (length(subjects) == 0)
    {
      mydesign = model.matrix(~group)
    }
    else {
      subjf = factor(subjects)
      mydesign = model.matrix(~subjf+group) # we block on subject so make group last to simplify finding it
    }
  }
  print.noquote(paste('Using samples:',paste(colnames(workCM),collapse=',')))
  print.noquote('Using design matrix:')
  print.noquote(mydesign)
  if (doedgeR) {
    sink('edgeR.log')
    #### Setup DGEList object
    DGEList = DGEList(counts=workCM, group = group)
    DGEList = calcNormFactors(DGEList)
    DGEList = estimateGLMCommonDisp(DGEList,mydesign)
    comdisp = DGEList\$common.dispersion
    DGEList = estimateGLMTrendedDisp(DGEList,mydesign)
    if (edgeR_priordf > 0) {
      print.noquote(paste("prior.df =",edgeR_priordf))
      DGEList = estimateGLMTagwiseDisp(DGEList,mydesign,prior.df = edgeR_priordf)
    } else {
      DGEList = estimateGLMTagwiseDisp(DGEList,mydesign)
    }
    DGLM = glmFit(DGEList,design=mydesign)
    DE = glmLRT(DGLM,coef=ncol(DGLM\$design)) # always last one - subject is first if needed
    efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors
    normData = (1e+06*DGEList\$counts/efflib)
    uoutput = cbind(
      Name=as.character(rownames(DGEList\$counts)),
      DE\$table,
      adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
      Dispersion=DGEList\$tagwise.dispersion,totreads=cmrowsums,normData,
      DGEList\$counts
    )
    soutput = uoutput[order(DE\$table\$PValue),] # sorted into p value order - for quick toptable
    # goodness of fit diagnostics with outliers highlighted on a qq plot
    goodness = gof(DGLM, pcutoff=fdrthresh)
    if (sum(goodness\$outlier) > 0) {
      print.noquote('GLM outliers:')
      print(paste(rownames(DGLM)[(goodness\$outlier)],collapse=','),quote=F)
    } else {
      print('No GLM fit outlier genes found\n')
    }
    z = limma::zscoreGamma(goodness\$gof.statistic, shape=goodness\$df/2, scale=2)
    pdf("edgeR_GoodnessofFit.pdf")
    qq = qqnorm(z, panel.first=grid(), main="tagwise dispersion")
    abline(0,1,lwd=3)
    points(qq\$x[goodness\$outlier],qq\$y[goodness\$outlier], pch=16, col="maroon")
    dev.off()
    estpriorn = getPriorN(DGEList)
    print(paste("Common Dispersion =",comdisp,"CV = ",sqrt(comdisp),"getPriorN = ",estpriorn),quote=F)
    efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors
    normData = (1e+06*DGEList\$counts/efflib)
    uniqueg = unique(group)
    #### Plot MDS
    sample_colors = match(group,levels(group))
    sampleTypes = levels(factor(group))
    print.noquote(sampleTypes)
    pdf("edgeR_MDSplot.pdf")
    plotMDS.DGEList(DGEList,main=paste("edgeR MDS for",myTitle),cex=0.5,col=sample_colors,pch=sample_colors)
    legend(x="topleft", legend = sampleTypes,col=c(1:length(sampleTypes)), pch=19)
    grid(col="blue")
    dev.off()
    colnames(normData) = paste( colnames(normData),'N',sep="_")
    print(paste('Raw sample read totals',paste(colSums(nonzerod,na.rm=T),collapse=',')))
    nzd = data.frame(log(nonzerod + 1e-2,10))
    boxPlot(rawrs=nzd,cleanrs=log(normData,10),maint='TMM Normalisation',myTitle=myTitle,pdfname="edgeR_raw_norm_counts_box.pdf")
    write.table(soutput,out_edgeR, quote=FALSE, sep="\t",row.names=F)
    tt = cbind(
      Name=as.character(rownames(DGEList\$counts)),
      DE\$table,
      adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
      Dispersion=DGEList\$tagwise.dispersion,totreads=cmrowsums
    )
    print.noquote("# edgeR Top tags\n")
    tt = cbind(tt,URL=contigurls) # add to end so table isn't laid out strangely
    tt = tt[order(DE\$table\$PValue),]
    print.noquote(tt[1:50,])
    deTags = rownames(uoutput[uoutput\$adj.p.value < fdrthresh,])
    nsig = length(deTags)
    print(paste('#',nsig,'tags significant at adj p=',fdrthresh),quote=F)
    # NOTE(review): removed dead deColours = ifelse(deTags,...) - deTags is a
    # character vector so the ifelse yielded NAs, and the result was never used
    pdf("edgeR_BCV_vs_abundance.pdf")
    plotBCV(DGEList, cex=0.3, main="Biological CV vs abundance")
    dev.off()
    dg = DGEList[order(DE\$table\$PValue),]
    efflib = dg\$samples\$lib.size*dg\$samples\$norm.factors
    normData = (1e+06*dg\$counts/efflib)
    outpdfname="edgeR_heatmap.pdf"
    hmap2(normData,nsamp=100,TName=TName,group=group,outpdfname=outpdfname,myTitle=myTitle)
    outSmear = "edgeR_smearplot.pdf"
    outMain = paste("Smear Plot for ",TName,' Vs ',CName,' (FDR@',fdrthresh,' N = ',nsig,')',sep='')
    smearPlot(DGEList=DGEList,deTags=deTags, outSmear=outSmear, outMain = outMain)
    qqPlot(descr=paste(myTitle,'edgeR QQ plot'),pvector=DE\$table\$PValue,outpdf='edgeR_qqplot.pdf')
    norm.factor = DGEList\$samples\$norm.factors
    topresults.edgeR = soutput[which(soutput\$adj.p.value < fdrthresh), ]
    edgeRcountsindex = which(allgenes %in% rownames(topresults.edgeR))
    edgeRcounts = rep(0, length(allgenes))
    edgeRcounts[edgeRcountsindex] = 1 # Create venn diagram of hits
    sink()
  } ### doedgeR
  if (doDESeq2 == T)
  {
    sink("DESeq2.log")
    # DESeq2
    require('DESeq2')
    library('RColorBrewer')
    pdata = data.frame(Name=colnames(workCM),Rx=group,row.names=colnames(workCM))
    deSEQds = DESeqDataSetFromMatrix(countData = workCM, colData = pdata, design = formula(~ Rx))
    #DESeq2 = DESeq(deSEQds,fitType='local',pAdjustMethod=fdrtype)
    #rDESeq = results(DESeq2)
    #newCountDataSet(workCM, group)
    deSeqDatsizefac = estimateSizeFactors(deSEQds)
    deSeqDatdisp = estimateDispersions(deSeqDatsizefac,fitType=DESeq_fitType)
    resDESeq = nbinomWaldTest(deSeqDatdisp, pAdjustMethod=fdrtype)
    rDESeq = as.data.frame(results(resDESeq))
    rDESeq = cbind(Contig=rownames(workCM),rDESeq,NReads=cmrowsums,URL=contigurls)
    srDESeq = rDESeq[order(rDESeq\$pvalue),]
    qqPlot(descr=paste(myTitle,'DESeq2 qqplot'),pvector=rDESeq\$pvalue,outpdf='DESeq2_qqplot.pdf')
    cat("# DESeq top 50\n")
    print.noquote(srDESeq[1:50,])
    write.table(srDESeq,out_DESeq2, quote=FALSE, sep="\t",row.names=F)
    topresults.DESeq = rDESeq[which(rDESeq\$padj < fdrthresh), ]
    DESeqcountsindex = which(allgenes %in% rownames(topresults.DESeq))
    DESeqcounts = rep(0, length(allgenes))
    DESeqcounts[DESeqcountsindex] = 1
    pdf("DESeq2_dispersion_estimates.pdf")
    plotDispEsts(resDESeq)
    dev.off()
    # symmetric y range for the MA plot, capped at 4
    ysmall = abs(min(rDESeq\$log2FoldChange))
    ybig = abs(max(rDESeq\$log2FoldChange))
    ylimit = min(4,ysmall,ybig)
    pdf("DESeq2_MA_plot.pdf")
    plotMA(resDESeq,main=paste(myTitle,"DESeq2 MA plot"),ylim=c(-ylimit,ylimit))
    dev.off()
    rlogres = rlogTransformation(resDESeq)
    sampledists = dist( t( assay(rlogres) ) )
    sdmat = as.matrix(sampledists)
    pdf("DESeq2_sample_distance_plot.pdf")
    heatmap.2(sdmat,trace="none",main=paste(myTitle,"DESeq2 sample distances"),
      col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(255))
    dev.off()
    sink()
    # plotPCA can fail on degenerate data - keep going either way
    result = try( (ppca = plotPCA( varianceStabilizingTransformation(deSeqDatdisp,blind=T), intgroup=c("Rx","Name")) ) )
    if ("try-error" %in% class(result)) {
      print.noquote('DESeq2 plotPCA failed.')
    } else {
      pdf("DESeq2_PCA_plot.pdf")
      #### wtf - print? Seems needed to get this to work
      print(ppca)
      dev.off()
    }
  }

  if (doVoom == T) {
    sink('VOOM.log')
    if (doedgeR == F) {
      #### Setup DGEList object - not done yet because edgeR was skipped
      DGEList = DGEList(counts=workCM, group = group)
      DGEList = calcNormFactors(DGEList)
      DGEList = estimateGLMCommonDisp(DGEList,mydesign)
      DGEList = estimateGLMTrendedDisp(DGEList,mydesign)
      # NOTE(review): this call was duplicated in the original - once is enough
      DGEList = estimateGLMTagwiseDisp(DGEList,mydesign)
      norm.factor = DGEList\$samples\$norm.factors
    }
    pdf("VOOM_mean_variance_plot.pdf")
    dat.voomed = voom(DGEList, mydesign, plot = TRUE, lib.size = colSums(workCM) * norm.factor)
    dev.off()
    # Use limma to fit data
    fit = lmFit(dat.voomed, mydesign)
    fit = eBayes(fit)
    rvoom = topTable(fit, coef = length(colnames(mydesign)), adj = fdrtype, n = Inf, sort="none")
    qqPlot(descr=paste(myTitle,'VOOM-limma QQ plot'),pvector=rvoom\$P.Value,outpdf='VOOM_qqplot.pdf')
    rownames(rvoom) = rownames(workCM)
    rvoom = cbind(rvoom,NReads=cmrowsums,URL=contigurls)
    srvoom = rvoom[order(rvoom\$P.Value),]
    write.table(srvoom,out_VOOM, quote=FALSE, sep="\t",row.names=F)
    # Use an FDR cutoff to find interesting samples for edgeR, DESeq and voom/limma
    topresults.voom = srvoom[which(rvoom\$adj.P.Val < fdrthresh), ]
    voomcountsindex = which(allgenes %in% topresults.voom\$ID)
    voomcounts = rep(0, length(allgenes))
    voomcounts[voomcountsindex] = 1
    cat("# VOOM top 50\n")
    print(srvoom[1:50,])
    sink()
  }

  if (doCamera) {
    doGSEA(y=DGEList,design=mydesign,histgmt=histgmt,bigmt=bigmt,ntest=20,myTitle=myTitle,
      outfname=paste(mt,"GSEA.xls",sep="_"),fdrthresh=fdrthresh,fdrtype=fdrtype)
  }

  if ((doDESeq2==T) || (doVoom==T) || (doedgeR==T)) {
    # venn diagram of significant hits for whichever engines were run together
    if ((doVoom==T) && (doDESeq2==T) && (doedgeR==T)) {
      vennmain = paste(mt,'Voom,edgeR and DESeq2 overlap at FDR=',fdrthresh)
      counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts,
        VOOM_limma = voomcounts, row.names = allgenes)
    } else if ((doDESeq2==T) && (doedgeR==T)) {
      vennmain = paste(mt,'DESeq2 and edgeR overlap at FDR=',fdrthresh)
      counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts, row.names = allgenes)
    } else if ((doVoom==T) && (doedgeR==T)) {
      vennmain = paste(mt,'Voom and edgeR overlap at FDR=',fdrthresh)
      counts.dataframe = data.frame(edgeR = edgeRcounts, VOOM_limma = voomcounts, row.names = allgenes)
    }

    # NOTE(review): was nrow(counts.dataframe > 1), which tests the row count
    # of a logical copy rather than comparing the row count itself
    if (nrow(counts.dataframe) > 1) {
      counts.venn = vennCounts(counts.dataframe)
      vennf = "Venn_significant_genes_overlap.pdf"
      pdf(vennf)
      vennDiagram(counts.venn,main=vennmain,col="maroon")
      dev.off()
    }
  } #### doDESeq2 or doVoom
  # Return our main table
  uoutput

}
+ − 667 #### Done
+ − 668
+ − 669 ###sink(stdout(),append=T,type="message")
# NOTE(review): from here on this is the Cheetah-templated driver section.
# Galaxy substitutes the dollar-variables below at job-generation time to
# produce the concrete R script that actually runs.
# Gene-set collections default to empty strings; camera is switched off below.
+ − 670 builtin_gmt=""
+ − 671 history_gmt=""
7
# Output-dataset variables default to F; the conditional template blocks below
# overwrite them with Galaxy output file paths when a method was requested.
+ − 672 out_edgeR = F
+ − 673 out_DESeq2 = F
+ − 674 out_VOOM = F
0
+ − 675 doDESeq2 = $DESeq2.doDESeq2 # make these T or F
+ − 676 doVoom = $doVoom
+ − 677 doCamera = F
+ − 678 doedgeR = $edgeR.doedgeR
# edgeR prior degrees of freedom; overridden from the form when edgeR is on.
+ − 679 edgeR_priordf = 0
+ − 680
+ − 681 #if $DESeq2.doDESeq2 == "T"
7
+ − 682 out_DESeq2 = "$out_DESeq2"
0
+ − 683 DESeq_fitType = "$DESeq2.DESeq_fitType"
+ − 684 #end if
+ − 685 #if $edgeR.doedgeR == "T"
7
+ − 686 out_edgeR = "$out_edgeR"
+ − 687 edgeR_priordf = $edgeR.edgeR_priordf
0
+ − 688 #end if
7
+ − 689 #if $doVoom == "T"
+ − 690 out_VOOM = "$out_VOOM"
+ − 691 #end if
0
+ − 692
# Concrete job parameters substituted in from the Galaxy tool form.
+ − 693 Out_Dir = "$html_file.files_path"
+ − 694 Input = "$input1"
+ − 695 TreatmentName = "$treatment_name"
+ − 696 TreatmentCols = "$Treat_cols"
+ − 697 ControlName = "$control_name"
+ − 698 ControlCols= "$Control_cols"
+ − 699 org = "$input1.dbkey"
+ − 700 if (org == "") { org = "hg19"} # fall back to hg19 when the input has no dbkey
+ − 701 fdrtype = "$fdrtype"
+ − 702 fdrthresh = $fdrthresh
+ − 703 useNDF = "$useNDF"
+ − 704 fQ = $fQ # non-differential centile cutoff
+ − 705 myTitle = "$title"
# Optional per-sample subject ids for a blocked (e.g. paired) design;
# empty vector means all samples are independent.
+ − 706 subjects = c($subjectids)
+ − 707 nsubj = length(subjects)
# DESeq2 cannot (yet) model the subject blocking factor, so a blocked design
# downgrades to the remaining methods with a warning rather than failing.
+ − 708 if (nsubj > 0) {
+ − 709 if (doDESeq2) {
+ − 710 print('WARNING - cannot yet use DESeq2 for 2 way anova - see the docs')
+ − 711 doDESeq2 = F
+ − 712 }
+ − 713 }
# Convert the comma-separated, 1-based Galaxy column selections into indices
# into Count_Matrix. read.table() below consumes the first input column as row
# names, so every selected column index shifts down by one.
TCols = as.numeric(strsplit(TreatmentCols,",")[[1]])-1
CCols = as.numeric(strsplit(ControlCols,",")[[1]])-1
cat('Got TCols=')
cat(TCols)
cat('; CCols=')
cat(CCols)
cat('\n')
useCols = c(TCols,CCols)
if (!file.exists(Out_Dir)) dir.create(Out_Dir) # idiomatic negation, was '== F'
# Load the tab-separated count matrix: header row names the samples,
# column 1 supplies the feature (row) names.
Count_Matrix = read.table(Input,header=T,row.names=1,sep='\t')
snames = colnames(Count_Matrix)
nsamples = length(snames)
# A non-empty subject id list must supply exactly one id per sample column;
# otherwise abort the job with a clear message on stderr.
# '&&' (short-circuit scalar AND) replaces '&' - both operands are scalars here.
if (nsubj > 0 && nsubj != nsamples) {
    options("show.error.messages"=T)
    mess = paste('Fatal error: Supplied subject id list',paste(subjects,collapse=','),
        'has length',nsubj,'but there are',nsamples,'samples',paste(snames,collapse=','))
    write(mess, stderr())
    quit(save="no",status=4)
}
+ − 733
# Keep only the selected columns, treatment first then control.
Count_Matrix = Count_Matrix[,useCols]
# Reorder the subject ids to match the reordered sample columns.
if (length(subjects) != 0) {subjects = subjects[useCols]}
# Split off the bookkeeping rows appended by the companion counter tool
# (library sizes / off-target totals) so they are not tested as features.
rn = rownames(Count_Matrix)
islib = rn %in% c('librarySize','NotInBedRegions')
LibSizes = Count_Matrix[subset(rn,islib),][1] # take first
Count_Matrix = Count_Matrix[subset(rn,! islib),]
# Build the two-level group factor; control is the reference level.
group = c(rep(TreatmentName,length(TCols)), rep(ControlName,length(CCols)) )
group = factor(group, levels=c(ControlName,TreatmentName))
colnames(Count_Matrix) = paste(group,colnames(Count_Matrix),sep="_") # Relabel columns
# Run the analysis. BUG FIX: the user-selected fdrtype, fdrthresh and subject
# ids are collected above, but the original call hard-coded 'BH', 0.05 and an
# empty subjects vector, silently ignoring the tool-form settings and making a
# blocked/paired design impossible - pass the collected values through instead.
results = edgeIt(Count_Matrix=Count_Matrix,group=group,out_edgeR=out_edgeR, out_VOOM=out_VOOM, out_DESeq2=out_DESeq2,
                 fdrtype=fdrtype,priordf=edgeR_priordf,fdrthresh=fdrthresh,outputdir='.',
                 myTitle=myTitle,useNDF=F,libSize=c(),filterquantile=fQ,subjects=subjects,
                 doDESeq2=doDESeq2,doVoom=doVoom,doCamera=doCamera,doedgeR=doedgeR,org=org,
                 histgmt=history_gmt,bigmt=builtin_gmt,DESeq_fitType=DESeq_fitType)
sessionInfo()
+ − 749 ]]>
+ − 750 </configfile>
+ − 751 </configfiles>
+ − 752 <help>
+ − 753
+ − 754 ----
+ − 755
+ − 756 **What it does**
+ − 757
+ − 758 Allows short read sequence counts from controlled experiments to be analysed for differentially expressed genes.
+ − 759 Optionally adds a term for subject if not all samples are independent or if some other factor needs to be blocked in the design.
+ − 760
+ − 761 **Input**
+ − 762
Requires a count matrix as a tabular file. These are best made using the companion HTSeq_ based counter Galaxy wrapper
and your favourite gene model to generate inputs. Each row is a genomic feature (e.g. a gene or an exon) and each column holds the
non-negative integer count of reads from one sample overlapping the feature.
+ − 766 The matrix must have a header row uniquely identifying the source samples, and unique row names in
+ − 767 the first column. Typically the row names are gene symbols or probe ids for downstream use in GSEA and other methods.
+ − 768
+ − 769 **Specifying comparisons**
+ − 770
+ − 771 This is basically dumbed down for two factors - case vs control.
+ − 772
+ − 773 More complex interfaces are possible but painful at present.
+ − 774 Probably need to specify a phenotype file to do this better.
+ − 775 Work in progress. Send code.
+ − 776
+ − 777 If you have (eg) paired samples and wish to include a term in the GLM to account for some other factor (subject in the case of paired samples),
+ − 778 put a comma separated list of indicators for every sample (whether modelled or not!) indicating (eg) the subject number or
+ − 779 A list of integers, one for each subject or an empty string if samples are all independent.
+ − 780 If not empty, there must be exactly as many integers in the supplied integer list as there are columns (samples) in the count matrix.
+ − 781 Integers for samples that are not in the analysis *must* be present in the string as filler even if not used.
+ − 782
So if you have 2 pairs out of 6 samples, you still need to supply unique integers for the unpaired samples.
For example, with 6 samples where the first two are independent and columns 3/4 and 5/6 are pairs from two different subjects, you might use
8,9,1,1,2,2
as subject IDs: the two 1s mark the paired samples in columns 3/4 and the two 2s mark the pair in columns 5/6.
+ − 787
+ − 788 **Methods available**
+ − 789
+ − 790 You can run 3 popular Bioconductor packages available for count data.
+ − 791
+ − 792 edgeR - see edgeR_ for details
+ − 793
+ − 794 VOOM/limma - see limma_VOOM_ for details
+ − 795
+ − 796 DESeq2 - see DESeq2_ for details
+ − 797
+ − 798 and optionally camera in edgeR which works better if MSigDB is installed.
+ − 799
+ − 800 **Outputs**
+ − 801
+ − 802 Some helpful plots and analysis results. Note that most of these are produced using R code
+ − 803 suggested by the excellent documentation and vignettes for the Bioconductor
+ − 804 packages invoked. The Tool Factory is used to automatically lay these out for you to enjoy.
+ − 805
+ − 806 ***old rant on changes to Bioconductor package variable names between versions***
+ − 807
+ − 808 The edgeR authors made a small cosmetic change in the name of one important variable (from p.value to PValue)
+ − 809 breaking this and all other code that assumed the old name for this variable,
+ − 810 between edgeR2.4.4 and 2.4.6 (the version for R 2.14 as at the time of writing).
+ − 811 This means that all code using edgeR is sensitive to the version. I think this was a very unwise thing
+ − 812 to do because it wasted hours of my time to track down and will similarly cost other edgeR users dearly
+ − 813 when their old scripts break. This tool currently now works with 2.4.6.
+ − 814
+ − 815 **Note on prior.N**
+ − 816
+ − 817 http://seqanswers.com/forums/showthread.php?t=5591 says:
+ − 818
+ − 819 *prior.n*
+ − 820
+ − 821 The value for prior.n determines the amount of smoothing of tagwise dispersions towards the common dispersion.
+ − 822 You can think of it as like a "weight" for the common value. (It is actually the weight for the common likelihood
+ − 823 in the weighted likelihood equation). The larger the value for prior.n, the more smoothing, i.e. the closer your
+ − 824 tagwise dispersion estimates will be to the common dispersion. If you use a prior.n of 1, then that gives the
+ − 825 common likelihood the weight of one observation.
+ − 826
+ − 827 In answer to your question, it is a good thing to squeeze the tagwise dispersions towards a common value,
+ − 828 or else you will be using very unreliable estimates of the dispersion. I would not recommend using the value that
+ − 829 you obtained from estimateSmoothing()---this is far too small and would result in virtually no moderation
+ − 830 (squeezing) of the tagwise dispersions. How many samples do you have in your experiment?
+ − 831 What is the experimental design? If you have few samples (less than 6) then I would suggest a prior.n of at least 10.
+ − 832 If you have more samples, then the tagwise dispersion estimates will be more reliable,
+ − 833 so you could consider using a smaller prior.n, although I would hesitate to use a prior.n less than 5.
+ − 834
+ − 835
+ − 836 From Bioconductor Digest, Vol 118, Issue 5, Gordon writes:
+ − 837
+ − 838 Dear Dorota,
+ − 839
+ − 840 The important settings are prior.df and trend.
+ − 841
+ − 842 prior.n and prior.df are related through prior.df = prior.n * residual.df,
+ − 843 and your experiment has residual.df = 36 - 12 = 24. So the old setting of
+ − 844 prior.n=10 is equivalent for your data to prior.df = 240, a very large
+ − 845 value. Going the other way, the new setting of prior.df=10 is equivalent
+ − 846 to prior.n=10/24.
+ − 847
+ − 848 To recover old results with the current software you would use
+ − 849
+ − 850 estimateTagwiseDisp(object, prior.df=240, trend="none")
+ − 851
+ − 852 To get the new default from old software you would use
+ − 853
+ − 854 estimateTagwiseDisp(object, prior.n=10/24, trend=TRUE)
+ − 855
+ − 856 Actually the old trend method is equivalent to trend="loess" in the new
+ − 857 software. You should use plotBCV(object) to see whether a trend is
+ − 858 required.
+ − 859
+ − 860 Note you could also use
+ − 861
+ − 862 prior.n = getPriorN(object, prior.df=10)
+ − 863
+ − 864 to map between prior.df and prior.n.
+ − 865
+ − 866 ----
+ − 867
+ − 868 **Attributions**
+ − 869
+ − 870 edgeR - edgeR_
+ − 871
+ − 872 VOOM/limma - limma_VOOM_
+ − 873
DESeq2 - DESeq2_
+ − 875
+ − 876 See above for Bioconductor package documentation for packages exposed in Galaxy by this tool and app store package.
+ − 877
+ − 878 Galaxy_ (that's what you are using right now!) for gluing everything together
+ − 879
+ − 880 Otherwise, all code and documentation comprising this tool was written by Ross Lazarus and is
+ − 881 licensed to you under the LGPL_ like other rgenetics artefacts
+ − 882
+ − 883 .. _LGPL: http://www.gnu.org/copyleft/lesser.html
+ − 884 .. _HTSeq: http://www-huber.embl.de/users/anders/HTSeq/doc/index.html
+ − 885 .. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html
+ − 886 .. _DESeq2: http://www.bioconductor.org/packages/release/bioc/html/DESeq2.html
+ − 887 .. _limma_VOOM: http://www.bioconductor.org/packages/release/bioc/html/limma.html
+ − 888 .. _Galaxy: http://getgalaxy.org
+ − 889 </help>
+ − 890
+ − 891 </tool>
+ − 892
+ − 893