# HG changeset patch # User fubar # Date 1375857700 14400 # Node ID c4ee2e69d691a66948404f46d8f4dfadb07cc874 # Parent ddd76b6db251f3ae20d68c397b3d32b47afcd699 Uploaded diff -r ddd76b6db251 -r c4ee2e69d691 rgedgeRpaired.xml --- a/rgedgeRpaired.xml Wed Aug 07 02:10:19 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1084 +0,0 @@ - - models using BioConductor packages - - biocbasics - r3 - graphicsmagick - ghostscript - - - - rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "DifferentialCounts" - --output_dir "$html_file.files_path" --output_html "$html_file" --make_HTML "yes" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - edgeR['doedgeR'] == "T" - - - DESeq2['doDESeq2'] == "T" - - - doVoom == "T" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - nsamp) { - dm =dm[1:nsamp,] - #sub = paste('Showing',nsamp,'contigs ranked for evidence for differential abundance out of',nprobes,'total') - } - newcolnames = substr(colnames(dm),1,20) - colnames(dm) = newcolnames - pdf(outpdfname) - heatmap.2(dm,main=myTitle,ColSideColors=pcols,col=topo.colors(100),dendrogram="col",key=T,density.info='none', - Rowv=F,scale='row',trace='none',margins=c(8,8),cexRow=0.4,cexCol=0.5) - dev.off() -} - -hmap = function(cmat,nmeans=4,outpdfname="heatMap.pdf",nsamp=250,TName='Treatment',group=NA,myTitle="Title goes here") -{ - # for 2 groups only was - #col.map = function(g) {if (g==TName) "#FF0000" else "#0000FF"} - #pcols = unlist(lapply(group,col.map)) - gu = unique(group) - colours = rainbow(length(gu),start=0.3,end=0.6) - pcols = colours[match(group,gu)] - nrows = nrow(cmat) - mtitle = paste(myTitle,'Heatmap: n contigs =',nrows) - if (nrows > nsamp) { - cmat = cmat[c(1:nsamp),] - mtitle = paste('Heatmap: Top ',nsamp,' DE contigs (of ',nrows,')',sep='') - } - newcolnames = substr(colnames(cmat),1,20) - colnames(cmat) = newcolnames - pdf(outpdfname) - heatmap(cmat,scale='row',main=mtitle,cexRow=0.3,cexCol=0.4,Rowv=NA,ColSideColors=pcols) - dev.off() -} - -qqPlot = function(descr='qqplot',pvector, outpdf='qqplot.pdf',...) -# stolen from https://gist.github.com/703512 -{ - o = -log10(sort(pvector,decreasing=F)) - e = -log10( 1:length(o)/length(o) ) - o[o==-Inf] = reallysmall - o[o==Inf] = reallybig - maint = descr - pdf(outpdf) - plot(e,o,pch=19,cex=1, main=maint, ..., - xlab=expression(Expected~~-log[10](italic(p))), - ylab=expression(Observed~~-log[10](italic(p))), - xlim=c(0,max(e)), ylim=c(0,max(o))) - lines(e,e,col="red") - grid(col = "lightgray", lty = "dotted") - dev.off() -} - -smearPlot = function(DGEList,deTags, outSmear, outMain) - { - pdf(outSmear) - plotSmear(DGEList,de.tags=deTags,main=outMain) - grid(col="lightgray", lty="dotted") - dev.off() - } - -boxPlot = function(rawrs,cleanrs,maint,myTitle,pdfname) -{ # - nc = ncol(rawrs) - for (i in c(1:nc)) {rawrs[(rawrs[,i] < 0),i] = NA} - fullnames = colnames(rawrs) - newcolnames = substr(colnames(rawrs),1,20) - colnames(rawrs) = newcolnames - newcolnames = substr(colnames(cleanrs),1,20) - colnames(cleanrs) = newcolnames - defpar = par(no.readonly=T) - print.noquote('raw contig counts by sample:') - print.noquote(summary(rawrs)) - print.noquote('normalised contig counts by sample:') - print.noquote(summary(cleanrs)) - pdf(pdfname) - par(mfrow=c(1,2)) - boxplot(rawrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('Raw:',maint)) - grid(col="lightgray",lty="dotted") - boxplot(cleanrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('After ',maint)) - grid(col="lightgray",lty="dotted") - dev.off() - pdfname = "sample_counts_histogram.pdf" - nc = ncol(rawrs) - print.noquote(paste('Using ncol rawrs=',nc)) - ncroot = round(sqrt(nc)) - if (ncroot*ncroot < nc) { ncroot = ncroot + 1 } - m = c() - for (i in c(1:nc)) { - rhist = hist(rawrs[,i],breaks=100,plot=F) - m = append(m,max(rhist\$counts)) - } - ymax = max(m) - ncols = length(fullnames) - if (ncols > 20) - { - scale = 7*ncols/20 - pdf(pdfname,width=scale,height=scale) - } else { - pdf(pdfname) - } - par(mfrow=c(ncroot,ncroot)) - for (i in c(1:nc)) { - hist(rawrs[,i], main=paste("Contig logcount",i), xlab='log raw count', col="maroon", - breaks=100,sub=fullnames[i],cex=0.8,ylim=c(0,ymax)) - } - dev.off() - par(defpar) - -} - -cumPlot = function(rawrs,cleanrs,maint,myTitle) -{ # updated to use ecdf - pdfname = "Filtering_rowsum_bar_charts.pdf" - defpar = par(no.readonly=T) - lrs = log(rawrs,10) - lim = max(lrs) - pdf(pdfname) - par(mfrow=c(2,1)) - hist(lrs,breaks=100,main=paste('Before:',maint),xlab="# Reads (log)", - ylab="Count",col="maroon",sub=myTitle, xlim=c(0,lim),las=1) - grid(col="lightgray", lty="dotted") - lrs = log(cleanrs,10) - hist(lrs,breaks=100,main=paste('After:',maint),xlab="# Reads (log)", - ylab="Count",col="maroon",sub=myTitle,xlim=c(0,lim),las=1) - grid(col="lightgray", lty="dotted") - dev.off() - par(defpar) -} - -cumPlot1 = function(rawrs,cleanrs,maint,myTitle) -{ # updated to use ecdf - pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"RowsumCum.pdf",sep='_') - pdf(pdfname) - par(mfrow=c(2,1)) - lastx = max(rawrs) - rawe = knots(ecdf(rawrs)) - cleane = knots(ecdf(cleanrs)) - cy = 1:length(cleane)/length(cleane) - ry = 1:length(rawe)/length(rawe) - plot(rawe,ry,type='l',main=paste('Before',maint),xlab="Log Contig Total Reads", - ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle) - grid(col="blue") - plot(cleane,cy,type='l',main=paste('After',maint),xlab="Log Contig Total Reads", - ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle) - grid(col="blue") - dev.off() -} - - - -doGSEAold = function(y=NULL,design=NULL,histgmt="", - bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt", - ntest=0, myTitle="myTitle", outfname="GSEA.xls", minnin=5, maxnin=2000,fdrthresh=0.05,fdrtype="BH") -{ - sink('Camera.log') - genesets = c() - if (bigmt > "") - { - bigenesets = readLines(bigmt) - genesets = bigenesets - } - if (histgmt > "") - { - hgenesets = readLines(histgmt) - if (bigmt > "") { - genesets = rbind(genesets,hgenesets) - } else { - genesets = hgenesets - } # use only history if no bi - } - print.noquote(paste("@@@read",length(genesets), 'genesets from',histgmt,bigmt)) - genesets = strsplit(genesets,'\t') # tabular. genesetid\tURLorwhatever\tgene_1\t..\tgene_n - outf = outfname - head=paste(myTitle,'edgeR GSEA') - write(head,file=outfname,append=F) - ntest=length(genesets) - urownames = toupper(rownames(y)) - upcam = c() - downcam = c() - for (i in 1:ntest) { - gs = unlist(genesets[i]) - g = gs[1] # geneset_id - u = gs[2] - if (u > "") { u = paste("",u,"",sep="") } - glist = gs[3:length(gs)] # member gene symbols - glist = toupper(glist) - inglist = urownames %in% glist - nin = sum(inglist) - if ((nin > minnin) && (nin < maxnin)) { - ### print(paste('@@found',sum(inglist),'genes in glist')) - camres = camera(y=y,index=inglist,design=design) - if (! is.null(camres)) { - rownames(camres) = g # gene set name - camres = cbind(GeneSet=g,URL=u,camres) - if (camres\$Direction == "Up") - { - upcam = rbind(upcam,camres) } else { - downcam = rbind(downcam,camres) - } - } - } - } - uscam = upcam[order(upcam\$PValue),] - unadjp = uscam\$PValue - uscam\$adjPValue = p.adjust(unadjp,method=fdrtype) - nup = max(10,sum((uscam\$adjPValue < fdrthresh))) - dscam = downcam[order(downcam\$PValue),] - unadjp = dscam\$PValue - dscam\$adjPValue = p.adjust(unadjp,method=fdrtype) - ndown = max(10,sum((dscam\$adjPValue < fdrthresh))) - write.table(uscam,file=paste('camera_up',outfname,sep='_'),quote=F,sep='\t',row.names=F) - write.table(dscam,file=paste('camera_down',outfname,sep='_'),quote=F,sep='\t',row.names=F) - print.noquote(paste('@@@@@ Camera up top',nup,'gene sets:')) - write.table(head(uscam,nup),file="",quote=F,sep='\t',row.names=F) - print.noquote(paste('@@@@@ Camera down top',ndown,'gene sets:')) - write.table(head(dscam,ndown),file="",quote=F,sep='\t',row.names=F) - sink() -} - - - - -doGSEA = function(y=NULL,design=NULL,histgmt="", - bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt", - ntest=0, myTitle="myTitle", outfname="GSEA.xls", minnin=5, maxnin=2000,fdrthresh=0.05,fdrtype="BH") -{ - sink('Camera.log') - genesets = c() - if (bigmt > "") - { - bigenesets = readLines(bigmt) - genesets = bigenesets - } - if (histgmt > "") - { - hgenesets = readLines(histgmt) - if (bigmt > "") { - genesets = rbind(genesets,hgenesets) - } else { - genesets = hgenesets - } # use only history if no bi - } - print.noquote(paste("@@@read",length(genesets), 'genesets from',histgmt,bigmt)) - genesets = strsplit(genesets,'\t') # tabular. genesetid\tURLorwhatever\tgene_1\t..\tgene_n - outf = outfname - head=paste(myTitle,'edgeR GSEA') - write(head,file=outfname,append=F) - ntest=length(genesets) - urownames = toupper(rownames(y)) - upcam = c() - downcam = c() - incam = c() - urls = c() - gsids = c() - for (i in 1:ntest) { - gs = unlist(genesets[i]) - gsid = gs[1] # geneset_id - url = gs[2] - if (url > "") { url = paste("",url,"",sep="") } - glist = gs[3:length(gs)] # member gene symbols - glist = toupper(glist) - inglist = urownames %in% glist - nin = sum(inglist) - if ((nin > minnin) && (nin < maxnin)) { - incam = c(incam,inglist) - gsids = c(gsids,gsid) - urls = c(urls,url) - } - } - incam = as.list(incam) - names(incam) = gsids - allcam = camera(y=y,index=incam,design=design) - allcamres = cbind(geneset=gsids,allcam,URL=urls) - for (i in 1:ntest) { - camres = allcamres[i] - res = try(test = (camres\$Direction == "Up")) - if ("try-error" %in% class(res)) { - cat("test failed, camres = :") - print.noquote(camres) - } else { if (camres\$Direction == "Up") - { upcam = rbind(upcam,camres) - } else { downcam = rbind(downcam,camres) - } - - } - } - uscam = upcam[order(upcam\$PValue),] - unadjp = uscam\$PValue - uscam\$adjPValue = p.adjust(unadjp,method=fdrtype) - nup = max(10,sum((uscam\$adjPValue < fdrthresh))) - dscam = downcam[order(downcam\$PValue),] - unadjp = dscam\$PValue - dscam\$adjPValue = p.adjust(unadjp,method=fdrtype) - ndown = max(10,sum((dscam\$adjPValue < fdrthresh))) - write.table(uscam,file=paste('camera_up',outfname,sep='_'),quote=F,sep='\t',row.names=F) - write.table(dscam,file=paste('camera_down',outfname,sep='_'),quote=F,sep='\t',row.names=F) - print.noquote(paste('@@@@@ Camera up top',nup,'gene sets:')) - write.table(head(uscam,nup),file="",quote=F,sep='\t',row.names=F) - print.noquote(paste('@@@@@ Camera down top',ndown,'gene sets:')) - write.table(head(dscam,ndown),file="",quote=F,sep='\t',row.names=F) - sink() - } - - -edgeIt = function (Count_Matrix=c(),group=c(),out_edgeR=F,out_VOOM=F,out_DESeq2=F,fdrtype='fdr',priordf=5, - fdrthresh=0.05,outputdir='.', myTitle='Differential Counts',libSize=c(),useNDF=F, - filterquantile=0.2, subjects=c(),mydesign=NULL, - doDESeq2=T,doVoom=T,doCamera=T,doedgeR=T,org='hg19', - histgmt="", bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt", - doCook=F,DESeq_fitType="parameteric") -{ - # Error handling - if (length(unique(group))!=2){ - print("Number of conditions identified in experiment does not equal 2") - q() - } - require(edgeR) - options(width = 512) - mt = paste(unlist(strsplit(myTitle,'_')),collapse=" ") - allN = nrow(Count_Matrix) - nscut = round(ncol(Count_Matrix)/2) - colTotmillionreads = colSums(Count_Matrix)/1e6 - counts.dataframe = as.data.frame(c()) - rawrs = rowSums(Count_Matrix) - nonzerod = Count_Matrix[(rawrs > 0),] # remove all zero count genes - nzN = nrow(nonzerod) - nzrs = rowSums(nonzerod) - zN = allN - nzN - print('# Quantiles for non-zero row counts:',quote=F) - print(quantile(nzrs,probs=seq(0,1,0.1)),quote=F) - if (useNDF == T) - { - gt1rpin3 = rowSums(Count_Matrix/expandAsMatrix(colTotmillionreads,dim(Count_Matrix)) >= 1) >= nscut - lo = colSums(Count_Matrix[!gt1rpin3,]) - workCM = Count_Matrix[gt1rpin3,] - cleanrs = rowSums(workCM) - cleanN = length(cleanrs) - meth = paste( "After removing",length(lo),"contigs with fewer than ",nscut," sample read counts >= 1 per million, there are",sep="") - print(paste("Read",allN,"contigs. Removed",zN,"contigs with no reads.",meth,cleanN,"contigs"),quote=F) - maint = paste('Filter >=1/million reads in >=',nscut,'samples') - } else { - useme = (nzrs > quantile(nzrs,filterquantile)) - workCM = nonzerod[useme,] - lo = colSums(nonzerod[!useme,]) - cleanrs = rowSums(workCM) - cleanN = length(cleanrs) - meth = paste("After filtering at count quantile =",filterquantile,", there are",sep="") - print(paste('Read',allN,"contigs. Removed",zN,"with no reads.",meth,cleanN,"contigs"),quote=F) - maint = paste('Filter below',filterquantile,'quantile') - } - cumPlot(rawrs=rawrs,cleanrs=cleanrs,maint=maint,myTitle=myTitle) - allgenes = rownames(workCM) - reg = "^chr([0-9]+):([0-9]+)-([0-9]+)" - genecards=" 0.8) # is ucsc style string - { - print("@@ using ucsc substitution for urls") - contigurls = paste0(ucsc,"&position=chr",testreg[,2],":",testreg[,3],"-",testreg[,4],"\'>",allgenes,"") - } else { - print("@@ using genecards substitution for urls") - contigurls = paste0(genecards,allgenes,"\'>",allgenes,"") - } - print.noquote("# urls") - print.noquote(head(contigurls)) - print(paste("# Total low count contigs per sample = ",paste(lo,collapse=',')),quote=F) - cmrowsums = rowSums(workCM) - TName=unique(group)[1] - CName=unique(group)[2] - if (is.null(mydesign)) { - if (length(subjects) == 0) - { - mydesign = model.matrix(~group) - } - else { - subjf = factor(subjects) - mydesign = model.matrix(~subjf+group) # we block on subject so make group last to simplify finding it - } - } - print.noquote(paste('Using samples:',paste(colnames(workCM),collapse=','))) - print.noquote('Using design matrix:') - print.noquote(mydesign) - if (doedgeR) { - sink('edgeR.log') - #### Setup DGEList object - DGEList = DGEList(counts=workCM, group = group) - DGEList = calcNormFactors(DGEList) - - DGEList = estimateGLMCommonDisp(DGEList,mydesign) - comdisp = DGEList\$common.dispersion - DGEList = estimateGLMTrendedDisp(DGEList,mydesign) - if (edgeR_priordf > 0) { - print.noquote(paste("prior.df =",edgeR_priordf)) - DGEList = estimateGLMTagwiseDisp(DGEList,mydesign,prior.df = edgeR_priordf) - } else { - DGEList = estimateGLMTagwiseDisp(DGEList,mydesign) - } - DGLM = glmFit(DGEList,design=mydesign) - DE = glmLRT(DGLM,coef=ncol(DGLM\$design)) # always last one - subject is first if needed - efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors - normData = (1e+06*DGEList\$counts/efflib) - uoutput = cbind( - Name=as.character(rownames(DGEList\$counts)), - DE\$table, - adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype), - Dispersion=DGEList\$tagwise.dispersion,totreads=cmrowsums,normData, - DGEList\$counts - ) - soutput = uoutput[order(DE\$table\$PValue),] # sorted into p value order - for quick toptable - goodness = gof(DGLM, pcutoff=fdrthresh) - if (sum(goodness\$outlier) > 0) { - print.noquote('GLM outliers:') - print(paste(rownames(DGLM)[(goodness\$outlier)],collapse=','),quote=F) - } else { - print('No GLM fit outlier genes found\n') - } - z = limma::zscoreGamma(goodness\$gof.statistic, shape=goodness\$df/2, scale=2) - pdf("edgeR_GoodnessofFit.pdf") - qq = qqnorm(z, panel.first=grid(), main="tagwise dispersion") - abline(0,1,lwd=3) - points(qq\$x[goodness\$outlier],qq\$y[goodness\$outlier], pch=16, col="maroon") - dev.off() - estpriorn = getPriorN(DGEList) - print(paste("Common Dispersion =",comdisp,"CV = ",sqrt(comdisp),"getPriorN = ",estpriorn),quote=F) - efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors - normData = (1e+06*DGEList\$counts/efflib) - uniqueg = unique(group) - #### Plot MDS - sample_colors = match(group,levels(group)) - sampleTypes = levels(factor(group)) - print.noquote(sampleTypes) - pdf("edgeR_MDSplot.pdf") - plotMDS.DGEList(DGEList,main=paste("edgeR MDS for",myTitle),cex=0.5,col=sample_colors,pch=sample_colors) - legend(x="topleft", legend = sampleTypes,col=c(1:length(sampleTypes)), pch=19) - grid(col="blue") - dev.off() - colnames(normData) = paste( colnames(normData),'N',sep="_") - print(paste('Raw sample read totals',paste(colSums(nonzerod,na.rm=T),collapse=','))) - nzd = data.frame(log(nonzerod + 1e-2,10)) - try( boxPlot(rawrs=nzd,cleanrs=log(normData,10),maint='TMM Normalisation',myTitle=myTitle,pdfname="edgeR_raw_norm_counts_box.pdf") ) - write.table(soutput,file=out_edgeR, quote=FALSE, sep="\t",row.names=F) - tt = cbind( - Name=as.character(rownames(DGEList\$counts)), - DE\$table, - adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype), - Dispersion=DGEList\$tagwise.dispersion,totreads=cmrowsums - ) - print.noquote("# edgeR Top tags\n") - tt = cbind(tt,URL=contigurls) # add to end so table isn't laid out strangely - tt = tt[order(DE\$table\$PValue),] - print.noquote(tt[1:50,]) - deTags = rownames(uoutput[uoutput\$adj.p.value < fdrthresh,]) - nsig = length(deTags) - print(paste('#',nsig,'tags significant at adj p=',fdrthresh),quote=F) - deColours = ifelse(deTags,'red','black') - pdf("edgeR_BCV_vs_abundance.pdf") - plotBCV(DGEList, cex=0.3, main="Biological CV vs abundance") - dev.off() - dg = DGEList[order(DE\$table\$PValue),] - #normData = (1e+06 * dg\$counts/expandAsMatrix(dg\$samples\$lib.size, dim(dg))) - efflib = dg\$samples\$lib.size*dg\$samples\$norm.factors - normData = (1e+06*dg\$counts/efflib) - outpdfname="edgeR_top_100_heatmap.pdf" - hmap2(normData,nsamp=100,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste('edgeR Heatmap',myTitle)) - outSmear = "edgeR_smearplot.pdf" - outMain = paste("Smear Plot for ",TName,' Vs ',CName,' (FDR@',fdrthresh,' N = ',nsig,')',sep='') - smearPlot(DGEList=DGEList,deTags=deTags, outSmear=outSmear, outMain = outMain) - qqPlot(descr=paste(myTitle,'edgeR adj p QQ plot'),pvector=tt\$adj.p.value,outpdf='edgeR_qqplot.pdf') - norm.factor = DGEList\$samples\$norm.factors - topresults.edgeR = soutput[which(soutput\$adj.p.value < fdrthresh), ] - edgeRcountsindex = which(allgenes %in% rownames(topresults.edgeR)) - edgeRcounts = rep(0, length(allgenes)) - edgeRcounts[edgeRcountsindex] = 1 # Create venn diagram of hits - sink() - } ### doedgeR - if (doDESeq2 == T) - { - sink("DESeq2.log") - # DESeq2 - require('DESeq2') - library('RColorBrewer') - if (length(subjects) == 0) - { - pdata = data.frame(Name=colnames(workCM),Rx=group,row.names=colnames(workCM)) - deSEQds = DESeqDataSetFromMatrix(countData = workCM, colData = pdata, design = formula(~ Rx)) - } else { - pdata = data.frame(Name=colnames(workCM),Rx=group,subjects=subjects,row.names=colnames(workCM)) - deSEQds = DESeqDataSetFromMatrix(countData = workCM, colData = pdata, design = formula(~ subjects + Rx)) - } - #DESeq2 = DESeq(deSEQds,fitType='local',pAdjustMethod=fdrtype) - #rDESeq = results(DESeq2) - #newCountDataSet(workCM, group) - deSeqDatsizefac = estimateSizeFactors(deSEQds) - deSeqDatdisp = estimateDispersions(deSeqDatsizefac,fitType=DESeq_fitType) - resDESeq = nbinomWaldTest(deSeqDatdisp, pAdjustMethod=fdrtype) - rDESeq = as.data.frame(results(resDESeq)) - rDESeq = cbind(Contig=rownames(workCM),rDESeq,NReads=cmrowsums,URL=contigurls) - srDESeq = rDESeq[order(rDESeq\$pvalue),] - qqPlot(descr=paste(myTitle,'DESeq2 adj p qq plot'),pvector=rDESeq\$padj,outpdf='DESeq2_qqplot.pdf') - cat("# DESeq top 50\n") - print.noquote(srDESeq[1:50,]) - write.table(srDESeq,file=out_DESeq2, quote=FALSE, sep="\t",row.names=F) - topresults.DESeq = rDESeq[which(rDESeq\$padj < fdrthresh), ] - DESeqcountsindex = which(allgenes %in% rownames(topresults.DESeq)) - DESeqcounts = rep(0, length(allgenes)) - DESeqcounts[DESeqcountsindex] = 1 - pdf("DESeq2_dispersion_estimates.pdf") - plotDispEsts(resDESeq) - dev.off() - ysmall = abs(min(rDESeq\$log2FoldChange)) - ybig = abs(max(rDESeq\$log2FoldChange)) - ylimit = min(4,ysmall,ybig) - pdf("DESeq2_MA_plot.pdf") - plotMA(resDESeq,main=paste(myTitle,"DESeq2 MA plot"),ylim=c(-ylimit,ylimit)) - dev.off() - rlogres = rlogTransformation(resDESeq) - sampledists = dist( t( assay(rlogres) ) ) - sdmat = as.matrix(sampledists) - pdf("DESeq2_sample_distance_plot.pdf") - heatmap.2(sdmat,trace="none",main=paste(myTitle,"DESeq2 sample distances"), - col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(255)) - dev.off() - ###outpdfname="DESeq2_top50_heatmap.pdf" - ###hmap2(sresDESeq,nsamp=50,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste('DESeq2 vst rlog Heatmap',myTitle)) - sink() - result = try( (ppca = plotPCA( varianceStabilizingTransformation(deSeqDatdisp,blind=T), intgroup=c("Rx","Name")) ) ) - if ("try-error" %in% class(result)) { - print.noquote('DESeq2 plotPCA failed.') - } else { - pdf("DESeq2_PCA_plot.pdf") - #### wtf - print? Seems needed to get this to work - print(ppca) - dev.off() - } - } - - if (doVoom == T) { - sink('VOOM.log') - if (doedgeR == F) { - #### Setup DGEList object - DGEList = DGEList(counts=workCM, group = group) - DGEList = calcNormFactors(DGEList) - DGEList = estimateGLMCommonDisp(DGEList,mydesign) - DGEList = estimateGLMTrendedDisp(DGEList,mydesign) - DGEList = estimateGLMTagwiseDisp(DGEList,mydesign) - DGEList = estimateGLMTagwiseDisp(DGEList,mydesign) - norm.factor = DGEList\$samples\$norm.factors - } - pdf("VOOM_mean_variance_plot.pdf") - dat.voomed = voom(DGEList, mydesign, plot = TRUE, lib.size = colSums(workCM) * norm.factor) - dev.off() - # Use limma to fit data - fit = lmFit(dat.voomed, mydesign) - fit = eBayes(fit) - rvoom = topTable(fit, coef = length(colnames(mydesign)), adj = fdrtype, n = Inf, sort="none") - qqPlot(descr=paste(myTitle,'VOOM-limma adj p QQ plot'),pvector=rvoom\$adj.P.Val,outpdf='VOOM_qqplot.pdf') - rownames(rvoom) = rownames(workCM) - rvoom = cbind(rvoom,NReads=cmrowsums,URL=contigurls) - srvoom = rvoom[order(rvoom\$P.Value),] - cat("# VOOM top 50\n") - print(srvoom[1:50,]) - write.table(srvoom,file=out_VOOM, quote=FALSE, sep="\t",row.names=F) - # Use an FDR cutoff to find interesting samples for edgeR, DESeq and voom/limma - topresults.voom = rvoom[which(rvoom\$adj.P.Val < fdrthresh), ] - voomcountsindex = which(allgenes %in% topresults.voom\$ID) - voomcounts = rep(0, length(allgenes)) - voomcounts[voomcountsindex] = 1 - sink() - } - - if (doCamera) { - doGSEA(y=DGEList,design=mydesign,histgmt=histgmt,bigmt=bigmt,ntest=20,myTitle=myTitle, - outfname=paste(mt,"GSEA.xls",sep="_"),fdrthresh=fdrthresh,fdrtype=fdrtype) - } - - if ((doDESeq2==T) || (doVoom==T) || (doedgeR==T)) { - if ((doVoom==T) && (doDESeq2==T) && (doedgeR==T)) { - vennmain = paste(mt,'Voom,edgeR and DESeq2 overlap at FDR=',fdrthresh) - counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts, - VOOM_limma = voomcounts, row.names = allgenes) - } else if ((doDESeq2==T) && (doedgeR==T)) { - vennmain = paste(mt,'DESeq2 and edgeR overlap at FDR=',fdrthresh) - counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts, row.names = allgenes) - } else if ((doVoom==T) && (doedgeR==T)) { - vennmain = paste(mt,'Voom and edgeR overlap at FDR=',fdrthresh) - counts.dataframe = data.frame(edgeR = edgeRcounts, VOOM_limma = voomcounts, row.names = allgenes) - } - - if (nrow(counts.dataframe > 1)) { - counts.venn = vennCounts(counts.dataframe) - vennf = "Venn_significant_genes_overlap.pdf" - pdf(vennf) - vennDiagram(counts.venn,main=vennmain,col="maroon") - dev.off() - } - } #### doDESeq2 or doVoom - -} -#### Done - -###sink(stdout(),append=T,type="message") -builtin_gmt = "" -history_gmt = "" -history_gmt_name = "" -out_edgeR = F -out_DESeq2 = F -out_VOOM = "$out_VOOM" -doDESeq2 = $DESeq2.doDESeq2 # make these T or F -doVoom = $doVoom -doCamera = F -doedgeR = $edgeR.doedgeR -edgeR_priordf = 0 - - -#if $doVoom == "T": - out_VOOM = "$out_VOOM" -#end if - -#if $DESeq2.doDESeq2 == "T": - out_DESeq2 = "$out_DESeq2" - DESeq_fitType = "$DESeq2.DESeq_fitType" -#end if - -#if $edgeR.doedgeR == "T": - out_edgeR = "$out_edgeR" - edgeR_priordf = $edgeR.edgeR_priordf -#end if - - - -if (sum(c(doedgeR,doVoom,doDESeq2)) == 0) -{ -write("No methods chosen - nothing to do! Please try again after choosing one or more methods", stderr()) -quit(save="no",status=2) -} - -Out_Dir = "$html_file.files_path" -Input = "$input1" -TreatmentName = "$treatment_name" -TreatmentCols = "$Treat_cols" -ControlName = "$control_name" -ControlCols= "$Control_cols" -org = "$input1.dbkey" -if (org == "") { org = "hg19"} -fdrtype = "$fdrtype" -fdrthresh = $fdrthresh -useNDF = $useNDF -fQ = $fQ # non-differential centile cutoff -myTitle = "$title" -sids = strsplit("$subjectids",',') -subjects = unlist(sids) -nsubj = length(subjects) -TCols = as.numeric(strsplit(TreatmentCols,",")[[1]])-1 -CCols = as.numeric(strsplit(ControlCols,",")[[1]])-1 -cat('Got TCols=') -cat(TCols) -cat('; CCols=') -cat(CCols) -cat('\n') -useCols = c(TCols,CCols) -if (file.exists(Out_Dir) == F) dir.create(Out_Dir) -Count_Matrix = read.table(Input,header=T,row.names=1,sep='\t') #Load tab file assume header -snames = colnames(Count_Matrix) -nsamples = length(snames) -if (nsubj > 0 & nsubj != nsamples) { -options("show.error.messages"=T) -mess = paste('Fatal error: Supplied subject id list',paste(subjects,collapse=','), - 'has length',nsubj,'but there are',nsamples,'samples',paste(snames,collapse=',')) -write(mess, stderr()) -quit(save="no",status=4) -} -if (length(subjects) != 0) {subjects = subjects[useCols]} -Count_Matrix = Count_Matrix[,useCols] ### reorder columns -rn = rownames(Count_Matrix) -islib = rn %in% c('librarySize','NotInBedRegions') -LibSizes = Count_Matrix[subset(rn,islib),][1] # take first -Count_Matrix = Count_Matrix[subset(rn,! islib),] -group = c(rep(TreatmentName,length(TCols)), rep(ControlName,length(CCols)) ) #Build a group descriptor -group = factor(group, levels=c(ControlName,TreatmentName)) -colnames(Count_Matrix) = paste(group,colnames(Count_Matrix),sep="_") #Relable columns -results = edgeIt(Count_Matrix=Count_Matrix,group=group, out_edgeR=out_edgeR, out_VOOM=out_VOOM, out_DESeq2=out_DESeq2, - fdrtype='BH',mydesign=NULL,priordf=edgeR_priordf,fdrthresh=fdrthresh,outputdir='.', - myTitle=myTitle,useNDF=F,libSize=c(),filterquantile=fQ,subjects=subjects, - doDESeq2=doDESeq2,doVoom=doVoom,doCamera=doCamera,doedgeR=doedgeR,org=org, - histgmt=history_gmt,bigmt=builtin_gmt,DESeq_fitType=DESeq_fitType) -sessionInfo() -]]> - - - - -**What it does** - -Allows short read sequence counts from controlled experiments to be analysed for differentially expressed genes. -Optionally adds a term for subject if not all samples are independent or if some other factor needs to be blocked in the design. - -**Input** - -Requires a count matrix as a tabular file. These are best made using the companion HTSeq_ based counter Galaxy wrapper -and your fave gene model to generate inputs. Each row is a genomic feature (gene or exon eg) and each column the -non-negative integer count of reads from one sample overlapping the feature. -The matrix must have a header row uniquely identifying the source samples, and unique row names in -the first column. Typically the row names are gene symbols or probe ids for downstream use in GSEA and other methods. - -**Specifying comparisons** - -This is basically dumbed down for two factors - case vs control. - -More complex interfaces are possible but painful at present. -Probably need to specify a phenotype file to do this better. -Work in progress. Send code. - -If you have (eg) paired samples and wish to include a term in the GLM to account for some other factor (subject in the case of paired samples), -put a comma separated list of indicators for every sample (whether modelled or not!) indicating (eg) the subject number or -A list of integers, one for each subject or an empty string if samples are all independent. -If not empty, there must be exactly as many integers in the supplied integer list as there are columns (samples) in the count matrix. -Integers for samples that are not in the analysis *must* be present in the string as filler even if not used. - -So if you have 2 pairs out of 6 samples, you need to put in unique integers for the unpaired ones -eg if you had 6 samples with the first two independent but the second and third pairs each being from independent subjects. you might use -8,9,1,1,2,2 -as subject IDs to indicate two paired samples from the same subject in columns 3/4 and 5/6 - -**Methods available** - -You can run 3 popular Bioconductor packages available for count data. - -edgeR - see edgeR_ for details - -VOOM/limma - see limma_VOOM_ for details - -DESeq2 - see DESeq2_ for details - -and optionally camera in edgeR which works better if MSigDB is installed. - -**Outputs** - -Some helpful plots and analysis results. Note that most of these are produced using R code -suggested by the excellent documentation and vignettes for the Bioconductor -packages invoked. The Tool Factory is used to automatically lay these out for you to enjoy. - -**Note on Voom** - -The voom from limma version 3.16.6 help in R includes this from the authors - but you should read the paper to interpret this method. - -This function is intended to process RNA-Seq or ChIP-Seq data prior to linear modelling in limma. - -voom is an acronym for mean-variance modelling at the observational level. -The key concern is to estimate the mean-variance relationship in the data, then use this to compute appropriate weights for each observation. -Count data almost show non-trivial mean-variance relationships. Raw counts show increasing variance with increasing count size, while log-counts typically show a decreasing mean-variance trend. -This function estimates the mean-variance trend for log-counts, then assigns a weight to each observation based on its predicted variance. -The weights are then used in the linear modelling process to adjust for heteroscedasticity. - -In an experiment, a count value is observed for each tag in each sample. A tag-wise mean-variance trend is computed using lowess. -The tag-wise mean is the mean log2 count with an offset of 0.5, across samples for a given tag. -The tag-wise variance is the quarter-root-variance of normalized log2 counts per million values with an offset of 0.5, across samples for a given tag. -Tags with zero counts across all samples are not included in the lowess fit. Optional normalization is performed using normalizeBetweenArrays. -Using fitted values of log2 counts from a linear model fit by lmFit, variances from the mean-variance trend were interpolated for each observation. -This was carried out by approxfun. Inverse variance weights can be used to correct for mean-variance trend in the count data. - - -Author(s) - -Charity Law and Gordon Smyth - -References - -Law, CW (2013). Precision weights for gene expression analysis. PhD Thesis. University of Melbourne, Australia. - -Law, CW, Chen, Y, Shi, W, Smyth, GK (2013). Voom! Precision weights unlock linear model analysis tools for RNA-seq read counts. -Technical Report 1 May 2013, Bioinformatics Division, Walter and Eliza Hall Institute of Medical Reseach, Melbourne, Australia. -http://www.statsci.org/smyth/pubs/VoomPreprint.pdf - -See Also - -A voom case study is given in the edgeR User's Guide. - -vooma is a similar function but for microarrays instead of RNA-seq. - - -***old rant on changes to Bioconductor package variable names between versions*** - -The edgeR authors made a small cosmetic change in the name of one important variable (from p.value to PValue) -breaking this and all other code that assumed the old name for this variable, -between edgeR2.4.4 and 2.4.6 (the version for R 2.14 as at the time of writing). -This means that all code using edgeR is sensitive to the version. I think this was a very unwise thing -to do because it wasted hours of my time to track down and will similarly cost other edgeR users dearly -when their old scripts break. This tool currently now works with 2.4.6. - -**Note on prior.N** - -http://seqanswers.com/forums/showthread.php?t=5591 says: - -*prior.n* - -The value for prior.n determines the amount of smoothing of tagwise dispersions towards the common dispersion. -You can think of it as like a "weight" for the common value. (It is actually the weight for the common likelihood -in the weighted likelihood equation). The larger the value for prior.n, the more smoothing, i.e. the closer your -tagwise dispersion estimates will be to the common dispersion. If you use a prior.n of 1, then that gives the -common likelihood the weight of one observation. - -In answer to your question, it is a good thing to squeeze the tagwise dispersions towards a common value, -or else you will be using very unreliable estimates of the dispersion. I would not recommend using the value that -you obtained from estimateSmoothing()---this is far too small and would result in virtually no moderation -(squeezing) of the tagwise dispersions. How many samples do you have in your experiment? -What is the experimental design? If you have few samples (less than 6) then I would suggest a prior.n of at least 10. -If you have more samples, then the tagwise dispersion estimates will be more reliable, -so you could consider using a smaller prior.n, although I would hesitate to use a prior.n less than 5. - - -From Bioconductor Digest, Vol 118, Issue 5, Gordon writes: - -Dear Dorota, - -The important settings are prior.df and trend. - -prior.n and prior.df are related through prior.df = prior.n * residual.df, -and your experiment has residual.df = 36 - 12 = 24. So the old setting of -prior.n=10 is equivalent for your data to prior.df = 240, a very large -value. Going the other way, the new setting of prior.df=10 is equivalent -to prior.n=10/24. - -To recover old results with the current software you would use - - estimateTagwiseDisp(object, prior.df=240, trend="none") - -To get the new default from old software you would use - - estimateTagwiseDisp(object, prior.n=10/24, trend=TRUE) - -Actually the old trend method is equivalent to trend="loess" in the new -software. You should use plotBCV(object) to see whether a trend is -required. - -Note you could also use - - prior.n = getPriorN(object, prior.df=10) - -to map between prior.df and prior.n. - ----- - -**Attributions** - -edgeR - edgeR_ - -VOOM/limma - limma_VOOM_ - -DESeq2 - DESeq2_ for details - -See above for Bioconductor package documentation for packages exposed in Galaxy by this tool and app store package. - -Galaxy_ (that's what you are using right now!) for gluing everything together - -Otherwise, all code and documentation comprising this tool was written by Ross Lazarus and is -licensed to you under the LGPL_ like other rgenetics artefacts - -.. _LGPL: http://www.gnu.org/copyleft/lesser.html -.. _HTSeq: http://www-huber.embl.de/users/anders/HTSeq/doc/index.html -.. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html -.. _DESeq2: http://www.bioconductor.org/packages/release/bioc/html/DESeq2.html -.. _limma_VOOM: http://www.bioconductor.org/packages/release/bioc/html/limma.html -.. _Galaxy: http://getgalaxy.org - - - - - diff -r ddd76b6db251 -r c4ee2e69d691 rgedgeRpaired.xml.camera --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rgedgeRpaired.xml.camera Wed Aug 07 02:41:40 2013 -0400 @@ -0,0 +1,1084 @@ + + models using BioConductor packages + + biocbasics + r3 + graphicsmagick + ghostscript + + + + rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "DifferentialCounts" + --output_dir "$html_file.files_path" --output_html "$html_file" --make_HTML "yes" + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + edgeR['doedgeR'] == "T" + + + DESeq2['doDESeq2'] == "T" + + + doVoom == "T" + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + nsamp) { + dm =dm[1:nsamp,] + #sub = paste('Showing',nsamp,'contigs ranked for evidence for differential abundance out of',nprobes,'total') + } + newcolnames = substr(colnames(dm),1,20) + colnames(dm) = newcolnames + pdf(outpdfname) + heatmap.2(dm,main=myTitle,ColSideColors=pcols,col=topo.colors(100),dendrogram="col",key=T,density.info='none', + Rowv=F,scale='row',trace='none',margins=c(8,8),cexRow=0.4,cexCol=0.5) + dev.off() +} + +hmap = function(cmat,nmeans=4,outpdfname="heatMap.pdf",nsamp=250,TName='Treatment',group=NA,myTitle="Title goes here") +{ + # for 2 groups only was + #col.map = function(g) {if (g==TName) "#FF0000" else "#0000FF"} + #pcols = unlist(lapply(group,col.map)) + gu = unique(group) + colours = rainbow(length(gu),start=0.3,end=0.6) + pcols = colours[match(group,gu)] + nrows = nrow(cmat) + mtitle = paste(myTitle,'Heatmap: n contigs =',nrows) + if (nrows > nsamp) { + cmat = cmat[c(1:nsamp),] + mtitle = paste('Heatmap: Top ',nsamp,' DE contigs (of ',nrows,')',sep='') + } + newcolnames = substr(colnames(cmat),1,20) + colnames(cmat) = newcolnames + pdf(outpdfname) + heatmap(cmat,scale='row',main=mtitle,cexRow=0.3,cexCol=0.4,Rowv=NA,ColSideColors=pcols) + dev.off() +} + +qqPlot = function(descr='qqplot',pvector, outpdf='qqplot.pdf',...) +# stolen from https://gist.github.com/703512 +{ + o = -log10(sort(pvector,decreasing=F)) + e = -log10( 1:length(o)/length(o) ) + o[o==-Inf] = reallysmall + o[o==Inf] = reallybig + maint = descr + pdf(outpdf) + plot(e,o,pch=19,cex=1, main=maint, ..., + xlab=expression(Expected~~-log[10](italic(p))), + ylab=expression(Observed~~-log[10](italic(p))), + xlim=c(0,max(e)), ylim=c(0,max(o))) + lines(e,e,col="red") + grid(col = "lightgray", lty = "dotted") + dev.off() +} + +smearPlot = function(DGEList,deTags, outSmear, outMain) + { + pdf(outSmear) + plotSmear(DGEList,de.tags=deTags,main=outMain) + grid(col="lightgray", lty="dotted") + dev.off() + } + +boxPlot = function(rawrs,cleanrs,maint,myTitle,pdfname) +{ # + nc = ncol(rawrs) + for (i in c(1:nc)) {rawrs[(rawrs[,i] < 0),i] = NA} + fullnames = colnames(rawrs) + newcolnames = substr(colnames(rawrs),1,20) + colnames(rawrs) = newcolnames + newcolnames = substr(colnames(cleanrs),1,20) + colnames(cleanrs) = newcolnames + defpar = par(no.readonly=T) + print.noquote('raw contig counts by sample:') + print.noquote(summary(rawrs)) + print.noquote('normalised contig counts by sample:') + print.noquote(summary(cleanrs)) + pdf(pdfname) + par(mfrow=c(1,2)) + boxplot(rawrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('Raw:',maint)) + grid(col="lightgray",lty="dotted") + boxplot(cleanrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('After ',maint)) + grid(col="lightgray",lty="dotted") + dev.off() + pdfname = "sample_counts_histogram.pdf" + nc = ncol(rawrs) + print.noquote(paste('Using ncol rawrs=',nc)) + ncroot = round(sqrt(nc)) + if (ncroot*ncroot < nc) { ncroot = ncroot + 1 } + m = c() + for (i in c(1:nc)) { + rhist = hist(rawrs[,i],breaks=100,plot=F) + m = append(m,max(rhist\$counts)) + } + ymax = max(m) + ncols = length(fullnames) + if (ncols > 20) + { + scale = 7*ncols/20 + pdf(pdfname,width=scale,height=scale) + } else { + pdf(pdfname) + } + par(mfrow=c(ncroot,ncroot)) + for (i in c(1:nc)) { + hist(rawrs[,i], main=paste("Contig logcount",i), xlab='log raw count', col="maroon", + breaks=100,sub=fullnames[i],cex=0.8,ylim=c(0,ymax)) + } + dev.off() + par(defpar) + +} + +cumPlot = function(rawrs,cleanrs,maint,myTitle) +{ # updated to use ecdf + pdfname = "Filtering_rowsum_bar_charts.pdf" + defpar = par(no.readonly=T) + lrs = log(rawrs,10) + lim = max(lrs) + pdf(pdfname) + par(mfrow=c(2,1)) + hist(lrs,breaks=100,main=paste('Before:',maint),xlab="# Reads (log)", + ylab="Count",col="maroon",sub=myTitle, xlim=c(0,lim),las=1) + grid(col="lightgray", lty="dotted") + lrs = log(cleanrs,10) + hist(lrs,breaks=100,main=paste('After:',maint),xlab="# Reads (log)", + ylab="Count",col="maroon",sub=myTitle,xlim=c(0,lim),las=1) + grid(col="lightgray", lty="dotted") + dev.off() + par(defpar) +} + +cumPlot1 = function(rawrs,cleanrs,maint,myTitle) +{ # updated to use ecdf + pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"RowsumCum.pdf",sep='_') + pdf(pdfname) + par(mfrow=c(2,1)) + lastx = max(rawrs) + rawe = knots(ecdf(rawrs)) + cleane = knots(ecdf(cleanrs)) + cy = 1:length(cleane)/length(cleane) + ry = 1:length(rawe)/length(rawe) + plot(rawe,ry,type='l',main=paste('Before',maint),xlab="Log Contig Total Reads", + ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle) + grid(col="blue") + plot(cleane,cy,type='l',main=paste('After',maint),xlab="Log Contig Total Reads", + ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle) + grid(col="blue") + dev.off() +} + + + +doGSEAold = function(y=NULL,design=NULL,histgmt="", + bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt", + ntest=0, myTitle="myTitle", outfname="GSEA.xls", minnin=5, maxnin=2000,fdrthresh=0.05,fdrtype="BH") +{ + sink('Camera.log') + genesets = c() + if (bigmt > "") + { + bigenesets = readLines(bigmt) + genesets = bigenesets + } + if (histgmt > "") + { + hgenesets = readLines(histgmt) + if (bigmt > "") { + genesets = rbind(genesets,hgenesets) + } else { + genesets = hgenesets + } # use only history if no bi + } + print.noquote(paste("@@@read",length(genesets), 'genesets from',histgmt,bigmt)) + genesets = strsplit(genesets,'\t') # tabular. genesetid\tURLorwhatever\tgene_1\t..\tgene_n + outf = outfname + head=paste(myTitle,'edgeR GSEA') + write(head,file=outfname,append=F) + ntest=length(genesets) + urownames = toupper(rownames(y)) + upcam = c() + downcam = c() + for (i in 1:ntest) { + gs = unlist(genesets[i]) + g = gs[1] # geneset_id + u = gs[2] + if (u > "") { u = paste("",u,"",sep="") } + glist = gs[3:length(gs)] # member gene symbols + glist = toupper(glist) + inglist = urownames %in% glist + nin = sum(inglist) + if ((nin > minnin) && (nin < maxnin)) { + ### print(paste('@@found',sum(inglist),'genes in glist')) + camres = camera(y=y,index=inglist,design=design) + if (! is.null(camres)) { + rownames(camres) = g # gene set name + camres = cbind(GeneSet=g,URL=u,camres) + if (camres\$Direction == "Up") + { + upcam = rbind(upcam,camres) } else { + downcam = rbind(downcam,camres) + } + } + } + } + uscam = upcam[order(upcam\$PValue),] + unadjp = uscam\$PValue + uscam\$adjPValue = p.adjust(unadjp,method=fdrtype) + nup = max(10,sum((uscam\$adjPValue < fdrthresh))) + dscam = downcam[order(downcam\$PValue),] + unadjp = dscam\$PValue + dscam\$adjPValue = p.adjust(unadjp,method=fdrtype) + ndown = max(10,sum((dscam\$adjPValue < fdrthresh))) + write.table(uscam,file=paste('camera_up',outfname,sep='_'),quote=F,sep='\t',row.names=F) + write.table(dscam,file=paste('camera_down',outfname,sep='_'),quote=F,sep='\t',row.names=F) + print.noquote(paste('@@@@@ Camera up top',nup,'gene sets:')) + write.table(head(uscam,nup),file="",quote=F,sep='\t',row.names=F) + print.noquote(paste('@@@@@ Camera down top',ndown,'gene sets:')) + write.table(head(dscam,ndown),file="",quote=F,sep='\t',row.names=F) + sink() +} + + + + +doGSEA = function(y=NULL,design=NULL,histgmt="", + bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt", + ntest=0, myTitle="myTitle", outfname="GSEA.xls", minnin=5, maxnin=2000,fdrthresh=0.05,fdrtype="BH") +{ + sink('Camera.log') + genesets = c() + if (bigmt > "") + { + bigenesets = readLines(bigmt) + genesets = bigenesets + } + if (histgmt > "") + { + hgenesets = readLines(histgmt) + if (bigmt > "") { + genesets = rbind(genesets,hgenesets) + } else { + genesets = hgenesets + } # use only history if no bi + } + print.noquote(paste("@@@read",length(genesets), 'genesets from',histgmt,bigmt)) + genesets = strsplit(genesets,'\t') # tabular. genesetid\tURLorwhatever\tgene_1\t..\tgene_n + outf = outfname + head=paste(myTitle,'edgeR GSEA') + write(head,file=outfname,append=F) + ntest=length(genesets) + urownames = toupper(rownames(y)) + upcam = c() + downcam = c() + incam = c() + urls = c() + gsids = c() + for (i in 1:ntest) { + gs = unlist(genesets[i]) + gsid = gs[1] # geneset_id + url = gs[2] + if (url > "") { url = paste("",url,"",sep="") } + glist = gs[3:length(gs)] # member gene symbols + glist = toupper(glist) + inglist = urownames %in% glist + nin = sum(inglist) + if ((nin > minnin) && (nin < maxnin)) { + incam = c(incam,inglist) + gsids = c(gsids,gsid) + urls = c(urls,url) + } + } + incam = as.list(incam) + names(incam) = gsids + allcam = camera(y=y,index=incam,design=design) + allcamres = cbind(geneset=gsids,allcam,URL=urls) + for (i in 1:ntest) { + camres = allcamres[i] + res = try(test = (camres\$Direction == "Up")) + if ("try-error" %in% class(res)) { + cat("test failed, camres = :") + print.noquote(camres) + } else { if (camres\$Direction == "Up") + { upcam = rbind(upcam,camres) + } else { downcam = rbind(downcam,camres) + } + + } + } + uscam = upcam[order(upcam\$PValue),] + unadjp = uscam\$PValue + uscam\$adjPValue = p.adjust(unadjp,method=fdrtype) + nup = max(10,sum((uscam\$adjPValue < fdrthresh))) + dscam = downcam[order(downcam\$PValue),] + unadjp = dscam\$PValue + dscam\$adjPValue = p.adjust(unadjp,method=fdrtype) + ndown = max(10,sum((dscam\$adjPValue < fdrthresh))) + write.table(uscam,file=paste('camera_up',outfname,sep='_'),quote=F,sep='\t',row.names=F) + write.table(dscam,file=paste('camera_down',outfname,sep='_'),quote=F,sep='\t',row.names=F) + print.noquote(paste('@@@@@ Camera up top',nup,'gene sets:')) + write.table(head(uscam,nup),file="",quote=F,sep='\t',row.names=F) + print.noquote(paste('@@@@@ Camera down top',ndown,'gene sets:')) + write.table(head(dscam,ndown),file="",quote=F,sep='\t',row.names=F) + sink() + } + + +edgeIt = function (Count_Matrix=c(),group=c(),out_edgeR=F,out_VOOM=F,out_DESeq2=F,fdrtype='fdr',priordf=5, + fdrthresh=0.05,outputdir='.', myTitle='Differential Counts',libSize=c(),useNDF=F, + filterquantile=0.2, subjects=c(),mydesign=NULL, + doDESeq2=T,doVoom=T,doCamera=T,doedgeR=T,org='hg19', + histgmt="", bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt", + doCook=F,DESeq_fitType="parameteric") +{ + # Error handling + if (length(unique(group))!=2){ + print("Number of conditions identified in experiment does not equal 2") + q() + } + require(edgeR) + options(width = 512) + mt = paste(unlist(strsplit(myTitle,'_')),collapse=" ") + allN = nrow(Count_Matrix) + nscut = round(ncol(Count_Matrix)/2) + colTotmillionreads = colSums(Count_Matrix)/1e6 + counts.dataframe = as.data.frame(c()) + rawrs = rowSums(Count_Matrix) + nonzerod = Count_Matrix[(rawrs > 0),] # remove all zero count genes + nzN = nrow(nonzerod) + nzrs = rowSums(nonzerod) + zN = allN - nzN + print('# Quantiles for non-zero row counts:',quote=F) + print(quantile(nzrs,probs=seq(0,1,0.1)),quote=F) + if (useNDF == T) + { + gt1rpin3 = rowSums(Count_Matrix/expandAsMatrix(colTotmillionreads,dim(Count_Matrix)) >= 1) >= nscut + lo = colSums(Count_Matrix[!gt1rpin3,]) + workCM = Count_Matrix[gt1rpin3,] + cleanrs = rowSums(workCM) + cleanN = length(cleanrs) + meth = paste( "After removing",length(lo),"contigs with fewer than ",nscut," sample read counts >= 1 per million, there are",sep="") + print(paste("Read",allN,"contigs. Removed",zN,"contigs with no reads.",meth,cleanN,"contigs"),quote=F) + maint = paste('Filter >=1/million reads in >=',nscut,'samples') + } else { + useme = (nzrs > quantile(nzrs,filterquantile)) + workCM = nonzerod[useme,] + lo = colSums(nonzerod[!useme,]) + cleanrs = rowSums(workCM) + cleanN = length(cleanrs) + meth = paste("After filtering at count quantile =",filterquantile,", there are",sep="") + print(paste('Read',allN,"contigs. Removed",zN,"with no reads.",meth,cleanN,"contigs"),quote=F) + maint = paste('Filter below',filterquantile,'quantile') + } + cumPlot(rawrs=rawrs,cleanrs=cleanrs,maint=maint,myTitle=myTitle) + allgenes = rownames(workCM) + reg = "^chr([0-9]+):([0-9]+)-([0-9]+)" + genecards=" 0.8) # is ucsc style string + { + print("@@ using ucsc substitution for urls") + contigurls = paste0(ucsc,"&position=chr",testreg[,2],":",testreg[,3],"-",testreg[,4],"\'>",allgenes,"") + } else { + print("@@ using genecards substitution for urls") + contigurls = paste0(genecards,allgenes,"\'>",allgenes,"") + } + print.noquote("# urls") + print.noquote(head(contigurls)) + print(paste("# Total low count contigs per sample = ",paste(lo,collapse=',')),quote=F) + cmrowsums = rowSums(workCM) + TName=unique(group)[1] + CName=unique(group)[2] + if (is.null(mydesign)) { + if (length(subjects) == 0) + { + mydesign = model.matrix(~group) + } + else { + subjf = factor(subjects) + mydesign = model.matrix(~subjf+group) # we block on subject so make group last to simplify finding it + } + } + print.noquote(paste('Using samples:',paste(colnames(workCM),collapse=','))) + print.noquote('Using design matrix:') + print.noquote(mydesign) + if (doedgeR) { + sink('edgeR.log') + #### Setup DGEList object + DGEList = DGEList(counts=workCM, group = group) + DGEList = calcNormFactors(DGEList) + + DGEList = estimateGLMCommonDisp(DGEList,mydesign) + comdisp = DGEList\$common.dispersion + DGEList = estimateGLMTrendedDisp(DGEList,mydesign) + if (edgeR_priordf > 0) { + print.noquote(paste("prior.df =",edgeR_priordf)) + DGEList = estimateGLMTagwiseDisp(DGEList,mydesign,prior.df = edgeR_priordf) + } else { + DGEList = estimateGLMTagwiseDisp(DGEList,mydesign) + } + DGLM = glmFit(DGEList,design=mydesign) + DE = glmLRT(DGLM,coef=ncol(DGLM\$design)) # always last one - subject is first if needed + efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors + normData = (1e+06*DGEList\$counts/efflib) + uoutput = cbind( + Name=as.character(rownames(DGEList\$counts)), + DE\$table, + adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype), + Dispersion=DGEList\$tagwise.dispersion,totreads=cmrowsums,normData, + DGEList\$counts + ) + soutput = uoutput[order(DE\$table\$PValue),] # sorted into p value order - for quick toptable + goodness = gof(DGLM, pcutoff=fdrthresh) + if (sum(goodness\$outlier) > 0) { + print.noquote('GLM outliers:') + print(paste(rownames(DGLM)[(goodness\$outlier)],collapse=','),quote=F) + } else { + print('No GLM fit outlier genes found\n') + } + z = limma::zscoreGamma(goodness\$gof.statistic, shape=goodness\$df/2, scale=2) + pdf("edgeR_GoodnessofFit.pdf") + qq = qqnorm(z, panel.first=grid(), main="tagwise dispersion") + abline(0,1,lwd=3) + points(qq\$x[goodness\$outlier],qq\$y[goodness\$outlier], pch=16, col="maroon") + dev.off() + estpriorn = getPriorN(DGEList) + print(paste("Common Dispersion =",comdisp,"CV = ",sqrt(comdisp),"getPriorN = ",estpriorn),quote=F) + efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors + normData = (1e+06*DGEList\$counts/efflib) + uniqueg = unique(group) + #### Plot MDS + sample_colors = match(group,levels(group)) + sampleTypes = levels(factor(group)) + print.noquote(sampleTypes) + pdf("edgeR_MDSplot.pdf") + plotMDS.DGEList(DGEList,main=paste("edgeR MDS for",myTitle),cex=0.5,col=sample_colors,pch=sample_colors) + legend(x="topleft", legend = sampleTypes,col=c(1:length(sampleTypes)), pch=19) + grid(col="blue") + dev.off() + colnames(normData) = paste( colnames(normData),'N',sep="_") + print(paste('Raw sample read totals',paste(colSums(nonzerod,na.rm=T),collapse=','))) + nzd = data.frame(log(nonzerod + 1e-2,10)) + try( boxPlot(rawrs=nzd,cleanrs=log(normData,10),maint='TMM Normalisation',myTitle=myTitle,pdfname="edgeR_raw_norm_counts_box.pdf") ) + write.table(soutput,file=out_edgeR, quote=FALSE, sep="\t",row.names=F) + tt = cbind( + Name=as.character(rownames(DGEList\$counts)), + DE\$table, + adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype), + Dispersion=DGEList\$tagwise.dispersion,totreads=cmrowsums + ) + print.noquote("# edgeR Top tags\n") + tt = cbind(tt,URL=contigurls) # add to end so table isn't laid out strangely + tt = tt[order(DE\$table\$PValue),] + print.noquote(tt[1:50,]) + deTags = rownames(uoutput[uoutput\$adj.p.value < fdrthresh,]) + nsig = length(deTags) + print(paste('#',nsig,'tags significant at adj p=',fdrthresh),quote=F) + deColours = ifelse(deTags,'red','black') + pdf("edgeR_BCV_vs_abundance.pdf") + plotBCV(DGEList, cex=0.3, main="Biological CV vs abundance") + dev.off() + dg = DGEList[order(DE\$table\$PValue),] + #normData = (1e+06 * dg\$counts/expandAsMatrix(dg\$samples\$lib.size, dim(dg))) + efflib = dg\$samples\$lib.size*dg\$samples\$norm.factors + normData = (1e+06*dg\$counts/efflib) + outpdfname="edgeR_top_100_heatmap.pdf" + hmap2(normData,nsamp=100,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste('edgeR Heatmap',myTitle)) + outSmear = "edgeR_smearplot.pdf" + outMain = paste("Smear Plot for ",TName,' Vs ',CName,' (FDR@',fdrthresh,' N = ',nsig,')',sep='') + smearPlot(DGEList=DGEList,deTags=deTags, outSmear=outSmear, outMain = outMain) + qqPlot(descr=paste(myTitle,'edgeR adj p QQ plot'),pvector=tt\$adj.p.value,outpdf='edgeR_qqplot.pdf') + norm.factor = DGEList\$samples\$norm.factors + topresults.edgeR = soutput[which(soutput\$adj.p.value < fdrthresh), ] + edgeRcountsindex = which(allgenes %in% rownames(topresults.edgeR)) + edgeRcounts = rep(0, length(allgenes)) + edgeRcounts[edgeRcountsindex] = 1 # Create venn diagram of hits + sink() + } ### doedgeR + if (doDESeq2 == T) + { + sink("DESeq2.log") + # DESeq2 + require('DESeq2') + library('RColorBrewer') + if (length(subjects) == 0) + { + pdata = data.frame(Name=colnames(workCM),Rx=group,row.names=colnames(workCM)) + deSEQds = DESeqDataSetFromMatrix(countData = workCM, colData = pdata, design = formula(~ Rx)) + } else { + pdata = data.frame(Name=colnames(workCM),Rx=group,subjects=subjects,row.names=colnames(workCM)) + deSEQds = DESeqDataSetFromMatrix(countData = workCM, colData = pdata, design = formula(~ subjects + Rx)) + } + #DESeq2 = DESeq(deSEQds,fitType='local',pAdjustMethod=fdrtype) + #rDESeq = results(DESeq2) + #newCountDataSet(workCM, group) + deSeqDatsizefac = estimateSizeFactors(deSEQds) + deSeqDatdisp = estimateDispersions(deSeqDatsizefac,fitType=DESeq_fitType) + resDESeq = nbinomWaldTest(deSeqDatdisp, pAdjustMethod=fdrtype) + rDESeq = as.data.frame(results(resDESeq)) + rDESeq = cbind(Contig=rownames(workCM),rDESeq,NReads=cmrowsums,URL=contigurls) + srDESeq = rDESeq[order(rDESeq\$pvalue),] + qqPlot(descr=paste(myTitle,'DESeq2 adj p qq plot'),pvector=rDESeq\$padj,outpdf='DESeq2_qqplot.pdf') + cat("# DESeq top 50\n") + print.noquote(srDESeq[1:50,]) + write.table(srDESeq,file=out_DESeq2, quote=FALSE, sep="\t",row.names=F) + topresults.DESeq = rDESeq[which(rDESeq\$padj < fdrthresh), ] + DESeqcountsindex = which(allgenes %in% rownames(topresults.DESeq)) + DESeqcounts = rep(0, length(allgenes)) + DESeqcounts[DESeqcountsindex] = 1 + pdf("DESeq2_dispersion_estimates.pdf") + plotDispEsts(resDESeq) + dev.off() + ysmall = abs(min(rDESeq\$log2FoldChange)) + ybig = abs(max(rDESeq\$log2FoldChange)) + ylimit = min(4,ysmall,ybig) + pdf("DESeq2_MA_plot.pdf") + plotMA(resDESeq,main=paste(myTitle,"DESeq2 MA plot"),ylim=c(-ylimit,ylimit)) + dev.off() + rlogres = rlogTransformation(resDESeq) + sampledists = dist( t( assay(rlogres) ) ) + sdmat = as.matrix(sampledists) + pdf("DESeq2_sample_distance_plot.pdf") + heatmap.2(sdmat,trace="none",main=paste(myTitle,"DESeq2 sample distances"), + col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(255)) + dev.off() + ###outpdfname="DESeq2_top50_heatmap.pdf" + ###hmap2(sresDESeq,nsamp=50,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste('DESeq2 vst rlog Heatmap',myTitle)) + sink() + result = try( (ppca = plotPCA( varianceStabilizingTransformation(deSeqDatdisp,blind=T), intgroup=c("Rx","Name")) ) ) + if ("try-error" %in% class(result)) { + print.noquote('DESeq2 plotPCA failed.') + } else { + pdf("DESeq2_PCA_plot.pdf") + #### wtf - print? Seems needed to get this to work + print(ppca) + dev.off() + } + } + + if (doVoom == T) { + sink('VOOM.log') + if (doedgeR == F) { + #### Setup DGEList object + DGEList = DGEList(counts=workCM, group = group) + DGEList = calcNormFactors(DGEList) + DGEList = estimateGLMCommonDisp(DGEList,mydesign) + DGEList = estimateGLMTrendedDisp(DGEList,mydesign) + DGEList = estimateGLMTagwiseDisp(DGEList,mydesign) + DGEList = estimateGLMTagwiseDisp(DGEList,mydesign) + norm.factor = DGEList\$samples\$norm.factors + } + pdf("VOOM_mean_variance_plot.pdf") + dat.voomed = voom(DGEList, mydesign, plot = TRUE, lib.size = colSums(workCM) * norm.factor) + dev.off() + # Use limma to fit data + fit = lmFit(dat.voomed, mydesign) + fit = eBayes(fit) + rvoom = topTable(fit, coef = length(colnames(mydesign)), adj = fdrtype, n = Inf, sort="none") + qqPlot(descr=paste(myTitle,'VOOM-limma adj p QQ plot'),pvector=rvoom\$adj.P.Val,outpdf='VOOM_qqplot.pdf') + rownames(rvoom) = rownames(workCM) + rvoom = cbind(rvoom,NReads=cmrowsums,URL=contigurls) + srvoom = rvoom[order(rvoom\$P.Value),] + cat("# VOOM top 50\n") + print(srvoom[1:50,]) + write.table(srvoom,file=out_VOOM, quote=FALSE, sep="\t",row.names=F) + # Use an FDR cutoff to find interesting samples for edgeR, DESeq and voom/limma + topresults.voom = rvoom[which(rvoom\$adj.P.Val < fdrthresh), ] + voomcountsindex = which(allgenes %in% topresults.voom\$ID) + voomcounts = rep(0, length(allgenes)) + voomcounts[voomcountsindex] = 1 + sink() + } + + if (doCamera) { + doGSEA(y=DGEList,design=mydesign,histgmt=histgmt,bigmt=bigmt,ntest=20,myTitle=myTitle, + outfname=paste(mt,"GSEA.xls",sep="_"),fdrthresh=fdrthresh,fdrtype=fdrtype) + } + + if ((doDESeq2==T) || (doVoom==T) || (doedgeR==T)) { + if ((doVoom==T) && (doDESeq2==T) && (doedgeR==T)) { + vennmain = paste(mt,'Voom,edgeR and DESeq2 overlap at FDR=',fdrthresh) + counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts, + VOOM_limma = voomcounts, row.names = allgenes) + } else if ((doDESeq2==T) && (doedgeR==T)) { + vennmain = paste(mt,'DESeq2 and edgeR overlap at FDR=',fdrthresh) + counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts, row.names = allgenes) + } else if ((doVoom==T) && (doedgeR==T)) { + vennmain = paste(mt,'Voom and edgeR overlap at FDR=',fdrthresh) + counts.dataframe = data.frame(edgeR = edgeRcounts, VOOM_limma = voomcounts, row.names = allgenes) + } + + if (nrow(counts.dataframe > 1)) { + counts.venn = vennCounts(counts.dataframe) + vennf = "Venn_significant_genes_overlap.pdf" + pdf(vennf) + vennDiagram(counts.venn,main=vennmain,col="maroon") + dev.off() + } + } #### doDESeq2 or doVoom + +} +#### Done + +###sink(stdout(),append=T,type="message") +builtin_gmt = "" +history_gmt = "" +history_gmt_name = "" +out_edgeR = F +out_DESeq2 = F +out_VOOM = "$out_VOOM" +doDESeq2 = $DESeq2.doDESeq2 # make these T or F +doVoom = $doVoom +doCamera = F +doedgeR = $edgeR.doedgeR +edgeR_priordf = 0 + + +#if $doVoom == "T": + out_VOOM = "$out_VOOM" +#end if + +#if $DESeq2.doDESeq2 == "T": + out_DESeq2 = "$out_DESeq2" + DESeq_fitType = "$DESeq2.DESeq_fitType" +#end if + +#if $edgeR.doedgeR == "T": + out_edgeR = "$out_edgeR" + edgeR_priordf = $edgeR.edgeR_priordf +#end if + + + +if (sum(c(doedgeR,doVoom,doDESeq2)) == 0) +{ +write("No methods chosen - nothing to do! Please try again after choosing one or more methods", stderr()) +quit(save="no",status=2) +} + +Out_Dir = "$html_file.files_path" +Input = "$input1" +TreatmentName = "$treatment_name" +TreatmentCols = "$Treat_cols" +ControlName = "$control_name" +ControlCols= "$Control_cols" +org = "$input1.dbkey" +if (org == "") { org = "hg19"} +fdrtype = "$fdrtype" +fdrthresh = $fdrthresh +useNDF = $useNDF +fQ = $fQ # non-differential centile cutoff +myTitle = "$title" +sids = strsplit("$subjectids",',') +subjects = unlist(sids) +nsubj = length(subjects) +TCols = as.numeric(strsplit(TreatmentCols,",")[[1]])-1 +CCols = as.numeric(strsplit(ControlCols,",")[[1]])-1 +cat('Got TCols=') +cat(TCols) +cat('; CCols=') +cat(CCols) +cat('\n') +useCols = c(TCols,CCols) +if (file.exists(Out_Dir) == F) dir.create(Out_Dir) +Count_Matrix = read.table(Input,header=T,row.names=1,sep='\t') #Load tab file assume header +snames = colnames(Count_Matrix) +nsamples = length(snames) +if (nsubj > 0 & nsubj != nsamples) { +options("show.error.messages"=T) +mess = paste('Fatal error: Supplied subject id list',paste(subjects,collapse=','), + 'has length',nsubj,'but there are',nsamples,'samples',paste(snames,collapse=',')) +write(mess, stderr()) +quit(save="no",status=4) +} +if (length(subjects) != 0) {subjects = subjects[useCols]} +Count_Matrix = Count_Matrix[,useCols] ### reorder columns +rn = rownames(Count_Matrix) +islib = rn %in% c('librarySize','NotInBedRegions') +LibSizes = Count_Matrix[subset(rn,islib),][1] # take first +Count_Matrix = Count_Matrix[subset(rn,! islib),] +group = c(rep(TreatmentName,length(TCols)), rep(ControlName,length(CCols)) ) #Build a group descriptor +group = factor(group, levels=c(ControlName,TreatmentName)) +colnames(Count_Matrix) = paste(group,colnames(Count_Matrix),sep="_") #Relable columns +results = edgeIt(Count_Matrix=Count_Matrix,group=group, out_edgeR=out_edgeR, out_VOOM=out_VOOM, out_DESeq2=out_DESeq2, + fdrtype='BH',mydesign=NULL,priordf=edgeR_priordf,fdrthresh=fdrthresh,outputdir='.', + myTitle=myTitle,useNDF=F,libSize=c(),filterquantile=fQ,subjects=subjects, + doDESeq2=doDESeq2,doVoom=doVoom,doCamera=doCamera,doedgeR=doedgeR,org=org, + histgmt=history_gmt,bigmt=builtin_gmt,DESeq_fitType=DESeq_fitType) +sessionInfo() +]]> + + + + +**What it does** + +Allows short read sequence counts from controlled experiments to be analysed for differentially expressed genes. +Optionally adds a term for subject if not all samples are independent or if some other factor needs to be blocked in the design. + +**Input** + +Requires a count matrix as a tabular file. These are best made using the companion HTSeq_ based counter Galaxy wrapper +and your fave gene model to generate inputs. Each row is a genomic feature (gene or exon eg) and each column the +non-negative integer count of reads from one sample overlapping the feature. +The matrix must have a header row uniquely identifying the source samples, and unique row names in +the first column. Typically the row names are gene symbols or probe ids for downstream use in GSEA and other methods. + +**Specifying comparisons** + +This is basically dumbed down for two factors - case vs control. + +More complex interfaces are possible but painful at present. +Probably need to specify a phenotype file to do this better. +Work in progress. Send code. + +If you have (eg) paired samples and wish to include a term in the GLM to account for some other factor (subject in the case of paired samples), +put a comma separated list of indicators for every sample (whether modelled or not!) indicating (eg) the subject number or +A list of integers, one for each subject or an empty string if samples are all independent. +If not empty, there must be exactly as many integers in the supplied integer list as there are columns (samples) in the count matrix. +Integers for samples that are not in the analysis *must* be present in the string as filler even if not used. + +So if you have 2 pairs out of 6 samples, you need to put in unique integers for the unpaired ones +eg if you had 6 samples with the first two independent but the second and third pairs each being from independent subjects. you might use +8,9,1,1,2,2 +as subject IDs to indicate two paired samples from the same subject in columns 3/4 and 5/6 + +**Methods available** + +You can run 3 popular Bioconductor packages available for count data. + +edgeR - see edgeR_ for details + +VOOM/limma - see limma_VOOM_ for details + +DESeq2 - see DESeq2_ for details + +and optionally camera in edgeR which works better if MSigDB is installed. + +**Outputs** + +Some helpful plots and analysis results. Note that most of these are produced using R code +suggested by the excellent documentation and vignettes for the Bioconductor +packages invoked. The Tool Factory is used to automatically lay these out for you to enjoy. + +**Note on Voom** + +The voom from limma version 3.16.6 help in R includes this from the authors - but you should read the paper to interpret this method. + +This function is intended to process RNA-Seq or ChIP-Seq data prior to linear modelling in limma. + +voom is an acronym for mean-variance modelling at the observational level. +The key concern is to estimate the mean-variance relationship in the data, then use this to compute appropriate weights for each observation. +Count data almost show non-trivial mean-variance relationships. Raw counts show increasing variance with increasing count size, while log-counts typically show a decreasing mean-variance trend. +This function estimates the mean-variance trend for log-counts, then assigns a weight to each observation based on its predicted variance. +The weights are then used in the linear modelling process to adjust for heteroscedasticity. + +In an experiment, a count value is observed for each tag in each sample. A tag-wise mean-variance trend is computed using lowess. +The tag-wise mean is the mean log2 count with an offset of 0.5, across samples for a given tag. +The tag-wise variance is the quarter-root-variance of normalized log2 counts per million values with an offset of 0.5, across samples for a given tag. +Tags with zero counts across all samples are not included in the lowess fit. Optional normalization is performed using normalizeBetweenArrays. +Using fitted values of log2 counts from a linear model fit by lmFit, variances from the mean-variance trend were interpolated for each observation. +This was carried out by approxfun. Inverse variance weights can be used to correct for mean-variance trend in the count data. + + +Author(s) + +Charity Law and Gordon Smyth + +References + +Law, CW (2013). Precision weights for gene expression analysis. PhD Thesis. University of Melbourne, Australia. + +Law, CW, Chen, Y, Shi, W, Smyth, GK (2013). Voom! Precision weights unlock linear model analysis tools for RNA-seq read counts. +Technical Report 1 May 2013, Bioinformatics Division, Walter and Eliza Hall Institute of Medical Reseach, Melbourne, Australia. +http://www.statsci.org/smyth/pubs/VoomPreprint.pdf + +See Also + +A voom case study is given in the edgeR User's Guide. + +vooma is a similar function but for microarrays instead of RNA-seq. + + +***old rant on changes to Bioconductor package variable names between versions*** + +The edgeR authors made a small cosmetic change in the name of one important variable (from p.value to PValue) +breaking this and all other code that assumed the old name for this variable, +between edgeR2.4.4 and 2.4.6 (the version for R 2.14 as at the time of writing). +This means that all code using edgeR is sensitive to the version. I think this was a very unwise thing +to do because it wasted hours of my time to track down and will similarly cost other edgeR users dearly +when their old scripts break. This tool currently now works with 2.4.6. + +**Note on prior.N** + +http://seqanswers.com/forums/showthread.php?t=5591 says: + +*prior.n* + +The value for prior.n determines the amount of smoothing of tagwise dispersions towards the common dispersion. +You can think of it as like a "weight" for the common value. (It is actually the weight for the common likelihood +in the weighted likelihood equation). The larger the value for prior.n, the more smoothing, i.e. the closer your +tagwise dispersion estimates will be to the common dispersion. If you use a prior.n of 1, then that gives the +common likelihood the weight of one observation. + +In answer to your question, it is a good thing to squeeze the tagwise dispersions towards a common value, +or else you will be using very unreliable estimates of the dispersion. I would not recommend using the value that +you obtained from estimateSmoothing()---this is far too small and would result in virtually no moderation +(squeezing) of the tagwise dispersions. How many samples do you have in your experiment? +What is the experimental design? If you have few samples (less than 6) then I would suggest a prior.n of at least 10. +If you have more samples, then the tagwise dispersion estimates will be more reliable, +so you could consider using a smaller prior.n, although I would hesitate to use a prior.n less than 5. + + +From Bioconductor Digest, Vol 118, Issue 5, Gordon writes: + +Dear Dorota, + +The important settings are prior.df and trend. + +prior.n and prior.df are related through prior.df = prior.n * residual.df, +and your experiment has residual.df = 36 - 12 = 24. So the old setting of +prior.n=10 is equivalent for your data to prior.df = 240, a very large +value. Going the other way, the new setting of prior.df=10 is equivalent +to prior.n=10/24. + +To recover old results with the current software you would use + + estimateTagwiseDisp(object, prior.df=240, trend="none") + +To get the new default from old software you would use + + estimateTagwiseDisp(object, prior.n=10/24, trend=TRUE) + +Actually the old trend method is equivalent to trend="loess" in the new +software. You should use plotBCV(object) to see whether a trend is +required. + +Note you could also use + + prior.n = getPriorN(object, prior.df=10) + +to map between prior.df and prior.n. + +---- + +**Attributions** + +edgeR - edgeR_ + +VOOM/limma - limma_VOOM_ + +DESeq2 - DESeq2_ for details + +See above for Bioconductor package documentation for packages exposed in Galaxy by this tool and app store package. + +Galaxy_ (that's what you are using right now!) for gluing everything together + +Otherwise, all code and documentation comprising this tool was written by Ross Lazarus and is +licensed to you under the LGPL_ like other rgenetics artefacts + +.. _LGPL: http://www.gnu.org/copyleft/lesser.html +.. _HTSeq: http://www-huber.embl.de/users/anders/HTSeq/doc/index.html +.. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html +.. _DESeq2: http://www.bioconductor.org/packages/release/bioc/html/DESeq2.html +.. _limma_VOOM: http://www.bioconductor.org/packages/release/bioc/html/limma.html +.. _Galaxy: http://getgalaxy.org + + + + +