differential_count_models: rgedgeRpaired

comparison rgedgeRpaired_nocamera.xml @ 149:3107df74056e draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/differential_count_models commit 344140b8df53b8b7024618bb04594607a045c03a

author	iuc
date	Mon, 04 May 2015 22:47:36 -0400
parents	474c08e747b6
children

comparison

equal deleted inserted replaced

-:1e20061decdd
+:3107df74056e
 <requirement type="package" version="3.1.2">R</requirement>
 <requirement type="package" version="1.3.18">graphicsmagick</requirement>
 <requirement type="package" version="9.10">ghostscript</requirement>
 <requirement type="package" version="2.14">biocbasics</requirement>
 </requirements>
+<stdio>
+<exit_code range="4" level="fatal" description="Number of subject ids must match total number of samples in the input matrix"/>
+</stdio>
 <command interpreter="python">
 rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "Differential_Counts"
 --output_dir "$html_file.files_path" --output_html "$html_file" --make_HTML "yes"
 </command>
+<configfiles>
+<configfile name="runme"><![CDATA[
+#
+# edgeR.Rscript
+# updated feb 2014 adding outlier-robust deviance estimate options by ross for R 3.0.2/bioc 2.13
+# updated nov 2011 for R 2.14.0 and edgeR 2.4.0 by ross
+# Performs DGE on a count table containing n replicates of two conditions
+#
+# Parameters
+#
+# 1 - Output Dir
+# Original edgeR code by: S.Lunke and A.Kaspi
+reallybig = log10(.Machine\$double.xmax)
+reallysmall = log10(.Machine\$double.xmin)
+library("gplots")
+library("edgeR")
+library('stringr')
+hmap2 <- function(cmat,nsamp=100,outpdfname='heatmap2.pdf', TName='Treatment',group=NA,myTitle='title goes here')
+{
+  # Write a row-scaled heatmap.2 plot of the leading rows of cmat to a PDF.
+  #   cmat       - count matrix, one column per sample (heatmap.2 expects a numeric matrix)
+  #   nsamp      - maximum number of rows drawn; rows beyond this are cut off
+  #   outpdfname - path of the PDF to create
+  #   TName      - accepted but not used in the body
+  #   group      - one label per column of cmat, drives the column colour bar
+  #   myTitle    - plot title
+  # Columns are clustered; rows stay in the order supplied.
+  groupLevels <- unique(group)
+  rowLabels <- rownames(cmat)
+  if (length(groupLevels) != 2) {
+    # any other number of groups: spread the groups over a rainbow ramp
+    rampCols <- rainbow(length(groupLevels),start=0,end=4/6)
+    barCols <- rampCols[match(group,groupLevels)]
+  } else {
+    # exactly two groups: first group red, second blue
+    twoTone <- function(g) { if (g==groupLevels[1]) "#FF0000" else "#0000FF" }
+    barCols <- unlist(lapply(group,twoTone))
+  }
+  # rows without a label are left out of the plot
+  shown <- cmat[!is.na(rowLabels),]
+  if (nrow(shown) > nsamp) {
+    shown <- shown[1:nsamp,]
+  }
+  # long sample names are cut to 20 characters so the axis stays legible
+  colnames(shown) <- substr(colnames(shown),1,20)
+  pdf(outpdfname)
+  heatmap.2(shown,main=myTitle,ColSideColors=barCols,col=topo.colors(100),dendrogram="col",
+    key=TRUE,density.info='none',Rowv=FALSE,scale='row',trace='none',margins=c(8,8),
+    cexRow=0.4,cexCol=0.5)
+  dev.off()
+}
+hmap <- function(cmat,nmeans=4,outpdfname="heatMap.pdf",nsamp=250,TName='Treatment',group=NA,myTitle="Title goes here")
+{
+  # Write a row-scaled base-graphics heatmap of the leading rows of cmat to a PDF.
+  #   cmat       - count matrix, one column per sample
+  #   nsamp      - maximum number of rows drawn
+  #   outpdfname - path of the PDF to create
+  #   group      - one label per column of cmat, drives the column colour bar
+  #   myTitle    - used in the title only when no rows are cut off
+  #   nmeans, TName - accepted but not used in the body
+  groupLevels <- unique(group)
+  barCols <- rainbow(length(groupLevels),start=0.3,end=0.6)[match(group,groupLevels)]
+  totalRows <- nrow(cmat)
+  if (totalRows > nsamp) {
+    # too many rows to show: keep the first nsamp and say so in the title
+    cmat <- cmat[c(1:nsamp),]
+    plotTitle <- paste('Heatmap: Top ',nsamp,' DE contigs (of ',totalRows,')',sep='')
+  } else {
+    plotTitle <- paste(myTitle,'Heatmap: n contigs =',totalRows)
+  }
+  # sample names are cut to 20 characters for the axis labels
+  colnames(cmat) <- substr(colnames(cmat),1,20)
+  pdf(outpdfname)
+  heatmap(cmat,scale='row',main=plotTitle,cexRow=0.3,cexCol=0.4,Rowv=NA,ColSideColors=barCols)
+  dev.off()
+}
+qqPlot <- function(descr='qqplot',pvector, outpdf='qqplot.pdf',...)
+{
+  # Write a QQ plot of observed against expected -log10 p-values to a PDF.
+  # Adapted from https://gist.github.com/703512
+  #   descr   - plot title
+  #   pvector - p-values; missing values are dropped by sort() before plotting
+  #   outpdf  - path of the PDF to create
+  #   ...     - forwarded to plot()
+  # Uses the script-level constants reallybig and reallysmall to replace
+  # infinite values so the axis limits stay finite.
+  # When no usable p-value remains, a note is printed and no file is written.
+  o <- -log10(sort(pvector,decreasing=FALSE))
+  if (length(o) == 0) {
+    print.noquote(paste('qqPlot: no usable p-values for',descr,'- plot skipped'))
+    return(invisible(NULL))
+  }
+  # seq_along avoids the c(1,0) sequence that 1:length(o) gives for empty input
+  e <- -log10( seq_along(o)/length(o) )
+  o[o==-Inf] <- reallysmall
+  o[o==Inf] <- reallybig
+  pdf(outpdf)
+  # close the device even if plotting fails, so later plots are not written here
+  on.exit(dev.off(), add=TRUE)
+  plot(e,o,pch=19,cex=1, main=descr, ...,
+    xlab=expression(Expected~~-log[10](italic(p))),
+    ylab=expression(Observed~~-log[10](italic(p))),
+    xlim=c(0,max(e)), ylim=c(0,max(o)))
+  lines(e,e,col="red")
+  grid(col = "lightgray", lty = "dotted")
+  invisible(NULL)
+}
+smearPlot <- function(myDGEList,deTags, outSmear, outMain)
+{
+  # Save an edgeR smear plot to a PDF.
+  #   myDGEList - object handed to plotSmear
+  #   deTags    - tags passed to plotSmear as de.tags
+  #   outSmear  - path of the PDF to create
+  #   outMain   - plot title
+  pdf(file=outSmear)
+  plotSmear(myDGEList, main=outMain, de.tags=deTags)
+  grid(lty="dotted", col="lightgray")
+  dev.off()
+}
+boxPlot <- function(rawrs,cleanrs,maint,myTitle,pdfname)
+{
+  # Per-sample diagnostic plots for two count tables that have one column per sample.
+  #   rawrs   - values shown in the left boxplot panel and in the histograms
+  #   cleanrs - values shown in the right boxplot panel
+  #   maint   - text appended to the right panel title
+  #   myTitle - accepted but not used in the body
+  #   pdfname - path of the boxplot PDF
+  # Two files are written: pdfname (paired boxplots) and
+  # sample_counts_histogram.pdf (one histogram per column of rawrs, shared y axis).
+  # Column summaries of both tables are printed before plotting.
+  fullnames <- colnames(rawrs)   # untruncated names are reused as histogram subtitles
+  colnames(rawrs) <- substr(fullnames,1,20)
+  colnames(cleanrs) <- substr(colnames(cleanrs),1,20)
+  defpar <- par(no.readonly=TRUE)
+  print.noquote('@@@ Raw contig counts by sample:')
+  print.noquote(summary(rawrs))
+  print.noquote('@@@ Library size contig counts by sample:')
+  print.noquote(summary(cleanrs))
+  pdf(pdfname)
+  par(mfrow=c(1,2))
+  boxplot(rawrs,varwidth=TRUE,notch=TRUE,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main='log2 raw counts')
+  grid(col="lightgray",lty="dotted")
+  boxplot(cleanrs,varwidth=TRUE,notch=TRUE,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('log2 counts after ',maint))
+  grid(col="lightgray",lty="dotted")
+  dev.off()
+  # second file: histograms laid out on the smallest square grid that fits all samples
+  histpdf <- "sample_counts_histogram.pdf"
+  nc <- ncol(rawrs)
+  print.noquote(paste('Using ncol rawrs=',nc))
+  gridside <- round(sqrt(nc))
+  if (gridside*gridside < nc) { gridside <- gridside + 1 }
+  # tallest bin over all samples, so every panel can share one y axis
+  peaks <- c()
+  for (j in c(1:nc)) {
+    peaks <- append(peaks,max(hist(rawrs[,j],breaks=100,plot=FALSE)\$counts))
+  }
+  ymax <- max(peaks)
+  nnames <- length(fullnames)
+  if (nnames > 20) {
+    # enlarge the page in proportion to the number of samples
+    side <- 7*nnames/20
+    pdf(histpdf,width=side,height=side)
+  } else {
+    pdf(histpdf)
+  }
+  par(mfrow=c(gridside,gridside))
+  for (j in c(1:nc)) {
+    hist(rawrs[,j], main=paste("Contig logcount",j), xlab='log raw count', col="maroon",
+      breaks=100,sub=fullnames[j],cex=0.8,ylim=c(0,ymax))
+  }
+  dev.off()
+  par(defpar)
+}
+cumPlot <- function(rawrs,cleanrs,maint,myTitle)
+{
+  # Two stacked histograms of log10 values, before and after filtering, written to
+  # Differential_rowsum_bar_charts.pdf with a common x range.
+  #   rawrs   - values before filtering (the caller passes row totals)
+  #   cleanrs - values after filtering
+  #   maint   - filter description appended to both panel titles
+  #   myTitle - shown as the subtitle of both panels
+  defpar <- par(no.readonly=TRUE)
+  lograw <- log(rawrs,10)
+  xtop <- max(lograw)   # upper x limit for both panels comes from the unfiltered data
+  pdf("Differential_rowsum_bar_charts.pdf")
+  par(mfrow=c(2,1))
+  hist(lograw,breaks=100,main=paste('Before:',maint),xlab="# Reads (log)",
+    ylab="Count",col="maroon",sub=myTitle, xlim=c(0,xtop),las=1)
+  grid(col="lightgray", lty="dotted")
+  logclean <- log(cleanrs,10)
+  hist(logclean,breaks=100,main=paste('After:',maint),xlab="# Reads (log)",
+    ylab="Count",col="maroon",sub=myTitle,xlim=c(0,xtop),las=1)
+  grid(col="lightgray", lty="dotted")
+  dev.off()
+  par(defpar)
+}
+cumPlot1 <- function(rawrs,cleanrs,maint,myTitle)
+{
+  # Cumulative proportion curves before and after filtering, built from the knots of
+  # the empirical distribution of each input and drawn on a log x axis.
+  #   rawrs, cleanrs - values before and after filtering
+  #   maint          - text appended to both panel titles
+  #   myTitle        - subtitle; with spaces removed it also prefixes the PDF file name
+  outfile <- paste(gsub(" ","", myTitle , fixed=TRUE),"RowsumCum.pdf",sep='_')
+  pdf(outfile)
+  par(mfrow=c(2,1))
+  xtop <- max(rawrs)   # both panels share the x range of the unfiltered data
+  rawKnots <- knots(ecdf(rawrs))
+  cleanKnots <- knots(ecdf(cleanrs))
+  cleanProp <- 1:length(cleanKnots)/length(cleanKnots)
+  rawProp <- 1:length(rawKnots)/length(rawKnots)
+  plot(rawKnots,rawProp,type='l',main=paste('Before',maint),xlab="Log Contig Total Reads",
+    ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,xtop),sub=myTitle)
+  grid(col="blue")
+  plot(cleanKnots,cleanProp,type='l',main=paste('After',maint),xlab="Log Contig Total Reads",
+    ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,xtop),sub=myTitle)
+  grid(col="blue")
+  dev.off()
+}
+doGSEAold <- function(y=NULL,design=NULL,histgmt="",
+bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
+ntest=0, myTitle="myTitle", outfname="GSEA.xls", minnin=5, maxnin=2000,fdrthresh=0.05,fdrtype="BH")
+{
+  # Older gene set test driver: calls camera() once per gene set and collects the
+  # results into separate up and down tables.
+  #   y, design      - passed unchanged to camera()
+  #   histgmt, bigmt - paths of tab separated gene set files, one set per line:
+  #                    set id, url, then member gene symbols; "" skips a file
+  #   ntest          - ignored; every gene set that was read is considered
+  #   myTitle        - used in the header line written to outfname
+  #   outfname       - header file; tables go to camera_up_ and camera_down_ plus this name
+  #   minnin, maxnin - exclusive bounds on how many rows of y a set must match
+  #   fdrthresh      - adjusted p-value cutoff used to decide how many sets to print
+  #   fdrtype        - method handed to p.adjust
+  # Everything printed while the function runs is diverted to Camera.log.
+  sink('Camera.log')
+  on.exit(sink(), add=TRUE)   # give output back to the caller even when camera fails
+  genesets <- character(0)
+  if (bigmt > "") { genesets <- readLines(bigmt) }
+  if (histgmt > "") {
+    # plain concatenation; rbind would build a matrix and recycle the shorter file
+    genesets <- c(genesets,readLines(histgmt))
+  }
+  print.noquote(paste("@@@read",length(genesets), 'genesets from',histgmt,bigmt))
+  genesets <- strsplit(genesets,'\t') # tabular. genesetid\tURLorwhatever\tgene_1\t..\tgene_n
+  write(paste(myTitle,'edgeR GSEA'),file=outfname,append=FALSE)
+  urownames <- toupper(rownames(y))
+  upcam <- c()
+  downcam <- c()
+  for (i in seq_along(genesets)) {
+    gs <- unlist(genesets[i])
+    g <- gs[1] # geneset_id
+    u <- gs[2]
+    if (!is.na(u) && u > "") { u <- paste("<a href=\'",u,"\'>",u,"</a>",sep="") }
+    # drop id and url; unlike gs[3:length(gs)] this is safe for records with fewer than 3 fields
+    glist <- toupper(gs[-(1:2)])
+    inglist <- urownames %in% glist
+    nin <- sum(inglist)
+    if ((nin > minnin) && (nin < maxnin)) {
+      camres <- camera(y=y,index=inglist,design=design)
+      if (! is.null(camres)) {
+        rownames(camres) <- g # gene set name
+        camres <- cbind(GeneSet=g,URL=u,camres)
+        if (camres\$Direction == "Up") {
+          upcam <- rbind(upcam,camres)
+        } else {
+          downcam <- rbind(downcam,camres)
+        }
+      }
+    }
+  }
+  uscam <- upcam[order(upcam\$PValue),]
+  uscam\$adjPValue <- p.adjust(uscam\$PValue,method=fdrtype)
+  nup <- max(10,sum((uscam\$adjPValue < fdrthresh)))
+  dscam <- downcam[order(downcam\$PValue),]
+  dscam\$adjPValue <- p.adjust(dscam\$PValue,method=fdrtype)
+  ndown <- max(10,sum((dscam\$adjPValue < fdrthresh)))
+  write.table(uscam,file=paste('camera_up',outfname,sep='_'),quote=FALSE,sep='\t',row.names=FALSE)
+  write.table(dscam,file=paste('camera_down',outfname,sep='_'),quote=FALSE,sep='\t',row.names=FALSE)
+  print.noquote(paste('@@@@@ Camera up top',nup,'gene sets:'))
+  write.table(head(uscam,nup),file="",quote=FALSE,sep='\t',row.names=FALSE)
+  print.noquote(paste('@@@@@ Camera down top',ndown,'gene sets:'))
+  write.table(head(dscam,ndown),file="",quote=FALSE,sep='\t',row.names=FALSE)
+}
+doGSEA <- function(y=NULL,design=NULL,histgmt="",
+bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
+ntest=0, myTitle="myTitle", outfname="GSEA.xls", minnin=5, maxnin=2000,fdrthresh=0.05,fdrtype="BH")
+{
+  # Gene set test driver: collects all eligible gene sets and tests them in a single
+  # camera() call, then splits the result into up and down tables.
+  #   y, design      - passed unchanged to camera()
+  #   histgmt, bigmt - paths of tab separated gene set files, one set per line:
+  #                    set id, url, then member gene symbols; "" skips a file
+  #   ntest          - ignored; every gene set that was read is considered
+  #   myTitle        - used in the header line written to outfname
+  #   outfname       - header file; tables go to camera_up_ and camera_down_ plus this name
+  #   minnin, maxnin - exclusive bounds on how many rows of y a set must match
+  #   fdrthresh      - adjusted p-value cutoff used to decide how many sets to print
+  #   fdrtype        - method handed to p.adjust
+  # Everything printed while the function runs is diverted to Camera.log.
+  sink('Camera.log')
+  on.exit(sink(), add=TRUE)   # give output back to the caller even when camera fails
+  genesets <- character(0)
+  if (bigmt > "") { genesets <- readLines(bigmt) }
+  if (histgmt > "") {
+    # plain concatenation; rbind would build a matrix and recycle the shorter file
+    genesets <- c(genesets,readLines(histgmt))
+  }
+  print.noquote(paste("@@@read",length(genesets), 'genesets from',histgmt,bigmt))
+  genesets <- strsplit(genesets,'\t') # tabular. genesetid\tURLorwhatever\tgene_1\t..\tgene_n
+  write(paste(myTitle,'edgeR GSEA'),file=outfname,append=FALSE)
+  urownames <- toupper(rownames(y))
+  incam <- list()   # one vector of row positions per accepted gene set
+  gsids <- c()
+  urls <- c()
+  for (gs in genesets) {
+    gsid <- gs[1] # geneset_id
+    url <- gs[2]
+    if (!is.na(url) && url > "") { url <- paste("<a href=\'",url,"\'>",url,"</a>",sep="") }
+    # drop id and url; safe for records with fewer than 3 fields
+    inglist <- urownames %in% toupper(gs[-(1:2)])
+    nin <- sum(inglist)
+    if ((nin > minnin) && (nin < maxnin)) {
+      # keep each set as its own list element; c() would flatten all sets into one vector
+      incam[[length(incam)+1]] <- which(inglist)
+      gsids <- c(gsids,gsid)
+      urls <- c(urls,url)
+    }
+  }
+  if (length(incam) == 0) {
+    print.noquote('@@@ No gene sets passed the size filter - camera not run')
+    return(invisible(NULL))
+  }
+  # names must be unique because camera uses them as row names of its result
+  names(incam) <- make.unique(gsids)
+  names(urls) <- names(incam)
+  allcam <- camera(y=y,index=incam,design=design)
+  # NOTE(review): camera is expected to return one row per set, named after the index
+  # list and ordered by p-value by default - confirm against the installed limma.
+  # Ids and urls are therefore looked up by row name instead of bound by position.
+  tested <- rownames(allcam)
+  allcamres <- cbind(geneset=tested,allcam,URL=urls[tested])
+  isup <- allcamres\$Direction == "Up"
+  upcam <- allcamres[isup,,drop=FALSE]
+  downcam <- allcamres[!isup,,drop=FALSE]
+  uscam <- upcam[order(upcam\$PValue),]
+  uscam\$adjPValue <- p.adjust(uscam\$PValue,method=fdrtype)
+  nup <- max(10,sum((uscam\$adjPValue < fdrthresh)))
+  dscam <- downcam[order(downcam\$PValue),]
+  dscam\$adjPValue <- p.adjust(dscam\$PValue,method=fdrtype)
+  ndown <- max(10,sum((dscam\$adjPValue < fdrthresh)))
+  write.table(uscam,file=paste('camera_up',outfname,sep='_'),quote=FALSE,sep='\t',row.names=FALSE)
+  write.table(dscam,file=paste('camera_down',outfname,sep='_'),quote=FALSE,sep='\t',row.names=FALSE)
+  print.noquote(paste('@@@@@ Camera up top',nup,'gene sets:'))
+  write.table(head(uscam,nup),file="",quote=FALSE,sep='\t',row.names=FALSE)
+  print.noquote(paste('@@@@@ Camera down top',ndown,'gene sets:'))
+  write.table(head(dscam,ndown),file="",quote=FALSE,sep='\t',row.names=FALSE)
+}
+edgeIt = function (Count_Matrix=c(),group=c(),out_edgeR=F,out_Voom=F,out_DESeq2=F,fdrtype='fdr',priordf=5,
+fdrthresh=0.05,outputdir='.', myTitle='Differential Counts',libSize=c(),useNDF=F,
+filterquantile=0.2, subjects=c(),TreatmentName="Rx",ControlName="Ctrl",mydesign=NULL,
+doDESeq2=T,doVoom=T,doCamera=T,doedgeR=T,org='hg19',
+histgmt="", bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
+doCook=F,DESeq_fitType="parameteric",robust_meth='ordinary')
+{
+logf = file('Differential.log', open = "a")
+sink(logf,type = c("output", "message"))
+run_edgeR <- function(workCM,pdata,subjects,group,priordf,robust_meth,mydesign,mt,cmrowsums,out_edgeR,nonzerod)
+{
+  # edgeR analysis: estimate dispersions, fit the GLM, run a likelihood ratio test on
+  # the last design column, write the sorted result table and the diagnostic PDFs.
+  #   workCM      - filtered count matrix
+  #   group       - sample group labels
+  #   priordf     - prior.df for tagwise dispersion; values <= 0 use the edgeR default
+  #   robust_meth - 'ordinary' for the standard estimators, anything else is passed
+  #                 to estimateGLMRobustDisp as residual.type
+  #   mydesign    - design matrix
+  #   mt          - title text used inside output file names
+  #   cmrowsums   - row totals of workCM, reported as totreads
+  #   out_edgeR   - path of the result table
+  #   pdata, subjects, nonzerod - accepted but not used in the body
+  # Also reads fdrtype, fdrthresh, myTitle, TName, CName, contigurls and allgenes
+  # from the enclosing edgeIt call.
+  # Returns list(myDGEList, edgeRcounts); edgeRcounts is a 0/1 vector over allgenes
+  # marking rows significant at fdrthresh.
+  logf <- file('edgeR.log', open = "a")
+  sink(logf,type = c("output", "message"))
+  # undo the sink and release the log file however the function exits
+  on.exit({ sink(); close(logf) }, add=TRUE)
+  # Setup myDGEList object
+  myDGEList <- DGEList(counts=workCM, group = group)
+  myDGEList <- calcNormFactors(myDGEList)
+  if (robust_meth == 'ordinary') {
+    myDGEList <- estimateGLMCommonDisp(myDGEList,mydesign)
+    myDGEList <- estimateGLMTrendedDisp(myDGEList,mydesign)
+    if (priordf > 0) {
+      myDGEList <- estimateGLMTagwiseDisp(myDGEList,mydesign,prior.df = priordf)
+    } else {
+      myDGEList <- estimateGLMTagwiseDisp(myDGEList,mydesign)
+    }
+    comdisp <- myDGEList\$common.dispersion
+    estpriorn <- getPriorN(myDGEList)
+    print(paste("Common Dispersion =",comdisp,"CV = ",sqrt(comdisp),"getPriorN = ",estpriorn),quote=FALSE)
+  } else {
+    myDGEList <- estimateGLMRobustDisp(myDGEList,design=mydesign, prior.df = priordf, maxit = 6, residual.type = robust_meth)
+  }
+  DGLM <- glmFit(myDGEList,design=mydesign)
+  DE <- glmLRT(DGLM,coef=ncol(DGLM\$design)) # always last one - subject is first if needed
+  normData <- cpm(myDGEList)
+  porder <- order(DE\$table\$PValue)   # p value order, reused for every sorted view below
+  uoutput <- cbind(
+    Name=as.character(rownames(myDGEList\$counts)),
+    DE\$table,
+    adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
+    Dispersion=myDGEList\$tagwise.dispersion,totreads=cmrowsums,normData,
+    myDGEList\$counts
+  )
+  soutput <- uoutput[porder,] # sorted into p value order - for quick toptable
+  goodness <- gof(DGLM, pcutoff=fdrthresh)
+  if (sum(goodness\$outlier) > 0) {
+    print.noquote('GLM outliers:')
+    print(paste(rownames(DGLM)[(goodness\$outlier)],collapse=','),quote=FALSE)
+  } else {
+    print('No GLM fit outlier genes found\n')
+  }
+  z <- limma::zscoreGamma(goodness\$gof.statistic, shape=goodness\$df/2, scale=2)
+  pdf(paste("edgeR",mt,"GoodnessofFit.pdf",sep='_'))
+  qq <- qqnorm(z, panel.first=grid(), main="tagwise dispersion")
+  abline(0,1,lwd=3)
+  points(qq\$x[goodness\$outlier],qq\$y[goodness\$outlier], pch=16, col="maroon")
+  dev.off()
+  write.table(soutput,file=out_edgeR, quote=FALSE, sep="\t",row.names=FALSE)
+  tt <- cbind(
+    Name=as.character(rownames(myDGEList)),
+    DE\$table,
+    adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
+    Dispersion=myDGEList\$tagwise.dispersion,totreads=cmrowsums
+  )
+  tt <- cbind(tt,URL=contigurls) # add to end so table isn't laid out strangely
+  stt <- tt[porder,]
+  print.noquote("@@ edgeR Top tags\n")
+  # head() stops at the last row instead of padding a short table with NA rows
+  print.noquote(head(stt,50))
+  deTags <- rownames(uoutput[uoutput\$adj.p.value < fdrthresh,])
+  nsig <- length(deTags)
+  print.noquote(paste('@@',nsig,'tags significant at adj p=',fdrthresh))
+  pdf(paste("edgeR",mt,"BCV_vs_abundance.pdf",sep="_"))
+  plotBCV(myDGEList, cex=0.3, main="Biological CV vs abundance")
+  dev.off()
+  outpdfname <- paste("edgeR",mt,"top_100_heatmap.pdf",sep="_")
+  # head() keeps the matrix shape and does not fail when fewer than 100 rows remain
+  ocpm <- head(normData[porder,],100)
+  hmap2(ocpm,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste(myTitle,'Heatmap'))
+  outSmear <- paste("edgeR",mt,"smearplot.pdf",sep="_")
+  outMain <- paste("Smear Plot for ",TName,' Vs ',CName,' (FDR@',fdrthresh,' N = ',nsig,')',sep='')
+  smearPlot(myDGEList=myDGEList,deTags=deTags, outSmear=outSmear, outMain = outMain)
+  qqPlot(descr=paste(myTitle,'edgeR adj p QQ plot'),pvector=tt\$adj.p.value,outpdf=paste('edgeR',mt,'qqplot.pdf',sep='_'))
+  topresults.edgeR <- soutput[which(soutput\$adj.p.value < fdrthresh), ]
+  edgeRcountsindex <- which(allgenes %in% rownames(topresults.edgeR))
+  edgeRcounts <- rep(0, length(allgenes))
+  edgeRcounts[edgeRcountsindex] <- 1  # membership flags for the venn diagram of hits
+  list(myDGEList=myDGEList,edgeRcounts=edgeRcounts)
+} # end of run_edgeR
+run_DESeq2 <- function(workCM,pdata,subjects,group,out_DESeq2,mt,DESeq_fitType)
+{
+  # DESeq2 analysis: Wald test on the group effect (blocked on subject when subjects
+  # are supplied), sorted result table and diagnostic PDFs.
+  #   workCM        - filtered count matrix
+  #   pdata         - ignored; the sample table is rebuilt here
+  #   subjects      - optional subject ids, one per column of workCM
+  #   group         - sample group labels
+  #   out_DESeq2    - path of the result table
+  #   mt            - title text used inside output file names
+  #   DESeq_fitType - fitType handed to estimateDispersions
+  # Also reads cmrowsums, contigurls, myTitle, fdrthresh and allgenes from the
+  # enclosing edgeIt call.
+  # Returns a 0/1 vector over allgenes marking rows with padj below fdrthresh.
+  logf <- file("DESeq2.log", open = "a")
+  sink(logf,type = c("output", "message"))
+  # undo the sink and release the log file however the function exits
+  on.exit({ sink(); close(logf) }, add=TRUE)
+  # library() stops at once when the package is missing; require() would only warn
+  library('DESeq2')
+  library('RColorBrewer')
+  if (length(subjects) == 0) {
+    pdata <- data.frame(Name=colnames(workCM),Rx=group,row.names=colnames(workCM))
+    deSEQds <- DESeqDataSetFromMatrix(countData = workCM,  colData = pdata, design = formula(~ Rx))
+  } else {
+    pdata <- data.frame(Name=colnames(workCM),Rx=group,subjects=subjects,row.names=colnames(workCM))
+    deSEQds <- DESeqDataSetFromMatrix(countData = workCM,  colData = pdata, design = formula(~ subjects + Rx))
+  }
+  deSeqDatsizefac <- estimateSizeFactors(deSEQds)
+  deSeqDatdisp <- estimateDispersions(deSeqDatsizefac,fitType=DESeq_fitType)
+  resDESeq <- nbinomWaldTest(deSeqDatdisp)
+  rDESeq <- as.data.frame(results(resDESeq))
+  rDESeq <- cbind(Contig=rownames(workCM),rDESeq,NReads=cmrowsums,URL=contigurls)
+  srDESeq <- rDESeq[order(rDESeq\$pvalue),]
+  qqPlot(descr=paste(myTitle,'DESeq2 adj p qq plot'),pvector=rDESeq\$padj,outpdf=paste('DESeq2',mt,'qqplot.pdf',sep="_"))
+  cat("# DESeq top 50\n")
+  # head() stops at the last row instead of padding a short table with NA rows
+  print.noquote(head(srDESeq,50))
+  write.table(srDESeq,file=out_DESeq2, quote=FALSE, sep="\t",row.names=FALSE)
+  topresults.DESeq <- rDESeq[which(rDESeq\$padj < fdrthresh), ]
+  DESeqcountsindex <- which(allgenes %in% rownames(topresults.DESeq))
+  DESeqcounts <- rep(0, length(allgenes))
+  DESeqcounts[DESeqcountsindex] <- 1
+  pdf(paste("DESeq2",mt,"dispersion_estimates.pdf",sep='_'))
+  plotDispEsts(resDESeq)
+  dev.off()
+  # na.rm keeps a single missing fold change from turning the axis limit into NA
+  ysmall <- abs(min(rDESeq\$log2FoldChange,na.rm=TRUE))
+  ybig <- abs(max(rDESeq\$log2FoldChange,na.rm=TRUE))
+  ylimit <- min(4,ysmall,ybig)
+  pdf(paste("DESeq2",mt,"MA_plot.pdf",sep="_"))
+  plotMA(resDESeq,main=paste(myTitle,"DESeq2 MA plot"),ylim=c(-ylimit,ylimit))
+  dev.off()
+  rlogres <- rlogTransformation(resDESeq)
+  sampledists <- dist( t( assay(rlogres) ) )
+  sdmat <- as.matrix(sampledists)
+  pdf(paste("DESeq2",mt,"sample_distance_plot.pdf",sep="_"))
+  heatmap.2(sdmat,trace="none",main=paste(myTitle,"DESeq2 sample distances"),
+    col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(255))
+  dev.off()
+  result <- try( (ppca <- plotPCA( varianceStabilizingTransformation(deSeqDatdisp,blind=TRUE), intgroup=c("Rx","Name")) ) )
+  if (inherits(result,"try-error")) {
+    print.noquote('DESeq2 plotPCA failed.')
+  } else {
+    pdf(paste("DESeq2",mt,"PCA_plot.pdf",sep="_"))
+    # presumably plotPCA returns a plot object that is only drawn when printed - confirm
+    print(ppca)
+    dev.off()
+  }
+  DESeqcounts
+}
+run_Voom <- function(workCM,pdata,subjects,group,mydesign,mt,out_Voom)
+{
+  # voom plus limma analysis: voom transform, linear model fit, moderated test on the
+  # last design column, sorted result table and diagnostic PDFs.
+  #   workCM   - filtered count matrix
+  #   group    - sample group labels
+  #   mydesign - design matrix
+  #   mt       - title text used inside output file names
+  #   out_Voom - path of the result table
+  #   pdata, subjects - accepted but not used in the body
+  # Also reads doedgeR, fdrtype, fdrthresh, myTitle, cmrowsums, contigurls and
+  # allgenes from the enclosing edgeIt call. When edgeR has been run, myDGEList is
+  # taken from there as well; otherwise it is built here.
+  # Returns a 0/1 vector over allgenes marking rows with adj.P.Val below fdrthresh.
+  logf <- file('VOOM.log', open = "a")
+  sink(logf,type = c("output", "message"))
+  # undo the sink and release the log file however the function exits
+  on.exit({ sink(); close(logf) }, add=TRUE)
+  if (doedgeR == FALSE) {
+    # Setup myDGEList object
+    myDGEList <- DGEList(counts=workCM, group = group)
+    myDGEList <- calcNormFactors(myDGEList)
+    myDGEList <- estimateGLMCommonDisp(myDGEList,mydesign)
+    myDGEList <- estimateGLMTrendedDisp(myDGEList,mydesign)
+    myDGEList <- estimateGLMTagwiseDisp(myDGEList,mydesign)
+  }
+  pdf(paste("VOOM",mt,"mean_variance_plot.pdf",sep='_'))
+  # method name spelled out in full rather than relying on partial matching
+  dat.voomed <- voom(myDGEList, mydesign, plot = TRUE, normalize.method="quantile", lib.size = NULL)
+  dev.off()
+  # Use limma to fit data
+  fit <- lmFit(dat.voomed, mydesign)
+  fit <- eBayes(fit)
+  # argument names spelled out in full; sort.by="none" keeps input row order so the
+  # row names assigned below line up with workCM
+  rvoom <- topTable(fit, coef = length(colnames(mydesign)), adjust.method = fdrtype, number = Inf, sort.by="none")
+  qqPlot(descr=paste(myTitle,'VOOM-limma adj p QQ plot'),pvector=rvoom\$adj.P.Val,outpdf=paste('VOOM',mt,'qqplot.pdf',sep='_'))
+  rownames(rvoom) <- rownames(workCM)
+  rvoom <- cbind(Contig=rownames(workCM),rvoom,NReads=cmrowsums,URL=contigurls)
+  srvoom <- rvoom[order(rvoom\$P.Value),]
+  cat("# VOOM top 50\n")
+  # head() stops at the last row instead of padding a short table with NA rows
+  print(head(srvoom,50))
+  write.table(srvoom,file=out_Voom, quote=FALSE, sep="\t",row.names=FALSE)
+  # Use an FDR cutoff to find interesting samples for edgeR, DESeq and voom/limma
+  topresults.voom <- rvoom[which(rvoom\$adj.P.Val < fdrthresh), ]
+  voomcountsindex <- which(allgenes %in% rownames(topresults.voom))
+  voomcounts <- rep(0, length(allgenes))
+  voomcounts[voomcountsindex] <- 1
+  voomcounts
+}
+# data cleaning and analysis control starts here
+# Steps: check there are two groups, drop empty and low count rows, build contig
+# urls and the design matrix, run each selected method, then draw a venn diagram
+# of the significant rows whenever at least two methods ran.
+# Error handling
+nugroup <- length(unique(group))
+if (nugroup!=2){
+  print("Number of conditions identified in experiment does not equal 2")
+  # NOTE(review): q() ends the script without an error status - confirm that is intended
+  q()
+}
+library(edgeR)
+options(width = 512)
+mt <- paste(unlist(strsplit(myTitle,'_')),collapse=" ")
+allN <- nrow(Count_Matrix)
+nscut <- round(ncol(Count_Matrix)/2) # half samples
+colTotmillionreads <- colSums(Count_Matrix)/1e6
+rawrs <- rowSums(Count_Matrix)
+nonzerod <- Count_Matrix[(rawrs > 0),] # remove all zero count genes
+nzN <- nrow(nonzerod)
+nzrs <- rowSums(nonzerod)
+zN <- allN - nzN
+print('@@@ Quantiles for non-zero row counts:',quote=FALSE)
+print(quantile(nzrs,probs=seq(0,1,0.1)),quote=FALSE)
+if (useNDF == TRUE) {
+  # keep rows reaching 1 read per million in at least half of the samples
+  gt1rpin3 <- rowSums(Count_Matrix/expandAsMatrix(colTotmillionreads,dim(Count_Matrix)) >= 1) >= nscut
+  lo <- colSums(Count_Matrix[!gt1rpin3,])
+  workCM <- Count_Matrix[gt1rpin3,]
+  cleanrs <- rowSums(workCM)
+  cleanN <- length(cleanrs)
+  # report the number of rows removed; length(lo) would be the number of samples
+  meth <- paste( "After removing",sum(!gt1rpin3),"contigs with fewer than ",nscut," sample read counts >= 1 per million, there are",sep="")
+  print(paste("Read",allN,"contigs. Removed",zN,"contigs with no reads.",meth,cleanN,"contigs"),quote=FALSE)
+  maint <- paste('Filter >=1/million reads in >=',nscut,'samples')
+} else {
+  # keep rows whose total is above the requested quantile of the non-zero row totals
+  useme <- (nzrs > quantile(nzrs,filterquantile))
+  workCM <- nonzerod[useme,]
+  lo <- colSums(nonzerod[!useme,])
+  cleanrs <- rowSums(workCM)
+  cleanN <- length(cleanrs)
+  meth <- paste("After filtering at count quantile =",filterquantile,", there are",sep="")
+  print(paste('Read',allN,"contigs. Removed",zN,"with no reads.",meth,cleanN,"contigs"),quote=FALSE)
+  maint <- paste('Filter below',filterquantile,'quantile')
+}
+cumPlot(rawrs=rawrs,cleanrs=cleanrs,maint=maint,myTitle=myTitle)
+allgenes <- rownames(workCM)
+reg <- "^chr([0-9]+):([0-9]+)-([0-9]+)" # ucsc chr:start-end regexp
+genecards <- "<a href=\'http://www.genecards.org/index.php?path=/Search/keyword/"
+ucsc <- paste("<a href=\'http://genome.ucsc.edu/cgi-bin/hgTracks?db=",org,sep='')
+testreg <- str_match(allgenes,reg)
+if (sum(!is.na(testreg[,1]))/length(testreg[,1]) > 0.8) { # is ucsc style string
+  print("@@ using ucsc substitution for urls")
+  contigurls <- paste0(ucsc,"&amp;position=chr",testreg[,2],":",testreg[,3],"-",testreg[,4],"\'>",allgenes,"</a>")
+} else {
+  print("@@ using genecards substitution for urls")
+  contigurls <- paste0(genecards,allgenes,"\'>",allgenes,"</a>")
+}
+# NOTE(review): lo holds per sample column sums of the removed rows, so table(lo)
+# tabulates those sums rather than listing them - confirm what this line should report
+print.noquote(paste("@@ Total low count contigs per sample = ",paste(table(lo),collapse=',')))
+cmrowsums <- rowSums(workCM)
+# TName and CName are read by the nested run_ functions for plot titles
+TName <- unique(group)[1]
+CName <- unique(group)[2]
+if (is.null(mydesign)) {
+  if (length(subjects) == 0) {
+    mydesign <- model.matrix(~group)
+  } else {
+    subjf <- factor(subjects)
+    mydesign <- model.matrix(~subjf+group) # we block on subject so make group last to simplify finding it
+  }
+}
+print.noquote(paste('Using samples:',paste(colnames(workCM),collapse=',')))
+print.noquote('Using design matrix:')
+print.noquote(mydesign)
+print(paste('Raw sample read totals',paste(colSums(nonzerod,na.rm=TRUE),collapse=',')))
+if (doedgeR == TRUE) {
+  eres <- run_edgeR(workCM,pdata,subjects,group,priordf,robust_meth,mydesign,mt,cmrowsums,out_edgeR,nonzerod)
+  myDGEList <- eres\$myDGEList
+  edgeRcounts <- eres\$edgeRcounts
+  # Plot MDS
+  sample_colors <- match(group,levels(group))
+  sampleTypes <- levels(factor(group))
+  print.noquote(sampleTypes)
+  pdf(paste("edgeR",mt,"MDSplot.pdf",sep='_'))
+  plotMDS.DGEList(myDGEList,main=paste("MDS for",myTitle),cex=0.5,col=sample_colors,pch=sample_colors)
+  legend(x="topleft", legend = sampleTypes,col=c(1:length(sampleTypes)), pch=19)
+  grid(col="blue")
+  dev.off()
+  scale <- myDGEList\$samples\$lib.size*myDGEList\$samples\$norm.factors
+  normCounts <- round(t(t(myDGEList\$counts)/scale)*mean(scale))
+  # the raw panel used an undefined variable, so the silenced try always failed;
+  # log2 of the non-zero rows matches the scale of the normalised panel
+  nzd <- log2(nonzerod+1)
+  bres <- try({boxPlot(rawrs=nzd,cleanrs=log2(normCounts+1),maint='Effects of TMM size normalisation',myTitle=myTitle,pdfname=paste("edgeR",mt,"raw_norm_counts_box.pdf",sep='_'))},silent=TRUE)
+  if (inherits(bres,"try-error")) { print.noquote(paste('boxPlot failed:',bres)) }
+}
+if (doDESeq2 == TRUE) { DESeqcounts <- run_DESeq2(workCM,pdata,subjects,group,out_DESeq2,mt,DESeq_fitType) }
+if (doVoom == TRUE) { voomcounts <- run_Voom(workCM,pdata,subjects,group,mydesign,mt,out_Voom) }
+if (doCamera) {
+  doGSEA(y=myDGEList,design=mydesign,histgmt=histgmt,bigmt=bigmt,ntest=20,myTitle=myTitle,
+    outfname=paste("GSEA_Camera",mt,"table.xls",sep="_"),fdrthresh=fdrthresh,fdrtype=fdrtype)
+}
+# stays NULL unless at least two methods ran
+counts.dataframe <- NULL
+vennmain <- 'no venn'
+if ((doDESeq2==TRUE) || (doVoom==TRUE) || (doedgeR==TRUE)) {
+  if ((doVoom==TRUE) && (doDESeq2==TRUE) && (doedgeR==TRUE)) {
+    vennmain <- paste(mt,'Voom,edgeR and DESeq2 overlap at FDR=',fdrthresh)
+    counts.dataframe <- data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts,
+      VOOM_limma = voomcounts, row.names = allgenes)
+  } else if ((doDESeq2==TRUE) && (doedgeR==TRUE))  {
+    vennmain <- paste(mt,'DESeq2 and edgeR overlap at FDR=',fdrthresh)
+    counts.dataframe <- data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts, row.names = allgenes)
+  } else if ((doVoom==TRUE) && (doedgeR==TRUE)) {
+    vennmain <- paste(mt,'Voom and edgeR overlap at FDR=',fdrthresh)
+    counts.dataframe <- data.frame(edgeR = edgeRcounts, VOOM_limma = voomcounts, row.names = allgenes)
+  } else if ((doDESeq2==TRUE) && (doVoom==TRUE)) {
+    vennmain <- paste(mt,'DESeq2 and Voom overlap at FDR=',fdrthresh)
+    counts.dataframe <- data.frame(DESeq2 = DESeqcounts, VOOM_limma = voomcounts, row.names = allgenes)
+  }
+  # the comparison belongs outside nrow(); with a single method there is nothing to
+  # compare and the venn diagram is skipped instead of stopping the run
+  if (!is.null(counts.dataframe) && nrow(counts.dataframe) > 1) {
+    counts.venn <- vennCounts(counts.dataframe)
+    vennf <- paste("Differential_venn",mt,"significant_genes_overlap.pdf",sep="_")
+    pdf(vennf)
+    vennDiagram(counts.venn,main=vennmain,col="maroon")
+    dev.off()
+  }
+} # end of method overlap section
+sink()
+}
+#### Done
+]]>
+builtin_gmt = "" # no built-in gene set file is configured here
+history_gmt = "" # no gene set file is taken from the history
+history_gmt_name = ""
+out_edgeR = F # stays F unless the edgeR section below supplies an output path
+out_DESeq2 = F # stays F unless the DESeq2 section below supplies an output path
+out_Voom = "$out_VOOM"
+edgeR_robust_meth = "ordinary" # default, replaced below when edgeR is selected
+doDESeq2 = $DESeq2.doDESeq2
+doVoom = $doVoom
+doCamera = F # camera gene set testing is always off in this script
+doedgeR = $edgeR.doedgeR
+edgeR_priordf = 10 # default, replaced below when edgeR is selected
+#if $doVoom == "T":
+out_Voom = "$out_VOOM"
+#end if
+#if $DESeq2.doDESeq2 == "T":
+out_DESeq2 = "$out_DESeq2"
+doDESeq2 = T
+DESeq_fitType = "$DESeq2.DESeq_fitType"
+#end if
+#if $edgeR.doedgeR == "T":
+out_edgeR = "$out_edgeR"
+edgeR_priordf = $edgeR.edgeR_priordf
+edgeR_robust_meth = "$edgeR.edgeR_robust_method"
+#end if
+if (sum(c(doedgeR,doVoom,doDESeq2)) == 0) # none of the three methods was selected
+{
+write("No methods chosen - nothing to do! Please try again after choosing one or more methods", stderr())
+quit(save="no",status=2)
+}
+Out_Dir = "$html_file.files_path"
+Input =  "$input1"
+TreatmentName = "$treatment_name"
+TreatmentCols = "$Treat_cols"
+ControlName = "$control_name"
+ControlCols= "$Control_cols"
+org = "$input1.dbkey"
+if (org == "") { org = "hg19"} # fall back to hg19 when the input has no dbkey
+fdrtype = "$fdrtype"
+fdrthresh = $fdrthresh
+useNDF = $useNDF
+fQ = $fQ # non-differential centile cutoff
+myTitle = "$title"
+sids = strsplit("$subjectids",',')
+subjects = unlist(sids) # comma separated subject ids as a character vector
+nsubj = length(subjects)
+TCols = as.numeric(strsplit(TreatmentCols,",")[[1]])-1 # shifted down by one; presumably because the first input column becomes the row names - confirm
+CCols = as.numeric(strsplit(ControlCols,",")[[1]])-1 # same shift as TCols
+cat('Got TCols=')
+cat(TCols)
+cat('; CCols=')
+cat(CCols)
+cat('\n')
+<![CDATA[
+# Driver: read the count matrix, check the subject ids, put treatment columns ahead
+# of control columns, drop the library size rows, label the columns by group and
+# hand everything to edgeIt.
+useCols <- c(TCols,CCols)
+if (!file.exists(Out_Dir)) dir.create(Out_Dir)
+Count_Matrix <- read.table(Input,header=TRUE,row.names=1,sep='\t')
+snames <- colnames(Count_Matrix)
+nsamples <- length(snames)
+# scalar test, so use the short-circuit operator
+if (nsubj > 0 && nsubj != nsamples) {
+  options("show.error.messages"=TRUE)
+  mess <- paste('Fatal error: Supplied subject id list',paste(subjects,collapse=','),
+    'has length',nsubj,'but there are',nsamples,'samples',paste(snames,collapse=','))
+  write(mess, stderr())
+  # status 4 is the exit code the tool declares as fatal
+  quit(save="no",status=4)
+}
+if (length(subjects) != 0) {subjects <- subjects[useCols]}
+Count_Matrix <- Count_Matrix[,useCols] # reorder columns
+rn <- rownames(Count_Matrix)
+islib <- rn %in% c('librarySize','NotInBedRegions')
+LibSizes <- Count_Matrix[subset(rn,islib),][1] # take first
+Count_Matrix <- Count_Matrix[subset(rn,! islib),]
+group <- c(rep(TreatmentName,length(TCols)), rep(ControlName,length(CCols)) )
+group <- factor(group, levels=c(ControlName,TreatmentName))
+colnames(Count_Matrix) <- paste(group,colnames(Count_Matrix),sep="_")
+# NOTE(review): useNDF is passed as a constant here, so the useNDF value parsed from
+# the tool form is never used - confirm whether that is intended
+results <- edgeIt(Count_Matrix=Count_Matrix,group=group, out_edgeR=out_edgeR, out_Voom=out_Voom, out_DESeq2=out_DESeq2,
+  fdrtype='BH',mydesign=NULL,priordf=edgeR_priordf,fdrthresh=fdrthresh,outputdir='.',
+  myTitle=myTitle,useNDF=F,libSize=c(),filterquantile=fQ,subjects=subjects,TreatmentName=TreatmentName,ControlName=ControlName,
+  doDESeq2=doDESeq2,doVoom=doVoom,doCamera=doCamera,doedgeR=doedgeR,org=org,
+  histgmt=history_gmt,bigmt=builtin_gmt,DESeq_fitType=DESeq_fitType,robust_meth=edgeR_robust_meth)
+sessionInfo()
+# edgeIt removes its own sink before returning, so only pop one if it is still active
+if (sink.number() > 0) sink()
+]]>
+</configfile>
+</configfiles>
 <inputs>
 <param name="input1" type="data" format="tabular" label="Select an input matrix - rows are contigs, columns are counts for each sample" help="Use the HTSeq based count matrix preparation tool to create these matrices from BAM/SAM files and a GTF file of genomic features"/>
 <param name="title" type="text" value="Differential Counts" size="80" label="Title for job outputs" help="Supply a meaningful name here to remind you what the outputs contain">
 <sanitizer invalid_char="">
 <valid initial="string.letters,string.digits">
 <data format="tabular" name="out_VOOM" label="${title}_topTable_VOOM.xls">
 <filter>doVoom == "T"</filter>
 </data>
 <data format="html" name="html_file" label="${title}.html"/>
 </outputs>
-<stdio>
-<exit_code range="4" level="fatal" description="Number of subject ids must match total number of samples in the input matrix"/>
-</stdio>
 <tests>
 <test>
 <param name="input1" value="test_bams2mx.xls" ftype="tabular"/>
 <param name="treatment_name" value="liver"/>
 <param name="title" value="edgeRtest"/>
 <param name="Treat_cols" value="2,6,7,8"/>
 <output name="out_edgeR" file="edgeRtest1out.xls" compare="diff" lines_diff="20"/>
 <output name="html_file" file="edgeRtest1out.html" compare="diff" lines_diff="20"/>
 </test>
 </tests>
-<configfiles>
-<configfile name="runme"><![CDATA[
-#
-# edgeR.Rscript
-# updated feb 2014 adding outlier-robust deviance estimate options by ross for R 3.0.2/bioc 2.13
-# updated npv 2011 for R 2.14.0 and edgeR 2.4.0 by ross
-# Performs DGE on a count table containing n replicates of two conditions
-#
-# Parameters
-#
-# 1 - Output Dir
-# Original edgeR code by: S.Lunke and A.Kaspi
-reallybig = log10(.Machine\$double.xmax)
-reallysmall = log10(.Machine\$double.xmin)
-library("gplots")
-library("edgeR")
-library('stringr')
-hmap2 = function(cmat,nsamp=100,outpdfname='heatmap2.pdf', TName='Treatment',group=NA,myTitle='title goes here')
-{
-# Perform clustering for significant pvalues after controlling FWER
-samples = colnames(cmat)
-gu = unique(group)
-gn = rownames(cmat)
-if (length(gu) == 2) {
-col.map = function(g) {if (g==gu[1]) "#FF0000" else "#0000FF"}
-pcols = unlist(lapply(group,col.map))
-} else {
-colours = rainbow(length(gu),start=0,end=4/6)
-pcols = colours[match(group,gu)]        }
-dm = cmat[(! is.na(gn)),]
-# remove unlabelled hm rows
-nprobes = nrow(dm)
-# sub = paste('Showing',nprobes,'contigs ranked for evidence of differential abundance')
-if (nprobes > nsamp) {
-dm =dm[1:nsamp,]
-#sub = paste('Showing',nsamp,'contigs ranked for evidence for differential abundance out of',nprobes,'total')
-}
-newcolnames = substr(colnames(dm),1,20)
-colnames(dm) = newcolnames
-pdf(outpdfname)
-heatmap.2(dm,main=myTitle,ColSideColors=pcols,col=topo.colors(100),dendrogram="col",key=T,density.info='none',
-Rowv=F,scale='row',trace='none',margins=c(8,8),cexRow=0.4,cexCol=0.5)
-dev.off()
-}
-hmap = function(cmat,nmeans=4,outpdfname="heatMap.pdf",nsamp=250,TName='Treatment',group=NA,myTitle="Title goes here")
-{
-# Draw a base R heatmap of at most the first nsamp rows of cmat into the PDF outpdfname.
-# Columns are coloured by their group label; nmeans and TName are accepted but not used.
-grouplevels = unique(group)
-ramp = rainbow(length(grouplevels),start=0.3,end=0.6)
-sidecols = ramp[match(group,grouplevels)]
-ntotal = nrow(cmat)
-if (ntotal > nsamp)  {
-# too many rows to show: keep the leading nsamp and say so in the title
-shown = cmat[c(1:nsamp),]
-plottitle = paste0('Heatmap: Top ',nsamp,' DE contigs (of ',ntotal,')')
-} else {
-shown = cmat
-plottitle = paste(myTitle,'Heatmap: n contigs =',ntotal)
-}
-colnames(shown) = substr(colnames(shown),1,20)
-pdf(outpdfname)
-heatmap(shown,scale='row',main=plottitle,cexRow=0.3,cexCol=0.4,Rowv=NA,ColSideColors=sidecols)
-dev.off()
-}
-qqPlot = function(descr='qqplot',pvector, outpdf='qqplot.pdf',...)
-# QQ plot of observed against expected -log10 p values, written to the PDF outpdf.
-# Adapted from https://gist.github.com/703512
-# descr is the plot title; extra arguments in ... are forwarded to plot().
-{
-observed = -log10(sort(pvector,decreasing=FALSE))
-nobs = length(observed)
-expected = -log10( (1:nobs)/nobs )
-# infinite values are swapped for the script-level log10 machine limits
-observed[observed==-Inf] = reallysmall
-observed[observed==Inf] = reallybig
-pdf(outpdf)
-plot(expected,observed,pch=19,cex=1, main=descr, ...,
-xlab=expression(Expected~~-log[10](italic(p))),
-ylab=expression(Observed~~-log[10](italic(p))),
-xlim=c(0,max(expected)), ylim=c(0,max(observed)))
-# identity line: where the points would fall if observed matched expected
-lines(expected,expected,col="red")
-grid(col = "lightgray", lty = "dotted")
-dev.off()
-}
-smearPlot = function(myDGEList,deTags, outSmear, outMain)
-{
-# Write an edgeR smear plot of myDGEList to the PDF outSmear, passing deTags as
-# de.tags and outMain as the title, then overlay a dotted grid.
-pdf(file=outSmear)
-plotSmear(object=myDGEList,de.tags=deTags,main=outMain)
-grid(lty="dotted", col="lightgray")
-dev.off()
-}
-boxPlot = function(rawrs,cleanrs,maint,myTitle,pdfname)
-{
-# Writes two PDFs describing per-sample count distributions:
-#  1. pdfname - side by side box plots of rawrs and cleanrs
-#  2. sample_counts_histogram.pdf - one histogram of rawrs per sample on a shared y scale
-# Summaries of both tables are printed as well. myTitle is accepted but not used.
-samplenames = colnames(rawrs)
-colnames(rawrs) = substr(colnames(rawrs),1,20)
-colnames(cleanrs) = substr(colnames(cleanrs),1,20)
-savedpar = par(no.readonly=TRUE)
-print.noquote('@@@ Raw contig counts by sample:')
-print.noquote(summary(rawrs))
-print.noquote('@@@ Library size contig counts by sample:')
-print.noquote(summary(cleanrs))
-pdf(pdfname)
-par(mfrow=c(1,2))
-boxplot(rawrs,varwidth=TRUE,notch=TRUE,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main='log2 raw counts')
-grid(col="lightgray",lty="dotted")
-boxplot(cleanrs,varwidth=TRUE,notch=TRUE,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('log2 counts after ',maint))
-grid(col="lightgray",lty="dotted")
-dev.off()
-nsamples = ncol(rawrs)
-print.noquote(paste('Using ncol rawrs=',nsamples))
-# smallest square panel layout that holds one histogram per sample
-side = round(sqrt(nsamples))
-if (side*side < nsamples) { side = side + 1 }
-# tallest bin over all samples, so every panel shares one y axis limit
-peaks = unlist(lapply(c(1:nsamples), function(i) max(hist(rawrs[,i],breaks=100,plot=FALSE)\$counts)))
-ymax = max(peaks)
-histpdf = "sample_counts_histogram.pdf"
-if (length(samplenames) > 20)
-{
-# enlarge the page in proportion to the number of samples
-pagesize = 7*length(samplenames)/20
-pdf(histpdf,width=pagesize,height=pagesize)
-} else {
-pdf(histpdf)
-}
-par(mfrow=c(side,side))
-for (i in c(1:nsamples)) {
-hist(rawrs[,i], main=paste("Contig logcount",i), xlab='log raw count', col="maroon",
-breaks=100,sub=samplenames[i],cex=0.8,ylim=c(0,ymax))
-}
-dev.off()
-par(savedpar)
-}
-cumPlot = function(rawrs,cleanrs,maint,myTitle)
-{
-# Two stacked histograms of log10 row totals, before (rawrs) and after (cleanrs)
-# filtering, written to Differential_rowsum_bar_charts.pdf.
-# Both panels use the x range of the raw data; maint labels the filter applied.
-savedpar = par(no.readonly=TRUE)
-lograw = log(rawrs,10)
-xmax = max(lograw)
-pdf("Differential_rowsum_bar_charts.pdf")
-par(mfrow=c(2,1))
-hist(lograw,breaks=100,main=paste('Before:',maint),xlab="# Reads (log)",
-ylab="Count",col="maroon",sub=myTitle, xlim=c(0,xmax),las=1)
-grid(col="lightgray", lty="dotted")
-logclean = log(cleanrs,10)
-hist(logclean,breaks=100,main=paste('After:',maint),xlab="# Reads (log)",
-ylab="Count",col="maroon",sub=myTitle,xlim=c(0,xmax),las=1)
-grid(col="lightgray", lty="dotted")
-dev.off()
-par(savedpar)
-}
-cumPlot1 = function(rawrs,cleanrs,maint,myTitle)
-{
-# Cumulative proportion curves (from ecdf knots) of row totals before and after
-# filtering, on a log x axis, written to a PDF named from myTitle with spaces removed.
-pdf(paste(gsub(" ","", myTitle , fixed=TRUE),"RowsumCum.pdf",sep='_'))
-par(mfrow=c(2,1))
-xtop = max(rawrs)
-rawknots = knots(ecdf(rawrs))
-cleanknots = knots(ecdf(cleanrs))
-cleanprop = seq_along(cleanknots)/length(cleanknots)
-rawprop = seq_along(rawknots)/length(rawknots)
-plot(rawknots,rawprop,type='l',main=paste('Before',maint),xlab="Log Contig Total Reads",
-ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,xtop),sub=myTitle)
-grid(col="blue")
-plot(cleanknots,cleanprop,type='l',main=paste('After',maint),xlab="Log Contig Total Reads",
-ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,xtop),sub=myTitle)
-grid(col="blue")
-dev.off()
-}
-doGSEAold = function(y=NULL,design=NULL,histgmt="",
-bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
-ntest=0, myTitle="myTitle", outfname="GSEA.xls", minnin=5, maxnin=2000,fdrthresh=0.05,fdrtype="BH")
-{
-# Older Camera runner that calls camera() once per gene set.
-# y, design : expression object and design matrix handed to camera()
-# bigmt, histgmt : paths to tab separated gene set files (id, url, gene_1 .. gene_n); "" skips a file
-# minnin, maxnin : a set is tested only when strictly more than minnin and fewer than maxnin of its genes are row names of y
-# fdrthresh, fdrtype : threshold and p.adjust method used when reporting
-# ntest is accepted for compatibility but ignored; every gene set read is considered.
-# Output printed here goes to Camera.log; tables go to camera_up_ and camera_down_ files named from outfname.
-sink('Camera.log')
-# restore normal output even if camera or file reading fails
-on.exit(sink(), add=TRUE)
-genesets = c()
-if (bigmt > "")
-{
-genesets = readLines(bigmt)
-}
-if (histgmt > "")
-{
-# concatenate: rbind on two character vectors would build a recycled two row matrix
-genesets = c(genesets,readLines(histgmt))
-}
-print.noquote(paste("@@@read",length(genesets), 'genesets from',histgmt,bigmt))
-genesets = strsplit(genesets,'\t') # tabular. genesetid\tURLorwhatever\tgene_1\t..\tgene_n
-header = paste(myTitle,'edgeR GSEA')
-write(header,file=outfname,append=F)
-urownames = toupper(rownames(y))
-upcam = c()
-downcam = c()
-for (gs in genesets) {
-# a usable record needs an id, a url field and at least one gene
-if (length(gs) < 3) { next }
-g = gs[1] # geneset_id
-u = gs[2]
-if (u > "") { u = paste("<a href=\'",u,"\'>",u,"</a>",sep="") }
-glist = toupper(gs[3:length(gs)]) # member gene symbols
-inglist = urownames %in% glist
-nin = sum(inglist)
-if ((nin > minnin) && (nin < maxnin)) {
-camres = camera(y=y,index=inglist,design=design)
-if (! is.null(camres)) {
-rownames(camres) = g # gene set name
-camres = cbind(GeneSet=g,URL=u,camres)
-if (camres\$Direction == "Up")
-{
-upcam = rbind(upcam,camres) } else {
-downcam = rbind(downcam,camres)
-}
-}
-}
-}
-uscam = upcam[order(upcam\$PValue),]
-unadjp = uscam\$PValue
-uscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
-nup = max(10,sum((uscam\$adjPValue < fdrthresh)))
-dscam = downcam[order(downcam\$PValue),]
-unadjp = dscam\$PValue
-dscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
-ndown = max(10,sum((dscam\$adjPValue < fdrthresh)))
-write.table(uscam,file=paste('camera_up',outfname,sep='_'),quote=F,sep='\t',row.names=F)
-write.table(dscam,file=paste('camera_down',outfname,sep='_'),quote=F,sep='\t',row.names=F)
-print.noquote(paste('@@@@@ Camera up top',nup,'gene sets:'))
-write.table(head(uscam,nup),file="",quote=F,sep='\t',row.names=F)
-print.noquote(paste('@@@@@ Camera down top',ndown,'gene sets:'))
-write.table(head(dscam,ndown),file="",quote=F,sep='\t',row.names=F)
-}
-doGSEA = function(y=NULL,design=NULL,histgmt="",
-bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
-ntest=0, myTitle="myTitle", outfname="GSEA.xls", minnin=5, maxnin=2000,fdrthresh=0.05,fdrtype="BH")
-{
-# Run camera() once over every gene set that passes the size filter and report
-# the up and down regulated sets separately.
-# y, design : expression object and design matrix handed to camera()
-# bigmt, histgmt : paths to tab separated gene set files (id, url, gene_1 .. gene_n); "" skips a file
-# minnin, maxnin : a set is tested only when strictly more than minnin and fewer than maxnin of its genes are row names of y
-# fdrthresh, fdrtype : threshold and p.adjust method used when reporting
-# ntest is accepted for compatibility but ignored; every gene set read is considered.
-# Output printed here goes to Camera.log; tables go to camera_up_ and camera_down_ files named from outfname.
-sink('Camera.log')
-# restore normal output even if camera or file reading fails
-on.exit(sink(), add=TRUE)
-genesets = c()
-if (bigmt > "")
-{
-genesets = readLines(bigmt)
-}
-if (histgmt > "")
-{
-# concatenate: rbind on two character vectors would build a recycled two row matrix
-genesets = c(genesets,readLines(histgmt))
-}
-print.noquote(paste("@@@read",length(genesets), 'genesets from',histgmt,bigmt))
-genesets = strsplit(genesets,'\t') # tabular. genesetid\tURLorwhatever\tgene_1\t..\tgene_n
-header = paste(myTitle,'edgeR GSEA')
-write(header,file=outfname,append=F)
-urownames = toupper(rownames(y))
-incam = list()
-urls = c()
-gsids = c()
-for (gs in genesets) {
-# a usable record needs an id, a url field and at least one gene
-if (length(gs) < 3) { next }
-gsid = gs[1] # geneset_id
-url = gs[2]
-if (url > "") { url = paste("<a href=\'",url,"\'>",url,"</a>",sep="") }
-glist = toupper(gs[3:length(gs)]) # member gene symbols
-inglist = urownames %in% glist
-nin = sum(inglist)
-if ((nin > minnin) && (nin < maxnin)) {
-# one list element per gene set; c() would flatten the index vectors into one
-incam[[length(incam)+1]] = inglist
-gsids = c(gsids,gsid)
-urls = c(urls,url)
-}
-}
-if (length(incam) == 0) {
-print.noquote('@@@ No gene sets passed the size filter so Camera was not run')
-return(invisible(NULL))
-}
-# unique names let the camera result rows be traced back to their set
-names(incam) = make.unique(gsids)
-allcam = camera(y=y,index=incam,design=design)
-# camera returns its rows ordered by p value, so look each row up by name
-# instead of binding ids and urls positionally
-hit = match(rownames(allcam),names(incam))
-allcamres = cbind(geneset=gsids[hit],allcam,URL=urls[hit])
-isup = allcamres\$Direction == "Up"
-upcam = allcamres[isup,]
-downcam = allcamres[!isup,]
-uscam = upcam[order(upcam\$PValue),]
-unadjp = uscam\$PValue
-uscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
-nup = max(10,sum((uscam\$adjPValue < fdrthresh)))
-dscam = downcam[order(downcam\$PValue),]
-unadjp = dscam\$PValue
-dscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
-ndown = max(10,sum((dscam\$adjPValue < fdrthresh)))
-write.table(uscam,file=paste('camera_up',outfname,sep='_'),quote=F,sep='\t',row.names=F)
-write.table(dscam,file=paste('camera_down',outfname,sep='_'),quote=F,sep='\t',row.names=F)
-print.noquote(paste('@@@@@ Camera up top',nup,'gene sets:'))
-write.table(head(uscam,nup),file="",quote=F,sep='\t',row.names=F)
-print.noquote(paste('@@@@@ Camera down top',ndown,'gene sets:'))
-write.table(head(dscam,ndown),file="",quote=F,sep='\t',row.names=F)
-}
-edgeIt = function (Count_Matrix=c(),group=c(),out_edgeR=F,out_Voom=F,out_DESeq2=F,fdrtype='fdr',priordf=5,
-fdrthresh=0.05,outputdir='.', myTitle='Differential Counts',libSize=c(),useNDF=F,
-filterquantile=0.2, subjects=c(),TreatmentName="Rx",ControlName="Ctrl",mydesign=NULL,
-doDESeq2=T,doVoom=T,doCamera=T,doedgeR=T,org='hg19',
-histgmt="", bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
-doCook=F,DESeq_fitType="parametric",robust_meth='ordinary')
-{
-# Two group differential count analysis driver.
-# Filters Count_Matrix (rows are contigs, columns are samples), builds a design
-# matrix (blocking on subjects when supplied), runs whichever of edgeR, DESeq2
-# and voom/limma are switched on, optionally runs Camera, and draws a Venn
-# diagram of the calls when edgeR ran together with at least one other method.
-#  group : one label per column of Count_Matrix; exactly two distinct labels are required
-#  out_edgeR, out_Voom, out_DESeq2 : paths for the per method result tables
-#  fdrtype, fdrthresh : p value adjustment method and significance threshold
-#  priordf, robust_meth : edgeR dispersion settings
-#  useNDF : TRUE filters on counts per million, FALSE on the filterquantile quantile of row totals
-#  mydesign : optional ready made design matrix; the tested coefficient is always its last column
-#  org : genome build used when building UCSC links
-#  DESeq_fitType : passed to DESeq2 estimateDispersions
-# outputdir, libSize, TreatmentName, ControlName and doCook are accepted but not referenced in the body.
-# Printed output is sunk to Differential.log, with edgeR.log, DESeq2.log and VOOM.log for the nested runners.
-logf = file('Differential.log', open = "a")
-sink(logf,type = c("output", "message"))
-run_edgeR = function(workCM,pdata,subjects,group,priordf,robust_meth,mydesign,mt,cmrowsums,out_edgeR,nonzerod)
-{
-# Fit the edgeR GLM, write the sorted result table to out_edgeR and draw the edgeR plots.
-# Returns list(myDGEList, edgeRcounts) where edgeRcounts is a 0/1 vector over allgenes.
-# Reads TName, CName, contigurls, allgenes, myTitle, fdrtype and fdrthresh from the enclosing edgeIt scope.
-logf = file('edgeR.log', open = "a")
-sink(logf,type = c("output", "message"))
-#### Setup myDGEList object
-myDGEList = DGEList(counts=workCM, group = group)
-myDGEList = calcNormFactors(myDGEList)
-if (robust_meth == 'ordinary') {
-myDGEList = estimateGLMCommonDisp(myDGEList,mydesign)
-myDGEList = estimateGLMTrendedDisp(myDGEList,mydesign)
-if (priordf > 0) {  myDGEList = estimateGLMTagwiseDisp(myDGEList,mydesign,prior.df = priordf)
-} else { myDGEList = estimateGLMTagwiseDisp(myDGEList,mydesign) }
-comdisp = myDGEList\$common.dispersion
-estpriorn = getPriorN(myDGEList)
-print(paste("Common Dispersion =",comdisp,"CV = ",sqrt(comdisp),"getPriorN = ",estpriorn),quote=F)
-} else {
-myDGEList = estimateGLMRobustDisp(myDGEList,design=mydesign, prior.df = priordf, maxit = 6, residual.type = robust_meth)
-}
-DGLM = glmFit(myDGEList,design=mydesign)
-DE = glmLRT(DGLM,coef=ncol(DGLM\$design)) # always last one - subject is first if needed
-normData = cpm(myDGEList)
-uoutput = cbind(
-Name=as.character(rownames(myDGEList\$counts)),
-DE\$table,
-adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
-Dispersion=myDGEList\$tagwise.dispersion,totreads=cmrowsums,normData,
-myDGEList\$counts
-)
-soutput = uoutput[order(DE\$table\$PValue),] # sorted into p value order - for quick toptable
-goodness = gof(DGLM, pcutoff=fdrthresh)
-if (sum(goodness\$outlier) > 0) {
-print.noquote('GLM outliers:')
-print(paste(rownames(DGLM)[(goodness\$outlier)],collapse=','),quote=F)
-} else {
-print('No GLM fit outlier genes found\n')
-}
-z = limma::zscoreGamma(goodness\$gof.statistic, shape=goodness\$df/2, scale=2)
-pdf(paste("edgeR",mt,"GoodnessofFit.pdf",sep='_'))
-qq = qqnorm(z, panel.first=grid(), main="tagwise dispersion")
-abline(0,1,lwd=3)
-points(qq\$x[goodness\$outlier],qq\$y[goodness\$outlier], pch=16, col="maroon")
-dev.off()
-uniqueg = unique(group)
-write.table(soutput,file=out_edgeR, quote=FALSE, sep="\t",row.names=F)
-tt = cbind(
-Name=as.character(rownames(myDGEList)),
-DE\$table,
-adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
-Dispersion=myDGEList\$tagwise.dispersion,totreads=cmrowsums
-)
-tt = cbind(tt,URL=contigurls) # add to end so table isn't laid out strangely
-stt = tt[order(DE\$table\$PValue),]
-print.noquote("@@ edgeR Top tags\n")
-# head() avoids padding the listing with NA rows when fewer than 50 contigs remain
-print.noquote(head(stt,50))
-deTags = rownames(uoutput[uoutput\$adj.p.value < fdrthresh,])
-nsig = length(deTags)
-print.noquote(paste('@@',nsig,'tags significant at adj p=',fdrthresh))
-pdf(paste("edgeR",mt,"BCV_vs_abundance.pdf",sep="_"))
-plotBCV(myDGEList, cex=0.3, main="Biological CV vs abundance")
-dev.off()
-dg = myDGEList[order(DE\$table\$PValue),]
-outpdfname= paste("edgeR",mt,"top_100_heatmap.pdf",sep="_")
-ocpm = normData[order(DE\$table\$PValue),]
-# at most 100 rows; indexing 1:100 directly fails when fewer contigs survive filtering
-ocpm = ocpm[seq_len(min(100,nrow(ocpm))),,drop=FALSE]
-hmap2(ocpm,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste(myTitle,'Heatmap'))
-outSmear = paste("edgeR",mt,"smearplot.pdf",sep="_")
-outMain = paste("Smear Plot for ",TName,' Vs ',CName,' (FDR@',fdrthresh,' N = ',nsig,')',sep='')
-smearPlot(myDGEList=myDGEList,deTags=deTags, outSmear=outSmear, outMain = outMain)
-qqPlot(descr=paste(myTitle,'edgeR adj p QQ plot'),pvector=tt\$adj.p.value,outpdf=paste('edgeR',mt,'qqplot.pdf',sep='_'))
-topresults.edgeR = soutput[which(soutput\$adj.p.value < fdrthresh), ]
-edgeRcountsindex = which(allgenes %in% rownames(topresults.edgeR))
-edgeRcounts = rep(0, length(allgenes))
-edgeRcounts[edgeRcountsindex] = 1  # Create venn diagram of hits
-sink()
-return(list(myDGEList=myDGEList,edgeRcounts=edgeRcounts))
-} ### run_edgeR
-run_DESeq2 = function(workCM,pdata,subjects,group,out_DESeq2,mt,DESeq_fitType)
-{
-# Run the DESeq2 Wald test, write the sorted table to out_DESeq2 and draw the DESeq2 plots.
-# Returns a 0/1 vector over allgenes marking contigs with padj below fdrthresh.
-# Reads cmrowsums, contigurls, allgenes, myTitle and fdrthresh from the enclosing edgeIt scope.
-logf = file("DESeq2.log", open = "a")
-sink(logf,type = c("output", "message"))
-# DESeq2
-require('DESeq2')
-library('RColorBrewer')
-if (length(subjects) == 0)
-{
-pdata = data.frame(Name=colnames(workCM),Rx=group,row.names=colnames(workCM))
-deSEQds = DESeqDataSetFromMatrix(countData = workCM,  colData = pdata, design = formula(~ Rx))
-} else {
-pdata = data.frame(Name=colnames(workCM),Rx=group,subjects=subjects,row.names=colnames(workCM))
-deSEQds = DESeqDataSetFromMatrix(countData = workCM,  colData = pdata, design = formula(~ subjects + Rx))
-}
-deSeqDatsizefac = estimateSizeFactors(deSEQds)
-deSeqDatdisp = estimateDispersions(deSeqDatsizefac,fitType=DESeq_fitType)
-resDESeq = nbinomWaldTest(deSeqDatdisp)
-rDESeq = as.data.frame(results(resDESeq))
-rDESeq = cbind(Contig=rownames(workCM),rDESeq,NReads=cmrowsums,URL=contigurls)
-srDESeq = rDESeq[order(rDESeq\$pvalue),]
-qqPlot(descr=paste(myTitle,'DESeq2 adj p qq plot'),pvector=rDESeq\$padj,outpdf=paste('DESeq2',mt,'qqplot.pdf',sep="_"))
-cat("# DESeq top 50\n")
-print.noquote(head(srDESeq,50))
-write.table(srDESeq,file=out_DESeq2, quote=FALSE, sep="\t",row.names=F)
-topresults.DESeq = rDESeq[which(rDESeq\$padj < fdrthresh), ]
-DESeqcountsindex = which(allgenes %in% rownames(topresults.DESeq))
-DESeqcounts = rep(0, length(allgenes))
-DESeqcounts[DESeqcountsindex] = 1
-pdf(paste("DESeq2",mt,"dispersion_estimates.pdf",sep='_'))
-plotDispEsts(resDESeq)
-dev.off()
-ysmall = abs(min(rDESeq\$log2FoldChange))
-ybig = abs(max(rDESeq\$log2FoldChange))
-ylimit = min(4,ysmall,ybig)
-pdf(paste("DESeq2",mt,"MA_plot.pdf",sep="_"))
-plotMA(resDESeq,main=paste(myTitle,"DESeq2 MA plot"),ylim=c(-ylimit,ylimit))
-dev.off()
-rlogres = rlogTransformation(resDESeq)
-sampledists = dist( t( assay(rlogres) ) )
-sdmat = as.matrix(sampledists)
-pdf(paste("DESeq2",mt,"sample_distance_plot.pdf",sep="_"))
-heatmap.2(sdmat,trace="none",main=paste(myTitle,"DESeq2 sample distances"),
-col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(255))
-dev.off()
-result = try( (ppca = plotPCA( varianceStabilizingTransformation(deSeqDatdisp,blind=T), intgroup=c("Rx","Name")) ) )
-if ("try-error" %in% class(result)) {
-print.noquote('DESeq2 plotPCA failed.')
-} else {
-pdf(paste("DESeq2",mt,"PCA_plot.pdf",sep="_"))
-#### wtf - print? Seems needed to get this to work
-print(ppca)
-dev.off()
-}
-sink()
-return(DESeqcounts)
-}
-run_Voom = function(workCM,pdata,subjects,group,mydesign,mt,out_Voom)
-{
-# Run voom followed by limma lmFit/eBayes, write the sorted table to out_Voom.
-# Returns a 0/1 vector over allgenes marking contigs with adj.P.Val below fdrthresh.
-# When edgeR has run, myDGEList is taken from the enclosing edgeIt scope; otherwise it is built here.
-logf = file('VOOM.log', open = "a")
-sink(logf,type = c("output", "message"))
-if (doedgeR == F) {
-#### Setup myDGEList object
-myDGEList = DGEList(counts=workCM, group = group)
-myDGEList = calcNormFactors(myDGEList)
-myDGEList = estimateGLMCommonDisp(myDGEList,mydesign)
-myDGEList = estimateGLMTrendedDisp(myDGEList,mydesign)
-myDGEList = estimateGLMTagwiseDisp(myDGEList,mydesign)
-}
-pdf(paste("VOOM",mt,"mean_variance_plot.pdf",sep='_'))
-# method name spelled out in full rather than relying on partial matching
-dat.voomed <- voom(myDGEList, mydesign, plot = TRUE, normalize.method="quantile", lib.size = NULL)
-dev.off()
-# Use limma to fit data
-fit = lmFit(dat.voomed, mydesign)
-fit = eBayes(fit)
-rvoom = topTable(fit, coef = length(colnames(mydesign)), adj = fdrtype, n = Inf, sort="none")
-qqPlot(descr=paste(myTitle,'VOOM-limma adj p QQ plot'),pvector=rvoom\$adj.P.Val,outpdf=paste('VOOM',mt,'qqplot.pdf',sep='_'))
-rownames(rvoom) = rownames(workCM)
-rvoom = cbind(Contig=rownames(workCM),rvoom,NReads=cmrowsums,URL=contigurls)
-srvoom = rvoom[order(rvoom\$P.Value),]
-cat("# VOOM top 50\n")
-print(head(srvoom,50))
-write.table(srvoom,file=out_Voom, quote=FALSE, sep="\t",row.names=F)
-# Use an FDR cutoff to find interesting samples for edgeR, DESeq and voom/limma
-topresults.voom = rvoom[which(rvoom\$adj.P.Val < fdrthresh), ]
-voomcountsindex <- which(allgenes %in% rownames(topresults.voom))
-voomcounts = rep(0, length(allgenes))
-voomcounts[voomcountsindex] = 1
-sink()
-return(voomcounts)
-}
-#### data cleaning and analsis control starts here
-# Error handling
-nugroup = length(unique(group))
-if (nugroup!=2){
-print("Number of conditions identified in experiment does not equal 2")
-# NOTE(review): q() exits with status 0 and the message above goes to the log sink,
-# so the job will look successful - consider a non zero status
-q()
-}
-require(edgeR)
-options(width = 512)
-mt = paste(unlist(strsplit(myTitle,'_')),collapse=" ")
-allN = nrow(Count_Matrix)
-nscut = round(ncol(Count_Matrix)/2) # half samples
-colTotmillionreads = colSums(Count_Matrix)/1e6
-counts.dataframe = as.data.frame(c())
-rawrs = rowSums(Count_Matrix)
-nonzerod = Count_Matrix[(rawrs > 0),] # remove all zero count genes
-nzN = nrow(nonzerod)
-nzrs = rowSums(nonzerod)
-zN = allN - nzN
-print('@@@ Quantiles for non-zero row counts:',quote=F)
-print(quantile(nzrs,probs=seq(0,1,0.1)),quote=F)
-if (useNDF == T)
-{
-# keep contigs with at least one read per million in at least half of the samples
-gt1rpin3 = rowSums(Count_Matrix/expandAsMatrix(colTotmillionreads,dim(Count_Matrix)) >= 1) >= nscut
-lo = colSums(Count_Matrix[!gt1rpin3,])
-workCM = Count_Matrix[gt1rpin3,]
-cleanrs = rowSums(workCM)
-cleanN = length(cleanrs)
-meth = paste( "After removing",length(lo),"contigs with fewer than ",nscut," sample read counts >= 1 per million, there are",sep="")
-print(paste("Read",allN,"contigs. Removed",zN,"contigs with no reads.",meth,cleanN,"contigs"),quote=F)
-maint = paste('Filter >=1/million reads in >=',nscut,'samples')
-}   else {
-# keep contigs whose row total is above the chosen quantile of the non zero row totals
-useme = (nzrs > quantile(nzrs,filterquantile))
-workCM = nonzerod[useme,]
-lo = colSums(nonzerod[!useme,])
-cleanrs = rowSums(workCM)
-cleanN = length(cleanrs)
-meth = paste("After filtering at count quantile =",filterquantile,", there are",sep="")
-print(paste('Read',allN,"contigs. Removed",zN,"with no reads.",meth,cleanN,"contigs"),quote=F)
-maint = paste('Filter below',filterquantile,'quantile')
-}
-cumPlot(rawrs=rawrs,cleanrs=cleanrs,maint=maint,myTitle=myTitle)
-allgenes = rownames(workCM)
-reg = "^chr([0-9]+):([0-9]+)-([0-9]+)" # ucsc chr:start-end regexp
-genecards="<a href=\'http://www.genecards.org/index.php?path=/Search/keyword/"
-ucsc = paste("<a href=\'http://genome.ucsc.edu/cgi-bin/hgTracks?db=",org,sep='')
-testreg = str_match(allgenes,reg)
-if (sum(!is.na(testreg[,1]))/length(testreg[,1]) > 0.8) # is ucsc style string
-{
-print("@@ using ucsc substitution for urls")
-contigurls = paste0(ucsc,"&amp;position=chr",testreg[,2],":",testreg[,3],"-",testreg[,4],"\'>",allgenes,"</a>")
-} else {
-print("@@ using genecards substitution for urls")
-contigurls = paste0(genecards,allgenes,"\'>",allgenes,"</a>")
-}
-print.noquote(paste("@@ Total low count contigs per sample = ",paste(table(lo),collapse=',')))
-cmrowsums = rowSums(workCM)
-TName=unique(group)[1]
-CName=unique(group)[2]
-if (is.null(mydesign)) {
-if (length(subjects) == 0)
-{
-mydesign = model.matrix(~group)
-}
-else {
-subjf = factor(subjects)
-mydesign = model.matrix(~subjf+group) # we block on subject so make group last to simplify finding it
-}
-}
-print.noquote(paste('Using samples:',paste(colnames(workCM),collapse=',')))
-print.noquote('Using design matrix:')
-print.noquote(mydesign)
-normData = cpm(workCM)*1e6
-colnames(normData) = paste( colnames(workCM),'N',sep="_")
-print(paste('Raw sample read totals',paste(colSums(nonzerod,na.rm=T),collapse=',')))
-if (doedgeR == T) {
-eres = run_edgeR(workCM,pdata,subjects,group,priordf,robust_meth,mydesign,mt,cmrowsums,out_edgeR,nonzerod)
-myDGEList = eres\$myDGEList
-edgeRcounts = eres\$edgeRcounts
-#### Plot MDS
-sample_colors =  match(group,levels(group))
-sampleTypes = levels(factor(group))
-print.noquote(sampleTypes)
-pdf(paste("edgeR",mt,"MDSplot.pdf",sep='_'))
-plotMDS.DGEList(myDGEList,main=paste("MDS for",myTitle),cex=0.5,col=sample_colors,pch=sample_colors)
-legend(x="topleft", legend = sampleTypes,col=c(1:length(sampleTypes)), pch=19)
-grid(col="blue")
-dev.off()
-scale <- myDGEList\$samples\$lib.size*myDGEList\$samples\$norm.factors
-normCounts <- round(t(t(myDGEList\$counts)/scale)*mean(scale))
-# NOTE(review): nzd was not assigned in this function, so the box plot call below
-# always failed silently inside try(). log2(nonzerod+1) is assumed from the panel
-# label 'log2 raw counts' and the matching transform of normCounts - confirm.
-nzd = log2(nonzerod+1)
-try({boxPlot(rawrs=nzd,cleanrs=log2(normCounts+1),maint='Effects of TMM size normalisation',myTitle=myTitle,pdfname=paste("edgeR",mt,"raw_norm_counts_box.pdf",sep='_'))},T)
-}
-if (doDESeq2 == T) {  DESeqcounts = run_DESeq2(workCM,pdata,subjects,group,out_DESeq2,mt,DESeq_fitType) }
-if (doVoom == T) { voomcounts = run_Voom(workCM,pdata,subjects,group,mydesign,mt,out_Voom) }
-if (doCamera) {
-doGSEA(y=myDGEList,design=mydesign,histgmt=histgmt,bigmt=bigmt,ntest=20,myTitle=myTitle,
-outfname=paste("GSEA_Camera",mt,"table.xls",sep="_"),fdrthresh=fdrthresh,fdrtype=fdrtype)
-}
-counts.dataframe = c()
-vennmain = 'no venn'
-if ((doDESeq2==T) || (doVoom==T) || (doedgeR==T)) {
-if ((doVoom==T) && (doDESeq2==T) && (doedgeR==T)) {
-vennmain = paste(mt,'Voom,edgeR and DESeq2 overlap at FDR=',fdrthresh)
-counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts,
-VOOM_limma = voomcounts, row.names = allgenes)
-} else if ((doDESeq2==T) && (doedgeR==T))  {
-vennmain = paste(mt,'DESeq2 and edgeR overlap at FDR=',fdrthresh)
-counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts, row.names = allgenes)
-} else if ((doVoom==T) && (doedgeR==T)) {
-vennmain = paste(mt,'Voom and edgeR overlap at FDR=',fdrthresh)
-counts.dataframe = data.frame(edgeR = edgeRcounts, VOOM_limma = voomcounts, row.names = allgenes)
-}
-# counts.dataframe is still NULL when no combination above applied; the previous
-# test nrow(counts.dataframe > 1) then stopped the script with a zero length condition
-if (!is.null(counts.dataframe) && nrow(counts.dataframe) > 1) {
-counts.venn = vennCounts(counts.dataframe)
-vennf = paste("Differential_venn",mt,"significant_genes_overlap.pdf",sep="_")
-pdf(vennf)
-vennDiagram(counts.venn,main=vennmain,col="maroon")
-dev.off()
-}
-} #### doDESeq2 or doVoom
-sink()
-}
-#### Done
-]]>
-builtin_gmt = "" # driver script: defaults first, then values substituted by the Galaxy template, then the edgeIt call
-history_gmt = ""
-history_gmt_name = ""
-out_edgeR = F
-out_DESeq2 = F
-out_Voom = "$out_VOOM"
-edgeR_robust_meth = "ordinary"
-doDESeq2 = $DESeq2.doDESeq2
-doVoom = $doVoom
-doCamera = F # Camera is switched off unconditionally in this script
-doedgeR = $edgeR.doedgeR
-edgeR_priordf = 10
-#if $doVoom == "T":
-out_Voom = "$out_VOOM"
-#end if
-#if $DESeq2.doDESeq2 == "T":
-out_DESeq2 = "$out_DESeq2"
-doDESeq2 = T
-DESeq_fitType = "$DESeq2.DESeq_fitType"
-#end if
-#if $edgeR.doedgeR == "T":
-out_edgeR = "$out_edgeR"
-edgeR_priordf = $edgeR.edgeR_priordf
-edgeR_robust_meth = "$edgeR.edgeR_robust_method"
-#end if
-if (sum(c(doedgeR,doVoom,doDESeq2)) == 0) # none of the three methods was selected
-{
-write("No methods chosen - nothing to do! Please try again after choosing one or more methods", stderr())
-quit(save="no",status=2)
-}
-Out_Dir = "$html_file.files_path"
-Input =  "$input1"
-TreatmentName = "$treatment_name"
-TreatmentCols = "$Treat_cols"
-ControlName = "$control_name"
-ControlCols= "$Control_cols"
-org = "$input1.dbkey"
-if (org == "") { org = "hg19"}
-fdrtype = "$fdrtype" # NOTE(review): the edgeIt call below passes the literal 'BH' rather than this value - confirm intent
-fdrthresh = $fdrthresh
-useNDF = $useNDF # NOTE(review): the edgeIt call below passes useNDF=F rather than this value - confirm intent
-fQ = $fQ # non-differential centile cutoff
-myTitle = "$title"
-sids = strsplit("$subjectids",',')
-subjects = unlist(sids)
-nsubj = length(subjects)
-TCols = as.numeric(strsplit(TreatmentCols,",")[[1]])-1 # shifted down by one, presumably because read.table below consumes column 1 as row names - confirm
-CCols = as.numeric(strsplit(ControlCols,",")[[1]])-1
-cat('Got TCols=')
-cat(TCols)
-cat('; CCols=')
-cat(CCols)
-cat('\n')
-<![CDATA[
-useCols = c(TCols,CCols) # treatment columns first, then control columns
-if (file.exists(Out_Dir) == F) dir.create(Out_Dir)
-Count_Matrix = read.table(Input,header=T,row.names=1,sep='\t')
-snames = colnames(Count_Matrix)
-nsamples = length(snames)
-if (nsubj >  0 & nsubj != nsamples) { # a subject id list was supplied but its length differs from the number of samples
-options("show.error.messages"=T)
-mess = paste('Fatal error: Supplied subject id list',paste(subjects,collapse=','),
-'has length',nsubj,'but there are',nsamples,'samples',paste(snames,collapse=','))
-write(mess, stderr())
-quit(save="no",status=4)
-}
-if (length(subjects) != 0) {subjects = subjects[useCols]} # keep subject ids aligned with the selected columns
-Count_Matrix = Count_Matrix[,useCols] ### reorder columns
-rn = rownames(Count_Matrix)
-islib = rn %in% c('librarySize','NotInBedRegions') # bookkeeping rows that are not contigs
-LibSizes = Count_Matrix[subset(rn,islib),][1] # take first
-Count_Matrix = Count_Matrix[subset(rn,! islib),]
-group = c(rep(TreatmentName,length(TCols)), rep(ControlName,length(CCols)) )
-group = factor(group, levels=c(ControlName,TreatmentName)) # control is the first (reference) level
-colnames(Count_Matrix) = paste(group,colnames(Count_Matrix),sep="_")
-results = edgeIt(Count_Matrix=Count_Matrix,group=group, out_edgeR=out_edgeR, out_Voom=out_Voom, out_DESeq2=out_DESeq2,
-fdrtype='BH',mydesign=NULL,priordf=edgeR_priordf,fdrthresh=fdrthresh,outputdir='.',
-myTitle=myTitle,useNDF=F,libSize=c(),filterquantile=fQ,subjects=subjects,TreatmentName=TreatmentName,ControlName=ControlName,
-doDESeq2=doDESeq2,doVoom=doVoom,doCamera=doCamera,doedgeR=doedgeR,org=org,
-histgmt=history_gmt,bigmt=builtin_gmt,DESeq_fitType=DESeq_fitType,robust_meth=edgeR_robust_meth)
-sessionInfo()
-sink() # NOTE(review): no sink is opened at this level in the visible script, so this may only raise a warning - confirm
-]]>
-]]>
-</configfile>
-</configfiles>
 <help>
 **What it does**
 Allows short read sequence counts from controlled experiments to be analysed for differentially expressed genes.
 Optionally adds a term for subject if not all samples are independent or if some other factor needs to be blocked in the design.
 **Input**
 Requires a count matrix as a tabular file. These are best made using the companion HTSeq_ based counter Galaxy wrapper
 and your fave gene model to generate inputs. Each row is a genomic feature (gene or exon eg) and each column the
 non-negative integer count of reads from one sample overlapping the feature.
 The matrix must have a header row uniquely identifying the source samples, and unique row names in
 the first column. Typically the row names are gene symbols or probe ids for downstream use in GSEA and other methods.
 They must be unique and valid R names or they will be mangled - please read the fine R docs for the rules on identifiers.
 **Specifying comparisons**
 This is basically dumbed down for two factors - case vs control.
 More complex interfaces are possible but painful at present.
 Probably need to specify a phenotype file to do this better.
 Work in progress. Send code.
 If you have (eg) paired samples and wish to include a term in the GLM to account for some other factor (subject in the case of paired samples),
 put a comma separated list of indicators for every sample (whether modelled or not!) indicating (eg) the subject number.
 This is a list of integers, one for each sample, or an empty string if samples are all independent.
 If not empty, there must be exactly as many integers in the supplied integer list as there are columns (samples) in the count matrix.
 Integers for samples that are not in the analysis *must* be present in the string as filler even if not used.
 So if you have 2 pairs out of 6 samples, you need to put in unique integers for the unpaired ones
 eg if you had 6 samples with the first two independent but the second and third pairs each being from independent subjects, you might use
 8,9,1,1,2,2
 as subject IDs to indicate two paired samples from the same subject in columns 3/4 and 5/6
 **Methods available**
 You can run 3 popular Bioconductor packages available for count data,
 and optionally camera in edgeR which works better if MSigDB is installed.
 **Outputs**
 Some helpful plots and analysis results. Note that most of these are produced using R code
 suggested by the excellent documentation and vignettes for the Bioconductor
 packages invoked. The Tool Factory is used to automatically lay these out for you to enjoy.
 **Note on Voom**
 vooma is a similar function but for microarrays instead of RNA-seq.
 ***old rant on changes to Bioconductor package variable names between versions***
 The edgeR authors made a small cosmetic change in the name of one important variable (from p.value to PValue)
 breaking this and all other code that assumed the old name for this variable,
 between edgeR2.4.4 and 2.4.6 (the version for R 2.14 as at the time of writing).
 This means that all code using edgeR is sensitive to the version. I think this was a very unwise thing
 to do because it wasted hours of my time to track down and will similarly cost other edgeR users dearly
 when their old scripts break. This tool currently now works with 2.4.6.
 **Note on prior.N**
 http://seqanswers.com/forums/showthread.php?t=5591 says:
 *prior.n*
 The value for prior.n determines the amount of smoothing of tagwise dispersions towards the common dispersion.
 You can think of it as like a "weight" for the common value. (It is actually the weight for the common likelihood
 in the weighted likelihood equation). The larger the value for prior.n, the more smoothing, i.e. the closer your
 tagwise dispersion estimates will be to the common dispersion. If you use a prior.n of 1, then that gives the
 common likelihood the weight of one observation.
 In answer to your question, it is a good thing to squeeze the tagwise dispersions towards a common value,
 or else you will be using very unreliable estimates of the dispersion. I would not recommend using the value that
 you obtained from estimateSmoothing()---this is far too small and would result in virtually no moderation
 (squeezing) of the tagwise dispersions. How many samples do you have in your experiment?
 What is the experimental design? If you have few samples (less than 6) then I would suggest a prior.n of at least 10.
 If you have more samples, then the tagwise dispersion estimates will be more reliable,
 so you could consider using a smaller prior.n, although I would hesitate to use a prior.n less than 5.
 From Bioconductor Digest, Vol 118, Issue 5, Gordon writes:
 Dear Dorota,
 ----
 **Attributions**
 edgeR - edgeR_
 VOOM/limma - limma_VOOM_
 DESeq2 - DESeq2_ for details
 See above for Bioconductor package documentation for packages exposed in Galaxy by this tool and app store package.
 Galaxy_ (that's what you are using right now!) for gluing everything together
 Otherwise, all code and documentation comprising this tool was written by Ross Lazarus and is
 licensed to you under the LGPL_ like other rgenetics artefacts
 .. _LGPL: http://www.gnu.org/copyleft/lesser.html
 .. _HTSeq: http://www-huber.embl.de/users/anders/HTSeq/doc/index.html
 .. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html

Mercurial > repos > fubar > differential_count_models

comparison rgedgeRpaired_nocamera.xml @ 149:3107df74056e draft default tip