changeset 28:c4ee2e69d691 draft

Uploaded
author fubar
date Wed, 07 Aug 2013 02:41:40 -0400
parents ddd76b6db251
children ca87f891210c
files rgedgeRpaired.xml rgedgeRpaired.xml.camera
diffstat 2 files changed, 1084 insertions(+), 1084 deletions(-) [+]
line wrap: on
line diff
--- a/rgedgeRpaired.xml	Wed Aug 07 02:10:19 2013 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,1084 +0,0 @@
-<tool id="rgDifferentialCount" name="Differential_Count" version="0.20">
-  <description>models using BioConductor packages</description>
-  <requirements>
-      <requirement type="package" version="2.12">biocbasics</requirement>
-      <requirement type="package" version="3.0.1">r3</requirement>
-      <requirement type="package" version="1.3.18">graphicsmagick</requirement>
-      <requirement type="package" version="9.07">ghostscript</requirement>
-  </requirements>
-  
-  <command interpreter="python">
-     rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "DifferentialCounts" 
-    --output_dir "$html_file.files_path" --output_html "$html_file" --make_HTML "yes"
-  </command>
-  <inputs>
-    <param name="input1"  type="data" format="tabular" label="Select an input matrix - rows are contigs, columns are counts for each sample"
-       help="Use the HTSeq based count matrix preparation tool to create these matrices from BAM/SAM files and a GTF file of genomic features"/>
-    <param name="title" type="text" value="Differential Counts" size="80" label="Title for job outputs" 
-           help="Supply a meaningful name here to remind you what the outputs contain">
-      <sanitizer invalid_char="">
-        <valid initial="string.letters,string.digits"><add value="_" /> </valid>
-      </sanitizer>
-    </param>
-    <param name="treatment_name" type="text" value="Treatment" size="50" label="Treatment Name"/>
-    <param name="Treat_cols" label="Select columns containing treatment." type="data_column" data_ref="input1" numerical="True" 
-         multiple="true" use_header_names="true" size="120" display="checkboxes">
-        <validator type="no_options" message="Please select at least one column."/>
-    </param>
-    <param name="control_name" type="text" value="Control" size="50" label="Control Name"/>
-    <param name="Control_cols" label="Select columns containing control." type="data_column" data_ref="input1" numerical="True" 
-         multiple="true" use_header_names="true" size="120" display="checkboxes" optional="true">
-    </param>
-    <param name="subjectids" type="text" optional="true" size="120" value = ""
-       label="IF SUBJECTS NOT ALL INDEPENDENT! Enter comma separated strings to indicate sample labels for (eg) pairing - must be one for every column in input"
-       help="Leave blank if no pairing, but eg if data from sample id A99 is in columns 2,4 and id C21 is in 3,5 then enter 'A99,C21,A99,C21'">
-      <sanitizer>
-        <valid initial="string.letters,string.digits"><add value="," /> </valid>
-      </sanitizer>
-    </param>
-    <param name="fQ" type="float" value="0.3" size="5" label="Non-differential contig count quantile threshold - zero to analyze all non-zero read count contigs"
-     help="May be a good or a bad idea depending on the biology and the question. EG 0.3 = sparsest 30% of contigs with at least one read are removed before analysis"/>
-    <param name="useNDF" type="boolean" truevalue="T" falsevalue="F" checked="false" size="1" 
-              label="Non differential filter - remove contigs below a threshold (1 per million) for half or more samples"
-     help="May be a good or a bad idea depending on the biology and the question. This was the old default. Quantile based is available as an alternative"/>
-
-    <conditional name="edgeR">
-        <param name="doedgeR" type="select" 
-           label="Run this model using edgeR"
-           help="edgeR uses a negative binomial model and seems to be powerful, even with few replicates">
-          <option value="F">Do not run edgeR</option>
-          <option value="T" selected="true">Run edgeR</option>
-         </param>
-         <when value="T">
-          <param name="edgeR_priordf" type="integer" value="20" size="3" 
-           label="prior.df for tagwise dispersion - lower value = more emphasis on each tag's variance. Replaces prior.n  and prior.df = prior.n * residual.df"
-           help="0 = Use edgeR default. Use a small value to 'smooth' small samples. See edgeR docs and note below"/>
-         </when>
-         <when value="F"></when>
-    </conditional>
-    <conditional name="DESeq2">
-    <param name="doDESeq2" type="select" 
-       label="Run the same model with DESeq2 and compare findings"
-       help="DESeq2 is an update to the DESeq package. It uses different assumptions and methods to edgeR">
-      <option value="F" selected="true">Do not run DESeq2</option>
-      <option value="T">Run DESeq2</option>
-     </param>
-     <when value="T">
-         <param name="DESeq_fitType" type="select">
-            <option value="parametric" selected="true">Parametric (default) fit for dispersions</option>
-            <option value="local">Local fit - this will automagically be used if parametric fit fails</option>
-            <option value="mean">Mean dispersion fit- use this if you really understand what you're doing - read the fine manual linked below in the documentation</option>
-         </param> 
-     </when>
-     <when value="F"> </when>
-    </conditional>
-    <param name="doVoom" type="select" 
-       label="Run the same model with Voom/limma and compare findings"
-       help="Voom uses counts per million and a precise transformation of variance so count data can be analysed using limma">
-      <option value="F" selected="true">Do not run VOOM</option>
-      <option value="T">Run VOOM</option>
-     </param>
-    <!--
-    <conditional name="camera">
-        <param name="doCamera" type="select" label="Run the edgeR implementation of Camera GSEA for up/down gene sets" 
-            help="If yes, you can choose a set of genesets to test and/or supply a gmt format geneset collection from your history">
-        <option value="F" selected="true">Do not run GSEA tests with the Camera algorithm</option>
-        <option value="T">Run GSEA tests with the Camera algorithm</option>
-        </param>
-         <when value="T">
-         <conditional name="gmtSource">
-          <param name="refgmtSource" type="select" 
-             label="Use a gene set (.gmt) from your history and/or use a built-in (MSigDB etc) gene set">
-            <option value="indexed" selected="true">Use a built-in gene set</option>
-            <option value="history">Use a gene set from my history</option>
-            <option value="both">Add a gene set from my history to a built in gene set</option>
-          </param>
-          <when value="indexed">
-            <param name="builtinGMT" type="select" label="Select a gene set matrix (.gmt) file to use for the analysis">
-              <options from_data_table="gseaGMT_3.1">
-                <filter type="sort_by" column="2" />
-                <validator type="no_options" message="No GMT v3.1 files are available - please install them"/>
-              </options>
-            </param>
-          </when>
-          <when value="history">
-            <param name="ownGMT" type="data" format="gmt" label="Select a Gene Set from your history" />
-          </when>
-          <when value="both">
-            <param name="ownGMT" type="data" format="gseagmt" label="Select a Gene Set from your history" />
-            <param name="builtinGMT" type="select" label="Select a gene set matrix (.gmt) file to use for the analysis">
-              <options from_data_table="gseaGMT_4">
-                <filter type="sort_by" column="2" />
-                <validator type="no_options" message="No GMT v4 files are available - please fix tool_data_table and loc files"/>
-              </options>
-            </param>
-           </when>
-         </conditional>
-         </when>
-         <when value="F"> 
-         </when>
-    </conditional>
-    -->
-    <param name="fdrthresh" type="float" value="0.05" size="5" label="P value threshold for FDR filtering for amily wise error rate control"
-     help="Conventional default value of 0.05 recommended"/>
-    <param name="fdrtype" type="select" label="FDR (Type II error) control method" 
-         help="Use fdr or bh typically to control for the number of tests in a reliable way">
-            <option value="fdr" selected="true">fdr</option>
-            <option value="BH">Benjamini Hochberg</option>
-            <option value="BY">Benjamini Yukateli</option>
-            <option value="bonferroni">Bonferroni</option>
-            <option value="hochberg">Hochberg</option>
-            <option value="holm">Holm</option>
-            <option value="hommel">Hommel</option>
-            <option value="none">no control for multiple tests</option>
-    </param>
-  </inputs>
-  <outputs>
-    <data format="tabular" name="out_edgeR" label="${title}_topTable_edgeR.xls">
-         <filter>edgeR['doedgeR'] == "T"</filter>
-    </data>
-    <data format="tabular" name="out_DESeq2" label="${title}_topTable_DESeq2.xls">
-         <filter>DESeq2['doDESeq2'] == "T"</filter>
-    </data>
-    <data format="tabular" name="out_VOOM" label="${title}_topTable_VOOM.xls">
-         <filter>doVoom == "T"</filter>
-    </data>
-    <data format="html" name="html_file" label="${title}.html"/>
-  </outputs>
- <stdio>
-     <exit_code range="4"   level="fatal"   description="Number of subject ids must match total number of samples in the input matrix" />
- </stdio>
- <tests>
-<test>
-<param name='input1' value='test_bams2mx.xls' ftype='tabular' />
- <param name='treatment_name' value='liver' />
- <param name='title' value='edgeRtest' />
- <param name='useNDF' value='' />
- <param name='doedgeR' value='T' />
- <param name='doVoom' value='T' />
- <param name='doDESeq2' value='T' />
- <param name='fdrtype' value='fdr' />
- <param name='edgeR_priordf' value="8" />
- <param name='fdrthresh' value="0.05" />
- <param name='control_name' value='heart' />
- <param name='subjectids' value='' />
- <param name='Control_cols' value='3,4,5,9' />
- <param name='Treat_cols' value='2,6,7,8' />
- <output name='out_edgeR' file='edgeRtest1out.xls' compare='diff' />
- <output name='html_file' file='edgeRtest1out.html'  compare='diff' lines_diff='20' />
-</test>
-</tests>
-
-<configfiles>
-<configfile name="runme">
-<![CDATA[
-# 
-# edgeR.Rscript
-# updated npv 2011 for R 2.14.0 and edgeR 2.4.0 by ross
-# Performs DGE on a count table containing n replicates of two conditions
-#
-# Parameters
-#
-# 1 - Output Dir
-
-# Original edgeR code by: S.Lunke and A.Kaspi
-reallybig = log10(.Machine\$double.xmax)
-reallysmall = log10(.Machine\$double.xmin)
-library('stringr')
-library('gplots')
-library('edgeR')
-hmap2 = function(cmat,nsamp=100,outpdfname='heatmap2.pdf', TName='Treatment',group=NA,myTitle='title goes here')
-{
-# Perform clustering for significant pvalues after controlling FWER
-    samples = colnames(cmat)
-    gu = unique(group)
-    gn = rownames(cmat)
-    if (length(gu) == 2) {
-        col.map = function(g) {if (g==gu[1]) "#FF0000" else "#0000FF"}
-        pcols = unlist(lapply(group,col.map))        
-        } else {
-        colours = rainbow(length(gu),start=0,end=4/6)
-        pcols = colours[match(group,gu)]        }
-    dm = cmat[(! is.na(gn)),] 
-    # remove unlabelled hm rows
-    nprobes = nrow(dm)
-    # sub = paste('Showing',nprobes,'contigs ranked for evidence of differential abundance')
-    if (nprobes > nsamp) {
-      dm =dm[1:nsamp,]
-      #sub = paste('Showing',nsamp,'contigs ranked for evidence for differential abundance out of',nprobes,'total')
-    }
-    newcolnames = substr(colnames(dm),1,20)
-    colnames(dm) = newcolnames
-    pdf(outpdfname)
-    heatmap.2(dm,main=myTitle,ColSideColors=pcols,col=topo.colors(100),dendrogram="col",key=T,density.info='none',
-         Rowv=F,scale='row',trace='none',margins=c(8,8),cexRow=0.4,cexCol=0.5)
-    dev.off()
-}
-
-hmap = function(cmat,nmeans=4,outpdfname="heatMap.pdf",nsamp=250,TName='Treatment',group=NA,myTitle="Title goes here")
-{
-    # for 2 groups only was
-    #col.map = function(g) {if (g==TName) "#FF0000" else "#0000FF"}
-    #pcols = unlist(lapply(group,col.map))
-    gu = unique(group)
-    colours = rainbow(length(gu),start=0.3,end=0.6)
-    pcols = colours[match(group,gu)]
-    nrows = nrow(cmat)
-    mtitle = paste(myTitle,'Heatmap: n contigs =',nrows)
-    if (nrows > nsamp)  {
-               cmat = cmat[c(1:nsamp),]
-               mtitle = paste('Heatmap: Top ',nsamp,' DE contigs (of ',nrows,')',sep='')
-          }
-    newcolnames = substr(colnames(cmat),1,20)
-    colnames(cmat) = newcolnames
-    pdf(outpdfname)
-    heatmap(cmat,scale='row',main=mtitle,cexRow=0.3,cexCol=0.4,Rowv=NA,ColSideColors=pcols)
-    dev.off()
-}
-
-qqPlot = function(descr='qqplot',pvector, outpdf='qqplot.pdf',...)
-# stolen from https://gist.github.com/703512
-{
-    o = -log10(sort(pvector,decreasing=F))
-    e = -log10( 1:length(o)/length(o) )
-    o[o==-Inf] = reallysmall
-    o[o==Inf] = reallybig
-    maint = descr
-    pdf(outpdf)
-    plot(e,o,pch=19,cex=1, main=maint, ...,
-        xlab=expression(Expected~~-log[10](italic(p))),
-        ylab=expression(Observed~~-log[10](italic(p))),
-        xlim=c(0,max(e)), ylim=c(0,max(o)))
-    lines(e,e,col="red")
-    grid(col = "lightgray", lty = "dotted")
-    dev.off()
-}
-
-smearPlot = function(DGEList,deTags, outSmear, outMain)
-        {
-        pdf(outSmear)
-        plotSmear(DGEList,de.tags=deTags,main=outMain)
-        grid(col="lightgray", lty="dotted")
-        dev.off()
-        }
-        
-boxPlot = function(rawrs,cleanrs,maint,myTitle,pdfname)
-{   # 
-        nc = ncol(rawrs)
-        for (i in c(1:nc)) {rawrs[(rawrs[,i] < 0),i] = NA}
-        fullnames = colnames(rawrs)
-        newcolnames = substr(colnames(rawrs),1,20)
-        colnames(rawrs) = newcolnames
-        newcolnames = substr(colnames(cleanrs),1,20)
-        colnames(cleanrs) = newcolnames
-        defpar = par(no.readonly=T)
-        print.noquote('raw contig counts by sample:')
-        print.noquote(summary(rawrs))
-        print.noquote('normalised contig counts by sample:')
-        print.noquote(summary(cleanrs))
-        pdf(pdfname)
-        par(mfrow=c(1,2))
-        boxplot(rawrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('Raw:',maint))
-        grid(col="lightgray",lty="dotted")
-        boxplot(cleanrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('After ',maint))
-        grid(col="lightgray",lty="dotted")
-        dev.off()
-        pdfname = "sample_counts_histogram.pdf" 
-        nc = ncol(rawrs)
-        print.noquote(paste('Using ncol rawrs=',nc))
-        ncroot = round(sqrt(nc))
-        if (ncroot*ncroot < nc) { ncroot = ncroot + 1 }
-        m = c()
-        for (i in c(1:nc)) {
-              rhist = hist(rawrs[,i],breaks=100,plot=F)
-              m = append(m,max(rhist\$counts))
-             }
-        ymax = max(m)
-        ncols = length(fullnames)
-        if (ncols > 20) 
-        {
-           scale = 7*ncols/20
-           pdf(pdfname,width=scale,height=scale)
-        } else { 
-           pdf(pdfname)
-        }
-        par(mfrow=c(ncroot,ncroot))
-        for (i in c(1:nc)) {
-                 hist(rawrs[,i], main=paste("Contig logcount",i), xlab='log raw count', col="maroon", 
-                 breaks=100,sub=fullnames[i],cex=0.8,ylim=c(0,ymax))
-             }
-        dev.off()
-        par(defpar)
-      
-}
-
-cumPlot = function(rawrs,cleanrs,maint,myTitle)
-{   # updated to use ecdf
-        pdfname = "Filtering_rowsum_bar_charts.pdf"
-        defpar = par(no.readonly=T)
-        lrs = log(rawrs,10) 
-        lim = max(lrs)
-        pdf(pdfname)
-        par(mfrow=c(2,1))
-        hist(lrs,breaks=100,main=paste('Before:',maint),xlab="# Reads (log)",
-             ylab="Count",col="maroon",sub=myTitle, xlim=c(0,lim),las=1)
-        grid(col="lightgray", lty="dotted")
-        lrs = log(cleanrs,10) 
-        hist(lrs,breaks=100,main=paste('After:',maint),xlab="# Reads (log)",
-             ylab="Count",col="maroon",sub=myTitle,xlim=c(0,lim),las=1)
-        grid(col="lightgray", lty="dotted")
-        dev.off()
-        par(defpar)
-}
-
-cumPlot1 = function(rawrs,cleanrs,maint,myTitle)
-{   # updated to use ecdf
-        pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"RowsumCum.pdf",sep='_')
-        pdf(pdfname)
-        par(mfrow=c(2,1))
-        lastx = max(rawrs)
-        rawe = knots(ecdf(rawrs))
-        cleane = knots(ecdf(cleanrs))
-        cy = 1:length(cleane)/length(cleane)
-        ry = 1:length(rawe)/length(rawe)
-        plot(rawe,ry,type='l',main=paste('Before',maint),xlab="Log Contig Total Reads",
-             ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle)
-        grid(col="blue")
-        plot(cleane,cy,type='l',main=paste('After',maint),xlab="Log Contig Total Reads",
-             ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle)
-        grid(col="blue")
-        dev.off()
-}
-
-
-
-doGSEAold = function(y=NULL,design=NULL,histgmt="",
-                  bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
-                  ntest=0, myTitle="myTitle", outfname="GSEA.xls", minnin=5, maxnin=2000,fdrthresh=0.05,fdrtype="BH")
-{
-  sink('Camera.log')
-  genesets = c()
-  if (bigmt > "")
-  {
-    bigenesets = readLines(bigmt)
-    genesets = bigenesets
-  }
-  if (histgmt > "")
-  {
-    hgenesets = readLines(histgmt)
-    if (bigmt > "") {
-      genesets = rbind(genesets,hgenesets)
-    } else {
-      genesets = hgenesets
-    } # use only history if no bi
-  }
-  print.noquote(paste("@@@read",length(genesets), 'genesets from',histgmt,bigmt))
-  genesets = strsplit(genesets,'\t') # tabular. genesetid\tURLorwhatever\tgene_1\t..\tgene_n
-  outf = outfname
-  head=paste(myTitle,'edgeR GSEA')
-  write(head,file=outfname,append=F)
-  ntest=length(genesets)
-  urownames = toupper(rownames(y))
-  upcam = c()
-  downcam = c()
-  for (i in 1:ntest) {
-    gs = unlist(genesets[i])
-    g = gs[1] # geneset_id
-    u = gs[2]
-    if (u > "") { u = paste("<a href=\'",u,"\'>",u,"</a>",sep="") }
-    glist = gs[3:length(gs)] # member gene symbols
-    glist = toupper(glist)
-    inglist = urownames %in% glist
-    nin = sum(inglist)
-    if ((nin > minnin) && (nin < maxnin)) {
-      ### print(paste('@@found',sum(inglist),'genes in glist'))
-      camres = camera(y=y,index=inglist,design=design)
-      if (! is.null(camres)) {
-      rownames(camres) = g # gene set name
-      camres = cbind(GeneSet=g,URL=u,camres)
-      if (camres\$Direction == "Up") 
-        {
-        upcam = rbind(upcam,camres) } else {
-          downcam = rbind(downcam,camres)
-        }
-      }
-   }
-  }
-  uscam = upcam[order(upcam\$PValue),]
-  unadjp = uscam\$PValue
-  uscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
-  nup = max(10,sum((uscam\$adjPValue < fdrthresh)))
-  dscam = downcam[order(downcam\$PValue),]
-  unadjp = dscam\$PValue
-  dscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
-  ndown = max(10,sum((dscam\$adjPValue < fdrthresh)))
-  write.table(uscam,file=paste('camera_up',outfname,sep='_'),quote=F,sep='\t',row.names=F)
-  write.table(dscam,file=paste('camera_down',outfname,sep='_'),quote=F,sep='\t',row.names=F)
-  print.noquote(paste('@@@@@ Camera up top',nup,'gene sets:'))
-  write.table(head(uscam,nup),file="",quote=F,sep='\t',row.names=F)
-  print.noquote(paste('@@@@@ Camera down top',ndown,'gene sets:'))
-  write.table(head(dscam,ndown),file="",quote=F,sep='\t',row.names=F)
-  sink()
-}
-
-
-
-
-doGSEA = function(y=NULL,design=NULL,histgmt="",
-                  bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
-                  ntest=0, myTitle="myTitle", outfname="GSEA.xls", minnin=5, maxnin=2000,fdrthresh=0.05,fdrtype="BH")
-{
-  sink('Camera.log')
-  genesets = c()
-  if (bigmt > "")
-  {
-    bigenesets = readLines(bigmt)
-    genesets = bigenesets
-  }
-  if (histgmt > "")
-  {
-    hgenesets = readLines(histgmt)
-    if (bigmt > "") {
-      genesets = rbind(genesets,hgenesets)
-    } else {
-      genesets = hgenesets
-    } # use only history if no bi
-  }
-  print.noquote(paste("@@@read",length(genesets), 'genesets from',histgmt,bigmt))
-  genesets = strsplit(genesets,'\t') # tabular. genesetid\tURLorwhatever\tgene_1\t..\tgene_n
-  outf = outfname
-  head=paste(myTitle,'edgeR GSEA')
-  write(head,file=outfname,append=F)
-  ntest=length(genesets)
-  urownames = toupper(rownames(y))
-  upcam = c()
-  downcam = c()
-  incam = c()
-  urls = c()
-  gsids = c()
-  for (i in 1:ntest) {
-    gs = unlist(genesets[i])
-    gsid = gs[1] # geneset_id
-    url = gs[2]
-    if (url > "") { url = paste("<a href=\'",url,"\'>",url,"</a>",sep="") }
-    glist = gs[3:length(gs)] # member gene symbols
-    glist = toupper(glist)
-    inglist = urownames %in% glist
-    nin = sum(inglist)
-    if ((nin > minnin) && (nin < maxnin)) {
-      incam = c(incam,inglist)
-      gsids = c(gsids,gsid)
-      urls = c(urls,url)
-    }
-  }
-  incam = as.list(incam)
-  names(incam) = gsids
-  allcam = camera(y=y,index=incam,design=design)
-  allcamres = cbind(geneset=gsids,allcam,URL=urls)
-  for (i in 1:ntest) {
-    camres = allcamres[i]
-    res = try(test = (camres\$Direction == "Up"))
-    if ("try-error" %in% class(res)) {
-      cat("test failed, camres = :")
-      print.noquote(camres)
-    } else  { if (camres\$Direction == "Up")
-    {  upcam = rbind(upcam,camres)
-    } else { downcam = rbind(downcam,camres)
-    }
-              
-    }
-  }
-  uscam = upcam[order(upcam\$PValue),]
-  unadjp = uscam\$PValue
-  uscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
-  nup = max(10,sum((uscam\$adjPValue < fdrthresh)))
-  dscam = downcam[order(downcam\$PValue),]
-  unadjp = dscam\$PValue
-  dscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
-  ndown = max(10,sum((dscam\$adjPValue < fdrthresh)))
-  write.table(uscam,file=paste('camera_up',outfname,sep='_'),quote=F,sep='\t',row.names=F)
-  write.table(dscam,file=paste('camera_down',outfname,sep='_'),quote=F,sep='\t',row.names=F)
-  print.noquote(paste('@@@@@ Camera up top',nup,'gene sets:'))
-  write.table(head(uscam,nup),file="",quote=F,sep='\t',row.names=F)
-  print.noquote(paste('@@@@@ Camera down top',ndown,'gene sets:'))
-  write.table(head(dscam,ndown),file="",quote=F,sep='\t',row.names=F)
-  sink()
-  }
- 
-
-edgeIt = function (Count_Matrix=c(),group=c(),out_edgeR=F,out_VOOM=F,out_DESeq2=F,fdrtype='fdr',priordf=5, 
-        fdrthresh=0.05,outputdir='.', myTitle='Differential Counts',libSize=c(),useNDF=F,
-        filterquantile=0.2, subjects=c(),mydesign=NULL,
-        doDESeq2=T,doVoom=T,doCamera=T,doedgeR=T,org='hg19',
-        histgmt="", bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
-        doCook=F,DESeq_fitType="parameteric") 
-{
-  # Error handling
-  if (length(unique(group))!=2){
-    print("Number of conditions identified in experiment does not equal 2")
-    q()
-    }
-  require(edgeR)
-  options(width = 512) 
-  mt = paste(unlist(strsplit(myTitle,'_')),collapse=" ")
-  allN = nrow(Count_Matrix)
-  nscut = round(ncol(Count_Matrix)/2)
-  colTotmillionreads = colSums(Count_Matrix)/1e6
-  counts.dataframe = as.data.frame(c()) 
-  rawrs = rowSums(Count_Matrix)
-  nonzerod = Count_Matrix[(rawrs > 0),] # remove all zero count genes
-  nzN = nrow(nonzerod)
-  nzrs = rowSums(nonzerod)
-  zN = allN - nzN
-  print('# Quantiles for non-zero row counts:',quote=F)
-  print(quantile(nzrs,probs=seq(0,1,0.1)),quote=F)
-  if (useNDF == T)
-  {
-    gt1rpin3 = rowSums(Count_Matrix/expandAsMatrix(colTotmillionreads,dim(Count_Matrix)) >= 1) >= nscut
-    lo = colSums(Count_Matrix[!gt1rpin3,])
-    workCM = Count_Matrix[gt1rpin3,]
-    cleanrs = rowSums(workCM)
-    cleanN = length(cleanrs)
-    meth = paste( "After removing",length(lo),"contigs with fewer than ",nscut," sample read counts >= 1 per million, there are",sep="")
-    print(paste("Read",allN,"contigs. Removed",zN,"contigs with no reads.",meth,cleanN,"contigs"),quote=F)
-    maint = paste('Filter >=1/million reads in >=',nscut,'samples')
-  }   else {        
-    useme = (nzrs > quantile(nzrs,filterquantile))
-    workCM = nonzerod[useme,]
-    lo = colSums(nonzerod[!useme,])
-    cleanrs = rowSums(workCM)
-    cleanN = length(cleanrs)
-    meth = paste("After filtering at count quantile =",filterquantile,", there are",sep="")
-    print(paste('Read',allN,"contigs. Removed",zN,"with no reads.",meth,cleanN,"contigs"),quote=F)
-    maint = paste('Filter below',filterquantile,'quantile')
-  }
-  cumPlot(rawrs=rawrs,cleanrs=cleanrs,maint=maint,myTitle=myTitle)
-  allgenes = rownames(workCM)
-  reg = "^chr([0-9]+):([0-9]+)-([0-9]+)"
-  genecards="<a href=\'http://www.genecards.org/index.php?path=/Search/keyword/"
-  ucsc = paste("<a href=\'http://genome.ucsc.edu/cgi-bin/hgTracks?db=",org,sep='')
-  testreg = str_match(allgenes,reg)
-  if (sum(!is.na(testreg[,1]))/length(testreg[,1]) > 0.8) # is ucsc style string
-  {
-    print("@@ using ucsc substitution for urls")
-    contigurls = paste0(ucsc,"&amp;position=chr",testreg[,2],":",testreg[,3],"-",testreg[,4],"\'>",allgenes,"</a>")
-  } else {
-    print("@@ using genecards substitution for urls")
-    contigurls = paste0(genecards,allgenes,"\'>",allgenes,"</a>")
-  }
-  print.noquote("# urls")
-  print.noquote(head(contigurls))
-  print(paste("# Total low count contigs per sample = ",paste(lo,collapse=',')),quote=F) 
-  cmrowsums = rowSums(workCM)
-  TName=unique(group)[1]
-  CName=unique(group)[2]
-  if (is.null(mydesign)) {
-    if (length(subjects) == 0) 
-    {
-      mydesign = model.matrix(~group)
-    } 
-    else { 
-      subjf = factor(subjects)
-      mydesign = model.matrix(~subjf+group) # we block on subject so make group last to simplify finding it
-    }
-  } 
-  print.noquote(paste('Using samples:',paste(colnames(workCM),collapse=',')))
-  print.noquote('Using design matrix:')
-  print.noquote(mydesign)
-  if (doedgeR) {
-  sink('edgeR.log')
-  #### Setup DGEList object
-  DGEList = DGEList(counts=workCM, group = group)
-  DGEList = calcNormFactors(DGEList)
-
-  DGEList = estimateGLMCommonDisp(DGEList,mydesign)
-  comdisp = DGEList\$common.dispersion
-  DGEList = estimateGLMTrendedDisp(DGEList,mydesign)
-  if (edgeR_priordf > 0) {
-    print.noquote(paste("prior.df =",edgeR_priordf))
-    DGEList = estimateGLMTagwiseDisp(DGEList,mydesign,prior.df = edgeR_priordf)
-  } else {
-    DGEList = estimateGLMTagwiseDisp(DGEList,mydesign)
-  }
-  DGLM = glmFit(DGEList,design=mydesign)
-  DE = glmLRT(DGLM,coef=ncol(DGLM\$design)) # always last one - subject is first if needed
-  efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors
-  normData = (1e+06*DGEList\$counts/efflib)
-  uoutput = cbind( 
-    Name=as.character(rownames(DGEList\$counts)),
-    DE\$table,
-    adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
-    Dispersion=DGEList\$tagwise.dispersion,totreads=cmrowsums,normData,
-    DGEList\$counts
-  )
-  soutput = uoutput[order(DE\$table\$PValue),] # sorted into p value order - for quick toptable
-  goodness = gof(DGLM, pcutoff=fdrthresh)
-  if (sum(goodness\$outlier) > 0) {
-    print.noquote('GLM outliers:')
-    print(paste(rownames(DGLM)[(goodness\$outlier)],collapse=','),quote=F)
-    } else { 
-      print('No GLM fit outlier genes found\n')
-    }
-  z = limma::zscoreGamma(goodness\$gof.statistic, shape=goodness\$df/2, scale=2)
-  pdf("edgeR_GoodnessofFit.pdf")
-  qq = qqnorm(z, panel.first=grid(), main="tagwise dispersion")
-  abline(0,1,lwd=3)
-  points(qq\$x[goodness\$outlier],qq\$y[goodness\$outlier], pch=16, col="maroon")
-  dev.off()
-  estpriorn = getPriorN(DGEList)
-  print(paste("Common Dispersion =",comdisp,"CV = ",sqrt(comdisp),"getPriorN = ",estpriorn),quote=F)
-  efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors
-  normData = (1e+06*DGEList\$counts/efflib)
-  uniqueg = unique(group)
-  #### Plot MDS
-  sample_colors =  match(group,levels(group))
-  sampleTypes = levels(factor(group))
-  print.noquote(sampleTypes)
-  pdf("edgeR_MDSplot.pdf")
-  plotMDS.DGEList(DGEList,main=paste("edgeR MDS for",myTitle),cex=0.5,col=sample_colors,pch=sample_colors)
-  legend(x="topleft", legend = sampleTypes,col=c(1:length(sampleTypes)), pch=19)
-  grid(col="blue")
-  dev.off()
-  colnames(normData) = paste( colnames(normData),'N',sep="_")
-  print(paste('Raw sample read totals',paste(colSums(nonzerod,na.rm=T),collapse=',')))
-  nzd = data.frame(log(nonzerod + 1e-2,10))
-  try( boxPlot(rawrs=nzd,cleanrs=log(normData,10),maint='TMM Normalisation',myTitle=myTitle,pdfname="edgeR_raw_norm_counts_box.pdf") )
-  write.table(soutput,file=out_edgeR, quote=FALSE, sep="\t",row.names=F)
-  tt = cbind( 
-    Name=as.character(rownames(DGEList\$counts)),
-    DE\$table,
-    adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
-    Dispersion=DGEList\$tagwise.dispersion,totreads=cmrowsums
-  )
-  print.noquote("# edgeR Top tags\n")
-  tt = cbind(tt,URL=contigurls) # add to end so table isn't laid out strangely
-  tt = tt[order(DE\$table\$PValue),] 
-  print.noquote(tt[1:50,])
-  deTags = rownames(uoutput[uoutput\$adj.p.value < fdrthresh,])
-  nsig = length(deTags)
-  print(paste('#',nsig,'tags significant at adj p=',fdrthresh),quote=F)
-  deColours = ifelse(deTags,'red','black')
-  pdf("edgeR_BCV_vs_abundance.pdf")
-  plotBCV(DGEList, cex=0.3, main="Biological CV vs abundance")
-  dev.off()
-  dg = DGEList[order(DE\$table\$PValue),]
-  #normData = (1e+06 * dg\$counts/expandAsMatrix(dg\$samples\$lib.size, dim(dg)))
-  efflib = dg\$samples\$lib.size*dg\$samples\$norm.factors
-  normData = (1e+06*dg\$counts/efflib)
-  outpdfname="edgeR_top_100_heatmap.pdf"
-  hmap2(normData,nsamp=100,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste('edgeR Heatmap',myTitle))
-  outSmear = "edgeR_smearplot.pdf"
-  outMain = paste("Smear Plot for ",TName,' Vs ',CName,' (FDR@',fdrthresh,' N = ',nsig,')',sep='')
-  smearPlot(DGEList=DGEList,deTags=deTags, outSmear=outSmear, outMain = outMain)
-  qqPlot(descr=paste(myTitle,'edgeR adj p QQ plot'),pvector=tt\$adj.p.value,outpdf='edgeR_qqplot.pdf')
-  norm.factor = DGEList\$samples\$norm.factors
-  topresults.edgeR = soutput[which(soutput\$adj.p.value < fdrthresh), ]
-  edgeRcountsindex = which(allgenes %in% rownames(topresults.edgeR))
-  edgeRcounts = rep(0, length(allgenes))
-  edgeRcounts[edgeRcountsindex] = 1  # Create venn diagram of hits
-  sink()
-  } ### doedgeR
-  if (doDESeq2 == T)
-  {
-    sink("DESeq2.log")
-    # DESeq2
-    require('DESeq2')
-    library('RColorBrewer')
-    if (length(subjects) == 0)
-        {
-        pdata = data.frame(Name=colnames(workCM),Rx=group,row.names=colnames(workCM))
-        deSEQds = DESeqDataSetFromMatrix(countData = workCM,  colData = pdata, design = formula(~ Rx))
-        } else {
-        pdata = data.frame(Name=colnames(workCM),Rx=group,subjects=subjects,row.names=colnames(workCM))
-        deSEQds = DESeqDataSetFromMatrix(countData = workCM,  colData = pdata, design = formula(~ subjects + Rx))
-        }
-    #DESeq2 = DESeq(deSEQds,fitType='local',pAdjustMethod=fdrtype)
-    #rDESeq = results(DESeq2)
-    #newCountDataSet(workCM, group)
-    deSeqDatsizefac = estimateSizeFactors(deSEQds)
-    deSeqDatdisp = estimateDispersions(deSeqDatsizefac,fitType=DESeq_fitType)
-    resDESeq = nbinomWaldTest(deSeqDatdisp, pAdjustMethod=fdrtype)
-    rDESeq = as.data.frame(results(resDESeq))
-    rDESeq = cbind(Contig=rownames(workCM),rDESeq,NReads=cmrowsums,URL=contigurls)
-    srDESeq = rDESeq[order(rDESeq\$pvalue),]
-    qqPlot(descr=paste(myTitle,'DESeq2 adj p qq plot'),pvector=rDESeq\$padj,outpdf='DESeq2_qqplot.pdf')
-    cat("# DESeq top 50\n")
-    print.noquote(srDESeq[1:50,])
-    write.table(srDESeq,file=out_DESeq2, quote=FALSE, sep="\t",row.names=F)
-    topresults.DESeq = rDESeq[which(rDESeq\$padj < fdrthresh), ]
-    DESeqcountsindex = which(allgenes %in% rownames(topresults.DESeq))
-    DESeqcounts = rep(0, length(allgenes))
-    DESeqcounts[DESeqcountsindex] = 1
-    pdf("DESeq2_dispersion_estimates.pdf")
-    plotDispEsts(resDESeq)
-    dev.off()
-    ysmall = abs(min(rDESeq\$log2FoldChange))
-    ybig = abs(max(rDESeq\$log2FoldChange))
-    ylimit = min(4,ysmall,ybig)
-    pdf("DESeq2_MA_plot.pdf")
-    plotMA(resDESeq,main=paste(myTitle,"DESeq2 MA plot"),ylim=c(-ylimit,ylimit))
-    dev.off()
-    rlogres = rlogTransformation(resDESeq)
-    sampledists = dist( t( assay(rlogres) ) )
-    sdmat = as.matrix(sampledists)
-    pdf("DESeq2_sample_distance_plot.pdf")
-    heatmap.2(sdmat,trace="none",main=paste(myTitle,"DESeq2 sample distances"),
-         col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(255))
-    dev.off()
-    ###outpdfname="DESeq2_top50_heatmap.pdf"
-    ###hmap2(sresDESeq,nsamp=50,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste('DESeq2 vst rlog Heatmap',myTitle))
-    sink()
-    result = try( (ppca = plotPCA( varianceStabilizingTransformation(deSeqDatdisp,blind=T), intgroup=c("Rx","Name")) ) )
-    if ("try-error" %in% class(result)) {
-         print.noquote('DESeq2 plotPCA failed.')
-         } else {
-         pdf("DESeq2_PCA_plot.pdf")
-         #### wtf - print? Seems needed to get this to work
-         print(ppca)
-         dev.off()
-        }
-  }
-
-  if (doVoom == T) {
-      sink('VOOM.log')
-      if (doedgeR == F) {
-         #### Setup DGEList object
-         DGEList = DGEList(counts=workCM, group = group)
-         DGEList = calcNormFactors(DGEList)
-         DGEList = estimateGLMCommonDisp(DGEList,mydesign)
-         DGEList = estimateGLMTrendedDisp(DGEList,mydesign)
-         DGEList = estimateGLMTagwiseDisp(DGEList,mydesign)
-         DGEList = estimateGLMTagwiseDisp(DGEList,mydesign)
-         norm.factor = DGEList\$samples\$norm.factors
-         }
-      pdf("VOOM_mean_variance_plot.pdf")
-      dat.voomed = voom(DGEList, mydesign, plot = TRUE, lib.size = colSums(workCM) * norm.factor)
-      dev.off()
-      # Use limma to fit data
-      fit = lmFit(dat.voomed, mydesign)
-      fit = eBayes(fit)
-      rvoom = topTable(fit, coef = length(colnames(mydesign)), adj = fdrtype, n = Inf, sort="none")
-      qqPlot(descr=paste(myTitle,'VOOM-limma adj p QQ plot'),pvector=rvoom\$adj.P.Val,outpdf='VOOM_qqplot.pdf')
-      rownames(rvoom) = rownames(workCM)
-      rvoom = cbind(rvoom,NReads=cmrowsums,URL=contigurls)
-      srvoom = rvoom[order(rvoom\$P.Value),]
-      cat("# VOOM top 50\n")
-      print(srvoom[1:50,])
-      write.table(srvoom,file=out_VOOM, quote=FALSE, sep="\t",row.names=F)
-      # Use an FDR cutoff to find interesting samples for edgeR, DESeq and voom/limma
-      topresults.voom = rvoom[which(rvoom\$adj.P.Val < fdrthresh), ]
-      voomcountsindex = which(allgenes %in% topresults.voom\$ID)
-      voomcounts = rep(0, length(allgenes))
-      voomcounts[voomcountsindex] = 1
-      sink()
-  }
-
-  if (doCamera) {
-  doGSEA(y=DGEList,design=mydesign,histgmt=histgmt,bigmt=bigmt,ntest=20,myTitle=myTitle,
-    outfname=paste(mt,"GSEA.xls",sep="_"),fdrthresh=fdrthresh,fdrtype=fdrtype)
-  }
-
-  if ((doDESeq2==T) || (doVoom==T) || (doedgeR==T)) {
-    if ((doVoom==T) && (doDESeq2==T) && (doedgeR==T)) {
-        vennmain = paste(mt,'Voom,edgeR and DESeq2 overlap at FDR=',fdrthresh)
-        counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts, 
-                                       VOOM_limma = voomcounts, row.names = allgenes)
-       } else if ((doDESeq2==T) && (doedgeR==T))  {
-         vennmain = paste(mt,'DESeq2 and edgeR overlap at FDR=',fdrthresh)
-         counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts, row.names = allgenes)
-       } else if ((doVoom==T) && (doedgeR==T)) {
-        vennmain = paste(mt,'Voom and edgeR overlap at FDR=',fdrthresh)
-        counts.dataframe = data.frame(edgeR = edgeRcounts, VOOM_limma = voomcounts, row.names = allgenes)
-       }
-    
-    if (nrow(counts.dataframe > 1)) {
-      counts.venn = vennCounts(counts.dataframe)
-      vennf = "Venn_significant_genes_overlap.pdf" 
-      pdf(vennf)
-      vennDiagram(counts.venn,main=vennmain,col="maroon")
-      dev.off()
-    }
-  } #### doDESeq2 or doVoom
-  
-}
-#### Done
-
-###sink(stdout(),append=T,type="message")
-builtin_gmt = ""
-history_gmt = ""
-history_gmt_name = ""
-out_edgeR = F
-out_DESeq2 = F
-out_VOOM = "$out_VOOM"
-doDESeq2 = $DESeq2.doDESeq2 # make these T or F
-doVoom = $doVoom
-doCamera = F
-doedgeR = $edgeR.doedgeR
-edgeR_priordf = 0
-
-
-#if $doVoom == "T":
-  out_VOOM = "$out_VOOM"
-#end if
-
-#if $DESeq2.doDESeq2 == "T":
-  out_DESeq2 = "$out_DESeq2"
-  DESeq_fitType = "$DESeq2.DESeq_fitType"
-#end if
-
-#if $edgeR.doedgeR == "T":
-  out_edgeR = "$out_edgeR"
-  edgeR_priordf = $edgeR.edgeR_priordf  
-#end if
-
-<!--
-#if $camera.doCamera == 'T'
- doCamera = $camera.doCamera
- #if $camera.gmtSource.refgmtSource == "indexed" or $camera.gmtSource.refgmtSource == "both":
-  builtin_gmt = "${camera.gmtSource.builtinGMT.fields.path}"
- #end if
- #if $camera.gmtSource.refgmtSource == "history" or $camera.gmtSource.refgmtSource == "both":
-    history_gmt = "${camera.gmtSource.ownGMT}"
-    history_gmt_name = "${camera.gmtSource.ownGMT.name}"
-  #end if
-#end if
--->
-
-if (sum(c(doedgeR,doVoom,doDESeq2)) == 0)
-{
-write("No methods chosen - nothing to do! Please try again after choosing one or more methods", stderr())
-quit(save="no",status=2)
-}
-
-Out_Dir = "$html_file.files_path"
-Input =  "$input1"
-TreatmentName = "$treatment_name"
-TreatmentCols = "$Treat_cols"
-ControlName = "$control_name"
-ControlCols= "$Control_cols"
-org = "$input1.dbkey"
-if (org == "") { org = "hg19"}
-fdrtype = "$fdrtype"
-fdrthresh = $fdrthresh
-useNDF = $useNDF
-fQ = $fQ # non-differential centile cutoff
-myTitle = "$title"
-sids = strsplit("$subjectids",',')
-subjects = unlist(sids)
-nsubj = length(subjects)
-TCols = as.numeric(strsplit(TreatmentCols,",")[[1]])-1
-CCols = as.numeric(strsplit(ControlCols,",")[[1]])-1 
-cat('Got TCols=')
-cat(TCols)
-cat('; CCols=')
-cat(CCols)
-cat('\n')
-useCols = c(TCols,CCols)
-if (file.exists(Out_Dir) == F) dir.create(Out_Dir)
-Count_Matrix = read.table(Input,header=T,row.names=1,sep='\t') #Load tab file assume header
-snames = colnames(Count_Matrix)
-nsamples = length(snames)
-if (nsubj >  0 & nsubj != nsamples) {
-options("show.error.messages"=T)
-mess = paste('Fatal error: Supplied subject id list',paste(subjects,collapse=','),
-   'has length',nsubj,'but there are',nsamples,'samples',paste(snames,collapse=','))
-write(mess, stderr())
-quit(save="no",status=4)
-}
-if (length(subjects) != 0) {subjects = subjects[useCols]}
-Count_Matrix = Count_Matrix[,useCols] ### reorder columns
-rn = rownames(Count_Matrix)
-islib = rn %in% c('librarySize','NotInBedRegions')
-LibSizes = Count_Matrix[subset(rn,islib),][1] # take first
-Count_Matrix = Count_Matrix[subset(rn,! islib),]
-group = c(rep(TreatmentName,length(TCols)), rep(ControlName,length(CCols)) )             #Build a group descriptor
-group = factor(group, levels=c(ControlName,TreatmentName))
-colnames(Count_Matrix) = paste(group,colnames(Count_Matrix),sep="_")                   #Relable columns
-results = edgeIt(Count_Matrix=Count_Matrix,group=group, out_edgeR=out_edgeR, out_VOOM=out_VOOM, out_DESeq2=out_DESeq2,
-                 fdrtype='BH',mydesign=NULL,priordf=edgeR_priordf,fdrthresh=fdrthresh,outputdir='.',
-                 myTitle=myTitle,useNDF=F,libSize=c(),filterquantile=fQ,subjects=subjects,
-                 doDESeq2=doDESeq2,doVoom=doVoom,doCamera=doCamera,doedgeR=doedgeR,org=org,
-                 histgmt=history_gmt,bigmt=builtin_gmt,DESeq_fitType=DESeq_fitType)
-sessionInfo()
-]]>
-</configfile>
-</configfiles>
-<help>
-
-**What it does**
-
-Allows short read sequence counts from controlled experiments to be analysed for differentially expressed genes.
-Optionally adds a term for subject if not all samples are independent or if some other factor needs to be blocked in the design.
-
-**Input**
-
-Requires a count matrix as a tabular file. These are best made using the companion HTSeq_ based counter Galaxy wrapper
-and your fave gene model to generate inputs. Each row is a genomic feature (gene or exon eg) and each column the 
-non-negative integer count of reads from one sample overlapping the feature. 
-The matrix must have a header row uniquely identifying the source samples, and unique row names in 
-the first column. Typically the row names are gene symbols or probe ids for downstream use in GSEA and other methods.
-
-**Specifying comparisons**
-
-This is basically dumbed down for two factors - case vs control.
-
-More complex interfaces are possible but painful at present. 
-Probably need to specify a phenotype file to do this better.
-Work in progress. Send code.
-
-If you have (eg) paired samples and wish to include a term in the GLM to account for some other factor (subject in the case of paired samples),
-put a comma separated list of indicators for every sample (whether modelled or not!) indicating (eg) the subject number or 
-A list of integers, one for each subject or an empty string if samples are all independent.
-If not empty, there must be exactly as many integers in the supplied integer list as there are columns (samples) in the count matrix.
-Integers for samples that are not in the analysis *must* be present in the string as filler even if not used.
-
-So if you have 2 pairs out of 6 samples, you need to put in unique integers for the unpaired ones
-eg if you had 6 samples with the first two independent but the second and third pairs each being from independent subjects. you might use
-8,9,1,1,2,2 
-as subject IDs to indicate two paired samples from the same subject in columns 3/4 and 5/6
-
-**Methods available**
-
-You can run 3 popular Bioconductor packages available for count data.
-
-edgeR - see edgeR_ for details
-
-VOOM/limma - see limma_VOOM_ for details
-
-DESeq2 - see DESeq2_ for details
-
-and optionally camera in edgeR which works better if MSigDB is installed.
-
-**Outputs**
-
-Some helpful plots and analysis results. Note that most of these are produced using R code 
-suggested by the excellent documentation and vignettes for the Bioconductor
-packages invoked. The Tool Factory is used to automatically lay these out for you to enjoy.
-
-**Note on Voom**
-
-The voom from limma version 3.16.6 help in R includes this from the authors - but you should read the paper to interpret this method.
-
-This function is intended to process RNA-Seq or ChIP-Seq data prior to linear modelling in limma.
-
-voom is an acronym for mean-variance modelling at the observational level.
-The key concern is to estimate the mean-variance relationship in the data, then use this to compute appropriate weights for each observation.
-Count data almost show non-trivial mean-variance relationships. Raw counts show increasing variance with increasing count size, while log-counts typically show a decreasing mean-variance trend.
-This function estimates the mean-variance trend for log-counts, then assigns a weight to each observation based on its predicted variance.
-The weights are then used in the linear modelling process to adjust for heteroscedasticity.
-
-In an experiment, a count value is observed for each tag in each sample. A tag-wise mean-variance trend is computed using lowess.
-The tag-wise mean is the mean log2 count with an offset of 0.5, across samples for a given tag.
-The tag-wise variance is the quarter-root-variance of normalized log2 counts per million values with an offset of 0.5, across samples for a given tag.
-Tags with zero counts across all samples are not included in the lowess fit. Optional normalization is performed using normalizeBetweenArrays.
-Using fitted values of log2 counts from a linear model fit by lmFit, variances from the mean-variance trend were interpolated for each observation.
-This was carried out by approxfun. Inverse variance weights can be used to correct for mean-variance trend in the count data.
-
-
-Author(s)
-
-Charity Law and Gordon Smyth
-
-References
-
-Law, CW (2013). Precision weights for gene expression analysis. PhD Thesis. University of Melbourne, Australia.
-
-Law, CW, Chen, Y, Shi, W, Smyth, GK (2013). Voom! Precision weights unlock linear model analysis tools for RNA-seq read counts.
-Technical Report 1 May 2013, Bioinformatics Division, Walter and Eliza Hall Institute of Medical Reseach, Melbourne, Australia.
-http://www.statsci.org/smyth/pubs/VoomPreprint.pdf
-
-See Also
-
-A voom case study is given in the edgeR User's Guide.
-
-vooma is a similar function but for microarrays instead of RNA-seq.
-
-
-***old rant on changes to Bioconductor package variable names between versions***
-
-The edgeR authors made a small cosmetic change in the name of one important variable (from p.value to PValue) 
-breaking this and all other code that assumed the old name for this variable, 
-between edgeR2.4.4 and 2.4.6 (the version for R 2.14 as at the time of writing). 
-This means that all code using edgeR is sensitive to the version. I think this was a very unwise thing 
-to do because it wasted hours of my time to track down and will similarly cost other edgeR users dearly
-when their old scripts break. This tool currently now works with 2.4.6.
-
-**Note on prior.N**
-
-http://seqanswers.com/forums/showthread.php?t=5591 says:
-
-*prior.n*
-
-The value for prior.n determines the amount of smoothing of tagwise dispersions towards the common dispersion. 
-You can think of it as like a "weight" for the common value. (It is actually the weight for the common likelihood 
-in the weighted likelihood equation). The larger the value for prior.n, the more smoothing, i.e. the closer your 
-tagwise dispersion estimates will be to the common dispersion. If you use a prior.n of 1, then that gives the 
-common likelihood the weight of one observation.
-
-In answer to your question, it is a good thing to squeeze the tagwise dispersions towards a common value, 
-or else you will be using very unreliable estimates of the dispersion. I would not recommend using the value that 
-you obtained from estimateSmoothing()---this is far too small and would result in virtually no moderation 
-(squeezing) of the tagwise dispersions. How many samples do you have in your experiment? 
-What is the experimental design? If you have few samples (less than 6) then I would suggest a prior.n of at least 10. 
-If you have more samples, then the tagwise dispersion estimates will be more reliable, 
-so you could consider using a smaller prior.n, although I would hesitate to use a prior.n less than 5. 
-
-
-From Bioconductor Digest, Vol 118, Issue 5, Gordon writes:
-
-Dear Dorota,
-
-The important settings are prior.df and trend.
-
-prior.n and prior.df are related through prior.df = prior.n * residual.df,
-and your experiment has residual.df = 36 - 12 = 24.  So the old setting of
-prior.n=10 is equivalent for your data to prior.df = 240, a very large
-value.  Going the other way, the new setting of prior.df=10 is equivalent
-to prior.n=10/24.
-
-To recover old results with the current software you would use
-
-  estimateTagwiseDisp(object, prior.df=240, trend="none")
-
-To get the new default from old software you would use
-
-  estimateTagwiseDisp(object, prior.n=10/24, trend=TRUE)
-
-Actually the old trend method is equivalent to trend="loess" in the new
-software. You should use plotBCV(object) to see whether a trend is
-required.
-
-Note you could also use
-
-  prior.n = getPriorN(object, prior.df=10)
-
-to map between prior.df and prior.n.
-
-----
-
-**Attributions**
-
-edgeR - edgeR_ 
-
-VOOM/limma - limma_VOOM_ 
-
-DESeq2 - DESeq2_ for details
-
-See above for Bioconductor package documentation for packages exposed in Galaxy by this tool and app store package.
-
-Galaxy_ (that's what you are using right now!) for gluing everything together 
-
-Otherwise, all code and documentation comprising this tool was written by Ross Lazarus and is 
-licensed to you under the LGPL_ like other rgenetics artefacts
-
-.. _LGPL: http://www.gnu.org/copyleft/lesser.html
-.. _HTSeq: http://www-huber.embl.de/users/anders/HTSeq/doc/index.html
-.. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html
-.. _DESeq2: http://www.bioconductor.org/packages/release/bioc/html/DESeq2.html
-.. _limma_VOOM: http://www.bioconductor.org/packages/release/bioc/html/limma.html
-.. _Galaxy: http://getgalaxy.org
-</help>
-
-</tool>
-
-
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rgedgeRpaired.xml.camera	Wed Aug 07 02:41:40 2013 -0400
@@ -0,0 +1,1084 @@
+<tool id="rgDifferentialCount" name="Differential_Count" version="0.20">
+  <description>models using BioConductor packages</description>
+  <requirements>
+      <requirement type="package" version="2.12">biocbasics</requirement>
+      <requirement type="package" version="3.0.1">r3</requirement>
+      <requirement type="package" version="1.3.18">graphicsmagick</requirement>
+      <requirement type="package" version="9.07">ghostscript</requirement>
+  </requirements>
+  
+  <command interpreter="python">
+     rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "DifferentialCounts" 
+    --output_dir "$html_file.files_path" --output_html "$html_file" --make_HTML "yes"
+  </command>
+  <inputs>
+    <param name="input1"  type="data" format="tabular" label="Select an input matrix - rows are contigs, columns are counts for each sample"
+       help="Use the HTSeq based count matrix preparation tool to create these matrices from BAM/SAM files and a GTF file of genomic features"/>
+    <param name="title" type="text" value="Differential Counts" size="80" label="Title for job outputs" 
+           help="Supply a meaningful name here to remind you what the outputs contain">
+      <sanitizer invalid_char="">
+        <valid initial="string.letters,string.digits"><add value="_" /> </valid>
+      </sanitizer>
+    </param>
+    <param name="treatment_name" type="text" value="Treatment" size="50" label="Treatment Name"/>
+    <param name="Treat_cols" label="Select columns containing treatment." type="data_column" data_ref="input1" numerical="True" 
+         multiple="true" use_header_names="true" size="120" display="checkboxes">
+        <validator type="no_options" message="Please select at least one column."/>
+    </param>
+    <param name="control_name" type="text" value="Control" size="50" label="Control Name"/>
+    <param name="Control_cols" label="Select columns containing control." type="data_column" data_ref="input1" numerical="True" 
+         multiple="true" use_header_names="true" size="120" display="checkboxes" optional="true">
+    </param>
+    <param name="subjectids" type="text" optional="true" size="120" value = ""
+       label="IF SUBJECTS NOT ALL INDEPENDENT! Enter comma separated strings to indicate sample labels for (eg) pairing - must be one for every column in input"
+       help="Leave blank if no pairing, but eg if data from sample id A99 is in columns 2,4 and id C21 is in 3,5 then enter 'A99,C21,A99,C21'">
+      <sanitizer>
+        <valid initial="string.letters,string.digits"><add value="," /> </valid>
+      </sanitizer>
+    </param>
+    <param name="fQ" type="float" value="0.3" size="5" label="Non-differential contig count quantile threshold - zero to analyze all non-zero read count contigs"
+     help="May be a good or a bad idea depending on the biology and the question. EG 0.3 = sparsest 30% of contigs with at least one read are removed before analysis"/>
+    <param name="useNDF" type="boolean" truevalue="T" falsevalue="F" checked="false" size="1" 
+              label="Non differential filter - remove contigs below a threshold (1 per million) for half or more samples"
+     help="May be a good or a bad idea depending on the biology and the question. This was the old default. Quantile based is available as an alternative"/>
+
+    <conditional name="edgeR">
+        <param name="doedgeR" type="select" 
+           label="Run this model using edgeR"
+           help="edgeR uses a negative binomial model and seems to be powerful, even with few replicates">
+          <option value="F">Do not run edgeR</option>
+          <option value="T" selected="true">Run edgeR</option>
+         </param>
+         <when value="T">
+          <param name="edgeR_priordf" type="integer" value="20" size="3" 
+           label="prior.df for tagwise dispersion - lower value = more emphasis on each tag's variance. Replaces prior.n  and prior.df = prior.n * residual.df"
+           help="0 = Use edgeR default. Use a small value to 'smooth' small samples. See edgeR docs and note below"/>
+         </when>
+         <when value="F"></when>
+    </conditional>
+    <conditional name="DESeq2">
+    <param name="doDESeq2" type="select" 
+       label="Run the same model with DESeq2 and compare findings"
+       help="DESeq2 is an update to the DESeq package. It uses different assumptions and methods to edgeR">
+      <option value="F" selected="true">Do not run DESeq2</option>
+      <option value="T">Run DESeq2</option>
+     </param>
+     <when value="T">
+         <param name="DESeq_fitType" type="select">
+            <option value="parametric" selected="true">Parametric (default) fit for dispersions</option>
+            <option value="local">Local fit - this will automagically be used if parametric fit fails</option>
+            <option value="mean">Mean dispersion fit- use this if you really understand what you're doing - read the fine manual linked below in the documentation</option>
+         </param> 
+     </when>
+     <when value="F"> </when>
+    </conditional>
+    <param name="doVoom" type="select" 
+       label="Run the same model with Voom/limma and compare findings"
+       help="Voom uses counts per million and a precise transformation of variance so count data can be analysed using limma">
+      <option value="F" selected="true">Do not run VOOM</option>
+      <option value="T">Run VOOM</option>
+     </param>
+    <!--
+    <conditional name="camera">
+        <param name="doCamera" type="select" label="Run the edgeR implementation of Camera GSEA for up/down gene sets" 
+            help="If yes, you can choose a set of genesets to test and/or supply a gmt format geneset collection from your history">
+        <option value="F" selected="true">Do not run GSEA tests with the Camera algorithm</option>
+        <option value="T">Run GSEA tests with the Camera algorithm</option>
+        </param>
+         <when value="T">
+         <conditional name="gmtSource">
+          <param name="refgmtSource" type="select" 
+             label="Use a gene set (.gmt) from your history and/or use a built-in (MSigDB etc) gene set">
+            <option value="indexed" selected="true">Use a built-in gene set</option>
+            <option value="history">Use a gene set from my history</option>
+            <option value="both">Add a gene set from my history to a built in gene set</option>
+          </param>
+          <when value="indexed">
+            <param name="builtinGMT" type="select" label="Select a gene set matrix (.gmt) file to use for the analysis">
+              <options from_data_table="gseaGMT_3.1">
+                <filter type="sort_by" column="2" />
+                <validator type="no_options" message="No GMT v3.1 files are available - please install them"/>
+              </options>
+            </param>
+          </when>
+          <when value="history">
+            <param name="ownGMT" type="data" format="gmt" label="Select a Gene Set from your history" />
+          </when>
+          <when value="both">
+            <param name="ownGMT" type="data" format="gseagmt" label="Select a Gene Set from your history" />
+            <param name="builtinGMT" type="select" label="Select a gene set matrix (.gmt) file to use for the analysis">
+              <options from_data_table="gseaGMT_4">
+                <filter type="sort_by" column="2" />
+                <validator type="no_options" message="No GMT v4 files are available - please fix tool_data_table and loc files"/>
+              </options>
+            </param>
+           </when>
+         </conditional>
+         </when>
+         <when value="F"> 
+         </when>
+    </conditional>
+    -->
+    <param name="fdrthresh" type="float" value="0.05" size="5" label="P value threshold for FDR filtering for amily wise error rate control"
+     help="Conventional default value of 0.05 recommended"/>
+    <param name="fdrtype" type="select" label="FDR (Type II error) control method" 
+         help="Use fdr or bh typically to control for the number of tests in a reliable way">
+            <option value="fdr" selected="true">fdr</option>
+            <option value="BH">Benjamini Hochberg</option>
+            <option value="BY">Benjamini Yukateli</option>
+            <option value="bonferroni">Bonferroni</option>
+            <option value="hochberg">Hochberg</option>
+            <option value="holm">Holm</option>
+            <option value="hommel">Hommel</option>
+            <option value="none">no control for multiple tests</option>
+    </param>
+  </inputs>
+  <outputs>
+    <data format="tabular" name="out_edgeR" label="${title}_topTable_edgeR.xls">
+         <filter>edgeR['doedgeR'] == "T"</filter>
+    </data>
+    <data format="tabular" name="out_DESeq2" label="${title}_topTable_DESeq2.xls">
+         <filter>DESeq2['doDESeq2'] == "T"</filter>
+    </data>
+    <data format="tabular" name="out_VOOM" label="${title}_topTable_VOOM.xls">
+         <filter>doVoom == "T"</filter>
+    </data>
+    <data format="html" name="html_file" label="${title}.html"/>
+  </outputs>
+ <stdio>
+     <exit_code range="4"   level="fatal"   description="Number of subject ids must match total number of samples in the input matrix" />
+ </stdio>
+ <tests>
+<test>
+<param name='input1' value='test_bams2mx.xls' ftype='tabular' />
+ <param name='treatment_name' value='liver' />
+ <param name='title' value='edgeRtest' />
+ <param name='useNDF' value='' />
+ <param name='doedgeR' value='T' />
+ <param name='doVoom' value='T' />
+ <param name='doDESeq2' value='T' />
+ <param name='fdrtype' value='fdr' />
+ <param name='edgeR_priordf' value="8" />
+ <param name='fdrthresh' value="0.05" />
+ <param name='control_name' value='heart' />
+ <param name='subjectids' value='' />
+ <param name='Control_cols' value='3,4,5,9' />
+ <param name='Treat_cols' value='2,6,7,8' />
+ <output name='out_edgeR' file='edgeRtest1out.xls' compare='diff' />
+ <output name='html_file' file='edgeRtest1out.html'  compare='diff' lines_diff='20' />
+</test>
+</tests>
+
+<configfiles>
+<configfile name="runme">
+<![CDATA[
+# 
+# edgeR.Rscript
+# updated npv 2011 for R 2.14.0 and edgeR 2.4.0 by ross
+# Performs DGE on a count table containing n replicates of two conditions
+#
+# Parameters
+#
+# 1 - Output Dir
+
+# Original edgeR code by: S.Lunke and A.Kaspi
+reallybig = log10(.Machine\$double.xmax)
+reallysmall = log10(.Machine\$double.xmin)
+library('stringr')
+library('gplots')
+library('edgeR')
+hmap2 = function(cmat,nsamp=100,outpdfname='heatmap2.pdf', TName='Treatment',group=NA,myTitle='title goes here')
+{
+# Perform clustering for significant pvalues after controlling FWER
+    samples = colnames(cmat)
+    gu = unique(group)
+    gn = rownames(cmat)
+    if (length(gu) == 2) {
+        col.map = function(g) {if (g==gu[1]) "#FF0000" else "#0000FF"}
+        pcols = unlist(lapply(group,col.map))        
+        } else {
+        colours = rainbow(length(gu),start=0,end=4/6)
+        pcols = colours[match(group,gu)]        }
+    dm = cmat[(! is.na(gn)),] 
+    # remove unlabelled hm rows
+    nprobes = nrow(dm)
+    # sub = paste('Showing',nprobes,'contigs ranked for evidence of differential abundance')
+    if (nprobes > nsamp) {
+      dm =dm[1:nsamp,]
+      #sub = paste('Showing',nsamp,'contigs ranked for evidence for differential abundance out of',nprobes,'total')
+    }
+    newcolnames = substr(colnames(dm),1,20)
+    colnames(dm) = newcolnames
+    pdf(outpdfname)
+    heatmap.2(dm,main=myTitle,ColSideColors=pcols,col=topo.colors(100),dendrogram="col",key=T,density.info='none',
+         Rowv=F,scale='row',trace='none',margins=c(8,8),cexRow=0.4,cexCol=0.5)
+    dev.off()
+}
+
+hmap = function(cmat,nmeans=4,outpdfname="heatMap.pdf",nsamp=250,TName='Treatment',group=NA,myTitle="Title goes here")
+{
+    # for 2 groups only was
+    #col.map = function(g) {if (g==TName) "#FF0000" else "#0000FF"}
+    #pcols = unlist(lapply(group,col.map))
+    gu = unique(group)
+    colours = rainbow(length(gu),start=0.3,end=0.6)
+    pcols = colours[match(group,gu)]
+    nrows = nrow(cmat)
+    mtitle = paste(myTitle,'Heatmap: n contigs =',nrows)
+    if (nrows > nsamp)  {
+               cmat = cmat[c(1:nsamp),]
+               mtitle = paste('Heatmap: Top ',nsamp,' DE contigs (of ',nrows,')',sep='')
+          }
+    newcolnames = substr(colnames(cmat),1,20)
+    colnames(cmat) = newcolnames
+    pdf(outpdfname)
+    heatmap(cmat,scale='row',main=mtitle,cexRow=0.3,cexCol=0.4,Rowv=NA,ColSideColors=pcols)
+    dev.off()
+}
+
+qqPlot = function(descr='qqplot',pvector, outpdf='qqplot.pdf',...)
+# stolen from https://gist.github.com/703512
+{
+    o = -log10(sort(pvector,decreasing=F))
+    e = -log10( 1:length(o)/length(o) )
+    o[o==-Inf] = reallysmall
+    o[o==Inf] = reallybig
+    maint = descr
+    pdf(outpdf)
+    plot(e,o,pch=19,cex=1, main=maint, ...,
+        xlab=expression(Expected~~-log[10](italic(p))),
+        ylab=expression(Observed~~-log[10](italic(p))),
+        xlim=c(0,max(e)), ylim=c(0,max(o)))
+    lines(e,e,col="red")
+    grid(col = "lightgray", lty = "dotted")
+    dev.off()
+}
+
+smearPlot = function(DGEList,deTags, outSmear, outMain)
+        {
+        pdf(outSmear)
+        plotSmear(DGEList,de.tags=deTags,main=outMain)
+        grid(col="lightgray", lty="dotted")
+        dev.off()
+        }
+        
+boxPlot = function(rawrs,cleanrs,maint,myTitle,pdfname)
+{   # 
+        nc = ncol(rawrs)
+        for (i in c(1:nc)) {rawrs[(rawrs[,i] < 0),i] = NA}
+        fullnames = colnames(rawrs)
+        newcolnames = substr(colnames(rawrs),1,20)
+        colnames(rawrs) = newcolnames
+        newcolnames = substr(colnames(cleanrs),1,20)
+        colnames(cleanrs) = newcolnames
+        defpar = par(no.readonly=T)
+        print.noquote('raw contig counts by sample:')
+        print.noquote(summary(rawrs))
+        print.noquote('normalised contig counts by sample:')
+        print.noquote(summary(cleanrs))
+        pdf(pdfname)
+        par(mfrow=c(1,2))
+        boxplot(rawrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('Raw:',maint))
+        grid(col="lightgray",lty="dotted")
+        boxplot(cleanrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('After ',maint))
+        grid(col="lightgray",lty="dotted")
+        dev.off()
+        pdfname = "sample_counts_histogram.pdf" 
+        nc = ncol(rawrs)
+        print.noquote(paste('Using ncol rawrs=',nc))
+        ncroot = round(sqrt(nc))
+        if (ncroot*ncroot < nc) { ncroot = ncroot + 1 }
+        m = c()
+        for (i in c(1:nc)) {
+              rhist = hist(rawrs[,i],breaks=100,plot=F)
+              m = append(m,max(rhist\$counts))
+             }
+        ymax = max(m)
+        ncols = length(fullnames)
+        if (ncols > 20) 
+        {
+           scale = 7*ncols/20
+           pdf(pdfname,width=scale,height=scale)
+        } else { 
+           pdf(pdfname)
+        }
+        par(mfrow=c(ncroot,ncroot))
+        for (i in c(1:nc)) {
+                 hist(rawrs[,i], main=paste("Contig logcount",i), xlab='log raw count', col="maroon", 
+                 breaks=100,sub=fullnames[i],cex=0.8,ylim=c(0,ymax))
+             }
+        dev.off()
+        par(defpar)
+      
+}
+
+cumPlot = function(rawrs,cleanrs,maint,myTitle)
+{   # updated to use ecdf
+        pdfname = "Filtering_rowsum_bar_charts.pdf"
+        defpar = par(no.readonly=T)
+        lrs = log(rawrs,10) 
+        lim = max(lrs)
+        pdf(pdfname)
+        par(mfrow=c(2,1))
+        hist(lrs,breaks=100,main=paste('Before:',maint),xlab="# Reads (log)",
+             ylab="Count",col="maroon",sub=myTitle, xlim=c(0,lim),las=1)
+        grid(col="lightgray", lty="dotted")
+        lrs = log(cleanrs,10) 
+        hist(lrs,breaks=100,main=paste('After:',maint),xlab="# Reads (log)",
+             ylab="Count",col="maroon",sub=myTitle,xlim=c(0,lim),las=1)
+        grid(col="lightgray", lty="dotted")
+        dev.off()
+        par(defpar)
+}
+
+cumPlot1 = function(rawrs,cleanrs,maint,myTitle)
+{   # updated to use ecdf
+        pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"RowsumCum.pdf",sep='_')
+        pdf(pdfname)
+        par(mfrow=c(2,1))
+        lastx = max(rawrs)
+        rawe = knots(ecdf(rawrs))
+        cleane = knots(ecdf(cleanrs))
+        cy = 1:length(cleane)/length(cleane)
+        ry = 1:length(rawe)/length(rawe)
+        plot(rawe,ry,type='l',main=paste('Before',maint),xlab="Log Contig Total Reads",
+             ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle)
+        grid(col="blue")
+        plot(cleane,cy,type='l',main=paste('After',maint),xlab="Log Contig Total Reads",
+             ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle)
+        grid(col="blue")
+        dev.off()
+}
+
+
+
+doGSEAold = function(y=NULL,design=NULL,histgmt="",
+                  bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
+                  ntest=0, myTitle="myTitle", outfname="GSEA.xls", minnin=5, maxnin=2000,fdrthresh=0.05,fdrtype="BH")
+{
+  sink('Camera.log')
+  genesets = c()
+  if (bigmt > "")
+  {
+    bigenesets = readLines(bigmt)
+    genesets = bigenesets
+  }
+  if (histgmt > "")
+  {
+    hgenesets = readLines(histgmt)
+    if (bigmt > "") {
+      genesets = rbind(genesets,hgenesets)
+    } else {
+      genesets = hgenesets
+    } # use only history if no bi
+  }
+  print.noquote(paste("@@@read",length(genesets), 'genesets from',histgmt,bigmt))
+  genesets = strsplit(genesets,'\t') # tabular. genesetid\tURLorwhatever\tgene_1\t..\tgene_n
+  outf = outfname
+  head=paste(myTitle,'edgeR GSEA')
+  write(head,file=outfname,append=F)
+  ntest=length(genesets)
+  urownames = toupper(rownames(y))
+  upcam = c()
+  downcam = c()
+  for (i in 1:ntest) {
+    gs = unlist(genesets[i])
+    g = gs[1] # geneset_id
+    u = gs[2]
+    if (u > "") { u = paste("<a href=\'",u,"\'>",u,"</a>",sep="") }
+    glist = gs[3:length(gs)] # member gene symbols
+    glist = toupper(glist)
+    inglist = urownames %in% glist
+    nin = sum(inglist)
+    if ((nin > minnin) && (nin < maxnin)) {
+      ### print(paste('@@found',sum(inglist),'genes in glist'))
+      camres = camera(y=y,index=inglist,design=design)
+      if (! is.null(camres)) {
+      rownames(camres) = g # gene set name
+      camres = cbind(GeneSet=g,URL=u,camres)
+      if (camres\$Direction == "Up") 
+        {
+        upcam = rbind(upcam,camres) } else {
+          downcam = rbind(downcam,camres)
+        }
+      }
+   }
+  }
+  uscam = upcam[order(upcam\$PValue),]
+  unadjp = uscam\$PValue
+  uscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
+  nup = max(10,sum((uscam\$adjPValue < fdrthresh)))
+  dscam = downcam[order(downcam\$PValue),]
+  unadjp = dscam\$PValue
+  dscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
+  ndown = max(10,sum((dscam\$adjPValue < fdrthresh)))
+  write.table(uscam,file=paste('camera_up',outfname,sep='_'),quote=F,sep='\t',row.names=F)
+  write.table(dscam,file=paste('camera_down',outfname,sep='_'),quote=F,sep='\t',row.names=F)
+  print.noquote(paste('@@@@@ Camera up top',nup,'gene sets:'))
+  write.table(head(uscam,nup),file="",quote=F,sep='\t',row.names=F)
+  print.noquote(paste('@@@@@ Camera down top',ndown,'gene sets:'))
+  write.table(head(dscam,ndown),file="",quote=F,sep='\t',row.names=F)
+  sink()
+}
+
+
+
+
+doGSEA = function(y=NULL,design=NULL,histgmt="",
+                  bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
+                  ntest=0, myTitle="myTitle", outfname="GSEA.xls", minnin=5, maxnin=2000,fdrthresh=0.05,fdrtype="BH")
+{
+  sink('Camera.log')
+  genesets = c()
+  if (bigmt > "")
+  {
+    bigenesets = readLines(bigmt)
+    genesets = bigenesets
+  }
+  if (histgmt > "")
+  {
+    hgenesets = readLines(histgmt)
+    if (bigmt > "") {
+      genesets = rbind(genesets,hgenesets)
+    } else {
+      genesets = hgenesets
+    } # use only history if no bi
+  }
+  print.noquote(paste("@@@read",length(genesets), 'genesets from',histgmt,bigmt))
+  genesets = strsplit(genesets,'\t') # tabular. genesetid\tURLorwhatever\tgene_1\t..\tgene_n
+  outf = outfname
+  head=paste(myTitle,'edgeR GSEA')
+  write(head,file=outfname,append=F)
+  ntest=length(genesets)
+  urownames = toupper(rownames(y))
+  upcam = c()
+  downcam = c()
+  incam = c()
+  urls = c()
+  gsids = c()
+  for (i in 1:ntest) {
+    gs = unlist(genesets[i])
+    gsid = gs[1] # geneset_id
+    url = gs[2]
+    if (url > "") { url = paste("<a href=\'",url,"\'>",url,"</a>",sep="") }
+    glist = gs[3:length(gs)] # member gene symbols
+    glist = toupper(glist)
+    inglist = urownames %in% glist
+    nin = sum(inglist)
+    if ((nin > minnin) && (nin < maxnin)) {
+      incam = c(incam,inglist)
+      gsids = c(gsids,gsid)
+      urls = c(urls,url)
+    }
+  }
+  incam = as.list(incam)
+  names(incam) = gsids
+  allcam = camera(y=y,index=incam,design=design)
+  allcamres = cbind(geneset=gsids,allcam,URL=urls)
+  for (i in 1:ntest) {
+    camres = allcamres[i]
+    res = try(test = (camres\$Direction == "Up"))
+    if ("try-error" %in% class(res)) {
+      cat("test failed, camres = :")
+      print.noquote(camres)
+    } else  { if (camres\$Direction == "Up")
+    {  upcam = rbind(upcam,camres)
+    } else { downcam = rbind(downcam,camres)
+    }
+              
+    }
+  }
+  uscam = upcam[order(upcam\$PValue),]
+  unadjp = uscam\$PValue
+  uscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
+  nup = max(10,sum((uscam\$adjPValue < fdrthresh)))
+  dscam = downcam[order(downcam\$PValue),]
+  unadjp = dscam\$PValue
+  dscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
+  ndown = max(10,sum((dscam\$adjPValue < fdrthresh)))
+  write.table(uscam,file=paste('camera_up',outfname,sep='_'),quote=F,sep='\t',row.names=F)
+  write.table(dscam,file=paste('camera_down',outfname,sep='_'),quote=F,sep='\t',row.names=F)
+  print.noquote(paste('@@@@@ Camera up top',nup,'gene sets:'))
+  write.table(head(uscam,nup),file="",quote=F,sep='\t',row.names=F)
+  print.noquote(paste('@@@@@ Camera down top',ndown,'gene sets:'))
+  write.table(head(dscam,ndown),file="",quote=F,sep='\t',row.names=F)
+  sink()
+  }
+ 
+
+edgeIt = function (Count_Matrix=c(),group=c(),out_edgeR=F,out_VOOM=F,out_DESeq2=F,fdrtype='fdr',priordf=5, 
+        fdrthresh=0.05,outputdir='.', myTitle='Differential Counts',libSize=c(),useNDF=F,
+        filterquantile=0.2, subjects=c(),mydesign=NULL,
+        doDESeq2=T,doVoom=T,doCamera=T,doedgeR=T,org='hg19',
+        histgmt="", bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
+        doCook=F,DESeq_fitType="parameteric") 
+{
+  # Error handling
+  if (length(unique(group))!=2){
+    print("Number of conditions identified in experiment does not equal 2")
+    q()
+    }
+  require(edgeR)
+  options(width = 512) 
+  mt = paste(unlist(strsplit(myTitle,'_')),collapse=" ")
+  allN = nrow(Count_Matrix)
+  nscut = round(ncol(Count_Matrix)/2)
+  colTotmillionreads = colSums(Count_Matrix)/1e6
+  counts.dataframe = as.data.frame(c()) 
+  rawrs = rowSums(Count_Matrix)
+  nonzerod = Count_Matrix[(rawrs > 0),] # remove all zero count genes
+  nzN = nrow(nonzerod)
+  nzrs = rowSums(nonzerod)
+  zN = allN - nzN
+  print('# Quantiles for non-zero row counts:',quote=F)
+  print(quantile(nzrs,probs=seq(0,1,0.1)),quote=F)
+  if (useNDF == T)
+  {
+    gt1rpin3 = rowSums(Count_Matrix/expandAsMatrix(colTotmillionreads,dim(Count_Matrix)) >= 1) >= nscut
+    lo = colSums(Count_Matrix[!gt1rpin3,])
+    workCM = Count_Matrix[gt1rpin3,]
+    cleanrs = rowSums(workCM)
+    cleanN = length(cleanrs)
+    meth = paste( "After removing",length(lo),"contigs with fewer than ",nscut," sample read counts >= 1 per million, there are",sep="")
+    print(paste("Read",allN,"contigs. Removed",zN,"contigs with no reads.",meth,cleanN,"contigs"),quote=F)
+    maint = paste('Filter >=1/million reads in >=',nscut,'samples')
+  }   else {        
+    useme = (nzrs > quantile(nzrs,filterquantile))
+    workCM = nonzerod[useme,]
+    lo = colSums(nonzerod[!useme,])
+    cleanrs = rowSums(workCM)
+    cleanN = length(cleanrs)
+    meth = paste("After filtering at count quantile =",filterquantile,", there are",sep="")
+    print(paste('Read',allN,"contigs. Removed",zN,"with no reads.",meth,cleanN,"contigs"),quote=F)
+    maint = paste('Filter below',filterquantile,'quantile')
+  }
+  cumPlot(rawrs=rawrs,cleanrs=cleanrs,maint=maint,myTitle=myTitle)
+  allgenes = rownames(workCM)
+  reg = "^chr([0-9]+):([0-9]+)-([0-9]+)"
+  genecards="<a href=\'http://www.genecards.org/index.php?path=/Search/keyword/"
+  ucsc = paste("<a href=\'http://genome.ucsc.edu/cgi-bin/hgTracks?db=",org,sep='')
+  testreg = str_match(allgenes,reg)
+  if (sum(!is.na(testreg[,1]))/length(testreg[,1]) > 0.8) # is ucsc style string
+  {
+    print("@@ using ucsc substitution for urls")
+    contigurls = paste0(ucsc,"&amp;position=chr",testreg[,2],":",testreg[,3],"-",testreg[,4],"\'>",allgenes,"</a>")
+  } else {
+    print("@@ using genecards substitution for urls")
+    contigurls = paste0(genecards,allgenes,"\'>",allgenes,"</a>")
+  }
+  print.noquote("# urls")
+  print.noquote(head(contigurls))
+  print(paste("# Total low count contigs per sample = ",paste(lo,collapse=',')),quote=F) 
+  cmrowsums = rowSums(workCM)
+  TName=unique(group)[1]
+  CName=unique(group)[2]
+  if (is.null(mydesign)) {
+    if (length(subjects) == 0) 
+    {
+      mydesign = model.matrix(~group)
+    } 
+    else { 
+      subjf = factor(subjects)
+      mydesign = model.matrix(~subjf+group) # we block on subject so make group last to simplify finding it
+    }
+  } 
+  print.noquote(paste('Using samples:',paste(colnames(workCM),collapse=',')))
+  print.noquote('Using design matrix:')
+  print.noquote(mydesign)
+  if (doedgeR) {
+  sink('edgeR.log')
+  #### Setup DGEList object
+  DGEList = DGEList(counts=workCM, group = group)
+  DGEList = calcNormFactors(DGEList)
+
+  DGEList = estimateGLMCommonDisp(DGEList,mydesign)
+  comdisp = DGEList\$common.dispersion
+  DGEList = estimateGLMTrendedDisp(DGEList,mydesign)
+  if (edgeR_priordf > 0) {
+    print.noquote(paste("prior.df =",edgeR_priordf))
+    DGEList = estimateGLMTagwiseDisp(DGEList,mydesign,prior.df = edgeR_priordf)
+  } else {
+    DGEList = estimateGLMTagwiseDisp(DGEList,mydesign)
+  }
+  DGLM = glmFit(DGEList,design=mydesign)
+  DE = glmLRT(DGLM,coef=ncol(DGLM\$design)) # always last one - subject is first if needed
+  efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors
+  normData = (1e+06*DGEList\$counts/efflib)
+  uoutput = cbind( 
+    Name=as.character(rownames(DGEList\$counts)),
+    DE\$table,
+    adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
+    Dispersion=DGEList\$tagwise.dispersion,totreads=cmrowsums,normData,
+    DGEList\$counts
+  )
+  soutput = uoutput[order(DE\$table\$PValue),] # sorted into p value order - for quick toptable
+  goodness = gof(DGLM, pcutoff=fdrthresh)
+  if (sum(goodness\$outlier) > 0) {
+    print.noquote('GLM outliers:')
+    print(paste(rownames(DGLM)[(goodness\$outlier)],collapse=','),quote=F)
+    } else { 
+      print('No GLM fit outlier genes found\n')
+    }
+  z = limma::zscoreGamma(goodness\$gof.statistic, shape=goodness\$df/2, scale=2)
+  pdf("edgeR_GoodnessofFit.pdf")
+  qq = qqnorm(z, panel.first=grid(), main="tagwise dispersion")
+  abline(0,1,lwd=3)
+  points(qq\$x[goodness\$outlier],qq\$y[goodness\$outlier], pch=16, col="maroon")
+  dev.off()
+  estpriorn = getPriorN(DGEList)
+  print(paste("Common Dispersion =",comdisp,"CV = ",sqrt(comdisp),"getPriorN = ",estpriorn),quote=F)
+  efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors
+  normData = (1e+06*DGEList\$counts/efflib)
+  uniqueg = unique(group)
+  #### Plot MDS
+  sample_colors =  match(group,levels(group))
+  sampleTypes = levels(factor(group))
+  print.noquote(sampleTypes)
+  pdf("edgeR_MDSplot.pdf")
+  plotMDS.DGEList(DGEList,main=paste("edgeR MDS for",myTitle),cex=0.5,col=sample_colors,pch=sample_colors)
+  legend(x="topleft", legend = sampleTypes,col=c(1:length(sampleTypes)), pch=19)
+  grid(col="blue")
+  dev.off()
+  colnames(normData) = paste( colnames(normData),'N',sep="_")
+  print(paste('Raw sample read totals',paste(colSums(nonzerod,na.rm=T),collapse=',')))
+  nzd = data.frame(log(nonzerod + 1e-2,10))
+  try( boxPlot(rawrs=nzd,cleanrs=log(normData,10),maint='TMM Normalisation',myTitle=myTitle,pdfname="edgeR_raw_norm_counts_box.pdf") )
+  write.table(soutput,file=out_edgeR, quote=FALSE, sep="\t",row.names=F)
+  tt = cbind( 
+    Name=as.character(rownames(DGEList\$counts)),
+    DE\$table,
+    adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
+    Dispersion=DGEList\$tagwise.dispersion,totreads=cmrowsums
+  )
+  print.noquote("# edgeR Top tags\n")
+  tt = cbind(tt,URL=contigurls) # add to end so table isn't laid out strangely
+  tt = tt[order(DE\$table\$PValue),] 
+  print.noquote(tt[1:50,])
+  deTags = rownames(uoutput[uoutput\$adj.p.value < fdrthresh,])
+  nsig = length(deTags)
+  print(paste('#',nsig,'tags significant at adj p=',fdrthresh),quote=F)
+  deColours = ifelse(deTags,'red','black')
+  pdf("edgeR_BCV_vs_abundance.pdf")
+  plotBCV(DGEList, cex=0.3, main="Biological CV vs abundance")
+  dev.off()
+  dg = DGEList[order(DE\$table\$PValue),]
+  #normData = (1e+06 * dg\$counts/expandAsMatrix(dg\$samples\$lib.size, dim(dg)))
+  efflib = dg\$samples\$lib.size*dg\$samples\$norm.factors
+  normData = (1e+06*dg\$counts/efflib)
+  outpdfname="edgeR_top_100_heatmap.pdf"
+  hmap2(normData,nsamp=100,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste('edgeR Heatmap',myTitle))
+  outSmear = "edgeR_smearplot.pdf"
+  outMain = paste("Smear Plot for ",TName,' Vs ',CName,' (FDR@',fdrthresh,' N = ',nsig,')',sep='')
+  smearPlot(DGEList=DGEList,deTags=deTags, outSmear=outSmear, outMain = outMain)
+  qqPlot(descr=paste(myTitle,'edgeR adj p QQ plot'),pvector=tt\$adj.p.value,outpdf='edgeR_qqplot.pdf')
+  norm.factor = DGEList\$samples\$norm.factors
+  topresults.edgeR = soutput[which(soutput\$adj.p.value < fdrthresh), ]
+  edgeRcountsindex = which(allgenes %in% rownames(topresults.edgeR))
+  edgeRcounts = rep(0, length(allgenes))
+  edgeRcounts[edgeRcountsindex] = 1  # Create venn diagram of hits
+  sink()
+  } ### doedgeR
+  if (doDESeq2 == T)
+  {
+    sink("DESeq2.log")
+    # DESeq2
+    require('DESeq2')
+    library('RColorBrewer')
+    if (length(subjects) == 0)
+        {
+        pdata = data.frame(Name=colnames(workCM),Rx=group,row.names=colnames(workCM))
+        deSEQds = DESeqDataSetFromMatrix(countData = workCM,  colData = pdata, design = formula(~ Rx))
+        } else {
+        pdata = data.frame(Name=colnames(workCM),Rx=group,subjects=subjects,row.names=colnames(workCM))
+        deSEQds = DESeqDataSetFromMatrix(countData = workCM,  colData = pdata, design = formula(~ subjects + Rx))
+        }
+    #DESeq2 = DESeq(deSEQds,fitType='local',pAdjustMethod=fdrtype)
+    #rDESeq = results(DESeq2)
+    #newCountDataSet(workCM, group)
+    deSeqDatsizefac = estimateSizeFactors(deSEQds)
+    deSeqDatdisp = estimateDispersions(deSeqDatsizefac,fitType=DESeq_fitType)
+    resDESeq = nbinomWaldTest(deSeqDatdisp, pAdjustMethod=fdrtype)
+    rDESeq = as.data.frame(results(resDESeq))
+    rDESeq = cbind(Contig=rownames(workCM),rDESeq,NReads=cmrowsums,URL=contigurls)
+    srDESeq = rDESeq[order(rDESeq\$pvalue),]
+    qqPlot(descr=paste(myTitle,'DESeq2 adj p qq plot'),pvector=rDESeq\$padj,outpdf='DESeq2_qqplot.pdf')
+    cat("# DESeq top 50\n")
+    print.noquote(srDESeq[1:50,])
+    write.table(srDESeq,file=out_DESeq2, quote=FALSE, sep="\t",row.names=F)
+    topresults.DESeq = rDESeq[which(rDESeq\$padj < fdrthresh), ]
+    DESeqcountsindex = which(allgenes %in% rownames(topresults.DESeq))
+    DESeqcounts = rep(0, length(allgenes))
+    DESeqcounts[DESeqcountsindex] = 1
+    pdf("DESeq2_dispersion_estimates.pdf")
+    plotDispEsts(resDESeq)
+    dev.off()
+    ysmall = abs(min(rDESeq\$log2FoldChange))
+    ybig = abs(max(rDESeq\$log2FoldChange))
+    ylimit = min(4,ysmall,ybig)
+    pdf("DESeq2_MA_plot.pdf")
+    plotMA(resDESeq,main=paste(myTitle,"DESeq2 MA plot"),ylim=c(-ylimit,ylimit))
+    dev.off()
+    rlogres = rlogTransformation(resDESeq)
+    sampledists = dist( t( assay(rlogres) ) )
+    sdmat = as.matrix(sampledists)
+    pdf("DESeq2_sample_distance_plot.pdf")
+    heatmap.2(sdmat,trace="none",main=paste(myTitle,"DESeq2 sample distances"),
+         col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(255))
+    dev.off()
+    ###outpdfname="DESeq2_top50_heatmap.pdf"
+    ###hmap2(sresDESeq,nsamp=50,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste('DESeq2 vst rlog Heatmap',myTitle))
+    sink()
+    result = try( (ppca = plotPCA( varianceStabilizingTransformation(deSeqDatdisp,blind=T), intgroup=c("Rx","Name")) ) )
+    if ("try-error" %in% class(result)) {
+         print.noquote('DESeq2 plotPCA failed.')
+         } else {
+         pdf("DESeq2_PCA_plot.pdf")
+         #### wtf - print? Seems needed to get this to work
+         print(ppca)
+         dev.off()
+        }
+  }
+
+  if (doVoom == T) {
+      sink('VOOM.log')
+      if (doedgeR == F) {
+         #### Setup DGEList object
+         DGEList = DGEList(counts=workCM, group = group)
+         DGEList = calcNormFactors(DGEList)
+         DGEList = estimateGLMCommonDisp(DGEList,mydesign)
+         DGEList = estimateGLMTrendedDisp(DGEList,mydesign)
+         DGEList = estimateGLMTagwiseDisp(DGEList,mydesign)
+         DGEList = estimateGLMTagwiseDisp(DGEList,mydesign)
+         norm.factor = DGEList\$samples\$norm.factors
+         }
+      pdf("VOOM_mean_variance_plot.pdf")
+      dat.voomed = voom(DGEList, mydesign, plot = TRUE, lib.size = colSums(workCM) * norm.factor)
+      dev.off()
+      # Use limma to fit data
+      fit = lmFit(dat.voomed, mydesign)
+      fit = eBayes(fit)
+      rvoom = topTable(fit, coef = length(colnames(mydesign)), adj = fdrtype, n = Inf, sort="none")
+      qqPlot(descr=paste(myTitle,'VOOM-limma adj p QQ plot'),pvector=rvoom\$adj.P.Val,outpdf='VOOM_qqplot.pdf')
+      rownames(rvoom) = rownames(workCM)
+      rvoom = cbind(rvoom,NReads=cmrowsums,URL=contigurls)
+      srvoom = rvoom[order(rvoom\$P.Value),]
+      cat("# VOOM top 50\n")
+      print(srvoom[1:50,])
+      write.table(srvoom,file=out_VOOM, quote=FALSE, sep="\t",row.names=F)
+      # Use an FDR cutoff to find interesting samples for edgeR, DESeq and voom/limma
+      topresults.voom = rvoom[which(rvoom\$adj.P.Val < fdrthresh), ]
+      voomcountsindex = which(allgenes %in% topresults.voom\$ID)
+      voomcounts = rep(0, length(allgenes))
+      voomcounts[voomcountsindex] = 1
+      sink()
+  }
+
+  if (doCamera) {
+  doGSEA(y=DGEList,design=mydesign,histgmt=histgmt,bigmt=bigmt,ntest=20,myTitle=myTitle,
+    outfname=paste(mt,"GSEA.xls",sep="_"),fdrthresh=fdrthresh,fdrtype=fdrtype)
+  }
+
+  if ((doDESeq2==T) || (doVoom==T) || (doedgeR==T)) {
+    if ((doVoom==T) && (doDESeq2==T) && (doedgeR==T)) {
+        vennmain = paste(mt,'Voom,edgeR and DESeq2 overlap at FDR=',fdrthresh)
+        counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts, 
+                                       VOOM_limma = voomcounts, row.names = allgenes)
+       } else if ((doDESeq2==T) && (doedgeR==T))  {
+         vennmain = paste(mt,'DESeq2 and edgeR overlap at FDR=',fdrthresh)
+         counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts, row.names = allgenes)
+       } else if ((doVoom==T) && (doedgeR==T)) {
+        vennmain = paste(mt,'Voom and edgeR overlap at FDR=',fdrthresh)
+        counts.dataframe = data.frame(edgeR = edgeRcounts, VOOM_limma = voomcounts, row.names = allgenes)
+       }
+    
+    if (nrow(counts.dataframe > 1)) {
+      counts.venn = vennCounts(counts.dataframe)
+      vennf = "Venn_significant_genes_overlap.pdf" 
+      pdf(vennf)
+      vennDiagram(counts.venn,main=vennmain,col="maroon")
+      dev.off()
+    }
+  } #### doDESeq2 or doVoom
+  
+}
+#### Done
+
+###sink(stdout(),append=T,type="message")
+builtin_gmt = ""
+history_gmt = ""
+history_gmt_name = ""
+out_edgeR = F
+out_DESeq2 = F
+out_VOOM = "$out_VOOM"
+doDESeq2 = $DESeq2.doDESeq2 # make these T or F
+doVoom = $doVoom
+doCamera = F
+doedgeR = $edgeR.doedgeR
+edgeR_priordf = 0
+
+
+#if $doVoom == "T":
+  out_VOOM = "$out_VOOM"
+#end if
+
+#if $DESeq2.doDESeq2 == "T":
+  out_DESeq2 = "$out_DESeq2"
+  DESeq_fitType = "$DESeq2.DESeq_fitType"
+#end if
+
+#if $edgeR.doedgeR == "T":
+  out_edgeR = "$out_edgeR"
+  edgeR_priordf = $edgeR.edgeR_priordf  
+#end if
+
+<!--
+#if $camera.doCamera == 'T'
+ doCamera = $camera.doCamera
+ #if $camera.gmtSource.refgmtSource == "indexed" or $camera.gmtSource.refgmtSource == "both":
+  builtin_gmt = "${camera.gmtSource.builtinGMT.fields.path}"
+ #end if
+ #if $camera.gmtSource.refgmtSource == "history" or $camera.gmtSource.refgmtSource == "both":
+    history_gmt = "${camera.gmtSource.ownGMT}"
+    history_gmt_name = "${camera.gmtSource.ownGMT.name}"
+  #end if
+#end if
+-->
+
+if (sum(c(doedgeR,doVoom,doDESeq2)) == 0)
+{
+write("No methods chosen - nothing to do! Please try again after choosing one or more methods", stderr())
+quit(save="no",status=2)
+}
+
+Out_Dir = "$html_file.files_path"
+Input =  "$input1"
+TreatmentName = "$treatment_name"
+TreatmentCols = "$Treat_cols"
+ControlName = "$control_name"
+ControlCols= "$Control_cols"
+org = "$input1.dbkey"
+if (org == "") { org = "hg19"}
+fdrtype = "$fdrtype"
+fdrthresh = $fdrthresh
+useNDF = $useNDF
+fQ = $fQ # non-differential centile cutoff
+myTitle = "$title"
+sids = strsplit("$subjectids",',')
+subjects = unlist(sids)
+nsubj = length(subjects)
+TCols = as.numeric(strsplit(TreatmentCols,",")[[1]])-1
+CCols = as.numeric(strsplit(ControlCols,",")[[1]])-1 
+cat('Got TCols=')
+cat(TCols)
+cat('; CCols=')
+cat(CCols)
+cat('\n')
+useCols = c(TCols,CCols)
+if (file.exists(Out_Dir) == F) dir.create(Out_Dir)
+Count_Matrix = read.table(Input,header=T,row.names=1,sep='\t') #Load tab file assume header
+snames = colnames(Count_Matrix)
+nsamples = length(snames)
+if (nsubj >  0 & nsubj != nsamples) {
+options("show.error.messages"=T)
+mess = paste('Fatal error: Supplied subject id list',paste(subjects,collapse=','),
+   'has length',nsubj,'but there are',nsamples,'samples',paste(snames,collapse=','))
+write(mess, stderr())
+quit(save="no",status=4)
+}
+if (length(subjects) != 0) {subjects = subjects[useCols]}
+Count_Matrix = Count_Matrix[,useCols] ### reorder columns
+rn = rownames(Count_Matrix)
+islib = rn %in% c('librarySize','NotInBedRegions')
+LibSizes = Count_Matrix[subset(rn,islib),][1] # take first
+Count_Matrix = Count_Matrix[subset(rn,! islib),]
+group = c(rep(TreatmentName,length(TCols)), rep(ControlName,length(CCols)) )             #Build a group descriptor
+group = factor(group, levels=c(ControlName,TreatmentName))
+colnames(Count_Matrix) = paste(group,colnames(Count_Matrix),sep="_")                   #Relable columns
+results = edgeIt(Count_Matrix=Count_Matrix,group=group, out_edgeR=out_edgeR, out_VOOM=out_VOOM, out_DESeq2=out_DESeq2,
+                 fdrtype='BH',mydesign=NULL,priordf=edgeR_priordf,fdrthresh=fdrthresh,outputdir='.',
+                 myTitle=myTitle,useNDF=F,libSize=c(),filterquantile=fQ,subjects=subjects,
+                 doDESeq2=doDESeq2,doVoom=doVoom,doCamera=doCamera,doedgeR=doedgeR,org=org,
+                 histgmt=history_gmt,bigmt=builtin_gmt,DESeq_fitType=DESeq_fitType)
+sessionInfo()
+]]>
+</configfile>
+</configfiles>
+<help>
+
+**What it does**
+
+Allows short read sequence counts from controlled experiments to be analysed for differentially expressed genes.
+Optionally adds a term for subject if not all samples are independent or if some other factor needs to be blocked in the design.
+
+**Input**
+
+Requires a count matrix as a tabular file. These are best made using the companion HTSeq_ based counter Galaxy wrapper
+and your fave gene model to generate inputs. Each row is a genomic feature (gene or exon eg) and each column the 
+non-negative integer count of reads from one sample overlapping the feature. 
+The matrix must have a header row uniquely identifying the source samples, and unique row names in 
+the first column. Typically the row names are gene symbols or probe ids for downstream use in GSEA and other methods.
+
+**Specifying comparisons**
+
+This is basically dumbed down for two factors - case vs control.
+
+More complex interfaces are possible but painful at present. 
+Probably need to specify a phenotype file to do this better.
+Work in progress. Send code.
+
+If you have (eg) paired samples and wish to include a term in the GLM to account for some other factor (subject in the case of paired samples),
+put a comma separated list of indicators for every sample (whether modelled or not!) indicating (eg) the subject number or 
+A list of integers, one for each subject or an empty string if samples are all independent.
+If not empty, there must be exactly as many integers in the supplied integer list as there are columns (samples) in the count matrix.
+Integers for samples that are not in the analysis *must* be present in the string as filler even if not used.
+
+So if you have 2 pairs out of 6 samples, you need to put in unique integers for the unpaired ones
+eg if you had 6 samples with the first two independent but the second and third pairs each being from independent subjects. you might use
+8,9,1,1,2,2 
+as subject IDs to indicate two paired samples from the same subject in columns 3/4 and 5/6
+
+**Methods available**
+
+You can run 3 popular Bioconductor packages available for count data.
+
+edgeR - see edgeR_ for details
+
+VOOM/limma - see limma_VOOM_ for details
+
+DESeq2 - see DESeq2_ for details
+
+and optionally camera in edgeR which works better if MSigDB is installed.
+
+**Outputs**
+
+Some helpful plots and analysis results. Note that most of these are produced using R code 
+suggested by the excellent documentation and vignettes for the Bioconductor
+packages invoked. The Tool Factory is used to automatically lay these out for you to enjoy.
+
+**Note on Voom**
+
+The voom from limma version 3.16.6 help in R includes this from the authors - but you should read the paper to interpret this method.
+
+This function is intended to process RNA-Seq or ChIP-Seq data prior to linear modelling in limma.
+
+voom is an acronym for mean-variance modelling at the observational level.
+The key concern is to estimate the mean-variance relationship in the data, then use this to compute appropriate weights for each observation.
+Count data almost show non-trivial mean-variance relationships. Raw counts show increasing variance with increasing count size, while log-counts typically show a decreasing mean-variance trend.
+This function estimates the mean-variance trend for log-counts, then assigns a weight to each observation based on its predicted variance.
+The weights are then used in the linear modelling process to adjust for heteroscedasticity.
+
+In an experiment, a count value is observed for each tag in each sample. A tag-wise mean-variance trend is computed using lowess.
+The tag-wise mean is the mean log2 count with an offset of 0.5, across samples for a given tag.
+The tag-wise variance is the quarter-root-variance of normalized log2 counts per million values with an offset of 0.5, across samples for a given tag.
+Tags with zero counts across all samples are not included in the lowess fit. Optional normalization is performed using normalizeBetweenArrays.
+Using fitted values of log2 counts from a linear model fit by lmFit, variances from the mean-variance trend were interpolated for each observation.
+This was carried out by approxfun. Inverse variance weights can be used to correct for mean-variance trend in the count data.
+
+
+Author(s)
+
+Charity Law and Gordon Smyth
+
+References
+
+Law, CW (2013). Precision weights for gene expression analysis. PhD Thesis. University of Melbourne, Australia.
+
+Law, CW, Chen, Y, Shi, W, Smyth, GK (2013). Voom! Precision weights unlock linear model analysis tools for RNA-seq read counts.
+Technical Report 1 May 2013, Bioinformatics Division, Walter and Eliza Hall Institute of Medical Reseach, Melbourne, Australia.
+http://www.statsci.org/smyth/pubs/VoomPreprint.pdf
+
+See Also
+
+A voom case study is given in the edgeR User's Guide.
+
+vooma is a similar function but for microarrays instead of RNA-seq.
+
+
+***old rant on changes to Bioconductor package variable names between versions***
+
+The edgeR authors made a small cosmetic change in the name of one important variable (from p.value to PValue) 
+breaking this and all other code that assumed the old name for this variable, 
+between edgeR2.4.4 and 2.4.6 (the version for R 2.14 as at the time of writing). 
+This means that all code using edgeR is sensitive to the version. I think this was a very unwise thing 
+to do because it wasted hours of my time to track down and will similarly cost other edgeR users dearly
+when their old scripts break. This tool currently now works with 2.4.6.
+
+**Note on prior.N**
+
+http://seqanswers.com/forums/showthread.php?t=5591 says:
+
+*prior.n*
+
+The value for prior.n determines the amount of smoothing of tagwise dispersions towards the common dispersion. 
+You can think of it as like a "weight" for the common value. (It is actually the weight for the common likelihood 
+in the weighted likelihood equation). The larger the value for prior.n, the more smoothing, i.e. the closer your 
+tagwise dispersion estimates will be to the common dispersion. If you use a prior.n of 1, then that gives the 
+common likelihood the weight of one observation.
+
+In answer to your question, it is a good thing to squeeze the tagwise dispersions towards a common value, 
+or else you will be using very unreliable estimates of the dispersion. I would not recommend using the value that 
+you obtained from estimateSmoothing()---this is far too small and would result in virtually no moderation 
+(squeezing) of the tagwise dispersions. How many samples do you have in your experiment? 
+What is the experimental design? If you have few samples (less than 6) then I would suggest a prior.n of at least 10. 
+If you have more samples, then the tagwise dispersion estimates will be more reliable, 
+so you could consider using a smaller prior.n, although I would hesitate to use a prior.n less than 5. 
+
+
+From Bioconductor Digest, Vol 118, Issue 5, Gordon writes:
+
+Dear Dorota,
+
+The important settings are prior.df and trend.
+
+prior.n and prior.df are related through prior.df = prior.n * residual.df,
+and your experiment has residual.df = 36 - 12 = 24.  So the old setting of
+prior.n=10 is equivalent for your data to prior.df = 240, a very large
+value.  Going the other way, the new setting of prior.df=10 is equivalent
+to prior.n=10/24.
+
+To recover old results with the current software you would use
+
+  estimateTagwiseDisp(object, prior.df=240, trend="none")
+
+To get the new default from old software you would use
+
+  estimateTagwiseDisp(object, prior.n=10/24, trend=TRUE)
+
+Actually the old trend method is equivalent to trend="loess" in the new
+software. You should use plotBCV(object) to see whether a trend is
+required.
+
+Note you could also use
+
+  prior.n = getPriorN(object, prior.df=10)
+
+to map between prior.df and prior.n.
+
+----
+
+**Attributions**
+
+edgeR - edgeR_ 
+
+VOOM/limma - limma_VOOM_ 
+
+DESeq2 - DESeq2_ for details
+
+See above for Bioconductor package documentation for packages exposed in Galaxy by this tool and app store package.
+
+Galaxy_ (that's what you are using right now!) for gluing everything together 
+
+Otherwise, all code and documentation comprising this tool was written by Ross Lazarus and is 
+licensed to you under the LGPL_ like other rgenetics artefacts
+
+.. _LGPL: http://www.gnu.org/copyleft/lesser.html
+.. _HTSeq: http://www-huber.embl.de/users/anders/HTSeq/doc/index.html
+.. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html
+.. _DESeq2: http://www.bioconductor.org/packages/release/bioc/html/DESeq2.html
+.. _limma_VOOM: http://www.bioconductor.org/packages/release/bioc/html/limma.html
+.. _Galaxy: http://getgalaxy.org
+</help>
+
+</tool>
+
+