Mercurial > repos > fubar > edger_test

--- a/rgedgeR/rgedgeRpaired.xml	Wed Jun 12 05:21:25 2013 -0400
+++ b/rgedgeR/rgedgeRpaired.xml	Wed Jun 12 06:13:41 2013 -0400
@@ -1,7 +1,6 @@
 <tool id="rgedgeRpaired" name="edgeR" version="0.18">
   <description>1 or 2 level models for count data</description>
   <requirements>
-      <requirement type="package" version="6.2">readline</requirement>
       <requirement type="package" version="3.0.1">package_R</requirement>
       <requirement type="package" version="2.12">package_BioCBasics</requirement>
   </requirements>
--- a/rgedgeR/rgedgeRpaired.xml~	Wed Jun 12 05:21:25 2013 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,633 +0,0 @@
-<tool id="rgedgeRpaired" name="edgeR" version="0.18">
-  <description>1 or 2 level models for count data</description>
-  <requirements>
-      <requirement type="package" version="6.2">name=readline</requirement>
-      <requirement type="package" version="3.0.1">name=package_R</requirement>
-      <requirement type="package" version="2.12">name=package_BioCBasics</requirement>
-  </requirements>
-
-  <command interpreter="python">
-     rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "edgeR"
-    --output_dir "$html_file.files_path" --output_html "$html_file" --output_tab "$outtab" --make_HTML "yes"
-  </command>
-  <inputs>
-    <param name="input1"  type="data" format="tabular" label="Select an input matrix - rows are contigs, columns are counts for each sample"
-       help="Use the HTSeq based count matrix preparation tool to create these matrices from BAM/SAM files and a GTF file of genomic features"/>
-    <param name="title" type="text" value="edgeR" size="80" label="Title for job outputs" help="Supply a meaningful name here to remind you what the outputs contain">
-      <sanitizer invalid_char="">
-        <valid initial="string.letters,string.digits"><add value="_" /> </valid>
-      </sanitizer>
-    </param>
-    <param name="treatment_name" type="text" value="Treatment" size="50" label="Treatment Name"/>
-    <param name="Treat_cols" label="Select columns containing treatment." type="data_column" data_ref="input1" numerical="True"
-         multiple="true" use_header_names="true" size="120" display="checkboxes">
-        <validator type="no_options" message="Please select at least one column."/>
-    </param>
-    <param name="control_name" type="text" value="Control" size="50" label="Control Name"/>
-    <param name="Control_cols" label="Select columns containing control." type="data_column" data_ref="input1" numerical="True"
-         multiple="true" use_header_names="true" size="120" display="checkboxes" optional="true">
-    </param>
-    <param name="subjectids" type="text" optional="true" size="120"
-       label="IF SUBJECTS NOT ALL INDEPENDENT! Enter integers to indicate sample pairing for every column in input"
-       help="Leave blank if no pairing, but eg if data from sample id A99 is in columns 2,4 and id C21 is in 3,5 then enter '1,2,1,2'">
-      <sanitizer>
-        <valid initial="string.digits"><add value="," /> </valid>
-      </sanitizer>
-    </param>
-    <param name="fQ" type="float" value="0.3" size="5" label="Non-differential contig count quantile threshold - zero to analyze all non-zero read count contigs"
-     help="May be a good or a bad idea depending on the biology and the question. EG 0.3 = sparsest 30% of contigs with at least one read are removed before analysis"/>
-    <param name="useNDF" type="boolean" truevalue="T" checked='false' falsevalue="" size="1" label="Non differential filter - remove contigs below a threshold (1 per million) for half or more samples"
-     help="May be a good or a bad idea depending on the biology and the question. This was the old default. Quantile based is available as an alternative"/>
-    <param name="priordf" type="integer" value="20" size="3" label="prior.df for tagwise dispersion - lower value = more emphasis on each tag's variance. Replaces prior.n  and prior.df = prior.n * residual.df"
-     help="Zero = Use edgeR default. Use a small value to 'smooth' small samples. See edgeR docs and note below"/>
-    <param name="fdrthresh" type="float" value="0.05" size="5" label="P value threshold for FDR filtering for amily wise error rate control"
-     help="Conventional default value of 0.05 recommended"/>
-    <param name="fdrtype" type="select" label="FDR (Type II error) control method"
-         help="Use fdr or bh typically to control for the number of tests in a reliable way">
-            <option value="fdr" selected="true">fdr</option>
-            <option value="BH">Benjamini Hochberg</option>
-            <option value="BY">Benjamini Yukateli</option>
-            <option value="bonferroni">Bonferroni</option>
-            <option value="hochberg">Hochberg</option>
-            <option value="holm">Holm</option>
-            <option value="hommel">Hommel</option>
-            <option value="none">no control for multiple tests</option>
-    </param>
-  </inputs>
-  <outputs>
-    <data format="tabular" name="outtab" label="${title}.xls"/>
-    <data format="html" name="html_file" label="${title}.html"/>
-  </outputs>
- <stdio>
-     <exit_code range="4"   level="fatal"   description="Number of subject ids must match total number of samples in the input matrix" />
- </stdio>
- <tests>
-<test>
-<param name='input1' value='test_bams2mx.xls' ftype='tabular' />
- <param name='treatment_name' value='case' />
- <param name='title' value='edgeRtest' />
- <param name='fdrtype' value='fdr' />
- <param name='priordf' value="0" />
- <param name='fdrthresh' value="0.05" />
- <param name='control_name' value='control' />
- <param name='Treat_cols' value='3,4,5,9' />
- <param name='Control_cols' value='2,6,7,8' />
- <output name='outtab' file='edgeRtest1out.xls' ftype='tabular' compare='diff' />
- <output name='html_file' file='edgeRtest1out.html' ftype='html' compare='diff' lines_diff='20' />
-</test>
-</tests>
-
-<configfiles>
-<configfile name="runme">
-<![CDATA[
-#
-# edgeR.Rscript
-# updated npv 2011 for R 2.14.0 and edgeR 2.4.0 by ross
-# Performs DGE on a count table containing n replicates of two conditions
-#
-# Parameters
-#
-# 1 - Output Dir
-
-# Original edgeR code by: S.Lunke and A.Kaspi
-reallybig = log10(.Machine\$double.xmax)
-reallysmall = log10(.Machine\$double.xmin)
-library('stringr')
-library('gplots')
-library('DESeq')
-library('edgeR')
-hmap2 = function(cmat,nsamp=100,outpdfname='heatmap2.pdf', TName='Treatment',group=NA,myTitle='title goes here')
-{
-# Perform clustering for significant pvalues after controlling FWER
-    samples = colnames(cmat)
-    gu = unique(group)
-    if (length(gu) == 2) {
-        col.map = function(g) {if (g==gu[1]) "#FF0000" else "#0000FF"}
-        pcols = unlist(lapply(group,col.map))
-        } else {
-        colours = rainbow(length(gu),start=0,end=4/6)
-        pcols = colours[match(group,gu)]        }
-    gn = rownames(cmat)
-    dm = cmat[(! is.na(gn)),]
-    # remove unlabelled hm rows
-    nprobes = nrow(dm)
-    # sub = paste('Showing',nprobes,'contigs ranked for evidence of differential abundance')
-    if (nprobes > nsamp) {
-      dm =dm[1:nsamp,]
-      #sub = paste('Showing',nsamp,'contigs ranked for evidence for differential abundance out of',nprobes,'total')
-    }
-    newcolnames = substr(colnames(dm),1,20)
-    colnames(dm) = newcolnames
-    pdf(outpdfname)
-    heatmap.2(dm,main=myTitle,ColSideColors=pcols,col=topo.colors(100),dendrogram="col",key=T,density.info='none',
-         Rowv=F,scale='row',trace='none',margins=c(8,8),cexRow=0.4,cexCol=0.5)
-    dev.off()
-}
-
-hmap = function(cmat,nmeans=4,outpdfname="heatMap.pdf",nsamp=250,TName='Treatment',group=NA,myTitle="Title goes here")
-{
-    # for 2 groups only was
-    #col.map = function(g) {if (g==TName) "#FF0000" else "#0000FF"}
-    #pcols = unlist(lapply(group,col.map))
-    gu = unique(group)
-    colours = rainbow(length(gu),start=0.3,end=0.6)
-    pcols = colours[match(group,gu)]
-    nrows = nrow(cmat)
-    mtitle = paste(myTitle,'Heatmap: n contigs =',nrows)
-    if (nrows > nsamp)  {
-               cmat = cmat[c(1:nsamp),]
-               mtitle = paste('Heatmap: Top ',nsamp,' DE contigs (of ',nrows,')',sep='')
-          }
-    newcolnames = substr(colnames(cmat),1,20)
-    colnames(cmat) = newcolnames
-    pdf(outpdfname)
-    heatmap(cmat,scale='row',main=mtitle,cexRow=0.3,cexCol=0.4,Rowv=NA,ColSideColors=pcols)
-    dev.off()
-}
-
-qqPlot = function(descr='Title',pvector, ...)
-# stolen from https://gist.github.com/703512
-{
-    o = -log10(sort(pvector,decreasing=F))
-    e = -log10( 1:length(o)/length(o) )
-    o[o==-Inf] = reallysmall
-    o[o==Inf] = reallybig
-    pdfname = paste(gsub(" ","", descr , fixed=TRUE),'pval_qq.pdf',sep='_')
-    maint = paste(descr,'QQ Plot')
-    pdf(pdfname)
-    plot(e,o,pch=19,cex=1, main=maint, ...,
-        xlab=expression(Expected~~-log[10](italic(p))),
-        ylab=expression(Observed~~-log[10](italic(p))),
-        xlim=c(0,max(e)), ylim=c(0,max(o)))
-    lines(e,e,col="red")
-    grid(col = "lightgray", lty = "dotted")
-    dev.off()
-}
-
-smearPlot = function(DGEList,deTags, outSmear, outMain)
-        {
-        pdf(outSmear)
-        plotSmear(DGEList,de.tags=deTags,main=outMain)
-        grid(col="blue")
-        dev.off()
-        }
-
-boxPlot = function(rawrs,cleanrs,maint,myTitle)
-{   #
-        nc = ncol(rawrs)
-        for (i in c(1:nc)) {rawrs[(rawrs[,i] < 0),i] = NA}
-        fullnames = colnames(rawrs)
-        newcolnames = substr(colnames(rawrs),1,20)
-        colnames(rawrs) = newcolnames
-        newcolnames = substr(colnames(cleanrs),1,20)
-        colnames(cleanrs) = newcolnames
-        pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"sampleBoxplot.pdf",sep='_')
-        defpar = par(no.readonly=T)
-        pdf(pdfname,height=6,width=8)
-        #par(mfrow=c(1,2)) # 1 rows 2 col
-        l = layout(matrix(c(1,2),1,2,byrow=T))
-        print.noquote('raw contig counts by sample:')
-        print.noquote(summary(rawrs))
-        print.noquote('normalised contig counts by sample:')
-        print.noquote(summary(cleanrs))
-        boxplot(rawrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('Raw:',maint))
-        grid(col="blue")
-        boxplot(cleanrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('After ',maint))
-        grid(col="blue")
-        dev.off()
-        pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"samplehistplot.pdf",sep='_')
-        nc = ncol(rawrs)
-        print.noquote(paste('Using ncol rawrs=',nc))
-        ncroot = round(sqrt(nc))
-        if (ncroot*ncroot < nc) { ncroot = ncroot + 1 }
-        m = c()
-        for (i in c(1:nc)) {
-              rhist = hist(rawrs[,i],breaks=100,plot=F)
-              m = append(m,max(rhist\$counts))
-             }
-        ymax = max(m)
-        pdf(pdfname)
-        par(mfrow=c(ncroot,ncroot))
-        for (i in c(1:nc)) {
-                 hist(rawrs[,i], main=paste("Contig logcount",i), xlab='log raw count', col="maroon",
-                 breaks=100,sub=fullnames[i],cex=0.8,ylim=c(0,ymax))
-             }
-        dev.off()
-        par(defpar)
-
-}
-
-cumPlot = function(rawrs,cleanrs,maint,myTitle)
-{   # updated to use ecdf
-        pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"RowsumCum.pdf",sep='_')
-        defpar = par(no.readonly=T)
-        pdf(pdfname)
-        par(mfrow=c(2,1))
-        lrs = log(rawrs,10)
-        lim = max(lrs)
-        hist(lrs,breaks=100,main=paste('Before:',maint),xlab="# Reads (log)",
-             ylab="Count",col="maroon",sub=myTitle, xlim=c(0,lim),las=1)
-        grid(col="blue")
-        lrs = log(cleanrs,10)
-        hist(lrs,breaks=100,main=paste('After:',maint),xlab="# Reads (log)",
-             ylab="Count",col="maroon",sub=myTitle,xlim=c(0,lim),las=1)
-        grid(col="blue")
-        dev.off()
-        par(defpar)
-}
-
-cumPlot1 = function(rawrs,cleanrs,maint,myTitle)
-{   # updated to use ecdf
-        pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"RowsumCum.pdf",sep='_')
-        pdf(pdfname)
-        par(mfrow=c(2,1))
-        lastx = max(rawrs)
-        rawe = knots(ecdf(rawrs))
-        cleane = knots(ecdf(cleanrs))
-        cy = 1:length(cleane)/length(cleane)
-        ry = 1:length(rawe)/length(rawe)
-        plot(rawe,ry,type='l',main=paste('Before',maint),xlab="Log Contig Total Reads",
-             ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle)
-        grid(col="blue")
-        plot(cleane,cy,type='l',main=paste('After',maint),xlab="Log Contig Total Reads",
-             ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle)
-        grid(col="blue")
-        dev.off()
-}
-
-
-
-edgeIt = function (Count_Matrix,group,outputfilename,fdrtype='fdr',priordf=5,fdrthresh=0.05,outputdir='.',
-    myTitle='edgeR',libSize=c(),useNDF="T",filterquantile=0.2,subjects=c()) {
-
-        # Error handling
-        if (length(unique(group))!=2){
-                print("Number of conditions identified in experiment does not equal 2")
-                q()
-        }
-        require(edgeR)
-        mt = paste(unlist(strsplit(myTitle,'_')),collapse=" ")
-        allN = nrow(Count_Matrix)
-        nscut = round(ncol(Count_Matrix)/2)
-        colTotmillionreads = colSums(Count_Matrix)/1e6
-        rawrs = rowSums(Count_Matrix)
-        nonzerod = Count_Matrix[(rawrs > 0),] # remove all zero count genes
-        nzN = nrow(nonzerod)
-        nzrs = rowSums(nonzerod)
-        zN = allN - nzN
-        print('# Quantiles for non-zero row counts:',quote=F)
-        print(quantile(nzrs,probs=seq(0,1,0.1)),quote=F)
-        if (useNDF == "T")
-        {
-        gt1rpin3 = rowSums(Count_Matrix/expandAsMatrix(colTotmillionreads,dim(Count_Matrix)) >= 1) >= nscut
-        lo = colSums(Count_Matrix[!gt1rpin3,])
-        workCM = Count_Matrix[gt1rpin3,]
-        cleanrs = rowSums(workCM)
-        cleanN = length(cleanrs)
-        meth = paste( "After removing",length(lo),"contigs with fewer than ",nscut," sample read counts >= 1 per million, there are",sep="")
-        print(paste("Read",allN,"contigs. Removed",zN,"contigs with no reads.",meth,cleanN,"contigs"),quote=F)
-        maint = paste('Filter >=1/million reads in >=',nscut,'samples')
-        }
-        else {
-        useme = (nzrs > quantile(nzrs,filterquantile))
-        workCM = nonzerod[useme,]
-        lo = colSums(nonzerod[!useme,])
-        cleanrs = rowSums(workCM)
-        cleanN = length(cleanrs)
-        meth = paste("After filtering at count quantile =",filterquantile,", there are",sep="")
-        print(paste('Read',allN,"contigs. Removed",zN,"with no reads.",meth,cleanN,"contigs"),quote=F)
-        maint = paste('Filter below',filterquantile,'quantile')
-        }
-        cumPlot(rawrs=rawrs,cleanrs=cleanrs,maint=maint,myTitle=myTitle)
-        allgenes <- rownames(workCM)
-        print(paste("# Total low count contigs per sample = ",paste(lo,collapse=',')),quote=F)
-        rsums = rowSums(workCM)
-        TName=unique(group)[1]
-        CName=unique(group)[2]
-        # Setup DGEList object
-        DGEList = DGEList(counts=workCM, group = group)
-        if (length(subjects) == 0)
-            {
-               doDESEQ = T
-               mydesign = model.matrix(~group)
-            }
-            else {
-              doDESEQ = F
-              subjf = factor(subjects)
-              mydesign = model.matrix(~subjf+group) # we block on subject so make group last to simplify finding it
-            }
-        print.noquote(paste('Using samples:',paste(colnames(workCM),collapse=',')))
-        print.noquote('Using design matrix:')
-        print.noquote(mydesign)
-        DGEList = estimateGLMCommonDisp(DGEList,mydesign)
-        comdisp = DGEList\$common.dispersion
-        DGEList = estimateGLMTrendedDisp(DGEList,mydesign)
-        if (priordf > 0) {
-           print.noquote(paste("prior.df =",priordf))
-           DGEList = estimateGLMTagwiseDisp(DGEList,mydesign,prior.df = priordf)
-        } else {
-           DGEList = estimateGLMTagwiseDisp(DGEList,mydesign)
-        }
-        DGLM = glmFit(DGEList,design=mydesign)
-        efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors
-        normData = (1e+06*DGEList\$counts/efflib)
-        co = length(colnames(mydesign))
-        DE = glmLRT(DGLM,coef=co) # always last one - subject is first if needed
-        uoutput = cbind(
-                Name=as.character(rownames(DGEList\$counts)),
-                DE\$table,
-                adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
-                Dispersion=DGEList\$tagwise.dispersion,totreads=rsums,normData,
-                DGEList\$counts
-        )
-        soutput = uoutput[order(DE\$table\$PValue),] # sorted into p value order - for quick toptable
-        goodness = gof(DGLM, pcutoff=fdrthresh)
-        if (sum(goodness\$outlier) > 0) {
-            print.noquote('GLM outliers:')
-            print(paste(rownames(DGLM)[(goodness\$outlier != 0)],collapse=','),quote=F)
-            z = limma::zscoreGamma(goodness\$gof.statistic, shape=goodness\$df/2, scale=2)
-            pdf(paste(mt,"GoodnessofFit.pdf",sep='_'))
-            qq = qqnorm(z, panel.first=grid(), main="tagwise dispersion")
-            abline(0,1,lwd=3)
-            points(qq\$x[goodness\$outlier],qq\$y[goodness\$outlier], pch=16, col="dodgerblue")
-            dev.off()
-            } else { print('No GLM fit outlier genes found\n')}
-        estpriorn = getPriorN(DGEList)
-        print(paste("Common Dispersion =",comdisp,"CV = ",sqrt(comdisp),"getPriorN = ",estpriorn),quote=F)
-        efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors
-        normData = (1e+06*DGEList\$counts/efflib)
-        uniqueg = unique(group)
-        # Plot MDS
-        sample_colors =  match(group,levels(group))
-        pdf(paste(mt,"MDSplot.pdf",sep='_'))
-        sampleTypes = levels(group)
-        plotMDS.DGEList(DGEList,main=paste("MDS Plot for",myTitle),cex=0.5,col=sample_colors,pch=sample_colors)
-        legend(x="topleft", legend = sampleTypes,col=c(1:length(sampleTypes)), pch=19)
-        grid(col="blue")
-        dev.off()
-        colnames(normData) = paste( colnames(normData),'N',sep="_")
-        print(paste('Raw sample read totals',paste(colSums(nonzerod,na.rm=T),collapse=',')))
-        nzd = data.frame(log(nonzerod + 1e-2,10))
-        boxPlot(rawrs=nzd,cleanrs=log(normData,10),maint='TMM Normalisation',myTitle=myTitle)
-        if (doDESEQ)
-        {
-        # DESeq
-        deSeqDatcount <- newCountDataSet(workCM, group)
-        deSeqDatsizefac <- estimateSizeFactors(deSeqDatcount)
-        deSeqDatdisp <- estimateDispersions(deSeqDatsizefac)
-        rDESeq <- nbinomTest(deSeqDatdisp, levels(group)[1], levels(group)[2])
-        rDESeq <- rDESeq[order(rDESeq\$pval), ]
-        write.table(rDESeq,paste(mt,'DESeq_TopTable.xls',sep='_'), quote=FALSE, sep="\t",row.names=F)
-        topresults.DESeq <- rDESeq[which(rDESeq\$padj < fdrthresh), ]
-        DESeqcountsindex <- which(allgenes %in% topresults.DESeq\$id)
-        DESeqcounts <- rep(0, length(allgenes))
-        DESeqcounts[DESeqcountsindex] <- 1
-        }
-        DGEList = calcNormFactors(DGEList)
-        norm.factor = DGEList\$samples\$norm.factors
-        pdf(paste(mt,"voomplot.pdf",sep='_'))
-        dat.voomed <- voom(DGEList, mydesign, plot = TRUE, lib.size = colSums(workCM) * norm.factor)
-        dev.off()
-        # Use limma to fit data
-        fit <- lmFit(dat.voomed, mydesign)
-        fit <- eBayes(fit)
-        rvoom <- topTable(fit, coef = length(colnames(mydesign)), adj = "BH", n = Inf)
-        write.table(rvoom,paste(mt,'VOOM_topTable.xls',sep='_'), quote=FALSE, sep="\t",row.names=F)
-        # Use an FDR cutoff to find interesting samples for edgeR, DESeq and voom/limma
-        topresults.voom <- rvoom[which(rvoom\$adj.P.Val < fdrthresh), ]
-        topresults.edgeR <- soutput[which(soutput\$adj.p.value < fdrthresh), ]
-        # Create venn diagram of hits
-        edgeRcountsindex <- which(allgenes %in% rownames(topresults.edgeR))
-        voomcountsindex <- which(allgenes %in% topresults.voom\$ID)
-        edgeRcounts <- rep(0, length(allgenes))
-        edgeRcounts[edgeRcountsindex] <- 1
-        voomcounts <- rep(0, length(allgenes))
-        voomcounts[voomcountsindex] <- 1
-        if (doDESEQ) {
-            vennmain = paste(mt,'Voom,edgeR and DESeq overlap at FDR=',fdrthresh)
-            counts.dataframe <- data.frame(edgeRcounts = edgeRcounts, DESeqcounts = DESeqcounts,
-                voomcounts = voomcounts, row.names = allgenes)
-        } else {
-            vennmain = paste(mt,'Voom and edgeR overlap at FDR=',fdrthresh)
-            counts.dataframe <- data.frame(edgeRcounts = edgeRcounts, voomcounts = voomcounts, row.names = allgenes)
-        }
-        counts.venn <- vennCounts(counts.dataframe)
-        vennf = paste(mt,'venn.pdf',sep='_')
-        pdf(vennf)
-        vennDiagram(counts.venn,main=vennmain,col="maroon")
-        dev.off()
-        #Prepare our output file
-        nreads = soutput\$totreads # ordered same way
-        print('# writing output',quote=F)
-        write.table(soutput,outputfilename, quote=FALSE, sep="\t",row.names=F)
-        rn = uoutput\$Name
-        reg = "^chr([0-9]+):([0-9]+)-([0-9]+)"
-        org="hg19"
-        genecards="<a href='http://www.genecards.org/index.php?path=/Search/keyword/"
-        ucsc = paste("<a href='http://genome.ucsc.edu/cgi-bin/hgTracks?db=",org,sep='')
-        testreg = str_match(rn,reg)
-        nreads = uoutput\$totreads # ordered same way
-        if (sum(!is.na(testreg[,1]))/length(testreg[,1]) > 0.8) # is ucsc style string
-        {
-          urls = paste(ucsc,'&amp;position=chr',testreg[,2],':',testreg[,3],"-",testreg[,4],"'>",rn,'</a>',sep='')
-        } else {
-          urls = paste(genecards,rn,"'></a>",rn,'</a>',sep="")
-        }
-        print.noquote('# urls')
-        cat(head(urls))
-        tt = uoutput
-        cat("# Top tags\n")
-        tt = cbind(tt,ntotreads=nreads,URL=urls) # add to end so table isn't laid out strangely
-        tt = tt[order(DE\$table\$PValue),]
-        options(width = 500)
-        print.noquote(tt[1:50,])
-        pdf(paste(mt,"BCV_vs_abundance.pdf",sep='_'))
-        plotBCV(DGEList, cex=0.3, main="Biological CV vs abundance")
-        dev.off()
-        # Plot MAplot
-        deTags = rownames(uoutput[uoutput\$adj.p.value < fdrthresh,])
-        nsig = length(deTags)
-        print(paste('#',nsig,'tags significant at adj p=',fdrthresh),quote=F)
-        print('# deTags',quote=F)
-        print(head(deTags))
-        dg = DGEList[order(DE\$table\$PValue),]
-        #normData = (1e+06 * dg\$counts/expandAsMatrix(dg\$samples\$lib.size, dim(dg)))
-        efflib = dg\$samples\$lib.size*dg\$samples\$norm.factors
-        normData = (1e+06*dg\$counts/efflib)
-        outpdfname=paste(mt,"heatmap.pdf",sep='_')
-        hmap2(normData,nsamp=100,TName=TName,group=group,outpdfname=outpdfname,myTitle=myTitle)
-        outSmear = paste(mt,"Smearplot.pdf",sep='_')
-        outMain = paste("Smear Plot for ",TName,' Vs ',CName,' (FDR@',fdrthresh,' N = ',nsig,')',sep='')
-        smearPlot(DGEList=DGEList,deTags=deTags, outSmear=outSmear, outMain = outMain)
-        qqPlot(descr=myTitle,pvector=DE\$table\$PValue)
-        if (doDESEQ) {
-           cat("# DESeq top 50\n")
-           print(rDESeq[1:50,])
-        }
-        cat("# VOOM top 50\n")
-        print(rvoom[1:50,])
-        # need a design matrix and glm to use this
-        goodness = gof(DGLM, pcutoff=fdrthresh)
-        nout = sum(goodness\$outlier)
-        if (nout > 0) {
-            print.noquote(paste('Found',nout,'Goodness of fit outliers'))
-            rownames(DGLM)[goodness\$outlier]
-            z = limma::zscoreGamma(goodness\$gof.statistic, shape=goodness\$df/2, scale=2)
-            pdf(paste(mt,"GoodnessofFit.pdf",sep='_'))
-            qq = qqnorm(z, panel.first=grid(), main="tagwise dispersion")
-            abline(0,1,lwd=3)
-            points(qq\$x[goodness\$outlier],qq\$y[goodness\$outlier], pch=16, col="dodgerblue")
-            dev.off()
-            }
-        #Return our main table
-        uoutput
-
-}       #Done
-sink(stdout(),append=T,type="message")
-options(width=512)
-Out_Dir = "$html_file.files_path"
-Input =  "$input1"
-TreatmentName = "$treatment_name"
-TreatmentCols = "$Treat_cols"
-ControlName = "$control_name"
-ControlCols= "$Control_cols"
-outputfilename = "$outtab"
-fdrtype = "$fdrtype"
-priordf = $priordf
-fdrthresh = $fdrthresh
-useNDF = "$useNDF"
-fQ = $fQ # non-differential centile cutoff
-myTitle = "$title"
-subjects = c($subjectids)
-nsubj = length(subjects)
-#Set our columns
-TCols           = as.numeric(strsplit(TreatmentCols,",")[[1]])-1
-CCols           = as.numeric(strsplit(ControlCols,",")[[1]])-1
-cat('# got TCols=')
-cat(TCols)
-cat('; CCols=')
-cat(CCols)
-cat('\n')
-useCols = c(TCols,CCols)
-# Create output dir if non existent
-  if (file.exists(Out_Dir) == F) dir.create(Out_Dir)
-
-Count_Matrix = read.table(Input,header=T,row.names=1,sep='\t') #Load tab file assume header
-snames = colnames(Count_Matrix)
-nsamples = length(snames)
-if (nsubj >  0 & nsubj != nsamples) {
-options("show.error.messages"=T)
-mess = paste('Fatal error: Supplied subject id list',paste(subjects,collapse=','),'has length',nsubj,'but there are',nsamples,'samples',paste(snames,collapse=','))
-write(mess, stderr())
-#print(mess)
-quit(save="no",status=4)
-}
-
-Count_Matrix = Count_Matrix[,useCols] # reorder columns
-if (length(subjects) != 0) {subjects = subjects[useCols]}
-rn = rownames(Count_Matrix)
-islib = rn %in% c('librarySize','NotInBedRegions')
-LibSizes = Count_Matrix[subset(rn,islib),][1] # take first
-Count_Matrix = Count_Matrix[subset(rn,! islib),]
-group = c(rep(TreatmentName,length(TCols)), rep(ControlName,length(CCols)) )             #Build a group descriptor
-group = factor(group, levels=c(ControlName,TreatmentName))
-colnames(Count_Matrix) = paste(group,colnames(Count_Matrix),sep="_")                   #Relable columns
-results = edgeIt(Count_Matrix=Count_Matrix,group=group,outputfilename=outputfilename,fdrtype=fdrtype,priordf=priordf,fdrthresh=fdrthresh,
-   outputdir=Out_Dir,myTitle=myTitle,libSize=c(),useNDF=useNDF,filterquantile=fQ,subjects=subjects)
-#Run the main function
-# for the log
-sessionInfo()
-]]>
-</configfile>
-</configfiles>
-<help>
-**What it does**
-
-Performs digital gene expression analysis between a treatment and control on a count matrix.
-Optionally adds a term for subject if not all samples are independent or if some other factor needs to be blocked in the design.
-
-**Input**
-
-A matrix consisting of non-negative integers. The matrix must have a unique header row identifiying the samples, and a unique set of row names
-as  the first column. Typically the row names are gene symbols or probe id's for downstream use in GSEA and other methods.
-
-If you have (eg) paired samples and wish to include a term in the GLM to account for some other factor (subject in the case of paired samples),
-put a comma separated list of indicators for every sample (whether modelled or not!) indicating (eg) the subject number or
-A list of integers, one for each subject or an empty string if samples are all independent.
-If not empty, there must be exactly as many integers in the supplied integer list as there are columns (samples) in the count matrix.
-Integers for samples that are not in the analysis *must* be present in the string as filler even if not used.
-
-So if you have 2 pairs out of 6 samples, you need to put in unique integers for the unpaired ones
-eg if you had 6 samples with the first two independent but the second and third pairs each being from independent subjects. you might use
-8,9,1,1,2,2
-as subject IDs to indicate two paired samples from the same subject in columns 3/4 and 5/6
-
-**Output**
-
-A matrix which consists the original data and relative expression levels and some helpful plots
-
-**Note on edgeR versions**
-
-The edgeR authors made a small cosmetic change in the name of one important variable (from p.value to PValue)
-breaking this and all other code that assumed the old name for this variable,
-between edgeR2.4.4 and 2.4.6 (the version for R 2.14 as at the time of writing).
-This means that all code using edgeR is sensitive to the version. I think this was a very unwise thing
-to do because it wasted hours of my time to track down and will similarly cost other edgeR users dearly
-when their old scripts break. This tool currently now works with 2.4.6.
-
-**Note on prior.N**
-
-http://seqanswers.com/forums/showthread.php?t=5591 says:
-
-*prior.n*
-
-The value for prior.n determines the amount of smoothing of tagwise dispersions towards the common dispersion.
-You can think of it as like a "weight" for the common value. (It is actually the weight for the common likelihood
-in the weighted likelihood equation). The larger the value for prior.n, the more smoothing, i.e. the closer your
-tagwise dispersion estimates will be to the common dispersion. If you use a prior.n of 1, then that gives the
-common likelihood the weight of one observation.
-
-In answer to your question, it is a good thing to squeeze the tagwise dispersions towards a common value,
-or else you will be using very unreliable estimates of the dispersion. I would not recommend using the value that
-you obtained from estimateSmoothing()---this is far too small and would result in virtually no moderation
-(squeezing) of the tagwise dispersions. How many samples do you have in your experiment?
-What is the experimental design? If you have few samples (less than 6) then I would suggest a prior.n of at least 10.
-If you have more samples, then the tagwise dispersion estimates will be more reliable,
-so you could consider using a smaller prior.n, although I would hesitate to use a prior.n less than 5.
-
-
-From Bioconductor Digest, Vol 118, Issue 5, Gordon writes:
-
-Dear Dorota,
-
-The important settings are prior.df and trend.
-
-prior.n and prior.df are related through prior.df = prior.n * residual.df,
-and your experiment has residual.df = 36 - 12 = 24.  So the old setting of
-prior.n=10 is equivalent for your data to prior.df = 240, a very large
-value.  Going the other way, the new setting of prior.df=10 is equivalent
-to prior.n=10/24.
-
-To recover old results with the current software you would use
-
-  estimateTagwiseDisp(object, prior.df=240, trend="none")
-
-To get the new default from old software you would use
-
-  estimateTagwiseDisp(object, prior.n=10/24, trend=TRUE)
-
-Actually the old trend method is equivalent to trend="loess" in the new
-software. You should use plotBCV(object) to see whether a trend is
-required.
-
-Note you could also use
-
-  prior.n = getPriorN(object, prior.df=10)
-
-to map between prior.df and prior.n.
-
-</help>
-
-</tool>
-
-
--- a/rgedgeR/test-data/gentestdata.sh~	Wed Jun 12 05:21:25 2013 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,8 +0,0 @@
-#!/bin/bash
-# generate test data for rgGSEA
-# ross lazarus June 2013
-# adjust gseajar_path !
-GSEAJAR_PATH=/home/rlazarus/galaxy-central/tool_dependency_dir/gsea_jar/2.0.12/fubar/rg_gsea_test/8e291f464aa0/jars/gsea2-2.0.12.jar
-python ../rgGSEA.py --input_tab "gsea_test_DGE.xls"  --adjpvalcol "5" --signcol "2" --idcol "1" --outhtml "gseatestout.html" --input_name "gsea_test" --setMax "500" --setMin "15" --nPerm "10" --plotTop "20" --gsea_jar "$GSEAJAR_PATH" --output_dir "gseatestout" --mode "Max_probe" --title "GSEA test" --builtin_gmt "gseatestdata.gmt"
-
-
--- a/rgedgeR/tool_dependencies.xml	Wed Jun 12 05:21:25 2013 -0400
+++ b/rgedgeR/tool_dependencies.xml	Wed Jun 12 06:13:41 2013 -0400
@@ -1,37 +1,10 @@
 <?xml version="1.0"?>
 <tool_dependency>
-    <package name="readline" version="6.2">
-        <repository changeset_revision="1301ec7705a8" name="package_readline_6_2" owner="boris" prior_installation_required="True" toolshed="http://testtoolshed.g2.bx.psu.edu/" />
-    </package>
-    <package name="package_R" version="3.0.1">
-        <install version="1.0">
-            <actions>
-                <action type="download_by_url">http://cran.ms.unimelb.edu.au/src/base/R-3/R-3.0.1.tar.gz</action>
-                <action type="set_environment_for_install">
-                    <repository changeset_revision="1301ec7705a8" name="package_readline_6_2" owner="boris" toolshed="http://testtoolshed.g2.bx.psu.edu/">
-                        <package name="package_readline_6_2" version="6.2" />
-                    </repository>
-                </action>
-                <action type="make_directory">$INSTALL_DIR</action>
-                <action type="shell_command">./configure --with-blas --with-lapack --enable-R-shlib  --with-x=no --prefix=$INSTALL_DIR &amp;&amp; make &amp;&amp; make install</action>
-                <action type="set_environment">
-                    <environment_variable action="set_to" name="R_HOME">$INSTALL_DIR/lib/R</environment_variable>
-                    <environment_variable action="set_to" name="R_LIBS">$INSTALL_DIR/lib/R/library</environment_variable>
-                    <environment_variable action="prepend_to" name="PATH">$INSTALL_DIR/lib/R/bin</environment_variable>
-                </action>
-            </actions>
-        </install>
-        <readme>R is a free software environment for statistical computing and graphics
-                WARNING: See custom compilation options above
-                Modified from an older version of R by Boris by Ross Lazarus for R 3.0
-                Added Bioc basics too
-       </readme>
-    </package>
     <package name="package_BioCBasics" version="2.12">
         <install version="1.0">
             <actions>
                 <action type="set_environment_for_install">
-                    <package name="package_R" prior_installation_required="True" toolshed="http://testtoolshed.g2.bx.psu.edu" version="3.0.1" />
+                    <repository name="package_R" owner="fubar" prior_installation_required="True"  toolshed="http://testtoolshed.g2.bx.psu.edu/"/>
                 </action>
                 <action type="shell_command">$R_HOME/bin/R CMD BATCH installBioC.R </action>
             </actions>
--- a/rgedgeR/tool_dependencies.xml~	Wed Jun 12 05:21:25 2013 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,44 +0,0 @@
-<?xml version="1.0"?>
-<tool_dependency>
-    <package name="package_readline_6_2" version="6.2">
-        <repository name="package_readline_6_2" owner="boris" prior_installation_required="True"
-                toolshed="http://testtoolshed.g2.bx.psu.edu/" />
-    </package>
-    <package name="package_R" version="3.0.1">
-        <install version="1.0">
-            <actions>
-                <action type="download_by_url">http://cran.ms.unimelb.edu.au/src/base/R-3/R-3.0.1.tar.gz</action>
-                <action type="set_environment_for_install">
-                    <repository changeset_revision="1301ec7705a8" name="package_readline_6_2" owner="boris"
-                            toolshed="http://testtoolshed.g2.bx.psu.edu/">
-                        <package name="package_readline_6_2" version="6.2" />
-                    </repository>
-                </action>
-                <action type="make_directory">$INSTALL_DIR</action>
-                <action type="shell_command">./configure --with-blas --with-lapack --enable-R-shlib --with-readline=no --with-x=no --prefix=$INSTALL_DIR &amp;&amp; make &amp;&amp; make install</action>
-                <action type="set_environment">
-                    <environment_variable action="set_to" name="R_HOME">$INSTALL_DIR/lib/R</environment_variable>
-                    <environment_variable action="set_to" name="R_LIBS">$INSTALL_DIR/lib/R/library</environment_variable>
-                    <environment_variable action="prepend_to" name="PATH">$INSTALL_DIR/lib/R/bin</environment_variable>
-                </action>
-            </actions>
-        </install>
-        <readme>R is a free software environment for statistical computing and graphics
-                WARNING: See custom compilation options above
-                Modified from an older version of R by Boris by Ross Lazarus for R 3.0
-                Added Bioc basics too
-       </readme>
-    </package>
-    <package name="package_BioCBasics" version="2.12">
-        <install version="1.0">
-            <actions>
-                <action type="shell_command">$INSTALL_DIR/lib/R/bin/R CMD BATCH installBioC.R </action>
-            </actions>
-        </install>
-        <readme>R is a free software environment for statistical computing and graphics
-                WARNING: See custom compilation options above
-                Modified from an older version of R by Boris by Ross Lazarus for R 3.0
-                Added Bioc basics via this package installBioC.R script
-       </readme>
-    </package>
-</tool_dependency>