Mercurial > repos > fubar > edger_test

--- a/rgedgeR/rgGSEA.py	Wed Jun 12 02:58:43 2013 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,494 +0,0 @@
-"""
-April 2013
-eeesh GSEA does NOT respect the mode flag!
-
-Now realise that the creation of the input rank file for gsea needs to take the lowest p value for duplicate
-feature names. To make Ish's life easier, remove duplicate gene ids from any gene set to stop GSEA from
-barfing.
-
-October 14 2012
-Amazingly long time to figure out that GSEA fails with useless error message if any filename contains a dash "-"
-eesh.
-
-Added history .gmt source - requires passing a faked name to gsea
-Wrapper for GSEA http://www.broadinstitute.org/gsea/index.jsp
-Started Feb 22
-Copyright 2012 Ross Lazarus
-All rights reserved
-Licensed under the LGPL
-
-called eg as
-
-#!/bin/sh
-GALAXY_LIB="/data/extended/galaxy/lib"
-if [ "$GALAXY_LIB" != "None" ]; then
-    if [ -n "$PYTHONPATH" ]; then
-        PYTHONPATH="$GALAXY_LIB:$PYTHONPATH"
-    else
-        PYTHONPATH="$GALAXY_LIB"
-    fi
-    export PYTHONPATH
-fi
-
-cd /data/extended/galaxy/database/job_working_directory/027/27311
-python /data/extended/galaxy/tools/rgenetics/rgGSEA.py --input_tab "/data/extended/galaxy/database/files/033/dataset_33806.dat"  --adjpvalcol "5" --signcol "2"
---idcol "1" --outhtml "/data/extended/galaxy/database/files/034/dataset_34455.dat" --input_name "actaearly-Controlearly-actalate-Controllate_topTable.xls"
---setMax "500" --setMin "15" --nPerm "1000" --plotTop "20"
---gsea_jar "/data/extended/galaxy/tool-data/shared/jars/gsea2-2.0.12.jar"
---output_dir "/data/extended/galaxy/database/job_working_directory/027/27311/dataset_34455_files" --mode "Max_probe"
- --title " actaearly-Controlearly-actalate-Controllate_interpro_GSEA" --builtin_gmt "/data/genomes/gsea/3.1/IPR_DOMAIN.gmt"
-
-
-"""
-import optparse
-import tempfile
-import os
-import sys
-import subprocess
-import time
-import shutil
-import glob
-import math
-import re
-
-KEEPSELECTION = False # detailed records for selection of multiple probes
-
-def timenow():
-    """return current time as a string
-    """
-    return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))
-
-
-
-def fix_subdir(adir,destdir):
-    """ Galaxy wants everything in the same files_dir
-    if os.path.exists(adir):
-        for (d,dirs,files) in os.path.walk(adir):
-            for f in files:
-                sauce = os.path.join(d,f)
-                shutil.copy(sauce,destdir)
-    """
-
-    def fixAffycrap(apath=''):
-        """class='richTable'>RUNNING ES</th><th class='richTable'>CORE ENRICHMENT</th><tr><td class='lessen'>1</td>
-        <td><a href='https://www.affymetrix.com/LinkServlet?probeset=LBR'>LBR</a></td><td></td><td></td><td>1113</td>
-        <td>0.194</td><td>-0.1065</td><td>No</td></tr><tr><td class='lessen'>2</td><td>
-        <a href='https://www.affymetrix.com/LinkServlet?probeset=GGPS1'>GGPS1</a></td><td></td><td></td><td>4309</td><td>0.014</td><td>-0.4328</td>
-        <td>No</td></tr>
-        """
-        html = []
-        try:
-            html = open(apath,'r').readlines()
-        except:
-             return html
-        for i,row in enumerate(html):
-            row = re.sub('https\:\/\/www.affymetrix.com\/LinkServlet\?probeset=',"http://www.genecards.org/index.php?path=/Search/keyword/",row)
-            html[i] = row
-        return html
-
-    cleanup = False
-    if os.path.exists(adir):
-        flist = os.listdir(adir) # get all files created
-        for f in flist:
-           apath = os.path.join(adir,f)
-           dest = os.path.join(destdir,f)
-           if not os.path.isdir(apath):
-               if os.path.splitext(f)[1].lower() == '.html':
-                   html = fixAffycrap(apath)
-                   fixed = open(apath,'w')
-                   fixed.write('\n'.join(html))
-                   fixed.write('\n')
-                   fixed.close()
-               if not os.path.isfile(dest):
-                   shutil.copy(apath,dest)
-           else:
-               fix_subdir(apath,destdir)
-        if cleanup:
-            try:
-                shutil.rmtree(path=adir,ignore_errors=True)
-            except:
-                pass
-
-
-
-def getFileString(fpath, outpath):
-    """
-    format a nice file size string
-    """
-    size = ''
-    fp = os.path.join(outpath, fpath)
-    s = fpath
-    if os.path.isfile(fp):
-        n = float(os.path.getsize(fp))
-        if n > 2**20:
-            size = ' (%1.1f MB)' % (n/2**20)
-        elif n > 2**10:
-            size = ' (%1.1f KB)' % (n/2**10)
-        elif n > 0:
-            size = ' (%d B)' % (int(n))
-        s = '%s %s' % (fpath, size)
-    return s
-
-class gsea_wrapper:
-    """
-    GSEA java desktop client has a CL interface. CL can be gleaned by clicking the 'command line' button after setting up an analysis
-    We don't want gsea to do the analysis but it can read .rnk files containing rows of identifiers and an evidence weight such as the signed t statistic from limma for differential expression
-(vgalaxy)rlazarus@iaas1:~/public_html/idle_illumina_analysis$ cat gseaHumanREFSEQ.sh
-#!/bin/bash
-for RNK in `ls *.rnk`
-do
-DIRNAME=${RNK%.*}
-echo $DIRNAME
-qsub -cwd -b y java -Xmx4096m -cp /data/app/bin/gsea2-2.07.jar xtools.gsea.GseaPreranked -gmx ../msigdb.v3.0.symbols.gmt -collapse true -mode Max_probe -norm meandiv
--nperm 1000 -rnk $RNK -scoring_scheme weighted -rpt_label $RNK -chip ../RefSeq_human.chip -include_only_symbols true -make_sets true -plot_top_x 20 -rnd_seed timestamp
--set_max 500 -set_min 15 -zip_report false -out gseaout/${DIRNAME} -gui false
-done
-    """
-
-    def __init__(self,myName=None,opts=None):
-        """ setup cl for gsea
-        """
-        self.idcol = 0
-        self.signcol = 0
-        self.adjpvalcol = 0
-        self.progname=myName
-        self.opts = opts
-        remove_duplicates=True
-        if not os.path.isdir(opts.output_dir):
-            try:
-                os.makedirs(opts.output_dir)
-            except:
-                print >> sys.stderr,'##Error: GSEA wrapper unable to create or find output directory %s. Stopping' % (opts.output_dir)
-                sys.exit(1)
-        fakeGMT = re.sub('[^a-zA-Z0-9_]+', '', opts.input_name) # gives a more useful title for the GSEA report
-        fakeGMT = os.path.join(opts.output_dir,fakeGMT)
-        fakeGMT = os.path.abspath(fakeGMT)
-        fakeRanks = '%s.rnk' % fakeGMT
-        if not fakeGMT.endswith('.gmt'):
-            fakeGMT = '%s.gmt' % fakeGMT
-        if opts.builtin_gmt and opts.history_gmt:
-            newfile = open(fakeGMT,'w')
-            subprocess.call(['cat',opts.builtin_gmt,opts.history_gmt],stdout=newfile)
-            newfile.close()
-        elif opts.history_gmt:
-           subprocess.call(['cp',opts.history_gmt,fakeGMT])
-        else:
-           subprocess.call(['cp',opts.builtin_gmt,fakeGMT])
-        # remove dupes from each gene set
-        gmt = open(fakeGMT,'r').readlines()
-        gmt = [x for x in gmt if len(x.split('\t')) > 3]
-        ugmt = []
-        for i,row in enumerate(gmt):
-            rows = row.rstrip().split('\t')
-            gmtname = rows[0]
-            gmtcomment = rows[1]
-            glist = list(set(rows[2:]))
-            newgmt = [gmtname,gmtcomment]
-            newgmt += glist
-            ugmt.append('\t'.join(newgmt))
-        gmt = open(fakeGMT,'w')
-        gmt.write('\n'.join(ugmt))
-        gmt.write('\n')
-        gmt.close()
-        if opts.input_ranks:
-            infname = opts.input_ranks
-            rdat = open(opts.input_ranks,'r').readlines() # suck in and remove blank ids that cause gsea to barf rml april 10 2012
-            rdat = [x.rstrip().split('\t') for x in rdat[1:]] # ignore head
-            dat = [[x[0],x[1],x[1]] for x in rdat]
-            # fake same structure as input tabular file
-            try:
-                pvals = [float(x[1]) for x in dat]
-                signs = [float(x[1]) for x in dat]
-            except:
-                print >> sys.stderr, '## error converting floating point - cannot process this input'
-                sys.exit(99)
-        else: # read tabular
-            self.idcol = int(opts.idcol) - 1
-            self.signcol = int(opts.signcol) - 1
-            self.adjpvalcol = int(opts.adjpvalcol) - 1
-            maxcol = max(self.idcol,self.signcol,self.adjpvalcol)
-            infname = opts.input_tab
-            indat = open(opts.input_tab,'r').readlines()
-            dat = [x.rstrip().split('\t') for x in indat[1:]]
-            dat = [x for x in dat if len(x) > maxcol]
-            dat = [[x[self.idcol],x[self.adjpvalcol],x[self.signcol]] for x in dat] # reduce to rank form
-            pvals = [float(x[1]) for x in dat]
-            outofrange = [x for x in pvals if ((x < 0.0) or (x > 1.0))]
-            assert len(outofrange) == 0, '## p values outside 0-1 encountered - was that the right column for adjusted p value?'
-            signs = [float(x[2]) for x in dat]
-            outofrange = [i for i,x in enumerate(signs) if (not x) and (dat[i][self.signcol] <> '0')]
-            bad = [dat[x][2] for x in outofrange]
-            assert len(outofrange) == 0, '## null numeric values encountered for sign - was that the right column? %s' % bad
-        ids = [x[0] for x in dat]
-        res = []
-        self.comments = []
-        useme = []
-        for i,row in enumerate(dat):
-            if row[1].upper() != 'NA' and row[2].upper() != 'NA' and row[0] != '' :
-                useme.append(i)
-        lost = len(dat) - len(useme)
-        if lost <> 0:
-            newdat = [dat[x] for x in useme]
-            del dat
-            dat = newdat
-            print >> sys.stdout, '## %d lost - NA values or null id' % lost
-        if remove_duplicates:
-            uids = list(set(ids)) # complex procedure to get min pval for each unique id
-            if len(uids) <> len(ids): # dupes - deal with mode
-                print >> sys.stdout,'## Dealing with %d uids in %d ids' % (len(uids),len(ids))
-                ures = {}
-                for i,id in enumerate(ids):
-                    p = pvals[i]
-                    ures.setdefault(id,[])
-                    ures[id].append((p,signs[i]))
-                for id in uids:
-                    tlist = ures[id]
-                    tp = [x[0] for x in tlist]
-                    ts = [x[1] for x in tlist]
-                    if len(tp) == 1:
-                        p = tp[0]
-                        sign = ts[0]
-                    else:
-                        if  opts.mode == "Max_probe":
-                            p = min(tp)
-                            sign = ts[tp.index(p)]
-                        else: # guess median - too bad if even count
-                            tp.sort()
-                            ltp = len(tp)
-                            ind = ltp/2 # yes, this is wrong for evens but what if sign wobbles?
-                            if ltp % 2 == 1: # odd
-                                ind += 1 # take the median
-                            p = tp[ind]
-                            sign = ts[ind]
-                        if KEEPSELECTION:
-                              self.comments.append('## for id=%s, got tp=%s, ts=%s, chose p=%f, sign =%f'\
-                                   % (id,str(tp),str(ts),p,sign))
-                    if opts.input_ranks: # must be a rank file
-                        res.append((id,'%f' % p))
-                    else:
-                        if p == 0.0:
-                            p = 1e-99
-                        try:
-                            lp = -math.log10(p) # large positive if low p value
-                        except ValueError:
-                            lp = 0.0
-                        if sign < 0:
-                            lp = -lp # if negative, swap p to negative
-                        res.append((id,'%f' % lp))
-            else: # no duplicates
-                for i,row in enumerate(dat):
-                    (id,p,sign) = (row[0],float(row[1]),float(row[2]))
-                    if opts.input_ranks: # must be a rank file
-                        res.append((id,'%f' % p))
-                    else:
-                        if p == 0.0:
-                            p = 1e-99
-                        try:
-                            lp = -math.log10(p) # large positive if low p value
-                        except ValueError:
-                            lp = 0.0
-                        if sign < 0:
-                            lp = -lp # if negative, swap p to negative
-                        res.append((id,'%f' % lp))
-        else:
-            for i,row in enumerate(dat):
-                (id,p,sign) = (row[0],float(row[1]),float(row[2]))
-                if opts.input_ranks: # must be a rank file
-                    res.append((id,'%f' % p))
-                else:
-                    if p == 0.0:
-                        p = 1e-99
-                    try:
-                        lp = -math.log10(p) # large positive if low p value
-                    except ValueError:
-                        lp = 0.0
-                    if sign < 0:
-                        lp = -lp # if negative, swap p to negative
-                    res.append((id,'%f' % lp))
-        len1 = len(ids)
-        len2 = len(ranks)
-        delta = len1 - len2
-        if delta <> 0:
-            print >> sys.stdout,'NOTE: %d of %d rank input file %s rows deleted - dup, null or NA IDs, pvals or logFCs' % (delta,len1,infname)
-        ranks = [(float(x[1]),x) for x in ranks] # decorate
-        ranks.sort()
-        ranks.reverse()
-        ranks = [x[1] for x in ranks] # undecorate
-        if opts.chip == '': # if mouse - need HUGO
-             ranks = [[x[0].upper(),x[1]] for x in ranks]
-             print >> sys.stdout, '## Fixed any lower case - now have',','.join([x[0] for x in ranks[:5]])
-        ranks = ['\t'.join(x) for x in ranks]
-        if len(ranks) < 2:
-             print >> sys.stderr,'Input %s has 1 or less rows with two tab delimited fields - please check the tool documentation' % infname
-             sys.exit(1)
-        print '### opening %s and writing %s' % (fakeRanks,str(ranks[:10]))
-        rclean = open(fakeRanks,'w')
-        rclean.write('contig\tscore\n')
-        rclean.write('\n'.join(ranks))
-        rclean.write('\n')
-        rclean.close()
-        cl = []
-        a = cl.append
-        a('java -Xmx6G -cp')
-        a(opts.gsea_jar)
-        a('xtools.gsea.GseaPreranked')
-        a('-gmx %s' % fakeGMT) # ensure .gmt extension as required by GSEA - gene sets to use
-        a('-gui false')    # use preranked file mode and no gui
-        a('-make_sets true -rnd_seed timestamp') # more things from the GUI command line display
-        a('-norm meandiv -zip_report false -scoring_scheme weighted')            # ? need to set these?
-        a('-rnk %s' % fakeRanks) # input ranks file symbol (the chip file is the crosswalk for ids in first column)
-        a('-out %s' % opts.output_dir)
-        a('-set_max %s' % opts.setMax)
-        a('-set_min %s' % opts.setMin)
-        a('-mode %s' % opts.mode)
-        if opts.chip > '':
-           #a('-chip %s -collapse true -include_only_symbols true' % opts.chip)
-           a('-chip %s -collapse true' % opts.chip)
-        else:
-           a("-collapse false") # needed if no chip
-        a('-nperm %s' % opts.nPerm)
-        a('-rpt_label %s' % opts.title)
-        a('-plot_top_x %s' % opts.plotTop)
-        self.cl = cl
-        self.comments.append('## GSEA command line:')
-        self.comments.append(' '.join(self.cl))
-        self.fakeRanks = fakeRanks
-        self.fakeGMT = fakeGMT
-
-    def grepIds(self):
-        """
-        """
-        found = []
-        allids = open(self.opts.input_ranks,'r').readlines()
-        allids = [x.strip().split() for x in allids]
-        allids = [x[0] for x in allids] # list of ids
-        gmtpath = os.path.split(self.opts.fakeGMT)[0] # get path to all chip
-
-    def run(self):
-        """
-
-        """
-        tlog = os.path.join(self.opts.output_dir,"gsea_runner.log")
-        sto = open(tlog,'w')
-        x = subprocess.Popen(' '.join(self.cl),shell=True,stdout=sto,stderr=sto,cwd=self.opts.output_dir)
-        retval = x.wait()
-        sto.close()
-        d = glob.glob(os.path.join(self.opts.output_dir,'%s*' % self.opts.title))
-        if len(d) > 0:
-            fix_subdir(d[0],self.opts.output_dir)
-        htmlfname = os.path.join(self.opts.output_dir,'index.html')
-        try:
-            html = open(htmlfname,'r').readlines()
-            html = [x.strip() for x in html if len(x.strip()) > 0]
-            if len(self.comments) > 0:
-                s = ['<pre>']
-                s += self.comments
-                s.append('</pre>')
-                try:
-                    i = html.index('<div id="footer">')
-                except:
-                    i = len(html) - 7 # fudge
-                html = html[:i] + s + html[i:]
-        except:
-            html = []
-            htmlhead = '<html><head></head><body>'
-            html.append('## Galaxy GSEA wrapper failure')
-            html.append('## Unable to find index.html in %s - listdir=%s' % (d,' '.join(os.listdir(self.opts.output_dir))))
-            html.append('## Command line was %s' % (' '.join(self.cl)))
-            html.append('## commonly caused by mismatched ID/chip selection')
-            glog = open(os.path.join(self.opts.output_dir,'gsea_runner.log'),'r').readlines()
-            html.append('## gsea_runner.log=%s' % '\n'.join(glog))
-            #tryme = self.grepIds()
-            retval = 1
-            print >> sys.stderr,'\n'.join(html)
-            html = ['%s<br/>' % x for x in html]
-            html.insert(0,htmlhead)
-            html.append('</body></html>')
-        htmlf = file(self.opts.outhtml,'w')
-        htmlf.write('\n'.join(html))
-        htmlf.write('\n')
-        htmlf.close()
-        os.unlink(self.fakeRanks)
-        os.unlink(self.fakeGMT)
-        if opts.outtab_neg:
-            tabs = glob.glob(os.path.join(opts.output_dir,"gsea_report_for_*.xls"))
-            if len(tabs) > 0:
-                for tabi,t in enumerate(tabs):
-                    tkind = os.path.basename(t).split('_')[4].lower()
-                    if tkind == 'neg':
-                       outtab = opts.outtab_neg
-                    elif tkind == 'pos':
-                       outtab = opts.outtab_pos
-                    else:
-                       print >> sys.stderr, '## tab file matched %s which is not "neg" or "pos" in 4th segment %s' % (t,tkind)
-                       sys.exit()
-                    content = open(t).readlines()
-                    tabf = open(outtab,'w')
-                    tabf.write(''.join(content))
-                    tabf.close()
-            else:
-                print >> sys.stdout, 'Odd, maketab = %s but no matches - tabs = %s' % (makeTab,tabs)
-        return retval
-
-
-if __name__ == "__main__":
-    """
-    called as:
-   <command interpreter="python">rgGSEA.py --input_ranks "$input1"  --outhtml "$html_file"
-       --setMax "$setMax" --setMin "$setMin" --nPerm "$nPerm" --plotTop "$plotTop" --gsea_jar "$GALAXY_DATA_INDEX_DIR/shared/jars/gsea2-2.07.jar"
-       --output_dir "$html_file.files_path" --use_gmt ""${use_gmt.fields.path}"" --chip "${use_chip.fields.path}"
-  </command>
-    """
-    op = optparse.OptionParser()
-    a = op.add_option
-    a('--input_ranks',default=None)
-    a('--input_tab',default=None)
-    a('--input_name',default=None)
-    a('--use_gmt',default=None)
-    a('--history_gmt',default=None)
-    a('--builtin_gmt',default=None)
-    a('--history_gmt_name',default=None)
-    a('--setMax',default="500")
-    a('--setMin',default="15")
-    a('--nPerm',default="1000")
-    a('--title',default="GSEA report")
-    a('--chip',default='')
-    a('--plotTop',default='20')
-    a('--outhtml',default=None)
-    a('--makeTab',default=None)
-    a('--output_dir',default=None)
-    a('--outtab_neg',default=None)
-    a('--outtab_pos',default=None)
-    a('--adjpvalcol',default=None)
-    a('--signcol',default=None)
-    a('--idcol',default=None)
-    a('--mode',default='Max_probe')
-    a('-j','--gsea_jar',default='/usr/local/bin/gsea2-2.07.jar')
-    opts, args = op.parse_args()
-    assert os.path.isfile(opts.gsea_jar),'## GSEA runner unable to find supplied gsea java desktop executable file %s' % opts.gsea_jar
-    if opts.input_ranks:
-        inpf = opts.input_ranks
-    else:
-        inpf = opts.input_tab
-        assert opts.idcol <> None, '## GSEA runner needs an id column if a tabular file provided'
-        assert opts.signcol <> None, '## GSEA runner needs a sign column if a tabular file provided'
-        assert opts.adjpvalcol <> None, '## GSEA runner needs an adjusted p value column if a tabular file provided'
-    assert os.path.isfile(inpf),'## GSEA runner unable to open supplied input file %s' % inpf
-    if opts.chip > '':
-        assert os.path.isfile(opts.chip),'## GSEA runner unable to open supplied chip file %s' % opts.chip
-    some = None
-    if opts.history_gmt <> None:
-        some = 1
-        assert os.path.isfile(opts.history_gmt),'## GSEA runner unable to open supplied history gene set matrix (.gmt) file %s' % opts.history_gmt
-    if opts.builtin_gmt <> None:
-        some = 1
-        assert os.path.isfile(opts.builtin_gmt),'## GSEA runner unable to open supplied history gene set matrix (.gmt) file %s' % opts.builtin_gmt
-    assert some, '## GSEA runner needs a gene set matrix file - none chosen?'
-    opts.title = re.sub('[^a-zA-Z0-9_]+', '', opts.title)
-    myName=os.path.split(sys.argv[0])[-1]
-    gse = gsea_wrapper(myName, opts=opts)
-    retcode = gse.run()
-    if retcode <> 0:
-        sys.exit(retcode) # indicate failure to job runner
-
-
--- a/rgedgeR/rgGSEAcolumns.xml	Wed Jun 12 02:58:43 2013 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,834 +0,0 @@
-<tool id="rgGSEAcolumns" name="Gene set enrichment" version="0.05">
-  <description>using a generic tabular file</description>
-  <requirements>
-      <requirement type="package" version="2.0.12">gsea_jar</requirement>
-      <requirement type="set_environment">GSEAJAR_PATH</requirement>
-  </requirements>
-  <command interpreter="python">rgGSEA.py --input_tab "$input1"  --adjpvalcol "$adjpvalcol" --signcol "$signcol"
-       --idcol "$idcol" --outhtml "$html_file" --input_name "${input1.name}"
-       --setMax "$setMax" --setMin "$setMin" --nPerm "$nPerm" --plotTop "$plotTop"
-       --gsea_jar "\$GSEAJAR_PATH"
-       --output_dir "$html_file.files_path" --mode "$mode" --title "$title"
-#if $makeTab.value=='Yes'
- --outtab_pos "$outtab_pos" --outtab_neg "$outtab_neg"
-#end if
-#if $gmtSource.refgmtSource == "indexed" or $gmtSource.refgmtSource == "both":
---builtin_gmt "${gmtSource.builtinGMT.fields.path}"
-#end if
-#if $gmtSource.refgmtSource == "history" or $gmtSource.refgmtSource == "both":
---history_gmt "${gmtSource.ownGMT}" --history_gmt_name "${gmtSource.ownGMT.name}"
-#end if
-#if $chipSource.refchipSource=="builtin"
- --chip "${chipSource.builtinChip.fields.path}"
-#end if
-#if $chipSource.refchipSource=="history"
- --chip "${chipSource.ownChip}"
-#end if
-</command>
-  <inputs>
-    <param name="input1"  type="data" format="tabular" label="Select a tab delimited file with a probe id, adjusted p value and a signed weight (eg t-test statistic) on each row"
-       help=""/>
-    <param name="adjpvalcol" label="Column containing a p value for the DEG statistical test"
-              help = "Use RAW p-values - not FDR adjusted as these have a non GSEA friendly non-uniform distribution"
-              type="data_column"  data_ref="input1" numerical="True"
-              multiple="false" use_header_names="true" size="5">
-        <validator type="no_options" message="Please select a p value column."/>
-    </param>
-    <param name="signcol" label="Column containing the DE sign - eg log fold change so positive/negative values = upregulated/downregulated in treatment"
-         type="data_column"  data_ref="input1" numerical="True"
-         multiple="false" use_header_names="true" size="5">
-        <validator type="no_options" message="Please select a sign column."/>
-    </param>
-    <param name="idcol" label="Column containing a gene id (refseq, symbol or Entrez"
-         type="data_column"  data_ref="input1" numerical="False"
-         multiple="false" use_header_names="true" size="5">
-        <validator type="no_options" message="Please select an id  column."/>
-    </param>
-
-    <param name="title" type="text" value="GSEA" size="80" label="Title for job outputs" help="Supply a meaningful name here to remind you what the outputs contain"/>
-    <param name="setMin" type="integer" label="Minimum gene set size to prune (default=15)" size="5" value="15"/>
-    <param name="setMax" type="integer" label="Maximum gene set size to prune (default=500)" size="5" value="500"/>
-    <param name="nPerm" type="integer" label="Number of permutations (default=1000)" size="7" value="1000"/>
-    <param name="plotTop" type="integer" label="Number of top gene sets to plot and present in detailed reports(default=20)" size="10" value="20"/>
-    <param name="mode" type="select" label="Mode for dealing with duplicated gene ids" >
-        <option value="Max_probe" selected="true">Use the most extreme value</option>
-        <option value="Median_of_probes">Use the median of all supplied values</option>
-    </param>
-    <param name="makeTab" type="select" label="Create a tabular report containing ALL gene sets for downstream analysis" >
-        <option value="Yes">Yes</option>
-        <option value="No" selected="true">No</option>
-    </param>
-    <conditional name="chipSource" >
-        <param name="refchipSource" type="select" label="Translate the rank file IDs (first column) using a GSEA 'chip' lookup table">
-        <option value="builtin">Use a builtin GSEA 'chip' file</option>
-        <option value="history">Use one of the GSEA 'chip' files in my history</option>
-        <option value="no" selected="true">IDs are already gene symbols - no translation needed  </option>
-        </param>
-        <when value="no">
-        </when>
-        <when value="builtin">
-           <param name="builtinChip" type="select" label="Select a GSEA 'chip' to map each probe id in the input file to a gene symbol">
-              <options from_data_table="gseaChip_3.1">
-              <filter type="sort_by" column="2" />
-              <validator type="no_options" message="No chip tables are available - please see the Galaxy GSEA tool documentation for instructions on installing them" />
-              </options>
-           </param>
-        </when>
-      <when value="history">
-        <param name="ownChip" type="data" format="gseachip" label="Select an existing gseachip filefrom your history. Probe_id matching ids in your input file and a gene_symbol must be the first 2 tabular columns. Header row required."
-           help='see docs at http://www.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats#CHIP:_Chip_file_format_.28.2A.chip.29'>
-            <validator type="no_options" message="Need a gseachip datatype - tabular file, probe_id gene_symbol as first 2 columns. Header row required. Convert datatypes (pencil icon) of existing history tabular files if necessary"/>
-        </param>
-      </when>
-    </conditional>
-    <conditional name="gmtSource">
-      <param name="refgmtSource" type="select" label="Use a gene set (.gmt) from your history or use a built-in (MSigDB etc) gene set">
-        <option value="indexed" selected="true">Use a built-in gene set</option>
-        <option value="history">Use a gene set from my history</option>
-        <option value="both">Add a gene set from my history to a built in gene set</option>
-      </param>
-      <when value="indexed">
-        <param name="builtinGMT" type="select" label="Select a gene set matrix (.gmt) file to use for the analysis">
-          <options from_data_table="gseaGMT_3.1">
-            <filter type="sort_by" column="2" />
-            <validator type="no_options" message="No GMT v3.1 files are available - please install them"/>
-          </options>
-        </param>
-      </when>
-      <when value="history">
-        <param name="ownGMT" type="data" format="gmt" label="Select a Gene Set from your history" />
-      </when>
-      <when value="both">
-        <param name="ownGMT" type="data" format="gseagmt" label="Select a Gene Set from your history" />
-        <param name="builtinGMT" type="select" label="Select a gene set matrix (.gmt) file to use for the analysis">
-          <options from_data_table="gseaGMT_3.1">
-            <filter type="sort_by" column="2" />
-            <validator type="no_options" message="No GMT v3.1 files are available - please install them and make sure they match [tool-data]/gseaGMT_3.1.loc, editing as needed"/>
-          </options>
-        </param>
-       </when>
-    </conditional>
-  </inputs>
-  <outputs>
-    <data format="html" name="html_file" label="${title}.html"/>
-    <data format="tabular" name="outtab_pos"  label="${title}_pos_allsets.xls">
-    <filter>makeTab=="Yes"</filter>
-    </data>
-    <data format="tabular" name="outtab_neg" label="${title}_neg_allsets.xls">
-    <filter>makeTab=="Yes"</filter>
-    </data>
-  </outputs>
-  <tests>
-  <test>
-      <param name="input1" value="gsea_test_DGE.xls"/>
-      <param name="ownGMT" value="gsea_test_DGE.xls"/>
-      <param name="adjpvalcol" value="5"/>
-      <param name="signcol" value="2"/>
-      <param name="idcol" value="1"/>
-      <param name="plotTop" value="2"/>
-      <param name="makeTab" value="No"/>
-      <param name="refchipSource" value="no"/>
-      <param name="refgmtSource" value="history"/>
-      <param name="html_file" value="gseatestout.html">
-         <assert_contents>
-              <has_text text="GSEA Report for Dataset" />
-              <has_text text="enrichment results in html"/>
-         </assert_contents>
-      </param>
-      <param name="setMax" value="500"/>
-      <param name="setMin" value="15"/>
-      <param name="nPerm" value="100"/>
-      <param name="output_dir" value="gseatestout"/>
-      <param name="mode" value="Max_probe"/>
-      <param name="title" value="GSEA test"/>
-      <param name="builtin_gmt" value="gseatestdata.gmt"/>
-  </test>
-  </tests>
-<help>
-**What it does**
-
-Performs Gene set enrichment analysis using GSEA_ from the Broad as described in 2005Paper_ and v3.1 of msigdb
-
-Refer to http://www.pnas.org/cgi/content/abstract/0506580102v1 for details of the method.
-
-**Input**
-
-A tabular file with a header row and some columns containing probe ids or gene names; a p value or t-test statistic reflecting the
-strength and a signed quantity such as log fold change or a t-test statistic reflecting the direction of differential gene expression from edgeR for counts
-or some other valid analysis.
-Probe names can be translated using the chip file option (see below)
-A filter has been added to remove NA and null id rows as these can't be used. A summary will appear in stdout in the page linked from the "i" (info) button for the output
-
-**Gene sets**
-
-If you have your own gene sets as .gmt files (see GUIDE_ for details of format) upload them in your history and use them by nominating a history sourced gmt.
-
-Alternatively, use one of the built in MSigDB_ (molecular signatures database) sets.
-There may be additional locally provided sets such as starBase
-
-Optionally, a local gmt can be merged with a built in one if you want to see how they perform competitively
-
-Select the most appropriate subset of MSigDB gene sets or use all by choosing msigdb.v3.0.symbols.
-A non-cancer subset called c2_c3_c5_all.v3.0.symbols is recommended if you are working with non-cancer biology.
-
-
-**Chip**
-
-If your input file contains HUGO gene symbols (TEK, SLC38A1...etc) then you do not need to specify a 'chip'.
-
-GSEA software allows you to specify a translation table from common microarray probe IDs to gene symbols - select the option to use a chip file and choose the one that matches your probe id's
-Failure to match the translation correctly will generally result in failure of the job since none of your input data will be translated properly.
-
-**Output**
-
-A report as html - see the GUIDE_ for a guide on interpretation
-
-
-**Attribution**
-
-This Galaxy tool wrapper was written by Ross Lazarus.
-It executes the GSEA_ java application available using MSigDB and Chip files obtained from the GSEA_ web site
-
-
-**Chip ID samples**
-
-You may be able to look for probe ids in similar form to the ones you have in the list below to find the correct GSEA 'chip' file if you do not have HUGO gene symbols
-
-    ::
-
-        AG.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        11986_at        ---     calcium-dependent lipid-binding protein, putative
-        11987_at        ---     zinc finger (C3HC4-type RING finger) family protein
-        11988_at        ---     WD-40 repeat family protein
-
-        Agilent_Human1A.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        A_23_P100001    LOC400451       hypothetical gene supported by AK075564; BC060873
-        A_23_P100011    AP3S2   adaptor-related protein complex 3, sigma 2 subunit
-        A_23_P10002     CEP63   centrosome protein Cep63
-
-        Agilent_Human1Av2.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        A_23_P100001    LOC400451       hypothetical gene supported by AK075564; BC060873
-        A_23_P100011    AP3S2   adaptor-related protein complex 3, sigma 2 subunit
-        A_23_P10002     CEP63   centrosome protein Cep63
-
-        Agilent_Human1B.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        A_32_P10002     TUBA2   tubulin, alpha 2
-        A_32_P100076    RPL19   ribosomal protein L19
-        A_32_P100100    THOC3   THO complex 3
-
-        Agilent_Human1_cDNA.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        1       RNASEH2A        ribonuclease H2, large subunit
-        10      SF3B1   splicing factor 3b, subunit 1, 155kDa
-        100     S100A16 S100 calcium binding protein A16
-
-        Agilent_HumanGenome.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        A_23_P100001    LOC400451       hypothetical gene supported by AK075564; BC060873
-        A_23_P100011    NULL    NULL
-        A_23_P100022    SV2B    synaptic vesicle glycoprotein 2B
-
-        Agilent_Mouse_cDNA.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        1000982 SLC6A9  solute carrier family 6 (neurotransmitter transporter, glycine), member 9
-        1001007 IGF1    insulin-like growth factor 1
-        1001011 FCNA    ficolin A
-
-        Agilent_MouseDev.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        A_65_P00523     NULL    NULL
-        A_65_P00524     NULL    NULL
-        A_65_P00525     SFTPD   surfactant associated protein D
-
-        Agilent_MouseGenome.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        A_51_P100021    NULL    NULL
-        A_51_P100034    2310075G12RIK   RIKEN cDNA 2310075G12 gene
-        A_51_P100052    SLITRK2 SLIT and NTRK-like family, member 2
-
-        Agilent_MouseOligo.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        A_51_P100021    NULL    NULL
-        A_51_P100034    2310075G12RIK   RIKEN cDNA 2310075G12 gene
-        A_51_P100052    SLITRK2 SLIT and NTRK-like family, member 2
-
-        Agilent_RatGenome_G4131A.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        A_42_P453055    LU      Lutheran blood group (Auberger b antigen included)
-        A_42_P453737    NULL    NULL
-        A_42_P453894    RGD1306682_PREDICTED    similar to RIKEN cDNA 1810046J19 (predicted)
-
-        Agilent_RatOligo.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        A_42_P453131    CNTN1   contactin 1
-        A_42_P453151    TTC8_PREDICTED  tetratricopeptide repeat domain 8 (predicted)
-        A_42_P453162    NULL    NULL
-
-        APPLERA_ABI1700.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        100002  NULL    NULL
-        100003  NULL    NULL
-        100027  ZNF192  zinc finger protein 192
-
-        ATH1_121501.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        244901_at       ---     ---
-        244902_at       ---     ---
-        244903_at       ---     ---
-
-        AtlasMouse1.2.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        MA099   HTR1A   5-hydroxytryptamine (serotonin) receptor 1A
-        MA015   JARID2  jumonji, AT rich interactive domain 2
-        M443    CDKN2C  cyclin-dependent kinase inhibitor 2C (p18, inhibits CDK4)
-
-        AtlasRat1.2.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        R484    TPP2    tripeptidylpeptidase II
-        RG348   MOS     v-mos moloney murine sarcoma viral oncogene homolog
-        RG119   KCNJ1   potassium inwardly-rectifying channel, subfamily J, member 1
-
-        Bovine.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        AFFX-BioB-3_at  ---     ---
-        AFFX-BioB-5_at  ---     ---
-        AFFX-BioB-M_at  ---     ---
-
-        Caltech16KMouse.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        6712    CCL6    chemokine (C-C motif) ligand 6
-        8563    ACTL7B  actin-like 7b
-        8434    1700049M11RIK   RIKEN cDNA 1700049M11 gene
-
-        Caltech16KOligoMouse.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        6712    GCA     grancalcin, EF-hand calcium binding protein
-        8563    MARVELD2        MARVEL (membrane-associating) domain containing 2
-        8434    BC048825        NULL
-
-        Canine_2.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        AFFX-BioB-3_at  ---     ---
-        AFFX-BioB-5_at  ---     ---
-        AFFX-BioB-M_at  ---     ---
-
-        Clontech_Atlas_13K.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        1-A01   NULL    NULL
-        1-A02   A2M     alpha-2-macroglobulin
-        1-A03   AADAC   arylacetamide deacetylase (esterase)
-
-        Clontech_BD_Atlas.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        A01a    PCP4    Purkinje cell protein 4
-        A01b    NULL    NULL
-        A01c    PMP22   peripheral myelin protein
-
-        CNMCMuscleChip.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        HSPD4570_at     MBP     myelin basic protein
-        HSPD7328_at     MAPK12  mitogen-activated protein kinase 12
-        HSPD17028_i_at  DIA1    diaphorase (NADH) (cytochrome b-5 reductase)
-
-        CodeLink_Human_Whole_Genome.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        GE469530        NULL    NULL
-        GE469548        TMEM1   transmembrane protein 1
-        GE469549        NULL    NULL
-
-        CodeLink_UniSet_Human_20K_I_Bioarray.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        000106CB1_PROBE1        NDUFA4  NADH dehydrogenase (ubiquinone) 1 alpha subcomplex, 4, 9kDa
-        000136CB1_PROBE1        MRPS33  mitochondrial ribosomal protein S33
-        000278CB1_PROBE1        FDFT1   farnesyl-diphosphate farnesyltransferase 1
-
-        CodeLink_UniSet_Human_I_Bioarray.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        AA001334_PROBE1 ZHX2    zinc fingers and homeoboxes 2
-        AA002191_PROBE1 PITPNA  phosphatidylinositol transfer protein, alpha
-        AA004381_PROBE1 NULL    NULL
-
-        CodeLink_UniSet_Human_II_Bioarray.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        000106CB1_PROBE1        NDUFA4  NADH dehydrogenase (ubiquinone) 1 alpha subcomplex, 4, 9kDa
-        000136CB1_PROBE1        MRPS33  mitochondrial ribosomal protein S33
-        000278CB1_PROBE1        FDFT1   farnesyl-diphosphate farnesyltransferase 1
-
-        CodeLink_UniSet_Rat_I_Bioarray.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        AA799301_PROBE1 NULL    NULL
-        AA799313_PROBE1 RGD:1303279     sialyltransferase 10 (alpha-2,3-sialyltransferase VI)
-        AA799329_PROBE1 NULL    NULL
-
-        DrosGenome1.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        141200_at       CG9418  CG9418
-        141201_at       CG17840 CG17840
-        141202_at       QKR58E-3        quaking related 58E-3
-
-        Drosophila_2.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        1616608_a_at    GPDH    Glycerol 3 phosphate dehydrogenase
-        1622892_s_at    CG33057 /// MKG-P       CG33057 /// monkey king protein
-        1622893_at      IM3 /// RPS10B  Immune induced molecule 3 /// Ribosomal protein S10b
-
-        G4110A.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        A_23_P7033      MAGEF1  melanoma antigen, family F, 1
-        A_23_P170713    A_23_P170713    ***description missing***
-        A_23_P107801    FLJ21742        hypothetical protein FLJ21742
-
-        G4110Av2.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        A_23_P7033      MAGEF1  melanoma antigen, family F, 1
-        A_23_P170713    A_23_P170713    No description available
-        A_23_P107801    FLJ21742        hypothetical protein FLJ21742
-
-        GENE_SYMBOL.chip
-        Probe Set ID    Gene Symbol     Gene Title      Aliases
-        A1BG    A1BG    alpha-1-B glycoprotein  GABDKFZP686F0970ABGHYST2477A1B
-        A2M     A2M     alpha-2-macroglobulin   DKFZP779B086S863-7ALPHA 2MFWP007CPAMD5
-        A2ML1   A2ML1   alpha-2-macroglobulin-like 1    FLJ39129DKFZP686O1010FLJ41598DKFZP686G1812FLJ25179CPAMD9DKFZP686D2011FLJ16045FLJ41607DKFZP686C1729FLJ41597DKFZP686L1821
-
-        GenosysCytokineV2.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        M05.2   GDF11   growth differentiation factor 11
-        M22.2   AE000170        No description available
-        E06.3   IL5     interleukin 5 (colony-stimulating factor, eosinophil)
-
-        HC_G110.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        1000_at MAPK3   mitogen-activated protein kinase 3
-        1001_at TIE1    tyrosine kinase with immunoglobulin-like and EGF-like domains 1
-        1002_f_at       CYP2C19 cytochrome P450, family 2, subfamily C, polypeptide 19
-
-        HG_Focus.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        1007_s_at       DDR1    discoidin domain receptor family, member 1
-        1053_at RFC2    replication factor C (activator 1) 2, 40kDa
-        117_at  HSPA6 /// LOC652878     heat shock 70kDa protein 6 (HSP70B') /// similar to heat shock 70kDa protein 6 (HSP70B)
-
-        HG_U133A_2.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        1007_s_at       DDR1    discoidin domain receptor family, member 1
-        1053_at RFC2    replication factor C (activator 1) 2, 40kDa
-        117_at  HSPA6 /// LOC652878     heat shock 70kDa protein 6 (HSP70B') /// similar to heat shock 70kDa protein 6 (HSP70B)
-
-        HG_U133AAofAv2.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        1007_s_at       DDR1    discoidin domain receptor family, member 1
-        1053_at RFC2    replication factor C (activator 1) 2, 40kDa
-        117_at  HSPA6 /// LOC652878     heat shock 70kDa protein 6 (HSP70B') /// similar to heat shock 70kDa protein 6 (HSP70B)
-
-        HG_U133A.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        1007_s_at       DDR1    discoidin domain receptor family, member 1
-        1053_at RFC2    replication factor C (activator 1) 2, 40kDa
-        117_at  HSPA6 /// LOC652878     heat shock 70kDa protein 6 (HSP70B') /// similar to heat shock 70kDa protein 6 (HSP70B)
-
-        HG_U133B.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        200000_s_at     PRPF8   PRP8 pre-mRNA processing factor 8 homolog (S. cerevisiae) /// PRP8 pre-mRNA processing factor 8 homolog (S. cerevisiae)
-        200001_at       CAPNS1  calpain, small subunit 1 /// calpain, small subunit 1
-        200002_at       RPL35   ribosomal protein L35 /// ribosomal protein L35
-
-        HG_U133_Plus_2.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        1007_s_at       DDR1    discoidin domain receptor family, member 1
-        1053_at RFC2    replication factor C (activator 1) 2, 40kDa
-        117_at  HSPA6 /// LOC652878     heat shock 70kDa protein 6 (HSP70B') /// similar to heat shock 70kDa protein 6 (HSP70B)
-
-        HG_U95Av2.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        1000_at MAPK3   mitogen-activated protein kinase 3
-        1001_at TIE1    tyrosine kinase with immunoglobulin-like and EGF-like domains 1
-        1002_f_at       CYP2C19 cytochrome P450, family 2, subfamily C, polypeptide 19
-
-        HG_U95B.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        41880_at        ---     Transcribed locus
-        41881_at        FLJ40142        FLJ40142 protein
-        41882_at        FBXO42  F-box protein 42
-
-        HG_U95C.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        48609_r_at      KIAA1333        KIAA1333
-        48610_at        C20ORF102       chromosome 20 open reading frame 102
-        48612_at        N4BP1 /// LOC653213     Nedd4 binding protein 1 /// similar to Nedd4 binding protein 1
-
-        HG_U95D.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        67020_at        ---     Transcribed locus
-        67021_at        FAM44B  Family with sequence similarity 44, member B
-        67023_at        ---     Transcribed locus
-
-        HG_U95E.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        67022_at        PIP5K2B Phosphatidylinositol-4-phosphate 5-kinase, type II, beta
-        67024_at        CASD1   CAS1 domain containing 1
-        67026_at        NFASC   neurofascin homolog (chicken)
-
-        HPCGGCompugenAnnotations.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        MGI:1913066     SH3MD2  Sh3md2, SH3 multiple domains 2
-        MGI:1913099     5430413I02RIK   5430413I02Rik, RIKEN cDNA 5430413I02 gene
-        MGI:98848       TSHB    Tshb, thyroid stimulating hormone, beta subunit
-
-        HT_HG_U133A.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        1007_s_at       DDR1    discoidin domain receptor family, member 1
-        1053_at RFC2    replication factor C (activator 1) 2, 40kDa
-        117_at  HSPA6 /// LOC652878     heat shock 70kDa protein 6 (HSP70B') /// similar to heat shock 70kDa protein 6 (HSP70B)
-
-        Hu35KsubA.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        AA000993_at     PRDM8   PR domain containing 8
-        AA001296_s_at   PHF23   PHD finger protein 23
-        AA002245_at     ZC3H11A zinc finger CCCH-type containing 11A
-
-        Hu35KsubB.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        AFFX-BioB-3_at  ---     ---
-        AFFX-BioB-3_st  ---     ---
-        AFFX-BioB-5_at  ---     ---
-
-        Hu35KsubC.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        AFFX-BioB-3_at  ---     ---
-        AFFX-BioB-3_st  ---     ---
-        AFFX-BioB-5_at  ---     ---
-
-        Hu35KsubD.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        AFFX-BioB-3_at  ---     ---
-        AFFX-BioB-3_st  ---     ---
-        AFFX-BioB-5_at  ---     ---
-
-        Hu6800.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        A28102_at       GABRA3  gamma-aminobutyric acid (GABA) A receptor, alpha 3
-        AB000114_at     OMD     osteomodulin
-        AB000115_at     IFI44L  interferon-induced protein 44-like
-
-        Illumina_Human.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        1-A-1   BCAP31  B-cell receptor-associated protein 31
-        1-A-10  NSFL1C  NSFL1 (p97) cofactor (p47)
-        1-A-11  C20ORF24        chromosome 20 open reading frame 24
-
-        ilmn_HumanHT_12_V4_0_R1_15002873_B.chip
-        Probe Set ID    Gene Symbol     Gene Title      Entrez Gene ID
-        ILMN_1725881    LOC23117        PREDICTED: Homo sapiens KIAA0220-like protein, transcript variant 11 (LOC23117), mRNA.  23117
-        ILMN_1910180            Homo sapiens cDNA: FLJ21027 fis, clone CAE07110 -999
-        ILMN_1804174    FCGR2B  PREDICTED: Homo sapiens Fc fragment of IgG, low affinity IIb, receptor (CD32) (FCGR2B), mRNA.   2213
-
-        ilmn_MouseRef_8_V2_0_R3_11278551_A.chip
-        Probe Set ID    Gene Symbol     Gene Title      Entrez Gene ID
-        ILMN_2607609    YIF1B   Mus musculus Yip1 interacting factor homolog B (S. cerevisiae) (Yif1b), mRNA.   77254
-        ILMN_1238674    2700007P21RIK   Mus musculus RIKEN cDNA 2700007P21 gene (2700007P21Rik), transcript variant 2, mRNA.    212772
-        ILMN_3062534    2700007P21RIK   Mus musculus RIKEN cDNA 2700007P21 gene (2700007P21Rik), transcript variant 2, mRNA.    212772
-
-        ilmn_MouseWG_6_V2_0_R3_11278593_A.chip
-        Probe Set ID    Gene Symbol     Gene Title      Entrez Gene ID
-        ILMN_1243094    THRSP   NULL    -999
-        ILMN_1238674    2700007P21RIK   Mus musculus RIKEN cDNA 2700007P21 gene (2700007P21Rik), transcript variant 2, mRNA.    212772
-        ILMN_2454720    2700007P21RIK   Mus musculus RIKEN cDNA 2700007P21 gene (2700007P21Rik), transcript variant 2, mRNA.    212772
-
-        labonweb_human.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        1-A1    GAPD    glyceraldehyde-3-phosphate dehydrogenase
-        1-A10   BMP5    bone morphogenetic protein 5
-        1-A11   C9ORF78 chromosome 9 open reading frame 78
-
-        lymphochip.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        1317148 TNF     tumor necrosis factor (TNF superfamily, member 2)
-        814251  SLAMF1  signaling lymphocytic activation molecule family member 1
-        712899___1      AA282233        zt12d02.s1 NCI_CGAP_GCB1 Homo sapiens cDNA clone
-
-        MG_U74Av2.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        100001_at       CD3G    CD3 antigen, gamma polypeptide
-        100002_at       ITIH3   inter-alpha trypsin inhibitor, heavy chain 3
-        100003_at       RYR1    ryanodine receptor 1, skeletal muscle
-
-        MG_U74Bv2.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        104769_at       LIMA1   LIM domain and actin binding 1
-        104770_at       ---     16 days neonate heart cDNA, RIKEN full-length enriched library, clone:D830033B01 product:unclassifiable, full insert sequence
-        104771_at       RAC2    RAS-related C3 botulinum substrate 2
-
-        MG_U74Cv2.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        128520_at       CCDC97  coiled-coil domain containing 97
-        128523_at       A730063M14RIK   RIKEN cDNA A730063M14 gene
-        128529_at       A930009G19RIK   RIKEN cDNA A930009G19 gene
-
-        MOE430A.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        1415670_at      COPG    coatomer protein complex, subunit gamma
-        1415671_at      ATP6V0D1        ATPase, H+ transporting, lysosomal V0 subunit D1
-        1415672_at      GOLGA7  golgi autoantigen, golgin subfamily a, 7
-
-        MOE430B.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        1415670_at      COPG    coatomer protein complex, subunit gamma
-        1415671_at      ATP6V0D1        ATPase, H+ transporting, lysosomal V0 subunit D1
-        1415672_at      GOLGA7  golgi autoantigen, golgin subfamily a, 7
-
-        Mouse430_2.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        1415670_at      COPG    coatomer protein complex, subunit gamma
-        1415671_at      ATP6V0D1        ATPase, H+ transporting, lysosomal V0 subunit D1
-        1415672_at      GOLGA7  golgi autoantigen, golgin subfamily a, 7
-
-        Mouse430A_2.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        1415670_at      COPG    coatomer protein complex, subunit gamma
-        1415671_at      ATP6V0D1        ATPase, H+ transporting, lysosomal V0 subunit D1
-        1415672_at      GOLGA7  golgi autoantigen, golgin subfamily a, 7
-
-        Mu11KsubA.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        aa000148_s_at   NIP7    nuclear import 7 homolog (S. cerevisiae)
-        AA000151_at     1200011I18RIK   RIKEN cDNA 1200011I18 gene
-        aa000380_s_at   TARDBP  TAR DNA binding protein
-
-        Mu11KsubB.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        AFFX-18SRNAMur/X00686_3_at      ---     ---
-        AFFX-18SRNAMur/X00686_5_at      ---     ---
-        AFFX-18SRNAMur/X00686_M_at      ---     ---
-
-        Mu19KsubA.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        AFFX-18SRNAMur/X00686_3_at      ---     ---
-        AFFX-18SRNAMur/X00686_5_at      ---     ---
-        AFFX-18SRNAMur/X00686_M_at      ---     ---
-
-        Mu19KsubB.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        AFFX-18SRNAMur/X00686_3_at      ---     ---
-        AFFX-18SRNAMur/X00686_5_at      ---     ---
-        AFFX-18SRNAMur/X00686_M_at      ---     ---
-
-        Mu19KsubC.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        AFFX-18SRNAMur/X00686_3_at      ---     ---
-        AFFX-18SRNAMur/X00686_5_at      ---     ---
-        AFFX-18SRNAMur/X00686_M_at      ---     ---
-
-        MWG_Human_30K_A.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        1-A1    IFNA8   interferon, alpha 8
-        1-A10   AGL     amylo-1, 6-glucosidase, 4-alpha-glucanotransferase (glycogen debranching enzyme, glycogen storage disease type III)
-        1-A11   NULL    NULL
-
-        MWG_Human_30K_B.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        1-A10   ADAM11  a disintegrin and metalloproteinase domain 11
-        1-A11   NULL    NULL
-        1-A12   MAST3   microtubule associated serine/threonine kinase 3
-
-        Netherland_cancer_institute_operon_human_35k.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        H200001046      ZZZ3    zinc finger, ZZ domain containing 3 [Homo sapiens]. [Source:RefSeq;Acc:NM_015534]
-        H200011098      ZZEF1   zinc finger, ZZ-type with EF hand domain 1 [Homo sapiens]. [Source:RefSeq;Acc:NM_015113]
-        H200006223      ZYX     Zyxin (Zyxin 2). [Source:Uniprot/SWISSPROT;Acc:Q15942]
-
-        Netherland_cancer_institute_operon_mouse_FOOk.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        M300012631      ZZEF1   NULL
-        M300012632      ZZEF1   NULL
-        M200002019      ZYX     Zyxin. [Source:Uniprot/SWISSPROT;Acc:Q62523]
-
-        NIA15k.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        H3051B03        BG067146        H3051B03-3 NIA Mouse 15K cDNA Clone Set Mus musculus cDNA
-        H3067H02        BG068648        H3067H02-3 NIA Mouse 15K cDNA Clone Set Mus musculus cDNA
-        H3121E03        AK029878        NULL
-
-        OPERON_HUMANv2.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        1-A01   NULL    NULL
-        1-A02   NULL    NULL
-        1-A03   NAT2    N-acetyltransferase 2 (arylamine N-acetyltransferase)
-
-        OPERON_HUMANv3.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        1-A03   TSPAN6  tetraspanin 6
-        1-A04   AK2     adenylate kinase 2
-        1-A05   NULL    NULL
-
-        RAE230A.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        1367452_at      SUMO2   SMT3 suppressor of mif two 3 homolog 2 (yeast)
-        1367453_at      CDC37   cell division cycle 37 homolog (S. cerevisiae)
-        1367454_at      COPB2   coatomer protein complex, subunit beta 2 (beta prime)
-
-        RAE230B.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        1367452_at      SUMO2   SMT3 suppressor of mif two 3 homolog 2 (yeast)
-        1367453_at      CDC37   cell division cycle 37 homolog (S. cerevisiae)
-        1367454_at      COPB2   coatomer protein complex, subunit beta 2 (beta prime)
-
-        Rat230_2.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        1367452_at      SUMO2   SMT3 suppressor of mif two 3 homolog 2 (yeast)
-        1367453_at      CDC37   cell division cycle 37 homolog (S. cerevisiae)
-        1367454_at      COPB2   coatomer protein complex, subunit beta 2 (beta prime)
-
-        RefSeq_human.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        NM_030625       CXXC6   CXXC finger 6
-        NM_181776       SLC36A2 solute carrier family 36 (proton/amino acid symporter), member 2
-        NM_152616       TRIM42  tripartite motif-containing 42
-
-        RefSeq_NP_Human.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        NP_000005       A2M     alpha-2-macroglobulin
-        NP_000006       NAT2    N-acetyltransferase 2 (arylamine N-acetyltransferase)
-        NP_000007       ACADM   acyl-Coenzyme A dehydrogenase, C-4 to C-12 straight chain
-
-        RefSeq_NP_Mouse.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        AAF62769        NULL    NULL
-        AAF62770        NULL    NULL
-        AAF62771        NULL    NULL
-
-        RefSeq_NP_Rat.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        NP_007225       NULL    NULL
-        NP_007226       NULL    NULL
-        NP_007227       NULL    NULL
-
-        Research_Genetics.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        1-a-1   NULL    NULL
-        1-a-10  WNT2    wingless-type MMTV integration site family member 2
-        1-a-11  VHL     von Hippel-Lindau tumor suppressor
-
-        RG_U34A.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        A01157cds_s_at  LIPF    lipase, gastric
-        A03913cds_s_at  SERPINE2        serine (or cysteine) proteinase inhibitor, clade E, member 2
-        A04674cds_s_at  UCP1    uncoupling protein 1 (mitochondrial, proton carrier)
-
-        RG_U34B.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        AFFX-BioB-3_at  ---     ---
-        AFFX-BioB-3_st  ---     ---
-        AFFX-BioB-5_at  ---     ---
-
-        RG_U34C.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        AA012646_i_at   ---     Transcribed locus
-        AA012646_r_at   ---     Transcribed locus
-        AA012654_at     ---     Transcribed locus
-
-        RN_U34.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        A03913cds_s_at  SERPINE2        serine (or cysteine) proteinase inhibitor, clade E, member 2
-        A17753cds_s_at  DRD3    dopamine receptor D3
-        AA848563_s_at   HSPA1A /// HSPA1B_MAPPED        heat shock 70kD protein 1A /// heat shock 70kD protein 1B (mapped)
-
-        Rosetta50K.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        1360465 RANBP9  RAN binding protein 9
-        1176829 LOC375449       similar to microtubule associated testis specific serine/threonine protein kinase
-        1351074 NEDD4L  neural precursor cell expressed, developmentally down-regulated 4-like
-
-        Rosetta.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        NM_000504       F10     coagulation factor X
-        Contig32955_RC  ARL6IP6 ADP-ribosylation-like factor 6 interacting protein 6
-        AK000455        MGC16733        hypothetical gene MGC16733 similar to CG12113
-
-        RT_U34.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        AA108277_at     HSPH1   heat shock 105kDa/110kDa protein 1
-        AA108308_i_at   MDM2_PREDICTED  Transformed mouse 3T3 cell double minute 2 homolog (mouse) (predicted)
-        AA108308_s_at   ---     ---
-
-        RZPD_Human_Ensembl1.1.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        RZPDp203A011001D        SARDH   sarcosine dehydrogenase
-        RZPDp203A011002D        ARHGAP22        Rho GTPase activating protein 22
-        RZPDp203A011003D        CNGA3   cyclic nucleotide gated channel alpha 3
-
-        RZPD_Human_ORF_Clones_Gateway.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        RZPDo834A0110D - Gateway (closed)       PTD015  PTD015 protein
-        RZPDo834A0114D - Gateway (closed)       TRIB3   tribbles homolog 3 (Drosophila)
-        RZPDo834A0116D - Gateway (closed)       ORM2    orosomucoid 2
-
-        RZPD_Human_Unigene3.1.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        HU3_p983A011001D        SARDH   sarcosine dehydrogenase
-        HU3_p983A011002D        ARHGAP22        Rho GTPase activating protein 22
-        HU3_p983A011003D        CNGA3   cyclic nucleotide gated channel alpha 3
-
-        Seq_Accession.chip
-        Probe Set ID    Gene Symbol
-        AA017197        C21ORF36
-        AA191116        MTVR2
-        AA280701        CXYORF7
-
-        Stanford.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        IMAGE:703849    DDB2    damage-specific DNA binding protein 2, 48kDa
-        IMAGE:1301778   ZFY     zinc finger protein, Y-linked
-        IMAGE:795810    HS.99503HS.520681       Homo sapiens transcribed sequenceHomo sapiens, clone IMAGE:4823270, mRNA
-
-        Stanford_Source_Accessions.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        AI848107        0610010K14RIK   RIKEN cDNA 0610010K14 gene
-        AK002491        0610010K14RIK   RIKEN cDNA 0610010K14 gene
-        AK003842        0610010K14RIK   RIKEN cDNA 0610010K14 gene
-
-        TIGR_31K_Human_Set.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        1-1     NULL    NULL
-        1-10    WNT2    wingless-type MMTV integration site family member 2
-        1-11    VHL     von Hippel-Lindau tumor suppressor
-
-        TIGR_40K_Human_Set.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        10      NULL    NULL
-        100     TEX27   testis expressed sequence 27
-        1000    HOXA1   homeo box A1
-
-        U133_X3P.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        1053_3p_at      RFC2    replication factor C (activator 1) 2, 40kDa
-        117_3p_at       HSPA6 /// LOC652878     heat shock 70kDa protein 6 (HSP70B') /// similar to heat shock 70kDa protein 6 (HSP70B)
-        1494_3p_f_at    CYP2A6  cytochrome P450, family 2, subfamily A, polypeptide 6
-
-        UCLA_NIH_33K.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        1020181 NULL    NULL
-        1020315 VAV2    vav 2 oncogene
-        1020478 AP1GBP1 AP1 gamma subunit binding protein 1
-
-        Zebrafish.chip
-        Probe Set ID    Gene Symbol     Gene Title
-        AFFX-BioB-3_at  ---     ---
-        AFFX-BioB-5_at  ---     ---
-        AFFX-BioB-M_at  ---     ---
-
-
- .. _LGPL: http://www.gnu.org/copyleft/lesser.html
- .. _GSEA: http://www.broadinstitute.org/gsea
- .. _GUIDE: http://www.broadinstitute.org/gsea/doc/GSEAUserGuideFrame.html?_Interpreting_GSEA_Results
- .. _MSigDB: http://www.broadinstitute.org/gsea/msigdb/index.jsp
- .. _2005Paper: http://www.pnas.org/content/102/43/15545.full
-
-</help>
-
-</tool>
-
-
--- a/rgedgeR/rgedgeR.xml	Wed Jun 12 02:58:43 2013 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,504 +0,0 @@
-<tool id="rgedgeR" name="edgeR" version="0.18">
-  <description>digital DGE between two groups of replicates</description>
-  <command interpreter="python">
-     rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "edgeR"
-    --output_dir "$html_file.files_path" --output_html "$html_file" --output_tab "$outtab" --make_HTML "yes"
-  </command>
-  <inputs>
-    <param name="input1"  type="data" format="tabular" label="Select an input matrix - rows are contigs, columns are counts for each sample"
-       help="Use the HTSeq based count matrix preparation tool to create these count matrices from BAM files and a GTF file"/>
-    <param name="title" type="text" value="DGE" size="80" label="Title for job outputs" help="Supply a meaningful name here to remind you what the outputs contain">
-      <sanitizer invalid_char="">
-        <valid initial="string.letters,string.digits"><add value="_" /> </valid>
-      </sanitizer>
-    </param>
-    <param name="treatment_name" type="text" value="Treatment" size="50" label="Treatment Name"/>
-    <param name="Treat_cols" label="Select columns containing treatment." type="data_column" data_ref="input1" numerical="True"
-         multiple="true" use_header_names="true" size="120" display="checkboxes">
-        <validator type="no_options" message="Please select at least one column."/>
-    </param>
-    <param name="control_name" type="text" value="Control" size="50" label="Control Name"/>
-    <param name="Control_cols" label="Select columns containing control." type="data_column" data_ref="input1" numerical="True"
-         multiple="true" use_header_names="true" size="120" display="checkboxes" optional="true">
-    </param>
-    <param name="fQ" type="float" value="0.3" size="5" label="Non-differential contig count quantile threshold - zero to analyze all non-zero read count contigs"
-     help="May be a good or a bad idea depending on the biology and the question. EG 0.3 = sparsest 30% of contigs with at least one read are removed before analysis"/>
-    <param name="useQuantile" type="boolean" truevalue="T" checked='false' falsevalue="" size="1" label="Non differential filter - remove contigs below a threshold (1 per million) for half or more samples"
-     help="May be a good or a bad idea depending on the biology and the question. This was the old default. Quantile based is available as an alternative"/>
-    <param name="priorn" type="integer" value="4" size="3" label="prior.df for tagwise dispersion - lower value = more emphasis on each tag's variance - note this used to be prior.n"
-     help="Zero = auto-estimate. 1 to force high variance tags out. Use a small value to 'smooth' small samples. See edgeR docs and note below"/>
-    <param name="fdrthresh" type="float" value="0.05" size="5" label="P value threshold for FDR filtering for amily wise error rate control"
-     help="Conventional default value of 0.05 recommended"/>
-    <param name="fdrtype" type="select" label="FDR (Type II error) control method"
-         help="Use fdr or bh typically to control for the number of tests in a reliable way">
-            <option value="fdr" selected="true">fdr</option>
-            <option value="BH">Benjamini Hochberg</option>
-            <option value="BY">Benjamini Yukateli</option>
-            <option value="bonferroni">Bonferroni</option>
-            <option value="hochberg">Hochberg</option>
-            <option value="holm">Holm</option>
-            <option value="hommel">Hommel</option>
-            <option value="none">no control for multiple tests</option>
-    </param>
-  </inputs>
-  <outputs>
-    <data format="tabular" name="outtab" label="${title}.xls"/>
-    <data format="html" name="html_file" label="${title}.html"/>
-    <data format="gsearank" name="outgsea" label="${title}.gsearank">
-        <filter> makeRank == 'Yes' </filter>
-    </data>
-  </outputs>
-<configfiles>
-<configfile name="runme">
-
-# edgeR.Rscript
-# updated npv 2011 for R 2.14.0 and edgeR 2.4.0 by ross
-# Performs DGE on a count table containing n replicates of two conditions
-#
-# Parameters
-#
-# 1 - Output Dir
-
-# Original edgeR code by: S.Lunke and A.Kaspi
-sink(stdout(),append=T,type="message")
-reallybig = log10(.Machine\$double.xmax)
-reallysmall = log10(.Machine\$double.xmin)
-require('stringr')
-require('gplots')
-library('ggplot2')
-library('gridExtra')
-
-hmap2 = function(cmat,nsamp=100,outpdfname='heatmap2.pdf', TName='Treatment',group=NA,myTitle='title goes here')
-{
-# Perform clustering for significant pvalues after controlling FWER
-    samples = colnames(cmat)
-    gu = unique(group)
-    if (length(gu) == 2) {
-        col.map = function(g) {if (g==gu[1]) "#FF0000" else "#0000FF"}
-        pcols = unlist(lapply(group,col.map))
-        } else {
-        colours = rainbow(length(gu),start=0,end=4/6)
-        pcols = colours[match(group,gu)]        }
-    print(paste('pcols',pcols))
-    gn = rownames(cmat)
-    dm = cmat[(! is.na(gn)),]
-    # remove unlabelled hm rows
-    nprobes = nrow(dm)
-    # sub = paste('Showing',nprobes,'contigs ranked for evidence of differential abundance')
-    if (nprobes &gt; nsamp) {
-      dm =dm[1:nsamp,]
-      #sub = paste('Showing',nsamp,'contigs ranked for evidence for differential abundance out of',nprobes,'total')
-    }
-    newcolnames = substr(colnames(dm),1,20)
-    colnames(dm) = newcolnames
-    pdf(outpdfname)
-    heatmap.2(dm,main=myTitle,ColSideColors=pcols,col=topo.colors(100),dendrogram="col",key=T,density.info='none',
-         Rowv=F,scale='row',trace='none',margins=c(8,8),cexRow=0.4,cexCol=0.5)
-    dev.off()
-}
-
-hmap = function(cmat,nmeans=4,outpdfname="heatMap.pdf",nsamp=250,TName='Treatment',group=NA,myTitle="Title goes here")
-{
-    # for 2 groups only was
-    #col.map = function(g) {if (g==TName) "#FF0000" else "#0000FF"}
-    #pcols = unlist(lapply(group,col.map))
-    gu = unique(group)
-    colours = rainbow(length(gu),start=0.3,end=0.6)
-    pcols = colours[match(group,gu)]
-    nrows = nrow(cmat)
-    mtitle = paste(myTitle,'Heatmap: n contigs =',nrows)
-    if (nrows &gt; nsamp)  {
-               cmat = cmat[c(1:nsamp),]
-               mtitle = paste('Heatmap: Top ',nsamp,' DE contigs (of ',nrows,')',sep='')
-          }
-    newcolnames = substr(colnames(cmat),1,20)
-    colnames(cmat) = newcolnames
-    pdf(outpdfname)
-    heatmap(cmat,scale='row',main=mtitle,cexRow=0.3,cexCol=0.4,Rowv=NA,ColSideColors=pcols)
-    dev.off()
-}
-
-qqPlot = function(descr='Title',pvector, ...)
-# stolen from https://gist.github.com/703512
-{
-    o = -log10(sort(pvector,decreasing=F))
-    e = -log10( 1:length(o)/length(o) )
-    o[o==-Inf] = reallysmall
-    o[o==Inf] = reallybig
-    pdfname = paste(gsub(" ","", descr , fixed=TRUE),'pval_qq.pdf',sep='_')
-    maint = paste(descr,'QQ Plot')
-    pdf(pdfname)
-    plot(e,o,pch=19,cex=1, main=maint, ...,
-        xlab=expression(Expected~~-log[10](italic(p))),
-        ylab=expression(Observed~~-log[10](italic(p))),
-        xlim=c(0,max(e)), ylim=c(0,max(o)))
-    lines(e,e,col="red")
-    grid(col = "lightgray", lty = "dotted")
-    dev.off()
-}
-
-smearPlot = function(DGEList,deTags, outSmear, outMain)
-        {
-        pdf(outSmear)
-        plotSmear(DGEList,de.tags=deTags,main=outMain)
-        grid(col="blue")
-        dev.off()
-        }
-
-
-
-boxPlot = function(rawdat,tmdat,maint,myTitle)
-  {
-  # give up on boxplot - it's just too buggy
-  rscolnames = substr(colnames(rawdat),1,25)
-  colnames(rawdat) = rscolnames
-  ccolnames = substr(colnames(tmdat),1,25)
-  colnames(tmdat) = ccolnames
-  print(paste('rawdat',paste(rscolnames,collapse=',')))
-  print(paste('tmdat',paste(ccolnames,collapse=',')))
-  pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"sampleBoxplot.pdf",sep='_')
-  raw = data.frame(rawdat)
-  cn = rscolnames
-  rdat = reshape(raw, direction="long",varying=list(cn),v.names="counts",times=cn)
-  rdat\$Sample = factor(rdat\$time,levels=cn)
-  rdat\$Counts = log(rdat\$counts + 0.1)
-  p1 = ggplot(rdat,aes(x=Sample,y=Counts)) + geom_boxplot(notch=T) + ylab("log Count")
-  p1 = p1 + theme(axis.text.x  = element_text(angle=90, size=9)) + ggtitle('Raw Contig Counts')
-  raw = data.frame(tmdat)
-  cn = ccolnames
-  rdat = reshape(raw, direction="long",varying=list(cn),v.names="counts",times=cn)
-  rdat\$Sample = factor(rdat\$time,levels=cn)
-  rdat\$Counts = log(rdat\$counts + 0.1)
-  p2 = ggplot(rdat,aes(x=Sample,y=Counts)) + geom_boxplot(notch=T) + ylab("log Count")
-  p2 = p2 + theme(axis.text.x  = element_text(angle=90, size=9)) + ggtitle('Normalised Contig Counts')
-  pdf(pdfname)
-  grid.arrange(p1,p2,nrow=1)
-  dev.off()
-}
-
-cumPlot = function(rawrs,cleanrs,maint,myTitle)
-{   # updated to use ecdf
-        pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"RowsumCum.pdf",sep='_')
-        defpar = par(no.readonly=T)
-        pdf(pdfname)
-        par(mfrow=c(2,1))
-        lrs = log(rawrs,10)
-        lim = max(lrs)
-        hist(lrs,breaks=100,main=paste('Before:',maint),xlab="# Reads (log)",
-             ylab="Count",col="maroon",sub=myTitle, xlim=c(0,lim),las=1)
-        grid(col="blue")
-        lrs = log(cleanrs,10)
-        hist(lrs,breaks=100,main=paste('After:',maint),xlab="# Reads (log)",
-             ylab="Count",col="maroon",sub=myTitle,xlim=c(0,lim),las=1)
-        grid(col="blue")
-        dev.off()
-        par(defpar)
-}
-
-cumPlot1 = function(rawrs,cleanrs,maint,myTitle)
-{   # updated to use ecdf
-        pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"RowsumCum.pdf",sep='_')
-        pdf(pdfname)
-        par(mfrow=c(2,1))
-        lastx = max(rawrs)
-        rawe = knots(ecdf(rawrs))
-        cleane = knots(ecdf(cleanrs))
-        cy = 1:length(cleane)/length(cleane)
-        ry = 1:length(rawe)/length(rawe)
-        plot(rawe,ry,type='l',main=paste('Before',maint),xlab="Log Contig Total Reads",
-             ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle)
-        grid(col="blue")
-        plot(cleane,cy,type='l',main=paste('After',maint),xlab="Log Contig Total Reads",
-             ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle)
-        grid(col="blue")
-        dev.off()
-}
-
-
-
-edgeIt = function (Count_Matrix,group,outputfilename,fdrtype='fdr',priorn=5,fdrthresh=0.05,outputdir='.',
-    myTitle='edgeR',libSize=c(),useQuantile="T",filterquantile=0.2,subjects=c()) {
-
-        # Error handling
-        if (length(unique(group))!=2){
-                print("Number of conditions identified in experiment does not equal 2")
-                q()
-        }
-        require(edgeR)
-        mt = paste(unlist(strsplit(myTitle,'_')),collapse=" ")
-        allN = nrow(Count_Matrix)
-        nscut = round(ncol(Count_Matrix)/2)
-        colTotmillionreads = colSums(Count_Matrix)/1e6
-        rawrs = rowSums(Count_Matrix)
-        nonzerod = Count_Matrix[(rawrs &gt; 0),] # remove all zero count genes
-        nzN = nrow(nonzerod)
-        nzrs = rowSums(nonzerod)
-        zN = allN - nzN
-        print('# Quantiles for non-zero row counts:',quote=F)
-        print(quantile(nzrs,probs=seq(0,1,0.1)),quote=F)
-        if (useQuantile == "T")
-        {
-        gt1rpin3 = rowSums(Count_Matrix/expandAsMatrix(colTotmillionreads,dim(Count_Matrix)) &gt;= 1) &gt;= nscut
-        lo = colSums(Count_Matrix[!gt1rpin3,])
-        workCM = Count_Matrix[gt1rpin3,]
-        cleanrs = rowSums(workCM)
-        cleanN = length(cleanrs)
-        meth = paste( "After removing",length(lo),"contigs with fewer than",nscut,"sample read counts &gt;= 1 per million, there are",sep="")
-        print(paste("Read",allN,"contigs. Removed",zN,"contigs with no reads.",meth,cleanN,"contigs"),quote=F)
-        maint = paste('Filter &gt;=1/million reads in &gt;=',nscut,'samples')
-        }
-        else {
-        useme = (nzrs &gt; quantile(nzrs,filterquantile))
-        workCM = nonzerod[useme,]
-        lo = colSums(nonzerod[!useme,])
-        cleanrs = rowSums(workCM)
-        cleanN = length(cleanrs)
-        meth = paste("After filtering at count quantile =",filterquantile,"there are",sep="")
-        print(paste('Read',allN,"contigs. Removed",zN,"with no reads.",meth,cleanN,"contigs"),quote=F)
-        maint = paste('Filter below',filterquantile,'quantile')
-        }
-        cumPlot(rawrs=rawrs,cleanrs=cleanrs,maint=maint,myTitle=myTitle)
-
-        print(paste("# Total low count contigs per sample = ",paste(lo,collapse=',')),quote=F)
-        rsums = rowSums(workCM)
-        # Setup DGEList object
-        DGEList = DGEList(counts=workCM, group = group)
-        #Extract T v C names
-        TName=unique(group)[1]
-        CName=unique(group)[2]
-        if (length(subjects) == 0) { mydesign = model.matrix(~group)
-               } else { sf = factor(subjects)
-                       mydesign = model.matrix(~sf+group)
-                       }
-        print.noquote(paste('Using samples:',paste(colnames(workCM),collapse=',')))
-        print.noquote('Using design matrix:')
-        print.noquote(mydesign)
-        print.noquote(paste("prior.df =",priorn))
-        DGEList = calcNormFactors(DGEList)
-        DGEList = estimateGLMCommonDisp(DGEList,mydesign)
-        comdisp = DGEList\$common.dispersion
-        DGEList = estimateGLMTrendedDisp(DGEList,mydesign)
-        DGEList = estimateGLMTagwiseDisp(DGEList,mydesign)
-        DGLM = glmFit(DGEList,design=mydesign)
-        co = length(colnames(mydesign))
-        DE = glmLRT(DGLM,coef=co) # always last one - subject is first if needed
-        goodness = gof(DGLM, pcutoff=fdrthresh)
-        if (sum(goodness\$outlier) &gt; 0) {
-            print.noquote('GLM outliers:')
-            print.noquote(rownames(DE)[(goodness\$outlier != 0)])
-            z = limma::zscoreGamma(goodness\$gof.statistic, shape=goodness\$df/2, scale=2)
-            pdf(paste(outputdir,paste(mt,"GoodnessofFit.pdf",sep='_'),sep='/'))
-            qq = qqnorm(z, panel.first=grid(), main="tagwise dispersion")
-            abline(0,1,lwd=3)
-            points(qq\$x[goodness\$outlier],qq\$y[goodness\$outlier], pch=16, col="dodgerblue")
-            dev.off()
-            } else { print('No GLM fit outlier genes found\n')}
-        estpriorn = getPriorN(DGEList)
-        print(paste("Common Dispersion =",comdisp,"CV = ",sqrt(comdisp),"getPriorN = ",estpriorn),quote=F)
-        efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors
-        normData = (1e+06*DGEList\$counts/efflib)
-        #normData = (1e+06 * DGEList\$counts/expandAsMatrix(DGEList\$samples\$lib.size, dim(DGEList)))
-        colnames(normData) = paste( colnames(normData),'N',sep="_")
-        print(paste('Raw sample read totals',paste(colSums(nonzerod,na.rm=T),collapse=',')))
-        nzd = data.frame(log(nonzerod + 1e-2,10))
-        boxPlot(rawdat=nzd,tmdat=normData,maint='TMM Normalisation',myTitle=myTitle)
-        #Prepare our output file
-        output = cbind(
-                Name=as.character(rownames(DGEList\$counts)),
-                DE\$table,
-                adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
-                Dispersion=DGEList\$tagwise.dispersion,totreads=rsums,normData,
-                DGEList\$counts
-        )
-        soutput = output[order(output\$PVal),] # sorted into p value order - for quick toptable
-        nreads = soutput\$totreads # ordered same way
-        print('# writing output',quote=F)
-        write.table(soutput,outputfilename, quote=FALSE, sep="\t",row.names=F)
-        tt = topTags(DE,n=nrow(DE))
-        rn = rownames(tt\$table)
-        reg = "^chr([0-9]+):([0-9]+)-([0-9]+)"
-        org="hg19"
-        genecards="&lt;a href='http://www.genecards.org/index.php?path=/Search/keyword/"
-        ucsc = paste("&lt;a href='http://genome.ucsc.edu/cgi-bin/hgTracks?db=",org,sep='')
-        testreg = str_match(rn,reg)
-        if (sum(!is.na(testreg[,1]))/length(testreg[,1]) &gt; 0.9) # is ucsc style string
-        {
-          urls = paste(ucsc,'&amp;position=chr',testreg[,2],':',testreg[,3],"-",testreg[,4],"'&gt;",rn,'&lt;/a&gt;',sep='')
-        } else {
-          urls = paste(genecards,rn,"'&gt;",rn,'&lt;/a&gt;',sep="")
-        }
-        cat("# Top tags\n")
-        tt\$table = cbind(tt\$table,ntotreads=nreads,URL=urls) # add to end so table isn't laid out strangely
-        print(tt[1:50,])
-        pdf(paste(mt,"BCV_vs_abundance.pdf",sep='_'))
-        plotBCV(DGEList, cex=0.3, main="Biological CV vs abundance")
-        dev.off()
-        # Plot MAplot
-        fname = gsub(' ','_',myTitle,fixed=T)
-        deTags = rownames(output[output\$adj.p.value &lt; fdrthresh,])
-        nsig = length(deTags)
-        print(paste('#',nsig,'tags significant at adj p=',fdrthresh),quote=F)
-        print('# deTags',quote=F)
-        print(head(deTags))
-        dg = DGEList[order(DE\$table\$PValue),]
-        #normData = (1e+06 * dg\$counts/expandAsMatrix(dg\$samples\$lib.size, dim(dg)))
-        efflib = dg\$samples\$lib.size*dg\$samples\$norm.factors
-        normData = (1e+06*dg\$counts/efflib)
-        outpdfname=paste(mt,"heatmap.pdf",sep='_')
-        hmap2(normData,nsamp=100,TName=TName,group=group,outpdfname=outpdfname,myTitle=myTitle)
-        outSmear = paste(outputdir,paste(fname,"Smearplot.pdf",sep='_'),sep='/')
-        outMain = paste("Smear Plot for ",TName,' Vs ',CName,' (FDR@',fdrthresh,' N = ',nsig,')',sep='')
-        smearPlot(DGEList=DGEList,deTags=deTags, outSmear=outSmear, outMain = outMain)
-        qqPlot(descr=myTitle,pvector=DE\$table\$PValue)
-        # Plot MDS
-        ug = unique(group)
-        sample_colors =  match(DGEList\$samples\$group,ug) #ifelse (DGEList\$samples\$group==group[1], 1, 2)
-        pdf(paste(outputdir,paste(fname,"MDSplot.pdf",sep='_'),sep='/'))
-        plotMDS.DGEList(DGEList,main=paste("MDS Plot for",TName,'Vs',CName),cex=0.5,col=sample_colors,pch=sample_colors)
-        legend(x="topleft", legend = c(group[1],group[length(group)]),col=c(1,2), pch=19)
-        grid(col="blue")
-        dev.off()
-        if (FALSE==TRUE) {
-        # need a design matrix and glm to use this
-        glmfit = glmFit(DGEList, design)
-        goodness = gof(glmfit, pcutoff=fdrpval)
-        sum(goodness\$outlier)
-        rownames(d)[goodness\$outlier]
-        z = limma::zscoreGamma(goodness\$gof.statistic, shape=goodness\$df/2, scale=2)
-        pdf(paste(outputdir,paste(fname,"GoodnessofFit.pdf",sep='_'),sep='/'))
-        qq = qqnorm(z, panel.first=grid(), main="tagwise dispersion")
-        abline(0,1,lwd=3)
-        points(qq\$x[goodness\$outlier],qq\$y[goodness\$outlier], pch=16, col="dodgerblue")
-        dev.off()
-        }
-        #Return our main table
-        output
-
-}       #Done
-
-options(width=512)
-Out_Dir = "$html_file.files_path"
-Input =  "$input1"
-ORG = "$input1.dbkey"
-TreatmentName = "$treatment_name"
-TreatmentCols = "$Treat_cols"
-ControlName = "$control_name"
-ControlCols= "$Control_cols"
-outputfilename = "$outtab"
-fdrtype = "$fdrtype"
-priorn = $priorn
-fdrthresh = $fdrthresh
-useQuantile = "$useQuantile"
-fQ = $fQ # non-differential centile cutoff
-myTitle = "$title"
-makeRank = "$makeRank"
-outgsea = ""
-if (makeRank &gt; "") outgsea = "$outgsea"
-#Set our columns
-TCols           = as.numeric(strsplit(TreatmentCols,",")[[1]])-1
-CCols           = as.numeric(strsplit(ControlCols,",")[[1]])-1
-cat('# got TCols=')
-cat(TCols)
-cat('; CCols=')
-cat(CCols)
-cat('\n')
-
-
-# Create output dir if non existent
-  if (file.exists(Out_Dir) == F) dir.create(Out_Dir)
-
-Count_Matrix = read.table(Input,header=T,row.names=1,sep='\t')                           #Load tab file assume header
-Count_Matrix = Count_Matrix[,c(TCols,CCols)]
-rn = rownames(Count_Matrix)
-islib = rn %in% c('librarySize','NotInBedRegions')
-LibSizes = Count_Matrix[subset(rn,islib),][1] # take first
-Count_Matrix = Count_Matrix[subset(rn,! islib),]
-group = c(rep(TreatmentName,length(TCols)), rep(ControlName,length(CCols)) )             #Build a group descriptor
-group = factor(group, levels=c(ControlName,TreatmentName))
-colnames(Count_Matrix) = paste(group,colnames(Count_Matrix),sep="_")                   #Relable columns
-if (priorn &lt;= 0) {priorn = ceiling(20/(length(group)-1))} # estimate prior.n if not provided
-# see http://comments.gmane.org/gmane.comp.lang.r.sequencing/2009
-results = edgeIt(Count_Matrix=Count_Matrix,group=group,outputfilename=outputfilename,fdrtype=fdrtype,priorn=priorn,fdrthresh=fdrthresh,
-   outputdir=Out_Dir,myTitle=myTitle,libSize=c(),useQuantile=useQuantile,filterquantile=fQ) #Run the main function
-# for the log
-
-
-sessionInfo()
-
-
-</configfile>
-</configfiles>
-<tests>
-<test>
-<param name='input1' value='DGEtest.xls' ftype='tabular' />
- <param name='treatment_name' value='case' />
- <param name='title' value='DGEtest' />
- <param name='fdrtype' value='fdr' />
- <param name='priorn' value="5" />
- <param name='fdrthresh' value="0.05" />
- <param name='control_name' value='control' />
- <param name='Treat_cols' value='c3,c6,c9' />
- <param name='Control_cols' value='c2,c5,c8' />
- <output name='outtab' file='DGEtest1out.xls' ftype='tabular' compare='diff' />
- <output name='html_file' file='DGEtest1out.html' ftype='html' compare='diff' lines_diff='20' />
-</test>
-</tests>
-<help>
-**What it does**
-
-Performs digital gene expression analysis between a treatment and control on a matrix.
-
-**Documentation** Please see documentation_ for methods and parameter details
-
-**Input**
-
-A matrix consisting of non-negative integers. The matrix must have a unique header row identifiying the samples, as well as a unique set of row names
-as  the first column.
-
-**Output**
-
-A matrix which consists the original data and relative expression levels and some helpful plots
-
-**Note on edgeR versions**
-
-The edgeR authors made a small cosmetic change in the name of one important variable (from p.value to PValue)
-breaking this and all other code that assumed the old name for this variable,
-between edgeR2.4.4 and 2.4.6 (the version for R 2.14 as at the time of writing).
-This means that all code using edgeR is sensitive to the version. I think this was a very unwise thing
-to do because it wasted hours of my time to track down and will similarly cost other edgeR users dearly
-when their old scripts break. This tool currently now works with 2.4.6.
-
-**Note on prior.N**
-
-http://seqanswers.com/forums/showthread.php?t=5591 says:
-
-*prior.n*
-
-The value for prior.n determines the amount of smoothing of tagwise dispersions towards the common dispersion.
-You can think of it as like a "weight" for the common value. (It is actually the weight for the common likelihood
-in the weighted likelihood equation). The larger the value for prior.n, the more smoothing, i.e. the closer your
-tagwise dispersion estimates will be to the common dispersion. If you use a prior.n of 1, then that gives the
-common likelihood the weight of one observation.
-
-In answer to your question, it is a good thing to squeeze the tagwise dispersions towards a common value,
-or else you will be using very unreliable estimates of the dispersion. I would not recommend using the value that
-you obtained from estimateSmoothing()---this is far too small and would result in virtually no moderation
-(squeezing) of the tagwise dispersions. How many samples do you have in your experiment?
-What is the experimental design? If you have few samples (less than 6) then I would suggest a prior.n of at least 10.
-If you have more samples, then the tagwise dispersion estimates will be more reliable,
-so you could consider using a smaller prior.n, although I would hesitate to use a prior.n less than 5.
-
-**Attribution** Copyright Ross Lazarus (ross period lazarus at gmail period com) May 2012
-Derived from the implementation by Antony Kaspi and Sebastian Lunke at the BakerIDI
-
-All rights reserved.
-
-Licensed under the LGPL_
-
-.. _LGPL: http://www.gnu.org/copyleft/lesser.html
-.. _documentation: http://bioconductor.org/packages/release/bioc/html/edgeR.html
-</help>
-
-</tool>
-
-
--- a/rgedgeR/rgedgeRglm.xml	Wed Jun 12 02:58:43 2013 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,537 +0,0 @@
-
-<tool id="rgedgeRglm" name="edgeRglm" version="0.18">
-  <description>digital DGE glm</description>
-  <command interpreter="python">
-     rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "edgeRglm"
-    --output_dir "$html_file.files_path" --output_html "$html_file" --make_HTML "yes"
-  </command>
-  <inputs>
-    <param name="input1"  type="data" format="tabular" label="Select an input matrix - rows are contigs, columns are counts for each sample"
-       help="Use the DGE matrix preparation tool to create these matrices from BAM files and a BED file of contigs"/>
-    <param name="title" type="text" value="Factorial DGE" size="80" label="Title for job outputs" help="Supply a meaningful name here to remind you what the outputs contain">
-      <sanitizer invalid_char="">
-        <valid initial="string.letters,string.digits"><add value="_" /> </valid>
-      </sanitizer>
-    </param>
-    <param name="factor1name" type="text" value="Factor 1" size="80" label="Factor 1 name" help="Supply a meaningful name here to remind you when looking at the results">
-      <sanitizer invalid_char="">
-        <valid initial="string.letters,string.digits"><add value="_" /> </valid>
-      </sanitizer>
-    </param>
-    <param name="factor1" type="text" optional="false" size="120"
-       label="Enter comma separated values to indicate the factor level for all input file count columns"
-       help="EG if there are 4 columns of counts from 2 treatment replicates (2,4) and 2 control replicates (3,5) then enter 'treat,control,treat,control'">
-      <sanitizer>
-        <valid initial="string.digits,string.letters"><add value="," /> </valid>
-      </sanitizer>
-    </param>
-    <param name="factor2name" type="text" value="Factor 2" size="80" label="Factor 2 name" help="Supply a meaningful name here to remind you when looking at the results">
-      <sanitizer invalid_char="">
-        <valid initial="string.letters,string.digits"><add value="_" /> </valid>
-      </sanitizer>
-    </param>
-    <param name="factor2" type="text" optional="false" size="120"
-       label="Enter comma separated values to indicate factor 2 level for all input file count columns"
-       help="Leave blank if no factor 2, but eg if data from sample id A99 is in columns 2,4 and id C21 is in 3,5 then enter '1,2,1,2'">
-      <sanitizer>
-        <valid initial="string.digits,string.letters"><add value="," /> </valid>
-      </sanitizer>
-    </param>
-    <param name="fQ" type="float" value="0.3" size="5" label="Non-differential contig count quantile threshold - zero to analyze all non-zero read count contigs"
-     help="May be a good or a bad idea depending on the biology and the question. EG 0.3 = sparsest 30% of contigs with at least one read are removed before analysis"/>
-    <param name="useNDfilt" type="boolean" truevalue="T" checked='false' falsevalue="" size="1" label="Non differential filter - remove contigs below a threshold (1 per million) for half or more samples"
-     help="May be a good or a bad idea depending on the biology and the question. This was the old default. Quantile based is available as an alternative"/>
-    <param name="priorn" type="integer" value="4" size="3" label="prior.df for tagwise dispersion - higher value = more emphasis on each tag's variance - note this used to be prior.n"
-     help="Zero = auto-estimate. 1 to force high variance tags out. Use a small value to 'smooth' small samples. See edgeR docs and note below"/>
-    <param name="fdrthresh" type="float" value="0.05" size="5" label="P value threshold for FDR filtering for family wise error rate control"
-     help="Conventional default value of 0.05 recommended"/>
-    <param name="fdrtype" type="select" label="FDR (Type II error) control method"
-         help="Use fdr or bh typically to control for the number of tests in a reliable way">
-            <option value="fdr" selected="true">fdr</option>
-            <option value="BH">Benjamini Hochberg</option>
-            <option value="BY">Benjamini Yukateli</option>
-            <option value="bonferroni">Bonferroni</option>
-            <option value="hochberg">Hochberg</option>
-            <option value="holm">Holm</option>
-            <option value="hommel">Hommel</option>
-            <option value="none">no control for multiple tests</option>
-    </param>
-  </inputs>
-  <outputs>
-    <data format="tabular" name="outtab1" label="${treatment1_name}-${control1_name}_topTable.xls"/>
-    <data format="tabular" name="outtab2" label="${treatment2_name}-${control2_name}_topTable.xls"/>
-    <data format="tabular" name="outtab3" label="${treatment1_name}-${control1_name}-${treatment2_name}-${control2_name}_topTable.xls"/>
-    <data format="html" name="html_file" label="${input1.name}_${title}.html"/>
-  </outputs>
-<configfiles>
-<configfile name="runme">
-# edgeR.Rscript
-# new nov 2012 for 2x2 factorial designs
-# updated nov 2011 for R 2.14.0 and edgeR 2.4.0 by ross
-# Performs DGE on a count table containing n replicates of two conditions
-#
-
-reallybig = log10(.Machine\$double.xmax)
-reallysmall = log10(.Machine\$double.xmin)
-require('stringr')
-require('gplots')
-hmap2 = function(cmat,nsamp=100,outpdfname='heatmap2.pdf', TName='Treatment',group=NA,myTitle='title goes here')
-{
-    samples = colnames(cmat)
-    gu = unique(group)
-    if (length(gu) == 2) {
-        col.map = function(g) {if (g==gu[1]) "#FF0000" else "#0000FF"}
-        pcols = unlist(lapply(group,col.map))
-        } else {
-        colours = rainbow(length(gu),start=0,end=4/6)
-        pcols = colours[match(group,gu)]        }
-    print(paste('pcols',pcols))
-    gn = rownames(cmat)
-    dm = cmat[(! is.na(gn)),]
-    nprobes = nrow(dm)
-    if (nprobes &gt; nsamp) {
-      dm =dm[1:nsamp,]
-    }
-    newcolnames = substr(colnames(dm),1,20)
-    colnames(dm) = newcolnames
-    pdf(outpdfname)
-    heatmap.2(dm,main=myTitle,ColSideColors=pcols,col=topo.colors(100),dendrogram="col",key=T,density.info='none',
-         Rowv=F,scale='row',trace='none',margins=c(8,8),cexRow=0.4,cexCol=0.5)
-    dev.off()
-}
-
-hmap = function(cmat,nmeans=4,outpdfname="heatMap.pdf",nsamp=250,TName='Treatment',group=NA,myTitle="Title goes here")
-{
-    gu = unique(group)
-    colours = rainbow(length(gu),start=0.3,end=0.6)
-    pcols = colours[match(group,gu)]
-    nrows = nrow(cmat)
-    mtitle = paste(myTitle,'Heatmap: n contigs =',nrows)
-    if (nrows &gt; nsamp)  {
-               cmat = cmat[c(1:nsamp),]
-               mtitle = paste('Heatmap: Top ',nsamp,' DE contigs (of ',nrows,')',sep='')
-          }
-    newcolnames = substr(colnames(cmat),1,20)
-    colnames(cmat) = newcolnames
-    pdf(outpdfname)
-    heatmap(cmat,scale='row',main=mtitle,cexRow=0.3,cexCol=0.4,Rowv=NA,ColSideColors=pcols)
-    dev.off()
-}
-
-qqPlot = function(descr='Title',pvector, ...)
-{
-    o = -log10(sort(pvector,decreasing=F))
-    e = -log10( 1:length(o)/length(o) )
-    o[o==-Inf] = reallysmall
-    o[o==Inf] = reallybig
-    pdfname = paste(gsub(" ","", descr , fixed=TRUE),'pval_qq.pdf',sep='_')
-    maint = paste(descr,'QQ Plot')
-    pdf(pdfname)
-    plot(e,o,pch=19,cex=1, main=maint, ...,
-        xlab=expression(Expected~~-log[10](italic(p))),
-        ylab=expression(Observed~~-log[10](italic(p))),
-        xlim=c(0,max(e)), ylim=c(0,max(o)))
-    lines(e,e,col="red")
-    grid(col = "lightgray", lty = "dotted")
-    dev.off()
-}
-
-smearPlot = function(DGEList,deTags, outSmear, outMain)
-        {
-        pdf(outSmear)
-        plotSmear(DGEList,de.tags=deTags,main=outMain)
-        grid(col="blue")
-        dev.off()
-        }
-
-boxPlot = function(rawrs,cleanrs,maint,myTitle)
-{
-        newcolnames = substr(colnames(rawrs),1,15)
-        colnames(rawrs) = newcolnames
-        newcolnames = substr(colnames(cleanrs),1,15)
-        colnames(cleanrs) = newcolnames
-        pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"sampleBoxplot.pdf",sep='_')
-        defpar = par(no.readonly=T)
-        pdf(pdfname,height=6,width=8)
-        par(mfrow=c(1,2))
-        boxplot(rawrs,main=paste('Before:',maint),col="maroon",las=3,cex.axis=0.4)
-        grid(col="blue")
-        lrs = log(cleanrs,10)
-        boxplot(cleanrs,main=paste('After:',maint),col="maroon",las=3,cex.axis=0.4)
-        grid(col="blue")
-        dev.off()
-        par(defpar)
-}
-
-cumPlot = function(rawrs,cleanrs,maint,myTitle)
-{
-        pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"RowsumCum.pdf",sep='_')
-        defpar = par(no.readonly=T)
-        pdf(pdfname)
-        par(mfrow=c(2,1))
-        lrs = log(rawrs,10)
-        lim = max(lrs)
-        hist(lrs,breaks=100,main=paste('Before:',maint),xlab="Reads (log)",
-             ylab="Count",col="maroon",sub=myTitle, xlim=c(0,lim),las=1)
-        grid(col="blue")
-        lrs = log(cleanrs,10)
-        hist(lrs,breaks=100,main=paste('After:',maint),xlab="Reads (log)",
-             ylab="Count",col="maroon",sub=myTitle,xlim=c(0,lim),las=1)
-        grid(col="blue")
-        dev.off()
-        par(defpar)
-}
-
-cumPlot1 = function(rawrs,cleanrs,maint,myTitle)
-{
-        pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"RowsumCum.pdf",sep='_')
-        pdf(pdfname)
-        par(mfrow=c(2,1))
-        lastx = max(rawrs)
-        rawe = knots(ecdf(rawrs))
-        cleane = knots(ecdf(cleanrs))
-        cy = 1:length(cleane)/length(cleane)
-        ry = 1:length(rawe)/length(rawe)
-        plot(rawe,ry,type='l',main=paste('Before',maint),xlab="Log Contig Total Reads",
-             ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle)
-        grid(col="blue")
-        plot(cleane,cy,type='l',main=paste('After',maint),xlab="Log Contig Total Reads",
-             ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle)
-        grid(col="blue")
-        dev.off()
-}
-
-
-
-edgeIt = function (Count_Matrix,group,outtab1,outtab2,outtab3,fdrtype='fdr',priorn=5,fdrthresh=0.05,outputdir='.',
-                   myTitle='edgeR',libSize=c(),useQuantile=T,filterquantile=0.2,org='hg19')
-{
-
-    if (length(unique(group))!=4) {
-    print.noquote("Number of conditions identified in experiment does not equal 4 - full 2x2 factorial not possible")
-    q()
-    }
-    require(edgeR)
-    require(limma)
-    mt = paste(unlist(strsplit(myTitle,' ')),collapse="")
-    expfact = factor(group,levels=unique(group))
-    sampleTypes = levels(expfact)
-    mycontrast = NA
-    colnamesDesign = list()
-    allN = nrow(Count_Matrix)
-    nscut = round(ncol(Count_Matrix)/2)
-    colTotmillionreads = colSums(Count_Matrix)/1e6
-    rawrs = rowSums(Count_Matrix)
-    nonzerod = Count_Matrix[(rawrs &gt; 0),]
-    nzN = nrow(nonzerod)
-    nzrs = rowSums(nonzerod)
-    zN = allN - nzN
-    print.noquote('Quantiles for non-zero row counts:')
-    print.noquote(quantile(nzrs,probs=seq(0,1,0.1)))
-    if (useQuantile == "T")
-    {
-      gt1rpin3 = rowSums(Count_Matrix/expandAsMatrix(colTotmillionreads,dim(Count_Matrix)) &gt;= 1) &gt;= nscut
-      lo = colSums(Count_Matrix[!gt1rpin3,])
-      workCM = Count_Matrix[gt1rpin3,]
-      cleanrs = rowSums(workCM)
-      cleanN = length(cleanrs)
-      meth = paste( "After removing",length(lo),"contigs with fewer than",nscut,"sample read counts &gt;= 1 per million, there are",sep="")
-      print.noquote(paste("Read",allN,"contigs. Removed",zN,"contigs with no reads.",meth,cleanN,"contigs"))
-      maint = paste('Filter &gt;= 1/million reads in &gt;=',nscut,'samples')
-    }  else {
-      useme = (nzrs &gt; quantile(nzrs,filterquantile))
-      workCM = nonzerod[useme,]
-      lo = colSums(nonzerod[!useme,])
-      cleanrs = rowSums(workCM)
-      cleanN = length(cleanrs)
-      meth = paste("After filtering at count quantile =",filterquantile,"there are",sep="")
-      print.noquote(paste('Read',allN,"contigs. Removed",zN,"with no reads.",meth,cleanN,"contigs"))
-      maint = paste('Filter below',filterquantile,'quantile')
-    }
-    cumPlot(rawrs=rawrs,cleanrs=cleanrs,maint=maint,myTitle=myTitle)
-    print.noquote(paste("Total low count contigs per sample = ",paste(lo,collapse=',')))
-    rsums = rowSums(workCM)
-    allttn = c()
-    colnamesDesign = list()
-    l = sampleTypes
-    p1=paste(l[1],'-',l[2],sep='')
-    p2=paste(l[3],'-',l[4],sep='')
-    p3=paste('(',l[3],'-',l[4],') - (',l[1],'-',l[2],')',sep='')
-    colnamesDesign = list(comp1=p1,comp2=p2,diff=p3)
-    mydesign = model.matrix(~0+expfact)
-    colnames(mydesign) = sampleTypes
-    mycontrast = makeContrasts(contrasts=colnamesDesign,levels=mydesign)
-    print.noquote('Design matrix=')
-    print(mydesign,quote=F)
-    print.noquote('Contrast matrix=')
-    print(mycontrast,quote=F)
-    print.noquote('Samples=')
-    print(colnames(workCM),quote=F)
-    DGEList = DGEList(counts=workCM, group = group)
-    print.noquote(paste("prior.df =",priorn))
-    DGEList = calcNormFactors(DGEList)
-    DGEList = estimateGLMCommonDisp(DGEList,mydesign)
-    comdisp = DGEList\$common.dispersion
-    DGEList = estimateGLMTrendedDisp(DGEList,mydesign)
-    DGEList = estimateGLMTagwiseDisp(DGEList,mydesign)
-    pdf(paste(mt,"BCV_vs_abundance.pdf",sep='_'))
-    plotBCV(DGEList, cex=0.3, main="Biological CV vs abundance")
-    dev.off()
-    estpriorn = getPriorN(DGEList)
-    print(paste("Common Dispersion =",comdisp,"CV = ",sqrt(comdisp),"getPriorN = ",estpriorn),quote=F)
-    efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors
-    normData = (1e+06*DGEList\$counts/efflib)
-    #normData = (1e+06 * DGEList\$counts/expandAsMatrix(DGEList\$samples\$lib.size, dim(DGEList)))
-    colnames(normData) = paste( colnames(normData),'N',sep="_")
-    boxPlot(rawrs=log(nonzerod),cleanrs=log(normData),maint='TMM Normalisation (log scale)',myTitle=myTitle)
-    ug = unique(group)
-    ncond = length(ug)
-    sample_colors =  match(DGEList\$samples\$group,ug)
-    pdf(paste(outputdir,paste(mt,"MDSplot.pdf",sep='_'),sep='/'))
-    plotMDS.DGEList(DGEList,main=paste("MDS Plot for",myTitle),cex=0.5,col=sample_colors,pch=sample_colors)
-    legend(x="topleft", legend = sampleTypes,col=sample_colors, pch=19)
-    grid(col="blue")
-    dev.off()
-    cn = colnames(Count_Matrix)
-    print('Using design matrix:')
-    print(mydesign)
-    DGLM = glmFit(DGEList, mydesign)
-    goodness = gof(DGLM, pcutoff=fdrthresh)
-    if (sum(goodness\$outlier) &gt; 0) {
-        print.noquote('GLM outliers:')
-        print(paste(rownames(DGLM)[(goodness\$outlier != 0)],collapse=','),quote=F)
-        z = limma::zscoreGamma(goodness\$gof.statistic, shape=goodness\$df/2, scale=2)
-        pdf(paste(mt,"GoodnessofFit.pdf",sep='_'))
-        qq = qqnorm(z, panel.first=grid(), main="tagwise dispersion")
-        abline(0,1,lwd=3)
-        points(qq\$x[goodness\$outlier],qq\$y[goodness\$outlier], pch=16, col="dodgerblue")
-        dev.off()
-    } else { print('No GLM fit outlier genes found\n')}
-    outtabs = c(outtab1,outtab2,outtab3)
-    for (coeff in c(1:3))
-        {
-        DE = glmLRT(DGLM,contrast=mycontrast[,coeff])
-        cont = paste(unlist(strsplit(colnames(mycontrast)[coeff],' ')),collapse="")
-        output = cbind(Name=as.character(rownames(DGEList\$counts)),
-        DE\$table,adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
-        Dispersion=DGEList\$tagwise.dispersion,totreads=rsums,normData,
-        DGEList\$counts)
-        soutput = output[order(output\$PVal),]
-        nreads = soutput\$totreads
-        write.table(soutput,outtabs[coeff], quote=FALSE, sep="\t",row.names=F)
-        print.noquote(paste("Top tags for",cont))
-        genecards="&lt;a href='http://www.genecards.org/index.php?path=/Search/keyword/"
-        ucsc = paste("&lt;a href='http://genome.ucsc.edu/cgi-bin/hgTracks?db=",org,sep='')
-        tt = topTags(DE,n=nrow(DE))
-        rn = rownames(tt\$table)
-        reg = "^chr([0-9]+):([0-9]+)-([0-9]+)"
-        testreg = str_match(rn,reg)
-        isucsc = F
-        if (!is.na(testreg[1,])) {isucsc = T}
-        if (isucsc == T)
-        {
-          urls = paste(ucsc,'&amp;position=chr',testreg[,2],':',testreg[,3],"-",testreg[,4],"'&gt;",rn,'&lt;/a&gt;',sep='')
-        } else {
-          urls = paste(genecards,rn,"'&gt;",rn,'&lt;/a&gt;',sep="")
-        }
-        tt\$table = cbind(tt\$table,ntotreads=nreads,geneCardslink=urls)
-        print.noquote(tt[1:50,])
-        deTags = rownames(output[output\$adj.p.value &lt; fdrthresh,])
-        nsig = length(deTags)
-        print(paste(nsig,'tags significant at adj p=',fdrthresh),quote=F)
-        outSmear = paste(outputdir,paste(mt,cont,"Smearplot.pdf",sep='_'),sep='/')
-        outMain = paste(cont,' (FDR@',fdrthresh,' N = ',nsig,')',sep='')
-        smearPlot(DGEList=DE,deTags=deTags, outSmear=outSmear, outMain = outMain)
-        qqPlot(descr=paste(myTitle,cont),pvector=DE\$table\$PValue)
-        }
-}
-
-rossDecide = function (object, adjust.method="BH", p.value=0.05,verbose=T)
-{
-    if (!is(object, "DGEExact") &amp; !is(object, "DGELRT"))
-        stop("Need DGEExact or DGELRT object")
-    DE = decideTestsDGE(object, adjust.method = adjust.method, p.value = p.value)
-    sumDE = summary(DE)
-    rownames(sumDE)[1] = "DOWN"
-    rownames(sumDE)[3] = "UP"
-    sumDE = sumDE[c(3, 1), ]
-    if (verbose) {
-        cat("\n")
-        cat("------------------------------------------------------",
-            "\n")
-        cat(" Method for Selecting DEGs:", DEmethod, "\n")
-        cat(" Multiple Testing  method: ", MTestmethod, "- pval",
-            PVcut, "\n")
-        cat("\n")
-        print.noquote(sumDE)
-        cat("------------------------------------------------------",
-            "\n")
-    }
-    return(DE)
-}
-
-options(width=512)
-Out_Dir = "$html_file.files_path"
-Input =  "$input1"
-factor1name = "$factor1_name"
-factor1 = as.factor(strsplit("$factor1",","))
-factor2name = "$factor2_name"
-factor2 = as.factor(strsplit("$factor2",","))
-org = "$input1.dbkey"
-outtabs = strsplit("$outtab",",")
-fdrtype = "$fdrtype"
-priorn = $priorn
-fdrthresh = $fdrthresh
-useNDfilt = "$useNDfilt"
-fQ = $fQ
-myTitle = "$title"
-
-print.noquote(paste('# got =',paste(CCols2,collapse=',')))
-Count_Matrix = read.table(Input,header=T,row.names=1,sep='\t')
-rn = rownames(Count_Matrix)
-nsamp = length(colnames(Count_Matrix))
-
-if (file.exists(Out_Dir) == F) dir.create(Out_Dir)
-print.noquote(paste('Got nsamp = ',nsamp,'org=',org))
-islib = rn %in% c('librarySize','NotInBedRegions')
-LibSizes = Count_Matrix[subset(rn,islib),][1]
-Count_Matrix = Count_Matrix[subset(rn,! islib),]
-print(paste('Got groups =',paste(group,collapse=' ')))
-if (priorn &lt;= 0) {priorn = ceiling(20/(length(group)-1))}
-cn = colnames(Count_Matrix)
-results = edgeIt(Count_Matrix,group=group,outtab1=outtab1,outtab2=outtab2,outtab3=outtab3,fdrtype,priorn,fdrthresh,
-                 Out_Dir,myTitle,libSize=c(),useQuantile=useNDfilt,filterquantile=fQ,org=org)
-options(width=80)
-sessionInfo()
-
-</configfile>
-</configfiles>
-<tests>
-<test>
-<param name='input1' value='DGEtest.xls' ftype='tabular' />
- <param name='treatment_name' value='case' />
- <param name='title' value='DGEtest' />
- <param name='fdrtype' value='fdr' />
- <param name='priorn' value="5" />
- <param name='fdrthresh' value="0.05" />
- <param name='control_name' value='control' />
- <param name='Treat_cols' value='c3,c6,c9' />
- <param name='Control_cols' value='c2,c5,c8' />
- <output name='outtab' file='DGEtest1out.xls' ftype='tabular' compare='diff' />
- <output name='html_file' file='DGEtest1out.html' ftype='html' compare='diff' lines_diff='20' />
-</test>
-</tests>
-<help>
-**Executive Summary**
-This is a Galaxy wrapper exposing the Bioconductor edgeR_ package.
-There is a companion pairwise comparison tool but this is the two way model tool.
-It will help you to analyse the main effects in a 2 way anova.
-
-**If things go wrong**
-edgeR software is thoroughly documented at edgeR_Manual_ and the authors
-provide excellent support on the Bioconductor users' mailing list for their code, but
-if this tool fails to run properly in Galaxy, please make sure you
-fill in the form and send the administrators a Galaxy bug report.
-If you have questions about the edgeR code this tool wraps, please
-try the Bioconductor users' list.
-
-**What data is it for?**
-Like the pairwise version, it takes a count matrix with columns containing reads per contig (NOT transformed!) for
-multiple replicates in comparison groups. The counts can be any kind of counts. It was designed for
-short read sequence data - RNA-seq, miR-seq,...ChIP-seq.
-
-**Slightly Longer Version**
-Performs digital gene expression factorial analysis for a 2x2 design.
-Factorial designs are old but good if you want to get at the individual effects and interaction between two factors.
-EG: a treatment applied to 2 different cell types, or two treatments applied in all 4 possible combinations (-a-b,-a,+b,+a-b,+a+b).
-Data with replicates in the four groups is supplied as a count matrix.
-The two primary comparisons are defined by selecting two sets of samples for comparison
-best thought of as control and treatment (whatever that means) for each of the main comparisons.
-
-The interaction is defined as the difference between those two comparisons and is reported as a topTable as are the
-primary comparisons.
-
-All comparisons are reported as separate tabular spreadsheets ordered by p value and a comprehensive summary is
-provided in the html output.
-
-This code essentially embelishes the code described by Gordon Smythe in the limma documentation for a factorial
-analysis.
-
-**Input**
-
-A matrix consisting of non-negative integers. The matrix must have a unique header row identifiying the samples, as well as a unique set of row names
-as the first column.
-
-**Output**
-
-Tabular files which contain the statistical results and the raw and transformed counts and some colourful
-and helpful plots
-
-**Note on edgeR versions**
-
-The edgeR authors made a small cosmetic change in the name of one important variable (from p.value to PValue)
-breaking this and all other code that assumed the old name for this variable,
-between edgeR2.4.4 and 2.4.6 (the version for R 2.14 as at the time of writing).
-This means that all code using edgeR is sensitive to the version. I think this was a very unwise thing
-to do because it wasted hours of my time to track down and will similarly cost other edgeR users dearly
-when their old scripts break. This tool currently now works with 2.4.6.
-
-**Note on prior.N - now replaced with prior.df**
-
-http://seqanswers.com/forums/showthread.php?t=5591 says:
-
-*prior.n*
-
-The value for prior.n determines the amount of smoothing of tagwise dispersions towards the common dispersion.
-You can think of it as like a "weight" for the common value. (It is actually the weight for the common likelihood
-in the weighted likelihood equation). The larger the value for prior.n, the more smoothing, i.e. the closer your
-tagwise dispersion estimates will be to the common dispersion. If you use a prior.n of 1, then that gives the
-common likelihood the weight of one observation.
-
-In answer to your question, it is a good thing to squeeze the tagwise dispersions towards a common value,
-or else you will be using very unreliable estimates of the dispersion. I would not recommend using the value that
-you obtained from estimateSmoothing()---this is far too small and would result in virtually no moderation
-(squeezing) of the tagwise dispersions. How many samples do you have in your experiment?
-What is the experimental design? If you have few samples (less than 6) then I would suggest a prior.n of at least 10.
-If you have more samples, then the tagwise dispersion estimates will be more reliable,
-so you could consider using a smaller prior.n, although I would hesitate to use a prior.n less than 5.
-
-From Bioconductor Digest, Vol 118, Issue 5, Gordon writes:
-
-Dear Dorota,
-
-The important settings are prior.df and trend.
-
-prior.n and prior.df are related through prior.df = prior.n * residual.df,
-and your experiment has residual.df = 36 - 12 = 24.  So the old setting of
-prior.n=10 is equivalent for your data to prior.df = 240, a very large
-value.  Going the other way, the new setting of prior.df=10 is equivalent
-to prior.n=10/24.
-
-To recover old results with the current software you would use
-
-  estimateTagwiseDisp(object, prior.df=240, trend="none")
-
-To get the new default from old software you would use
-
-  estimateTagwiseDisp(object, prior.n=10/24, trend=TRUE)
-
-Actually the old trend method is equivalent to trend="loess" in the new
-software. You should use plotBCV(object) to see whether a trend is
-required.
-
-Note you could also use
-
-  prior.n = getPriorN(object, prior.df=10)
-
-to map between prior.df and prior.n.
-
-
- .. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html
- .. _edgeR_Manual: http://www.bioconductor.org/packages/release/bioc/vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf
-
-</help>
-
-</tool>
-
-
--- a/rgedgeR/rgedgeRpaired.xml.iaas1	Wed Jun 12 02:58:43 2013 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,627 +0,0 @@
-<tool id="rgedgeRpaired" name="edgeR paired" version="0.18">
-  <description>2 level Anova for counts</description>
-  <command interpreter="python">
-     rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "edgeR"
-    --output_dir "$html_file.files_path" --output_html "$html_file" --output_tab "$outtab" --make_HTML "yes"
-  </command>
-  <inputs>
-    <param name="input1"  type="data" format="tabular" label="Select an input matrix - rows are contigs, columns are counts for each sample"
-       help="Use the HTSeq based count matrix preparation tool to create these matrices from BAM/SAM files and a GTF file of genomic features"/>
-    <param name="title" type="text" value="edgeR" size="80" label="Title for job outputs" help="Supply a meaningful name here to remind you what the outputs contain">
-      <sanitizer invalid_char="">
-        <valid initial="string.letters,string.digits"><add value="_" /> </valid>
-      </sanitizer>
-    </param>
-    <param name="treatment_name" type="text" value="Treatment" size="50" label="Treatment Name"/>
-    <param name="Treat_cols" label="Select columns containing treatment." type="data_column" data_ref="input1" numerical="True"
-         multiple="true" use_header_names="true" size="120" display="checkboxes">
-        <validator type="no_options" message="Please select at least one column."/>
-    </param>
-    <param name="control_name" type="text" value="Control" size="50" label="Control Name"/>
-    <param name="Control_cols" label="Select columns containing control." type="data_column" data_ref="input1" numerical="True"
-         multiple="true" use_header_names="true" size="120" display="checkboxes" optional="true">
-    </param>
-    <param name="subjectids" type="text" optional="true" size="120"
-       label="IF SUBJECTS NOT ALL INDEPENDENT! Enter integers to indicate sample pairing for every column in input"
-       help="Leave blank if no pairing, but eg if data from sample id A99 is in columns 2,4 and id C21 is in 3,5 then enter '1,2,1,2'">
-      <sanitizer>
-        <valid initial="string.digits"><add value="," /> </valid>
-      </sanitizer>
-    </param>
-    <param name="fQ" type="float" value="0.3" size="5" label="Non-differential contig count quantile threshold - zero to analyze all non-zero read count contigs"
-     help="May be a good or a bad idea depending on the biology and the question. EG 0.3 = sparsest 30% of contigs with at least one read are removed before analysis"/>
-    <param name="useNDF" type="boolean" truevalue="T" checked='false' falsevalue="" size="1" label="Non differential filter - remove contigs below a threshold (1 per million) for half or more samples"
-     help="May be a good or a bad idea depending on the biology and the question. This was the old default. Quantile based is available as an alternative"/>
-    <param name="priordf" type="integer" value="20" size="3" label="prior.df for tagwise dispersion - lower value = more emphasis on each tag's variance. Replaces prior.n  and prior.df = prior.n * residual.df"
-     help="Zero = Use edgeR default. Use a small value to 'smooth' small samples. See edgeR docs and note below"/>
-    <param name="fdrthresh" type="float" value="0.05" size="5" label="P value threshold for FDR filtering for amily wise error rate control"
-     help="Conventional default value of 0.05 recommended"/>
-    <param name="fdrtype" type="select" label="FDR (Type II error) control method"
-         help="Use fdr or bh typically to control for the number of tests in a reliable way">
-            <option value="fdr" selected="true">fdr</option>
-            <option value="BH">Benjamini Hochberg</option>
-            <option value="BY">Benjamini Yukateli</option>
-            <option value="bonferroni">Bonferroni</option>
-            <option value="hochberg">Hochberg</option>
-            <option value="holm">Holm</option>
-            <option value="hommel">Hommel</option>
-            <option value="none">no control for multiple tests</option>
-    </param>
-  </inputs>
-  <outputs>
-    <data format="tabular" name="outtab" label="${title}.xls"/>
-    <data format="html" name="html_file" label="${title}.html"/>
-  </outputs>
- <stdio>
-     <exit_code range="4"   level="fatal"   description="Number of subject ids must match total number of samples in the input matrix" />
- </stdio>
- <tests>
-<test>
-<param name='input1' value='test_bams2mx.xls' ftype='tabular' />
- <param name='treatment_name' value='case' />
- <param name='title' value='edgeRtest' />
- <param name='fdrtype' value='fdr' />
- <param name='priordf' value="0" />
- <param name='fdrthresh' value="0.05" />
- <param name='control_name' value='control' />
- <param name='Treat_cols' value='c3,c4,c5,c9' />
- <param name='Control_cols' value='c2,c6,c7,c8' />
- <output name='outtab' file='edgeRtest1out.xls' ftype='tabular' compare='diff' />
- <output name='html_file' file='edgeRtest1out.html' ftype='html' compare='diff' lines_diff='20' />
-</test>
-</tests>
-
-<configfiles>
-<configfile name="runme">
-<![CDATA[
-#
-# edgeR.Rscript
-# updated npv 2011 for R 2.14.0 and edgeR 2.4.0 by ross
-# Performs DGE on a count table containing n replicates of two conditions
-#
-# Parameters
-#
-# 1 - Output Dir
-
-# Original edgeR code by: S.Lunke and A.Kaspi
-reallybig = log10(.Machine\$double.xmax)
-reallysmall = log10(.Machine\$double.xmin)
-library('stringr')
-library('gplots')
-library('DESeq')
-library('edgeR')
-hmap2 = function(cmat,nsamp=100,outpdfname='heatmap2.pdf', TName='Treatment',group=NA,myTitle='title goes here')
-{
-# Perform clustering for significant pvalues after controlling FWER
-    samples = colnames(cmat)
-    gu = unique(group)
-    if (length(gu) == 2) {
-        col.map = function(g) {if (g==gu[1]) "#FF0000" else "#0000FF"}
-        pcols = unlist(lapply(group,col.map))
-        } else {
-        colours = rainbow(length(gu),start=0,end=4/6)
-        pcols = colours[match(group,gu)]        }
-    gn = rownames(cmat)
-    dm = cmat[(! is.na(gn)),]
-    # remove unlabelled hm rows
-    nprobes = nrow(dm)
-    # sub = paste('Showing',nprobes,'contigs ranked for evidence of differential abundance')
-    if (nprobes > nsamp) {
-      dm =dm[1:nsamp,]
-      #sub = paste('Showing',nsamp,'contigs ranked for evidence for differential abundance out of',nprobes,'total')
-    }
-    newcolnames = substr(colnames(dm),1,20)
-    colnames(dm) = newcolnames
-    pdf(outpdfname)
-    heatmap.2(dm,main=myTitle,ColSideColors=pcols,col=topo.colors(100),dendrogram="col",key=T,density.info='none',
-         Rowv=F,scale='row',trace='none',margins=c(8,8),cexRow=0.4,cexCol=0.5)
-    dev.off()
-}
-
-hmap = function(cmat,nmeans=4,outpdfname="heatMap.pdf",nsamp=250,TName='Treatment',group=NA,myTitle="Title goes here")
-{
-    # for 2 groups only was
-    #col.map = function(g) {if (g==TName) "#FF0000" else "#0000FF"}
-    #pcols = unlist(lapply(group,col.map))
-    gu = unique(group)
-    colours = rainbow(length(gu),start=0.3,end=0.6)
-    pcols = colours[match(group,gu)]
-    nrows = nrow(cmat)
-    mtitle = paste(myTitle,'Heatmap: n contigs =',nrows)
-    if (nrows > nsamp)  {
-               cmat = cmat[c(1:nsamp),]
-               mtitle = paste('Heatmap: Top ',nsamp,' DE contigs (of ',nrows,')',sep='')
-          }
-    newcolnames = substr(colnames(cmat),1,20)
-    colnames(cmat) = newcolnames
-    pdf(outpdfname)
-    heatmap(cmat,scale='row',main=mtitle,cexRow=0.3,cexCol=0.4,Rowv=NA,ColSideColors=pcols)
-    dev.off()
-}
-
-qqPlot = function(descr='Title',pvector, ...)
-# stolen from https://gist.github.com/703512
-{
-    o = -log10(sort(pvector,decreasing=F))
-    e = -log10( 1:length(o)/length(o) )
-    o[o==-Inf] = reallysmall
-    o[o==Inf] = reallybig
-    pdfname = paste(gsub(" ","", descr , fixed=TRUE),'pval_qq.pdf',sep='_')
-    maint = paste(descr,'QQ Plot')
-    pdf(pdfname)
-    plot(e,o,pch=19,cex=1, main=maint, ...,
-        xlab=expression(Expected~~-log[10](italic(p))),
-        ylab=expression(Observed~~-log[10](italic(p))),
-        xlim=c(0,max(e)), ylim=c(0,max(o)))
-    lines(e,e,col="red")
-    grid(col = "lightgray", lty = "dotted")
-    dev.off()
-}
-
-smearPlot = function(DGEList,deTags, outSmear, outMain)
-        {
-        pdf(outSmear)
-        plotSmear(DGEList,de.tags=deTags,main=outMain)
-        grid(col="blue")
-        dev.off()
-        }
-
-boxPlot = function(rawrs,cleanrs,maint,myTitle)
-{   #
-        nc = ncol(rawrs)
-        for (i in c(1:nc)) {rawrs[(rawrs[,i] < 0),i] = NA}
-        fullnames = colnames(rawrs)
-        newcolnames = substr(colnames(rawrs),1,20)
-        colnames(rawrs) = newcolnames
-        newcolnames = substr(colnames(cleanrs),1,20)
-        colnames(cleanrs) = newcolnames
-        pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"sampleBoxplot.pdf",sep='_')
-        defpar = par(no.readonly=T)
-        pdf(pdfname,height=6,width=8)
-        #par(mfrow=c(1,2)) # 1 rows 2 col
-        l = layout(matrix(c(1,2),1,2,byrow=T))
-        print.noquote('raw contig counts by sample:')
-        print.noquote(summary(rawrs))
-        print.noquote('normalised contig counts by sample:')
-        print.noquote(summary(cleanrs))
-        boxplot(rawrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('Raw:',maint))
-        grid(col="blue")
-        boxplot(cleanrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('After ',maint))
-        grid(col="blue")
-        dev.off()
-        pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"samplehistplot.pdf",sep='_')
-        nc = ncol(rawrs)
-        print.noquote(paste('Using ncol rawrs=',nc))
-        ncroot = round(sqrt(nc))
-        if (ncroot*ncroot < nc) { ncroot = ncroot + 1 }
-        m = c()
-        for (i in c(1:nc)) {
-              rhist = hist(rawrs[,i],breaks=100,plot=F)
-              m = append(m,max(rhist\$counts))
-             }
-        ymax = max(m)
-        pdf(pdfname)
-        par(mfrow=c(ncroot,ncroot))
-        for (i in c(1:nc)) {
-                 hist(rawrs[,i], main=paste("Contig logcount",i), xlab='log raw count', col="maroon",
-                 breaks=100,sub=fullnames[i],cex=0.8,ylim=c(0,ymax))
-             }
-        dev.off()
-        par(defpar)
-
-}
-
-cumPlot = function(rawrs,cleanrs,maint,myTitle)
-{   # updated to use ecdf
-        pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"RowsumCum.pdf",sep='_')
-        defpar = par(no.readonly=T)
-        pdf(pdfname)
-        par(mfrow=c(2,1))
-        lrs = log(rawrs,10)
-        lim = max(lrs)
-        hist(lrs,breaks=100,main=paste('Before:',maint),xlab="# Reads (log)",
-             ylab="Count",col="maroon",sub=myTitle, xlim=c(0,lim),las=1)
-        grid(col="blue")
-        lrs = log(cleanrs,10)
-        hist(lrs,breaks=100,main=paste('After:',maint),xlab="# Reads (log)",
-             ylab="Count",col="maroon",sub=myTitle,xlim=c(0,lim),las=1)
-        grid(col="blue")
-        dev.off()
-        par(defpar)
-}
-
-cumPlot1 = function(rawrs,cleanrs,maint,myTitle)
-{   # updated to use ecdf
-        pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"RowsumCum.pdf",sep='_')
-        pdf(pdfname)
-        par(mfrow=c(2,1))
-        lastx = max(rawrs)
-        rawe = knots(ecdf(rawrs))
-        cleane = knots(ecdf(cleanrs))
-        cy = 1:length(cleane)/length(cleane)
-        ry = 1:length(rawe)/length(rawe)
-        plot(rawe,ry,type='l',main=paste('Before',maint),xlab="Log Contig Total Reads",
-             ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle)
-        grid(col="blue")
-        plot(cleane,cy,type='l',main=paste('After',maint),xlab="Log Contig Total Reads",
-             ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle)
-        grid(col="blue")
-        dev.off()
-}
-
-
-
-edgeIt = function (Count_Matrix,group,outputfilename,fdrtype='fdr',priordf=5,fdrthresh=0.05,outputdir='.',
-    myTitle='edgeR',libSize=c(),useNDF="T",filterquantile=0.2,subjects=c()) {
-
-        # Error handling
-        if (length(unique(group))!=2){
-                print("Number of conditions identified in experiment does not equal 2")
-                q()
-        }
-        require(edgeR)
-        mt = paste(unlist(strsplit(myTitle,'_')),collapse=" ")
-        allN = nrow(Count_Matrix)
-        nscut = round(ncol(Count_Matrix)/2)
-        colTotmillionreads = colSums(Count_Matrix)/1e6
-        rawrs = rowSums(Count_Matrix)
-        nonzerod = Count_Matrix[(rawrs > 0),] # remove all zero count genes
-        nzN = nrow(nonzerod)
-        nzrs = rowSums(nonzerod)
-        zN = allN - nzN
-        print('# Quantiles for non-zero row counts:',quote=F)
-        print(quantile(nzrs,probs=seq(0,1,0.1)),quote=F)
-        if (useNDF == "T")
-        {
-        gt1rpin3 = rowSums(Count_Matrix/expandAsMatrix(colTotmillionreads,dim(Count_Matrix)) >= 1) >= nscut
-        lo = colSums(Count_Matrix[!gt1rpin3,])
-        workCM = Count_Matrix[gt1rpin3,]
-        cleanrs = rowSums(workCM)
-        cleanN = length(cleanrs)
-        meth = paste( "After removing",length(lo),"contigs with fewer than ",nscut," sample read counts >= 1 per million, there are",sep="")
-        print(paste("Read",allN,"contigs. Removed",zN,"contigs with no reads.",meth,cleanN,"contigs"),quote=F)
-        maint = paste('Filter >=1/million reads in >=',nscut,'samples')
-        }
-        else {
-        useme = (nzrs > quantile(nzrs,filterquantile))
-        workCM = nonzerod[useme,]
-        lo = colSums(nonzerod[!useme,])
-        cleanrs = rowSums(workCM)
-        cleanN = length(cleanrs)
-        meth = paste("After filtering at count quantile =",filterquantile,", there are",sep="")
-        print(paste('Read',allN,"contigs. Removed",zN,"with no reads.",meth,cleanN,"contigs"),quote=F)
-        maint = paste('Filter below',filterquantile,'quantile')
-        }
-        cumPlot(rawrs=rawrs,cleanrs=cleanrs,maint=maint,myTitle=myTitle)
-        allgenes <- rownames(workCM)
-        print(paste("# Total low count contigs per sample = ",paste(lo,collapse=',')),quote=F)
-        rsums = rowSums(workCM)
-        TName=unique(group)[1]
-        CName=unique(group)[2]
-        # Setup DGEList object
-        DGEList = DGEList(counts=workCM, group = group)
-        if (length(subjects) == 0)
-            {
-               doDESEQ = T
-               mydesign = model.matrix(~group)
-            }
-            else {
-              doDESEQ = F
-              subjf = factor(subjects)
-              mydesign = model.matrix(~subjf+group) # we block on subject so make group last to simplify finding it
-            }
-        print.noquote(paste('Using samples:',paste(colnames(workCM),collapse=',')))
-        print.noquote('Using design matrix:')
-        print.noquote(mydesign)
-        DGEList = estimateGLMCommonDisp(DGEList,mydesign)
-        comdisp = DGEList\$common.dispersion
-        DGEList = estimateGLMTrendedDisp(DGEList,mydesign)
-        if (priordf > 0) {
-           print.noquote(paste("prior.df =",priordf))
-           DGEList = estimateGLMTagwiseDisp(DGEList,mydesign,prior.df = priordf)
-        } else {
-           DGEList = estimateGLMTagwiseDisp(DGEList,mydesign)
-        }
-        DGLM = glmFit(DGEList,design=mydesign)
-        efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors
-        normData = (1e+06*DGEList\$counts/efflib)
-        co = length(colnames(mydesign))
-        DE = glmLRT(DGLM,coef=co) # always last one - subject is first if needed
-        uoutput = cbind(
-                Name=as.character(rownames(DGEList\$counts)),
-                DE\$table,
-                adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
-                Dispersion=DGEList\$tagwise.dispersion,totreads=rsums,normData,
-                DGEList\$counts
-        )
-        soutput = uoutput[order(DE\$table\$PValue),] # sorted into p value order - for quick toptable
-        goodness = gof(DGLM, pcutoff=fdrthresh)
-        if (sum(goodness\$outlier) > 0) {
-            print.noquote('GLM outliers:')
-            print(paste(rownames(DGLM)[(goodness\$outlier != 0)],collapse=','),quote=F)
-            z = limma::zscoreGamma(goodness\$gof.statistic, shape=goodness\$df/2, scale=2)
-            pdf(paste(mt,"GoodnessofFit.pdf",sep='_'))
-            qq = qqnorm(z, panel.first=grid(), main="tagwise dispersion")
-            abline(0,1,lwd=3)
-            points(qq\$x[goodness\$outlier],qq\$y[goodness\$outlier], pch=16, col="dodgerblue")
-            dev.off()
-            } else { print('No GLM fit outlier genes found\n')}
-        estpriorn = getPriorN(DGEList)
-        print(paste("Common Dispersion =",comdisp,"CV = ",sqrt(comdisp),"getPriorN = ",estpriorn),quote=F)
-        efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors
-        normData = (1e+06*DGEList\$counts/efflib)
-        uniqueg = unique(group)
-        # Plot MDS
-        sample_colors =  match(group,levels(group))
-        pdf(paste(mt,"MDSplot.pdf",sep='_'))
-        sampleTypes = levels(group)
-        plotMDS.DGEList(DGEList,main=paste("MDS Plot for",myTitle),cex=0.5,col=sample_colors,pch=sample_colors)
-        legend(x="topleft", legend = sampleTypes,col=c(1:length(sampleTypes)), pch=19)
-        grid(col="blue")
-        dev.off()
-        colnames(normData) = paste( colnames(normData),'N',sep="_")
-        print(paste('Raw sample read totals',paste(colSums(nonzerod,na.rm=T),collapse=',')))
-        nzd = data.frame(log(nonzerod + 1e-2,10))
-        boxPlot(rawrs=nzd,cleanrs=log(normData,10),maint='TMM Normalisation',myTitle=myTitle)
-        if (doDESEQ)
-        {
-        # DESeq
-        deSeqDatcount <- newCountDataSet(workCM, group)
-        deSeqDatsizefac <- estimateSizeFactors(deSeqDatcount)
-        deSeqDatdisp <- estimateDispersions(deSeqDatsizefac)
-        rDESeq <- nbinomTest(deSeqDatdisp, levels(group)[1], levels(group)[2])
-        rDESeq <- rDESeq[order(rDESeq\$pval), ]
-        write.table(rDESeq,paste(mt,'DESeq_TopTable.xls',sep='_'), quote=FALSE, sep="\t",row.names=F)
-        topresults.DESeq <- rDESeq[which(rDESeq\$padj < fdrthresh), ]
-        DESeqcountsindex <- which(allgenes %in% topresults.DESeq\$id)
-        DESeqcounts <- rep(0, length(allgenes))
-        DESeqcounts[DESeqcountsindex] <- 1
-        }
-        DGEList = calcNormFactors(DGEList)
-        norm.factor = DGEList\$samples\$norm.factors
-        pdf(paste(mt,"voomplot.pdf",sep='_'))
-        dat.voomed <- voom(DGEList, mydesign, plot = TRUE, lib.size = colSums(workCM) * norm.factor)
-        dev.off()
-        # Use limma to fit data
-        fit <- lmFit(dat.voomed, mydesign)
-        fit <- eBayes(fit)
-        rvoom <- topTable(fit, coef = length(colnames(mydesign)), adj = "BH", n = Inf)
-        write.table(rvoom,paste(mt,'VOOM_topTable.xls',sep='_'), quote=FALSE, sep="\t",row.names=F)
-        # Use an FDR cutoff to find interesting samples for edgeR, DESeq and voom/limma
-        topresults.voom <- rvoom[which(rvoom\$adj.P.Val < fdrthresh), ]
-        topresults.edgeR <- soutput[which(soutput\$adj.p.value < fdrthresh), ]
-        # Create venn diagram of hits
-        edgeRcountsindex <- which(allgenes %in% rownames(topresults.edgeR))
-        voomcountsindex <- which(allgenes %in% topresults.voom\$ID)
-        edgeRcounts <- rep(0, length(allgenes))
-        edgeRcounts[edgeRcountsindex] <- 1
-        voomcounts <- rep(0, length(allgenes))
-        voomcounts[voomcountsindex] <- 1
-        if (doDESEQ) {
-            vennmain = paste(mt,'Voom,edgeR and DESeq overlap at FDR=',fdrthresh)
-            counts.dataframe <- data.frame(edgeRcounts = edgeRcounts, DESeqcounts = DESeqcounts,
-                voomcounts = voomcounts, row.names = allgenes)
-        } else {
-            vennmain = paste(mt,'Voom and edgeR overlap at FDR=',fdrthresh)
-            counts.dataframe <- data.frame(edgeRcounts = edgeRcounts, voomcounts = voomcounts, row.names = allgenes)
-        }
-        counts.venn <- vennCounts(counts.dataframe)
-        vennf = paste(mt,'venn.pdf',sep='_')
-        pdf(vennf)
-        vennDiagram(counts.venn,main=vennmain,col="maroon")
-        dev.off()
-        #Prepare our output file
-        nreads = soutput\$totreads # ordered same way
-        print('# writing output',quote=F)
-        write.table(soutput,outputfilename, quote=FALSE, sep="\t",row.names=F)
-        rn = uoutput\$Name
-        reg = "^chr([0-9]+):([0-9]+)-([0-9]+)"
-        org="hg19"
-        genecards="<a href='http://www.genecards.org/index.php?path=/Search/keyword/"
-        ucsc = paste("<a href='http://genome.ucsc.edu/cgi-bin/hgTracks?db=",org,sep='')
-        testreg = str_match(rn,reg)
-        nreads = uoutput\$totreads # ordered same way
-        if (sum(!is.na(testreg[,1]))/length(testreg[,1]) > 0.8) # is ucsc style string
-        {
-          urls = paste(ucsc,'&amp;position=chr',testreg[,2],':',testreg[,3],"-",testreg[,4],"'>",rn,'</a>',sep='')
-        } else {
-          urls = paste(genecards,rn,"'></a>",rn,'</a>',sep="")
-        }
-        print.noquote('# urls')
-        cat(head(urls))
-        tt = uoutput
-        cat("# Top tags\n")
-        tt = cbind(tt,ntotreads=nreads,URL=urls) # add to end so table isn't laid out strangely
-        tt = tt[order(DE\$table\$PValue),]
-        options(width = 500)
-        print.noquote(tt[1:50,])
-        pdf(paste(mt,"BCV_vs_abundance.pdf",sep='_'))
-        plotBCV(DGEList, cex=0.3, main="Biological CV vs abundance")
-        dev.off()
-        # Plot MAplot
-        deTags = rownames(uoutput[uoutput\$adj.p.value < fdrthresh,])
-        nsig = length(deTags)
-        print(paste('#',nsig,'tags significant at adj p=',fdrthresh),quote=F)
-        print('# deTags',quote=F)
-        print(head(deTags))
-        dg = DGEList[order(DE\$table\$PValue),]
-        #normData = (1e+06 * dg\$counts/expandAsMatrix(dg\$samples\$lib.size, dim(dg)))
-        efflib = dg\$samples\$lib.size*dg\$samples\$norm.factors
-        normData = (1e+06*dg\$counts/efflib)
-        outpdfname=paste(mt,"heatmap.pdf",sep='_')
-        hmap2(normData,nsamp=100,TName=TName,group=group,outpdfname=outpdfname,myTitle=myTitle)
-        outSmear = paste(mt,"Smearplot.pdf",sep='_')
-        outMain = paste("Smear Plot for ",TName,' Vs ',CName,' (FDR@',fdrthresh,' N = ',nsig,')',sep='')
-        smearPlot(DGEList=DGEList,deTags=deTags, outSmear=outSmear, outMain = outMain)
-        qqPlot(descr=myTitle,pvector=DE\$table\$PValue)
-        if (doDESEQ) {
-           cat("# DESeq top 50\n")
-           print(rDESeq[1:50,])
-        }
-        cat("# VOOM top 50\n")
-        print(rvoom[1:50,])
-        # need a design matrix and glm to use this
-        goodness = gof(DGLM, pcutoff=fdrthresh)
-        nout = sum(goodness\$outlier)
-        if (nout > 0) {
-            print.noquote(paste('Found',nout,'Goodness of fit outliers'))
-            rownames(DGLM)[goodness\$outlier]
-            z = limma::zscoreGamma(goodness\$gof.statistic, shape=goodness\$df/2, scale=2)
-            pdf(paste(mt,"GoodnessofFit.pdf",sep='_'))
-            qq = qqnorm(z, panel.first=grid(), main="tagwise dispersion")
-            abline(0,1,lwd=3)
-            points(qq\$x[goodness\$outlier],qq\$y[goodness\$outlier], pch=16, col="dodgerblue")
-            dev.off()
-            }
-        #Return our main table
-        uoutput
-
-}       #Done
-sink(stdout(),append=T,type="message")
-options(width=512)
-Out_Dir = "$html_file.files_path"
-Input =  "$input1"
-TreatmentName = "$treatment_name"
-TreatmentCols = "$Treat_cols"
-ControlName = "$control_name"
-ControlCols= "$Control_cols"
-outputfilename = "$outtab"
-fdrtype = "$fdrtype"
-priordf = $priordf
-fdrthresh = $fdrthresh
-useNDF = "$useNDF"
-fQ = $fQ # non-differential centile cutoff
-myTitle = "$title"
-subjects = c($subjectids)
-nsubj = length(subjects)
-#Set our columns
-TCols           = as.numeric(strsplit(TreatmentCols,",")[[1]])-1
-CCols           = as.numeric(strsplit(ControlCols,",")[[1]])-1
-cat('# got TCols=')
-cat(TCols)
-cat('; CCols=')
-cat(CCols)
-cat('\n')
-useCols = c(TCols,CCols)
-# Create output dir if non existent
-  if (file.exists(Out_Dir) == F) dir.create(Out_Dir)
-
-Count_Matrix = read.table(Input,header=T,row.names=1,sep='\t') #Load tab file assume header
-snames = colnames(Count_Matrix)
-nsamples = length(snames)
-if (nsubj >  0 & nsubj != nsamples) {
-options("show.error.messages"=T)
-mess = paste('Fatal error: Supplied subject id list',paste(subjects,collapse=','),'has length',nsubj,'but there are',nsamples,'samples',paste(snames,collapse=','))
-write(mess, stderr())
-#print(mess)
-quit(save="no",status=4)
-}
-
-Count_Matrix = Count_Matrix[,useCols] # reorder columns
-if (length(subjects) != 0) {subjects = subjects[useCols]}
-rn = rownames(Count_Matrix)
-islib = rn %in% c('librarySize','NotInBedRegions')
-LibSizes = Count_Matrix[subset(rn,islib),][1] # take first
-Count_Matrix = Count_Matrix[subset(rn,! islib),]
-group = c(rep(TreatmentName,length(TCols)), rep(ControlName,length(CCols)) )             #Build a group descriptor
-group = factor(group, levels=c(ControlName,TreatmentName))
-colnames(Count_Matrix) = paste(group,colnames(Count_Matrix),sep="_")                   #Relable columns
-results = edgeIt(Count_Matrix=Count_Matrix,group=group,outputfilename=outputfilename,fdrtype=fdrtype,priordf=priordf,fdrthresh=fdrthresh,
-   outputdir=Out_Dir,myTitle=myTitle,libSize=c(),useNDF=useNDF,filterquantile=fQ,subjects=subjects)
-#Run the main function
-# for the log
-sessionInfo()
-]]>
-</configfile>
-</configfiles>
-<help>
-**What it does**
-
-Performs digital gene expression analysis between a treatment and control on a count matrix.
-Optionally adds a term for subject if not all samples are independent or if some other factor needs to be blocked in the design.
-
-**Input**
-
-A matrix consisting of non-negative integers. The matrix must have a unique header row identifiying the samples, and a unique set of row names
-as  the first column. Typically the row names are gene symbols or probe id's for downstream use in GSEA and other methods.
-
-If you have (eg) paired samples and wish to include a term in the GLM to account for some other factor (subject in the case of paired samples),
-put a comma separated list of indicators for every sample (whether modelled or not!) indicating (eg) the subject number or
-A list of integers, one for each subject or an empty string if samples are all independent.
-If not empty, there must be exactly as many integers in the supplied integer list as there are columns (samples) in the count matrix.
-Integers for samples that are not in the analysis *must* be present in the string as filler even if not used.
-
-So if you have 2 pairs out of 6 samples, you need to put in unique integers for the unpaired ones
-eg if you had 6 samples with the first two independent but the second and third pairs each being from independent subjects. you might use
-8,9,1,1,2,2
-as subject IDs to indicate two paired samples from the same subject in columns 3/4 and 5/6
-
-**Output**
-
-A matrix which consists the original data and relative expression levels and some helpful plots
-
-**Note on edgeR versions**
-
-The edgeR authors made a small cosmetic change in the name of one important variable (from p.value to PValue)
-breaking this and all other code that assumed the old name for this variable,
-between edgeR2.4.4 and 2.4.6 (the version for R 2.14 as at the time of writing).
-This means that all code using edgeR is sensitive to the version. I think this was a very unwise thing
-to do because it wasted hours of my time to track down and will similarly cost other edgeR users dearly
-when their old scripts break. This tool currently now works with 2.4.6.
-
-**Note on prior.N**
-
-http://seqanswers.com/forums/showthread.php?t=5591 says:
-
-*prior.n*
-
-The value for prior.n determines the amount of smoothing of tagwise dispersions towards the common dispersion.
-You can think of it as like a "weight" for the common value. (It is actually the weight for the common likelihood
-in the weighted likelihood equation). The larger the value for prior.n, the more smoothing, i.e. the closer your
-tagwise dispersion estimates will be to the common dispersion. If you use a prior.n of 1, then that gives the
-common likelihood the weight of one observation.
-
-In answer to your question, it is a good thing to squeeze the tagwise dispersions towards a common value,
-or else you will be using very unreliable estimates of the dispersion. I would not recommend using the value that
-you obtained from estimateSmoothing()---this is far too small and would result in virtually no moderation
-(squeezing) of the tagwise dispersions. How many samples do you have in your experiment?
-What is the experimental design? If you have few samples (less than 6) then I would suggest a prior.n of at least 10.
-If you have more samples, then the tagwise dispersion estimates will be more reliable,
-so you could consider using a smaller prior.n, although I would hesitate to use a prior.n less than 5.
-
-
-From Bioconductor Digest, Vol 118, Issue 5, Gordon writes:
-
-Dear Dorota,
-
-The important settings are prior.df and trend.
-
-prior.n and prior.df are related through prior.df = prior.n * residual.df,
-and your experiment has residual.df = 36 - 12 = 24.  So the old setting of
-prior.n=10 is equivalent for your data to prior.df = 240, a very large
-value.  Going the other way, the new setting of prior.df=10 is equivalent
-to prior.n=10/24.
-
-To recover old results with the current software you would use
-
-  estimateTagwiseDisp(object, prior.df=240, trend="none")
-
-To get the new default from old software you would use
-
-  estimateTagwiseDisp(object, prior.n=10/24, trend=TRUE)
-
-Actually the old trend method is equivalent to trend="loess" in the new
-software. You should use plotBCV(object) to see whether a trend is
-required.
-
-Note you could also use
-
-  prior.n = getPriorN(object, prior.df=10)
-
-to map between prior.df and prior.n.
-
-</help>
-
-</tool>
-
-