Mercurial > repos > fubar > edger_test
changeset 1:34cbb9e749da draft
Uploaded
author | fubar |
---|---|
date | Wed, 12 Jun 2013 03:00:31 -0400 |
parents | 82e0af566160 |
children | 515a5e61874c |
files | rgedgeR/rgGSEA.py rgedgeR/rgGSEAcolumns.xml rgedgeR/rgedgeR.xml rgedgeR/rgedgeRglm.xml rgedgeR/rgedgeRpaired.xml.iaas1 |
diffstat | 5 files changed, 0 insertions(+), 2996 deletions(-) [+] |
line wrap: on
line diff
--- a/rgedgeR/rgGSEA.py Wed Jun 12 02:58:43 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,494 +0,0 @@ -""" -April 2013 -eeesh GSEA does NOT respect the mode flag! - -Now realise that the creation of the input rank file for gsea needs to take the lowest p value for duplicate -feature names. To make Ish's life easier, remove duplicate gene ids from any gene set to stop GSEA from -barfing. - -October 14 2012 -Amazingly long time to figure out that GSEA fails with useless error message if any filename contains a dash "-" -eesh. - -Added history .gmt source - requires passing a faked name to gsea -Wrapper for GSEA http://www.broadinstitute.org/gsea/index.jsp -Started Feb 22 -Copyright 2012 Ross Lazarus -All rights reserved -Licensed under the LGPL - -called eg as - -#!/bin/sh -GALAXY_LIB="/data/extended/galaxy/lib" -if [ "$GALAXY_LIB" != "None" ]; then - if [ -n "$PYTHONPATH" ]; then - PYTHONPATH="$GALAXY_LIB:$PYTHONPATH" - else - PYTHONPATH="$GALAXY_LIB" - fi - export PYTHONPATH -fi - -cd /data/extended/galaxy/database/job_working_directory/027/27311 -python /data/extended/galaxy/tools/rgenetics/rgGSEA.py --input_tab "/data/extended/galaxy/database/files/033/dataset_33806.dat" --adjpvalcol "5" --signcol "2" ---idcol "1" --outhtml "/data/extended/galaxy/database/files/034/dataset_34455.dat" --input_name "actaearly-Controlearly-actalate-Controllate_topTable.xls" ---setMax "500" --setMin "15" --nPerm "1000" --plotTop "20" ---gsea_jar "/data/extended/galaxy/tool-data/shared/jars/gsea2-2.0.12.jar" ---output_dir "/data/extended/galaxy/database/job_working_directory/027/27311/dataset_34455_files" --mode "Max_probe" - --title " actaearly-Controlearly-actalate-Controllate_interpro_GSEA" --builtin_gmt "/data/genomes/gsea/3.1/IPR_DOMAIN.gmt" - - -""" -import optparse -import tempfile -import os -import sys -import subprocess -import time -import shutil -import glob -import math -import re - -KEEPSELECTION = False # detailed records for selection of multiple probes - -def timenow(): - """return current time as a string - """ - return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time())) - - - -def fix_subdir(adir,destdir): - """ Galaxy wants everything in the same files_dir - if os.path.exists(adir): - for (d,dirs,files) in os.path.walk(adir): - for f in files: - sauce = os.path.join(d,f) - shutil.copy(sauce,destdir) - """ - - def fixAffycrap(apath=''): - """class='richTable'>RUNNING ES</th><th class='richTable'>CORE ENRICHMENT</th><tr><td class='lessen'>1</td> - <td><a href='https://www.affymetrix.com/LinkServlet?probeset=LBR'>LBR</a></td><td></td><td></td><td>1113</td> - <td>0.194</td><td>-0.1065</td><td>No</td></tr><tr><td class='lessen'>2</td><td> - <a href='https://www.affymetrix.com/LinkServlet?probeset=GGPS1'>GGPS1</a></td><td></td><td></td><td>4309</td><td>0.014</td><td>-0.4328</td> - <td>No</td></tr> - """ - html = [] - try: - html = open(apath,'r').readlines() - except: - return html - for i,row in enumerate(html): - row = re.sub('https\:\/\/www.affymetrix.com\/LinkServlet\?probeset=',"http://www.genecards.org/index.php?path=/Search/keyword/",row) - html[i] = row - return html - - cleanup = False - if os.path.exists(adir): - flist = os.listdir(adir) # get all files created - for f in flist: - apath = os.path.join(adir,f) - dest = os.path.join(destdir,f) - if not os.path.isdir(apath): - if os.path.splitext(f)[1].lower() == '.html': - html = fixAffycrap(apath) - fixed = open(apath,'w') - fixed.write('\n'.join(html)) - fixed.write('\n') - fixed.close() - if not os.path.isfile(dest): - shutil.copy(apath,dest) - else: - fix_subdir(apath,destdir) - if cleanup: - try: - shutil.rmtree(path=adir,ignore_errors=True) - except: - pass - - - -def getFileString(fpath, outpath): - """ - format a nice file size string - """ - size = '' - fp = os.path.join(outpath, fpath) - s = fpath - if os.path.isfile(fp): - n = float(os.path.getsize(fp)) - if n > 2**20: - size = ' (%1.1f MB)' % (n/2**20) - elif n > 2**10: - size = ' (%1.1f KB)' % (n/2**10) - elif n > 0: - size = ' (%d B)' % (int(n)) - s = '%s %s' % (fpath, size) - return s - -class gsea_wrapper: - """ - GSEA java desktop client has a CL interface. CL can be gleaned by clicking the 'command line' button after setting up an analysis - We don't want gsea to do the analysis but it can read .rnk files containing rows of identifiers and an evidence weight such as the signed t statistic from limma for differential expression -(vgalaxy)rlazarus@iaas1:~/public_html/idle_illumina_analysis$ cat gseaHumanREFSEQ.sh -#!/bin/bash -for RNK in `ls *.rnk` -do -DIRNAME=${RNK%.*} -echo $DIRNAME -qsub -cwd -b y java -Xmx4096m -cp /data/app/bin/gsea2-2.07.jar xtools.gsea.GseaPreranked -gmx ../msigdb.v3.0.symbols.gmt -collapse true -mode Max_probe -norm meandiv --nperm 1000 -rnk $RNK -scoring_scheme weighted -rpt_label $RNK -chip ../RefSeq_human.chip -include_only_symbols true -make_sets true -plot_top_x 20 -rnd_seed timestamp --set_max 500 -set_min 15 -zip_report false -out gseaout/${DIRNAME} -gui false -done - """ - - def __init__(self,myName=None,opts=None): - """ setup cl for gsea - """ - self.idcol = 0 - self.signcol = 0 - self.adjpvalcol = 0 - self.progname=myName - self.opts = opts - remove_duplicates=True - if not os.path.isdir(opts.output_dir): - try: - os.makedirs(opts.output_dir) - except: - print >> sys.stderr,'##Error: GSEA wrapper unable to create or find output directory %s. Stopping' % (opts.output_dir) - sys.exit(1) - fakeGMT = re.sub('[^a-zA-Z0-9_]+', '', opts.input_name) # gives a more useful title for the GSEA report - fakeGMT = os.path.join(opts.output_dir,fakeGMT) - fakeGMT = os.path.abspath(fakeGMT) - fakeRanks = '%s.rnk' % fakeGMT - if not fakeGMT.endswith('.gmt'): - fakeGMT = '%s.gmt' % fakeGMT - if opts.builtin_gmt and opts.history_gmt: - newfile = open(fakeGMT,'w') - subprocess.call(['cat',opts.builtin_gmt,opts.history_gmt],stdout=newfile) - newfile.close() - elif opts.history_gmt: - subprocess.call(['cp',opts.history_gmt,fakeGMT]) - else: - subprocess.call(['cp',opts.builtin_gmt,fakeGMT]) - # remove dupes from each gene set - gmt = open(fakeGMT,'r').readlines() - gmt = [x for x in gmt if len(x.split('\t')) > 3] - ugmt = [] - for i,row in enumerate(gmt): - rows = row.rstrip().split('\t') - gmtname = rows[0] - gmtcomment = rows[1] - glist = list(set(rows[2:])) - newgmt = [gmtname,gmtcomment] - newgmt += glist - ugmt.append('\t'.join(newgmt)) - gmt = open(fakeGMT,'w') - gmt.write('\n'.join(ugmt)) - gmt.write('\n') - gmt.close() - if opts.input_ranks: - infname = opts.input_ranks - rdat = open(opts.input_ranks,'r').readlines() # suck in and remove blank ids that cause gsea to barf rml april 10 2012 - rdat = [x.rstrip().split('\t') for x in rdat[1:]] # ignore head - dat = [[x[0],x[1],x[1]] for x in rdat] - # fake same structure as input tabular file - try: - pvals = [float(x[1]) for x in dat] - signs = [float(x[1]) for x in dat] - except: - print >> sys.stderr, '## error converting floating point - cannot process this input' - sys.exit(99) - else: # read tabular - self.idcol = int(opts.idcol) - 1 - self.signcol = int(opts.signcol) - 1 - self.adjpvalcol = int(opts.adjpvalcol) - 1 - maxcol = max(self.idcol,self.signcol,self.adjpvalcol) - infname = opts.input_tab - indat = open(opts.input_tab,'r').readlines() - dat = [x.rstrip().split('\t') for x in indat[1:]] - dat = [x for x in dat if len(x) > maxcol] - dat = [[x[self.idcol],x[self.adjpvalcol],x[self.signcol]] for x in dat] # reduce to rank form - pvals = [float(x[1]) for x in dat] - outofrange = [x for x in pvals if ((x < 0.0) or (x > 1.0))] - assert len(outofrange) == 0, '## p values outside 0-1 encountered - was that the right column for adjusted p value?' - signs = [float(x[2]) for x in dat] - outofrange = [i for i,x in enumerate(signs) if (not x) and (dat[i][self.signcol] <> '0')] - bad = [dat[x][2] for x in outofrange] - assert len(outofrange) == 0, '## null numeric values encountered for sign - was that the right column? %s' % bad - ids = [x[0] for x in dat] - res = [] - self.comments = [] - useme = [] - for i,row in enumerate(dat): - if row[1].upper() != 'NA' and row[2].upper() != 'NA' and row[0] != '' : - useme.append(i) - lost = len(dat) - len(useme) - if lost <> 0: - newdat = [dat[x] for x in useme] - del dat - dat = newdat - print >> sys.stdout, '## %d lost - NA values or null id' % lost - if remove_duplicates: - uids = list(set(ids)) # complex procedure to get min pval for each unique id - if len(uids) <> len(ids): # dupes - deal with mode - print >> sys.stdout,'## Dealing with %d uids in %d ids' % (len(uids),len(ids)) - ures = {} - for i,id in enumerate(ids): - p = pvals[i] - ures.setdefault(id,[]) - ures[id].append((p,signs[i])) - for id in uids: - tlist = ures[id] - tp = [x[0] for x in tlist] - ts = [x[1] for x in tlist] - if len(tp) == 1: - p = tp[0] - sign = ts[0] - else: - if opts.mode == "Max_probe": - p = min(tp) - sign = ts[tp.index(p)] - else: # guess median - too bad if even count - tp.sort() - ltp = len(tp) - ind = ltp/2 # yes, this is wrong for evens but what if sign wobbles? - if ltp % 2 == 1: # odd - ind += 1 # take the median - p = tp[ind] - sign = ts[ind] - if KEEPSELECTION: - self.comments.append('## for id=%s, got tp=%s, ts=%s, chose p=%f, sign =%f'\ - % (id,str(tp),str(ts),p,sign)) - if opts.input_ranks: # must be a rank file - res.append((id,'%f' % p)) - else: - if p == 0.0: - p = 1e-99 - try: - lp = -math.log10(p) # large positive if low p value - except ValueError: - lp = 0.0 - if sign < 0: - lp = -lp # if negative, swap p to negative - res.append((id,'%f' % lp)) - else: # no duplicates - for i,row in enumerate(dat): - (id,p,sign) = (row[0],float(row[1]),float(row[2])) - if opts.input_ranks: # must be a rank file - res.append((id,'%f' % p)) - else: - if p == 0.0: - p = 1e-99 - try: - lp = -math.log10(p) # large positive if low p value - except ValueError: - lp = 0.0 - if sign < 0: - lp = -lp # if negative, swap p to negative - res.append((id,'%f' % lp)) - else: - for i,row in enumerate(dat): - (id,p,sign) = (row[0],float(row[1]),float(row[2])) - if opts.input_ranks: # must be a rank file - res.append((id,'%f' % p)) - else: - if p == 0.0: - p = 1e-99 - try: - lp = -math.log10(p) # large positive if low p value - except ValueError: - lp = 0.0 - if sign < 0: - lp = -lp # if negative, swap p to negative - res.append((id,'%f' % lp)) - len1 = len(ids) - len2 = len(ranks) - delta = len1 - len2 - if delta <> 0: - print >> sys.stdout,'NOTE: %d of %d rank input file %s rows deleted - dup, null or NA IDs, pvals or logFCs' % (delta,len1,infname) - ranks = [(float(x[1]),x) for x in ranks] # decorate - ranks.sort() - ranks.reverse() - ranks = [x[1] for x in ranks] # undecorate - if opts.chip == '': # if mouse - need HUGO - ranks = [[x[0].upper(),x[1]] for x in ranks] - print >> sys.stdout, '## Fixed any lower case - now have',','.join([x[0] for x in ranks[:5]]) - ranks = ['\t'.join(x) for x in ranks] - if len(ranks) < 2: - print >> sys.stderr,'Input %s has 1 or less rows with two tab delimited fields - please check the tool documentation' % infname - sys.exit(1) - print '### opening %s and writing %s' % (fakeRanks,str(ranks[:10])) - rclean = open(fakeRanks,'w') - rclean.write('contig\tscore\n') - rclean.write('\n'.join(ranks)) - rclean.write('\n') - rclean.close() - cl = [] - a = cl.append - a('java -Xmx6G -cp') - a(opts.gsea_jar) - a('xtools.gsea.GseaPreranked') - a('-gmx %s' % fakeGMT) # ensure .gmt extension as required by GSEA - gene sets to use - a('-gui false') # use preranked file mode and no gui - a('-make_sets true -rnd_seed timestamp') # more things from the GUI command line display - a('-norm meandiv -zip_report false -scoring_scheme weighted') # ? need to set these? - a('-rnk %s' % fakeRanks) # input ranks file symbol (the chip file is the crosswalk for ids in first column) - a('-out %s' % opts.output_dir) - a('-set_max %s' % opts.setMax) - a('-set_min %s' % opts.setMin) - a('-mode %s' % opts.mode) - if opts.chip > '': - #a('-chip %s -collapse true -include_only_symbols true' % opts.chip) - a('-chip %s -collapse true' % opts.chip) - else: - a("-collapse false") # needed if no chip - a('-nperm %s' % opts.nPerm) - a('-rpt_label %s' % opts.title) - a('-plot_top_x %s' % opts.plotTop) - self.cl = cl - self.comments.append('## GSEA command line:') - self.comments.append(' '.join(self.cl)) - self.fakeRanks = fakeRanks - self.fakeGMT = fakeGMT - - def grepIds(self): - """ - """ - found = [] - allids = open(self.opts.input_ranks,'r').readlines() - allids = [x.strip().split() for x in allids] - allids = [x[0] for x in allids] # list of ids - gmtpath = os.path.split(self.opts.fakeGMT)[0] # get path to all chip - - def run(self): - """ - - """ - tlog = os.path.join(self.opts.output_dir,"gsea_runner.log") - sto = open(tlog,'w') - x = subprocess.Popen(' '.join(self.cl),shell=True,stdout=sto,stderr=sto,cwd=self.opts.output_dir) - retval = x.wait() - sto.close() - d = glob.glob(os.path.join(self.opts.output_dir,'%s*' % self.opts.title)) - if len(d) > 0: - fix_subdir(d[0],self.opts.output_dir) - htmlfname = os.path.join(self.opts.output_dir,'index.html') - try: - html = open(htmlfname,'r').readlines() - html = [x.strip() for x in html if len(x.strip()) > 0] - if len(self.comments) > 0: - s = ['<pre>'] - s += self.comments - s.append('</pre>') - try: - i = html.index('<div id="footer">') - except: - i = len(html) - 7 # fudge - html = html[:i] + s + html[i:] - except: - html = [] - htmlhead = '<html><head></head><body>' - html.append('## Galaxy GSEA wrapper failure') - html.append('## Unable to find index.html in %s - listdir=%s' % (d,' '.join(os.listdir(self.opts.output_dir)))) - html.append('## Command line was %s' % (' '.join(self.cl))) - html.append('## commonly caused by mismatched ID/chip selection') - glog = open(os.path.join(self.opts.output_dir,'gsea_runner.log'),'r').readlines() - html.append('## gsea_runner.log=%s' % '\n'.join(glog)) - #tryme = self.grepIds() - retval = 1 - print >> sys.stderr,'\n'.join(html) - html = ['%s<br/>' % x for x in html] - html.insert(0,htmlhead) - html.append('</body></html>') - htmlf = file(self.opts.outhtml,'w') - htmlf.write('\n'.join(html)) - htmlf.write('\n') - htmlf.close() - os.unlink(self.fakeRanks) - os.unlink(self.fakeGMT) - if opts.outtab_neg: - tabs = glob.glob(os.path.join(opts.output_dir,"gsea_report_for_*.xls")) - if len(tabs) > 0: - for tabi,t in enumerate(tabs): - tkind = os.path.basename(t).split('_')[4].lower() - if tkind == 'neg': - outtab = opts.outtab_neg - elif tkind == 'pos': - outtab = opts.outtab_pos - else: - print >> sys.stderr, '## tab file matched %s which is not "neg" or "pos" in 4th segment %s' % (t,tkind) - sys.exit() - content = open(t).readlines() - tabf = open(outtab,'w') - tabf.write(''.join(content)) - tabf.close() - else: - print >> sys.stdout, 'Odd, maketab = %s but no matches - tabs = %s' % (makeTab,tabs) - return retval - - -if __name__ == "__main__": - """ - called as: - <command interpreter="python">rgGSEA.py --input_ranks "$input1" --outhtml "$html_file" - --setMax "$setMax" --setMin "$setMin" --nPerm "$nPerm" --plotTop "$plotTop" --gsea_jar "$GALAXY_DATA_INDEX_DIR/shared/jars/gsea2-2.07.jar" - --output_dir "$html_file.files_path" --use_gmt ""${use_gmt.fields.path}"" --chip "${use_chip.fields.path}" - </command> - """ - op = optparse.OptionParser() - a = op.add_option - a('--input_ranks',default=None) - a('--input_tab',default=None) - a('--input_name',default=None) - a('--use_gmt',default=None) - a('--history_gmt',default=None) - a('--builtin_gmt',default=None) - a('--history_gmt_name',default=None) - a('--setMax',default="500") - a('--setMin',default="15") - a('--nPerm',default="1000") - a('--title',default="GSEA report") - a('--chip',default='') - a('--plotTop',default='20') - a('--outhtml',default=None) - a('--makeTab',default=None) - a('--output_dir',default=None) - a('--outtab_neg',default=None) - a('--outtab_pos',default=None) - a('--adjpvalcol',default=None) - a('--signcol',default=None) - a('--idcol',default=None) - a('--mode',default='Max_probe') - a('-j','--gsea_jar',default='/usr/local/bin/gsea2-2.07.jar') - opts, args = op.parse_args() - assert os.path.isfile(opts.gsea_jar),'## GSEA runner unable to find supplied gsea java desktop executable file %s' % opts.gsea_jar - if opts.input_ranks: - inpf = opts.input_ranks - else: - inpf = opts.input_tab - assert opts.idcol <> None, '## GSEA runner needs an id column if a tabular file provided' - assert opts.signcol <> None, '## GSEA runner needs a sign column if a tabular file provided' - assert opts.adjpvalcol <> None, '## GSEA runner needs an adjusted p value column if a tabular file provided' - assert os.path.isfile(inpf),'## GSEA runner unable to open supplied input file %s' % inpf - if opts.chip > '': - assert os.path.isfile(opts.chip),'## GSEA runner unable to open supplied chip file %s' % opts.chip - some = None - if opts.history_gmt <> None: - some = 1 - assert os.path.isfile(opts.history_gmt),'## GSEA runner unable to open supplied history gene set matrix (.gmt) file %s' % opts.history_gmt - if opts.builtin_gmt <> None: - some = 1 - assert os.path.isfile(opts.builtin_gmt),'## GSEA runner unable to open supplied history gene set matrix (.gmt) file %s' % opts.builtin_gmt - assert some, '## GSEA runner needs a gene set matrix file - none chosen?' - opts.title = re.sub('[^a-zA-Z0-9_]+', '', opts.title) - myName=os.path.split(sys.argv[0])[-1] - gse = gsea_wrapper(myName, opts=opts) - retcode = gse.run() - if retcode <> 0: - sys.exit(retcode) # indicate failure to job runner - -
--- a/rgedgeR/rgGSEAcolumns.xml Wed Jun 12 02:58:43 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,834 +0,0 @@ -<tool id="rgGSEAcolumns" name="Gene set enrichment" version="0.05"> - <description>using a generic tabular file</description> - <requirements> - <requirement type="package" version="2.0.12">gsea_jar</requirement> - <requirement type="set_environment">GSEAJAR_PATH</requirement> - </requirements> - <command interpreter="python">rgGSEA.py --input_tab "$input1" --adjpvalcol "$adjpvalcol" --signcol "$signcol" - --idcol "$idcol" --outhtml "$html_file" --input_name "${input1.name}" - --setMax "$setMax" --setMin "$setMin" --nPerm "$nPerm" --plotTop "$plotTop" - --gsea_jar "\$GSEAJAR_PATH" - --output_dir "$html_file.files_path" --mode "$mode" --title "$title" -#if $makeTab.value=='Yes' - --outtab_pos "$outtab_pos" --outtab_neg "$outtab_neg" -#end if -#if $gmtSource.refgmtSource == "indexed" or $gmtSource.refgmtSource == "both": ---builtin_gmt "${gmtSource.builtinGMT.fields.path}" -#end if -#if $gmtSource.refgmtSource == "history" or $gmtSource.refgmtSource == "both": ---history_gmt "${gmtSource.ownGMT}" --history_gmt_name "${gmtSource.ownGMT.name}" -#end if -#if $chipSource.refchipSource=="builtin" - --chip "${chipSource.builtinChip.fields.path}" -#end if -#if $chipSource.refchipSource=="history" - --chip "${chipSource.ownChip}" -#end if -</command> - <inputs> - <param name="input1" type="data" format="tabular" label="Select a tab delimited file with a probe id, adjusted p value and a signed weight (eg t-test statistic) on each row" - help=""/> - <param name="adjpvalcol" label="Column containing a p value for the DEG statistical test" - help = "Use RAW p-values - not FDR adjusted as these have a non GSEA friendly non-uniform distribution" - type="data_column" data_ref="input1" numerical="True" - multiple="false" use_header_names="true" size="5"> - <validator type="no_options" message="Please select a p value column."/> - </param> - <param name="signcol" label="Column containing the DE sign - eg log fold change so positive/negative values = upregulated/downregulated in treatment" - type="data_column" data_ref="input1" numerical="True" - multiple="false" use_header_names="true" size="5"> - <validator type="no_options" message="Please select a sign column."/> - </param> - <param name="idcol" label="Column containing a gene id (refseq, symbol or Entrez" - type="data_column" data_ref="input1" numerical="False" - multiple="false" use_header_names="true" size="5"> - <validator type="no_options" message="Please select an id column."/> - </param> - - <param name="title" type="text" value="GSEA" size="80" label="Title for job outputs" help="Supply a meaningful name here to remind you what the outputs contain"/> - <param name="setMin" type="integer" label="Minimum gene set size to prune (default=15)" size="5" value="15"/> - <param name="setMax" type="integer" label="Maximum gene set size to prune (default=500)" size="5" value="500"/> - <param name="nPerm" type="integer" label="Number of permutations (default=1000)" size="7" value="1000"/> - <param name="plotTop" type="integer" label="Number of top gene sets to plot and present in detailed reports(default=20)" size="10" value="20"/> - <param name="mode" type="select" label="Mode for dealing with duplicated gene ids" > - <option value="Max_probe" selected="true">Use the most extreme value</option> - <option value="Median_of_probes">Use the median of all supplied values</option> - </param> - <param name="makeTab" type="select" label="Create a tabular report containing ALL gene sets for downstream analysis" > - <option value="Yes">Yes</option> - <option value="No" selected="true">No</option> - </param> - <conditional name="chipSource" > - <param name="refchipSource" type="select" label="Translate the rank file IDs (first column) using a GSEA 'chip' lookup table"> - <option value="builtin">Use a builtin GSEA 'chip' file</option> - <option value="history">Use one of the GSEA 'chip' files in my history</option> - <option value="no" selected="true">IDs are already gene symbols - no translation needed </option> - </param> - <when value="no"> - </when> - <when value="builtin"> - <param name="builtinChip" type="select" label="Select a GSEA 'chip' to map each probe id in the input file to a gene symbol"> - <options from_data_table="gseaChip_3.1"> - <filter type="sort_by" column="2" /> - <validator type="no_options" message="No chip tables are available - please see the Galaxy GSEA tool documentation for instructions on installing them" /> - </options> - </param> - </when> - <when value="history"> - <param name="ownChip" type="data" format="gseachip" label="Select an existing gseachip filefrom your history. Probe_id matching ids in your input file and a gene_symbol must be the first 2 tabular columns. Header row required." - help='see docs at http://www.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats#CHIP:_Chip_file_format_.28.2A.chip.29'> - <validator type="no_options" message="Need a gseachip datatype - tabular file, probe_id gene_symbol as first 2 columns. Header row required. Convert datatypes (pencil icon) of existing history tabular files if necessary"/> - </param> - </when> - </conditional> - <conditional name="gmtSource"> - <param name="refgmtSource" type="select" label="Use a gene set (.gmt) from your history or use a built-in (MSigDB etc) gene set"> - <option value="indexed" selected="true">Use a built-in gene set</option> - <option value="history">Use a gene set from my history</option> - <option value="both">Add a gene set from my history to a built in gene set</option> - </param> - <when value="indexed"> - <param name="builtinGMT" type="select" label="Select a gene set matrix (.gmt) file to use for the analysis"> - <options from_data_table="gseaGMT_3.1"> - <filter type="sort_by" column="2" /> - <validator type="no_options" message="No GMT v3.1 files are available - please install them"/> - </options> - </param> - </when> - <when value="history"> - <param name="ownGMT" type="data" format="gmt" label="Select a Gene Set from your history" /> - </when> - <when value="both"> - <param name="ownGMT" type="data" format="gseagmt" label="Select a Gene Set from your history" /> - <param name="builtinGMT" type="select" label="Select a gene set matrix (.gmt) file to use for the analysis"> - <options from_data_table="gseaGMT_3.1"> - <filter type="sort_by" column="2" /> - <validator type="no_options" message="No GMT v3.1 files are available - please install them and make sure they match [tool-data]/gseaGMT_3.1.loc, editing as needed"/> - </options> - </param> - </when> - </conditional> - </inputs> - <outputs> - <data format="html" name="html_file" label="${title}.html"/> - <data format="tabular" name="outtab_pos" label="${title}_pos_allsets.xls"> - <filter>makeTab=="Yes"</filter> - </data> - <data format="tabular" name="outtab_neg" label="${title}_neg_allsets.xls"> - <filter>makeTab=="Yes"</filter> - </data> - </outputs> - <tests> - <test> - <param name="input1" value="gsea_test_DGE.xls"/> - <param name="ownGMT" value="gsea_test_DGE.xls"/> - <param name="adjpvalcol" value="5"/> - <param name="signcol" value="2"/> - <param name="idcol" value="1"/> - <param name="plotTop" value="2"/> - <param name="makeTab" value="No"/> - <param name="refchipSource" value="no"/> - <param name="refgmtSource" value="history"/> - <param name="html_file" value="gseatestout.html"> - <assert_contents> - <has_text text="GSEA Report for Dataset" /> - <has_text text="enrichment results in html"/> - </assert_contents> - </param> - <param name="setMax" value="500"/> - <param name="setMin" value="15"/> - <param name="nPerm" value="100"/> - <param name="output_dir" value="gseatestout"/> - <param name="mode" value="Max_probe"/> - <param name="title" value="GSEA test"/> - <param name="builtin_gmt" value="gseatestdata.gmt"/> - </test> - </tests> -<help> -**What it does** - -Performs Gene set enrichment analysis using GSEA_ from the Broad as described in 2005Paper_ and v3.1 of msigdb - -Refer to http://www.pnas.org/cgi/content/abstract/0506580102v1 for details of the method. - -**Input** - -A tabular file with a header row and some columns containing probe ids or gene names; a p value or t-test statistic reflecting the -strength and a signed quantity such as log fold change or a t-test statistic reflecting the direction of differential gene expression from edgeR for counts -or some other valid analysis. -Probe names can be translated using the chip file option (see below) -A filter has been added to remove NA and null id rows as these can't be used. A summary will appear in stdout in the page linked from the "i" (info) button for the output - -**Gene sets** - -If you have your own gene sets as .gmt files (see GUIDE_ for details of format) upload them in your history and use them by nominating a history sourced gmt. - -Alternatively, use one of the built in MSigDB_ (molecular signatures database) sets. -There may be additional locally provided sets such as starBase - -Optionally, a local gmt can be merged with a built in one if you want to see how they perform competitively - -Select the most appropriate subset of MSigDB gene sets or use all by choosing msigdb.v3.0.symbols. -A non-cancer subset called c2_c3_c5_all.v3.0.symbols is recommended if you are working with non-cancer biology. - - -**Chip** - -If your input file contains HUGO gene symbols (TEK, SLC38A1...etc) then you do not need to specify a 'chip'. - -GSEA software allows you to specify a translation table from common microarray probe IDs to gene symbols - select the option to use a chip file and choose the one that matches your probe id's -Failure to match the translation correctly will generally result in failure of the job since none of your input data will be translated properly. - -**Output** - -A report as html - see the GUIDE_ for a guide on interpretation - - -**Attribution** - -This Galaxy tool wrapper was written by Ross Lazarus. -It executes the GSEA_ java application available using MSigDB and Chip files obtained from the GSEA_ web site - - -**Chip ID samples** - -You may be able to look for probe ids in similar form to the ones you have in the list below to find the correct GSEA 'chip' file if you do not have HUGO gene symbols - - :: - - AG.chip - Probe Set ID Gene Symbol Gene Title - 11986_at --- calcium-dependent lipid-binding protein, putative - 11987_at --- zinc finger (C3HC4-type RING finger) family protein - 11988_at --- WD-40 repeat family protein - - Agilent_Human1A.chip - Probe Set ID Gene Symbol Gene Title - A_23_P100001 LOC400451 hypothetical gene supported by AK075564; BC060873 - A_23_P100011 AP3S2 adaptor-related protein complex 3, sigma 2 subunit - A_23_P10002 CEP63 centrosome protein Cep63 - - Agilent_Human1Av2.chip - Probe Set ID Gene Symbol Gene Title - A_23_P100001 LOC400451 hypothetical gene supported by AK075564; BC060873 - A_23_P100011 AP3S2 adaptor-related protein complex 3, sigma 2 subunit - A_23_P10002 CEP63 centrosome protein Cep63 - - Agilent_Human1B.chip - Probe Set ID Gene Symbol Gene Title - A_32_P10002 TUBA2 tubulin, alpha 2 - A_32_P100076 RPL19 ribosomal protein L19 - A_32_P100100 THOC3 THO complex 3 - - Agilent_Human1_cDNA.chip - Probe Set ID Gene Symbol Gene Title - 1 RNASEH2A ribonuclease H2, large subunit - 10 SF3B1 splicing factor 3b, subunit 1, 155kDa - 100 S100A16 S100 calcium binding protein A16 - - Agilent_HumanGenome.chip - Probe Set ID Gene Symbol Gene Title - A_23_P100001 LOC400451 hypothetical gene supported by AK075564; BC060873 - A_23_P100011 NULL NULL - A_23_P100022 SV2B synaptic vesicle glycoprotein 2B - - Agilent_Mouse_cDNA.chip - Probe Set ID Gene Symbol Gene Title - 1000982 SLC6A9 solute carrier family 6 (neurotransmitter transporter, glycine), member 9 - 1001007 IGF1 insulin-like growth factor 1 - 1001011 FCNA ficolin A - - Agilent_MouseDev.chip - Probe Set ID Gene Symbol Gene Title - A_65_P00523 NULL NULL - A_65_P00524 NULL NULL - A_65_P00525 SFTPD surfactant associated protein D - - Agilent_MouseGenome.chip - Probe Set ID Gene Symbol Gene Title - A_51_P100021 NULL NULL - A_51_P100034 2310075G12RIK RIKEN cDNA 2310075G12 gene - A_51_P100052 SLITRK2 SLIT and NTRK-like family, member 2 - - Agilent_MouseOligo.chip - Probe Set ID Gene Symbol Gene Title - A_51_P100021 NULL NULL - A_51_P100034 2310075G12RIK RIKEN cDNA 2310075G12 gene - A_51_P100052 SLITRK2 SLIT and NTRK-like family, member 2 - - Agilent_RatGenome_G4131A.chip - Probe Set ID Gene Symbol Gene Title - A_42_P453055 LU Lutheran blood group (Auberger b antigen included) - A_42_P453737 NULL NULL - A_42_P453894 RGD1306682_PREDICTED similar to RIKEN cDNA 1810046J19 (predicted) - - Agilent_RatOligo.chip - Probe Set ID Gene Symbol Gene Title - A_42_P453131 CNTN1 contactin 1 - A_42_P453151 TTC8_PREDICTED tetratricopeptide repeat domain 8 (predicted) - A_42_P453162 NULL NULL - - APPLERA_ABI1700.chip - Probe Set ID Gene Symbol Gene Title - 100002 NULL NULL - 100003 NULL NULL - 100027 ZNF192 zinc finger protein 192 - - ATH1_121501.chip - Probe Set ID Gene Symbol Gene Title - 244901_at --- --- - 244902_at --- --- - 244903_at --- --- - - AtlasMouse1.2.chip - Probe Set ID Gene Symbol Gene Title - MA099 HTR1A 5-hydroxytryptamine (serotonin) receptor 1A - MA015 JARID2 jumonji, AT rich interactive domain 2 - M443 CDKN2C cyclin-dependent kinase inhibitor 2C (p18, inhibits CDK4) - - AtlasRat1.2.chip - Probe Set ID Gene Symbol Gene Title - R484 TPP2 tripeptidylpeptidase II - RG348 MOS v-mos moloney murine sarcoma viral oncogene homolog - RG119 KCNJ1 potassium inwardly-rectifying channel, subfamily J, member 1 - - Bovine.chip - Probe Set ID Gene Symbol Gene Title - AFFX-BioB-3_at --- --- - AFFX-BioB-5_at --- --- - AFFX-BioB-M_at --- --- - - Caltech16KMouse.chip - Probe Set ID Gene Symbol Gene Title - 6712 CCL6 chemokine (C-C motif) ligand 6 - 8563 ACTL7B actin-like 7b - 8434 1700049M11RIK RIKEN cDNA 1700049M11 gene - - Caltech16KOligoMouse.chip - Probe Set ID Gene Symbol Gene Title - 6712 GCA grancalcin, EF-hand calcium binding protein - 8563 MARVELD2 MARVEL (membrane-associating) domain containing 2 - 8434 BC048825 NULL - - Canine_2.chip - Probe Set ID Gene Symbol Gene Title - AFFX-BioB-3_at --- --- - AFFX-BioB-5_at --- --- - AFFX-BioB-M_at --- --- - - Clontech_Atlas_13K.chip - Probe Set ID Gene Symbol Gene Title - 1-A01 NULL NULL - 1-A02 A2M alpha-2-macroglobulin - 1-A03 AADAC arylacetamide deacetylase (esterase) - - Clontech_BD_Atlas.chip - Probe Set ID Gene Symbol Gene Title - A01a PCP4 Purkinje cell protein 4 - A01b NULL NULL - A01c PMP22 peripheral myelin protein - - CNMCMuscleChip.chip - Probe Set ID Gene Symbol Gene Title - HSPD4570_at MBP myelin basic protein - HSPD7328_at MAPK12 mitogen-activated protein kinase 12 - HSPD17028_i_at DIA1 diaphorase (NADH) (cytochrome b-5 reductase) - - CodeLink_Human_Whole_Genome.chip - Probe Set ID Gene Symbol Gene Title - GE469530 NULL NULL - GE469548 TMEM1 transmembrane protein 1 - GE469549 NULL NULL - - CodeLink_UniSet_Human_20K_I_Bioarray.chip - Probe Set ID Gene Symbol Gene Title - 000106CB1_PROBE1 NDUFA4 NADH dehydrogenase (ubiquinone) 1 alpha subcomplex, 4, 9kDa - 000136CB1_PROBE1 MRPS33 mitochondrial ribosomal protein S33 - 000278CB1_PROBE1 FDFT1 farnesyl-diphosphate farnesyltransferase 1 - - CodeLink_UniSet_Human_I_Bioarray.chip - Probe Set ID Gene Symbol Gene Title - AA001334_PROBE1 ZHX2 zinc fingers and homeoboxes 2 - AA002191_PROBE1 PITPNA phosphatidylinositol transfer protein, alpha - AA004381_PROBE1 NULL NULL - - CodeLink_UniSet_Human_II_Bioarray.chip - Probe Set ID Gene Symbol Gene Title - 000106CB1_PROBE1 NDUFA4 NADH dehydrogenase (ubiquinone) 1 alpha subcomplex, 4, 9kDa - 000136CB1_PROBE1 MRPS33 mitochondrial ribosomal protein S33 - 000278CB1_PROBE1 FDFT1 farnesyl-diphosphate farnesyltransferase 1 - - CodeLink_UniSet_Rat_I_Bioarray.chip - Probe Set ID Gene Symbol Gene Title - AA799301_PROBE1 NULL NULL - AA799313_PROBE1 RGD:1303279 sialyltransferase 10 (alpha-2,3-sialyltransferase VI) - AA799329_PROBE1 NULL NULL - - DrosGenome1.chip - Probe Set ID Gene Symbol Gene Title - 141200_at CG9418 CG9418 - 141201_at CG17840 CG17840 - 141202_at QKR58E-3 quaking related 58E-3 - - Drosophila_2.chip - Probe Set ID Gene Symbol Gene Title - 1616608_a_at GPDH Glycerol 3 phosphate dehydrogenase - 1622892_s_at CG33057 /// MKG-P CG33057 /// monkey king protein - 1622893_at IM3 /// RPS10B Immune induced molecule 3 /// Ribosomal protein S10b - - G4110A.chip - Probe Set ID Gene Symbol Gene Title - A_23_P7033 MAGEF1 melanoma antigen, family F, 1 - A_23_P170713 A_23_P170713 ***description missing*** - A_23_P107801 FLJ21742 hypothetical protein FLJ21742 - - G4110Av2.chip - Probe Set ID Gene Symbol Gene Title - A_23_P7033 MAGEF1 melanoma antigen, family F, 1 - A_23_P170713 A_23_P170713 No description available - A_23_P107801 FLJ21742 hypothetical protein FLJ21742 - - GENE_SYMBOL.chip - Probe Set ID Gene Symbol Gene Title Aliases - A1BG A1BG alpha-1-B glycoprotein GABDKFZP686F0970ABGHYST2477A1B - A2M A2M alpha-2-macroglobulin DKFZP779B086S863-7ALPHA 2MFWP007CPAMD5 - A2ML1 A2ML1 alpha-2-macroglobulin-like 1 FLJ39129DKFZP686O1010FLJ41598DKFZP686G1812FLJ25179CPAMD9DKFZP686D2011FLJ16045FLJ41607DKFZP686C1729FLJ41597DKFZP686L1821 - - GenosysCytokineV2.chip - Probe Set ID Gene Symbol Gene Title - M05.2 GDF11 growth differentiation factor 11 - M22.2 AE000170 No description available - E06.3 IL5 interleukin 5 (colony-stimulating factor, eosinophil) - - HC_G110.chip - Probe Set ID Gene Symbol Gene Title - 1000_at MAPK3 mitogen-activated protein kinase 3 - 1001_at TIE1 tyrosine kinase with immunoglobulin-like and EGF-like domains 1 - 1002_f_at CYP2C19 cytochrome P450, family 2, subfamily C, polypeptide 19 - - HG_Focus.chip - Probe Set ID Gene Symbol Gene Title - 1007_s_at DDR1 discoidin domain receptor family, member 1 - 1053_at RFC2 replication factor C (activator 1) 2, 40kDa - 117_at HSPA6 /// LOC652878 heat shock 70kDa protein 6 (HSP70B') /// similar to heat shock 70kDa protein 6 (HSP70B) - - HG_U133A_2.chip - Probe Set ID Gene Symbol Gene Title - 1007_s_at DDR1 discoidin domain receptor family, member 1 - 1053_at RFC2 replication factor C (activator 1) 2, 40kDa - 117_at HSPA6 /// LOC652878 heat shock 70kDa protein 6 (HSP70B') /// similar to heat shock 70kDa protein 6 (HSP70B) - - HG_U133AAofAv2.chip - Probe Set ID Gene Symbol Gene Title - 1007_s_at DDR1 discoidin domain receptor family, member 1 - 1053_at RFC2 replication factor C (activator 1) 2, 40kDa - 117_at HSPA6 /// LOC652878 heat shock 70kDa protein 6 (HSP70B') /// similar to heat shock 70kDa protein 6 (HSP70B) - - HG_U133A.chip - Probe Set ID Gene Symbol Gene Title - 1007_s_at DDR1 discoidin domain receptor family, member 1 - 1053_at RFC2 replication factor C (activator 1) 2, 40kDa - 117_at HSPA6 /// LOC652878 heat shock 70kDa protein 6 (HSP70B') /// similar to heat shock 70kDa protein 6 (HSP70B) - - HG_U133B.chip - Probe Set ID Gene Symbol Gene Title - 200000_s_at PRPF8 PRP8 pre-mRNA processing factor 8 homolog (S. cerevisiae) /// PRP8 pre-mRNA processing factor 8 homolog (S. cerevisiae) - 200001_at CAPNS1 calpain, small subunit 1 /// calpain, small subunit 1 - 200002_at RPL35 ribosomal protein L35 /// ribosomal protein L35 - - HG_U133_Plus_2.chip - Probe Set ID Gene Symbol Gene Title - 1007_s_at DDR1 discoidin domain receptor family, member 1 - 1053_at RFC2 replication factor C (activator 1) 2, 40kDa - 117_at HSPA6 /// LOC652878 heat shock 70kDa protein 6 (HSP70B') /// similar to heat shock 70kDa protein 6 (HSP70B) - - HG_U95Av2.chip - Probe Set ID Gene Symbol Gene Title - 1000_at MAPK3 mitogen-activated protein kinase 3 - 1001_at TIE1 tyrosine kinase with immunoglobulin-like and EGF-like domains 1 - 1002_f_at CYP2C19 cytochrome P450, family 2, subfamily C, polypeptide 19 - - HG_U95B.chip - Probe Set ID Gene Symbol Gene Title - 41880_at --- Transcribed locus - 41881_at FLJ40142 FLJ40142 protein - 41882_at FBXO42 F-box protein 42 - - HG_U95C.chip - Probe Set ID Gene Symbol Gene Title - 48609_r_at KIAA1333 KIAA1333 - 48610_at C20ORF102 chromosome 20 open reading frame 102 - 48612_at N4BP1 /// LOC653213 Nedd4 binding protein 1 /// similar to Nedd4 binding protein 1 - - HG_U95D.chip - Probe Set ID Gene Symbol Gene Title - 67020_at --- Transcribed locus - 67021_at FAM44B Family with sequence similarity 44, member B - 67023_at --- Transcribed locus - - HG_U95E.chip - Probe Set ID Gene Symbol Gene Title - 67022_at PIP5K2B Phosphatidylinositol-4-phosphate 5-kinase, type II, beta - 67024_at CASD1 CAS1 domain containing 1 - 67026_at NFASC neurofascin homolog (chicken) - - HPCGGCompugenAnnotations.chip - Probe Set ID Gene Symbol Gene Title - MGI:1913066 SH3MD2 Sh3md2, SH3 multiple domains 2 - MGI:1913099 5430413I02RIK 5430413I02Rik, RIKEN cDNA 5430413I02 gene - MGI:98848 TSHB Tshb, thyroid stimulating hormone, beta subunit - - HT_HG_U133A.chip - Probe Set ID Gene Symbol Gene Title - 1007_s_at DDR1 discoidin domain receptor family, member 1 - 1053_at RFC2 replication factor C (activator 1) 2, 40kDa - 117_at HSPA6 /// LOC652878 heat shock 70kDa protein 6 (HSP70B') /// similar to heat shock 70kDa protein 6 (HSP70B) - - Hu35KsubA.chip - Probe Set ID Gene Symbol Gene Title - AA000993_at PRDM8 PR domain containing 8 - AA001296_s_at PHF23 PHD finger protein 23 - AA002245_at ZC3H11A zinc finger CCCH-type containing 11A - - Hu35KsubB.chip - Probe Set ID Gene Symbol Gene Title - AFFX-BioB-3_at --- --- - AFFX-BioB-3_st --- --- - AFFX-BioB-5_at --- --- - - Hu35KsubC.chip - Probe Set ID Gene Symbol Gene Title - AFFX-BioB-3_at --- --- - AFFX-BioB-3_st --- --- - AFFX-BioB-5_at --- --- - - Hu35KsubD.chip - Probe Set ID Gene Symbol Gene Title - AFFX-BioB-3_at --- --- - AFFX-BioB-3_st --- --- - AFFX-BioB-5_at --- --- - - Hu6800.chip - Probe Set ID Gene Symbol Gene Title - A28102_at GABRA3 gamma-aminobutyric acid (GABA) A receptor, alpha 3 - AB000114_at OMD osteomodulin - AB000115_at IFI44L interferon-induced protein 44-like - - Illumina_Human.chip - Probe Set ID Gene Symbol Gene Title - 1-A-1 BCAP31 B-cell receptor-associated protein 31 - 1-A-10 NSFL1C NSFL1 (p97) cofactor (p47) - 1-A-11 C20ORF24 chromosome 20 open reading frame 24 - - ilmn_HumanHT_12_V4_0_R1_15002873_B.chip - Probe Set ID Gene Symbol Gene Title Entrez Gene ID - ILMN_1725881 LOC23117 PREDICTED: Homo sapiens KIAA0220-like protein, transcript variant 11 (LOC23117), mRNA. 23117 - ILMN_1910180 Homo sapiens cDNA: FLJ21027 fis, clone CAE07110 -999 - ILMN_1804174 FCGR2B PREDICTED: Homo sapiens Fc fragment of IgG, low affinity IIb, receptor (CD32) (FCGR2B), mRNA. 2213 - - ilmn_MouseRef_8_V2_0_R3_11278551_A.chip - Probe Set ID Gene Symbol Gene Title Entrez Gene ID - ILMN_2607609 YIF1B Mus musculus Yip1 interacting factor homolog B (S. cerevisiae) (Yif1b), mRNA. 77254 - ILMN_1238674 2700007P21RIK Mus musculus RIKEN cDNA 2700007P21 gene (2700007P21Rik), transcript variant 2, mRNA. 212772 - ILMN_3062534 2700007P21RIK Mus musculus RIKEN cDNA 2700007P21 gene (2700007P21Rik), transcript variant 2, mRNA. 212772 - - ilmn_MouseWG_6_V2_0_R3_11278593_A.chip - Probe Set ID Gene Symbol Gene Title Entrez Gene ID - ILMN_1243094 THRSP NULL -999 - ILMN_1238674 2700007P21RIK Mus musculus RIKEN cDNA 2700007P21 gene (2700007P21Rik), transcript variant 2, mRNA. 212772 - ILMN_2454720 2700007P21RIK Mus musculus RIKEN cDNA 2700007P21 gene (2700007P21Rik), transcript variant 2, mRNA. 212772 - - labonweb_human.chip - Probe Set ID Gene Symbol Gene Title - 1-A1 GAPD glyceraldehyde-3-phosphate dehydrogenase - 1-A10 BMP5 bone morphogenetic protein 5 - 1-A11 C9ORF78 chromosome 9 open reading frame 78 - - lymphochip.chip - Probe Set ID Gene Symbol Gene Title - 1317148 TNF tumor necrosis factor (TNF superfamily, member 2) - 814251 SLAMF1 signaling lymphocytic activation molecule family member 1 - 712899___1 AA282233 zt12d02.s1 NCI_CGAP_GCB1 Homo sapiens cDNA clone - - MG_U74Av2.chip - Probe Set ID Gene Symbol Gene Title - 100001_at CD3G CD3 antigen, gamma polypeptide - 100002_at ITIH3 inter-alpha trypsin inhibitor, heavy chain 3 - 100003_at RYR1 ryanodine receptor 1, skeletal muscle - - MG_U74Bv2.chip - Probe Set ID Gene Symbol Gene Title - 104769_at LIMA1 LIM domain and actin binding 1 - 104770_at --- 16 days neonate heart cDNA, RIKEN full-length enriched library, clone:D830033B01 product:unclassifiable, full insert sequence - 104771_at RAC2 RAS-related C3 botulinum substrate 2 - - MG_U74Cv2.chip - Probe Set ID Gene Symbol Gene Title - 128520_at CCDC97 coiled-coil domain containing 97 - 128523_at A730063M14RIK RIKEN cDNA A730063M14 gene - 128529_at A930009G19RIK RIKEN cDNA A930009G19 gene - - MOE430A.chip - Probe Set ID Gene Symbol Gene Title - 1415670_at COPG coatomer protein complex, subunit gamma - 1415671_at ATP6V0D1 ATPase, H+ transporting, lysosomal V0 subunit D1 - 1415672_at GOLGA7 golgi autoantigen, golgin subfamily a, 7 - - MOE430B.chip - Probe Set ID Gene Symbol Gene Title - 1415670_at COPG coatomer protein complex, subunit gamma - 1415671_at ATP6V0D1 ATPase, H+ transporting, lysosomal V0 subunit D1 - 1415672_at GOLGA7 golgi autoantigen, golgin subfamily a, 7 - - Mouse430_2.chip - Probe Set ID Gene Symbol Gene Title - 1415670_at COPG coatomer protein complex, subunit gamma - 1415671_at ATP6V0D1 ATPase, H+ transporting, lysosomal V0 subunit D1 - 1415672_at GOLGA7 golgi autoantigen, golgin subfamily a, 7 - - Mouse430A_2.chip - Probe Set ID Gene Symbol Gene Title - 1415670_at COPG coatomer protein complex, subunit gamma - 1415671_at ATP6V0D1 ATPase, H+ transporting, lysosomal V0 subunit D1 - 1415672_at GOLGA7 golgi autoantigen, golgin subfamily a, 7 - - Mu11KsubA.chip - Probe Set ID Gene Symbol Gene Title - aa000148_s_at NIP7 nuclear import 7 homolog (S. cerevisiae) - AA000151_at 1200011I18RIK RIKEN cDNA 1200011I18 gene - aa000380_s_at TARDBP TAR DNA binding protein - - Mu11KsubB.chip - Probe Set ID Gene Symbol Gene Title - AFFX-18SRNAMur/X00686_3_at --- --- - AFFX-18SRNAMur/X00686_5_at --- --- - AFFX-18SRNAMur/X00686_M_at --- --- - - Mu19KsubA.chip - Probe Set ID Gene Symbol Gene Title - AFFX-18SRNAMur/X00686_3_at --- --- - AFFX-18SRNAMur/X00686_5_at --- --- - AFFX-18SRNAMur/X00686_M_at --- --- - - Mu19KsubB.chip - Probe Set ID Gene Symbol Gene Title - AFFX-18SRNAMur/X00686_3_at --- --- - AFFX-18SRNAMur/X00686_5_at --- --- - AFFX-18SRNAMur/X00686_M_at --- --- - - Mu19KsubC.chip - Probe Set ID Gene Symbol Gene Title - AFFX-18SRNAMur/X00686_3_at --- --- - AFFX-18SRNAMur/X00686_5_at --- --- - AFFX-18SRNAMur/X00686_M_at --- --- - - MWG_Human_30K_A.chip - Probe Set ID Gene Symbol Gene Title - 1-A1 IFNA8 interferon, alpha 8 - 1-A10 AGL amylo-1, 6-glucosidase, 4-alpha-glucanotransferase (glycogen debranching enzyme, glycogen storage disease type III) - 1-A11 NULL NULL - - MWG_Human_30K_B.chip - Probe Set ID Gene Symbol Gene Title - 1-A10 ADAM11 a disintegrin and metalloproteinase domain 11 - 1-A11 NULL NULL - 1-A12 MAST3 microtubule associated serine/threonine kinase 3 - - Netherland_cancer_institute_operon_human_35k.chip - Probe Set ID Gene Symbol Gene Title - H200001046 ZZZ3 zinc finger, ZZ domain containing 3 [Homo sapiens]. [Source:RefSeq;Acc:NM_015534] - H200011098 ZZEF1 zinc finger, ZZ-type with EF hand domain 1 [Homo sapiens]. [Source:RefSeq;Acc:NM_015113] - H200006223 ZYX Zyxin (Zyxin 2). [Source:Uniprot/SWISSPROT;Acc:Q15942] - - Netherland_cancer_institute_operon_mouse_FOOk.chip - Probe Set ID Gene Symbol Gene Title - M300012631 ZZEF1 NULL - M300012632 ZZEF1 NULL - M200002019 ZYX Zyxin. [Source:Uniprot/SWISSPROT;Acc:Q62523] - - NIA15k.chip - Probe Set ID Gene Symbol Gene Title - H3051B03 BG067146 H3051B03-3 NIA Mouse 15K cDNA Clone Set Mus musculus cDNA - H3067H02 BG068648 H3067H02-3 NIA Mouse 15K cDNA Clone Set Mus musculus cDNA - H3121E03 AK029878 NULL - - OPERON_HUMANv2.chip - Probe Set ID Gene Symbol Gene Title - 1-A01 NULL NULL - 1-A02 NULL NULL - 1-A03 NAT2 N-acetyltransferase 2 (arylamine N-acetyltransferase) - - OPERON_HUMANv3.chip - Probe Set ID Gene Symbol Gene Title - 1-A03 TSPAN6 tetraspanin 6 - 1-A04 AK2 adenylate kinase 2 - 1-A05 NULL NULL - - RAE230A.chip - Probe Set ID Gene Symbol Gene Title - 1367452_at SUMO2 SMT3 suppressor of mif two 3 homolog 2 (yeast) - 1367453_at CDC37 cell division cycle 37 homolog (S. cerevisiae) - 1367454_at COPB2 coatomer protein complex, subunit beta 2 (beta prime) - - RAE230B.chip - Probe Set ID Gene Symbol Gene Title - 1367452_at SUMO2 SMT3 suppressor of mif two 3 homolog 2 (yeast) - 1367453_at CDC37 cell division cycle 37 homolog (S. cerevisiae) - 1367454_at COPB2 coatomer protein complex, subunit beta 2 (beta prime) - - Rat230_2.chip - Probe Set ID Gene Symbol Gene Title - 1367452_at SUMO2 SMT3 suppressor of mif two 3 homolog 2 (yeast) - 1367453_at CDC37 cell division cycle 37 homolog (S. cerevisiae) - 1367454_at COPB2 coatomer protein complex, subunit beta 2 (beta prime) - - RefSeq_human.chip - Probe Set ID Gene Symbol Gene Title - NM_030625 CXXC6 CXXC finger 6 - NM_181776 SLC36A2 solute carrier family 36 (proton/amino acid symporter), member 2 - NM_152616 TRIM42 tripartite motif-containing 42 - - RefSeq_NP_Human.chip - Probe Set ID Gene Symbol Gene Title - NP_000005 A2M alpha-2-macroglobulin - NP_000006 NAT2 N-acetyltransferase 2 (arylamine N-acetyltransferase) - NP_000007 ACADM acyl-Coenzyme A dehydrogenase, C-4 to C-12 straight chain - - RefSeq_NP_Mouse.chip - Probe Set ID Gene Symbol Gene Title - AAF62769 NULL NULL - AAF62770 NULL NULL - AAF62771 NULL NULL - - RefSeq_NP_Rat.chip - Probe Set ID Gene Symbol Gene Title - NP_007225 NULL NULL - NP_007226 NULL NULL - NP_007227 NULL NULL - - Research_Genetics.chip - Probe Set ID Gene Symbol Gene Title - 1-a-1 NULL NULL - 1-a-10 WNT2 wingless-type MMTV integration site family member 2 - 1-a-11 VHL von Hippel-Lindau tumor suppressor - - RG_U34A.chip - Probe Set ID Gene Symbol Gene Title - A01157cds_s_at LIPF lipase, gastric - A03913cds_s_at SERPINE2 serine (or cysteine) proteinase inhibitor, clade E, member 2 - A04674cds_s_at UCP1 uncoupling protein 1 (mitochondrial, proton carrier) - - RG_U34B.chip - Probe Set ID Gene Symbol Gene Title - AFFX-BioB-3_at --- --- - AFFX-BioB-3_st --- --- - AFFX-BioB-5_at --- --- - - RG_U34C.chip - Probe Set ID Gene Symbol Gene Title - AA012646_i_at --- Transcribed locus - AA012646_r_at --- Transcribed locus - AA012654_at --- Transcribed locus - - RN_U34.chip - Probe Set ID Gene Symbol Gene Title - A03913cds_s_at SERPINE2 serine (or cysteine) proteinase inhibitor, clade E, member 2 - A17753cds_s_at DRD3 dopamine receptor D3 - AA848563_s_at HSPA1A /// HSPA1B_MAPPED heat shock 70kD protein 1A /// heat shock 70kD protein 1B (mapped) - - Rosetta50K.chip - Probe Set ID Gene Symbol Gene Title - 1360465 RANBP9 RAN binding protein 9 - 1176829 LOC375449 similar to microtubule associated testis specific serine/threonine protein kinase - 1351074 NEDD4L neural precursor cell expressed, developmentally down-regulated 4-like - - Rosetta.chip - Probe Set ID Gene Symbol Gene Title - NM_000504 F10 coagulation factor X - Contig32955_RC ARL6IP6 ADP-ribosylation-like factor 6 interacting protein 6 - AK000455 MGC16733 hypothetical gene MGC16733 similar to CG12113 - - RT_U34.chip - Probe Set ID Gene Symbol Gene Title - AA108277_at HSPH1 heat shock 105kDa/110kDa protein 1 - AA108308_i_at MDM2_PREDICTED Transformed mouse 3T3 cell double minute 2 homolog (mouse) (predicted) - AA108308_s_at --- --- - - RZPD_Human_Ensembl1.1.chip - Probe Set ID Gene Symbol Gene Title - RZPDp203A011001D SARDH sarcosine dehydrogenase - RZPDp203A011002D ARHGAP22 Rho GTPase activating protein 22 - RZPDp203A011003D CNGA3 cyclic nucleotide gated channel alpha 3 - - RZPD_Human_ORF_Clones_Gateway.chip - Probe Set ID Gene Symbol Gene Title - RZPDo834A0110D - Gateway (closed) PTD015 PTD015 protein - RZPDo834A0114D - Gateway (closed) TRIB3 tribbles homolog 3 (Drosophila) - RZPDo834A0116D - Gateway (closed) ORM2 orosomucoid 2 - - RZPD_Human_Unigene3.1.chip - Probe Set ID Gene Symbol Gene Title - HU3_p983A011001D SARDH sarcosine dehydrogenase - HU3_p983A011002D ARHGAP22 Rho GTPase activating protein 22 - HU3_p983A011003D CNGA3 cyclic nucleotide gated channel alpha 3 - - Seq_Accession.chip - Probe Set ID Gene Symbol - AA017197 C21ORF36 - AA191116 MTVR2 - AA280701 CXYORF7 - - Stanford.chip - Probe Set ID Gene Symbol Gene Title - IMAGE:703849 DDB2 damage-specific DNA binding protein 2, 48kDa - IMAGE:1301778 ZFY zinc finger protein, Y-linked - IMAGE:795810 HS.99503HS.520681 Homo sapiens transcribed sequenceHomo sapiens, clone IMAGE:4823270, mRNA - - Stanford_Source_Accessions.chip - Probe Set ID Gene Symbol Gene Title - AI848107 0610010K14RIK RIKEN cDNA 0610010K14 gene - AK002491 0610010K14RIK RIKEN cDNA 0610010K14 gene - AK003842 0610010K14RIK RIKEN cDNA 0610010K14 gene - - TIGR_31K_Human_Set.chip - Probe Set ID Gene Symbol Gene Title - 1-1 NULL NULL - 1-10 WNT2 wingless-type MMTV integration site family member 2 - 1-11 VHL von Hippel-Lindau tumor suppressor - - TIGR_40K_Human_Set.chip - Probe Set ID Gene Symbol Gene Title - 10 NULL NULL - 100 TEX27 testis expressed sequence 27 - 1000 HOXA1 homeo box A1 - - U133_X3P.chip - Probe Set ID Gene Symbol Gene Title - 1053_3p_at RFC2 replication factor C (activator 1) 2, 40kDa - 117_3p_at HSPA6 /// LOC652878 heat shock 70kDa protein 6 (HSP70B') /// similar to heat shock 70kDa protein 6 (HSP70B) - 1494_3p_f_at CYP2A6 cytochrome P450, family 2, subfamily A, polypeptide 6 - - UCLA_NIH_33K.chip - Probe Set ID Gene Symbol Gene Title - 1020181 NULL NULL - 1020315 VAV2 vav 2 oncogene - 1020478 AP1GBP1 AP1 gamma subunit binding protein 1 - - Zebrafish.chip - Probe Set ID Gene Symbol Gene Title - AFFX-BioB-3_at --- --- - AFFX-BioB-5_at --- --- - AFFX-BioB-M_at --- --- - - - .. _LGPL: http://www.gnu.org/copyleft/lesser.html - .. _GSEA: http://www.broadinstitute.org/gsea - .. _GUIDE: http://www.broadinstitute.org/gsea/doc/GSEAUserGuideFrame.html?_Interpreting_GSEA_Results - .. _MSigDB: http://www.broadinstitute.org/gsea/msigdb/index.jsp - .. _2005Paper: http://www.pnas.org/content/102/43/15545.full - -</help> - -</tool> - -
--- a/rgedgeR/rgedgeR.xml Wed Jun 12 02:58:43 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,504 +0,0 @@ -<tool id="rgedgeR" name="edgeR" version="0.18"> - <description>digital DGE between two groups of replicates</description> - <command interpreter="python"> - rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "edgeR" - --output_dir "$html_file.files_path" --output_html "$html_file" --output_tab "$outtab" --make_HTML "yes" - </command> - <inputs> - <param name="input1" type="data" format="tabular" label="Select an input matrix - rows are contigs, columns are counts for each sample" - help="Use the HTSeq based count matrix preparation tool to create these count matrices from BAM files and a GTF file"/> - <param name="title" type="text" value="DGE" size="80" label="Title for job outputs" help="Supply a meaningful name here to remind you what the outputs contain"> - <sanitizer invalid_char=""> - <valid initial="string.letters,string.digits"><add value="_" /> </valid> - </sanitizer> - </param> - <param name="treatment_name" type="text" value="Treatment" size="50" label="Treatment Name"/> - <param name="Treat_cols" label="Select columns containing treatment." type="data_column" data_ref="input1" numerical="True" - multiple="true" use_header_names="true" size="120" display="checkboxes"> - <validator type="no_options" message="Please select at least one column."/> - </param> - <param name="control_name" type="text" value="Control" size="50" label="Control Name"/> - <param name="Control_cols" label="Select columns containing control." type="data_column" data_ref="input1" numerical="True" - multiple="true" use_header_names="true" size="120" display="checkboxes" optional="true"> - </param> - <param name="fQ" type="float" value="0.3" size="5" label="Non-differential contig count quantile threshold - zero to analyze all non-zero read count contigs" - help="May be a good or a bad idea depending on the biology and the question. EG 0.3 = sparsest 30% of contigs with at least one read are removed before analysis"/> - <param name="useQuantile" type="boolean" truevalue="T" checked='false' falsevalue="" size="1" label="Non differential filter - remove contigs below a threshold (1 per million) for half or more samples" - help="May be a good or a bad idea depending on the biology and the question. This was the old default. Quantile based is available as an alternative"/> - <param name="priorn" type="integer" value="4" size="3" label="prior.df for tagwise dispersion - lower value = more emphasis on each tag's variance - note this used to be prior.n" - help="Zero = auto-estimate. 1 to force high variance tags out. Use a small value to 'smooth' small samples. See edgeR docs and note below"/> - <param name="fdrthresh" type="float" value="0.05" size="5" label="P value threshold for FDR filtering for amily wise error rate control" - help="Conventional default value of 0.05 recommended"/> - <param name="fdrtype" type="select" label="FDR (Type II error) control method" - help="Use fdr or bh typically to control for the number of tests in a reliable way"> - <option value="fdr" selected="true">fdr</option> - <option value="BH">Benjamini Hochberg</option> - <option value="BY">Benjamini Yukateli</option> - <option value="bonferroni">Bonferroni</option> - <option value="hochberg">Hochberg</option> - <option value="holm">Holm</option> - <option value="hommel">Hommel</option> - <option value="none">no control for multiple tests</option> - </param> - </inputs> - <outputs> - <data format="tabular" name="outtab" label="${title}.xls"/> - <data format="html" name="html_file" label="${title}.html"/> - <data format="gsearank" name="outgsea" label="${title}.gsearank"> - <filter> makeRank == 'Yes' </filter> - </data> - </outputs> -<configfiles> -<configfile name="runme"> - -# edgeR.Rscript -# updated npv 2011 for R 2.14.0 and edgeR 2.4.0 by ross -# Performs DGE on a count table containing n replicates of two conditions -# -# Parameters -# -# 1 - Output Dir - -# Original edgeR code by: S.Lunke and A.Kaspi -sink(stdout(),append=T,type="message") -reallybig = log10(.Machine\$double.xmax) -reallysmall = log10(.Machine\$double.xmin) -require('stringr') -require('gplots') -library('ggplot2') -library('gridExtra') - -hmap2 = function(cmat,nsamp=100,outpdfname='heatmap2.pdf', TName='Treatment',group=NA,myTitle='title goes here') -{ -# Perform clustering for significant pvalues after controlling FWER - samples = colnames(cmat) - gu = unique(group) - if (length(gu) == 2) { - col.map = function(g) {if (g==gu[1]) "#FF0000" else "#0000FF"} - pcols = unlist(lapply(group,col.map)) - } else { - colours = rainbow(length(gu),start=0,end=4/6) - pcols = colours[match(group,gu)] } - print(paste('pcols',pcols)) - gn = rownames(cmat) - dm = cmat[(! is.na(gn)),] - # remove unlabelled hm rows - nprobes = nrow(dm) - # sub = paste('Showing',nprobes,'contigs ranked for evidence of differential abundance') - if (nprobes > nsamp) { - dm =dm[1:nsamp,] - #sub = paste('Showing',nsamp,'contigs ranked for evidence for differential abundance out of',nprobes,'total') - } - newcolnames = substr(colnames(dm),1,20) - colnames(dm) = newcolnames - pdf(outpdfname) - heatmap.2(dm,main=myTitle,ColSideColors=pcols,col=topo.colors(100),dendrogram="col",key=T,density.info='none', - Rowv=F,scale='row',trace='none',margins=c(8,8),cexRow=0.4,cexCol=0.5) - dev.off() -} - -hmap = function(cmat,nmeans=4,outpdfname="heatMap.pdf",nsamp=250,TName='Treatment',group=NA,myTitle="Title goes here") -{ - # for 2 groups only was - #col.map = function(g) {if (g==TName) "#FF0000" else "#0000FF"} - #pcols = unlist(lapply(group,col.map)) - gu = unique(group) - colours = rainbow(length(gu),start=0.3,end=0.6) - pcols = colours[match(group,gu)] - nrows = nrow(cmat) - mtitle = paste(myTitle,'Heatmap: n contigs =',nrows) - if (nrows > nsamp) { - cmat = cmat[c(1:nsamp),] - mtitle = paste('Heatmap: Top ',nsamp,' DE contigs (of ',nrows,')',sep='') - } - newcolnames = substr(colnames(cmat),1,20) - colnames(cmat) = newcolnames - pdf(outpdfname) - heatmap(cmat,scale='row',main=mtitle,cexRow=0.3,cexCol=0.4,Rowv=NA,ColSideColors=pcols) - dev.off() -} - -qqPlot = function(descr='Title',pvector, ...) -# stolen from https://gist.github.com/703512 -{ - o = -log10(sort(pvector,decreasing=F)) - e = -log10( 1:length(o)/length(o) ) - o[o==-Inf] = reallysmall - o[o==Inf] = reallybig - pdfname = paste(gsub(" ","", descr , fixed=TRUE),'pval_qq.pdf',sep='_') - maint = paste(descr,'QQ Plot') - pdf(pdfname) - plot(e,o,pch=19,cex=1, main=maint, ..., - xlab=expression(Expected~~-log[10](italic(p))), - ylab=expression(Observed~~-log[10](italic(p))), - xlim=c(0,max(e)), ylim=c(0,max(o))) - lines(e,e,col="red") - grid(col = "lightgray", lty = "dotted") - dev.off() -} - -smearPlot = function(DGEList,deTags, outSmear, outMain) - { - pdf(outSmear) - plotSmear(DGEList,de.tags=deTags,main=outMain) - grid(col="blue") - dev.off() - } - - - -boxPlot = function(rawdat,tmdat,maint,myTitle) - { - # give up on boxplot - it's just too buggy - rscolnames = substr(colnames(rawdat),1,25) - colnames(rawdat) = rscolnames - ccolnames = substr(colnames(tmdat),1,25) - colnames(tmdat) = ccolnames - print(paste('rawdat',paste(rscolnames,collapse=','))) - print(paste('tmdat',paste(ccolnames,collapse=','))) - pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"sampleBoxplot.pdf",sep='_') - raw = data.frame(rawdat) - cn = rscolnames - rdat = reshape(raw, direction="long",varying=list(cn),v.names="counts",times=cn) - rdat\$Sample = factor(rdat\$time,levels=cn) - rdat\$Counts = log(rdat\$counts + 0.1) - p1 = ggplot(rdat,aes(x=Sample,y=Counts)) + geom_boxplot(notch=T) + ylab("log Count") - p1 = p1 + theme(axis.text.x = element_text(angle=90, size=9)) + ggtitle('Raw Contig Counts') - raw = data.frame(tmdat) - cn = ccolnames - rdat = reshape(raw, direction="long",varying=list(cn),v.names="counts",times=cn) - rdat\$Sample = factor(rdat\$time,levels=cn) - rdat\$Counts = log(rdat\$counts + 0.1) - p2 = ggplot(rdat,aes(x=Sample,y=Counts)) + geom_boxplot(notch=T) + ylab("log Count") - p2 = p2 + theme(axis.text.x = element_text(angle=90, size=9)) + ggtitle('Normalised Contig Counts') - pdf(pdfname) - grid.arrange(p1,p2,nrow=1) - dev.off() -} - -cumPlot = function(rawrs,cleanrs,maint,myTitle) -{ # updated to use ecdf - pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"RowsumCum.pdf",sep='_') - defpar = par(no.readonly=T) - pdf(pdfname) - par(mfrow=c(2,1)) - lrs = log(rawrs,10) - lim = max(lrs) - hist(lrs,breaks=100,main=paste('Before:',maint),xlab="# Reads (log)", - ylab="Count",col="maroon",sub=myTitle, xlim=c(0,lim),las=1) - grid(col="blue") - lrs = log(cleanrs,10) - hist(lrs,breaks=100,main=paste('After:',maint),xlab="# Reads (log)", - ylab="Count",col="maroon",sub=myTitle,xlim=c(0,lim),las=1) - grid(col="blue") - dev.off() - par(defpar) -} - -cumPlot1 = function(rawrs,cleanrs,maint,myTitle) -{ # updated to use ecdf - pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"RowsumCum.pdf",sep='_') - pdf(pdfname) - par(mfrow=c(2,1)) - lastx = max(rawrs) - rawe = knots(ecdf(rawrs)) - cleane = knots(ecdf(cleanrs)) - cy = 1:length(cleane)/length(cleane) - ry = 1:length(rawe)/length(rawe) - plot(rawe,ry,type='l',main=paste('Before',maint),xlab="Log Contig Total Reads", - ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle) - grid(col="blue") - plot(cleane,cy,type='l',main=paste('After',maint),xlab="Log Contig Total Reads", - ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle) - grid(col="blue") - dev.off() -} - - - -edgeIt = function (Count_Matrix,group,outputfilename,fdrtype='fdr',priorn=5,fdrthresh=0.05,outputdir='.', - myTitle='edgeR',libSize=c(),useQuantile="T",filterquantile=0.2,subjects=c()) { - - # Error handling - if (length(unique(group))!=2){ - print("Number of conditions identified in experiment does not equal 2") - q() - } - require(edgeR) - mt = paste(unlist(strsplit(myTitle,'_')),collapse=" ") - allN = nrow(Count_Matrix) - nscut = round(ncol(Count_Matrix)/2) - colTotmillionreads = colSums(Count_Matrix)/1e6 - rawrs = rowSums(Count_Matrix) - nonzerod = Count_Matrix[(rawrs > 0),] # remove all zero count genes - nzN = nrow(nonzerod) - nzrs = rowSums(nonzerod) - zN = allN - nzN - print('# Quantiles for non-zero row counts:',quote=F) - print(quantile(nzrs,probs=seq(0,1,0.1)),quote=F) - if (useQuantile == "T") - { - gt1rpin3 = rowSums(Count_Matrix/expandAsMatrix(colTotmillionreads,dim(Count_Matrix)) >= 1) >= nscut - lo = colSums(Count_Matrix[!gt1rpin3,]) - workCM = Count_Matrix[gt1rpin3,] - cleanrs = rowSums(workCM) - cleanN = length(cleanrs) - meth = paste( "After removing",length(lo),"contigs with fewer than",nscut,"sample read counts >= 1 per million, there are",sep="") - print(paste("Read",allN,"contigs. Removed",zN,"contigs with no reads.",meth,cleanN,"contigs"),quote=F) - maint = paste('Filter >=1/million reads in >=',nscut,'samples') - } - else { - useme = (nzrs > quantile(nzrs,filterquantile)) - workCM = nonzerod[useme,] - lo = colSums(nonzerod[!useme,]) - cleanrs = rowSums(workCM) - cleanN = length(cleanrs) - meth = paste("After filtering at count quantile =",filterquantile,"there are",sep="") - print(paste('Read',allN,"contigs. Removed",zN,"with no reads.",meth,cleanN,"contigs"),quote=F) - maint = paste('Filter below',filterquantile,'quantile') - } - cumPlot(rawrs=rawrs,cleanrs=cleanrs,maint=maint,myTitle=myTitle) - - print(paste("# Total low count contigs per sample = ",paste(lo,collapse=',')),quote=F) - rsums = rowSums(workCM) - # Setup DGEList object - DGEList = DGEList(counts=workCM, group = group) - #Extract T v C names - TName=unique(group)[1] - CName=unique(group)[2] - if (length(subjects) == 0) { mydesign = model.matrix(~group) - } else { sf = factor(subjects) - mydesign = model.matrix(~sf+group) - } - print.noquote(paste('Using samples:',paste(colnames(workCM),collapse=','))) - print.noquote('Using design matrix:') - print.noquote(mydesign) - print.noquote(paste("prior.df =",priorn)) - DGEList = calcNormFactors(DGEList) - DGEList = estimateGLMCommonDisp(DGEList,mydesign) - comdisp = DGEList\$common.dispersion - DGEList = estimateGLMTrendedDisp(DGEList,mydesign) - DGEList = estimateGLMTagwiseDisp(DGEList,mydesign) - DGLM = glmFit(DGEList,design=mydesign) - co = length(colnames(mydesign)) - DE = glmLRT(DGLM,coef=co) # always last one - subject is first if needed - goodness = gof(DGLM, pcutoff=fdrthresh) - if (sum(goodness\$outlier) > 0) { - print.noquote('GLM outliers:') - print.noquote(rownames(DE)[(goodness\$outlier != 0)]) - z = limma::zscoreGamma(goodness\$gof.statistic, shape=goodness\$df/2, scale=2) - pdf(paste(outputdir,paste(mt,"GoodnessofFit.pdf",sep='_'),sep='/')) - qq = qqnorm(z, panel.first=grid(), main="tagwise dispersion") - abline(0,1,lwd=3) - points(qq\$x[goodness\$outlier],qq\$y[goodness\$outlier], pch=16, col="dodgerblue") - dev.off() - } else { print('No GLM fit outlier genes found\n')} - estpriorn = getPriorN(DGEList) - print(paste("Common Dispersion =",comdisp,"CV = ",sqrt(comdisp),"getPriorN = ",estpriorn),quote=F) - efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors - normData = (1e+06*DGEList\$counts/efflib) - #normData = (1e+06 * DGEList\$counts/expandAsMatrix(DGEList\$samples\$lib.size, dim(DGEList))) - colnames(normData) = paste( colnames(normData),'N',sep="_") - print(paste('Raw sample read totals',paste(colSums(nonzerod,na.rm=T),collapse=','))) - nzd = data.frame(log(nonzerod + 1e-2,10)) - boxPlot(rawdat=nzd,tmdat=normData,maint='TMM Normalisation',myTitle=myTitle) - #Prepare our output file - output = cbind( - Name=as.character(rownames(DGEList\$counts)), - DE\$table, - adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype), - Dispersion=DGEList\$tagwise.dispersion,totreads=rsums,normData, - DGEList\$counts - ) - soutput = output[order(output\$PVal),] # sorted into p value order - for quick toptable - nreads = soutput\$totreads # ordered same way - print('# writing output',quote=F) - write.table(soutput,outputfilename, quote=FALSE, sep="\t",row.names=F) - tt = topTags(DE,n=nrow(DE)) - rn = rownames(tt\$table) - reg = "^chr([0-9]+):([0-9]+)-([0-9]+)" - org="hg19" - genecards="<a href='http://www.genecards.org/index.php?path=/Search/keyword/" - ucsc = paste("<a href='http://genome.ucsc.edu/cgi-bin/hgTracks?db=",org,sep='') - testreg = str_match(rn,reg) - if (sum(!is.na(testreg[,1]))/length(testreg[,1]) > 0.9) # is ucsc style string - { - urls = paste(ucsc,'&position=chr',testreg[,2],':',testreg[,3],"-",testreg[,4],"'>",rn,'</a>',sep='') - } else { - urls = paste(genecards,rn,"'>",rn,'</a>',sep="") - } - cat("# Top tags\n") - tt\$table = cbind(tt\$table,ntotreads=nreads,URL=urls) # add to end so table isn't laid out strangely - print(tt[1:50,]) - pdf(paste(mt,"BCV_vs_abundance.pdf",sep='_')) - plotBCV(DGEList, cex=0.3, main="Biological CV vs abundance") - dev.off() - # Plot MAplot - fname = gsub(' ','_',myTitle,fixed=T) - deTags = rownames(output[output\$adj.p.value < fdrthresh,]) - nsig = length(deTags) - print(paste('#',nsig,'tags significant at adj p=',fdrthresh),quote=F) - print('# deTags',quote=F) - print(head(deTags)) - dg = DGEList[order(DE\$table\$PValue),] - #normData = (1e+06 * dg\$counts/expandAsMatrix(dg\$samples\$lib.size, dim(dg))) - efflib = dg\$samples\$lib.size*dg\$samples\$norm.factors - normData = (1e+06*dg\$counts/efflib) - outpdfname=paste(mt,"heatmap.pdf",sep='_') - hmap2(normData,nsamp=100,TName=TName,group=group,outpdfname=outpdfname,myTitle=myTitle) - outSmear = paste(outputdir,paste(fname,"Smearplot.pdf",sep='_'),sep='/') - outMain = paste("Smear Plot for ",TName,' Vs ',CName,' (FDR@',fdrthresh,' N = ',nsig,')',sep='') - smearPlot(DGEList=DGEList,deTags=deTags, outSmear=outSmear, outMain = outMain) - qqPlot(descr=myTitle,pvector=DE\$table\$PValue) - # Plot MDS - ug = unique(group) - sample_colors = match(DGEList\$samples\$group,ug) #ifelse (DGEList\$samples\$group==group[1], 1, 2) - pdf(paste(outputdir,paste(fname,"MDSplot.pdf",sep='_'),sep='/')) - plotMDS.DGEList(DGEList,main=paste("MDS Plot for",TName,'Vs',CName),cex=0.5,col=sample_colors,pch=sample_colors) - legend(x="topleft", legend = c(group[1],group[length(group)]),col=c(1,2), pch=19) - grid(col="blue") - dev.off() - if (FALSE==TRUE) { - # need a design matrix and glm to use this - glmfit = glmFit(DGEList, design) - goodness = gof(glmfit, pcutoff=fdrpval) - sum(goodness\$outlier) - rownames(d)[goodness\$outlier] - z = limma::zscoreGamma(goodness\$gof.statistic, shape=goodness\$df/2, scale=2) - pdf(paste(outputdir,paste(fname,"GoodnessofFit.pdf",sep='_'),sep='/')) - qq = qqnorm(z, panel.first=grid(), main="tagwise dispersion") - abline(0,1,lwd=3) - points(qq\$x[goodness\$outlier],qq\$y[goodness\$outlier], pch=16, col="dodgerblue") - dev.off() - } - #Return our main table - output - -} #Done - -options(width=512) -Out_Dir = "$html_file.files_path" -Input = "$input1" -ORG = "$input1.dbkey" -TreatmentName = "$treatment_name" -TreatmentCols = "$Treat_cols" -ControlName = "$control_name" -ControlCols= "$Control_cols" -outputfilename = "$outtab" -fdrtype = "$fdrtype" -priorn = $priorn -fdrthresh = $fdrthresh -useQuantile = "$useQuantile" -fQ = $fQ # non-differential centile cutoff -myTitle = "$title" -makeRank = "$makeRank" -outgsea = "" -if (makeRank > "") outgsea = "$outgsea" -#Set our columns -TCols = as.numeric(strsplit(TreatmentCols,",")[[1]])-1 -CCols = as.numeric(strsplit(ControlCols,",")[[1]])-1 -cat('# got TCols=') -cat(TCols) -cat('; CCols=') -cat(CCols) -cat('\n') - - -# Create output dir if non existent - if (file.exists(Out_Dir) == F) dir.create(Out_Dir) - -Count_Matrix = read.table(Input,header=T,row.names=1,sep='\t') #Load tab file assume header -Count_Matrix = Count_Matrix[,c(TCols,CCols)] -rn = rownames(Count_Matrix) -islib = rn %in% c('librarySize','NotInBedRegions') -LibSizes = Count_Matrix[subset(rn,islib),][1] # take first -Count_Matrix = Count_Matrix[subset(rn,! islib),] -group = c(rep(TreatmentName,length(TCols)), rep(ControlName,length(CCols)) ) #Build a group descriptor -group = factor(group, levels=c(ControlName,TreatmentName)) -colnames(Count_Matrix) = paste(group,colnames(Count_Matrix),sep="_") #Relable columns -if (priorn <= 0) {priorn = ceiling(20/(length(group)-1))} # estimate prior.n if not provided -# see http://comments.gmane.org/gmane.comp.lang.r.sequencing/2009 -results = edgeIt(Count_Matrix=Count_Matrix,group=group,outputfilename=outputfilename,fdrtype=fdrtype,priorn=priorn,fdrthresh=fdrthresh, - outputdir=Out_Dir,myTitle=myTitle,libSize=c(),useQuantile=useQuantile,filterquantile=fQ) #Run the main function -# for the log - - -sessionInfo() - - -</configfile> -</configfiles> -<tests> -<test> -<param name='input1' value='DGEtest.xls' ftype='tabular' /> - <param name='treatment_name' value='case' /> - <param name='title' value='DGEtest' /> - <param name='fdrtype' value='fdr' /> - <param name='priorn' value="5" /> - <param name='fdrthresh' value="0.05" /> - <param name='control_name' value='control' /> - <param name='Treat_cols' value='c3,c6,c9' /> - <param name='Control_cols' value='c2,c5,c8' /> - <output name='outtab' file='DGEtest1out.xls' ftype='tabular' compare='diff' /> - <output name='html_file' file='DGEtest1out.html' ftype='html' compare='diff' lines_diff='20' /> -</test> -</tests> -<help> -**What it does** - -Performs digital gene expression analysis between a treatment and control on a matrix. - -**Documentation** Please see documentation_ for methods and parameter details - -**Input** - -A matrix consisting of non-negative integers. The matrix must have a unique header row identifiying the samples, as well as a unique set of row names -as the first column. - -**Output** - -A matrix which consists the original data and relative expression levels and some helpful plots - -**Note on edgeR versions** - -The edgeR authors made a small cosmetic change in the name of one important variable (from p.value to PValue) -breaking this and all other code that assumed the old name for this variable, -between edgeR2.4.4 and 2.4.6 (the version for R 2.14 as at the time of writing). -This means that all code using edgeR is sensitive to the version. I think this was a very unwise thing -to do because it wasted hours of my time to track down and will similarly cost other edgeR users dearly -when their old scripts break. This tool currently now works with 2.4.6. - -**Note on prior.N** - -http://seqanswers.com/forums/showthread.php?t=5591 says: - -*prior.n* - -The value for prior.n determines the amount of smoothing of tagwise dispersions towards the common dispersion. -You can think of it as like a "weight" for the common value. (It is actually the weight for the common likelihood -in the weighted likelihood equation). The larger the value for prior.n, the more smoothing, i.e. the closer your -tagwise dispersion estimates will be to the common dispersion. If you use a prior.n of 1, then that gives the -common likelihood the weight of one observation. - -In answer to your question, it is a good thing to squeeze the tagwise dispersions towards a common value, -or else you will be using very unreliable estimates of the dispersion. I would not recommend using the value that -you obtained from estimateSmoothing()---this is far too small and would result in virtually no moderation -(squeezing) of the tagwise dispersions. How many samples do you have in your experiment? -What is the experimental design? If you have few samples (less than 6) then I would suggest a prior.n of at least 10. -If you have more samples, then the tagwise dispersion estimates will be more reliable, -so you could consider using a smaller prior.n, although I would hesitate to use a prior.n less than 5. - -**Attribution** Copyright Ross Lazarus (ross period lazarus at gmail period com) May 2012 -Derived from the implementation by Antony Kaspi and Sebastian Lunke at the BakerIDI - -All rights reserved. - -Licensed under the LGPL_ - -.. _LGPL: http://www.gnu.org/copyleft/lesser.html -.. _documentation: http://bioconductor.org/packages/release/bioc/html/edgeR.html -</help> - -</tool> - -
--- a/rgedgeR/rgedgeRglm.xml Wed Jun 12 02:58:43 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,537 +0,0 @@ - -<tool id="rgedgeRglm" name="edgeRglm" version="0.18"> - <description>digital DGE glm</description> - <command interpreter="python"> - rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "edgeRglm" - --output_dir "$html_file.files_path" --output_html "$html_file" --make_HTML "yes" - </command> - <inputs> - <param name="input1" type="data" format="tabular" label="Select an input matrix - rows are contigs, columns are counts for each sample" - help="Use the DGE matrix preparation tool to create these matrices from BAM files and a BED file of contigs"/> - <param name="title" type="text" value="Factorial DGE" size="80" label="Title for job outputs" help="Supply a meaningful name here to remind you what the outputs contain"> - <sanitizer invalid_char=""> - <valid initial="string.letters,string.digits"><add value="_" /> </valid> - </sanitizer> - </param> - <param name="factor1name" type="text" value="Factor 1" size="80" label="Factor 1 name" help="Supply a meaningful name here to remind you when looking at the results"> - <sanitizer invalid_char=""> - <valid initial="string.letters,string.digits"><add value="_" /> </valid> - </sanitizer> - </param> - <param name="factor1" type="text" optional="false" size="120" - label="Enter comma separated values to indicate the factor level for all input file count columns" - help="EG if there are 4 columns of counts from 2 treatment replicates (2,4) and 2 control replicates (3,5) then enter 'treat,control,treat,control'"> - <sanitizer> - <valid initial="string.digits,string.letters"><add value="," /> </valid> - </sanitizer> - </param> - <param name="factor2name" type="text" value="Factor 2" size="80" label="Factor 2 name" help="Supply a meaningful name here to remind you when looking at the results"> - <sanitizer invalid_char=""> - <valid initial="string.letters,string.digits"><add value="_" /> </valid> - </sanitizer> - </param> - <param name="factor2" type="text" optional="false" size="120" - label="Enter comma separated values to indicate factor 2 level for all input file count columns" - help="Leave blank if no factor 2, but eg if data from sample id A99 is in columns 2,4 and id C21 is in 3,5 then enter '1,2,1,2'"> - <sanitizer> - <valid initial="string.digits,string.letters"><add value="," /> </valid> - </sanitizer> - </param> - <param name="fQ" type="float" value="0.3" size="5" label="Non-differential contig count quantile threshold - zero to analyze all non-zero read count contigs" - help="May be a good or a bad idea depending on the biology and the question. EG 0.3 = sparsest 30% of contigs with at least one read are removed before analysis"/> - <param name="useNDfilt" type="boolean" truevalue="T" checked='false' falsevalue="" size="1" label="Non differential filter - remove contigs below a threshold (1 per million) for half or more samples" - help="May be a good or a bad idea depending on the biology and the question. This was the old default. Quantile based is available as an alternative"/> - <param name="priorn" type="integer" value="4" size="3" label="prior.df for tagwise dispersion - higher value = more emphasis on each tag's variance - note this used to be prior.n" - help="Zero = auto-estimate. 1 to force high variance tags out. Use a small value to 'smooth' small samples. See edgeR docs and note below"/> - <param name="fdrthresh" type="float" value="0.05" size="5" label="P value threshold for FDR filtering for family wise error rate control" - help="Conventional default value of 0.05 recommended"/> - <param name="fdrtype" type="select" label="FDR (Type II error) control method" - help="Use fdr or bh typically to control for the number of tests in a reliable way"> - <option value="fdr" selected="true">fdr</option> - <option value="BH">Benjamini Hochberg</option> - <option value="BY">Benjamini Yukateli</option> - <option value="bonferroni">Bonferroni</option> - <option value="hochberg">Hochberg</option> - <option value="holm">Holm</option> - <option value="hommel">Hommel</option> - <option value="none">no control for multiple tests</option> - </param> - </inputs> - <outputs> - <data format="tabular" name="outtab1" label="${treatment1_name}-${control1_name}_topTable.xls"/> - <data format="tabular" name="outtab2" label="${treatment2_name}-${control2_name}_topTable.xls"/> - <data format="tabular" name="outtab3" label="${treatment1_name}-${control1_name}-${treatment2_name}-${control2_name}_topTable.xls"/> - <data format="html" name="html_file" label="${input1.name}_${title}.html"/> - </outputs> -<configfiles> -<configfile name="runme"> -# edgeR.Rscript -# new nov 2012 for 2x2 factorial designs -# updated nov 2011 for R 2.14.0 and edgeR 2.4.0 by ross -# Performs DGE on a count table containing n replicates of two conditions -# - -reallybig = log10(.Machine\$double.xmax) -reallysmall = log10(.Machine\$double.xmin) -require('stringr') -require('gplots') -hmap2 = function(cmat,nsamp=100,outpdfname='heatmap2.pdf', TName='Treatment',group=NA,myTitle='title goes here') -{ - samples = colnames(cmat) - gu = unique(group) - if (length(gu) == 2) { - col.map = function(g) {if (g==gu[1]) "#FF0000" else "#0000FF"} - pcols = unlist(lapply(group,col.map)) - } else { - colours = rainbow(length(gu),start=0,end=4/6) - pcols = colours[match(group,gu)] } - print(paste('pcols',pcols)) - gn = rownames(cmat) - dm = cmat[(! is.na(gn)),] - nprobes = nrow(dm) - if (nprobes > nsamp) { - dm =dm[1:nsamp,] - } - newcolnames = substr(colnames(dm),1,20) - colnames(dm) = newcolnames - pdf(outpdfname) - heatmap.2(dm,main=myTitle,ColSideColors=pcols,col=topo.colors(100),dendrogram="col",key=T,density.info='none', - Rowv=F,scale='row',trace='none',margins=c(8,8),cexRow=0.4,cexCol=0.5) - dev.off() -} - -hmap = function(cmat,nmeans=4,outpdfname="heatMap.pdf",nsamp=250,TName='Treatment',group=NA,myTitle="Title goes here") -{ - gu = unique(group) - colours = rainbow(length(gu),start=0.3,end=0.6) - pcols = colours[match(group,gu)] - nrows = nrow(cmat) - mtitle = paste(myTitle,'Heatmap: n contigs =',nrows) - if (nrows > nsamp) { - cmat = cmat[c(1:nsamp),] - mtitle = paste('Heatmap: Top ',nsamp,' DE contigs (of ',nrows,')',sep='') - } - newcolnames = substr(colnames(cmat),1,20) - colnames(cmat) = newcolnames - pdf(outpdfname) - heatmap(cmat,scale='row',main=mtitle,cexRow=0.3,cexCol=0.4,Rowv=NA,ColSideColors=pcols) - dev.off() -} - -qqPlot = function(descr='Title',pvector, ...) -{ - o = -log10(sort(pvector,decreasing=F)) - e = -log10( 1:length(o)/length(o) ) - o[o==-Inf] = reallysmall - o[o==Inf] = reallybig - pdfname = paste(gsub(" ","", descr , fixed=TRUE),'pval_qq.pdf',sep='_') - maint = paste(descr,'QQ Plot') - pdf(pdfname) - plot(e,o,pch=19,cex=1, main=maint, ..., - xlab=expression(Expected~~-log[10](italic(p))), - ylab=expression(Observed~~-log[10](italic(p))), - xlim=c(0,max(e)), ylim=c(0,max(o))) - lines(e,e,col="red") - grid(col = "lightgray", lty = "dotted") - dev.off() -} - -smearPlot = function(DGEList,deTags, outSmear, outMain) - { - pdf(outSmear) - plotSmear(DGEList,de.tags=deTags,main=outMain) - grid(col="blue") - dev.off() - } - -boxPlot = function(rawrs,cleanrs,maint,myTitle) -{ - newcolnames = substr(colnames(rawrs),1,15) - colnames(rawrs) = newcolnames - newcolnames = substr(colnames(cleanrs),1,15) - colnames(cleanrs) = newcolnames - pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"sampleBoxplot.pdf",sep='_') - defpar = par(no.readonly=T) - pdf(pdfname,height=6,width=8) - par(mfrow=c(1,2)) - boxplot(rawrs,main=paste('Before:',maint),col="maroon",las=3,cex.axis=0.4) - grid(col="blue") - lrs = log(cleanrs,10) - boxplot(cleanrs,main=paste('After:',maint),col="maroon",las=3,cex.axis=0.4) - grid(col="blue") - dev.off() - par(defpar) -} - -cumPlot = function(rawrs,cleanrs,maint,myTitle) -{ - pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"RowsumCum.pdf",sep='_') - defpar = par(no.readonly=T) - pdf(pdfname) - par(mfrow=c(2,1)) - lrs = log(rawrs,10) - lim = max(lrs) - hist(lrs,breaks=100,main=paste('Before:',maint),xlab="Reads (log)", - ylab="Count",col="maroon",sub=myTitle, xlim=c(0,lim),las=1) - grid(col="blue") - lrs = log(cleanrs,10) - hist(lrs,breaks=100,main=paste('After:',maint),xlab="Reads (log)", - ylab="Count",col="maroon",sub=myTitle,xlim=c(0,lim),las=1) - grid(col="blue") - dev.off() - par(defpar) -} - -cumPlot1 = function(rawrs,cleanrs,maint,myTitle) -{ - pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"RowsumCum.pdf",sep='_') - pdf(pdfname) - par(mfrow=c(2,1)) - lastx = max(rawrs) - rawe = knots(ecdf(rawrs)) - cleane = knots(ecdf(cleanrs)) - cy = 1:length(cleane)/length(cleane) - ry = 1:length(rawe)/length(rawe) - plot(rawe,ry,type='l',main=paste('Before',maint),xlab="Log Contig Total Reads", - ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle) - grid(col="blue") - plot(cleane,cy,type='l',main=paste('After',maint),xlab="Log Contig Total Reads", - ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle) - grid(col="blue") - dev.off() -} - - - -edgeIt = function (Count_Matrix,group,outtab1,outtab2,outtab3,fdrtype='fdr',priorn=5,fdrthresh=0.05,outputdir='.', - myTitle='edgeR',libSize=c(),useQuantile=T,filterquantile=0.2,org='hg19') -{ - - if (length(unique(group))!=4) { - print.noquote("Number of conditions identified in experiment does not equal 4 - full 2x2 factorial not possible") - q() - } - require(edgeR) - require(limma) - mt = paste(unlist(strsplit(myTitle,' ')),collapse="") - expfact = factor(group,levels=unique(group)) - sampleTypes = levels(expfact) - mycontrast = NA - colnamesDesign = list() - allN = nrow(Count_Matrix) - nscut = round(ncol(Count_Matrix)/2) - colTotmillionreads = colSums(Count_Matrix)/1e6 - rawrs = rowSums(Count_Matrix) - nonzerod = Count_Matrix[(rawrs > 0),] - nzN = nrow(nonzerod) - nzrs = rowSums(nonzerod) - zN = allN - nzN - print.noquote('Quantiles for non-zero row counts:') - print.noquote(quantile(nzrs,probs=seq(0,1,0.1))) - if (useQuantile == "T") - { - gt1rpin3 = rowSums(Count_Matrix/expandAsMatrix(colTotmillionreads,dim(Count_Matrix)) >= 1) >= nscut - lo = colSums(Count_Matrix[!gt1rpin3,]) - workCM = Count_Matrix[gt1rpin3,] - cleanrs = rowSums(workCM) - cleanN = length(cleanrs) - meth = paste( "After removing",length(lo),"contigs with fewer than",nscut,"sample read counts >= 1 per million, there are",sep="") - print.noquote(paste("Read",allN,"contigs. Removed",zN,"contigs with no reads.",meth,cleanN,"contigs")) - maint = paste('Filter >= 1/million reads in >=',nscut,'samples') - } else { - useme = (nzrs > quantile(nzrs,filterquantile)) - workCM = nonzerod[useme,] - lo = colSums(nonzerod[!useme,]) - cleanrs = rowSums(workCM) - cleanN = length(cleanrs) - meth = paste("After filtering at count quantile =",filterquantile,"there are",sep="") - print.noquote(paste('Read',allN,"contigs. Removed",zN,"with no reads.",meth,cleanN,"contigs")) - maint = paste('Filter below',filterquantile,'quantile') - } - cumPlot(rawrs=rawrs,cleanrs=cleanrs,maint=maint,myTitle=myTitle) - print.noquote(paste("Total low count contigs per sample = ",paste(lo,collapse=','))) - rsums = rowSums(workCM) - allttn = c() - colnamesDesign = list() - l = sampleTypes - p1=paste(l[1],'-',l[2],sep='') - p2=paste(l[3],'-',l[4],sep='') - p3=paste('(',l[3],'-',l[4],') - (',l[1],'-',l[2],')',sep='') - colnamesDesign = list(comp1=p1,comp2=p2,diff=p3) - mydesign = model.matrix(~0+expfact) - colnames(mydesign) = sampleTypes - mycontrast = makeContrasts(contrasts=colnamesDesign,levels=mydesign) - print.noquote('Design matrix=') - print(mydesign,quote=F) - print.noquote('Contrast matrix=') - print(mycontrast,quote=F) - print.noquote('Samples=') - print(colnames(workCM),quote=F) - DGEList = DGEList(counts=workCM, group = group) - print.noquote(paste("prior.df =",priorn)) - DGEList = calcNormFactors(DGEList) - DGEList = estimateGLMCommonDisp(DGEList,mydesign) - comdisp = DGEList\$common.dispersion - DGEList = estimateGLMTrendedDisp(DGEList,mydesign) - DGEList = estimateGLMTagwiseDisp(DGEList,mydesign) - pdf(paste(mt,"BCV_vs_abundance.pdf",sep='_')) - plotBCV(DGEList, cex=0.3, main="Biological CV vs abundance") - dev.off() - estpriorn = getPriorN(DGEList) - print(paste("Common Dispersion =",comdisp,"CV = ",sqrt(comdisp),"getPriorN = ",estpriorn),quote=F) - efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors - normData = (1e+06*DGEList\$counts/efflib) - #normData = (1e+06 * DGEList\$counts/expandAsMatrix(DGEList\$samples\$lib.size, dim(DGEList))) - colnames(normData) = paste( colnames(normData),'N',sep="_") - boxPlot(rawrs=log(nonzerod),cleanrs=log(normData),maint='TMM Normalisation (log scale)',myTitle=myTitle) - ug = unique(group) - ncond = length(ug) - sample_colors = match(DGEList\$samples\$group,ug) - pdf(paste(outputdir,paste(mt,"MDSplot.pdf",sep='_'),sep='/')) - plotMDS.DGEList(DGEList,main=paste("MDS Plot for",myTitle),cex=0.5,col=sample_colors,pch=sample_colors) - legend(x="topleft", legend = sampleTypes,col=sample_colors, pch=19) - grid(col="blue") - dev.off() - cn = colnames(Count_Matrix) - print('Using design matrix:') - print(mydesign) - DGLM = glmFit(DGEList, mydesign) - goodness = gof(DGLM, pcutoff=fdrthresh) - if (sum(goodness\$outlier) > 0) { - print.noquote('GLM outliers:') - print(paste(rownames(DGLM)[(goodness\$outlier != 0)],collapse=','),quote=F) - z = limma::zscoreGamma(goodness\$gof.statistic, shape=goodness\$df/2, scale=2) - pdf(paste(mt,"GoodnessofFit.pdf",sep='_')) - qq = qqnorm(z, panel.first=grid(), main="tagwise dispersion") - abline(0,1,lwd=3) - points(qq\$x[goodness\$outlier],qq\$y[goodness\$outlier], pch=16, col="dodgerblue") - dev.off() - } else { print('No GLM fit outlier genes found\n')} - outtabs = c(outtab1,outtab2,outtab3) - for (coeff in c(1:3)) - { - DE = glmLRT(DGLM,contrast=mycontrast[,coeff]) - cont = paste(unlist(strsplit(colnames(mycontrast)[coeff],' ')),collapse="") - output = cbind(Name=as.character(rownames(DGEList\$counts)), - DE\$table,adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype), - Dispersion=DGEList\$tagwise.dispersion,totreads=rsums,normData, - DGEList\$counts) - soutput = output[order(output\$PVal),] - nreads = soutput\$totreads - write.table(soutput,outtabs[coeff], quote=FALSE, sep="\t",row.names=F) - print.noquote(paste("Top tags for",cont)) - genecards="<a href='http://www.genecards.org/index.php?path=/Search/keyword/" - ucsc = paste("<a href='http://genome.ucsc.edu/cgi-bin/hgTracks?db=",org,sep='') - tt = topTags(DE,n=nrow(DE)) - rn = rownames(tt\$table) - reg = "^chr([0-9]+):([0-9]+)-([0-9]+)" - testreg = str_match(rn,reg) - isucsc = F - if (!is.na(testreg[1,])) {isucsc = T} - if (isucsc == T) - { - urls = paste(ucsc,'&position=chr',testreg[,2],':',testreg[,3],"-",testreg[,4],"'>",rn,'</a>',sep='') - } else { - urls = paste(genecards,rn,"'>",rn,'</a>',sep="") - } - tt\$table = cbind(tt\$table,ntotreads=nreads,geneCardslink=urls) - print.noquote(tt[1:50,]) - deTags = rownames(output[output\$adj.p.value < fdrthresh,]) - nsig = length(deTags) - print(paste(nsig,'tags significant at adj p=',fdrthresh),quote=F) - outSmear = paste(outputdir,paste(mt,cont,"Smearplot.pdf",sep='_'),sep='/') - outMain = paste(cont,' (FDR@',fdrthresh,' N = ',nsig,')',sep='') - smearPlot(DGEList=DE,deTags=deTags, outSmear=outSmear, outMain = outMain) - qqPlot(descr=paste(myTitle,cont),pvector=DE\$table\$PValue) - } -} - -rossDecide = function (object, adjust.method="BH", p.value=0.05,verbose=T) -{ - if (!is(object, "DGEExact") & !is(object, "DGELRT")) - stop("Need DGEExact or DGELRT object") - DE = decideTestsDGE(object, adjust.method = adjust.method, p.value = p.value) - sumDE = summary(DE) - rownames(sumDE)[1] = "DOWN" - rownames(sumDE)[3] = "UP" - sumDE = sumDE[c(3, 1), ] - if (verbose) { - cat("\n") - cat("------------------------------------------------------", - "\n") - cat(" Method for Selecting DEGs:", DEmethod, "\n") - cat(" Multiple Testing method: ", MTestmethod, "- pval", - PVcut, "\n") - cat("\n") - print.noquote(sumDE) - cat("------------------------------------------------------", - "\n") - } - return(DE) -} - -options(width=512) -Out_Dir = "$html_file.files_path" -Input = "$input1" -factor1name = "$factor1_name" -factor1 = as.factor(strsplit("$factor1",",")) -factor2name = "$factor2_name" -factor2 = as.factor(strsplit("$factor2",",")) -org = "$input1.dbkey" -outtabs = strsplit("$outtab",",") -fdrtype = "$fdrtype" -priorn = $priorn -fdrthresh = $fdrthresh -useNDfilt = "$useNDfilt" -fQ = $fQ -myTitle = "$title" - -print.noquote(paste('# got =',paste(CCols2,collapse=','))) -Count_Matrix = read.table(Input,header=T,row.names=1,sep='\t') -rn = rownames(Count_Matrix) -nsamp = length(colnames(Count_Matrix)) - -if (file.exists(Out_Dir) == F) dir.create(Out_Dir) -print.noquote(paste('Got nsamp = ',nsamp,'org=',org)) -islib = rn %in% c('librarySize','NotInBedRegions') -LibSizes = Count_Matrix[subset(rn,islib),][1] -Count_Matrix = Count_Matrix[subset(rn,! islib),] -print(paste('Got groups =',paste(group,collapse=' '))) -if (priorn <= 0) {priorn = ceiling(20/(length(group)-1))} -cn = colnames(Count_Matrix) -results = edgeIt(Count_Matrix,group=group,outtab1=outtab1,outtab2=outtab2,outtab3=outtab3,fdrtype,priorn,fdrthresh, - Out_Dir,myTitle,libSize=c(),useQuantile=useNDfilt,filterquantile=fQ,org=org) -options(width=80) -sessionInfo() - -</configfile> -</configfiles> -<tests> -<test> -<param name='input1' value='DGEtest.xls' ftype='tabular' /> - <param name='treatment_name' value='case' /> - <param name='title' value='DGEtest' /> - <param name='fdrtype' value='fdr' /> - <param name='priorn' value="5" /> - <param name='fdrthresh' value="0.05" /> - <param name='control_name' value='control' /> - <param name='Treat_cols' value='c3,c6,c9' /> - <param name='Control_cols' value='c2,c5,c8' /> - <output name='outtab' file='DGEtest1out.xls' ftype='tabular' compare='diff' /> - <output name='html_file' file='DGEtest1out.html' ftype='html' compare='diff' lines_diff='20' /> -</test> -</tests> -<help> -**Executive Summary** -This is a Galaxy wrapper exposing the Bioconductor edgeR_ package. -There is a companion pairwise comparison tool but this is the two way model tool. -It will help you to analyse the main effects in a 2 way anova. - -**If things go wrong** -edgeR software is thoroughly documented at edgeR_Manual_ and the authors -provide excellent support on the Bioconductor users' mailing list for their code, but -if this tool fails to run properly in Galaxy, please make sure you -fill in the form and send the administrators a Galaxy bug report. -If you have questions about the edgeR code this tool wraps, please -try the Bioconductor users' list. - -**What data is it for?** -Like the pairwise version, it takes a count matrix with columns containing reads per contig (NOT transformed!) for -multiple replicates in comparison groups. The counts can be any kind of counts. It was designed for -short read sequence data - RNA-seq, miR-seq,...ChIP-seq. - -**Slightly Longer Version** -Performs digital gene expression factorial analysis for a 2x2 design. -Factorial designs are old but good if you want to get at the individual effects and interaction between two factors. -EG: a treatment applied to 2 different cell types, or two treatments applied in all 4 possible combinations (-a-b,-a,+b,+a-b,+a+b). -Data with replicates in the four groups is supplied as a count matrix. -The two primary comparisons are defined by selecting two sets of samples for comparison -best thought of as control and treatment (whatever that means) for each of the main comparisons. - -The interaction is defined as the difference between those two comparisons and is reported as a topTable as are the -primary comparisons. - -All comparisons are reported as separate tabular spreadsheets ordered by p value and a comprehensive summary is -provided in the html output. - -This code essentially embelishes the code described by Gordon Smythe in the limma documentation for a factorial -analysis. - -**Input** - -A matrix consisting of non-negative integers. The matrix must have a unique header row identifiying the samples, as well as a unique set of row names -as the first column. - -**Output** - -Tabular files which contain the statistical results and the raw and transformed counts and some colourful -and helpful plots - -**Note on edgeR versions** - -The edgeR authors made a small cosmetic change in the name of one important variable (from p.value to PValue) -breaking this and all other code that assumed the old name for this variable, -between edgeR2.4.4 and 2.4.6 (the version for R 2.14 as at the time of writing). -This means that all code using edgeR is sensitive to the version. I think this was a very unwise thing -to do because it wasted hours of my time to track down and will similarly cost other edgeR users dearly -when their old scripts break. This tool currently now works with 2.4.6. - -**Note on prior.N - now replaced with prior.df** - -http://seqanswers.com/forums/showthread.php?t=5591 says: - -*prior.n* - -The value for prior.n determines the amount of smoothing of tagwise dispersions towards the common dispersion. -You can think of it as like a "weight" for the common value. (It is actually the weight for the common likelihood -in the weighted likelihood equation). The larger the value for prior.n, the more smoothing, i.e. the closer your -tagwise dispersion estimates will be to the common dispersion. If you use a prior.n of 1, then that gives the -common likelihood the weight of one observation. - -In answer to your question, it is a good thing to squeeze the tagwise dispersions towards a common value, -or else you will be using very unreliable estimates of the dispersion. I would not recommend using the value that -you obtained from estimateSmoothing()---this is far too small and would result in virtually no moderation -(squeezing) of the tagwise dispersions. How many samples do you have in your experiment? -What is the experimental design? If you have few samples (less than 6) then I would suggest a prior.n of at least 10. -If you have more samples, then the tagwise dispersion estimates will be more reliable, -so you could consider using a smaller prior.n, although I would hesitate to use a prior.n less than 5. - -From Bioconductor Digest, Vol 118, Issue 5, Gordon writes: - -Dear Dorota, - -The important settings are prior.df and trend. - -prior.n and prior.df are related through prior.df = prior.n * residual.df, -and your experiment has residual.df = 36 - 12 = 24. So the old setting of -prior.n=10 is equivalent for your data to prior.df = 240, a very large -value. Going the other way, the new setting of prior.df=10 is equivalent -to prior.n=10/24. - -To recover old results with the current software you would use - - estimateTagwiseDisp(object, prior.df=240, trend="none") - -To get the new default from old software you would use - - estimateTagwiseDisp(object, prior.n=10/24, trend=TRUE) - -Actually the old trend method is equivalent to trend="loess" in the new -software. You should use plotBCV(object) to see whether a trend is -required. - -Note you could also use - - prior.n = getPriorN(object, prior.df=10) - -to map between prior.df and prior.n. - - - .. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html - .. _edgeR_Manual: http://www.bioconductor.org/packages/release/bioc/vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf - -</help> - -</tool> - -
--- a/rgedgeR/rgedgeRpaired.xml.iaas1 Wed Jun 12 02:58:43 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,627 +0,0 @@ -<tool id="rgedgeRpaired" name="edgeR paired" version="0.18"> - <description>2 level Anova for counts</description> - <command interpreter="python"> - rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "edgeR" - --output_dir "$html_file.files_path" --output_html "$html_file" --output_tab "$outtab" --make_HTML "yes" - </command> - <inputs> - <param name="input1" type="data" format="tabular" label="Select an input matrix - rows are contigs, columns are counts for each sample" - help="Use the HTSeq based count matrix preparation tool to create these matrices from BAM/SAM files and a GTF file of genomic features"/> - <param name="title" type="text" value="edgeR" size="80" label="Title for job outputs" help="Supply a meaningful name here to remind you what the outputs contain"> - <sanitizer invalid_char=""> - <valid initial="string.letters,string.digits"><add value="_" /> </valid> - </sanitizer> - </param> - <param name="treatment_name" type="text" value="Treatment" size="50" label="Treatment Name"/> - <param name="Treat_cols" label="Select columns containing treatment." type="data_column" data_ref="input1" numerical="True" - multiple="true" use_header_names="true" size="120" display="checkboxes"> - <validator type="no_options" message="Please select at least one column."/> - </param> - <param name="control_name" type="text" value="Control" size="50" label="Control Name"/> - <param name="Control_cols" label="Select columns containing control." type="data_column" data_ref="input1" numerical="True" - multiple="true" use_header_names="true" size="120" display="checkboxes" optional="true"> - </param> - <param name="subjectids" type="text" optional="true" size="120" - label="IF SUBJECTS NOT ALL INDEPENDENT! Enter integers to indicate sample pairing for every column in input" - help="Leave blank if no pairing, but eg if data from sample id A99 is in columns 2,4 and id C21 is in 3,5 then enter '1,2,1,2'"> - <sanitizer> - <valid initial="string.digits"><add value="," /> </valid> - </sanitizer> - </param> - <param name="fQ" type="float" value="0.3" size="5" label="Non-differential contig count quantile threshold - zero to analyze all non-zero read count contigs" - help="May be a good or a bad idea depending on the biology and the question. EG 0.3 = sparsest 30% of contigs with at least one read are removed before analysis"/> - <param name="useNDF" type="boolean" truevalue="T" checked='false' falsevalue="" size="1" label="Non differential filter - remove contigs below a threshold (1 per million) for half or more samples" - help="May be a good or a bad idea depending on the biology and the question. This was the old default. Quantile based is available as an alternative"/> - <param name="priordf" type="integer" value="20" size="3" label="prior.df for tagwise dispersion - lower value = more emphasis on each tag's variance. Replaces prior.n and prior.df = prior.n * residual.df" - help="Zero = Use edgeR default. Use a small value to 'smooth' small samples. See edgeR docs and note below"/> - <param name="fdrthresh" type="float" value="0.05" size="5" label="P value threshold for FDR filtering for amily wise error rate control" - help="Conventional default value of 0.05 recommended"/> - <param name="fdrtype" type="select" label="FDR (Type II error) control method" - help="Use fdr or bh typically to control for the number of tests in a reliable way"> - <option value="fdr" selected="true">fdr</option> - <option value="BH">Benjamini Hochberg</option> - <option value="BY">Benjamini Yukateli</option> - <option value="bonferroni">Bonferroni</option> - <option value="hochberg">Hochberg</option> - <option value="holm">Holm</option> - <option value="hommel">Hommel</option> - <option value="none">no control for multiple tests</option> - </param> - </inputs> - <outputs> - <data format="tabular" name="outtab" label="${title}.xls"/> - <data format="html" name="html_file" label="${title}.html"/> - </outputs> - <stdio> - <exit_code range="4" level="fatal" description="Number of subject ids must match total number of samples in the input matrix" /> - </stdio> - <tests> -<test> -<param name='input1' value='test_bams2mx.xls' ftype='tabular' /> - <param name='treatment_name' value='case' /> - <param name='title' value='edgeRtest' /> - <param name='fdrtype' value='fdr' /> - <param name='priordf' value="0" /> - <param name='fdrthresh' value="0.05" /> - <param name='control_name' value='control' /> - <param name='Treat_cols' value='c3,c4,c5,c9' /> - <param name='Control_cols' value='c2,c6,c7,c8' /> - <output name='outtab' file='edgeRtest1out.xls' ftype='tabular' compare='diff' /> - <output name='html_file' file='edgeRtest1out.html' ftype='html' compare='diff' lines_diff='20' /> -</test> -</tests> - -<configfiles> -<configfile name="runme"> -<![CDATA[ -# -# edgeR.Rscript -# updated npv 2011 for R 2.14.0 and edgeR 2.4.0 by ross -# Performs DGE on a count table containing n replicates of two conditions -# -# Parameters -# -# 1 - Output Dir - -# Original edgeR code by: S.Lunke and A.Kaspi -reallybig = log10(.Machine\$double.xmax) -reallysmall = log10(.Machine\$double.xmin) -library('stringr') -library('gplots') -library('DESeq') -library('edgeR') -hmap2 = function(cmat,nsamp=100,outpdfname='heatmap2.pdf', TName='Treatment',group=NA,myTitle='title goes here') -{ -# Perform clustering for significant pvalues after controlling FWER - samples = colnames(cmat) - gu = unique(group) - if (length(gu) == 2) { - col.map = function(g) {if (g==gu[1]) "#FF0000" else "#0000FF"} - pcols = unlist(lapply(group,col.map)) - } else { - colours = rainbow(length(gu),start=0,end=4/6) - pcols = colours[match(group,gu)] } - gn = rownames(cmat) - dm = cmat[(! is.na(gn)),] - # remove unlabelled hm rows - nprobes = nrow(dm) - # sub = paste('Showing',nprobes,'contigs ranked for evidence of differential abundance') - if (nprobes > nsamp) { - dm =dm[1:nsamp,] - #sub = paste('Showing',nsamp,'contigs ranked for evidence for differential abundance out of',nprobes,'total') - } - newcolnames = substr(colnames(dm),1,20) - colnames(dm) = newcolnames - pdf(outpdfname) - heatmap.2(dm,main=myTitle,ColSideColors=pcols,col=topo.colors(100),dendrogram="col",key=T,density.info='none', - Rowv=F,scale='row',trace='none',margins=c(8,8),cexRow=0.4,cexCol=0.5) - dev.off() -} - -hmap = function(cmat,nmeans=4,outpdfname="heatMap.pdf",nsamp=250,TName='Treatment',group=NA,myTitle="Title goes here") -{ - # for 2 groups only was - #col.map = function(g) {if (g==TName) "#FF0000" else "#0000FF"} - #pcols = unlist(lapply(group,col.map)) - gu = unique(group) - colours = rainbow(length(gu),start=0.3,end=0.6) - pcols = colours[match(group,gu)] - nrows = nrow(cmat) - mtitle = paste(myTitle,'Heatmap: n contigs =',nrows) - if (nrows > nsamp) { - cmat = cmat[c(1:nsamp),] - mtitle = paste('Heatmap: Top ',nsamp,' DE contigs (of ',nrows,')',sep='') - } - newcolnames = substr(colnames(cmat),1,20) - colnames(cmat) = newcolnames - pdf(outpdfname) - heatmap(cmat,scale='row',main=mtitle,cexRow=0.3,cexCol=0.4,Rowv=NA,ColSideColors=pcols) - dev.off() -} - -qqPlot = function(descr='Title',pvector, ...) -# stolen from https://gist.github.com/703512 -{ - o = -log10(sort(pvector,decreasing=F)) - e = -log10( 1:length(o)/length(o) ) - o[o==-Inf] = reallysmall - o[o==Inf] = reallybig - pdfname = paste(gsub(" ","", descr , fixed=TRUE),'pval_qq.pdf',sep='_') - maint = paste(descr,'QQ Plot') - pdf(pdfname) - plot(e,o,pch=19,cex=1, main=maint, ..., - xlab=expression(Expected~~-log[10](italic(p))), - ylab=expression(Observed~~-log[10](italic(p))), - xlim=c(0,max(e)), ylim=c(0,max(o))) - lines(e,e,col="red") - grid(col = "lightgray", lty = "dotted") - dev.off() -} - -smearPlot = function(DGEList,deTags, outSmear, outMain) - { - pdf(outSmear) - plotSmear(DGEList,de.tags=deTags,main=outMain) - grid(col="blue") - dev.off() - } - -boxPlot = function(rawrs,cleanrs,maint,myTitle) -{ # - nc = ncol(rawrs) - for (i in c(1:nc)) {rawrs[(rawrs[,i] < 0),i] = NA} - fullnames = colnames(rawrs) - newcolnames = substr(colnames(rawrs),1,20) - colnames(rawrs) = newcolnames - newcolnames = substr(colnames(cleanrs),1,20) - colnames(cleanrs) = newcolnames - pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"sampleBoxplot.pdf",sep='_') - defpar = par(no.readonly=T) - pdf(pdfname,height=6,width=8) - #par(mfrow=c(1,2)) # 1 rows 2 col - l = layout(matrix(c(1,2),1,2,byrow=T)) - print.noquote('raw contig counts by sample:') - print.noquote(summary(rawrs)) - print.noquote('normalised contig counts by sample:') - print.noquote(summary(cleanrs)) - boxplot(rawrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('Raw:',maint)) - grid(col="blue") - boxplot(cleanrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('After ',maint)) - grid(col="blue") - dev.off() - pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"samplehistplot.pdf",sep='_') - nc = ncol(rawrs) - print.noquote(paste('Using ncol rawrs=',nc)) - ncroot = round(sqrt(nc)) - if (ncroot*ncroot < nc) { ncroot = ncroot + 1 } - m = c() - for (i in c(1:nc)) { - rhist = hist(rawrs[,i],breaks=100,plot=F) - m = append(m,max(rhist\$counts)) - } - ymax = max(m) - pdf(pdfname) - par(mfrow=c(ncroot,ncroot)) - for (i in c(1:nc)) { - hist(rawrs[,i], main=paste("Contig logcount",i), xlab='log raw count', col="maroon", - breaks=100,sub=fullnames[i],cex=0.8,ylim=c(0,ymax)) - } - dev.off() - par(defpar) - -} - -cumPlot = function(rawrs,cleanrs,maint,myTitle) -{ # updated to use ecdf - pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"RowsumCum.pdf",sep='_') - defpar = par(no.readonly=T) - pdf(pdfname) - par(mfrow=c(2,1)) - lrs = log(rawrs,10) - lim = max(lrs) - hist(lrs,breaks=100,main=paste('Before:',maint),xlab="# Reads (log)", - ylab="Count",col="maroon",sub=myTitle, xlim=c(0,lim),las=1) - grid(col="blue") - lrs = log(cleanrs,10) - hist(lrs,breaks=100,main=paste('After:',maint),xlab="# Reads (log)", - ylab="Count",col="maroon",sub=myTitle,xlim=c(0,lim),las=1) - grid(col="blue") - dev.off() - par(defpar) -} - -cumPlot1 = function(rawrs,cleanrs,maint,myTitle) -{ # updated to use ecdf - pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"RowsumCum.pdf",sep='_') - pdf(pdfname) - par(mfrow=c(2,1)) - lastx = max(rawrs) - rawe = knots(ecdf(rawrs)) - cleane = knots(ecdf(cleanrs)) - cy = 1:length(cleane)/length(cleane) - ry = 1:length(rawe)/length(rawe) - plot(rawe,ry,type='l',main=paste('Before',maint),xlab="Log Contig Total Reads", - ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle) - grid(col="blue") - plot(cleane,cy,type='l',main=paste('After',maint),xlab="Log Contig Total Reads", - ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle) - grid(col="blue") - dev.off() -} - - - -edgeIt = function (Count_Matrix,group,outputfilename,fdrtype='fdr',priordf=5,fdrthresh=0.05,outputdir='.', - myTitle='edgeR',libSize=c(),useNDF="T",filterquantile=0.2,subjects=c()) { - - # Error handling - if (length(unique(group))!=2){ - print("Number of conditions identified in experiment does not equal 2") - q() - } - require(edgeR) - mt = paste(unlist(strsplit(myTitle,'_')),collapse=" ") - allN = nrow(Count_Matrix) - nscut = round(ncol(Count_Matrix)/2) - colTotmillionreads = colSums(Count_Matrix)/1e6 - rawrs = rowSums(Count_Matrix) - nonzerod = Count_Matrix[(rawrs > 0),] # remove all zero count genes - nzN = nrow(nonzerod) - nzrs = rowSums(nonzerod) - zN = allN - nzN - print('# Quantiles for non-zero row counts:',quote=F) - print(quantile(nzrs,probs=seq(0,1,0.1)),quote=F) - if (useNDF == "T") - { - gt1rpin3 = rowSums(Count_Matrix/expandAsMatrix(colTotmillionreads,dim(Count_Matrix)) >= 1) >= nscut - lo = colSums(Count_Matrix[!gt1rpin3,]) - workCM = Count_Matrix[gt1rpin3,] - cleanrs = rowSums(workCM) - cleanN = length(cleanrs) - meth = paste( "After removing",length(lo),"contigs with fewer than ",nscut," sample read counts >= 1 per million, there are",sep="") - print(paste("Read",allN,"contigs. Removed",zN,"contigs with no reads.",meth,cleanN,"contigs"),quote=F) - maint = paste('Filter >=1/million reads in >=',nscut,'samples') - } - else { - useme = (nzrs > quantile(nzrs,filterquantile)) - workCM = nonzerod[useme,] - lo = colSums(nonzerod[!useme,]) - cleanrs = rowSums(workCM) - cleanN = length(cleanrs) - meth = paste("After filtering at count quantile =",filterquantile,", there are",sep="") - print(paste('Read',allN,"contigs. Removed",zN,"with no reads.",meth,cleanN,"contigs"),quote=F) - maint = paste('Filter below',filterquantile,'quantile') - } - cumPlot(rawrs=rawrs,cleanrs=cleanrs,maint=maint,myTitle=myTitle) - allgenes <- rownames(workCM) - print(paste("# Total low count contigs per sample = ",paste(lo,collapse=',')),quote=F) - rsums = rowSums(workCM) - TName=unique(group)[1] - CName=unique(group)[2] - # Setup DGEList object - DGEList = DGEList(counts=workCM, group = group) - if (length(subjects) == 0) - { - doDESEQ = T - mydesign = model.matrix(~group) - } - else { - doDESEQ = F - subjf = factor(subjects) - mydesign = model.matrix(~subjf+group) # we block on subject so make group last to simplify finding it - } - print.noquote(paste('Using samples:',paste(colnames(workCM),collapse=','))) - print.noquote('Using design matrix:') - print.noquote(mydesign) - DGEList = estimateGLMCommonDisp(DGEList,mydesign) - comdisp = DGEList\$common.dispersion - DGEList = estimateGLMTrendedDisp(DGEList,mydesign) - if (priordf > 0) { - print.noquote(paste("prior.df =",priordf)) - DGEList = estimateGLMTagwiseDisp(DGEList,mydesign,prior.df = priordf) - } else { - DGEList = estimateGLMTagwiseDisp(DGEList,mydesign) - } - DGLM = glmFit(DGEList,design=mydesign) - efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors - normData = (1e+06*DGEList\$counts/efflib) - co = length(colnames(mydesign)) - DE = glmLRT(DGLM,coef=co) # always last one - subject is first if needed - uoutput = cbind( - Name=as.character(rownames(DGEList\$counts)), - DE\$table, - adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype), - Dispersion=DGEList\$tagwise.dispersion,totreads=rsums,normData, - DGEList\$counts - ) - soutput = uoutput[order(DE\$table\$PValue),] # sorted into p value order - for quick toptable - goodness = gof(DGLM, pcutoff=fdrthresh) - if (sum(goodness\$outlier) > 0) { - print.noquote('GLM outliers:') - print(paste(rownames(DGLM)[(goodness\$outlier != 0)],collapse=','),quote=F) - z = limma::zscoreGamma(goodness\$gof.statistic, shape=goodness\$df/2, scale=2) - pdf(paste(mt,"GoodnessofFit.pdf",sep='_')) - qq = qqnorm(z, panel.first=grid(), main="tagwise dispersion") - abline(0,1,lwd=3) - points(qq\$x[goodness\$outlier],qq\$y[goodness\$outlier], pch=16, col="dodgerblue") - dev.off() - } else { print('No GLM fit outlier genes found\n')} - estpriorn = getPriorN(DGEList) - print(paste("Common Dispersion =",comdisp,"CV = ",sqrt(comdisp),"getPriorN = ",estpriorn),quote=F) - efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors - normData = (1e+06*DGEList\$counts/efflib) - uniqueg = unique(group) - # Plot MDS - sample_colors = match(group,levels(group)) - pdf(paste(mt,"MDSplot.pdf",sep='_')) - sampleTypes = levels(group) - plotMDS.DGEList(DGEList,main=paste("MDS Plot for",myTitle),cex=0.5,col=sample_colors,pch=sample_colors) - legend(x="topleft", legend = sampleTypes,col=c(1:length(sampleTypes)), pch=19) - grid(col="blue") - dev.off() - colnames(normData) = paste( colnames(normData),'N',sep="_") - print(paste('Raw sample read totals',paste(colSums(nonzerod,na.rm=T),collapse=','))) - nzd = data.frame(log(nonzerod + 1e-2,10)) - boxPlot(rawrs=nzd,cleanrs=log(normData,10),maint='TMM Normalisation',myTitle=myTitle) - if (doDESEQ) - { - # DESeq - deSeqDatcount <- newCountDataSet(workCM, group) - deSeqDatsizefac <- estimateSizeFactors(deSeqDatcount) - deSeqDatdisp <- estimateDispersions(deSeqDatsizefac) - rDESeq <- nbinomTest(deSeqDatdisp, levels(group)[1], levels(group)[2]) - rDESeq <- rDESeq[order(rDESeq\$pval), ] - write.table(rDESeq,paste(mt,'DESeq_TopTable.xls',sep='_'), quote=FALSE, sep="\t",row.names=F) - topresults.DESeq <- rDESeq[which(rDESeq\$padj < fdrthresh), ] - DESeqcountsindex <- which(allgenes %in% topresults.DESeq\$id) - DESeqcounts <- rep(0, length(allgenes)) - DESeqcounts[DESeqcountsindex] <- 1 - } - DGEList = calcNormFactors(DGEList) - norm.factor = DGEList\$samples\$norm.factors - pdf(paste(mt,"voomplot.pdf",sep='_')) - dat.voomed <- voom(DGEList, mydesign, plot = TRUE, lib.size = colSums(workCM) * norm.factor) - dev.off() - # Use limma to fit data - fit <- lmFit(dat.voomed, mydesign) - fit <- eBayes(fit) - rvoom <- topTable(fit, coef = length(colnames(mydesign)), adj = "BH", n = Inf) - write.table(rvoom,paste(mt,'VOOM_topTable.xls',sep='_'), quote=FALSE, sep="\t",row.names=F) - # Use an FDR cutoff to find interesting samples for edgeR, DESeq and voom/limma - topresults.voom <- rvoom[which(rvoom\$adj.P.Val < fdrthresh), ] - topresults.edgeR <- soutput[which(soutput\$adj.p.value < fdrthresh), ] - # Create venn diagram of hits - edgeRcountsindex <- which(allgenes %in% rownames(topresults.edgeR)) - voomcountsindex <- which(allgenes %in% topresults.voom\$ID) - edgeRcounts <- rep(0, length(allgenes)) - edgeRcounts[edgeRcountsindex] <- 1 - voomcounts <- rep(0, length(allgenes)) - voomcounts[voomcountsindex] <- 1 - if (doDESEQ) { - vennmain = paste(mt,'Voom,edgeR and DESeq overlap at FDR=',fdrthresh) - counts.dataframe <- data.frame(edgeRcounts = edgeRcounts, DESeqcounts = DESeqcounts, - voomcounts = voomcounts, row.names = allgenes) - } else { - vennmain = paste(mt,'Voom and edgeR overlap at FDR=',fdrthresh) - counts.dataframe <- data.frame(edgeRcounts = edgeRcounts, voomcounts = voomcounts, row.names = allgenes) - } - counts.venn <- vennCounts(counts.dataframe) - vennf = paste(mt,'venn.pdf',sep='_') - pdf(vennf) - vennDiagram(counts.venn,main=vennmain,col="maroon") - dev.off() - #Prepare our output file - nreads = soutput\$totreads # ordered same way - print('# writing output',quote=F) - write.table(soutput,outputfilename, quote=FALSE, sep="\t",row.names=F) - rn = uoutput\$Name - reg = "^chr([0-9]+):([0-9]+)-([0-9]+)" - org="hg19" - genecards="<a href='http://www.genecards.org/index.php?path=/Search/keyword/" - ucsc = paste("<a href='http://genome.ucsc.edu/cgi-bin/hgTracks?db=",org,sep='') - testreg = str_match(rn,reg) - nreads = uoutput\$totreads # ordered same way - if (sum(!is.na(testreg[,1]))/length(testreg[,1]) > 0.8) # is ucsc style string - { - urls = paste(ucsc,'&position=chr',testreg[,2],':',testreg[,3],"-",testreg[,4],"'>",rn,'</a>',sep='') - } else { - urls = paste(genecards,rn,"'></a>",rn,'</a>',sep="") - } - print.noquote('# urls') - cat(head(urls)) - tt = uoutput - cat("# Top tags\n") - tt = cbind(tt,ntotreads=nreads,URL=urls) # add to end so table isn't laid out strangely - tt = tt[order(DE\$table\$PValue),] - options(width = 500) - print.noquote(tt[1:50,]) - pdf(paste(mt,"BCV_vs_abundance.pdf",sep='_')) - plotBCV(DGEList, cex=0.3, main="Biological CV vs abundance") - dev.off() - # Plot MAplot - deTags = rownames(uoutput[uoutput\$adj.p.value < fdrthresh,]) - nsig = length(deTags) - print(paste('#',nsig,'tags significant at adj p=',fdrthresh),quote=F) - print('# deTags',quote=F) - print(head(deTags)) - dg = DGEList[order(DE\$table\$PValue),] - #normData = (1e+06 * dg\$counts/expandAsMatrix(dg\$samples\$lib.size, dim(dg))) - efflib = dg\$samples\$lib.size*dg\$samples\$norm.factors - normData = (1e+06*dg\$counts/efflib) - outpdfname=paste(mt,"heatmap.pdf",sep='_') - hmap2(normData,nsamp=100,TName=TName,group=group,outpdfname=outpdfname,myTitle=myTitle) - outSmear = paste(mt,"Smearplot.pdf",sep='_') - outMain = paste("Smear Plot for ",TName,' Vs ',CName,' (FDR@',fdrthresh,' N = ',nsig,')',sep='') - smearPlot(DGEList=DGEList,deTags=deTags, outSmear=outSmear, outMain = outMain) - qqPlot(descr=myTitle,pvector=DE\$table\$PValue) - if (doDESEQ) { - cat("# DESeq top 50\n") - print(rDESeq[1:50,]) - } - cat("# VOOM top 50\n") - print(rvoom[1:50,]) - # need a design matrix and glm to use this - goodness = gof(DGLM, pcutoff=fdrthresh) - nout = sum(goodness\$outlier) - if (nout > 0) { - print.noquote(paste('Found',nout,'Goodness of fit outliers')) - rownames(DGLM)[goodness\$outlier] - z = limma::zscoreGamma(goodness\$gof.statistic, shape=goodness\$df/2, scale=2) - pdf(paste(mt,"GoodnessofFit.pdf",sep='_')) - qq = qqnorm(z, panel.first=grid(), main="tagwise dispersion") - abline(0,1,lwd=3) - points(qq\$x[goodness\$outlier],qq\$y[goodness\$outlier], pch=16, col="dodgerblue") - dev.off() - } - #Return our main table - uoutput - -} #Done -sink(stdout(),append=T,type="message") -options(width=512) -Out_Dir = "$html_file.files_path" -Input = "$input1" -TreatmentName = "$treatment_name" -TreatmentCols = "$Treat_cols" -ControlName = "$control_name" -ControlCols= "$Control_cols" -outputfilename = "$outtab" -fdrtype = "$fdrtype" -priordf = $priordf -fdrthresh = $fdrthresh -useNDF = "$useNDF" -fQ = $fQ # non-differential centile cutoff -myTitle = "$title" -subjects = c($subjectids) -nsubj = length(subjects) -#Set our columns -TCols = as.numeric(strsplit(TreatmentCols,",")[[1]])-1 -CCols = as.numeric(strsplit(ControlCols,",")[[1]])-1 -cat('# got TCols=') -cat(TCols) -cat('; CCols=') -cat(CCols) -cat('\n') -useCols = c(TCols,CCols) -# Create output dir if non existent - if (file.exists(Out_Dir) == F) dir.create(Out_Dir) - -Count_Matrix = read.table(Input,header=T,row.names=1,sep='\t') #Load tab file assume header -snames = colnames(Count_Matrix) -nsamples = length(snames) -if (nsubj > 0 & nsubj != nsamples) { -options("show.error.messages"=T) -mess = paste('Fatal error: Supplied subject id list',paste(subjects,collapse=','),'has length',nsubj,'but there are',nsamples,'samples',paste(snames,collapse=',')) -write(mess, stderr()) -#print(mess) -quit(save="no",status=4) -} - -Count_Matrix = Count_Matrix[,useCols] # reorder columns -if (length(subjects) != 0) {subjects = subjects[useCols]} -rn = rownames(Count_Matrix) -islib = rn %in% c('librarySize','NotInBedRegions') -LibSizes = Count_Matrix[subset(rn,islib),][1] # take first -Count_Matrix = Count_Matrix[subset(rn,! islib),] -group = c(rep(TreatmentName,length(TCols)), rep(ControlName,length(CCols)) ) #Build a group descriptor -group = factor(group, levels=c(ControlName,TreatmentName)) -colnames(Count_Matrix) = paste(group,colnames(Count_Matrix),sep="_") #Relable columns -results = edgeIt(Count_Matrix=Count_Matrix,group=group,outputfilename=outputfilename,fdrtype=fdrtype,priordf=priordf,fdrthresh=fdrthresh, - outputdir=Out_Dir,myTitle=myTitle,libSize=c(),useNDF=useNDF,filterquantile=fQ,subjects=subjects) -#Run the main function -# for the log -sessionInfo() -]]> -</configfile> -</configfiles> -<help> -**What it does** - -Performs digital gene expression analysis between a treatment and control on a count matrix. -Optionally adds a term for subject if not all samples are independent or if some other factor needs to be blocked in the design. - -**Input** - -A matrix consisting of non-negative integers. The matrix must have a unique header row identifiying the samples, and a unique set of row names -as the first column. Typically the row names are gene symbols or probe id's for downstream use in GSEA and other methods. - -If you have (eg) paired samples and wish to include a term in the GLM to account for some other factor (subject in the case of paired samples), -put a comma separated list of indicators for every sample (whether modelled or not!) indicating (eg) the subject number or -A list of integers, one for each subject or an empty string if samples are all independent. -If not empty, there must be exactly as many integers in the supplied integer list as there are columns (samples) in the count matrix. -Integers for samples that are not in the analysis *must* be present in the string as filler even if not used. - -So if you have 2 pairs out of 6 samples, you need to put in unique integers for the unpaired ones -eg if you had 6 samples with the first two independent but the second and third pairs each being from independent subjects. you might use -8,9,1,1,2,2 -as subject IDs to indicate two paired samples from the same subject in columns 3/4 and 5/6 - -**Output** - -A matrix which consists the original data and relative expression levels and some helpful plots - -**Note on edgeR versions** - -The edgeR authors made a small cosmetic change in the name of one important variable (from p.value to PValue) -breaking this and all other code that assumed the old name for this variable, -between edgeR2.4.4 and 2.4.6 (the version for R 2.14 as at the time of writing). -This means that all code using edgeR is sensitive to the version. I think this was a very unwise thing -to do because it wasted hours of my time to track down and will similarly cost other edgeR users dearly -when their old scripts break. This tool currently now works with 2.4.6. - -**Note on prior.N** - -http://seqanswers.com/forums/showthread.php?t=5591 says: - -*prior.n* - -The value for prior.n determines the amount of smoothing of tagwise dispersions towards the common dispersion. -You can think of it as like a "weight" for the common value. (It is actually the weight for the common likelihood -in the weighted likelihood equation). The larger the value for prior.n, the more smoothing, i.e. the closer your -tagwise dispersion estimates will be to the common dispersion. If you use a prior.n of 1, then that gives the -common likelihood the weight of one observation. - -In answer to your question, it is a good thing to squeeze the tagwise dispersions towards a common value, -or else you will be using very unreliable estimates of the dispersion. I would not recommend using the value that -you obtained from estimateSmoothing()---this is far too small and would result in virtually no moderation -(squeezing) of the tagwise dispersions. How many samples do you have in your experiment? -What is the experimental design? If you have few samples (less than 6) then I would suggest a prior.n of at least 10. -If you have more samples, then the tagwise dispersion estimates will be more reliable, -so you could consider using a smaller prior.n, although I would hesitate to use a prior.n less than 5. - - -From Bioconductor Digest, Vol 118, Issue 5, Gordon writes: - -Dear Dorota, - -The important settings are prior.df and trend. - -prior.n and prior.df are related through prior.df = prior.n * residual.df, -and your experiment has residual.df = 36 - 12 = 24. So the old setting of -prior.n=10 is equivalent for your data to prior.df = 240, a very large -value. Going the other way, the new setting of prior.df=10 is equivalent -to prior.n=10/24. - -To recover old results with the current software you would use - - estimateTagwiseDisp(object, prior.df=240, trend="none") - -To get the new default from old software you would use - - estimateTagwiseDisp(object, prior.n=10/24, trend=TRUE) - -Actually the old trend method is equivalent to trend="loess" in the new -software. You should use plotBCV(object) to see whether a trend is -required. - -Note you could also use - - prior.n = getPriorN(object, prior.df=10) - -to map between prior.df and prior.n. - -</help> - -</tool> - -