# HG changeset patch
# User fubar
# Date 1387834978 18000
# Node ID 6d9d2c9679ec5b8960e2f63ac6ee3cc1a8820b35
# Parent bdb19fdbd679a04cff3db2bb6f7f876b0c051319
Uploaded
diff -r bdb19fdbd679 -r 6d9d2c9679ec rgToolFactory.py
--- a/rgToolFactory.py Sun Dec 22 06:03:27 2013 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,634 +0,0 @@
-# rgToolFactory.py
-# see https://bitbucket.org/fubar/galaxytoolfactory/wiki/Home
-#
-# copyright ross lazarus (ross stop lazarus at gmail stop com) May 2012
-#
-# all rights reserved
-# Licensed under the LGPL
-# suggestions for improvement and bug fixes welcome at https://bitbucket.org/fubar/galaxytoolfactory/wiki/Home
-#
-# august 2013
-# found a problem with GS if $TMP or $TEMP missing - now inject /tmp and warn
-#
-# july 2013
-# added ability to combine images and individual log files into html output
-# Needs proper yaml control file but for now for each section you want separated out in the
-# HTML page, you need to use a name like foo or bar or header - they will be presented in alphabetic order
-#
-# Iff there's a log file foo.log and it will be layed out as preformated text in the HTML output
-# together with all images named like "foo_*.pdf in a separate "foo" section of the document
-# otherwise old format for html
-#
-# January 2013
-# problem pointed out by Carlos Borroto
-# added escaping for <>$ - thought I did that ages ago...
-#
-# August 11 2012
-# changed to use shell=False and cl as a sequence
-
-# This is a Galaxy tool factory for simple scripts in python, R or whatever ails ye.
-# It also serves as the wrapper for the new tool.
-#
-# you paste and run your script
-# Only works for simple scripts that read one input from the history.
-# Optionally can write one new history dataset,
-# and optionally collect any number of outputs into links on an autogenerated HTML page.
-
-# DO NOT install on a public or important site - please.
-
-# installed generated tools are fine if the script is safe.
-# They just run normally and their user cannot do anything unusually insecure
-# but please, practice safe toolshed.
-# Read the fucking code before you install any tool
-# especially this one
-
-# After you get the script working on some test data, you can
-# optionally generate a toolshed compatible gzip file
-# containing your script safely wrapped as an ordinary Galaxy script in your local toolshed for
-# safe and largely automated installation in a production Galaxy.
-
-# If you opt for an HTML output, you get all the script outputs arranged
-# as a single Html history item - all output files are linked, thumbnails for all the pdfs.
-# Ugly but really inexpensive.
-#
-# Patches appreciated please.
-#
-#
-# long route to June 2012 product
-# Behold the awesome power of Galaxy and the toolshed with the tool factory to bind them
-# derived from an integrated script model
-# called rgBaseScriptWrapper.py
-# Note to the unwary:
-# This tool allows arbitrary scripting on your Galaxy as the Galaxy user
-# There is nothing stopping a malicious user doing whatever they choose
-# Extremely dangerous!!
-# Totally insecure. So, trusted users only
-#
-# preferred model is a developer using their throw away workstation instance - ie a private site.
-# no real risk. The universe_wsgi.ini admin_users string is checked - only admin users are permitted to run this tool.
-#
-
-import sys
-import shutil
-import subprocess
-import os
-import time
-import tempfile
-import optparse
-import tarfile
-import re
-import shutil
-import math
-
-progname = os.path.split(sys.argv[0])[1]
-myversion = 'V000.2 June 2012'
-verbose = False
-debug = False
-toolFactoryURL = 'https://bitbucket.org/fubar/galaxytoolfactory'
-
-def timenow():
- """return current time as a string
- """
- return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))
-
-html_escape_table = {
- "&": "&",
- ">": ">",
- "<": "<",
- "$": "\$"
- }
-
-def html_escape(text):
- """Produce entities within text."""
- return "".join(html_escape_table.get(c,c) for c in text)
-
-def cmd_exists(cmd):
- return subprocess.call("type " + cmd, shell=True,
- stdout=subprocess.PIPE, stderr=subprocess.PIPE) == 0
-
-
-class ScriptRunner:
- """class is a wrapper for an arbitrary script
- """
-
- def __init__(self,opts=None,treatbashSpecial=True):
- """
- cleanup inputs, setup some outputs
-
- """
- self.useGM = cmd_exists('gm')
- self.useIM = cmd_exists('convert')
- self.useGS = cmd_exists('gs')
- self.temp_warned = False # we want only one warning if $TMP not set
- self.treatbashSpecial = treatbashSpecial
- if opts.output_dir: # simplify for the tool tarball
- os.chdir(opts.output_dir)
- self.thumbformat = 'png'
- self.opts = opts
- self.toolname = re.sub('[^a-zA-Z0-9_]+', '', opts.tool_name) # a sanitizer now does this but..
- self.toolid = self.toolname
- self.myname = sys.argv[0] # get our name because we write ourselves out as a tool later
- self.pyfile = self.myname # crude but efficient - the cruft won't hurt much
- self.xmlfile = '%s.xml' % self.toolname
- s = open(self.opts.script_path,'r').readlines()
- s = [x.rstrip() for x in s] # remove pesky dos line endings if needed
- self.script = '\n'.join(s)
- fhandle,self.sfile = tempfile.mkstemp(prefix=self.toolname,suffix=".%s" % (opts.interpreter))
- tscript = open(self.sfile,'w') # use self.sfile as script source for Popen
- tscript.write(self.script)
- tscript.close()
- self.indentedScript = '\n'.join([' %s' % x for x in s]) # for restructured text in help
- self.escapedScript = '\n'.join([html_escape(x) for x in s])
- self.elog = os.path.join(self.opts.output_dir,"%s_error.log" % self.toolname)
- if opts.output_dir: # may not want these complexities
- self.tlog = os.path.join(self.opts.output_dir,"%s_runner.log" % self.toolname)
- art = '%s.%s' % (self.toolname,opts.interpreter)
- artpath = os.path.join(self.opts.output_dir,art) # need full path
- artifact = open(artpath,'w') # use self.sfile as script source for Popen
- artifact.write(self.script)
- artifact.close()
- self.cl = []
- self.html = []
- a = self.cl.append
- a(opts.interpreter)
- if self.treatbashSpecial and opts.interpreter in ['bash','sh']:
- a(self.sfile)
- else:
- a('-') # stdin
- a(opts.input_tab)
- a(opts.output_tab)
- self.outFormats = 'tabular' # TODO make this an option at tool generation time
- self.inputFormats = 'tabular' # TODO make this an option at tool generation time
- self.test1Input = '%s_test1_input.xls' % self.toolname
- self.test1Output = '%s_test1_output.xls' % self.toolname
- self.test1HTML = '%s_test1_output.html' % self.toolname
-
- def makeXML(self):
- """
- Create a Galaxy xml tool wrapper for the new script as a string to write out
- fixme - use templating or something less fugly than this example of what we produce
-
-
- a tabular file
-
- reverse.py --script_path "$runMe" --interpreter "python"
- --tool_name "reverse" --input_tab "$input1" --output_tab "$tab_file"
-
-
-
-
-
-
-
-
-
-
-
-**What it Does**
-
-Reverse the columns in a tabular file
-
-
-
-
-
-# reverse order of columns in a tabular file
-import sys
-inp = sys.argv[1]
-outp = sys.argv[2]
-i = open(inp,'r')
-o = open(outp,'w')
-for row in i:
- rs = row.rstrip().split('\t')
- rs.reverse()
- o.write('\t'.join(rs))
- o.write('\n')
-i.close()
-o.close()
-
-
-
-
-
-
- """
- newXML="""
- %(tooldesc)s
- %(command)s
-
- %(inputs)s
-
-
- %(outputs)s
-
-
-
- %(script)s
-
-
- %(tooltests)s
-
- %(help)s
-
- """ # needs a dict with toolname, toolid, interpreter, scriptname, command, inputs as a multi line string ready to write, outputs ditto, help ditto
-
- newCommand="""
- %(toolname)s.py --script_path "$runMe" --interpreter "%(interpreter)s"
- --tool_name "%(toolname)s" %(command_inputs)s %(command_outputs)s
- """ # may NOT be an input or htmlout
- tooltestsTabOnly = """
-
-
-
-
- """
- tooltestsHTMLOnly = """
-
-
-
-
- """
- tooltestsBoth = """
-
-
-
-
-
- """
- xdict = {}
- xdict['tool_version'] = self.opts.tool_version
- xdict['test1Input'] = self.test1Input
- xdict['test1HTML'] = self.test1HTML
- xdict['test1Output'] = self.test1Output
- if self.opts.make_HTML and self.opts.output_tab <> 'None':
- xdict['tooltests'] = tooltestsBoth % xdict
- elif self.opts.make_HTML:
- xdict['tooltests'] = tooltestsHTMLOnly % xdict
- else:
- xdict['tooltests'] = tooltestsTabOnly % xdict
- xdict['script'] = self.escapedScript
- # configfile is least painful way to embed script to avoid external dependencies
- # but requires escaping of <, > and $ to avoid Mako parsing
- if self.opts.help_text:
- xdict['help'] = open(self.opts.help_text,'r').read()
- else:
- xdict['help'] = 'Please ask the tool author for help as none was supplied at tool generation'
- coda = ['**Script**','Pressing execute will run the following code over your input file and generate some outputs in your history::']
- coda.append(self.indentedScript)
- coda.append('**Attribution** This Galaxy tool was created by %s at %s\nusing the Galaxy Tool Factory.' % (self.opts.user_email,timenow()))
- coda.append('See %s for details of that project' % (toolFactoryURL))
- coda.append('Please cite: Creating re-usable tools from scripts: The Galaxy Tool Factory. Ross Lazarus; Antony Kaspi; Mark Ziemann; The Galaxy Team. ')
- coda.append('Bioinformatics 2012; doi: 10.1093/bioinformatics/bts573')
- xdict['help'] = '%s\n%s' % (xdict['help'],'\n'.join(coda))
- if self.opts.tool_desc:
- xdict['tooldesc'] = '%s' % self.opts.tool_desc
- else:
- xdict['tooldesc'] = ''
- xdict['command_outputs'] = ''
- xdict['outputs'] = ''
- if self.opts.input_tab <> 'None':
- xdict['command_inputs'] = '--input_tab "$input1" ' # the space may matter a lot if we append something
- xdict['inputs'] = ' \n' % self.inputFormats
- else:
- xdict['command_inputs'] = '' # assume no input - eg a random data generator
- xdict['inputs'] = ''
- xdict['inputs'] += ' \n' % self.toolname
- xdict['toolname'] = self.toolname
- xdict['toolid'] = self.toolid
- xdict['interpreter'] = self.opts.interpreter
- xdict['scriptname'] = self.sfile
- if self.opts.make_HTML:
- xdict['command_outputs'] += ' --output_dir "$html_file.files_path" --output_html "$html_file" --make_HTML "yes" '
- xdict['outputs'] += ' \n'
- if self.opts.output_tab <> 'None':
- xdict['command_outputs'] += ' --output_tab "$tab_file"'
- xdict['outputs'] += ' \n' % self.outFormats
- xdict['command'] = newCommand % xdict
- xmls = newXML % xdict
- xf = open(self.xmlfile,'w')
- xf.write(xmls)
- xf.write('\n')
- xf.close()
- # ready for the tarball
-
-
- def makeTooltar(self):
- """
- a tool is a gz tarball with eg
- /toolname/tool.xml /toolname/tool.py /toolname/test-data/test1_in.foo ...
- """
- retval = self.run()
- if retval:
- print >> sys.stderr,'## Run failed. Cannot build yet. Please fix and retry'
- sys.exit(1)
- self.makeXML()
- tdir = self.toolname
- os.mkdir(tdir)
- if self.opts.input_tab <> 'None': # no reproducible test otherwise? TODO: maybe..
- testdir = os.path.join(tdir,'test-data')
- os.mkdir(testdir) # make tests directory
- shutil.copyfile(self.opts.input_tab,os.path.join(testdir,self.test1Input))
- if self.opts.output_tab <> 'None':
- shutil.copyfile(self.opts.output_tab,os.path.join(testdir,self.test1Output))
- if self.opts.make_HTML:
- shutil.copyfile(self.opts.output_html,os.path.join(testdir,self.test1HTML))
- if self.opts.output_dir:
- shutil.copyfile(self.tlog,os.path.join(testdir,'test1_out.log'))
- op = '%s.py' % self.toolname # new name
- outpiname = os.path.join(tdir,op) # path for the tool tarball
- pyin = os.path.basename(self.pyfile) # our name - we rewrite ourselves (TM)
- notes = ['# %s - a self annotated version of %s generated by running %s\n' % (op,pyin,pyin),]
- notes.append('# to make a new Galaxy tool called %s\n' % self.toolname)
- notes.append('# User %s at %s\n' % (self.opts.user_email,timenow()))
- pi = open(self.pyfile,'r').readlines() # our code becomes new tool wrapper (!) - first Galaxy worm
- notes += pi
- outpi = open(outpiname,'w')
- outpi.write(''.join(notes))
- outpi.write('\n')
- outpi.close()
- stname = os.path.join(tdir,self.sfile)
- if not os.path.exists(stname):
- shutil.copyfile(self.sfile, stname)
- xtname = os.path.join(tdir,self.xmlfile)
- if not os.path.exists(xtname):
- shutil.copyfile(self.xmlfile,xtname)
- tarpath = "%s.gz" % self.toolname
- tar = tarfile.open(tarpath, "w:gz")
- tar.add(tdir,arcname=self.toolname)
- tar.close()
- shutil.copyfile(tarpath,self.opts.new_tool)
- shutil.rmtree(tdir)
- ## TODO: replace with optional direct upload to local toolshed?
- return retval
-
-
- def compressPDF(self,inpdf=None,thumbformat='png'):
- """need absolute path to pdf
- note that GS gets confoozled if no $TMP or $TEMP
- so we set it
- """
- assert os.path.isfile(inpdf), "## Input %s supplied to %s compressPDF not found" % (inpdf,self.myName)
- our_env = os.environ.copy()
- if not (our_env.get('TMP',None) or our_env.get('TEMP',None)):
- our_env['TMP'] = '/tmp'
- if not self.temp_warned:
- print >> sys.stdout,'## WARNING - no $TMP or $TEMP!!! Please fix - using /tmp temporarily'
- self.temp_warned = True
- hlog = os.path.join(self.opts.output_dir,"compress_%s.txt" % os.path.basename(inpdf))
- sto = open(hlog,'w')
- outpdf = '%s_compressed' % inpdf
- cl = ["gs", "-sDEVICE=pdfwrite", "-dNOPAUSE", "-dBATCH","-dPDFSETTINGS=/printer", "-sOutputFile=%s" % outpdf,inpdf]
- x = subprocess.Popen(cl,stdout=sto,stderr=sto,cwd=self.opts.output_dir,env=our_env)
- retval1 = x.wait()
- sto.close()
- if retval1 == 0:
- os.unlink(inpdf)
- shutil.move(outpdf,inpdf)
- os.unlink(hlog)
- else:
- x = open(hlog,'r').readlines()
- print >> sys.stdout,x
- hlog = os.path.join(self.opts.output_dir,"thumbnail_%s.txt" % os.path.basename(inpdf))
- sto = open(hlog,'w')
- outpng = '%s.%s' % (os.path.splitext(inpdf)[0],thumbformat)
- if self.useGM:
- cl2 = ['gm', 'convert', inpdf, outpng]
- else: # assume imagemagick
- cl2 = ['convert', inpdf, outpng]
- x = subprocess.Popen(cl2,stdout=sto,stderr=sto,cwd=self.opts.output_dir,env=our_env)
- retval2 = x.wait()
- sto.close()
- if retval2 <> 0:
- x = open(hlog,'r').readlines()
- print >> sys.stdout,x
- else:
- os.unlink(hlog)
- retval = retval1 or retval2
- return retval
-
-
- def getfSize(self,fpath,outpath):
- """
- format a nice file size string
- """
- size = ''
- fp = os.path.join(outpath,fpath)
- if os.path.isfile(fp):
- size = '0 B'
- n = float(os.path.getsize(fp))
- if n > 2**20:
- size = '%1.1f MB' % (n/2**20)
- elif n > 2**10:
- size = '%1.1f KB' % (n/2**10)
- elif n > 0:
- size = '%d B' % (int(n))
- return size
-
- def makeHtml(self):
- """ Create an HTML file content to list all the artifacts found in the output_dir
- """
-
- galhtmlprefix = """
-
-
\n"""
-
- flist = os.listdir(self.opts.output_dir)
- flist = [x for x in flist if x <> 'Rplots.pdf']
- flist.sort()
- html = []
- html.append(galhtmlprefix % progname)
- html.append('
Galaxy Tool "%s" run at %s
' % (self.toolname,timenow()))
- fhtml = []
- if len(flist) > 0:
- logfiles = [x for x in flist if x.lower().endswith('.log')] # log file names determine sections
- logfiles.sort()
- logfiles = [x for x in logfiles if os.path.abspath(x) <> os.path.abspath(self.tlog)]
- logfiles.append(os.path.abspath(self.tlog)) # make it the last one
- pdflist = []
- npdf = len([x for x in flist if os.path.splitext(x)[-1].lower() == '.pdf'])
- for rownum,fname in enumerate(flist):
- dname,e = os.path.splitext(fname)
- sfsize = self.getfSize(fname,self.opts.output_dir)
- if e.lower() == '.pdf' : # compress and make a thumbnail
- thumb = '%s.%s' % (dname,self.thumbformat)
- pdff = os.path.join(self.opts.output_dir,fname)
- retval = self.compressPDF(inpdf=pdff,thumbformat=self.thumbformat)
- if retval == 0:
- pdflist.append((fname,thumb))
- else:
- pdflist.append((fname,fname))
- if (rownum+1) % 2 == 0:
- fhtml.append('
' % (fname,fname,sfsize))
- for logfname in logfiles: # expect at least tlog - if more
- if os.path.abspath(logfname) == os.path.abspath(self.tlog): # handled later
- sectionname = 'All tool run'
- if (len(logfiles) > 1):
- sectionname = 'Other'
- ourpdfs = pdflist
- else:
- realname = os.path.basename(logfname)
- sectionname = os.path.splitext(realname)[0].split('_')[0] # break in case _ added to log
- ourpdfs = [x for x in pdflist if os.path.basename(x[0]).split('_')[0] == sectionname]
- pdflist = [x for x in pdflist if os.path.basename(x[0]).split('_')[0] <> sectionname] # remove
- nacross = 1
- npdf = len(ourpdfs)
-
- if npdf > 0:
- nacross = math.sqrt(npdf) ## int(round(math.log(npdf,2)))
- if int(nacross)**2 != npdf:
- nacross += 1
- nacross = int(nacross)
- width = min(400,int(1200/nacross))
- html.append('
%s images and outputs
' % sectionname)
- html.append('(Click on a thumbnail image to download the corresponding original PDF image) ')
- ntogo = nacross # counter for table row padding with empty cells
- html.append('
\n
')
- for i,paths in enumerate(ourpdfs):
- fname,thumb = paths
- s= """
\n""" % (fname,thumb,fname,width,fname)
- if ((i+1) % nacross == 0):
- s += '
\n'
- ntogo = 0
- if i < (npdf - 1): # more to come
- s += '
\n')
- else:
- if ntogo > 0: # pad
- html.append('
'*ntogo)
- html.append('\n')
- logt = open(logfname,'r').readlines()
- logtext = [x for x in logt if x.strip() > '']
- html.append('
%s log output
' % sectionname)
- if len(logtext) > 1:
- html.append('\n
\n')
- html += logtext
- html.append('\n
\n')
- else:
- html.append('%s is empty ' % logfname)
- if len(fhtml) > 0:
- fhtml.insert(0,'
Output File Name (click to view)
Size
\n')
- fhtml.append('
')
- html.append('
All output files available for downloading
\n')
- html += fhtml # add all non-pdf files to the end of the display
- else:
- html.append('
### Error - %s returned no files - please confirm that parameters are sane
' % self.opts.interpreter)
- html.append(galhtmlpostfix)
- htmlf = file(self.opts.output_html,'w')
- htmlf.write('\n'.join(html))
- htmlf.write('\n')
- htmlf.close()
- self.html = html
-
-
- def run(self):
- """
- scripts must be small enough not to fill the pipe!
- """
- if self.treatbashSpecial and self.opts.interpreter in ['bash','sh']:
- retval = self.runBash()
- else:
- if self.opts.output_dir:
- ste = open(self.elog,'w')
- sto = open(self.tlog,'w')
- sto.write('## Toolfactory generated command line = %s\n' % ' '.join(self.cl))
- sto.flush()
- p = subprocess.Popen(self.cl,shell=False,stdout=sto,stderr=ste,stdin=subprocess.PIPE,cwd=self.opts.output_dir)
- else:
- p = subprocess.Popen(self.cl,shell=False,stdin=subprocess.PIPE)
- p.stdin.write(self.script)
- p.stdin.close()
- retval = p.wait()
- if self.opts.output_dir:
- sto.close()
- ste.close()
- err = open(self.elog,'r').readlines()
- if retval <> 0 and err: # problem
- print >> sys.stderr,err
- if self.opts.make_HTML:
- self.makeHtml()
- return retval
-
- def runBash(self):
- """
- cannot use - for bash so use self.sfile
- """
- if self.opts.output_dir:
- s = '## Toolfactory generated command line = %s\n' % ' '.join(self.cl)
- sto = open(self.tlog,'w')
- sto.write(s)
- sto.flush()
- p = subprocess.Popen(self.cl,shell=False,stdout=sto,stderr=sto,cwd=self.opts.output_dir)
- else:
- p = subprocess.Popen(self.cl,shell=False)
- retval = p.wait()
- if self.opts.output_dir:
- sto.close()
- if self.opts.make_HTML:
- self.makeHtml()
- return retval
-
-
-def main():
- u = """
- This is a Galaxy wrapper. It expects to be called by a special purpose tool.xml as:
- rgBaseScriptWrapper.py --script_path "$scriptPath" --tool_name "foo" --interpreter "Rscript"
-
- """
- op = optparse.OptionParser()
- a = op.add_option
- a('--script_path',default=None)
- a('--tool_name',default=None)
- a('--interpreter',default=None)
- a('--output_dir',default=None)
- a('--output_html',default=None)
- a('--input_tab',default="None")
- a('--output_tab',default="None")
- a('--user_email',default='Unknown')
- a('--bad_user',default=None)
- a('--make_Tool',default=None)
- a('--make_HTML',default=None)
- a('--help_text',default=None)
- a('--tool_desc',default=None)
- a('--new_tool',default=None)
- a('--tool_version',default=None)
- opts, args = op.parse_args()
- assert not opts.bad_user,'UNAUTHORISED: %s is NOT authorized to use this tool until Galaxy admin adds %s to admin_users in universe_wsgi.ini' % (opts.bad_user,opts.bad_user)
- assert opts.tool_name,'## Tool Factory expects a tool name - eg --tool_name=DESeq'
- assert opts.interpreter,'## Tool Factory wrapper expects an interpreter - eg --interpreter=Rscript'
- assert os.path.isfile(opts.script_path),'## Tool Factory wrapper expects a script path - eg --script_path=foo.R'
- if opts.output_dir:
- try:
- os.makedirs(opts.output_dir)
- except:
- pass
- r = ScriptRunner(opts)
- if opts.make_Tool:
- retcode = r.makeTooltar()
- else:
- retcode = r.run()
- os.unlink(r.sfile)
- if retcode:
- sys.exit(retcode) # indicate failure to job runner
-
-
-if __name__ == "__main__":
- main()
-
-
diff -r bdb19fdbd679 -r 6d9d2c9679ec rgToolFactory.py~
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rgToolFactory.py~ Mon Dec 23 16:42:58 2013 -0500
@@ -0,0 +1,634 @@
+# rgToolFactory.py
+# see https://bitbucket.org/fubar/galaxytoolfactory/wiki/Home
+#
+# copyright ross lazarus (ross stop lazarus at gmail stop com) May 2012
+#
+# all rights reserved
+# Licensed under the LGPL
+# suggestions for improvement and bug fixes welcome at https://bitbucket.org/fubar/galaxytoolfactory/wiki/Home
+#
+# august 2013
+# found a problem with GS if $TMP or $TEMP missing - now inject /tmp and warn
+#
+# july 2013
+# added ability to combine images and individual log files into html output
+# Needs proper yaml control file but for now for each section you want separated out in the
+# HTML page, you need to use a name like foo or bar or header - they will be presented in alphabetic order
+#
+# Iff there's a log file foo.log and it will be layed out as preformated text in the HTML output
+# together with all images named like "foo_*.pdf in a separate "foo" section of the document
+# otherwise old format for html
+#
+# January 2013
+# problem pointed out by Carlos Borroto
+# added escaping for <>$ - thought I did that ages ago...
+#
+# August 11 2012
+# changed to use shell=False and cl as a sequence
+
+# This is a Galaxy tool factory for simple scripts in python, R or whatever ails ye.
+# It also serves as the wrapper for the new tool.
+#
+# you paste and run your script
+# Only works for simple scripts that read one input from the history.
+# Optionally can write one new history dataset,
+# and optionally collect any number of outputs into links on an autogenerated HTML page.
+
+# DO NOT install on a public or important site - please.
+
+# installed generated tools are fine if the script is safe.
+# They just run normally and their user cannot do anything unusually insecure
+# but please, practice safe toolshed.
+# Read the fucking code before you install any tool
+# especially this one
+
+# After you get the script working on some test data, you can
+# optionally generate a toolshed compatible gzip file
+# containing your script safely wrapped as an ordinary Galaxy script in your local toolshed for
+# safe and largely automated installation in a production Galaxy.
+
+# If you opt for an HTML output, you get all the script outputs arranged
+# as a single Html history item - all output files are linked, thumbnails for all the pdfs.
+# Ugly but really inexpensive.
+#
+# Patches appreciated please.
+#
+#
+# long route to June 2012 product
+# Behold the awesome power of Galaxy and the toolshed with the tool factory to bind them
+# derived from an integrated script model
+# called rgBaseScriptWrapper.py
+# Note to the unwary:
+# This tool allows arbitrary scripting on your Galaxy as the Galaxy user
+# There is nothing stopping a malicious user doing whatever they choose
+# Extremely dangerous!!
+# Totally insecure. So, trusted users only
+#
+# preferred model is a developer using their throw away workstation instance - ie a private site.
+# no real risk. The universe_wsgi.ini admin_users string is checked - only admin users are permitted to run this tool.
+#
+
+import sys
+import shutil
+import subprocess
+import os
+import time
+import tempfile
+import optparse
+import tarfile
+import re
+import shutil
+import math
+
+progname = os.path.split(sys.argv[0])[1]
+myversion = 'V000.2 June 2012'
+verbose = False
+debug = False
+toolFactoryURL = 'https://bitbucket.org/fubar/galaxytoolfactory'
+
+def timenow():
+ """return current time as a string
+ """
+ return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))
+
+html_escape_table = {
+ "&": "&",
+ ">": ">",
+ "<": "<",
+ "$": "\$"
+ }
+
+def html_escape(text):
+ """Produce entities within text."""
+ return "".join(html_escape_table.get(c,c) for c in text)
+
+def cmd_exists(cmd):
+ return subprocess.call("type " + cmd, shell=True,
+ stdout=subprocess.PIPE, stderr=subprocess.PIPE) == 0
+
+
+class ScriptRunner:
+ """class is a wrapper for an arbitrary script
+ """
+
+ def __init__(self,opts=None,treatbashSpecial=True):
+ """
+ cleanup inputs, setup some outputs
+
+ """
+ self.useGM = cmd_exists('gm')
+ self.useIM = cmd_exists('convert')
+ self.useGS = cmd_exists('gs')
+ self.temp_warned = False # we want only one warning if $TMP not set
+ self.treatbashSpecial = treatbashSpecial
+ if opts.output_dir: # simplify for the tool tarball
+ os.chdir(opts.output_dir)
+ self.thumbformat = 'png'
+ self.opts = opts
+ self.toolname = re.sub('[^a-zA-Z0-9_]+', '', opts.tool_name) # a sanitizer now does this but..
+ self.toolid = self.toolname
+ self.myname = sys.argv[0] # get our name because we write ourselves out as a tool later
+ self.pyfile = self.myname # crude but efficient - the cruft won't hurt much
+ self.xmlfile = '%s.xml' % self.toolname
+ s = open(self.opts.script_path,'r').readlines()
+ s = [x.rstrip() for x in s] # remove pesky dos line endings if needed
+ self.script = '\n'.join(s)
+ fhandle,self.sfile = tempfile.mkstemp(prefix=self.toolname,suffix=".%s" % (opts.interpreter))
+ tscript = open(self.sfile,'w') # use self.sfile as script source for Popen
+ tscript.write(self.script)
+ tscript.close()
+ self.indentedScript = '\n'.join([' %s' % x for x in s]) # for restructured text in help
+ self.escapedScript = '\n'.join([html_escape(x) for x in s])
+ self.elog = os.path.join(self.opts.output_dir,"%s_error.log" % self.toolname)
+ if opts.output_dir: # may not want these complexities
+ self.tlog = os.path.join(self.opts.output_dir,"%s_runner.log" % self.toolname)
+ art = '%s.%s' % (self.toolname,opts.interpreter)
+ artpath = os.path.join(self.opts.output_dir,art) # need full path
+ artifact = open(artpath,'w') # use self.sfile as script source for Popen
+ artifact.write(self.script)
+ artifact.close()
+ self.cl = []
+ self.html = []
+ a = self.cl.append
+ a(opts.interpreter)
+ if self.treatbashSpecial and opts.interpreter in ['bash','sh']:
+ a(self.sfile)
+ else:
+ a('-') # stdin
+ a(opts.input_tab)
+ a(opts.output_tab)
+ self.outFormats = 'tabular' # TODO make this an option at tool generation time
+ self.inputFormats = 'tabular' # TODO make this an option at tool generation time
+ self.test1Input = '%s_test1_input.xls' % self.toolname
+ self.test1Output = '%s_test1_output.xls' % self.toolname
+ self.test1HTML = '%s_test1_output.html' % self.toolname
+
+ def makeXML(self):
+ """
+ Create a Galaxy xml tool wrapper for the new script as a string to write out
+ fixme - use templating or something less fugly than this example of what we produce
+
+
+ a tabular file
+
+ reverse.py --script_path "$runMe" --interpreter "python"
+ --tool_name "reverse" --input_tab "$input1" --output_tab "$tab_file"
+
+
+
+
+
+
+
+
+
+
+
+**What it Does**
+
+Reverse the columns in a tabular file
+
+
+
+
+
+# reverse order of columns in a tabular file
+import sys
+inp = sys.argv[1]
+outp = sys.argv[2]
+i = open(inp,'r')
+o = open(outp,'w')
+for row in i:
+ rs = row.rstrip().split('\t')
+ rs.reverse()
+ o.write('\t'.join(rs))
+ o.write('\n')
+i.close()
+o.close()
+
+
+
+
+
+
+ """
+ newXML="""
+ %(tooldesc)s
+ %(command)s
+
+ %(inputs)s
+
+
+ %(outputs)s
+
+
+
+ %(script)s
+
+
+ %(tooltests)s
+
+ %(help)s
+
+ """ # needs a dict with toolname, toolid, interpreter, scriptname, command, inputs as a multi line string ready to write, outputs ditto, help ditto
+
+ newCommand="""
+ %(toolname)s.py --script_path "$runMe" --interpreter "%(interpreter)s"
+ --tool_name "%(toolname)s" %(command_inputs)s %(command_outputs)s
+ """ # may NOT be an input or htmlout
+ tooltestsTabOnly = """
+
+
+
+
+ """
+ tooltestsHTMLOnly = """
+
+
+
+
+ """
+ tooltestsBoth = """
+
+
+
+
+
+ """
+ xdict = {}
+ xdict['tool_version'] = self.opts.tool_version
+ xdict['test1Input'] = self.test1Input
+ xdict['test1HTML'] = self.test1HTML
+ xdict['test1Output'] = self.test1Output
+ if self.opts.make_HTML and self.opts.output_tab <> 'None':
+ xdict['tooltests'] = tooltestsBoth % xdict
+ elif self.opts.make_HTML:
+ xdict['tooltests'] = tooltestsHTMLOnly % xdict
+ else:
+ xdict['tooltests'] = tooltestsTabOnly % xdict
+ xdict['script'] = self.escapedScript
+ # configfile is least painful way to embed script to avoid external dependencies
+ # but requires escaping of <, > and $ to avoid Mako parsing
+ if self.opts.help_text:
+ xdict['help'] = open(self.opts.help_text,'r').read()
+ else:
+ xdict['help'] = 'Please ask the tool author for help as none was supplied at tool generation'
+ coda = ['**Script**','Pressing execute will run the following code over your input file and generate some outputs in your history::']
+ coda.append(self.indentedScript)
+ coda.append('**Attribution** This Galaxy tool was created by %s at %s\nusing the Galaxy Tool Factory.' % (self.opts.user_email,timenow()))
+ coda.append('See %s for details of that project' % (toolFactoryURL))
+ coda.append('Please cite: Creating re-usable tools from scripts: The Galaxy Tool Factory. Ross Lazarus; Antony Kaspi; Mark Ziemann; The Galaxy Team. ')
+ coda.append('Bioinformatics 2012; doi: 10.1093/bioinformatics/bts573')
+ xdict['help'] = '%s\n%s' % (xdict['help'],'\n'.join(coda))
+ if self.opts.tool_desc:
+ xdict['tooldesc'] = '%s' % self.opts.tool_desc
+ else:
+ xdict['tooldesc'] = ''
+ xdict['command_outputs'] = ''
+ xdict['outputs'] = ''
+ if self.opts.input_tab <> 'None':
+ xdict['command_inputs'] = '--input_tab "$input1" ' # the space may matter a lot if we append something
+ xdict['inputs'] = ' \n' % self.inputFormats
+ else:
+ xdict['command_inputs'] = '' # assume no input - eg a random data generator
+ xdict['inputs'] = ''
+ xdict['inputs'] += ' \n' % self.toolname
+ xdict['toolname'] = self.toolname
+ xdict['toolid'] = self.toolid
+ xdict['interpreter'] = self.opts.interpreter
+ xdict['scriptname'] = self.sfile
+ if self.opts.make_HTML:
+ xdict['command_outputs'] += ' --output_dir "$html_file.files_path" --output_html "$html_file" --make_HTML "yes" '
+ xdict['outputs'] += ' \n'
+ if self.opts.output_tab <> 'None':
+ xdict['command_outputs'] += ' --output_tab "$tab_file"'
+ xdict['outputs'] += ' \n' % self.outFormats
+ xdict['command'] = newCommand % xdict
+ xmls = newXML % xdict
+ xf = open(self.xmlfile,'w')
+ xf.write(xmls)
+ xf.write('\n')
+ xf.close()
+ # ready for the tarball
+
+
+ def makeTooltar(self):
+ """
+ a tool is a gz tarball with eg
+ /toolname/tool.xml /toolname/tool.py /toolname/test-data/test1_in.foo ...
+ """
+ retval = self.run()
+ if retval:
+ print >> sys.stderr,'## Run failed. Cannot build yet. Please fix and retry'
+ sys.exit(1)
+ self.makeXML()
+ tdir = self.toolname
+ os.mkdir(tdir)
+ if self.opts.input_tab <> 'None': # no reproducible test otherwise? TODO: maybe..
+ testdir = os.path.join(tdir,'test-data')
+ os.mkdir(testdir) # make tests directory
+ shutil.copyfile(self.opts.input_tab,os.path.join(testdir,self.test1Input))
+ if self.opts.output_tab <> 'None':
+ shutil.copyfile(self.opts.output_tab,os.path.join(testdir,self.test1Output))
+ if self.opts.make_HTML:
+ shutil.copyfile(self.opts.output_html,os.path.join(testdir,self.test1HTML))
+ if self.opts.output_dir:
+ shutil.copyfile(self.tlog,os.path.join(testdir,'test1_out.log'))
+ op = '%s.py' % self.toolname # new name
+ outpiname = os.path.join(tdir,op) # path for the tool tarball
+ pyin = os.path.basename(self.pyfile) # our name - we rewrite ourselves (TM)
+ notes = ['# %s - a self annotated version of %s generated by running %s\n' % (op,pyin,pyin),]
+ notes.append('# to make a new Galaxy tool called %s\n' % self.toolname)
+ notes.append('# User %s at %s\n' % (self.opts.user_email,timenow()))
+ pi = open(self.pyfile,'r').readlines() # our code becomes new tool wrapper (!) - first Galaxy worm
+ notes += pi
+ outpi = open(outpiname,'w')
+ outpi.write(''.join(notes))
+ outpi.write('\n')
+ outpi.close()
+ stname = os.path.join(tdir,self.sfile)
+ if not os.path.exists(stname):
+ shutil.copyfile(self.sfile, stname)
+ xtname = os.path.join(tdir,self.xmlfile)
+ if not os.path.exists(xtname):
+ shutil.copyfile(self.xmlfile,xtname)
+ tarpath = "%s.gz" % self.toolname
+ tar = tarfile.open(tarpath, "w:gz")
+ tar.add(tdir,arcname=self.toolname)
+ tar.close()
+ shutil.copyfile(tarpath,self.opts.new_tool)
+ shutil.rmtree(tdir)
+ ## TODO: replace with optional direct upload to local toolshed?
+ return retval
+
+
+ def compressPDF(self,inpdf=None,thumbformat='png'):
+ """need absolute path to pdf
+ note that GS gets confoozled if no $TMP or $TEMP
+ so we set it
+ """
+ assert os.path.isfile(inpdf), "## Input %s supplied to %s compressPDF not found" % (inpdf,self.myName)
+ our_env = os.environ.copy()
+ if not (our_env.get('TMP',None) or our_env.get('TEMP',None)):
+ our_env['TMP'] = '/tmp'
+ if not self.temp_warned:
+ print >> sys.stdout,'## WARNING - no $TMP or $TEMP!!! Please fix - using /tmp temporarily'
+ self.temp_warned = True
+ hlog = os.path.join(self.opts.output_dir,"compress_%s.txt" % os.path.basename(inpdf))
+ sto = open(hlog,'w')
+ outpdf = '%s_compressed' % inpdf
+ cl = ["gs", "-sDEVICE=pdfwrite", "-dNOPAUSE", "-dBATCH","-dPDFSETTINGS=/printer", "-sOutputFile=%s" % outpdf,inpdf]
+ x = subprocess.Popen(cl,stdout=sto,stderr=sto,cwd=self.opts.output_dir,env=our_env)
+ retval1 = x.wait()
+ sto.close()
+ if retval1 == 0:
+ os.unlink(inpdf)
+ shutil.move(outpdf,inpdf)
+ os.unlink(hlog)
+ else:
+ x = open(hlog,'r').readlines()
+ print >> sys.stdout,x
+ hlog = os.path.join(self.opts.output_dir,"thumbnail_%s.txt" % os.path.basename(inpdf))
+ sto = open(hlog,'w')
+ outpng = '%s.%s' % (os.path.splitext(inpdf)[0],thumbformat)
+ if self.useGM:
+ cl2 = ['gm', 'convert', inpdf, outpng]
+ else: # assume imagemagick
+ cl2 = ['convert', inpdf, outpng]
+ x = subprocess.Popen(cl2,stdout=sto,stderr=sto,cwd=self.opts.output_dir,env=our_env)
+ retval2 = x.wait()
+ sto.close()
+ if retval2 <> 0:
+ x = open(hlog,'r').readlines()
+ print >> sys.stdout,x
+ else:
+ os.unlink(hlog)
+ retval = retval1 or retval2
+ return retval
+
+
+ def getfSize(self,fpath,outpath):
+ """
+ format a nice file size string
+ """
+ size = ''
+ fp = os.path.join(outpath,fpath)
+ if os.path.isfile(fp):
+ size = '0 B'
+ n = float(os.path.getsize(fp))
+ if n > 2**20:
+ size = '%1.1f MB' % (n/2**20)
+ elif n > 2**10:
+ size = '%1.1f KB' % (n/2**10)
+ elif n > 0:
+ size = '%d B' % (int(n))
+ return size
+
+ def makeHtml(self):
+ """ Create an HTML file content to list all the artifacts found in the output_dir
+ """
+
+ galhtmlprefix = """
+
+
+
+
+
+
+
+
\n"""
+
+ flist = os.listdir(self.opts.output_dir)
+ flist = [x for x in flist if x <> 'Rplots.pdf']
+ flist.sort()
+ html = []
+ html.append(galhtmlprefix % progname)
+ html.append('
Galaxy Tool "%s" run at %s
' % (self.toolname,timenow()))
+ fhtml = []
+ if len(flist) > 0:
+ logfiles = [x for x in flist if x.lower().endswith('.log')] # log file names determine sections
+ logfiles.sort()
+ logfiles = [x for x in logfiles if os.path.abspath(x) <> os.path.abspath(self.tlog)]
+ logfiles.append(os.path.abspath(self.tlog)) # make it the last one
+ pdflist = []
+ npdf = len([x for x in flist if os.path.splitext(x)[-1].lower() == '.pdf'])
+ for rownum,fname in enumerate(flist):
+ dname,e = os.path.splitext(fname)
+ sfsize = self.getfSize(fname,self.opts.output_dir)
+ if e.lower() == '.pdf' : # compress and make a thumbnail
+ thumb = '%s.%s' % (dname,self.thumbformat)
+ pdff = os.path.join(self.opts.output_dir,fname)
+ retval = self.compressPDF(inpdf=pdff,thumbformat=self.thumbformat)
+ if retval == 0:
+ pdflist.append((fname,thumb))
+ else:
+ pdflist.append((fname,fname))
+ if (rownum+1) % 2 == 0:
+ fhtml.append('
' % (fname,fname,sfsize))
+ for logfname in logfiles: # expect at least tlog - if more
+ if os.path.abspath(logfname) == os.path.abspath(self.tlog): # handled later
+ sectionname = 'All tool run'
+ if (len(logfiles) > 1):
+ sectionname = 'Other'
+ ourpdfs = pdflist
+ else:
+ realname = os.path.basename(logfname)
+ sectionname = os.path.splitext(realname)[0].split('_')[0] # break in case _ added to log
+ ourpdfs = [x for x in pdflist if os.path.basename(x[0]).split('_')[0] == sectionname]
+ pdflist = [x for x in pdflist if os.path.basename(x[0]).split('_')[0] <> sectionname] # remove
+ nacross = 1
+ npdf = len(ourpdfs)
+
+ if npdf > 0:
+ nacross = math.sqrt(npdf) ## int(round(math.log(npdf,2)))
+ if int(nacross)**2 != npdf:
+ nacross += 1
+ nacross = int(nacross)
+ width = min(400,int(1200/nacross))
+ html.append('
%s images and outputs
' % sectionname)
+ html.append('(Click on a thumbnail image to download the corresponding original PDF image) ')
+ ntogo = nacross # counter for table row padding with empty cells
+ html.append('
\n
')
+ for i,paths in enumerate(ourpdfs):
+ fname,thumb = paths
+ s= """
\n""" % (fname,thumb,fname,width,fname)
+ if ((i+1) % nacross == 0):
+ s += '
\n'
+ ntogo = 0
+ if i < (npdf - 1): # more to come
+ s += '
\n')
+ else:
+ if ntogo > 0: # pad
+ html.append('
'*ntogo)
+ html.append('\n')
+ logt = open(logfname,'r').readlines()
+ logtext = [x for x in logt if x.strip() > '']
+ html.append('
%s log output
' % sectionname)
+ if len(logtext) > 1:
+ html.append('\n
\n')
+ html += logtext
+ html.append('\n
\n')
+ else:
+ html.append('%s is empty ' % logfname)
+ if len(fhtml) > 0:
+ fhtml.insert(0,'
Output File Name (click to view)
Size
\n')
+ fhtml.append('
')
+ html.append('
All output files available for downloading
\n')
+ html += fhtml # add all non-pdf files to the end of the display
+ else:
+ html.append('
### Error - %s returned no files - please confirm that parameters are sane
' % self.opts.interpreter)
+ html.append(galhtmlpostfix)
+ htmlf = file(self.opts.output_html,'w')
+ htmlf.write('\n'.join(html))
+ htmlf.write('\n')
+ htmlf.close()
+ self.html = html
+
+
+ def run(self):
+ """
+ scripts must be small enough not to fill the pipe!
+ """
+ if self.treatbashSpecial and self.opts.interpreter in ['bash','sh']:
+ retval = self.runBash()
+ else:
+ if self.opts.output_dir:
+ ste = open(self.elog,'w')
+ sto = open(self.tlog,'w')
+ sto.write('## Toolfactory generated command line = %s\n' % ' '.join(self.cl))
+ sto.flush()
+ p = subprocess.Popen(self.cl,shell=False,stdout=sto,stderr=ste,stdin=subprocess.PIPE,cwd=self.opts.output_dir)
+ else:
+ p = subprocess.Popen(self.cl,shell=False,stdin=subprocess.PIPE)
+ p.stdin.write(self.script)
+ p.stdin.close()
+ retval = p.wait()
+ if self.opts.output_dir:
+ sto.close()
+ ste.close()
+ err = open(self.elog,'r').readlines()
+ if retval <> 0 and err: # problem
+ print >> sys.stderr,err
+ if self.opts.make_HTML:
+ self.makeHtml()
+ return retval
+
+ def runBash(self):
+ """
+ cannot use - for bash so use self.sfile
+ """
+ if self.opts.output_dir:
+ s = '## Toolfactory generated command line = %s\n' % ' '.join(self.cl)
+ sto = open(self.tlog,'w')
+ sto.write(s)
+ sto.flush()
+ p = subprocess.Popen(self.cl,shell=False,stdout=sto,stderr=sto,cwd=self.opts.output_dir)
+ else:
+ p = subprocess.Popen(self.cl,shell=False)
+ retval = p.wait()
+ if self.opts.output_dir:
+ sto.close()
+ if self.opts.make_HTML:
+ self.makeHtml()
+ return retval
+
+
+def main():
+ u = """
+ This is a Galaxy wrapper. It expects to be called by a special purpose tool.xml as:
+ rgBaseScriptWrapper.py --script_path "$scriptPath" --tool_name "foo" --interpreter "Rscript"
+
+ """
+ op = optparse.OptionParser()
+ a = op.add_option
+ a('--script_path',default=None)
+ a('--tool_name',default=None)
+ a('--interpreter',default=None)
+ a('--output_dir',default=None)
+ a('--output_html',default=None)
+ a('--input_tab',default="None")
+ a('--output_tab',default="None")
+ a('--user_email',default='Unknown')
+ a('--bad_user',default=None)
+ a('--make_Tool',default=None)
+ a('--make_HTML',default=None)
+ a('--help_text',default=None)
+ a('--tool_desc',default=None)
+ a('--new_tool',default=None)
+ a('--tool_version',default=None)
+ opts, args = op.parse_args()
+ assert not opts.bad_user,'UNAUTHORISED: %s is NOT authorized to use this tool until Galaxy admin adds %s to admin_users in universe_wsgi.ini' % (opts.bad_user,opts.bad_user)
+ assert opts.tool_name,'## Tool Factory expects a tool name - eg --tool_name=DESeq'
+ assert opts.interpreter,'## Tool Factory wrapper expects an interpreter - eg --interpreter=Rscript'
+ assert os.path.isfile(opts.script_path),'## Tool Factory wrapper expects a script path - eg --script_path=foo.R'
+ if opts.output_dir:
+ try:
+ os.makedirs(opts.output_dir)
+ except:
+ pass
+ r = ScriptRunner(opts)
+ if opts.make_Tool:
+ retcode = r.makeTooltar()
+ else:
+ retcode = r.run()
+ os.unlink(r.sfile)
+ if retcode:
+ sys.exit(retcode) # indicate failure to job runner
+
+
+if __name__ == "__main__":
+ main()
+
+
diff -r bdb19fdbd679 -r 6d9d2c9679ec rgedgeRpaired_nocamera.xml
--- a/rgedgeRpaired_nocamera.xml Sun Dec 22 06:03:27 2013 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,1095 +0,0 @@
-
- models using BioConductor packages
-
- r3
- graphicsmagick
- ghostscript
- biocbasics
-
-
-
- rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "DifferentialCounts"
- --output_dir "$html_file.files_path" --output_html "$html_file" --make_HTML "yes"
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- edgeR['doedgeR'] == "T"
-
-
- DESeq2['doDESeq2'] == "T"
-
-
- doVoom == "T"
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- nsamp) {
- dm =dm[1:nsamp,]
- #sub = paste('Showing',nsamp,'contigs ranked for evidence for differential abundance out of',nprobes,'total')
- }
- newcolnames = substr(colnames(dm),1,20)
- colnames(dm) = newcolnames
- pdf(outpdfname)
- heatmap.2(dm,main=myTitle,ColSideColors=pcols,col=topo.colors(100),dendrogram="col",key=T,density.info='none',
- Rowv=F,scale='row',trace='none',margins=c(8,8),cexRow=0.4,cexCol=0.5)
- dev.off()
-}
-
-hmap = function(cmat,nmeans=4,outpdfname="heatMap.pdf",nsamp=250,TName='Treatment',group=NA,myTitle="Title goes here")
-{
- # for 2 groups only was
- #col.map = function(g) {if (g==TName) "#FF0000" else "#0000FF"}
- #pcols = unlist(lapply(group,col.map))
- gu = unique(group)
- colours = rainbow(length(gu),start=0.3,end=0.6)
- pcols = colours[match(group,gu)]
- nrows = nrow(cmat)
- mtitle = paste(myTitle,'Heatmap: n contigs =',nrows)
- if (nrows > nsamp) {
- cmat = cmat[c(1:nsamp),]
- mtitle = paste('Heatmap: Top ',nsamp,' DE contigs (of ',nrows,')',sep='')
- }
- newcolnames = substr(colnames(cmat),1,20)
- colnames(cmat) = newcolnames
- pdf(outpdfname)
- heatmap(cmat,scale='row',main=mtitle,cexRow=0.3,cexCol=0.4,Rowv=NA,ColSideColors=pcols)
- dev.off()
-}
-
-qqPlot = function(descr='qqplot',pvector, outpdf='qqplot.pdf',...)
-# stolen from https://gist.github.com/703512
-{
- o = -log10(sort(pvector,decreasing=F))
- e = -log10( 1:length(o)/length(o) )
- o[o==-Inf] = reallysmall
- o[o==Inf] = reallybig
- maint = descr
- pdf(outpdf)
- plot(e,o,pch=19,cex=1, main=maint, ...,
- xlab=expression(Expected~~-log[10](italic(p))),
- ylab=expression(Observed~~-log[10](italic(p))),
- xlim=c(0,max(e)), ylim=c(0,max(o)))
- lines(e,e,col="red")
- grid(col = "lightgray", lty = "dotted")
- dev.off()
-}
-
-smearPlot = function(DGEList,deTags, outSmear, outMain)
- {
- pdf(outSmear)
- plotSmear(DGEList,de.tags=deTags,main=outMain)
- grid(col="lightgray", lty="dotted")
- dev.off()
- }
-
-boxPlot = function(rawrs,cleanrs,maint,myTitle,pdfname)
-{ #
- nc = ncol(rawrs)
- #### for (i in c(1:nc)) {rawrs[(rawrs[,i] < 0),i] = NA}
- fullnames = colnames(rawrs)
- newcolnames = substr(colnames(rawrs),1,20)
- colnames(rawrs) = newcolnames
- newcolnames = substr(colnames(cleanrs),1,20)
- colnames(cleanrs) = newcolnames
- defpar = par(no.readonly=T)
- print.noquote('raw contig counts by sample:')
- print.noquote(summary(rawrs))
- print.noquote('normalised contig counts by sample:')
- print.noquote(summary(cleanrs))
- pdf(pdfname)
- par(mfrow=c(1,2))
- boxplot(rawrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('Raw:',maint))
- grid(col="lightgray",lty="dotted")
- boxplot(cleanrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('After ',maint))
- grid(col="lightgray",lty="dotted")
- dev.off()
- pdfname = "sample_counts_histogram.pdf"
- nc = ncol(rawrs)
- print.noquote(paste('Using ncol rawrs=',nc))
- ncroot = round(sqrt(nc))
- if (ncroot*ncroot < nc) { ncroot = ncroot + 1 }
- m = c()
- for (i in c(1:nc)) {
- rhist = hist(rawrs[,i],breaks=100,plot=F)
- m = append(m,max(rhist\$counts))
- }
- ymax = max(m)
- ncols = length(fullnames)
- if (ncols > 20)
- {
- scale = 7*ncols/20
- pdf(pdfname,width=scale,height=scale)
- } else {
- pdf(pdfname)
- }
- par(mfrow=c(ncroot,ncroot))
- for (i in c(1:nc)) {
- hist(rawrs[,i], main=paste("Contig logcount",i), xlab='log raw count', col="maroon",
- breaks=100,sub=fullnames[i],cex=0.8,ylim=c(0,ymax))
- }
- dev.off()
- par(defpar)
-
-}
-
-cumPlot = function(rawrs,cleanrs,maint,myTitle)
-{ # updated to use ecdf
- pdfname = "Filtering_rowsum_bar_charts.pdf"
- defpar = par(no.readonly=T)
- lrs = log(rawrs,10)
- lim = max(lrs)
- pdf(pdfname)
- par(mfrow=c(2,1))
- hist(lrs,breaks=100,main=paste('Before:',maint),xlab="# Reads (log)",
- ylab="Count",col="maroon",sub=myTitle, xlim=c(0,lim),las=1)
- grid(col="lightgray", lty="dotted")
- lrs = log(cleanrs,10)
- hist(lrs,breaks=100,main=paste('After:',maint),xlab="# Reads (log)",
- ylab="Count",col="maroon",sub=myTitle,xlim=c(0,lim),las=1)
- grid(col="lightgray", lty="dotted")
- dev.off()
- par(defpar)
-}
-
-cumPlot1 = function(rawrs,cleanrs,maint,myTitle)
-{ # updated to use ecdf
- pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"RowsumCum.pdf",sep='_')
- pdf(pdfname)
- par(mfrow=c(2,1))
- lastx = max(rawrs)
- rawe = knots(ecdf(rawrs))
- cleane = knots(ecdf(cleanrs))
- cy = 1:length(cleane)/length(cleane)
- ry = 1:length(rawe)/length(rawe)
- plot(rawe,ry,type='l',main=paste('Before',maint),xlab="Log Contig Total Reads",
- ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle)
- grid(col="blue")
- plot(cleane,cy,type='l',main=paste('After',maint),xlab="Log Contig Total Reads",
- ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle)
- grid(col="blue")
- dev.off()
-}
-
-
-
-doGSEA = function(y=NULL,design=NULL,histgmt="",
- bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
- ntest=0, myTitle="myTitle", outfname="GSEA.xls", minnin=5, maxnin=2000,fdrthresh=0.05,fdrtype="BH")
-{
- sink('Camera.log')
- genesets = c()
- if (bigmt > "")
- {
- bigenesets = readLines(bigmt)
- genesets = bigenesets
- }
- if (histgmt > "")
- {
- hgenesets = readLines(histgmt)
- if (bigmt > "") {
- genesets = rbind(genesets,hgenesets)
- } else {
- genesets = hgenesets
- } # use only history if no bi
- }
- print.noquote(paste("@@@read",length(genesets), 'genesets from',histgmt,bigmt))
- genesets = strsplit(genesets,'\t') # tabular. genesetid\tURLorwhatever\tgene_1\t..\tgene_n
- outf = outfname
- head=paste(myTitle,'edgeR GSEA')
- write(head,file=outfname,append=F)
- ntest=length(genesets)
- urownames = toupper(rownames(y))
- upcam = c()
- downcam = c()
- for (i in 1:ntest) {
- gs = unlist(genesets[i])
- g = gs[1] # geneset_id
- u = gs[2]
- if (u > "") { u = paste("",u,"",sep="") }
- glist = gs[3:length(gs)] # member gene symbols
- glist = toupper(glist)
- inglist = urownames %in% glist
- nin = sum(inglist)
- if ((nin > minnin) && (nin < maxnin)) {
- ### print(paste('@@found',sum(inglist),'genes in glist'))
- camres = camera(y=y,index=inglist,design=design)
- if (! is.null(camres)) {
- rownames(camres) = g # gene set name
- camres = cbind(GeneSet=g,URL=u,camres)
- if (camres\$Direction == "Up")
- {
- upcam = rbind(upcam,camres) } else {
- downcam = rbind(downcam,camres)
- }
- }
- }
- }
- uscam = upcam[order(upcam\$PValue),]
- unadjp = uscam\$PValue
- uscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
- nup = max(10,sum((uscam\$adjPValue < fdrthresh)))
- dscam = downcam[order(downcam\$PValue),]
- unadjp = dscam\$PValue
- dscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
- ndown = max(10,sum((dscam\$adjPValue < fdrthresh)))
- write.table(uscam,file=paste('camera_up',outfname,sep='_'),quote=F,sep='\t',row.names=F)
- write.table(dscam,file=paste('camera_down',outfname,sep='_'),quote=F,sep='\t',row.names=F)
- print.noquote(paste('@@@@@ Camera up top',nup,'gene sets:'))
- write.table(head(uscam,nup),file="",quote=F,sep='\t',row.names=F)
- print.noquote(paste('@@@@@ Camera down top',ndown,'gene sets:'))
- write.table(head(dscam,ndown),file="",quote=F,sep='\t',row.names=F)
- sink()
-}
-
-
-
-
-doGSEAatonce = function(y=NULL,design=NULL,histgmt="",
- bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
- ntest=0, myTitle="myTitle", outfname="GSEA.xls", minnin=5, maxnin=2000,fdrthresh=0.05,fdrtype="BH")
-{
- sink('Camera.log')
- genesets = c()
- if (bigmt > "")
- {
- bigenesets = readLines(bigmt)
- genesets = bigenesets
- }
- if (histgmt > "")
- {
- hgenesets = readLines(histgmt)
- if (bigmt > "") {
- genesets = rbind(genesets,hgenesets)
- } else {
- genesets = hgenesets
- } # use only history if no bi
- }
- print.noquote(paste("@@@read",length(genesets), 'genesets from',histgmt,bigmt))
- genesets = strsplit(genesets,'\t') # tabular. genesetid\tURLorwhatever\tgene_1\t..\tgene_n
- outf = outfname
- head=paste(myTitle,'edgeR GSEA')
- write(head,file=outfname,append=F)
- ntest=length(genesets)
- urownames = toupper(rownames(y))
- upcam = c()
- downcam = c()
- incam = c()
- urls = c()
- gsids = c()
- for (i in 1:ntest) {
- gs = unlist(genesets[i])
- gsid = gs[1] # geneset_id
- url = gs[2]
- if (url > "") { url = paste("",url,"",sep="") }
- glist = gs[3:length(gs)] # member gene symbols
- glist = toupper(glist)
- inglist = urownames %in% glist
- nin = sum(inglist)
- if ((nin > minnin) && (nin < maxnin)) {
- incam = c(incam,inglist)
- gsids = c(gsids,gsid)
- urls = c(urls,url)
- }
- }
- incam = as.list(incam)
- names(incam) = gsids
- allcam = camera(y=y,index=incam,design=design)
- allcamres = cbind(geneset=gsids,allcam,URL=urls)
- for (i in 1:ntest) {
- camres = allcamres[i]
- res = try(test = (camres\$Direction == "Up"))
- if ("try-error" %in% class(res)) {
- cat("test failed, camres = :")
- print.noquote(camres)
- } else { if (camres\$Direction == "Up")
- { upcam = rbind(upcam,camres)
- } else { downcam = rbind(downcam,camres)
- }
-
- }
- }
- uscam = upcam[order(upcam\$PValue),]
- unadjp = uscam\$PValue
- uscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
- nup = max(10,sum((uscam\$adjPValue < fdrthresh)))
- dscam = downcam[order(downcam\$PValue),]
- unadjp = dscam\$PValue
- dscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
- ndown = max(10,sum((dscam\$adjPValue < fdrthresh)))
- write.table(uscam,file=paste('camera_up',outfname,sep='_'),quote=F,sep='\t',row.names=F)
- write.table(dscam,file=paste('camera_down',outfname,sep='_'),quote=F,sep='\t',row.names=F)
- print.noquote(paste('@@@@@ Camera up top',nup,'gene sets:'))
- write.table(head(uscam,nup),file="",quote=F,sep='\t',row.names=F)
- print.noquote(paste('@@@@@ Camera down top',ndown,'gene sets:'))
- write.table(head(dscam,ndown),file="",quote=F,sep='\t',row.names=F)
- sink()
- }
-
-
-edgeIt = function (Count_Matrix=c(),group=c(),out_edgeR=F,out_VOOM=F,out_DESeq2=F,fdrtype='fdr',priordf=5,
- fdrthresh=0.05,outputdir='.', myTitle='Differential Counts',libSize=c(),useNDF=F,
- filterquantile=0.2, subjects=c(),mydesign=NULL,
- doDESeq2=T,doVoom=T,doCamera=T,doedgeR=T,org='hg19',
- histgmt="", bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
- doCook=F,DESeq_fitType="parameteric")
-{
- # Error handling
- if (length(unique(group))!=2){
- print("Number of conditions identified in experiment does not equal 2")
- q()
- }
- require(edgeR)
- options(width = 512)
- mt = paste(unlist(strsplit(myTitle,'_')),collapse=" ")
- allN = nrow(Count_Matrix)
- nscut = round(ncol(Count_Matrix)/2)
- colTotmillionreads = colSums(Count_Matrix)/1e6
- counts.dataframe = as.data.frame(c())
- rawrs = rowSums(Count_Matrix)
- nonzerod = Count_Matrix[(rawrs > 0),] # remove all zero count genes
- nzN = nrow(nonzerod)
- nzrs = rowSums(nonzerod)
- zN = allN - nzN
- print('# Quantiles for non-zero row counts:',quote=F)
- print(quantile(nzrs,probs=seq(0,1,0.1)),quote=F)
- if (useNDF == T)
- {
- gt1rpin3 = rowSums(Count_Matrix/expandAsMatrix(colTotmillionreads,dim(Count_Matrix)) >= 1) >= nscut
- lo = colSums(Count_Matrix[!gt1rpin3,])
- workCM = Count_Matrix[gt1rpin3,]
- cleanrs = rowSums(workCM)
- cleanN = length(cleanrs)
- meth = paste( "After removing",length(lo),"contigs with fewer than ",nscut," sample read counts >= 1 per million, there are",sep="")
- print(paste("Read",allN,"contigs. Removed",zN,"contigs with no reads.",meth,cleanN,"contigs"),quote=F)
- maint = paste('Filter >=1/million reads in >=',nscut,'samples')
- } else {
- useme = (nzrs > quantile(nzrs,filterquantile))
- workCM = nonzerod[useme,]
- lo = colSums(nonzerod[!useme,])
- cleanrs = rowSums(workCM)
- cleanN = length(cleanrs)
- meth = paste("After filtering at count quantile =",filterquantile,", there are",sep="")
- print(paste('Read',allN,"contigs. Removed",zN,"with no reads.",meth,cleanN,"contigs"),quote=F)
- maint = paste('Filter below',filterquantile,'quantile')
- }
- cumPlot(rawrs=rawrs,cleanrs=cleanrs,maint=maint,myTitle=myTitle)
- allgenes = rownames(workCM)
- reg = "^chr([0-9]+):([0-9]+)-([0-9]+)"
- genecards=" 0.8) # is ucsc style string
- {
- print("@@ using ucsc substitution for urls")
- contigurls = paste0(ucsc,"&position=chr",testreg[,2],":",testreg[,3],"-",testreg[,4],"\'>",allgenes,"")
- } else {
- print.noquote("@@ using genecards substitution for urls")
- contigurls = paste0(genecards,allgenes,"\'>",allgenes,"")
- }
- print(paste("# Total low count contigs per sample = ",paste(lo,collapse=',')),quote=F)
- cmrowsums = rowSums(workCM)
- TName=unique(group)[1]
- CName=unique(group)[2]
- if (is.null(mydesign)) {
- if (length(subjects) == 0)
- {
- mydesign = model.matrix(~group)
- }
- else {
- subjf = factor(subjects)
- mydesign = model.matrix(~subjf+group) # we block on subject so make group last to simplify finding it
- }
- }
- print.noquote(paste('Using samples:',paste(colnames(workCM),collapse=',')))
- print.noquote('Using design matrix:')
- print.noquote(mydesign)
- if (doedgeR) {
- sink('edgeR.log')
- #### Setup DGEList object
- DGEList = DGEList(counts=workCM, group = group)
- DGEList = calcNormFactors(DGEList)
-
- DGEList = estimateGLMCommonDisp(DGEList,mydesign)
- comdisp = DGEList\$common.dispersion
- DGEList = estimateGLMTrendedDisp(DGEList,mydesign)
- if (edgeR_priordf > 0) {
- print.noquote(paste("prior.df =",edgeR_priordf))
- DGEList = estimateGLMTagwiseDisp(DGEList,mydesign,prior.df = edgeR_priordf)
- } else {
- DGEList = estimateGLMTagwiseDisp(DGEList,mydesign)
- }
- DGLM = glmFit(DGEList,design=mydesign)
- DE = glmLRT(DGLM,coef=ncol(DGLM\$design)) # always last one - subject is first if needed
- efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors
- normData = (1e+06*DGEList\$counts/efflib)
- uoutput = cbind(
- Name=as.character(rownames(DGEList\$counts)),
- DE\$table,
- adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
- Dispersion=DGEList\$tagwise.dispersion,totreads=cmrowsums,normData,
- DGEList\$counts
- )
- soutput = uoutput[order(DE\$table\$PValue),] # sorted into p value order - for quick toptable
- goodness = gof(DGLM, pcutoff=fdrthresh)
- if (sum(goodness\$outlier) > 0) {
- print.noquote('GLM outliers:')
- print(paste(rownames(DGLM)[(goodness\$outlier)],collapse=','),quote=F)
- } else {
- print('No GLM fit outlier genes found\n')
- }
- z = limma::zscoreGamma(goodness\$gof.statistic, shape=goodness\$df/2, scale=2)
- pdf("edgeR_GoodnessofFit.pdf")
- qq = qqnorm(z, panel.first=grid(), main="tagwise dispersion")
- abline(0,1,lwd=3)
- points(qq\$x[goodness\$outlier],qq\$y[goodness\$outlier], pch=16, col="maroon")
- dev.off()
- estpriorn = getPriorN(DGEList)
- print(paste("Common Dispersion =",comdisp,"CV = ",sqrt(comdisp),"getPriorN = ",estpriorn),quote=F)
- efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors
- normData = (1e+06*DGEList\$counts)/efflib
- lnormData = log(normData + 1e-6,10)
- uniqueg = unique(group)
- #### Plot MDS
- sample_colors = match(group,levels(group))
- sampleTypes = levels(factor(group))
- print.noquote(sampleTypes)
- pdf("edgeR_MDSplot.pdf")
- plotMDS.DGEList(DGEList,main=paste("edgeR MDS for",myTitle),cex=0.5,col=sample_colors,pch=sample_colors)
- legend(x="topleft", legend = sampleTypes,col=c(1:length(sampleTypes)), pch=19)
- grid(col="blue")
- dev.off()
- colnames(normData) = paste( colnames(normData),'N',sep="_")
- print(paste('Raw sample read totals',paste(colSums(nonzerod,na.rm=T),collapse=',')))
- nzd = data.frame(log(nonzerod + 1e-2,10))
- try( boxPlot(rawrs=nzd,cleanrs=lnormData,maint='TMM Normalisation',myTitle=myTitle,pdfname="edgeR_raw_norm_counts_box.pdf") )
- write.table(soutput,file=out_edgeR, quote=FALSE, sep="\t",row.names=F)
- tt = cbind(
- Name=as.character(rownames(DGEList\$counts)),
- DE\$table,
- adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
- Dispersion=DGEList\$tagwise.dispersion,totreads=cmrowsums
- )
- print.noquote("# edgeR Top tags\n")
- tt = cbind(tt,URL=contigurls) # add to end so table isn't laid out strangely
- tt = tt[order(DE\$table\$PValue),]
- print.noquote(tt[1:50,])
- deTags = rownames(uoutput[uoutput\$adj.p.value < fdrthresh,])
- nsig = length(deTags)
- print(paste('#',nsig,'tags significant at adj p=',fdrthresh),quote=F)
- deColours = ifelse(deTags,'red','black')
- pdf("edgeR_BCV_vs_abundance.pdf")
- plotBCV(DGEList, cex=0.3, main="Biological CV vs abundance",col.tagwise=deColours)
- dev.off()
- dg = DGEList[order(DE\$table\$PValue),]
- #normData = (1e+06 * dg\$counts/expandAsMatrix(dg\$samples\$lib.size, dim(dg)))
- efflib = dg\$samples\$lib.size*dg\$samples\$norm.factors
- normData = (1e+06*dg\$counts/efflib)
- outpdfname="edgeR_top_100_heatmap.pdf"
- hmap2(normData,nsamp=100,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste('edgeR Heatmap',myTitle))
- outSmear = "edgeR_smearplot.pdf"
- outMain = paste("Smear Plot for ",TName,' Vs ',CName,' (FDR@',fdrthresh,' N = ',nsig,')',sep='')
- smearPlot(DGEList=DGEList,deTags=deTags, outSmear=outSmear, outMain = outMain)
- qqPlot(descr=paste(myTitle,'edgeR adj p QQ plot'),pvector=tt\$adj.p.value,outpdf='edgeR_qqplot.pdf')
- norm.factor = DGEList\$samples\$norm.factors
- topresults.edgeR = soutput[which(soutput\$adj.p.value < fdrthresh), ]
- edgeRcountsindex = which(allgenes %in% rownames(topresults.edgeR))
- edgeRcounts = rep(0, length(allgenes))
- edgeRcounts[edgeRcountsindex] = 1 # Create venn diagram of hits
- sink()
- } ### doedgeR
- if (doDESeq2 == T)
- {
- sink("DESeq2.log")
- # DESeq2
- require('DESeq2')
- library('RColorBrewer')
- if (length(subjects) == 0)
- {
- pdata = data.frame(Name=colnames(workCM),Rx=group,row.names=colnames(workCM))
- deSEQds = DESeqDataSetFromMatrix(countData = workCM, colData = pdata, design = formula(~ Rx))
- } else {
- pdata = data.frame(Name=colnames(workCM),Rx=group,subjects=subjects,row.names=colnames(workCM))
- deSEQds = DESeqDataSetFromMatrix(countData = workCM, colData = pdata, design = formula(~ subjects + Rx))
- }
- #DESeq2 = DESeq(deSEQds,fitType='local',pAdjustMethod=fdrtype)
- #rDESeq = results(DESeq2)
- #newCountDataSet(workCM, group)
- deSeqDatsizefac = estimateSizeFactors(deSEQds)
- deSeqDatdisp = estimateDispersions(deSeqDatsizefac,fitType=DESeq_fitType)
- resDESeq = nbinomWaldTest(deSeqDatdisp, pAdjustMethod=fdrtype)
- rDESeq = as.data.frame(results(resDESeq))
- rDESeq = cbind(Contig=rownames(workCM),rDESeq,NReads=cmrowsums)
- srDESeq = rDESeq[order(rDESeq\$pvalue),]
- write.table(srDESeq,file=out_DESeq2, quote=FALSE, sep="\t",row.names=F)
- qqPlot(descr=paste(myTitle,'DESeq2 adj p qq plot'),pvector=rDESeq\$padj,outpdf='DESeq2_qqplot.pdf')
- cat("# DESeq top 50\n")
- rDESeq = cbind(Contig=rownames(workCM),rDESeq,NReads=cmrowsums,URL=contigurls)
- srDESeq = rDESeq[order(rDESeq\$pvalue),]
- print.noquote(srDESeq[1:50,])
- topresults.DESeq = rDESeq[which(rDESeq\$padj < fdrthresh), ]
- DESeqcountsindex = which(allgenes %in% rownames(topresults.DESeq))
- DESeqcounts = rep(0, length(allgenes))
- DESeqcounts[DESeqcountsindex] = 1
- pdf("DESeq2_dispersion_estimates.pdf")
- plotDispEsts(resDESeq)
- dev.off()
- ysmall = abs(min(rDESeq\$log2FoldChange))
- ybig = abs(max(rDESeq\$log2FoldChange))
- ylimit = min(4,ysmall,ybig)
- pdf("DESeq2_MA_plot.pdf")
- plotMA(resDESeq,main=paste(myTitle,"DESeq2 MA plot"),ylim=c(-ylimit,ylimit))
- dev.off()
- rlogres = rlogTransformation(resDESeq)
- sampledists = dist( t( assay(rlogres) ) )
- sdmat = as.matrix(sampledists)
- pdf("DESeq2_sample_distance_plot.pdf")
- heatmap.2(sdmat,trace="none",main=paste(myTitle,"DESeq2 sample distances"),
- col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(255))
- dev.off()
- ###outpdfname="DESeq2_top50_heatmap.pdf"
- ###hmap2(sresDESeq,nsamp=50,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste('DESeq2 vst rlog Heatmap',myTitle))
- sink()
- result = try( (ppca = plotPCA( varianceStabilizingTransformation(deSeqDatdisp,blind=T), intgroup=c("Rx","Name")) ) )
- if ("try-error" %in% class(result)) {
- print.noquote('DESeq2 plotPCA failed.')
- } else {
- pdf("DESeq2_PCA_plot.pdf")
- #### wtf - print? Seems needed to get this to work
- print(ppca)
- dev.off()
- }
- }
-
- if (doVoom == T) {
- sink('Voom.log')
- if (doedgeR == F) {
- #### Setup DGEList object
- DGEList = DGEList(counts=workCM, group = group)
- DGEList = calcNormFactors(DGEList)
- DGEList = estimateGLMCommonDisp(DGEList,mydesign)
- DGEList = estimateGLMTrendedDisp(DGEList,mydesign)
- DGEList = estimateGLMTagwiseDisp(DGEList,mydesign)
- DGEList = estimateGLMTagwiseDisp(DGEList,mydesign)
- norm.factor = DGEList\$samples\$norm.factors
- }
- pdf("Voom_mean_variance_plot.pdf")
- dat.voomed = voom(DGEList, mydesign, plot = TRUE, lib.size = colSums(workCM) * norm.factor)
- dev.off()
- # Use limma to fit data
- fit = lmFit(dat.voomed, mydesign)
- fit = eBayes(fit)
- rvoom = topTable(fit, coef = length(colnames(mydesign)), adj = fdrtype, n = Inf, sort="none")
- qqPlot(descr=paste(myTitle,'Voom-limma adj p QQ plot'),pvector=rvoom\$adj.P.Val,outpdf='Voom_qqplot.pdf')
- rownames(rvoom) = rownames(workCM)
- rvoom = cbind(rvoom,NReads=cmrowsums)
- srvoom = rvoom[order(rvoom\$P.Value),]
- write.table(srvoom,file=out_VOOM, quote=FALSE, sep="\t",row.names=F)
- rvoom = cbind(rvoom,URL=contigurls)
- deTags = rownames(rvoom[rvoom\$adj.p.value < fdrthresh,])
- nsig = length(deTags)
- srvoom = rvoom[order(rvoom\$P.Value),]
- cat("# Voom top 50\n")
- print(srvoom[1:50,])
- normData = srvoom\$E
- outpdfname="Voom_top_100_heatmap.pdf"
- hmap2(normData,nsamp=100,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste('VOOM Heatmap',myTitle))
- outSmear = "Voom_smearplot.pdf"
- outMain = paste("Smear Plot for ",TName,' Vs ',CName,' (FDR@',fdrthresh,' N = ',nsig,')',sep='')
- smearPlot(DGEList=rvoom,deTags=deTags, outSmear=outSmear, outMain = outMain)
- qqPlot(descr=paste(myTitle,'VOOM adj p QQ plot'),pvector=srvoom\$adj.P.Val,outpdf='Voom_qqplot.pdf')
- # Use an FDR cutoff to find interesting samples for edgeR, DESeq and voom/limma
- topresults.voom = rvoom[which(rvoom\$adj.P.Val < fdrthresh), ]
- voomcountsindex = which(allgenes %in% topresults.voom\$ID)
- voomcounts = rep(0, length(allgenes))
- voomcounts[voomcountsindex] = 1
- sink()
- }
-
- if (doCamera) {
- doGSEA(y=DGEList,design=mydesign,histgmt=histgmt,bigmt=bigmt,ntest=20,myTitle=myTitle,
- outfname=paste(mt,"GSEA.xls",sep="_"),fdrthresh=fdrthresh,fdrtype=fdrtype)
- }
-
- if ((doDESeq2==T) || (doVoom==T) || (doedgeR==T)) {
- if ((doVoom==T) && (doDESeq2==T) && (doedgeR==T)) {
- vennmain = paste(mt,'Voom,edgeR and DESeq2 overlap at FDR=',fdrthresh)
- counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts,
- VOOM_limma = voomcounts, row.names = allgenes)
- } else if ((doDESeq2==T) && (doedgeR==T)) {
- vennmain = paste(mt,'DESeq2 and edgeR overlap at FDR=',fdrthresh)
- counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts, row.names = allgenes)
- } else if ((doVoom==T) && (doedgeR==T)) {
- vennmain = paste(mt,'Voom and edgeR overlap at FDR=',fdrthresh)
- counts.dataframe = data.frame(edgeR = edgeRcounts, VOOM_limma = voomcounts, row.names = allgenes)
- }
-
- if (nrow(counts.dataframe > 1)) {
- counts.venn = vennCounts(counts.dataframe)
- vennf = "Venn_significant_genes_overlap.pdf"
- pdf(vennf)
- vennDiagram(counts.venn,main=vennmain,col="maroon")
- dev.off()
- }
- } #### doDESeq2 or doVoom
-
-}
-#### Done
-
-###sink(stdout(),append=T,type="message")
-builtin_gmt = ""
-history_gmt = ""
-history_gmt_name = ""
-out_edgeR = F
-out_DESeq2 = F
-out_VOOM = "$out_VOOM"
-doDESeq2 = $DESeq2.doDESeq2 # make these T or F
-doVoom = $doVoom
-doCamera = F
-doedgeR = $edgeR.doedgeR
-edgeR_priordf = 0
-
-
-#if $doVoom == "T":
- out_VOOM = "$out_VOOM"
-#end if
-
-#if $DESeq2.doDESeq2 == "T":
- out_DESeq2 = "$out_DESeq2"
- DESeq_fitType = "$DESeq2.DESeq_fitType"
-#end if
-
-#if $edgeR.doedgeR == "T":
- out_edgeR = "$out_edgeR"
- edgeR_priordf = $edgeR.edgeR_priordf
-#end if
-
-
-if (sum(c(doedgeR,doVoom,doDESeq2)) == 0)
-{
-write("No methods chosen - nothing to do! Please try again after choosing one or more methods", stderr())
-quit(save="no",status=2)
-}
-
-Out_Dir = "$html_file.files_path"
-Input = "$input1"
-TreatmentName = "$treatment_name"
-TreatmentCols = "$Treat_cols"
-ControlName = "$control_name"
-ControlCols= "$Control_cols"
-org = "$input1.dbkey"
-if (org == "") { org = "hg19"}
-fdrtype = "$fdrtype"
-fdrthresh = $fdrthresh
-useNDF = $useNDF
-fQ = $fQ # non-differential centile cutoff
-myTitle = "$title"
-sids = strsplit("$subjectids",',')
-subjects = unlist(sids)
-nsubj = length(subjects)
-TCols = as.numeric(strsplit(TreatmentCols,",")[[1]])-1
-CCols = as.numeric(strsplit(ControlCols,",")[[1]])-1
-cat('Got TCols=')
-cat(TCols)
-cat('; CCols=')
-cat(CCols)
-cat('\n')
-useCols = c(TCols,CCols)
-if (file.exists(Out_Dir) == F) dir.create(Out_Dir)
-Count_Matrix = read.table(Input,header=T,row.names=1,sep='\t') #Load tab file assume header
-snames = colnames(Count_Matrix)
-nsamples = length(snames)
-if (nsubj > 0 & nsubj != nsamples) {
-options("show.error.messages"=T)
-mess = paste('Fatal error: Supplied subject id list',paste(subjects,collapse=','),
- 'has length',nsubj,'but there are',nsamples,'samples',paste(snames,collapse=','))
-write(mess, stderr())
-quit(save="no",status=4)
-}
-if (length(subjects) != 0) {subjects = subjects[useCols]}
-Count_Matrix = Count_Matrix[,useCols] ### reorder columns
-rn = rownames(Count_Matrix)
-islib = rn %in% c('librarySize','NotInBedRegions')
-LibSizes = Count_Matrix[subset(rn,islib),][1] # take first
-Count_Matrix = Count_Matrix[subset(rn,! islib),]
-group = c(rep(TreatmentName,length(TCols)), rep(ControlName,length(CCols)) ) #Build a group descriptor
-group = factor(group, levels=c(ControlName,TreatmentName))
-colnames(Count_Matrix) = paste(group,colnames(Count_Matrix),sep="_") #Relable columns
-results = edgeIt(Count_Matrix=Count_Matrix,group=group, out_edgeR=out_edgeR, out_VOOM=out_VOOM, out_DESeq2=out_DESeq2,
- fdrtype='BH',mydesign=NULL,priordf=edgeR_priordf,fdrthresh=fdrthresh,outputdir='.',
- myTitle=myTitle,useNDF=F,libSize=c(),filterquantile=fQ,subjects=subjects,
- doDESeq2=doDESeq2,doVoom=doVoom,doCamera=doCamera,doedgeR=doedgeR,org=org,
- histgmt=history_gmt,bigmt=builtin_gmt,DESeq_fitType=DESeq_fitType)
-sessionInfo()
-]]>
-
-
-
-
-**What it does**
-
-Allows short read sequence counts from controlled experiments to be analysed for differentially expressed genes.
-Optionally adds a term for subject if not all samples are independent or if some other factor needs to be blocked in the design.
-
-**Input**
-
-Requires a count matrix as a tabular file. These are best made using the companion HTSeq_ based counter Galaxy wrapper
-and your fave gene model to generate inputs. Each row is a genomic feature (gene or exon eg) and each column the
-non-negative integer count of reads from one sample overlapping the feature.
-The matrix must have a header row uniquely identifying the source samples, and unique row names in
-the first column. Typically the row names are gene symbols or probe ids for downstream use in GSEA and other methods.
-
-**Specifying comparisons**
-
-This is basically dumbed down for two factors - case vs control.
-
-More complex interfaces are possible but painful at present.
-Probably need to specify a phenotype file to do this better.
-Work in progress. Send code.
-
-If you have (eg) paired samples and wish to include a term in the GLM to account for some other factor (subject in the case of paired samples),
-put a comma separated list of indicators for every sample (whether modelled or not!) indicating (eg) the subject number or
-A list of integers, one for each subject or an empty string if samples are all independent.
-If not empty, there must be exactly as many integers in the supplied integer list as there are columns (samples) in the count matrix.
-Integers for samples that are not in the analysis *must* be present in the string as filler even if not used.
-
-So if you have 2 pairs out of 6 samples, you need to put in unique integers for the unpaired ones
-eg if you had 6 samples with the first two independent but the second and third pairs each being from independent subjects. you might use
-8,9,1,1,2,2
-as subject IDs to indicate two paired samples from the same subject in columns 3/4 and 5/6
-
-**Methods available**
-
-You can run 3 popular Bioconductor packages available for count data.
-
-edgeR - see edgeR_ for details
-
-VOOM/limma - see limma_VOOM_ for details
-
-DESeq2 - see DESeq2_ for details
-
-and optionally camera in edgeR which works better if MSigDB is installed.
-
-**Outputs**
-
-Some helpful plots and analysis results. Note that most of these are produced using R code
-suggested by the excellent documentation and vignettes for the Bioconductor
-packages invoked. The Tool Factory is used to automatically lay these out for you to enjoy.
-
-**Note on Voom**
-
-The voom from limma version 3.16.6 help in R includes this from the authors - but you should read the paper to interpret this method.
-
-This function is intended to process RNA-Seq or ChIP-Seq data prior to linear modelling in limma.
-
-voom is an acronym for mean-variance modelling at the observational level.
-The key concern is to estimate the mean-variance relationship in the data, then use this to compute appropriate weights for each observation.
-Count data almost show non-trivial mean-variance relationships. Raw counts show increasing variance with increasing count size, while log-counts typically show a decreasing mean-variance trend.
-This function estimates the mean-variance trend for log-counts, then assigns a weight to each observation based on its predicted variance.
-The weights are then used in the linear modelling process to adjust for heteroscedasticity.
-
-In an experiment, a count value is observed for each tag in each sample. A tag-wise mean-variance trend is computed using lowess.
-The tag-wise mean is the mean log2 count with an offset of 0.5, across samples for a given tag.
-The tag-wise variance is the quarter-root-variance of normalized log2 counts per million values with an offset of 0.5, across samples for a given tag.
-Tags with zero counts across all samples are not included in the lowess fit. Optional normalization is performed using normalizeBetweenArrays.
-Using fitted values of log2 counts from a linear model fit by lmFit, variances from the mean-variance trend were interpolated for each observation.
-This was carried out by approxfun. Inverse variance weights can be used to correct for mean-variance trend in the count data.
-
-
-Author(s)
-
-Charity Law and Gordon Smyth
-
-References
-
-Law, CW (2013). Precision weights for gene expression analysis. PhD Thesis. University of Melbourne, Australia.
-
-Law, CW, Chen, Y, Shi, W, Smyth, GK (2013). Voom! Precision weights unlock linear model analysis tools for RNA-seq read counts.
-Technical Report 1 May 2013, Bioinformatics Division, Walter and Eliza Hall Institute of Medical Reseach, Melbourne, Australia.
-http://www.statsci.org/smyth/pubs/VoomPreprint.pdf
-
-See Also
-
-A voom case study is given in the edgeR User's Guide.
-
-vooma is a similar function but for microarrays instead of RNA-seq.
-
-
-***old rant on changes to Bioconductor package variable names between versions***
-
-The edgeR authors made a small cosmetic change in the name of one important variable (from p.value to PValue)
-breaking this and all other code that assumed the old name for this variable,
-between edgeR2.4.4 and 2.4.6 (the version for R 2.14 as at the time of writing).
-This means that all code using edgeR is sensitive to the version. I think this was a very unwise thing
-to do because it wasted hours of my time to track down and will similarly cost other edgeR users dearly
-when their old scripts break. This tool currently now works with 2.4.6.
-
-**Note on prior.N**
-
-http://seqanswers.com/forums/showthread.php?t=5591 says:
-
-*prior.n*
-
-The value for prior.n determines the amount of smoothing of tagwise dispersions towards the common dispersion.
-You can think of it as like a "weight" for the common value. (It is actually the weight for the common likelihood
-in the weighted likelihood equation). The larger the value for prior.n, the more smoothing, i.e. the closer your
-tagwise dispersion estimates will be to the common dispersion. If you use a prior.n of 1, then that gives the
-common likelihood the weight of one observation.
-
-In answer to your question, it is a good thing to squeeze the tagwise dispersions towards a common value,
-or else you will be using very unreliable estimates of the dispersion. I would not recommend using the value that
-you obtained from estimateSmoothing()---this is far too small and would result in virtually no moderation
-(squeezing) of the tagwise dispersions. How many samples do you have in your experiment?
-What is the experimental design? If you have few samples (less than 6) then I would suggest a prior.n of at least 10.
-If you have more samples, then the tagwise dispersion estimates will be more reliable,
-so you could consider using a smaller prior.n, although I would hesitate to use a prior.n less than 5.
-
-
-From Bioconductor Digest, Vol 118, Issue 5, Gordon writes:
-
-Dear Dorota,
-
-The important settings are prior.df and trend.
-
-prior.n and prior.df are related through prior.df = prior.n * residual.df,
-and your experiment has residual.df = 36 - 12 = 24. So the old setting of
-prior.n=10 is equivalent for your data to prior.df = 240, a very large
-value. Going the other way, the new setting of prior.df=10 is equivalent
-to prior.n=10/24.
-
-To recover old results with the current software you would use
-
- estimateTagwiseDisp(object, prior.df=240, trend="none")
-
-To get the new default from old software you would use
-
- estimateTagwiseDisp(object, prior.n=10/24, trend=TRUE)
-
-Actually the old trend method is equivalent to trend="loess" in the new
-software. You should use plotBCV(object) to see whether a trend is
-required.
-
-Note you could also use
-
- prior.n = getPriorN(object, prior.df=10)
-
-to map between prior.df and prior.n.
-
-----
-
-**Attributions**
-
-edgeR - edgeR_
-
-VOOM/limma - limma_VOOM_
-
-DESeq2 - DESeq2_ for details
-
-See above for Bioconductor package documentation for packages exposed in Galaxy by this tool and app store package.
-
-Galaxy_ (that's what you are using right now!) for gluing everything together
-
-Otherwise, all code and documentation comprising this tool was written by Ross Lazarus and is
-licensed to you under the LGPL_ like other rgenetics artefacts
-
-.. _LGPL: http://www.gnu.org/copyleft/lesser.html
-.. _HTSeq: http://www-huber.embl.de/users/anders/HTSeq/doc/index.html
-.. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html
-.. _DESeq2: http://www.bioconductor.org/packages/release/bioc/html/DESeq2.html
-.. _limma_VOOM: http://www.bioconductor.org/packages/release/bioc/html/limma.html
-.. _Galaxy: http://getgalaxy.org
-
-
-
-
-
diff -r bdb19fdbd679 -r 6d9d2c9679ec rgedgeRpaired_nocamera.xml~
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rgedgeRpaired_nocamera.xml~ Mon Dec 23 16:42:58 2013 -0500
@@ -0,0 +1,1095 @@
+
+ models using BioConductor packages
+
+ r3
+ graphicsmagick
+ ghostscript
+ biocbasics
+
+
+
+ rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "DifferentialCounts"
+ --output_dir "$html_file.files_path" --output_html "$html_file" --make_HTML "yes"
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ edgeR['doedgeR'] == "T"
+
+
+ DESeq2['doDESeq2'] == "T"
+
+
+ doVoom == "T"
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ nsamp) {
+ dm =dm[1:nsamp,]
+ #sub = paste('Showing',nsamp,'contigs ranked for evidence for differential abundance out of',nprobes,'total')
+ }
+ newcolnames = substr(colnames(dm),1,20)
+ colnames(dm) = newcolnames
+ pdf(outpdfname)
+ heatmap.2(dm,main=myTitle,ColSideColors=pcols,col=topo.colors(100),dendrogram="col",key=T,density.info='none',
+ Rowv=F,scale='row',trace='none',margins=c(8,8),cexRow=0.4,cexCol=0.5)
+ dev.off()
+}
+
+hmap = function(cmat,nmeans=4,outpdfname="heatMap.pdf",nsamp=250,TName='Treatment',group=NA,myTitle="Title goes here")
+{
+ # for 2 groups only was
+ #col.map = function(g) {if (g==TName) "#FF0000" else "#0000FF"}
+ #pcols = unlist(lapply(group,col.map))
+ gu = unique(group)
+ colours = rainbow(length(gu),start=0.3,end=0.6)
+ pcols = colours[match(group,gu)]
+ nrows = nrow(cmat)
+ mtitle = paste(myTitle,'Heatmap: n contigs =',nrows)
+ if (nrows > nsamp) {
+ cmat = cmat[c(1:nsamp),]
+ mtitle = paste('Heatmap: Top ',nsamp,' DE contigs (of ',nrows,')',sep='')
+ }
+ newcolnames = substr(colnames(cmat),1,20)
+ colnames(cmat) = newcolnames
+ pdf(outpdfname)
+ heatmap(cmat,scale='row',main=mtitle,cexRow=0.3,cexCol=0.4,Rowv=NA,ColSideColors=pcols)
+ dev.off()
+}
+
+qqPlot = function(descr='qqplot',pvector, outpdf='qqplot.pdf',...)
+# stolen from https://gist.github.com/703512
+{
+ o = -log10(sort(pvector,decreasing=F))
+ e = -log10( 1:length(o)/length(o) )
+ o[o==-Inf] = reallysmall
+ o[o==Inf] = reallybig
+ maint = descr
+ pdf(outpdf)
+ plot(e,o,pch=19,cex=1, main=maint, ...,
+ xlab=expression(Expected~~-log[10](italic(p))),
+ ylab=expression(Observed~~-log[10](italic(p))),
+ xlim=c(0,max(e)), ylim=c(0,max(o)))
+ lines(e,e,col="red")
+ grid(col = "lightgray", lty = "dotted")
+ dev.off()
+}
+
+smearPlot = function(DGEList,deTags, outSmear, outMain)
+ {
+ pdf(outSmear)
+ plotSmear(DGEList,de.tags=deTags,main=outMain)
+ grid(col="lightgray", lty="dotted")
+ dev.off()
+ }
+
+boxPlot = function(rawrs,cleanrs,maint,myTitle,pdfname)
+{ #
+ nc = ncol(rawrs)
+ #### for (i in c(1:nc)) {rawrs[(rawrs[,i] < 0),i] = NA}
+ fullnames = colnames(rawrs)
+ newcolnames = substr(colnames(rawrs),1,20)
+ colnames(rawrs) = newcolnames
+ newcolnames = substr(colnames(cleanrs),1,20)
+ colnames(cleanrs) = newcolnames
+ defpar = par(no.readonly=T)
+ print.noquote('raw contig counts by sample:')
+ print.noquote(summary(rawrs))
+ print.noquote('normalised contig counts by sample:')
+ print.noquote(summary(cleanrs))
+ pdf(pdfname)
+ par(mfrow=c(1,2))
+ boxplot(rawrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('Raw:',maint))
+ grid(col="lightgray",lty="dotted")
+ boxplot(cleanrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('After ',maint))
+ grid(col="lightgray",lty="dotted")
+ dev.off()
+ pdfname = "sample_counts_histogram.pdf"
+ nc = ncol(rawrs)
+ print.noquote(paste('Using ncol rawrs=',nc))
+ ncroot = round(sqrt(nc))
+ if (ncroot*ncroot < nc) { ncroot = ncroot + 1 }
+ m = c()
+ for (i in c(1:nc)) {
+ rhist = hist(rawrs[,i],breaks=100,plot=F)
+ m = append(m,max(rhist\$counts))
+ }
+ ymax = max(m)
+ ncols = length(fullnames)
+ if (ncols > 20)
+ {
+ scale = 7*ncols/20
+ pdf(pdfname,width=scale,height=scale)
+ } else {
+ pdf(pdfname)
+ }
+ par(mfrow=c(ncroot,ncroot))
+ for (i in c(1:nc)) {
+ hist(rawrs[,i], main=paste("Contig logcount",i), xlab='log raw count', col="maroon",
+ breaks=100,sub=fullnames[i],cex=0.8,ylim=c(0,ymax))
+ }
+ dev.off()
+ par(defpar)
+
+}
+
+cumPlot = function(rawrs,cleanrs,maint,myTitle)
+{ # updated to use ecdf
+ pdfname = "Filtering_rowsum_bar_charts.pdf"
+ defpar = par(no.readonly=T)
+ lrs = log(rawrs,10)
+ lim = max(lrs)
+ pdf(pdfname)
+ par(mfrow=c(2,1))
+ hist(lrs,breaks=100,main=paste('Before:',maint),xlab="# Reads (log)",
+ ylab="Count",col="maroon",sub=myTitle, xlim=c(0,lim),las=1)
+ grid(col="lightgray", lty="dotted")
+ lrs = log(cleanrs,10)
+ hist(lrs,breaks=100,main=paste('After:',maint),xlab="# Reads (log)",
+ ylab="Count",col="maroon",sub=myTitle,xlim=c(0,lim),las=1)
+ grid(col="lightgray", lty="dotted")
+ dev.off()
+ par(defpar)
+}
+
+cumPlot1 = function(rawrs,cleanrs,maint,myTitle)
+{ # updated to use ecdf
+ pdfname = paste(gsub(" ","", myTitle , fixed=TRUE),"RowsumCum.pdf",sep='_')
+ pdf(pdfname)
+ par(mfrow=c(2,1))
+ lastx = max(rawrs)
+ rawe = knots(ecdf(rawrs))
+ cleane = knots(ecdf(cleanrs))
+ cy = 1:length(cleane)/length(cleane)
+ ry = 1:length(rawe)/length(rawe)
+ plot(rawe,ry,type='l',main=paste('Before',maint),xlab="Log Contig Total Reads",
+ ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle)
+ grid(col="blue")
+ plot(cleane,cy,type='l',main=paste('After',maint),xlab="Log Contig Total Reads",
+ ylab="Cumulative proportion",col="maroon",log='x',xlim=c(1,lastx),sub=myTitle)
+ grid(col="blue")
+ dev.off()
+}
+
+
+
+doGSEA = function(y=NULL,design=NULL,histgmt="",
+ bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
+ ntest=0, myTitle="myTitle", outfname="GSEA.xls", minnin=5, maxnin=2000,fdrthresh=0.05,fdrtype="BH")
+{
+ sink('Camera.log')
+ genesets = c()
+ if (bigmt > "")
+ {
+ bigenesets = readLines(bigmt)
+ genesets = bigenesets
+ }
+ if (histgmt > "")
+ {
+ hgenesets = readLines(histgmt)
+ if (bigmt > "") {
+ genesets = rbind(genesets,hgenesets)
+ } else {
+ genesets = hgenesets
+ } # use only history if no bi
+ }
+ print.noquote(paste("@@@read",length(genesets), 'genesets from',histgmt,bigmt))
+ genesets = strsplit(genesets,'\t') # tabular. genesetid\tURLorwhatever\tgene_1\t..\tgene_n
+ outf = outfname
+ head=paste(myTitle,'edgeR GSEA')
+ write(head,file=outfname,append=F)
+ ntest=length(genesets)
+ urownames = toupper(rownames(y))
+ upcam = c()
+ downcam = c()
+ for (i in 1:ntest) {
+ gs = unlist(genesets[i])
+ g = gs[1] # geneset_id
+ u = gs[2]
+ if (u > "") { u = paste("",u,"",sep="") }
+ glist = gs[3:length(gs)] # member gene symbols
+ glist = toupper(glist)
+ inglist = urownames %in% glist
+ nin = sum(inglist)
+ if ((nin > minnin) && (nin < maxnin)) {
+ ### print(paste('@@found',sum(inglist),'genes in glist'))
+ camres = camera(y=y,index=inglist,design=design)
+ if (! is.null(camres)) {
+ rownames(camres) = g # gene set name
+ camres = cbind(GeneSet=g,URL=u,camres)
+ if (camres\$Direction == "Up")
+ {
+ upcam = rbind(upcam,camres) } else {
+ downcam = rbind(downcam,camres)
+ }
+ }
+ }
+ }
+ uscam = upcam[order(upcam\$PValue),]
+ unadjp = uscam\$PValue
+ uscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
+ nup = max(10,sum((uscam\$adjPValue < fdrthresh)))
+ dscam = downcam[order(downcam\$PValue),]
+ unadjp = dscam\$PValue
+ dscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
+ ndown = max(10,sum((dscam\$adjPValue < fdrthresh)))
+ write.table(uscam,file=paste('camera_up',outfname,sep='_'),quote=F,sep='\t',row.names=F)
+ write.table(dscam,file=paste('camera_down',outfname,sep='_'),quote=F,sep='\t',row.names=F)
+ print.noquote(paste('@@@@@ Camera up top',nup,'gene sets:'))
+ write.table(head(uscam,nup),file="",quote=F,sep='\t',row.names=F)
+ print.noquote(paste('@@@@@ Camera down top',ndown,'gene sets:'))
+ write.table(head(dscam,ndown),file="",quote=F,sep='\t',row.names=F)
+ sink()
+}
+
+
+
+
+doGSEAatonce = function(y=NULL,design=NULL,histgmt="",
+ bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
+ ntest=0, myTitle="myTitle", outfname="GSEA.xls", minnin=5, maxnin=2000,fdrthresh=0.05,fdrtype="BH")
+{
+ sink('Camera.log')
+ genesets = c()
+ if (bigmt > "")
+ {
+ bigenesets = readLines(bigmt)
+ genesets = bigenesets
+ }
+ if (histgmt > "")
+ {
+ hgenesets = readLines(histgmt)
+ if (bigmt > "") {
+ genesets = rbind(genesets,hgenesets)
+ } else {
+ genesets = hgenesets
+ } # use only history if no bi
+ }
+ print.noquote(paste("@@@read",length(genesets), 'genesets from',histgmt,bigmt))
+ genesets = strsplit(genesets,'\t') # tabular. genesetid\tURLorwhatever\tgene_1\t..\tgene_n
+ outf = outfname
+ head=paste(myTitle,'edgeR GSEA')
+ write(head,file=outfname,append=F)
+ ntest=length(genesets)
+ urownames = toupper(rownames(y))
+ upcam = c()
+ downcam = c()
+ incam = c()
+ urls = c()
+ gsids = c()
+ for (i in 1:ntest) {
+ gs = unlist(genesets[i])
+ gsid = gs[1] # geneset_id
+ url = gs[2]
+ if (url > "") { url = paste("",url,"",sep="") }
+ glist = gs[3:length(gs)] # member gene symbols
+ glist = toupper(glist)
+ inglist = urownames %in% glist
+ nin = sum(inglist)
+ if ((nin > minnin) && (nin < maxnin)) {
+ incam = c(incam,inglist)
+ gsids = c(gsids,gsid)
+ urls = c(urls,url)
+ }
+ }
+ incam = as.list(incam)
+ names(incam) = gsids
+ allcam = camera(y=y,index=incam,design=design)
+ allcamres = cbind(geneset=gsids,allcam,URL=urls)
+ for (i in 1:ntest) {
+ camres = allcamres[i]
+ res = try(test = (camres\$Direction == "Up"))
+ if ("try-error" %in% class(res)) {
+ cat("test failed, camres = :")
+ print.noquote(camres)
+ } else { if (camres\$Direction == "Up")
+ { upcam = rbind(upcam,camres)
+ } else { downcam = rbind(downcam,camres)
+ }
+
+ }
+ }
+ uscam = upcam[order(upcam\$PValue),]
+ unadjp = uscam\$PValue
+ uscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
+ nup = max(10,sum((uscam\$adjPValue < fdrthresh)))
+ dscam = downcam[order(downcam\$PValue),]
+ unadjp = dscam\$PValue
+ dscam\$adjPValue = p.adjust(unadjp,method=fdrtype)
+ ndown = max(10,sum((dscam\$adjPValue < fdrthresh)))
+ write.table(uscam,file=paste('camera_up',outfname,sep='_'),quote=F,sep='\t',row.names=F)
+ write.table(dscam,file=paste('camera_down',outfname,sep='_'),quote=F,sep='\t',row.names=F)
+ print.noquote(paste('@@@@@ Camera up top',nup,'gene sets:'))
+ write.table(head(uscam,nup),file="",quote=F,sep='\t',row.names=F)
+ print.noquote(paste('@@@@@ Camera down top',ndown,'gene sets:'))
+ write.table(head(dscam,ndown),file="",quote=F,sep='\t',row.names=F)
+ sink()
+ }
+
+
+edgeIt = function (Count_Matrix=c(),group=c(),out_edgeR=F,out_VOOM=F,out_DESeq2=F,fdrtype='fdr',priordf=5,
+ fdrthresh=0.05,outputdir='.', myTitle='Differential Counts',libSize=c(),useNDF=F,
+ filterquantile=0.2, subjects=c(),mydesign=NULL,
+ doDESeq2=T,doVoom=T,doCamera=T,doedgeR=T,org='hg19',
+ histgmt="", bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
+ doCook=F,DESeq_fitType="parameteric")
+{
+ # Error handling
+ if (length(unique(group))!=2){
+ print("Number of conditions identified in experiment does not equal 2")
+ q()
+ }
+ require(edgeR)
+ options(width = 512)
+ mt = paste(unlist(strsplit(myTitle,'_')),collapse=" ")
+ allN = nrow(Count_Matrix)
+ nscut = round(ncol(Count_Matrix)/2)
+ colTotmillionreads = colSums(Count_Matrix)/1e6
+ counts.dataframe = as.data.frame(c())
+ rawrs = rowSums(Count_Matrix)
+ nonzerod = Count_Matrix[(rawrs > 0),] # remove all zero count genes
+ nzN = nrow(nonzerod)
+ nzrs = rowSums(nonzerod)
+ zN = allN - nzN
+ print('# Quantiles for non-zero row counts:',quote=F)
+ print(quantile(nzrs,probs=seq(0,1,0.1)),quote=F)
+ if (useNDF == T)
+ {
+ gt1rpin3 = rowSums(Count_Matrix/expandAsMatrix(colTotmillionreads,dim(Count_Matrix)) >= 1) >= nscut
+ lo = colSums(Count_Matrix[!gt1rpin3,])
+ workCM = Count_Matrix[gt1rpin3,]
+ cleanrs = rowSums(workCM)
+ cleanN = length(cleanrs)
+ meth = paste( "After removing",length(lo),"contigs with fewer than ",nscut," sample read counts >= 1 per million, there are",sep="")
+ print(paste("Read",allN,"contigs. Removed",zN,"contigs with no reads.",meth,cleanN,"contigs"),quote=F)
+ maint = paste('Filter >=1/million reads in >=',nscut,'samples')
+ } else {
+ useme = (nzrs > quantile(nzrs,filterquantile))
+ workCM = nonzerod[useme,]
+ lo = colSums(nonzerod[!useme,])
+ cleanrs = rowSums(workCM)
+ cleanN = length(cleanrs)
+ meth = paste("After filtering at count quantile =",filterquantile,", there are",sep="")
+ print(paste('Read',allN,"contigs. Removed",zN,"with no reads.",meth,cleanN,"contigs"),quote=F)
+ maint = paste('Filter below',filterquantile,'quantile')
+ }
+ cumPlot(rawrs=rawrs,cleanrs=cleanrs,maint=maint,myTitle=myTitle)
+ allgenes = rownames(workCM)
+ reg = "^chr([0-9]+):([0-9]+)-([0-9]+)"
+ genecards=" 0.8) # is ucsc style string
+ {
+ print("@@ using ucsc substitution for urls")
+ contigurls = paste0(ucsc,"&position=chr",testreg[,2],":",testreg[,3],"-",testreg[,4],"\'>",allgenes,"")
+ } else {
+ print.noquote("@@ using genecards substitution for urls")
+ contigurls = paste0(genecards,allgenes,"\'>",allgenes,"")
+ }
+ print(paste("# Total low count contigs per sample = ",paste(lo,collapse=',')),quote=F)
+ cmrowsums = rowSums(workCM)
+ TName=unique(group)[1]
+ CName=unique(group)[2]
+ if (is.null(mydesign)) {
+ if (length(subjects) == 0)
+ {
+ mydesign = model.matrix(~group)
+ }
+ else {
+ subjf = factor(subjects)
+ mydesign = model.matrix(~subjf+group) # we block on subject so make group last to simplify finding it
+ }
+ }
+ print.noquote(paste('Using samples:',paste(colnames(workCM),collapse=',')))
+ print.noquote('Using design matrix:')
+ print.noquote(mydesign)
+ if (doedgeR) {
+ sink('edgeR.log')
+ #### Setup DGEList object
+ DGEList = DGEList(counts=workCM, group = group)
+ DGEList = calcNormFactors(DGEList)
+
+ DGEList = estimateGLMCommonDisp(DGEList,mydesign)
+ comdisp = DGEList\$common.dispersion
+ DGEList = estimateGLMTrendedDisp(DGEList,mydesign)
+ if (edgeR_priordf > 0) {
+ print.noquote(paste("prior.df =",edgeR_priordf))
+ DGEList = estimateGLMTagwiseDisp(DGEList,mydesign,prior.df = edgeR_priordf)
+ } else {
+ DGEList = estimateGLMTagwiseDisp(DGEList,mydesign)
+ }
+ DGLM = glmFit(DGEList,design=mydesign)
+ DE = glmLRT(DGLM,coef=ncol(DGLM\$design)) # always last one - subject is first if needed
+ efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors
+ normData = (1e+06*DGEList\$counts/efflib)
+ uoutput = cbind(
+ Name=as.character(rownames(DGEList\$counts)),
+ DE\$table,
+ adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
+ Dispersion=DGEList\$tagwise.dispersion,totreads=cmrowsums,normData,
+ DGEList\$counts
+ )
+ soutput = uoutput[order(DE\$table\$PValue),] # sorted into p value order - for quick toptable
+ goodness = gof(DGLM, pcutoff=fdrthresh)
+ if (sum(goodness\$outlier) > 0) {
+ print.noquote('GLM outliers:')
+ print(paste(rownames(DGLM)[(goodness\$outlier)],collapse=','),quote=F)
+ } else {
+ print('No GLM fit outlier genes found\n')
+ }
+ z = limma::zscoreGamma(goodness\$gof.statistic, shape=goodness\$df/2, scale=2)
+ pdf("edgeR_GoodnessofFit.pdf")
+ qq = qqnorm(z, panel.first=grid(), main="tagwise dispersion")
+ abline(0,1,lwd=3)
+ points(qq\$x[goodness\$outlier],qq\$y[goodness\$outlier], pch=16, col="maroon")
+ dev.off()
+ estpriorn = getPriorN(DGEList)
+ print(paste("Common Dispersion =",comdisp,"CV = ",sqrt(comdisp),"getPriorN = ",estpriorn),quote=F)
+ efflib = DGEList\$samples\$lib.size*DGEList\$samples\$norm.factors
+ normData = (1e+06*DGEList\$counts)/efflib
+ lnormData = log(normData + 1e-6,10)
+ uniqueg = unique(group)
+ #### Plot MDS
+ sample_colors = match(group,levels(group))
+ sampleTypes = levels(factor(group))
+ print.noquote(sampleTypes)
+ pdf("edgeR_MDSplot.pdf")
+ plotMDS.DGEList(DGEList,main=paste("edgeR MDS for",myTitle),cex=0.5,col=sample_colors,pch=sample_colors)
+ legend(x="topleft", legend = sampleTypes,col=c(1:length(sampleTypes)), pch=19)
+ grid(col="blue")
+ dev.off()
+ colnames(normData) = paste( colnames(normData),'N',sep="_")
+ print(paste('Raw sample read totals',paste(colSums(nonzerod,na.rm=T),collapse=',')))
+ nzd = data.frame(log(nonzerod + 1e-2,10))
+ try( boxPlot(rawrs=nzd,cleanrs=lnormData,maint='TMM Normalisation',myTitle=myTitle,pdfname="edgeR_raw_norm_counts_box.pdf") )
+ write.table(soutput,file=out_edgeR, quote=FALSE, sep="\t",row.names=F)
+ tt = cbind(
+ Name=as.character(rownames(DGEList\$counts)),
+ DE\$table,
+ adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
+ Dispersion=DGEList\$tagwise.dispersion,totreads=cmrowsums
+ )
+ print.noquote("# edgeR Top tags\n")
+ tt = cbind(tt,URL=contigurls) # add to end so table isn't laid out strangely
+ tt = tt[order(DE\$table\$PValue),]
+ print.noquote(tt[1:50,])
+ deTags = rownames(uoutput[uoutput\$adj.p.value < fdrthresh,])
+ nsig = length(deTags)
+ print(paste('#',nsig,'tags significant at adj p=',fdrthresh),quote=F)
+ deColours = ifelse(deTags,'red','black')
+ pdf("edgeR_BCV_vs_abundance.pdf")
+ plotBCV(DGEList, cex=0.3, main="Biological CV vs abundance",col.tagwise=deColours)
+ dev.off()
+ dg = DGEList[order(DE\$table\$PValue),]
+ #normData = (1e+06 * dg\$counts/expandAsMatrix(dg\$samples\$lib.size, dim(dg)))
+ efflib = dg\$samples\$lib.size*dg\$samples\$norm.factors
+ normData = (1e+06*dg\$counts/efflib)
+ outpdfname="edgeR_top_100_heatmap.pdf"
+ hmap2(normData,nsamp=100,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste('edgeR Heatmap',myTitle))
+ outSmear = "edgeR_smearplot.pdf"
+ outMain = paste("Smear Plot for ",TName,' Vs ',CName,' (FDR@',fdrthresh,' N = ',nsig,')',sep='')
+ smearPlot(DGEList=DGEList,deTags=deTags, outSmear=outSmear, outMain = outMain)
+ qqPlot(descr=paste(myTitle,'edgeR adj p QQ plot'),pvector=tt\$adj.p.value,outpdf='edgeR_qqplot.pdf')
+ norm.factor = DGEList\$samples\$norm.factors
+ topresults.edgeR = soutput[which(soutput\$adj.p.value < fdrthresh), ]
+ edgeRcountsindex = which(allgenes %in% rownames(topresults.edgeR))
+ edgeRcounts = rep(0, length(allgenes))
+ edgeRcounts[edgeRcountsindex] = 1 # Create venn diagram of hits
+ sink()
+ } ### doedgeR
+ if (doDESeq2 == T)
+ {
+ sink("DESeq2.log")
+ # DESeq2
+ require('DESeq2')
+ library('RColorBrewer')
+ if (length(subjects) == 0)
+ {
+ pdata = data.frame(Name=colnames(workCM),Rx=group,row.names=colnames(workCM))
+ deSEQds = DESeqDataSetFromMatrix(countData = workCM, colData = pdata, design = formula(~ Rx))
+ } else {
+ pdata = data.frame(Name=colnames(workCM),Rx=group,subjects=subjects,row.names=colnames(workCM))
+ deSEQds = DESeqDataSetFromMatrix(countData = workCM, colData = pdata, design = formula(~ subjects + Rx))
+ }
+ #DESeq2 = DESeq(deSEQds,fitType='local',pAdjustMethod=fdrtype)
+ #rDESeq = results(DESeq2)
+ #newCountDataSet(workCM, group)
+ deSeqDatsizefac = estimateSizeFactors(deSEQds)
+ deSeqDatdisp = estimateDispersions(deSeqDatsizefac,fitType=DESeq_fitType)
+ resDESeq = nbinomWaldTest(deSeqDatdisp, pAdjustMethod=fdrtype)
+ rDESeq = as.data.frame(results(resDESeq))
+ rDESeq = cbind(Contig=rownames(workCM),rDESeq,NReads=cmrowsums)
+ srDESeq = rDESeq[order(rDESeq\$pvalue),]
+ write.table(srDESeq,file=out_DESeq2, quote=FALSE, sep="\t",row.names=F)
+ qqPlot(descr=paste(myTitle,'DESeq2 adj p qq plot'),pvector=rDESeq\$padj,outpdf='DESeq2_qqplot.pdf')
+ cat("# DESeq top 50\n")
+ rDESeq = cbind(Contig=rownames(workCM),rDESeq,NReads=cmrowsums,URL=contigurls)
+ srDESeq = rDESeq[order(rDESeq\$pvalue),]
+ print.noquote(srDESeq[1:50,])
+ topresults.DESeq = rDESeq[which(rDESeq\$padj < fdrthresh), ]
+ DESeqcountsindex = which(allgenes %in% rownames(topresults.DESeq))
+ DESeqcounts = rep(0, length(allgenes))
+ DESeqcounts[DESeqcountsindex] = 1
+ pdf("DESeq2_dispersion_estimates.pdf")
+ plotDispEsts(resDESeq)
+ dev.off()
+ ysmall = abs(min(rDESeq\$log2FoldChange))
+ ybig = abs(max(rDESeq\$log2FoldChange))
+ ylimit = min(4,ysmall,ybig)
+ pdf("DESeq2_MA_plot.pdf")
+ plotMA(resDESeq,main=paste(myTitle,"DESeq2 MA plot"),ylim=c(-ylimit,ylimit))
+ dev.off()
+ rlogres = rlogTransformation(resDESeq)
+ sampledists = dist( t( assay(rlogres) ) )
+ sdmat = as.matrix(sampledists)
+ pdf("DESeq2_sample_distance_plot.pdf")
+ heatmap.2(sdmat,trace="none",main=paste(myTitle,"DESeq2 sample distances"),
+ col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(255))
+ dev.off()
+ ###outpdfname="DESeq2_top50_heatmap.pdf"
+ ###hmap2(sresDESeq,nsamp=50,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste('DESeq2 vst rlog Heatmap',myTitle))
+ sink()
+ result = try( (ppca = plotPCA( varianceStabilizingTransformation(deSeqDatdisp,blind=T), intgroup=c("Rx","Name")) ) )
+ if ("try-error" %in% class(result)) {
+ print.noquote('DESeq2 plotPCA failed.')
+ } else {
+ pdf("DESeq2_PCA_plot.pdf")
+ #### wtf - print? Seems needed to get this to work
+ print(ppca)
+ dev.off()
+ }
+ }
+
+ if (doVoom == T) {
+ sink('Voom.log')
+ if (doedgeR == F) {
+ #### Setup DGEList object
+ DGEList = DGEList(counts=workCM, group = group)
+ DGEList = calcNormFactors(DGEList)
+ DGEList = estimateGLMCommonDisp(DGEList,mydesign)
+ DGEList = estimateGLMTrendedDisp(DGEList,mydesign)
+ DGEList = estimateGLMTagwiseDisp(DGEList,mydesign)
+ DGEList = estimateGLMTagwiseDisp(DGEList,mydesign)
+ norm.factor = DGEList\$samples\$norm.factors
+ }
+ pdf("Voom_mean_variance_plot.pdf")
+ dat.voomed = voom(DGEList, mydesign, plot = TRUE, lib.size = colSums(workCM) * norm.factor)
+ dev.off()
+ # Use limma to fit data
+ fit = lmFit(dat.voomed, mydesign)
+ fit = eBayes(fit)
+ rvoom = topTable(fit, coef = length(colnames(mydesign)), adj = fdrtype, n = Inf, sort="none")
+ qqPlot(descr=paste(myTitle,'Voom-limma adj p QQ plot'),pvector=rvoom\$adj.P.Val,outpdf='Voom_qqplot.pdf')
+ rownames(rvoom) = rownames(workCM)
+ rvoom = cbind(rvoom,NReads=cmrowsums)
+ srvoom = rvoom[order(rvoom\$P.Value),]
+ write.table(srvoom,file=out_VOOM, quote=FALSE, sep="\t",row.names=F)
+ rvoom = cbind(rvoom,URL=contigurls)
+ deTags = rownames(rvoom[rvoom\$adj.p.value < fdrthresh,])
+ nsig = length(deTags)
+ srvoom = rvoom[order(rvoom\$P.Value),]
+ cat("# Voom top 50\n")
+ print(srvoom[1:50,])
+ normData = srvoom\$E
+ outpdfname="Voom_top_100_heatmap.pdf"
+ hmap2(normData,nsamp=100,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste('VOOM Heatmap',myTitle))
+ outSmear = "Voom_smearplot.pdf"
+ outMain = paste("Smear Plot for ",TName,' Vs ',CName,' (FDR@',fdrthresh,' N = ',nsig,')',sep='')
+ smearPlot(DGEList=rvoom,deTags=deTags, outSmear=outSmear, outMain = outMain)
+ qqPlot(descr=paste(myTitle,'VOOM adj p QQ plot'),pvector=srvoom\$adj.P.Val,outpdf='Voom_qqplot.pdf')
+ # Use an FDR cutoff to find interesting samples for edgeR, DESeq and voom/limma
+ topresults.voom = rvoom[which(rvoom\$adj.P.Val < fdrthresh), ]
+ voomcountsindex = which(allgenes %in% topresults.voom\$ID)
+ voomcounts = rep(0, length(allgenes))
+ voomcounts[voomcountsindex] = 1
+ sink()
+ }
+
+ if (doCamera) {
+ doGSEA(y=DGEList,design=mydesign,histgmt=histgmt,bigmt=bigmt,ntest=20,myTitle=myTitle,
+ outfname=paste(mt,"GSEA.xls",sep="_"),fdrthresh=fdrthresh,fdrtype=fdrtype)
+ }
+
+ if ((doDESeq2==T) || (doVoom==T) || (doedgeR==T)) {
+ if ((doVoom==T) && (doDESeq2==T) && (doedgeR==T)) {
+ vennmain = paste(mt,'Voom,edgeR and DESeq2 overlap at FDR=',fdrthresh)
+ counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts,
+ VOOM_limma = voomcounts, row.names = allgenes)
+ } else if ((doDESeq2==T) && (doedgeR==T)) {
+ vennmain = paste(mt,'DESeq2 and edgeR overlap at FDR=',fdrthresh)
+ counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts, row.names = allgenes)
+ } else if ((doVoom==T) && (doedgeR==T)) {
+ vennmain = paste(mt,'Voom and edgeR overlap at FDR=',fdrthresh)
+ counts.dataframe = data.frame(edgeR = edgeRcounts, VOOM_limma = voomcounts, row.names = allgenes)
+ }
+
+ if (nrow(counts.dataframe > 1)) {
+ counts.venn = vennCounts(counts.dataframe)
+ vennf = "Venn_significant_genes_overlap.pdf"
+ pdf(vennf)
+ vennDiagram(counts.venn,main=vennmain,col="maroon")
+ dev.off()
+ }
+ } #### doDESeq2 or doVoom
+
+}
+#### Done
+
+###sink(stdout(),append=T,type="message")
+builtin_gmt = ""
+history_gmt = ""
+history_gmt_name = ""
+out_edgeR = F
+out_DESeq2 = F
+out_VOOM = "$out_VOOM"
+doDESeq2 = $DESeq2.doDESeq2 # make these T or F
+doVoom = $doVoom
+doCamera = F
+doedgeR = $edgeR.doedgeR
+edgeR_priordf = 0
+
+
+#if $doVoom == "T":
+ out_VOOM = "$out_VOOM"
+#end if
+
+#if $DESeq2.doDESeq2 == "T":
+ out_DESeq2 = "$out_DESeq2"
+ DESeq_fitType = "$DESeq2.DESeq_fitType"
+#end if
+
+#if $edgeR.doedgeR == "T":
+ out_edgeR = "$out_edgeR"
+ edgeR_priordf = $edgeR.edgeR_priordf
+#end if
+
+
+if (sum(c(doedgeR,doVoom,doDESeq2)) == 0)
+{
+write("No methods chosen - nothing to do! Please try again after choosing one or more methods", stderr())
+quit(save="no",status=2)
+}
+
+Out_Dir = "$html_file.files_path"
+Input = "$input1"
+TreatmentName = "$treatment_name"
+TreatmentCols = "$Treat_cols"
+ControlName = "$control_name"
+ControlCols= "$Control_cols"
+org = "$input1.dbkey"
+if (org == "") { org = "hg19"}
+fdrtype = "$fdrtype"
+fdrthresh = $fdrthresh
+useNDF = $useNDF
+fQ = $fQ # non-differential centile cutoff
+myTitle = "$title"
+sids = strsplit("$subjectids",',')
+subjects = unlist(sids)
+nsubj = length(subjects)
+TCols = as.numeric(strsplit(TreatmentCols,",")[[1]])-1
+CCols = as.numeric(strsplit(ControlCols,",")[[1]])-1
+cat('Got TCols=')
+cat(TCols)
+cat('; CCols=')
+cat(CCols)
+cat('\n')
+useCols = c(TCols,CCols)
+if (file.exists(Out_Dir) == F) dir.create(Out_Dir)
+Count_Matrix = read.table(Input,header=T,row.names=1,sep='\t') #Load tab file assume header
+snames = colnames(Count_Matrix)
+nsamples = length(snames)
+if (nsubj > 0 & nsubj != nsamples) {
+options("show.error.messages"=T)
+mess = paste('Fatal error: Supplied subject id list',paste(subjects,collapse=','),
+ 'has length',nsubj,'but there are',nsamples,'samples',paste(snames,collapse=','))
+write(mess, stderr())
+quit(save="no",status=4)
+}
+if (length(subjects) != 0) {subjects = subjects[useCols]}
+Count_Matrix = Count_Matrix[,useCols] ### reorder columns
+rn = rownames(Count_Matrix)
+islib = rn %in% c('librarySize','NotInBedRegions')
+LibSizes = Count_Matrix[subset(rn,islib),][1] # take first
+Count_Matrix = Count_Matrix[subset(rn,! islib),]
+group = c(rep(TreatmentName,length(TCols)), rep(ControlName,length(CCols)) ) #Build a group descriptor
+group = factor(group, levels=c(ControlName,TreatmentName))
+colnames(Count_Matrix) = paste(group,colnames(Count_Matrix),sep="_") #Relable columns
+results = edgeIt(Count_Matrix=Count_Matrix,group=group, out_edgeR=out_edgeR, out_VOOM=out_VOOM, out_DESeq2=out_DESeq2,
+ fdrtype='BH',mydesign=NULL,priordf=edgeR_priordf,fdrthresh=fdrthresh,outputdir='.',
+ myTitle=myTitle,useNDF=F,libSize=c(),filterquantile=fQ,subjects=subjects,
+ doDESeq2=doDESeq2,doVoom=doVoom,doCamera=doCamera,doedgeR=doedgeR,org=org,
+ histgmt=history_gmt,bigmt=builtin_gmt,DESeq_fitType=DESeq_fitType)
+sessionInfo()
+]]>
+
+
+
+
+**What it does**
+
+Allows short read sequence counts from controlled experiments to be analysed for differentially expressed genes.
+Optionally adds a term for subject if not all samples are independent or if some other factor needs to be blocked in the design.
+
+**Input**
+
+Requires a count matrix as a tabular file. These are best made using the companion HTSeq_ based counter Galaxy wrapper
+and your fave gene model to generate inputs. Each row is a genomic feature (gene or exon eg) and each column the
+non-negative integer count of reads from one sample overlapping the feature.
+The matrix must have a header row uniquely identifying the source samples, and unique row names in
+the first column. Typically the row names are gene symbols or probe ids for downstream use in GSEA and other methods.
+
+**Specifying comparisons**
+
+This is basically dumbed down for two factors - case vs control.
+
+More complex interfaces are possible but painful at present.
+Probably need to specify a phenotype file to do this better.
+Work in progress. Send code.
+
+If you have (eg) paired samples and wish to include a term in the GLM to account for some other factor (subject in the case of paired samples),
+put a comma separated list of indicators for every sample (whether modelled or not!) indicating (eg) the subject number or
+A list of integers, one for each subject or an empty string if samples are all independent.
+If not empty, there must be exactly as many integers in the supplied integer list as there are columns (samples) in the count matrix.
+Integers for samples that are not in the analysis *must* be present in the string as filler even if not used.
+
+So if you have 2 pairs out of 6 samples, you need to put in unique integers for the unpaired ones
+eg if you had 6 samples with the first two independent but the second and third pairs each being from independent subjects. you might use
+8,9,1,1,2,2
+as subject IDs to indicate two paired samples from the same subject in columns 3/4 and 5/6
+
+**Methods available**
+
+You can run 3 popular Bioconductor packages available for count data.
+
+edgeR - see edgeR_ for details
+
+VOOM/limma - see limma_VOOM_ for details
+
+DESeq2 - see DESeq2_ for details
+
+and optionally camera in edgeR which works better if MSigDB is installed.
+
+**Outputs**
+
+Some helpful plots and analysis results. Note that most of these are produced using R code
+suggested by the excellent documentation and vignettes for the Bioconductor
+packages invoked. The Tool Factory is used to automatically lay these out for you to enjoy.
+
+**Note on Voom**
+
+The voom from limma version 3.16.6 help in R includes this from the authors - but you should read the paper to interpret this method.
+
+This function is intended to process RNA-Seq or ChIP-Seq data prior to linear modelling in limma.
+
+voom is an acronym for mean-variance modelling at the observational level.
+The key concern is to estimate the mean-variance relationship in the data, then use this to compute appropriate weights for each observation.
+Count data almost show non-trivial mean-variance relationships. Raw counts show increasing variance with increasing count size, while log-counts typically show a decreasing mean-variance trend.
+This function estimates the mean-variance trend for log-counts, then assigns a weight to each observation based on its predicted variance.
+The weights are then used in the linear modelling process to adjust for heteroscedasticity.
+
+In an experiment, a count value is observed for each tag in each sample. A tag-wise mean-variance trend is computed using lowess.
+The tag-wise mean is the mean log2 count with an offset of 0.5, across samples for a given tag.
+The tag-wise variance is the quarter-root-variance of normalized log2 counts per million values with an offset of 0.5, across samples for a given tag.
+Tags with zero counts across all samples are not included in the lowess fit. Optional normalization is performed using normalizeBetweenArrays.
+Using fitted values of log2 counts from a linear model fit by lmFit, variances from the mean-variance trend were interpolated for each observation.
+This was carried out by approxfun. Inverse variance weights can be used to correct for mean-variance trend in the count data.
+
+
+Author(s)
+
+Charity Law and Gordon Smyth
+
+References
+
+Law, CW (2013). Precision weights for gene expression analysis. PhD Thesis. University of Melbourne, Australia.
+
+Law, CW, Chen, Y, Shi, W, Smyth, GK (2013). Voom! Precision weights unlock linear model analysis tools for RNA-seq read counts.
+Technical Report 1 May 2013, Bioinformatics Division, Walter and Eliza Hall Institute of Medical Reseach, Melbourne, Australia.
+http://www.statsci.org/smyth/pubs/VoomPreprint.pdf
+
+See Also
+
+A voom case study is given in the edgeR User's Guide.
+
+vooma is a similar function but for microarrays instead of RNA-seq.
+
+
+***old rant on changes to Bioconductor package variable names between versions***
+
+The edgeR authors made a small cosmetic change in the name of one important variable (from p.value to PValue)
+breaking this and all other code that assumed the old name for this variable,
+between edgeR2.4.4 and 2.4.6 (the version for R 2.14 as at the time of writing).
+This means that all code using edgeR is sensitive to the version. I think this was a very unwise thing
+to do because it wasted hours of my time to track down and will similarly cost other edgeR users dearly
+when their old scripts break. This tool currently now works with 2.4.6.
+
+**Note on prior.N**
+
+http://seqanswers.com/forums/showthread.php?t=5591 says:
+
+*prior.n*
+
+The value for prior.n determines the amount of smoothing of tagwise dispersions towards the common dispersion.
+You can think of it as like a "weight" for the common value. (It is actually the weight for the common likelihood
+in the weighted likelihood equation). The larger the value for prior.n, the more smoothing, i.e. the closer your
+tagwise dispersion estimates will be to the common dispersion. If you use a prior.n of 1, then that gives the
+common likelihood the weight of one observation.
+
+In answer to your question, it is a good thing to squeeze the tagwise dispersions towards a common value,
+or else you will be using very unreliable estimates of the dispersion. I would not recommend using the value that
+you obtained from estimateSmoothing()---this is far too small and would result in virtually no moderation
+(squeezing) of the tagwise dispersions. How many samples do you have in your experiment?
+What is the experimental design? If you have few samples (less than 6) then I would suggest a prior.n of at least 10.
+If you have more samples, then the tagwise dispersion estimates will be more reliable,
+so you could consider using a smaller prior.n, although I would hesitate to use a prior.n less than 5.
+
+
+From Bioconductor Digest, Vol 118, Issue 5, Gordon writes:
+
+Dear Dorota,
+
+The important settings are prior.df and trend.
+
+prior.n and prior.df are related through prior.df = prior.n * residual.df,
+and your experiment has residual.df = 36 - 12 = 24. So the old setting of
+prior.n=10 is equivalent for your data to prior.df = 240, a very large
+value. Going the other way, the new setting of prior.df=10 is equivalent
+to prior.n=10/24.
+
+To recover old results with the current software you would use
+
+ estimateTagwiseDisp(object, prior.df=240, trend="none")
+
+To get the new default from old software you would use
+
+ estimateTagwiseDisp(object, prior.n=10/24, trend=TRUE)
+
+Actually the old trend method is equivalent to trend="loess" in the new
+software. You should use plotBCV(object) to see whether a trend is
+required.
+
+Note you could also use
+
+ prior.n = getPriorN(object, prior.df=10)
+
+to map between prior.df and prior.n.
+
+----
+
+**Attributions**
+
+edgeR - edgeR_
+
+VOOM/limma - limma_VOOM_
+
+DESeq2 - DESeq2_ for details
+
+See above for Bioconductor package documentation for packages exposed in Galaxy by this tool and app store package.
+
+Galaxy_ (that's what you are using right now!) for gluing everything together
+
+Otherwise, all code and documentation comprising this tool was written by Ross Lazarus and is
+licensed to you under the LGPL_ like other rgenetics artefacts
+
+.. _LGPL: http://www.gnu.org/copyleft/lesser.html
+.. _HTSeq: http://www-huber.embl.de/users/anders/HTSeq/doc/index.html
+.. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html
+.. _DESeq2: http://www.bioconductor.org/packages/release/bioc/html/DESeq2.html
+.. _limma_VOOM: http://www.bioconductor.org/packages/release/bioc/html/limma.html
+.. _Galaxy: http://getgalaxy.org
+
+
+
+
+
diff -r bdb19fdbd679 -r 6d9d2c9679ec test-data/edgeRtest1out.html
--- a/test-data/edgeRtest1out.html Sun Dec 22 06:03:27 2013 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,733 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
Galaxy Tool "DifferentialCounts" run at 07/08/2013 15:46:55
-
DESeq2 images and outputs
-(Click on a thumbnail image to download the corresponding original PDF image)
-